From 8d249001110839b4baf422092551b8dca3ea6186 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 4 Jun 2018 14:41:24 -0400 Subject: [PATCH 1/8] first pass at a 'localarg' --- loopy/__init__.py | 4 +-- loopy/auto_test.py | 18 +++++++++-- loopy/check.py | 4 ++- loopy/kernel/__init__.py | 20 ++++++++----- loopy/kernel/data.py | 15 ++++++++++ loopy/target/__init__.py | 3 ++ loopy/target/c/codegen/expression.py | 6 ++-- loopy/target/cuda.py | 4 +++ loopy/target/execution.py | 4 +-- loopy/target/opencl.py | 11 +++++-- loopy/target/pyopencl_execution.py | 20 ++++++++++++- loopy/transform/diff.py | 4 ++- test/test_loopy.py | 45 ++++++++++++++++++++++++++++ 13 files changed, 137 insertions(+), 21 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 54c3523d5..b5de96d14 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -44,7 +44,7 @@ from loopy.kernel.instruction import ( from loopy.kernel.data import ( auto, KernelArgument, - ValueArg, GlobalArg, ConstantArg, ImageArg, + ValueArg, GlobalArg, LocalArg, ConstantArg, ImageArg, temp_var_scope, TemporaryVariable, SubstitutionRule, CallMangleInfo) @@ -162,7 +162,7 @@ __all__ = [ "BarrierInstruction", "KernelArgument", - "ValueArg", "GlobalArg", "ConstantArg", "ImageArg", + "ValueArg", "GlobalArg", "LocalArg", "ConstantArg", "ImageArg", "temp_var_scope", "TemporaryVariable", "SubstitutionRule", "CallMangleInfo", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index daf99eaf5..70603d8e8 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -79,7 +79,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array - from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, \ + from loopy.kernel.data import ValueArg, GlobalArg, LocalArg, ImageArg, \ TemporaryVariable, ConstantArg from pymbolic import evaluate @@ -178,6 +178,14 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_numpy_strides=numpy_strides, needs_checking=is_output)) + elif arg.arg_class is LocalArg: + # generally local kernel arguments are used as dynamically sized memory + # but you can't pass data from the host to the local arg, so there are + # no "reference" local arguments + ref_args[arg.name] = None + ref_arg_data.append(None) + pass + elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass @@ -196,7 +204,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl as cl import pyopencl.array as cl_array - from loopy.kernel.data import ValueArg, GlobalArg, ImageArg,\ + from loopy.kernel.data import ValueArg, GlobalArg, LocalArg, ImageArg,\ TemporaryVariable, ConstantArg from pymbolic import evaluate @@ -281,6 +289,12 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): arg_desc.test_numpy_strides = numpy_strides arg_desc.test_alloc_size = alloc_size + elif arg.arg_class is LocalArg: + # generally local kernel arguments are used as dynamically sized memory + # but you can't pass data from the host to the local arg, so there are + # no "reference" local arguments + pass + elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass diff --git a/loopy/check.py b/loopy/check.py index 17b1186ab..ffb470dfa 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -466,7 +466,7 @@ def _check_variable_access_ordered_inner(kernel): wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import GlobalArg, ValueArg, temp_var_scope + from loopy.kernel.data import GlobalArg, LocalArg, ValueArg, temp_var_scope from loopy.kernel.tools import find_aliasing_equivalence_classes depfind = IndirectDependencyEdgeFinder(kernel) @@ -494,6 +494,8 @@ def _check_variable_access_ordered_inner(kernel): arg = kernel.arg_dict[name] if isinstance(arg, GlobalArg): scope = temp_var_scope.GLOBAL + elif isinstance(arg, LocalArg): + scope = temp_var_scope.LOCAL elif isinstance(arg, ValueArg): scope = temp_var_scope.PRIVATE else: diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 429961a71..d08014da5 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -882,7 +882,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def global_var_names(self): from loopy.kernel.data import temp_var_scope - from loopy.kernel.data import GlobalArg return ( set( @@ -1084,16 +1083,23 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def local_var_names(self): from loopy.kernel.data import temp_var_scope - return set( - tv.name - for tv in six.itervalues(self.temporary_variables) - if tv.scope == temp_var_scope.LOCAL) + from loopy.kernel.data import LocalArg + return ( + set( + arg.name for arg in self.args + if isinstance(arg, LocalArg)) + | set( + tv.name + for tv in six.itervalues(self.temporary_variables) + if tv.scope == temp_var_scope.LOCAL)) def local_mem_use(self): from loopy.kernel.data import temp_var_scope - return sum( + from loopy.kernel.data import LocalArg + return (sum( tv.nbytes for tv in six.itervalues(self.temporary_variables) - if tv.scope == temp_var_scope.LOCAL) + if tv.scope == temp_var_scope.LOCAL) + + sum(arg.nbytes for arg in self.args if isinstance(arg, LocalArg))) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 35a8e3b1d..0c9102d70 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -272,6 +272,21 @@ class GlobalArg(ArrayBase, KernelArgument): return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, dtype, is_written) +class LocalArg(ArrayBase, KernelArgument): + __doc__ = ArrayBase.__doc__ + min_target_axes = 0 + max_target_axes = 1 + + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): + return ast_builder.get_local_arg_decl(self.name + name_suffix, shape, + dtype, is_written) + + @property + def nbytes(self): + shape = self.shape + from pytools import product + return product(si for si in shape)*self.dtype.itemsize + class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a08b406f5..d09238798 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -200,6 +200,9 @@ class ASTBuilderBase(object): def get_global_arg_decl(self, name, shape, dtype, is_written): raise NotImplementedError() + def get_local_arg_decl(self, name, shape, dtype, is_written): + raise NotImplementedError() + def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 59ed77f9c..ed7fc6e4f 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -194,7 +194,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state.vectorization_info) from loopy.kernel.data import ( - ImageArg, GlobalArg, TemporaryVariable, ConstantArg) + ImageArg, GlobalArg, LocalArg, TemporaryVariable, ConstantArg) if isinstance(ary, ImageArg): extra_axes = 0 @@ -227,10 +227,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): raise NotImplementedError( "non-floating-point images not supported for now") - elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)): + elif isinstance(ary, (GlobalArg, LocalArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: if ( - (isinstance(ary, (ConstantArg, GlobalArg)) or + (isinstance(ary, (ConstantArg, GlobalArg, LocalArg)) or (isinstance(ary, TemporaryVariable) and ary.base_storage))): # unsubscripted global args are pointers result = make_var(access_info.array_name)[0] diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 027f27838..2087d26c0 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -334,6 +334,10 @@ class CUDACASTBuilder(CASTBuilder): return arg_decl + def get_local_arg_decl(self, name, shape, dtype, is_written): + from cgen.cuda import CudaShared + return CudaShared(self.get_global_arg_decl(name, shape, dtype, is_written)) + def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError("not yet: texture arguments in CUDA") diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 75424b9c7..de5bfa146 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -150,14 +150,14 @@ class ExecutionWrapperGeneratorBase(object): # returning the desired integer argument. iarg_to_sources = {} - from loopy.kernel.data import GlobalArg + from loopy.kernel.data import GlobalArg, LocalArg from loopy.symbolic import DependencyMapper, StringifyMapper from loopy.diagnostic import ParameterFinderWarning dep_map = DependencyMapper() from pymbolic import var for arg in implemented_data_info: - if arg.arg_class is GlobalArg: + if arg.arg_class in [GlobalArg, LocalArg]: sym_shape = var(arg.name).attr("shape") for axis_nr, shape_i in enumerate(arg.shape): if shape_i is None: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 31e0569b9..debd69b10 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -480,6 +480,13 @@ class OpenCLCASTBuilder(CASTBuilder): return CLGlobal(super(OpenCLCASTBuilder, self).get_global_arg_decl( name, shape, dtype, is_written)) + def get_local_arg_decl(self, name, shape, dtype, is_written): + from cgen.opencl import CLLocal + + # can simply use a "global" c decl + return CLLocal(super(OpenCLCASTBuilder, self).get_global_arg_decl( + name, shape, dtype, is_written)) + def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): if is_written: mode = "w" @@ -567,11 +574,11 @@ class OpenCLCASTBuilder(CASTBuilder): else: assert False - from loopy.kernel.data import TemporaryVariable, GlobalArg + from loopy.kernel.data import TemporaryVariable, GlobalArg, LocalArg if isinstance(lhs_var, GlobalArg): var_kind = "__global" elif ( - isinstance(lhs_var, TemporaryVariable) + isinstance(lhs_var, (TemporaryVariable, LocalArg)) and lhs_var.scope == temp_var_scope.LOCAL): var_kind = "__local" elif ( diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index d33a92d66..c165253d8 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -63,8 +63,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle non-numpy args def handle_non_numpy_arg(self, gen, arg): + from loopy.kernel.data import LocalArg + is_local = arg.arg_class == LocalArg gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): + if is_local: + gen("raise Exception('Cannot pass numpy data directly to a " + "__local argument.')") + gen("# synchronous, nothing to worry about") gen("%s = _lpy_cl_array.to_device(" "queue, %s, allocator=allocator)" @@ -72,7 +78,19 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("_lpy_encountered_numpy = True") gen("elif %s is not None:" % arg.name) with Indentation(gen): - gen("_lpy_encountered_dev = True") + if is_local: + gen("assert isinstance(%s, _lpy_cl.LocalMemory), 'Arguments of " + "type LocalArg must either be None or an instance of a " + "pyopencl.LocalMemory object.'" % arg.name) + else: + gen("_lpy_encountered_dev = True") + if is_local: + from numpy import prod + gen('else:') + with Indentation(gen): + gen('# create a properly sized LocalMemory object') + gen('%s = _lpy_cl.LocalMemory(%d)' % ( + arg.name, prod(arg.shape) * arg.dtype.itemsize)) gen("") diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701..9d29f52e8 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -335,8 +335,10 @@ class DifferentiationContext(object): dim_tags = ("c",) * len(shape) if var_name in self.kernel.arg_dict: + arg_class = lp.LocalArg if isinstance( + self.kernel.arg_dict[var_name], lp.LocalArg) else lp.GlobalArg self.new_args.append( - lp.GlobalArg( + arg_class( new_var_name, arg.dtype, shape=shape, diff --git a/test/test_loopy.py b/test/test_loopy.py index 7a6b8c8a6..f9119795d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2869,6 +2869,51 @@ def test_half_complex_conditional(ctx_factory): knl(queue) +def test_local_args(ctx_factory): + ctx = ctx_factory() + from loopy.kernel.instruction import BarrierInstruction + + # simple example, allow the user to pass in a workspace local array + knl = lp.make_kernel( + "{[i,i0]: 0 <= i,i0 < 10}", + """ + tmp[i] = i {id=init, dep=*} + ... lbarrier {id=barrier, mem_kind=local, dep=init} + out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=init:barrier} + """, + [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), + lp.GlobalArg('out', shape=(10,), dtype=np.int32)] + ) + + # get vectorized form + ref_knl = knl + knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0') + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + # try with 2 local args for compatibility + knl = lp.make_kernel( + "{[i,i0]: 0 <= i,i0 < 10}", + """ + for i + tmp[i] = i {id=init, dep=*} + tmp2[i] = i + 1 {id=init2, dep=*} + end + ... lbarrier {id=barrier, mem_kind=local, dep=init*} + for i0 + out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier} + end + """, + [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), + lp.LocalArg('tmp2', shape=(10,), dtype=np.int32), + lp.GlobalArg('out', shape=(10,), dtype=np.int32)] + ) + + # get vectorized form + ref_knl = knl + knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0') + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From a3f2a49977685fb4d06492c8a3a5d60c701a3fdc Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 4 Jun 2018 17:41:31 -0400 Subject: [PATCH 2/8] flake fixes --- loopy/kernel/data.py | 1 + loopy/transform/diff.py | 2 +- test/test_loopy.py | 15 +++++++-------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 0c9102d70..cb8a648e3 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -272,6 +272,7 @@ class GlobalArg(ArrayBase, KernelArgument): return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, dtype, is_written) + class LocalArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ min_target_axes = 0 diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 9d29f52e8..d346eb5a4 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -338,7 +338,7 @@ class DifferentiationContext(object): arg_class = lp.LocalArg if isinstance( self.kernel.arg_dict[var_name], lp.LocalArg) else lp.GlobalArg self.new_args.append( - arg_class( + arg_class( new_var_name, arg.dtype, shape=shape, diff --git a/test/test_loopy.py b/test/test_loopy.py index f9119795d..5847adc17 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2871,11 +2871,10 @@ def test_half_complex_conditional(ctx_factory): def test_local_args(ctx_factory): ctx = ctx_factory() - from loopy.kernel.instruction import BarrierInstruction # simple example, allow the user to pass in a workspace local array knl = lp.make_kernel( - "{[i,i0]: 0 <= i,i0 < 10}", + "{[i,i0]: 0 <= i,i0 < 10}", """ tmp[i] = i {id=init, dep=*} ... lbarrier {id=barrier, mem_kind=local, dep=init} @@ -2893,8 +2892,8 @@ def test_local_args(ctx_factory): # try with 2 local args for compatibility knl = lp.make_kernel( "{[i,i0]: 0 <= i,i0 < 10}", - """ - for i + """ + for i tmp[i] = i {id=init, dep=*} tmp2[i] = i + 1 {id=init2, dep=*} end @@ -2902,10 +2901,10 @@ def test_local_args(ctx_factory): for i0 out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier} end - """, - [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), - lp.LocalArg('tmp2', shape=(10,), dtype=np.int32), - lp.GlobalArg('out', shape=(10,), dtype=np.int32)] + """, + [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), + lp.LocalArg('tmp2', shape=(10,), dtype=np.int32), + lp.GlobalArg('out', shape=(10,), dtype=np.int32)] ) # get vectorized form -- GitLab From 9be66f7680b89f374d4f7e08ec3624299bd1d8b7 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 14:28:31 -0400 Subject: [PATCH 3/8] update executable local arg for arrayarg PR --- loopy/auto_test.py | 32 +++++++++++++++--------------- loopy/kernel/data.py | 16 --------------- loopy/target/__init__.py | 3 --- loopy/target/opencl.py | 7 ------- loopy/target/pyopencl_execution.py | 8 ++++---- loopy/transform/diff.py | 5 ++--- test/test_loopy.py | 3 ++- 7 files changed, 24 insertions(+), 50 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 4088d0e9a..0ce3db817 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -80,7 +80,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \ - TemporaryVariable, ConstantArg + TemporaryVariable, ConstantArg, AddressSpace from pymbolic import evaluate @@ -110,6 +110,15 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \ or arg.arg_class is ConstantArg: + + if arg.address_space == AddressSpace.LOCAL: + # generally local kernel arguments are used as dynamically sized + # memory but you can't pass data from the host to the local arg, + # so there are no "reference" local arguments + ref_args[arg.name] = None + ref_arg_data.append(None) + continue + if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " "testing" % arg.name) @@ -178,14 +187,6 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_numpy_strides=numpy_strides, needs_checking=is_output)) - elif arg.arg_class is LocalArg: - # generally local kernel arguments are used as dynamically sized memory - # but you can't pass data from the host to the local arg, so there are - # no "reference" local arguments - ref_args[arg.name] = None - ref_arg_data.append(None) - pass - elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass @@ -205,7 +206,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\ - TemporaryVariable, ConstantArg + TemporaryVariable, ConstantArg, AddressSpace from pymbolic import evaluate @@ -240,6 +241,11 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): elif arg.arg_class is ArrayArg or\ arg.arg_class is ConstantArg: + + if arg.address_space == AddressSpace.LOCAL: + # handled in invocation + continue + shape = evaluate(arg.unvec_shape, parameters) strides = evaluate(arg.unvec_strides, parameters) @@ -289,12 +295,6 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): arg_desc.test_numpy_strides = numpy_strides arg_desc.test_alloc_size = alloc_size - elif arg.arg_class is LocalArg: - # generally local kernel arguments are used as dynamically sized memory - # but you can't pass data from the host to the local arg, so there are - # no "reference" local arguments - pass - elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 359e65482..3e776bd06 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -397,22 +397,6 @@ def GlobalArg(*args, **kwargs): return ArrayArg(*args, **kwargs) -class LocalArg(ArrayBase, KernelArgument): - __doc__ = ArrayBase.__doc__ - min_target_axes = 0 - max_target_axes = 1 - - def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - return ast_builder.get_local_arg_decl(self.name + name_suffix, shape, - dtype, is_written) - - @property - def nbytes(self): - shape = self.shape - from pytools import product - return product(si for si in shape)*self.dtype.itemsize - - class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ min_target_axes = 0 diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 1c93e6260..a81354e2f 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -203,9 +203,6 @@ class ASTBuilderBase(object): def get_global_arg_decl(self, name, shape, dtype, is_written): raise NotImplementedError() - def get_local_arg_decl(self, name, shape, dtype, is_written): - raise NotImplementedError() - def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index f5125d8db..432c95ef3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -499,13 +499,6 @@ class OpenCLCASTBuilder(CASTBuilder): return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) - def get_local_arg_decl(self, name, shape, dtype, is_written): - from cgen.opencl import CLLocal - - # can simply use a "global" c decl - return CLLocal(super(OpenCLCASTBuilder, self).get_global_arg_decl( - name, shape, dtype, is_written)) - def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): if is_written: mode = "w" diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 02760774d..807fde01c 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -63,8 +63,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle non-numpy args def handle_non_numpy_arg(self, gen, arg): - from loopy.kernel.data import LocalArg - is_local = arg.arg_class == LocalArg + from loopy.kernel.data import AddressScope + is_local = arg.address_scope == AddressScope.LOCAL gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): if is_local: @@ -79,8 +79,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("elif %s is not None:" % arg.name) with Indentation(gen): if is_local: - gen("assert isinstance(%s, _lpy_cl.LocalMemory), 'Arguments of " - "type LocalArg must either be None or an instance of a " + gen("assert isinstance(%s, _lpy_cl.LocalMemory), 'Arguments with " + "local scope must either be None or an instance of a " "pyopencl.LocalMemory object.'" % arg.name) else: gen("_lpy_encountered_dev = True") diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d346eb5a4..b695c1327 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -335,12 +335,11 @@ class DifferentiationContext(object): dim_tags = ("c",) * len(shape) if var_name in self.kernel.arg_dict: - arg_class = lp.LocalArg if isinstance( - self.kernel.arg_dict[var_name], lp.LocalArg) else lp.GlobalArg self.new_args.append( - arg_class( + lp.ArrayArg( new_var_name, arg.dtype, + address_space=self.kernel.arg_dict[var_name].address_space, shape=shape, dim_tags=dim_tags, )) diff --git a/test/test_loopy.py b/test/test_loopy.py index 5afd9fa73..e4b3abea9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2883,7 +2883,8 @@ def test_local_arg_execution(ctx_factory): ... lbarrier {id=barrier, mem_kind=local, dep=init} out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=init:barrier} """, - [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), + [lp.ArrayArg('tmp', address_space=AddressSpace.LOCAL, shape=(10,), + dtype=np.int32), lp.GlobalArg('out', shape=(10,), dtype=np.int32)] ) -- GitLab From 16883fac98178f9563ca501a7f9feccf94ac52e4 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 14:56:45 -0400 Subject: [PATCH 4/8] compatiblity fixes for local argument execution --- loopy/codegen/__init__.py | 9 +++++++-- loopy/kernel/array.py | 6 +++--- loopy/target/pyopencl_execution.py | 4 ++-- test/test_loopy.py | 1 + 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1b..ebb43520b 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -51,6 +51,10 @@ class ImplementedDataInfo(ImmutableRecord): .. attribute:: arg_class + .. attribute:: address_space + The address-space of the array. + May be *None* for non-array arguments + .. attribute:: base_name The user-facing name of the underlying array. @@ -86,7 +90,7 @@ class ImplementedDataInfo(ImmutableRecord): unvec_shape=None, unvec_strides=None, offset_for_name=None, stride_for_name_and_axis=None, allows_offset=None, - is_written=None): + is_written=None, address_space=None): from loopy.types import LoopyType assert isinstance(dtype, LoopyType) @@ -103,7 +107,8 @@ class ImplementedDataInfo(ImmutableRecord): offset_for_name=offset_for_name, stride_for_name_and_axis=stride_for_name_and_axis, allows_offset=allows_offset, - is_written=is_written) + is_written=is_written, + address_space=address_space) # }}} diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 186597c64..372bf6eca 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1058,11 +1058,11 @@ class ArrayBase(ImmutableRecord): full_name, stride_impl_axis), is_written=False)) + space = getattr(self, 'address_space', None) yield ImplementedDataInfo( target=target, name=full_name, base_name=self.name, - arg_class=type(self), dtype=dtype, shape=shape, @@ -1070,8 +1070,8 @@ class ArrayBase(ImmutableRecord): unvec_shape=unvec_shape, unvec_strides=tuple(unvec_strides), allows_offset=bool(self.offset), - - is_written=is_written) + is_written=is_written, + address_space=space) import loopy as lp diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 807fde01c..a6215d6e1 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -63,8 +63,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle non-numpy args def handle_non_numpy_arg(self, gen, arg): - from loopy.kernel.data import AddressScope - is_local = arg.address_scope == AddressScope.LOCAL + from loopy.kernel.data import AddressSpace + is_local = arg.address_space == AddressSpace.LOCAL gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): if is_local: diff --git a/test/test_loopy.py b/test/test_loopy.py index e4b3abea9..4fa4f92cb 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2874,6 +2874,7 @@ def test_half_complex_conditional(ctx_factory): def test_local_arg_execution(ctx_factory): ctx = ctx_factory() + from loopy.kernel.data import AddressSpace # simple example, allow the user to pass in a workspace local array knl = lp.make_kernel( -- GitLab From ceea46710b791538d947435dfa4327221c7a59ce Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 16:13:18 -0400 Subject: [PATCH 5/8] reformat alloc / check of local memory in invoker --- loopy/target/execution.py | 191 ++++++++++++++++------------- loopy/target/pyopencl_execution.py | 16 +-- test/test_loopy.py | 7 +- 3 files changed, 117 insertions(+), 97 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf20577..7aade780c 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -364,7 +364,7 @@ class ExecutionWrapperGeneratorBase(object): self, gen, kernel, implemented_data_info, options): import loopy as lp - from loopy.kernel.data import KernelArgument + from loopy.kernel.data import KernelArgument, AddressSpace from loopy.kernel.array import ArrayBase from loopy.symbolic import StringifyMapper from loopy.types import NumpyType @@ -463,104 +463,121 @@ class ExecutionWrapperGeneratorBase(object): gen("if True:") with Indentation(gen): - gen("if %s.dtype != %s:" - % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) - with Indentation(gen): - gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" - % (arg.name, arg.dtype, arg.name)) - - # {{{ generate shape checking code - - def strify_allowing_none(shape_axis): - if shape_axis is None: - return "None" - else: - return strify(shape_axis) - - def strify_tuple(t): - if len(t) == 0: - return "()" - else: - return "(%s,)" % ", ".join( - strify_allowing_none(sa) - for sa in t) - - shape_mismatch_msg = ( - "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " - "%% (%s.shape, %s))" - % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - - if kernel_arg.shape is None: - pass - - elif any(shape_axis is None for shape_axis in kernel_arg.shape): - gen("if len(%s.shape) != %s:" - % (arg.name, len(arg.unvec_shape))) + # check for local memory + if arg.address_space == AddressSpace.LOCAL: + from numpy import prod + # simply check that the argument size is sufficient + expected_size = prod(arg.shape) * arg.dtype.itemsize + gen("if %s.size < %d:" % (arg.name, expected_size)) + with Indentation(gen): + gen("raise TypeError(\"size mismatch on local argument " + "'%s' (got: %%d bytes, expected: %d bytes)\" %% " + "%s.size)" % ( + arg.name, expected_size, arg.name)) + else: + + gen("if %s.dtype != %s:" + % (arg.name, self.python_dtype_str( + kernel_arg.dtype.numpy_dtype))) with Indentation(gen): - gen(shape_mismatch_msg) + gen("raise TypeError(\"dtype mismatch on argument '%s' " + "(got: %%s, expected: %s)\" %% %s.dtype)" + % (arg.name, arg.dtype, arg.name)) - for i, shape_axis in enumerate(arg.unvec_shape): + # {{{ generate shape checking code + + def strify_allowing_none(shape_axis): if shape_axis is None: - continue + return "None" + else: + return strify(shape_axis) - gen("if %s.shape[%d] != %s:" - % (arg.name, i, strify(shape_axis))) + def strify_tuple(t): + if len(t) == 0: + return "()" + else: + return "(%s,)" % ", ".join( + strify_allowing_none(sa) + for sa in t) + + shape_mismatch_msg = ( + "raise TypeError(\"shape mismatch on argument '%s' " + "(got: %%s, expected: %%s)\" " + "%% (%s.shape, %s))" + % (arg.name, arg.name, strify_tuple( + arg.unvec_shape))) + + if kernel_arg.shape is None: + pass + + elif any(shape_axis is None + for shape_axis in kernel_arg.shape): + gen("if len(%s.shape) != %s:" + % (arg.name, len(arg.unvec_shape))) with Indentation(gen): gen(shape_mismatch_msg) - else: # not None, no Nones in tuple - gen("if %s.shape != %s:" - % (arg.name, strify(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) + for i, shape_axis in enumerate(arg.unvec_shape): + if shape_axis is None: + continue - # }}} + gen("if %s.shape[%d] != %s:" + % (arg.name, i, strify(shape_axis))) + with Indentation(gen): + gen(shape_mismatch_msg) - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - sym_strides = tuple( - itemsize*s_i for s_i in arg.unvec_strides) + else: # not None, no Nones in tuple + gen("if %s.shape != %s:" + % (arg.name, strify(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) - ndim = len(arg.unvec_shape) - shape = ["_lpy_shape_%d" % i for i in range(ndim)] - strides = ["_lpy_stride_%d" % i for i in range(ndim)] + # }}} - gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) - gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) + if arg.unvec_strides and kernel_arg.dim_tags: + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + sym_strides = tuple( + itemsize*s_i for s_i in arg.unvec_strides) - gen("if not %s:" - % self.get_strides_check_expr( - shape, strides, - (strify(s) for s in sym_strides))) - with Indentation(gen): - gen("_lpy_got = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s.strides) " - "if dim > 1)" - % (arg.name, arg.name)) - gen("_lpy_expected = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s) " - "if dim > 1)" - % (arg.name, strify_tuple(sym_strides))) - - gen("raise TypeError(\"strides mismatch on " - "argument '%s' " - "(after removing unit length dims, " - "got: %%s, expected: %%s)\" " - "%% (_lpy_got, _lpy_expected))" - % arg.name) - - if not arg.allows_offset: - gen("if hasattr(%s, 'offset') and %s.offset:" % ( - arg.name, arg.name)) - with Indentation(gen): - gen("raise ValueError(\"Argument '%s' does not " - "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." - "\")" % arg.name) - gen("") + ndim = len(arg.unvec_shape) + shape = ["_lpy_shape_%d" % i for i in range(ndim)] + strides = ["_lpy_stride_%d" % i for i in range(ndim)] + + gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) + gen("(%s,) = %s.strides" % ( + ", ".join(strides), arg.name)) + + gen("if not %s:" + % self.get_strides_check_expr( + shape, strides, + (strify(s) for s in sym_strides))) + with Indentation(gen): + gen("_lpy_got = tuple(stride " + "for (dim, stride) in " + "zip(%s.shape, %s.strides) " + "if dim > 1)" + % (arg.name, arg.name)) + gen("_lpy_expected = tuple(stride " + "for (dim, stride) in zip(%s.shape, %s) " + "if dim > 1)" + % (arg.name, strify_tuple(sym_strides))) + + gen("raise TypeError(\"strides mismatch on " + "argument '%s' " + "(after removing unit length dims, " + "got: %%s, expected: %%s)\" " + "%% (_lpy_got, _lpy_expected))" + % arg.name) + + if not arg.allows_offset: + gen("if hasattr(%s, 'offset') and %s.offset:" % ( + arg.name, arg.name)) + with Indentation(gen): + gen("raise ValueError(\"Argument '%s' does not " + "allow arrays with offsets. Try passing " + "default_offset=loopy.auto to make_kernel()." + "\")" % arg.name) + gen("") # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index a6215d6e1..01c515847 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -84,13 +84,6 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): "pyopencl.LocalMemory object.'" % arg.name) else: gen("_lpy_encountered_dev = True") - if is_local: - from numpy import prod - gen('else:') - with Indentation(gen): - gen('# create a properly sized LocalMemory object') - gen('%s = _lpy_cl.LocalMemory(%d)' % ( - arg.name, prod(arg.shape) * arg.dtype.itemsize)) gen("") @@ -103,6 +96,15 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): Handle allocation of non-specified arguements for pyopencl execution """ from pymbolic import var + from loopy.kernel.data import AddressSpace + + if arg.address_space == AddressSpace.LOCAL: + # handle local argument allocations + from numpy import prod + gen('# create a properly sized LocalMemory object') + gen('%s = _lpy_cl.LocalMemory(%d)' % ( + arg.name, prod(arg.shape) * arg.dtype.itemsize)) + return num_axes = len(arg.strides) for i in range(num_axes): diff --git a/test/test_loopy.py b/test/test_loopy.py index 4fa4f92cb..1971e123a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2875,6 +2875,7 @@ def test_half_complex_conditional(ctx_factory): def test_local_arg_execution(ctx_factory): ctx = ctx_factory() from loopy.kernel.data import AddressSpace + local = AddressSpace.LOCAL # simple example, allow the user to pass in a workspace local array knl = lp.make_kernel( @@ -2884,7 +2885,7 @@ def test_local_arg_execution(ctx_factory): ... lbarrier {id=barrier, mem_kind=local, dep=init} out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=init:barrier} """, - [lp.ArrayArg('tmp', address_space=AddressSpace.LOCAL, shape=(10,), + [lp.ArrayArg('tmp', address_space=local, shape=(10,), dtype=np.int32), lp.GlobalArg('out', shape=(10,), dtype=np.int32)] ) @@ -2907,8 +2908,8 @@ def test_local_arg_execution(ctx_factory): out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier} end """, - [lp.LocalArg('tmp', shape=(10,), dtype=np.int32), - lp.LocalArg('tmp2', shape=(10,), dtype=np.int32), + [lp.ArrayArg('tmp', shape=(10,), dtype=np.int32, address_space=local), + lp.ArrayArg('tmp2', shape=(10,), dtype=np.int32, address_space=local), lp.GlobalArg('out', shape=(10,), dtype=np.int32)] ) -- GitLab From a74f661362f892cfebf7b8eae4761d22938f083b Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 16:16:06 -0400 Subject: [PATCH 6/8] fix 'base_data' lookup for LocalMemory --- loopy/target/pyopencl_execution.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 01c515847..57ef0ed66 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -256,7 +256,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen.add_to_preamble(codegen_result.host_code()) def get_arg_pass(self, arg): - return "%s.base_data" % arg.name + from loopy.kernel.data import AddressSpace + is_local = arg.address_space == AddressSpace.LOCAL + return "%s%s" % (arg.name, '.base_data' if not is_local else '') # }}} -- GitLab From a242cec2f7d5c125d041dd47e1e4e5c67686a454 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 16:19:18 -0400 Subject: [PATCH 7/8] fix nosyncs in local args test --- test/test_loopy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1971e123a..c99ffec92 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2883,7 +2883,7 @@ def test_local_arg_execution(ctx_factory): """ tmp[i] = i {id=init, dep=*} ... lbarrier {id=barrier, mem_kind=local, dep=init} - out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=init:barrier} + out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=barrier, nosync=init} """, [lp.ArrayArg('tmp', address_space=local, shape=(10,), dtype=np.int32), @@ -2905,7 +2905,7 @@ def test_local_arg_execution(ctx_factory): end ... lbarrier {id=barrier, mem_kind=local, dep=init*} for i0 - out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier} + out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier, nosync=init*} end """, [lp.ArrayArg('tmp', shape=(10,), dtype=np.int32, address_space=local), -- GitLab From 0e2297bc4a66517a975a348ff5a67793d1720851 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 30 Jul 2018 16:35:42 -0400 Subject: [PATCH 8/8] add test of proper bytesize for local memory & remove 'get()' call on local args --- loopy/target/pyopencl_execution.py | 5 ++++- test/test_loopy.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 57ef0ed66..9a0f85941 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -212,7 +212,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def generate_output_handler( self, gen, options, kernel, implemented_data_info): - from loopy.kernel.data import KernelArgument + from loopy.kernel.data import KernelArgument, AddressSpace if not options.no_numpy: gen("if out_host is None and (_lpy_encountered_numpy " @@ -226,6 +226,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for arg in implemented_data_info: if not issubclass(arg.arg_class, KernelArgument): continue + elif arg.address_space == AddressSpace.LOCAL: + # local memory doesn't have a .get() + continue is_written = arg.base_name in kernel.get_written_variables() if is_written: diff --git a/test/test_loopy.py b/test/test_loopy.py index c99ffec92..3a49429d3 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2895,6 +2895,19 @@ def test_local_arg_execution(ctx_factory): knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0') lp.auto_test_vs_ref(ref_knl, ctx, knl) + # call directly w/ cl local memory + from pytools import product + nbytes = product(si for si in knl.arg_dict['tmp'].shape) * \ + knl.arg_dict['tmp'].dtype.itemsize + from pyopencl import LocalMemory + queue = cl.CommandQueue(ctx) + tmp = LocalMemory(nbytes) + knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32)) + # and that we get an error if we're short on memory + tmp = LocalMemory(nbytes - 1) + with pytest.raises(TypeError): + knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32)) + # try with 2 local args for compatibility knl = lp.make_kernel( "{[i,i0]: 0 <= i,i0 < 10}", -- GitLab