diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 015c82dd1fa5f81665f062f974149c2e93a324a9..0ce3db81761cdc9b468bc7b964da484058807705 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -80,7 +80,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \ - TemporaryVariable, ConstantArg + TemporaryVariable, ConstantArg, AddressSpace from pymbolic import evaluate @@ -110,6 +110,15 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \ or arg.arg_class is ConstantArg: + + if arg.address_space == AddressSpace.LOCAL: + # generally local kernel arguments are used as dynamically sized + # memory but you can't pass data from the host to the local arg, + # so there are no "reference" local arguments + ref_args[arg.name] = None + ref_arg_data.append(None) + continue + if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " "testing" % arg.name) @@ -197,7 +206,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\ - TemporaryVariable, ConstantArg + TemporaryVariable, ConstantArg, AddressSpace from pymbolic import evaluate @@ -232,6 +241,11 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): elif arg.arg_class is ArrayArg or\ arg.arg_class is ConstantArg: + + if arg.address_space == AddressSpace.LOCAL: + # handled in invocation + continue + shape = evaluate(arg.unvec_shape, parameters) strides = evaluate(arg.unvec_strides, parameters) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..ebb43520b08842c0c9df5bf8a6b3eb4150584a2e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -51,6 +51,10 @@ class ImplementedDataInfo(ImmutableRecord): .. attribute:: arg_class + .. attribute:: address_space + The address-space of the array. + May be *None* for non-array arguments + .. attribute:: base_name The user-facing name of the underlying array. @@ -86,7 +90,7 @@ class ImplementedDataInfo(ImmutableRecord): unvec_shape=None, unvec_strides=None, offset_for_name=None, stride_for_name_and_axis=None, allows_offset=None, - is_written=None): + is_written=None, address_space=None): from loopy.types import LoopyType assert isinstance(dtype, LoopyType) @@ -103,7 +107,8 @@ class ImplementedDataInfo(ImmutableRecord): offset_for_name=offset_for_name, stride_for_name_and_axis=stride_for_name_and_axis, allows_offset=allows_offset, - is_written=is_written) + is_written=is_written, + address_space=address_space) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b0033808c616829e60615b92849fa6353751a82..d97d0fc73a24fe2f83c0c90b1536402d924a3323 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -978,7 +978,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def global_var_names(self): from loopy.kernel.data import AddressSpace - from loopy.kernel.data import ArrayArg return ( set( diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 186597c64734b54b8d08f0db43b57826d79f9567..372bf6eca36d04c1c44b391a41742e2863d62f7a 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1058,11 +1058,11 @@ class ArrayBase(ImmutableRecord): full_name, stride_impl_axis), is_written=False)) + space = getattr(self, 'address_space', None) yield ImplementedDataInfo( target=target, name=full_name, base_name=self.name, - arg_class=type(self), dtype=dtype, shape=shape, @@ -1070,8 +1070,8 @@ class ArrayBase(ImmutableRecord): unvec_shape=unvec_shape, unvec_strides=tuple(unvec_strides), allows_offset=bool(self.offset), - - is_written=is_written) + is_written=is_written, + address_space=space) import loopy as lp diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf20577bf995b351f90615dd18f7bd0681be0b..7aade780c725d5e2c45a6f64047f8c7fecb89b54 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -364,7 +364,7 @@ class ExecutionWrapperGeneratorBase(object): self, gen, kernel, implemented_data_info, options): import loopy as lp - from loopy.kernel.data import KernelArgument + from loopy.kernel.data import KernelArgument, AddressSpace from loopy.kernel.array import ArrayBase from loopy.symbolic import StringifyMapper from loopy.types import NumpyType @@ -463,104 +463,121 @@ class ExecutionWrapperGeneratorBase(object): gen("if True:") with Indentation(gen): - gen("if %s.dtype != %s:" - % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) - with Indentation(gen): - gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" - % (arg.name, arg.dtype, arg.name)) - - # {{{ generate shape checking code - - def strify_allowing_none(shape_axis): - if shape_axis is None: - return "None" - else: - return strify(shape_axis) - - def strify_tuple(t): - if len(t) == 0: - return "()" - else: - return "(%s,)" % ", ".join( - strify_allowing_none(sa) - for sa in t) - - shape_mismatch_msg = ( - "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " - "%% (%s.shape, %s))" - % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - - if kernel_arg.shape is None: - pass - - elif any(shape_axis is None for shape_axis in kernel_arg.shape): - gen("if len(%s.shape) != %s:" - % (arg.name, len(arg.unvec_shape))) + # check for local memory + if arg.address_space == AddressSpace.LOCAL: + from numpy import prod + # simply check that the argument size is sufficient + expected_size = prod(arg.shape) * arg.dtype.itemsize + gen("if %s.size < %d:" % (arg.name, expected_size)) + with Indentation(gen): + gen("raise TypeError(\"size mismatch on local argument " + "'%s' (got: %%d bytes, expected: %d bytes)\" %% " + "%s.size)" % ( + arg.name, expected_size, arg.name)) + else: + + gen("if %s.dtype != %s:" + % (arg.name, self.python_dtype_str( + kernel_arg.dtype.numpy_dtype))) with Indentation(gen): - gen(shape_mismatch_msg) + gen("raise TypeError(\"dtype mismatch on argument '%s' " + "(got: %%s, expected: %s)\" %% %s.dtype)" + % (arg.name, arg.dtype, arg.name)) - for i, shape_axis in enumerate(arg.unvec_shape): + # {{{ generate shape checking code + + def strify_allowing_none(shape_axis): if shape_axis is None: - continue + return "None" + else: + return strify(shape_axis) - gen("if %s.shape[%d] != %s:" - % (arg.name, i, strify(shape_axis))) + def strify_tuple(t): + if len(t) == 0: + return "()" + else: + return "(%s,)" % ", ".join( + strify_allowing_none(sa) + for sa in t) + + shape_mismatch_msg = ( + "raise TypeError(\"shape mismatch on argument '%s' " + "(got: %%s, expected: %%s)\" " + "%% (%s.shape, %s))" + % (arg.name, arg.name, strify_tuple( + arg.unvec_shape))) + + if kernel_arg.shape is None: + pass + + elif any(shape_axis is None + for shape_axis in kernel_arg.shape): + gen("if len(%s.shape) != %s:" + % (arg.name, len(arg.unvec_shape))) with Indentation(gen): gen(shape_mismatch_msg) - else: # not None, no Nones in tuple - gen("if %s.shape != %s:" - % (arg.name, strify(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) + for i, shape_axis in enumerate(arg.unvec_shape): + if shape_axis is None: + continue - # }}} + gen("if %s.shape[%d] != %s:" + % (arg.name, i, strify(shape_axis))) + with Indentation(gen): + gen(shape_mismatch_msg) - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - sym_strides = tuple( - itemsize*s_i for s_i in arg.unvec_strides) + else: # not None, no Nones in tuple + gen("if %s.shape != %s:" + % (arg.name, strify(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) - ndim = len(arg.unvec_shape) - shape = ["_lpy_shape_%d" % i for i in range(ndim)] - strides = ["_lpy_stride_%d" % i for i in range(ndim)] + # }}} - gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) - gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) + if arg.unvec_strides and kernel_arg.dim_tags: + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + sym_strides = tuple( + itemsize*s_i for s_i in arg.unvec_strides) - gen("if not %s:" - % self.get_strides_check_expr( - shape, strides, - (strify(s) for s in sym_strides))) - with Indentation(gen): - gen("_lpy_got = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s.strides) " - "if dim > 1)" - % (arg.name, arg.name)) - gen("_lpy_expected = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s) " - "if dim > 1)" - % (arg.name, strify_tuple(sym_strides))) - - gen("raise TypeError(\"strides mismatch on " - "argument '%s' " - "(after removing unit length dims, " - "got: %%s, expected: %%s)\" " - "%% (_lpy_got, _lpy_expected))" - % arg.name) - - if not arg.allows_offset: - gen("if hasattr(%s, 'offset') and %s.offset:" % ( - arg.name, arg.name)) - with Indentation(gen): - gen("raise ValueError(\"Argument '%s' does not " - "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." - "\")" % arg.name) - gen("") + ndim = len(arg.unvec_shape) + shape = ["_lpy_shape_%d" % i for i in range(ndim)] + strides = ["_lpy_stride_%d" % i for i in range(ndim)] + + gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) + gen("(%s,) = %s.strides" % ( + ", ".join(strides), arg.name)) + + gen("if not %s:" + % self.get_strides_check_expr( + shape, strides, + (strify(s) for s in sym_strides))) + with Indentation(gen): + gen("_lpy_got = tuple(stride " + "for (dim, stride) in " + "zip(%s.shape, %s.strides) " + "if dim > 1)" + % (arg.name, arg.name)) + gen("_lpy_expected = tuple(stride " + "for (dim, stride) in zip(%s.shape, %s) " + "if dim > 1)" + % (arg.name, strify_tuple(sym_strides))) + + gen("raise TypeError(\"strides mismatch on " + "argument '%s' " + "(after removing unit length dims, " + "got: %%s, expected: %%s)\" " + "%% (_lpy_got, _lpy_expected))" + % arg.name) + + if not arg.allows_offset: + gen("if hasattr(%s, 'offset') and %s.offset:" % ( + arg.name, arg.name)) + with Indentation(gen): + gen("raise ValueError(\"Argument '%s' does not " + "allow arrays with offsets. Try passing " + "default_offset=loopy.auto to make_kernel()." + "\")" % arg.name) + gen("") # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 27be619870de6939cf3de06751dea65a3fd558c0..9a0f859417ecaed20a4f013934311ba1a4197663 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -63,8 +63,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle non-numpy args def handle_non_numpy_arg(self, gen, arg): + from loopy.kernel.data import AddressSpace + is_local = arg.address_space == AddressSpace.LOCAL gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): + if is_local: + gen("raise Exception('Cannot pass numpy data directly to a " + "__local argument.')") + gen("# synchronous, nothing to worry about") gen("%s = _lpy_cl_array.to_device(" "queue, %s, allocator=allocator)" @@ -72,7 +78,12 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("_lpy_encountered_numpy = True") gen("elif %s is not None:" % arg.name) with Indentation(gen): - gen("_lpy_encountered_dev = True") + if is_local: + gen("assert isinstance(%s, _lpy_cl.LocalMemory), 'Arguments with " + "local scope must either be None or an instance of a " + "pyopencl.LocalMemory object.'" % arg.name) + else: + gen("_lpy_encountered_dev = True") gen("") @@ -85,6 +96,15 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): Handle allocation of non-specified arguements for pyopencl execution """ from pymbolic import var + from loopy.kernel.data import AddressSpace + + if arg.address_space == AddressSpace.LOCAL: + # handle local argument allocations + from numpy import prod + gen('# create a properly sized LocalMemory object') + gen('%s = _lpy_cl.LocalMemory(%d)' % ( + arg.name, prod(arg.shape) * arg.dtype.itemsize)) + return num_axes = len(arg.strides) for i in range(num_axes): @@ -192,7 +212,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def generate_output_handler( self, gen, options, kernel, implemented_data_info): - from loopy.kernel.data import KernelArgument + from loopy.kernel.data import KernelArgument, AddressSpace if not options.no_numpy: gen("if out_host is None and (_lpy_encountered_numpy " @@ -206,6 +226,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for arg in implemented_data_info: if not issubclass(arg.arg_class, KernelArgument): continue + elif arg.address_space == AddressSpace.LOCAL: + # local memory doesn't have a .get() + continue is_written = arg.base_name in kernel.get_written_variables() if is_written: @@ -236,7 +259,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen.add_to_preamble(codegen_result.host_code()) def get_arg_pass(self, arg): - return "%s.base_data" % arg.name + from loopy.kernel.data import AddressSpace + is_local = arg.address_space == AddressSpace.LOCAL + return "%s%s" % (arg.name, '.base_data' if not is_local else '') # }}} diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701f4f23a5b1c66b1559bf6c4879425902..b695c1327278e2e8663e16b6bd8a2a72d62bf63c 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -336,9 +336,10 @@ class DifferentiationContext(object): if var_name in self.kernel.arg_dict: self.new_args.append( - lp.GlobalArg( + lp.ArrayArg( new_var_name, arg.dtype, + address_space=self.kernel.arg_dict[var_name].address_space, shape=shape, dim_tags=dim_tags, )) diff --git a/test/test_loopy.py b/test/test_loopy.py index accf9c1dff5a1f660871dd63d6af3337aced6490..3a49429d3f3bd9903157efee4398c4f6608431b6 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2872,6 +2872,66 @@ def test_half_complex_conditional(ctx_factory): knl(queue) +def test_local_arg_execution(ctx_factory): + ctx = ctx_factory() + from loopy.kernel.data import AddressSpace + local = AddressSpace.LOCAL + + # simple example, allow the user to pass in a workspace local array + knl = lp.make_kernel( + "{[i,i0]: 0 <= i,i0 < 10}", + """ + tmp[i] = i {id=init, dep=*} + ... lbarrier {id=barrier, mem_kind=local, dep=init} + out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=barrier, nosync=init} + """, + [lp.ArrayArg('tmp', address_space=local, shape=(10,), + dtype=np.int32), + lp.GlobalArg('out', shape=(10,), dtype=np.int32)] + ) + + # get vectorized form + ref_knl = knl + knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0') + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + # call directly w/ cl local memory + from pytools import product + nbytes = product(si for si in knl.arg_dict['tmp'].shape) * \ + knl.arg_dict['tmp'].dtype.itemsize + from pyopencl import LocalMemory + queue = cl.CommandQueue(ctx) + tmp = LocalMemory(nbytes) + knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32)) + # and that we get an error if we're short on memory + tmp = LocalMemory(nbytes - 1) + with pytest.raises(TypeError): + knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32)) + + # try with 2 local args for compatibility + knl = lp.make_kernel( + "{[i,i0]: 0 <= i,i0 < 10}", + """ + for i + tmp[i] = i {id=init, dep=*} + tmp2[i] = i + 1 {id=init2, dep=*} + end + ... lbarrier {id=barrier, mem_kind=local, dep=init*} + for i0 + out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier, nosync=init*} + end + """, + [lp.ArrayArg('tmp', shape=(10,), dtype=np.int32, address_space=local), + lp.ArrayArg('tmp2', shape=(10,), dtype=np.int32, address_space=local), + lp.GlobalArg('out', shape=(10,), dtype=np.int32)] + ) + + # get vectorized form + ref_knl = knl + knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0') + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + def test_dep_cycle_printing_and_error(): # https://gitlab.tiker.net/inducer/loopy/issues/140 # This kernel has two dep cycles.