diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py index c877236595d55f02c2357aeae25a62f6c1f957a3..c61515baee605ee680abafaebcb510a07ca58f46 100644 --- a/loopy/codegen/expression.py +++ b/loopy/codegen/expression.py @@ -407,9 +407,14 @@ class LoopyCCodeMapper(RecursiveMapper): expr.aggregate.name, expr, len(index_expr), len(arg.strides))) - from pymbolic.primitives import Subscript + from pymbolic.primitives import Subscript, Variable + if arg.offset: + offset = Variable(arg.offset) + else: + offset = 0 + return base_impl( - Subscript(expr.aggregate, arg.offset+sum( + Subscript(expr.aggregate, offset+sum( stride*expr_i for stride, expr_i in zip( arg.strides, index_expr))), enclosing_prec, type_context) @@ -450,9 +455,14 @@ class LoopyCCodeMapper(RecursiveMapper): else: # GlobalArg + if arg.offset: + offset = Variable(arg.offset) + else: + offset = 0 + from pymbolic.primitives import Subscript return base_impl( - Subscript(expr.aggregate, arg.offset+expr.index), + Subscript(expr.aggregate, offset+expr.index), enclosing_prec, type_context) elif expr.aggregate.name in self.kernel.temporary_variables: diff --git a/loopy/compiled.py b/loopy/compiled.py index 59630a9a41d800fd6da654dca0ea8eea60f4bc58..e844cf87ac30ffe32c438e9ed5d15a0073b52918 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -62,6 +62,9 @@ def _arg_matches_spec(arg, val, other_args): "(got: %s, expected: %s)" % (arg.name, val.strides, strides)) + if val.offset != 0 and arg.offset == 0: + raise ValueError("argument '%s' does not allow offset" % arg.name) + return True # }}} @@ -120,20 +123,39 @@ class CompiledKernel: self.options = options @memoize_method - def get_kernel_info(self, dtype_mapping_set): + def get_kernel_info(self, arg_to_dtype_set, arg_to_has_offset_set): kernel = self.kernel + import loopy as lp from loopy.kernel.tools import ( add_argument_dtypes, get_arguments_with_incomplete_dtype) - if get_arguments_with_incomplete_dtype(kernel): - if dtype_mapping_set is not 
None: - kernel = add_argument_dtypes(kernel, dict(dtype_mapping_set)) + if arg_to_dtype_set: + kernel = add_argument_dtypes(kernel, dict(arg_to_dtype_set)) from loopy.preprocess import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) + if arg_to_has_offset_set: + arg_to_has_offset = dict(arg_to_has_offset_set) + + vng = kernel.get_var_name_generator() + + new_args = [] + for arg in kernel.args: + if getattr(arg, "offset", None) is lp.auto: + if arg_to_has_offset[arg.name]: + offset_arg_name = vng(arg.name+"_offset") + new_args.append(arg.copy(offset=offset_arg_name)) + new_args.append(lp.ValueArg(offset_arg_name, kernel.index_dtype)) + else: + new_args.append(arg.copy(offset=0)) + else: + new_args.append(arg) + + kernel = kernel.copy(args=new_args) + import loopy as lp if kernel.schedule is None: kernel = _get_kernel_from_iterable( @@ -156,8 +178,8 @@ class CompiledKernel: ) @memoize_method - def get_cl_kernel(self, dtype_mapping_set): - kernel_info = self.get_kernel_info(dtype_mapping_set) + def get_cl_kernel(self, arg_to_dtype_set, arg_to_has_offset_set): + kernel_info = self.get_kernel_info(arg_to_dtype_set, arg_to_has_offset_set) kernel = kernel_info.kernel from loopy.codegen import generate_code @@ -199,17 +221,19 @@ class CompiledKernel: # {{{ debugging aids - def get_code(self, dtype_dict=None): - if dtype_dict is not None: - dtype_dict = frozenset(dtype_dict.items()) + def get_code(self, arg_to_dtype=None, arg_to_has_offset=None): + if arg_to_dtype is not None: + arg_to_dtype = frozenset(arg_to_dtype.iteritems()) + if arg_to_has_offset is not None: + arg_to_has_offset = frozenset(arg_to_has_offset.iteritems()) - kernel_info = self.get_kernel_info(dtype_dict) + kernel_info = self.get_kernel_info(arg_to_dtype, arg_to_has_offset) from loopy.codegen import generate_code return generate_code(kernel_info.kernel, **self.codegen_kwargs) - def get_highlighted_code(self, dtype_dict=None): - return 
get_highlighted_code(self.get_code(dtype_dict)) + def get_highlighted_code(self, arg_to_dtype=None, arg_to_has_offset=None): + return get_highlighted_code(self.get_code(arg_to_dtype, arg_to_has_offset)) @property def code(self): @@ -224,6 +248,10 @@ class CompiledKernel: def __call__(self, queue, **kwargs): """If all array arguments are :mod:`numpy` arrays, defaults to returning numpy arrays as well. + + If you want an offset argument (see + :attr:`loopy.kernel.data.GlobalArg.offset`) to be set automatically, it + must occur *after* the corresponding array argument. """ allocator = kwargs.pop("allocator", None) @@ -234,20 +262,32 @@ class CompiledKernel: # {{{ process arg types, get cl kernel - dtype_dict = {} + import loopy as lp + + arg_to_dtype = {} + arg_to_has_offset = {} for arg in self.kernel.args: val = kwargs.get(arg.name) - if val is not None: + + if arg.dtype is None and val is not None: try: dtype = val.dtype except AttributeError: pass else: - dtype_dict[arg.name] = dtype + arg_to_dtype[arg.name] = dtype - kernel_info, cl_kernel = self.get_cl_kernel(frozenset(dtype_dict.iteritems())) + if getattr(arg, "offset", None) is lp.auto: + if val is not None: + has_offset = val.offset != 0 + else: + has_offset = False + arg_to_has_offset[arg.name] = has_offset + + kernel_info, cl_kernel = self.get_cl_kernel(frozenset(arg_to_dtype.iteritems()), + frozenset(arg_to_has_offset.iteritems())) kernel = kernel_info.kernel - del dtype_dict + del arg_to_dtype # }}} @@ -267,8 +307,22 @@ class CompiledKernel: val = kwargs_copy.pop(arg.name, None) - # automatically transfer host-side arrays if isinstance(arg, lp.GlobalArg): + if arg.offset: + # arg.offset must be a string at this point. + + # /!\ Tacit assumption: If you want the offset argument to + # be set automatically, it must occur *after* the + # corresponding array argument. 
+ + ofs, remdr = divmod(val.offset, val.dtype.itemsize) + assert remdr == 0 + kwargs_copy.setdefault(arg.offset, ofs) + del ofs + del remdr + + # {{{ automatically transfer host-side arrays, if needed + if isinstance(val, np.ndarray): # synchronous, so nothing to worry about val = cl_array.to_device(queue, val, allocator=allocator) @@ -279,6 +333,8 @@ class CompiledKernel: "performing implicit transfer" % arg.name, stacklevel=2) + # }}} + if val is None: if not is_written: raise TypeError("must supply input argument '%s'" % arg.name) @@ -311,7 +367,7 @@ class CompiledKernel: outputs.append(val) if isinstance(arg, lp.GlobalArg): - args.append(val.data) + args.append(val.base_data) else: args.append(val) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 8d340114294b53dc822d9e2fe0dd0309e32e8ce4..12c7b196ad5919e918e9f68f138fce29697d8859 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -439,7 +439,7 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables, s # It's not a temp var, and thereby not a domain parameter--the only # other writable type of variable is an argument. 
- kernel_args.append(GlobalArg(arg_name, shape=lp.auto)) + kernel_args.append(GlobalArg(arg_name, shape=lp.auto, offset=lp.auto)) continue irank = find_index_rank(arg_name) @@ -447,7 +447,7 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables, s # read-only, no indices kernel_args.append(ValueArg(arg_name)) else: - kernel_args.append(GlobalArg(arg_name, shape=lp.auto)) + kernel_args.append(GlobalArg(arg_name, shape=lp.auto, offset=lp.auto)) return kernel_args diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index fd0c5b6328b845256b69bcf94e999a7bacd87970..c174ca70161e2fe06634aaa374207ba751100fa1 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -161,6 +161,16 @@ class ShapedArg(KernelArgument): commas, in which case multiple arguments, each with identical properties are created for each name. + :arg dtype: the :class:`numpy.dtype` of the array. + If this is *None*, :mod:`loopy` will try to continue + without knowing the type of this array. + + Note that some operations, such as :func:`loopy.add_padding`, + require this information to work. + + :class:`loopy.CompiledKernel` will automatically compile a kernel + with the right dtype when called with a concrete array on a kernel + with an argument whose *dtype* is *None*. :arg shape: like :attr:`numpy.ndarray.shape`. Also allowed to be :class:`loopy.auto`, in which case shape is determined by finding the @@ -180,8 +190,13 @@ class ShapedArg(KernelArgument): string that can be parsed to such an expression. :arg order: "F" or "C" for C (row major) or Fortran (column major) - :arg offset: Offset from the beginning of the vector from which - the strides are counted. + :arg offset: Offset from the beginning of the buffer to the point from + which the strides are counted. May be one of: + + * 0 + * a string (that is interpreted as an argument name). + * :class:`loopy.auto`, in which case this information is added at run time + by :class:`loopy.CompiledKernel`. 
""" if dtype is not None: dtype = np.dtype(dtype) diff --git a/test/test_loopy.py b/test/test_loopy.py index 6a2aa8676e8eb59bcb3dcfca467dba4eeeb93fcd..90dac0f97d4bfea6f4b76cdfc3988dce8ca5cac6 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -28,6 +28,7 @@ THE SOFTWARE. import numpy as np import loopy as lp import pyopencl as cl +import pyopencl.clrandom import logging from pyopencl.tools import pytest_generate_tests_for_pyopencl \ @@ -975,11 +976,6 @@ def test_double_sum(ctx_factory): "a = sum((i,j), i*j)", "b = sum(i, sum(j, i*j))", ], - [ - lp.GlobalArg("a", dtype, shape=()), - lp.GlobalArg("b", dtype, shape=()), - lp.ValueArg("n", np.int32, approximately=1000), - ], assumptions="n>=1") cknl = lp.CompiledKernel(ctx, knl) @@ -1182,6 +1178,35 @@ def test_triangle_domain(ctx_factory): +def test_array_with_offset(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 5 + + knl = lp.make_kernel(ctx.devices[0], [ + "{[i,j]: 0<=i<n and 0<=j<m }", + ], + """ + b[i,j] = 2*a[i,j] + """, + assumptions="n>=1 and m>=1") + + cknl = lp.CompiledKernel(ctx, knl) + + a_full = cl.clrandom.rand(queue, (n, n), np.float64) + a = a_full[3:10] + + print cknl.get_highlighted_code({"a": a.dtype}, {"a": True, "b": False}) + evt, (b,) = cknl(queue, a=a, n=a.shape[0], m=a.shape[1]) + + import numpy.linalg as la + assert la.norm(b.get() - 2*a.get()) < 1e-13 + + + + if __name__ == "__main__": import sys if len(sys.argv) > 1: