From a26b5b9ce5516de8cb30dcbc02e3b00b534f905f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 3 Jun 2013 13:15:58 -0400
Subject: [PATCH] Support for slices and offsets in elementwise computations.

---
 pyopencl/algorithm.py   |  2 ++
 pyopencl/array.py       | 15 +++++---
 pyopencl/compyte        |  2 +-
 pyopencl/elementwise.py | 55 +++++++++++++++++-----------
 pyopencl/tools.py       | 79 +++++++++++++++++++++++++++++++++++------
 test/test_array.py      | 57 +++++++++++++++++------------
 6 files changed, 150 insertions(+), 60 deletions(-)

diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index beddbb2c..737666f4 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -113,6 +113,7 @@ def remove_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N
 
 # }}}
 
+
 # {{{ partition
 
 _partition_template = ScanTemplate(
@@ -523,6 +524,7 @@ class RadixSort(object):
 
 # }}}
 
+
 # {{{ generic parallel list builder
 
 # {{{ kernel template
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 7f4dd64d..22f49cf0 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -176,7 +176,8 @@ def elwise_kernel_runner(kernel_getter):
                 if not arg.flags.forc:
                     raise RuntimeError("only contiguous arrays may "
                             "be used as arguments to this operation")
-                actual_args.append(arg.data)
+                actual_args.append(arg.base_data)
+                actual_args.append(arg.offset)
                 wait_for.extend(arg.events)
             else:
                 actual_args.append(arg)
@@ -220,7 +221,7 @@ class ArrayHasOffsetError(ValueError):
     .. versionadded:: 2013.1
     """
 
-    def __init__(self, val="The operation you are attempting does not (yet?) "
+    def __init__(self, val="The operation you are attempting does not yet "
                 "support arrays that start at an offset from the beginning "
                 "of their buffer."):
         ValueError.__init__(self, val)
@@ -1477,10 +1478,16 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
 
+        from pytools import flatten
         knl(queue, gs, ls,
                 *([o.data for o in out[chunk_slice]]
-                    + [dest_indices.data, src_indices.data]
-                    + [i.data for i in arrays[chunk_slice]]
+                    + [dest_indices.base_data,
+                        dest_indices.offset,
+                        src_indices.base_data,
+                        src_indices.offset]
+                    + list(flatten(
+                        (i.base_data, i.offset)
+                        for i in arrays[chunk_slice]))
                     + src_offsets_list[chunk_slice]
                     + [src_indices.size]))
 
diff --git a/pyopencl/compyte b/pyopencl/compyte
index fc06f44a..479c0c6e 160000
--- a/pyopencl/compyte
+++ b/pyopencl/compyte
@@ -1 +1 @@
-Subproject commit fc06f44a14dc9658a460d001a4b4a5c8fd05b993
+Subproject commit 479c0c6e1daed38ca9153b1fd42e1e9fffb0a256
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 896b2c3e..22535990 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -112,8 +112,8 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         name="elwise_kernel", options=[], preamble="", use_range=False,
         **kwargs):
 
-    from pyopencl.tools import parse_arg_list
-    parsed_args = parse_arg_list(arguments)
+    from pyopencl.tools import parse_arg_list, get_arg_offset_adjuster_code
+    parsed_args = parse_arg_list(arguments, with_offset=True)
 
     auto_preamble = kwargs.pop("auto_preamble", True)
 
@@ -147,10 +147,12 @@ def get_elwise_kernel_and_types(context, arguments, operation,
     else:
         parsed_args.append(ScalarArg(np.intp, "n"))
 
+    loop_prep = kwargs.pop("loop_prep", "")
+    loop_prep = get_arg_offset_adjuster_code(parsed_args) + loop_prep
     prg = get_elwise_program(
         context, parsed_args, operation,
         name=name, options=options, preamble=preamble,
-        use_range=use_range, **kwargs)
+        use_range=use_range, loop_prep=loop_prep, **kwargs)
 
     from pyopencl.tools import get_arg_list_scalar_arg_dtypes
 
@@ -215,6 +217,15 @@ class ElementwiseKernel:
             name=self.name, options=self.options,
             use_range=use_range, **self.kwargs)
 
+        for arg in arg_descrs:
+            if isinstance(arg, VectorArg) and not arg.with_offset:
+                from warnings import warn
+                warn("ElementwiseKernel '%s' used with VectorArgs that do not "
+                        "have offset support enabled. This usage is deprecated. "
+                        "Just pass with_offset=True to VectorArg, everything should "
+                        "sort itself out automatically." % self.name,
+                        DeprecationWarning)
+
         if not [i for i, arg in enumerate(arg_descrs)
                 if isinstance(arg, VectorArg)]:
             raise RuntimeError(
@@ -244,7 +255,9 @@ class ElementwiseKernel:
                 if repr_vec is None:
                     repr_vec = arg
 
-                invocation_args.append(arg.data)
+                invocation_args.append(arg.base_data)
+                if arg_descr.with_offset:
+                    invocation_args.append(arg.offset)
             else:
                 invocation_args.append(arg)
 
@@ -319,7 +332,7 @@ class ElementwiseTemplate(KernelTemplateBase):
                 type_aliases, var_values, context, options)
 
         arg_list = renderer.render_argument_list(
-                self.arguments, more_arguments)
+                self.arguments, more_arguments, with_offset=True)
         type_decl_preamble = renderer.get_type_decl_preamble(
                 context.devices[0], declare_types, arg_list)
 
@@ -344,11 +357,11 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
             "tp": dtype_to_ctype(dtype),
             }
 
-    args = ([VectorArg(dtype, "dest" + str(i))
+    args = ([VectorArg(dtype, "dest" + str(i), with_offset=True)
              for i in range(vec_count)]
-            + [VectorArg(dtype, "src" + str(i))
+            + [VectorArg(dtype, "src" + str(i), with_offset=True)
                for i in range(vec_count)]
-            + [VectorArg(idx_dtype, "idx")])
+            + [VectorArg(idx_dtype, "idx", with_offset=True)])
     body = (
             ("%(idx_tp)s src_idx = idx[i];\n" % ctx)
             + "\n".join(
@@ -369,10 +382,10 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
             VectorArg(dtype, "dest%d" % i)
                 for i in range(vec_count)
             ] + [
-                VectorArg(idx_dtype, "gmem_dest_idx"),
-                VectorArg(idx_dtype, "gmem_src_idx"),
+                VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True),
+                VectorArg(idx_dtype, "gmem_src_idx", with_offset=True),
             ] + [
-                VectorArg(dtype, "src%d" % i)
+                VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
             ] + [
                 ScalarArg(idx_dtype, "offset%d" % i)
@@ -404,12 +417,12 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             }
 
     args = [
-            VectorArg(dtype, "dest%d" % i)
+            VectorArg(dtype, "dest%d" % i, with_offset=True)
                 for i in range(vec_count)
             ] + [
-                VectorArg(idx_dtype, "gmem_dest_idx"),
+                VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True),
             ] + [
-                VectorArg(dtype, "src%d" % i)
+                VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
             ]
 
@@ -458,18 +471,18 @@ def get_linear_combination_kernel(summand_descriptors,
             preamble.append(
                     "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                     % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i))
-            args.append(VectorArg(vector_dtype, "x%d" % i))
+            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))
             tex_names.append("tex_a%d" % i)
             loop_prep.append(
                     "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                     % (dtype_to_ctype(scalar_dtype), i, i))
         else:
             args.append(ScalarArg(scalar_dtype, "a%d" % i))
-            args.append(VectorArg(vector_dtype, "x%d" % i))
+            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))
 
         summands.append("a%d*x%d[i]" % (i, i))
 
-    args.append(VectorArg(dtype_z, "z"))
+    args.append(VectorArg(dtype_z, "z", with_offset=True))
     args.append(ScalarArg(np.uintp, "n"))
 
     mod = get_elwise_module(args,
@@ -831,10 +844,10 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
 @context_dependent_memoize
 def get_if_positive_kernel(context, crit_dtype, dtype):
     return get_elwise_kernel(context, [
-            VectorArg(dtype, "result"),
-            VectorArg(crit_dtype, "crit"),
-            VectorArg(dtype, "then_"),
-            VectorArg(dtype, "else_"),
+            VectorArg(dtype, "result", with_offset=True),
+            VectorArg(crit_dtype, "crit", with_offset=True),
+            VectorArg(dtype, "then_", with_offset=True),
+            VectorArg(dtype, "else_", with_offset=True),
             ],
             "result[i] = crit[i] > 0 ? then_[i] : else_[i]",
             name="if_positive")
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index bc935b2a..47c00140 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -313,8 +313,19 @@ class DtypedArgument(Argument):
 
 
 class VectorArg(DtypedArgument):
+    def __init__(self, dtype, name, with_offset=False):
+        DtypedArgument.__init__(self, dtype, name)
+        self.with_offset = with_offset
+
     def declarator(self):
-        return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name)
+        if self.with_offset:
+            # Two underscores -> less likelihood of a name clash.
+            return "__global %s *%s__base, long %s__offset" % (
+                    dtype_to_ctype(self.dtype), self.name, self.name)
+        else:
+            result = "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name)
+
+        return result
 
 
 class ScalarArg(DtypedArgument):
@@ -331,7 +342,7 @@ class OtherArg(Argument):
         return self.decl
 
 
-def parse_c_arg(c_arg):
+def parse_c_arg(c_arg, with_offset=False):
     for aspace in ["__local", "__constant"]:
         if aspace in c_arg:
             raise RuntimeError("cannot deal with local or constant "
@@ -339,11 +350,17 @@ def parse_c_arg(c_arg):
 
     c_arg = c_arg.replace("__global", "")
 
+    if with_offset:
+        vec_arg_factory = lambda dtype, name: \
+                VectorArg(dtype, name, with_offset=True)
+    else:
+        vec_arg_factory = VectorArg
+
     from pyopencl.compyte.dtypes import parse_c_arg_backend
-    return parse_c_arg_backend(c_arg, ScalarArg, VectorArg)
+    return parse_c_arg_backend(c_arg, ScalarArg, vec_arg_factory)
 
 
-def parse_arg_list(arguments):
+def parse_arg_list(arguments, with_offset=False):
     """Parse a list of kernel arguments. *arguments* may be a comma-separate
     list of C declarators in a string, a list of strings representing C
     declarators, or :class:`Argument` objects.
@@ -355,7 +372,7 @@ def parse_arg_list(arguments):
     def parse_single_arg(obj):
         if isinstance(obj, str):
             from pyopencl.tools import parse_c_arg
-            return parse_c_arg(obj)
+            return parse_c_arg(obj, with_offset=with_offset)
         else:
             return obj
 
@@ -365,15 +382,34 @@ def parse_arg_list(arguments):
 def get_arg_list_scalar_arg_dtypes(arg_types):
     result = []
 
-    from pyopencl.tools import ScalarArg
     for arg_type in arg_types:
         if isinstance(arg_type, ScalarArg):
             result.append(arg_type.dtype)
-        else:
+        elif isinstance(arg_type, VectorArg):
             result.append(None)
+            if arg_type.with_offset:
+                result.append(np.int64)
+        else:
+            raise RuntimeError("arg type not understood: %s" % type(arg_type))
 
     return result
 
+
+def get_arg_offset_adjuster_code(arg_types):
+    result = []
+
+    for arg_type in arg_types:
+        if isinstance(arg_type, VectorArg) and arg_type.with_offset:
+            result.append("__global %(type)s *%(name)s = "
+                    "(__global %(type)s *) "
+                    "((__global char *) %(name)s__base + %(name)s__offset);"
+                    % dict(
+                        type=dtype_to_ctype(arg_type.dtype),
+                        name=arg_type.name))
+
+    return "\n".join(result)
+
+
 # }}}
 
 
@@ -663,9 +699,19 @@ class _MakoTextTemplate:
 
 
 class _ArgumentPlaceholder:
-    def __init__(self, typename, name):
+    """A placeholder for subclasses of :class:`DtypedArgument`. This is needed
+    because the concrete dtype of the argument is not known at template
+    creation time--it may be a type alias that will only be filled in
+    at run time. Placeholder instances stand in for the final argument
+    objects until all types are known.
+
+    See also :meth:`_TemplateRenderer.render_arg`.
+    """
+
+    def __init__(self, typename, name, **extra_kwargs):
         self.typename = typename
         self.name = name
+        self.extra_kwargs = extra_kwargs
 
 
 class _VectorArgPlaceholder(_ArgumentPlaceholder):
@@ -721,11 +767,16 @@ class _TemplateRenderer(object):
     def render_arg(self, arg_placeholder):
         return arg_placeholder.target_class(
                 self.parse_type(arg_placeholder.typename),
-                arg_placeholder.name)
+                arg_placeholder.name,
+                **arg_placeholder.extra_kwargs)
 
     _C_COMMENT_FINDER = re.compile(r"/\*.*?\*/")
 
-    def render_argument_list(self, *arg_lists):
+    def render_argument_list(self, *arg_lists, **kwargs):
+        with_offset = kwargs.pop("with_offset", False)
+        if kwargs:
+            raise TypeError("unrecognized kwargs: " + ", ".join(kwargs))
+
         all_args = []
 
         for arg_list in arg_lists:
@@ -740,6 +791,12 @@ class _TemplateRenderer(object):
             else:
                 all_args.extend(arg_list)
 
+        if with_offset:
+            vec_arg_factory = lambda typename, name: \
+                    _VectorArgPlaceholder(typename, name, with_offset=True)
+        else:
+            vec_arg_factory = _VectorArgPlaceholder
+
         from pyopencl.compyte.dtypes import parse_c_arg_backend
         parsed_args = []
         for arg in all_args:
@@ -749,7 +806,7 @@ class _TemplateRenderer(object):
                     continue
 
                 ph = parse_c_arg_backend(arg,
-                        _ScalarArgPlaceholder, _VectorArgPlaceholder,
+                        _ScalarArgPlaceholder, vec_arg_factory,
                         name_to_dtype=lambda x: x)
                 parsed_arg = self.render_arg(ph)
 
diff --git a/test/test_array.py b/test/test_array.py
index b292b132..05b6d8c8 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -30,13 +30,11 @@ import pytools.test
 import pyopencl as cl
 import pyopencl.array as cl_array
 import pyopencl.tools as cl_tools
-from pyopencl.tools import pytest_generate_tests_for_pyopencl \
-        as pytest_generate_tests
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 from pyopencl.characterize import has_double_support
 
 
-
-
 # {{{ helpers
 
 TO_REAL = {
@@ -44,6 +42,7 @@ TO_REAL = {
         np.dtype(np.complex128): np.float64
         }
 
+
 def general_clrand(queue, shape, dtype):
     from pyopencl.clrandom import rand as clrand
 
@@ -69,6 +68,7 @@ def make_random_array(queue, dtype, size):
 
 # }}}
 
+
 # {{{ dtype-related
 
 @pytools.test.mark_test.opencl
@@ -80,13 +80,14 @@ def test_basic_complex(ctx_factory):
 
     size = 500
 
-    ary =  (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64)
+    ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64)
             + rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) * 1j)
     c = np.complex64(5+7j)
 
     host_ary = ary.get()
     assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary)
 
+
 @pytools.test.mark_test.opencl
 def test_mix_complex(ctx_factory):
     context = ctx_factory()
@@ -157,6 +158,7 @@ def test_mix_complex(ctx_factory):
 
                     assert correct
 
+
 @pytools.test.mark_test.opencl
 def test_pow_neg1_vs_inv(ctx_factory):
     ctx = ctx_factory()
@@ -189,14 +191,20 @@ def test_vector_fill(ctx_factory):
 
     a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4)
 
+
 @pytools.test.mark_test.opencl
 def test_absrealimag(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    def real(x): return x.real
-    def imag(x): return x.imag
-    def conj(x): return x.conj()
+    def real(x):
+        return x.real
+
+    def imag(x):
+        return x.imag
+
+    def conj(x):
+        return x.conj()
 
     n = 111
     for func in [abs, real, imag, conj]:
@@ -216,7 +224,8 @@ def test_absrealimag(ctx_factory):
 
 # }}}
 
-# {{{ operands
+
+# {{{ operators
 
 @pytools.test.mark_test.opencl
 def test_pow_array(ctx_factory):
@@ -254,9 +263,9 @@ def test_multiply(ctx_factory):
 
     for sz in [10, 50000]:
         for dtype, scalars in [
-            (np.float32, [2]),
-            (np.complex64, [2j]),
-            ]:
+                (np.float32, [2]),
+                (np.complex64, [2j]),
+                ]:
             for scalar in scalars:
                 a_gpu = make_random_array(queue, dtype, sz)
                 a = a_gpu.get()
@@ -390,6 +399,7 @@ def test_divide_array(ctx_factory):
 
 # }}}
 
+
 # {{{ RNG
 
 @pytools.test.mark_test.opencl
@@ -433,6 +443,7 @@ def test_random(ctx_factory):
 
 # }}}
 
+
 # {{{ misc
 
 @pytools.test.mark_test.opencl
@@ -443,6 +454,7 @@ def test_numpy_integer_shape(ctx_factory):
     cl_array.empty(queue, np.int32(17), np.float32)
     cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32)
 
+
 @pytools.test.mark_test.opencl
 def test_len(ctx_factory):
     context = ctx_factory()
@@ -452,6 +464,7 @@ def test_len(ctx_factory):
     a_cpu = cl_array.to_device(queue, a)
     assert len(a_cpu) == 10
 
+
 @pytools.test.mark_test.opencl
 def test_stride_preservation(ctx_factory):
     context = ctx_factory()
@@ -464,6 +477,7 @@ def test_stride_preservation(ctx_factory):
     print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous)
     assert np.allclose(AT_GPU.get(), AT)
 
+
 @pytools.test.mark_test.opencl
 def test_nan_arithmetic(ctx_factory):
     context = ctx_factory()
@@ -489,6 +503,7 @@ def test_nan_arithmetic(ctx_factory):
 
     assert (np.isnan(ab) == np.isnan(ab_gpu)).all()
 
+
 @pytools.test.mark_test.opencl
 def test_mem_pool_with_arrays(ctx_factory):
     context = ctx_factory()
@@ -501,6 +516,7 @@ def test_mem_pool_with_arrays(ctx_factory):
     assert a_dev.allocator is mem_pool
     assert b_dev.allocator is mem_pool
 
+
 @pytools.test.mark_test.opencl
 def test_view(ctx_factory):
     context = ctx_factory()
@@ -523,37 +539,32 @@ def test_view(ctx_factory):
 
 # }}}
 
+
 @pytools.test.mark_test.opencl
-def no_test_slice(ctx_factory):
+def test_slice(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
     from pyopencl.clrandom import rand as clrand
 
     l = 20000
-    a_gpu = clrand(queue, (l,))
+    a_gpu = clrand(queue, (l,), dtype=np.float32)
     a = a_gpu.get()
 
     from random import randrange
-    for i in range(200):
+    for i in range(20):
         start = randrange(l)
         end = randrange(start, l)
 
-        a_gpu_slice = a_gpu[start:end]
-        a_slice = a[start:end]
+        a_gpu_slice = 2*a_gpu[start:end]
+        a_slice = 2*a[start:end]
 
         assert la.norm(a_gpu_slice.get() - a_slice) == 0
 
 
-
-
-
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
-    import pyopencl as cl
-
-    import sys
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-- 
GitLab