From eca4cfbfe756c0c02b211829a40b4903cfdb0b14 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 22 May 2016 22:46:36 +0200
Subject: [PATCH] Add range, slice, and data-less reductions

---
 doc/misc.rst           |   2 +
 pyopencl/array.py      |  20 +++---
 pyopencl/reduction.py  | 141 ++++++++++++++++++++++++++++++-----------
 test/test_algorithm.py |  30 ++++++++-
 4 files changed, 144 insertions(+), 49 deletions(-)

diff --git a/doc/misc.rst b/doc/misc.rst
index eef15c1c..610152c2 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -121,6 +121,8 @@ Version 2016.2
 * Deprecate RANLUXCL. It will be removed in the 2018.x series of PyOpenCL.
 * Introduce Random123 random number generators. See :mod:`pyopencl.clrandom`
   for more information.
+* Add support for **range** and **slice** kwargs and data-less reductions
+  to :class:`pyopencl.reduction.ReductionKernel`.
 
 Version 2016.1
 --------------
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 996fd19a..905a9610 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2299,25 +2299,25 @@ _builtin_min = min
 _builtin_max = max
 
 
-def sum(a, dtype=None, queue=None):
+def sum(a, dtype=None, queue=None, slice=None):
     """
     .. versionadded:: 2011.1
     """
     from pyopencl.reduction import get_sum_kernel
     krnl = get_sum_kernel(a.context, dtype, a.dtype)
-    return krnl(a, queue=queue)
+    return krnl(a, queue=queue, slice=slice)
 
 
-def dot(a, b, dtype=None, queue=None):
+def dot(a, b, dtype=None, queue=None, slice=None):
     """
     .. versionadded:: 2011.1
     """
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype)
-    return krnl(a, b, queue=queue)
+    return krnl(a, b, queue=queue, slice=slice)
 
 
-def vdot(a, b, dtype=None, queue=None):
+def vdot(a, b, dtype=None, queue=None, slice=None):
     """Like :func:`numpy.vdot`.
 
     .. versionadded:: 2013.1
@@ -2325,17 +2325,17 @@ def vdot(a, b, dtype=None, queue=None):
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype,
             conjugate_first=True)
-    return krnl(a, b, queue=queue)
+    return krnl(a, b, queue=queue, slice=slice)
 
 
-def subset_dot(subset, a, b, dtype=None, queue=None):
+def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
     """
     .. versionadded:: 2011.1
     """
     from pyopencl.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(
             a.context, dtype, subset.dtype, a.dtype, b.dtype)
-    return krnl(subset, a, b, queue=queue)
+    return krnl(subset, a, b, queue=queue, slice=slice)
 
 
 def _make_minmax_kernel(what):
@@ -2358,10 +2358,10 @@ max.__doc__ = """
 
 
 def _make_subset_minmax_kernel(what):
-    def f(subset, a, queue=None):
+    def f(subset, a, queue=None, slice=None):
         from pyopencl.reduction import get_subset_minmax_kernel
         krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype)
-        return krnl(subset, a,  queue=queue)
+        return krnl(subset, a,  queue=queue, slice=slice)
 
     return f
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 689763f1..3284d75c 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -44,9 +44,9 @@ import numpy as np
 # {{{ kernel source
 
 KERNEL = r"""//CL//
-    #define GROUP_SIZE ${group_size}
-    #define READ_AND_MAP(i) (${map_expr})
-    #define REDUCE(a, b) (${reduce_expr})
+    #define PCL_GROUP_SIZE ${group_size}
+    #define PCL_READ_AND_MAP(i) (${map_expr})
+    #define PCL_REDUCE(a, b) (${reduce_expr})
 
     % if double_support:
         #if __OPENCL_C_VERSION__ < 120
@@ -59,33 +59,37 @@ KERNEL = r"""//CL//
 
     ${preamble}
 
-    typedef ${out_type} out_type;
+    typedef ${out_type} pcl_out_type;
 
     __kernel void ${name}(
-      __global out_type *out__base, long out__offset, ${arguments},
-      unsigned int seq_count, unsigned int n)
+      __global pcl_out_type *pcl_out__base, long pcl_out__offset,
+      ${arguments}
+      long pcl_start, long pcl_step, long pcl_stop,
+      unsigned int pcl_seq_count, long n)
     {
-        __global out_type *out = (__global out_type *) (
-            (__global char *) out__base + out__offset);
+        __global pcl_out_type *pcl_out = (__global pcl_out_type *) (
+            (__global char *) pcl_out__base + pcl_out__offset);
         ${arg_prep}
 
-        __local out_type ldata[GROUP_SIZE];
+        __local pcl_out_type pcl_ldata[PCL_GROUP_SIZE];
 
-        unsigned int lid = get_local_id(0);
+        unsigned int pcl_lid = get_local_id(0);
 
-        unsigned int i = get_group_id(0)*GROUP_SIZE*seq_count + lid;
+        const long pcl_base_idx =
+            get_group_id(0)*PCL_GROUP_SIZE*pcl_seq_count + pcl_lid;
+        long i = pcl_start + pcl_base_idx * pcl_step;
 
-        out_type acc = ${neutral};
-        for (unsigned s = 0; s < seq_count; ++s)
+        pcl_out_type pcl_acc = ${neutral};
+        for (unsigned pcl_s = 0; pcl_s < pcl_seq_count; ++pcl_s)
         {
-          if (i >= n)
+          if (i >= pcl_stop)
             break;
-          acc = REDUCE(acc, READ_AND_MAP(i));
+          pcl_acc = PCL_REDUCE(pcl_acc, PCL_READ_AND_MAP(i));
 
-          i += GROUP_SIZE;
+          i += PCL_GROUP_SIZE*pcl_step;
         }
 
-        ldata[lid] = acc;
+        pcl_ldata[pcl_lid] = pcl_acc;
 
         <%
           cur_size = group_size
@@ -99,18 +103,18 @@ KERNEL = r"""//CL//
             assert new_size * 2 == cur_size
             %>
 
-            if (lid < ${new_size})
+            if (pcl_lid < ${new_size})
             {
-                ldata[lid] = REDUCE(
-                  ldata[lid],
-                  ldata[lid + ${new_size}]);
+                pcl_ldata[pcl_lid] = PCL_REDUCE(
+                  pcl_ldata[pcl_lid],
+                  pcl_ldata[pcl_lid + ${new_size}]);
             }
 
             <% cur_size = new_size %>
 
         % endwhile
 
-        if (lid == 0) out[get_group_id(0)] = ldata[0];
+        if (pcl_lid == 0) pcl_out[get_group_id(0)] = pcl_ldata[0];
     }
     """
 
@@ -157,10 +161,15 @@ def _get_reduction_source(
     from mako.template import Template
     from pytools import all
     from pyopencl.characterize import has_double_support
+
+    arguments = ", ".join(arg.declarator() for arg in parsed_args)
+    if parsed_args:
+        arguments += ", "
+
     src = str(Template(KERNEL).render(
         out_type=out_type,
-        arguments=", ".join(arg.declarator() for arg in parsed_args),
         group_size=group_size,
+        arguments=arguments,
         neutral=neutral,
         reduce_expr=_process_code_for_macro(reduce_expr),
         map_expr=_process_code_for_macro(map_expr),
@@ -222,7 +231,7 @@ def get_reduction_kernel(stage,
     inf.kernel.set_scalar_arg_dtypes(
             [None, np.int64]
             + get_arg_list_scalar_arg_dtypes(inf.arg_types)
-            + [np.uint32]*2)
+            + [np.int64]*5)
 
     return inf
 
@@ -266,15 +275,21 @@ class ReductionKernel:
                 name=name+"_stage2", options=options, preamble=preamble,
                 max_group_size=max_group_size)
 
-        from pytools import any
-        from pyopencl.tools import VectorArg
-        assert any(
-                isinstance(arg_tp, VectorArg)
-                for arg_tp in self.stage_1_inf.arg_types), \
-                "ReductionKernel can only be used with functions " \
-                "that have at least one vector argument"
-
     def __call__(self, *args, **kwargs):
+        """
+        :arg range: A :class:`slice` object. Specifies the range of indices on which
+            the kernel will be executed. May not be given at the same time
+            as *slice*.
+        :arg slice: A :class:`slice` object.
+            Specifies the range of indices on which the kernel will be
+            executed, relative to the first vector-like argument.
+            May not be given at the same time as *range*.
+        :arg allocator:
+
+        .. versionchanged:: 2016.2
+
+            *range_* and *slice_* added.
+        """
         MAX_GROUP_COUNT = 1024  # noqa
         SMALL_SEQ_COUNT = 4  # noqa
 
@@ -283,10 +298,14 @@ class ReductionKernel:
         stage_inf = self.stage_1_inf
 
         queue = kwargs.pop("queue", None)
+        allocator = kwargs.pop("allocator", None)
         wait_for = kwargs.pop("wait_for", None)
         return_event = kwargs.pop("return_event", False)
         out = kwargs.pop("out", None)
 
+        range_ = kwargs.pop("range", None)
+        slice_ = kwargs.pop("slice", None)
+
         if kwargs:
             raise TypeError("invalid keyword argument to reduction kernel")
 
@@ -310,14 +329,58 @@ class ReductionKernel:
                 else:
                     invocation_args.append(arg)
 
-            repr_vec = vectors[0]
-            sz = repr_vec.size
+            if vectors:
+                repr_vec = vectors[0]
+            else:
+                repr_vec = None
+
+            # {{{ range/slice processing
+
+            if range_ is not None:
+                if slice_ is not None:
+                    raise TypeError("may not specify both range and slice "
+                            "keyword arguments")
+
+            else:
+                if slice_ is None:
+                    slice_ = slice(None)
+
+                if repr_vec is None:
+                    raise TypeError(
+                            "must have vector argument when range is not specified")
+
+                range_ = slice(*slice_.indices(repr_vec.size))
+
+            assert range_ is not None
+
+            start = range_.start
+            if start is None:
+                start = 0
+            if range_.step is None:
+                step = 1
+            else:
+                step = range_.step
+            sz = abs(range_.stop - start)//step
+
+            # }}}
 
             if queue is not None:
                 use_queue = queue
             else:
+                if repr_vec is None:
+                    raise TypeError(
+                            "must specify queue argument when no vector argument "
+                            "present")
+
                 use_queue = repr_vec.queue
 
+            if allocator is None:
+                if repr_vec is None:
+                    from pyopencl.tools import DeferredAllocator
+                    allocator = DeferredAllocator(queue.context)
+                else:
+                    allocator = repr_vec.allocator
+
             if sz <= stage_inf.group_size*SMALL_SEQ_COUNT*MAX_GROUP_COUNT:
                 total_group_size = SMALL_SEQ_COUNT*stage_inf.group_size
                 group_count = (sz + total_group_size - 1) // total_group_size
@@ -327,23 +390,25 @@ class ReductionKernel:
                 macrogroup_size = group_count*stage_inf.group_size
                 seq_count = (sz + macrogroup_size - 1) // macrogroup_size
 
+            size_args = [start, step, range_.stop, seq_count, sz]
+
             if group_count == 1 and out is not None:
                 result = out
             elif group_count == 1:
                 result = empty(use_queue,
                         (), self.dtype_out,
-                        allocator=repr_vec.allocator)
+                        allocator=allocator)
             else:
                 result = empty(use_queue,
                         (group_count,), self.dtype_out,
-                        allocator=repr_vec.allocator)
+                        allocator=allocator)
 
             last_evt = stage_inf.kernel(
                     use_queue,
                     (group_count*stage_inf.group_size,),
                     (stage_inf.group_size,),
                     *([result.base_data, result.offset]
-                        + invocation_args + [seq_count, sz]),
+                        + invocation_args + size_args),
                     **dict(wait_for=wait_for))
             wait_for = [last_evt]
 
@@ -356,6 +421,8 @@ class ReductionKernel:
                 stage_inf = self.stage_2_inf
                 args = (result,) + stage1_args
 
+                range_ = slice_ = None
+
 # }}}
 
 
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 952a38dd..ffd85676 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -244,11 +244,37 @@ def test_sum(ctx_factory):
                 slice(1000, 3000),
                 slice(1000, -3000),
                 slice(1000, None),
+                slice(1000, None, 3),
                 ]:
             sum_a = np.sum(a[slc])
-            sum_a_gpu = cl_array.sum(a_gpu[slc]).get()
 
-            assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
+            if slc.step is None:
+                sum_a_gpu = cl_array.sum(a_gpu[slc]).get()
+                assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
+
+            sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get()
+            assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4
+
+
+def test_sum_without_data(ctx_factory):
+    from pytest import importorskip
+    importorskip("mako")
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    n = 2000
+
+    from pyopencl.reduction import ReductionKernel
+    red = ReductionKernel(context, np.int32,
+            neutral="0",
+            reduce_expr="a+b", map_expr="i",
+            arguments=[])
+
+    result_dev = red(range=range(n), queue=queue).get()
+    result_ref = n*(n-1)//2
+
+    assert result_dev == result_ref
 
 
 def test_minmax(ctx_factory):
-- 
GitLab