Add `out` parameter to ReductionKernel call.

f95a6b87 · gw0 [http://gw.tnode.com/] · d3f31844 · f95a6b87 · f95a6b87
Commit f95a6b87 authored 10 years ago by gw0 [http://gw.tnode.com/]
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -73,9 +73,8 @@ Sums and counts ("reduce")

    Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
    uses the formal values "a" and "b" to indicate two operands of a binary
-    reduction operation. If you do not specify a *map_expr*, "in[i]" -- and
-    therefore the presence of only one input argument -- is automatically
-    assumed.
+    reduction operation. If you do not specify a *map_expr*, ``in[i]`` is
+    automatically assumed, implying the presence of only one input argument.

    *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
    performed and in which the result is returned. *neutral* is specified as
@@ -86,10 +85,14 @@ Sums and counts ("reduce")
    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
    is inserted before the actual kernels.

-    .. method:: __call__(*args, queue=None, wait_for=None, return_event=False)
+    .. method:: __call__(*args, queue=None, wait_for=None, return_event=False, out=None)

        |explain-waitfor|

+        With *out*, the single-entry :class:`pyopencl.array.Array` that receives
+        the result can be specified. Because array offsets are supported, the
+        result can be stored anywhere (e.g. ``out=a[3]``).
+
        :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array`
            if *return_event* is *False*, otherwise a tuple ``(scalar_array, event)``.


--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -58,10 +58,11 @@ KERNEL = """//CL//
    typedef ${out_type} out_type;

    __kernel void ${name}(
-      __global out_type *out, ${arguments},
+      __global out_type *out__base, long out__offset, ${arguments},
      unsigned int seq_count, unsigned int n)
    {
-       ${arg_prep}
+        __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset);
+        ${arg_prep}

        __local out_type ldata[GROUP_SIZE];

@@ -267,7 +268,7 @@ def get_reduction_kernel(stage,
    inf.arg_types = arguments

    inf.kernel.set_scalar_arg_dtypes(
-            [None]
+            [None, np.int64]
            + get_arg_list_scalar_arg_dtypes(inf.arg_types)
            + [np.uint32]*2)

@@ -332,6 +333,7 @@ class ReductionKernel:
        queue = kwargs.pop("queue", None)
        wait_for = kwargs.pop("wait_for", None)
        return_event = kwargs.pop("return_event", False)
+        out = kwargs.pop("out", None)

        if kwargs:
            raise TypeError("invalid keyword argument to reduction kernel")
@@ -373,7 +375,9 @@ class ReductionKernel:
                macrogroup_size = group_count*stage_inf.group_size
                seq_count = (sz + macrogroup_size - 1) // macrogroup_size

-            if group_count == 1:
+            if group_count == 1 and out is not None:
+                result = out
+            elif group_count == 1:
                result = empty(use_queue,
                        (), self.dtype_out,
                        allocator=repr_vec.allocator)
@@ -386,7 +390,7 @@ class ReductionKernel:
                    use_queue,
                    (group_count*stage_inf.group_size,),
                    (stage_inf.group_size,),
-                    *([result.data]+invocation_args+[seq_count, sz]),
+                    *([result.base_data, result.offset] + invocation_args + [seq_count, sz]),
                    **dict(wait_for=wait_for))
            wait_for = [last_evt]