diff --git a/doc/algorithm.rst b/doc/algorithm.rst index 3a2cc1ef85997a36df45815575e7cadb292d51ac..8e7dca6b39ec1b0272582fa1d676d072b656f6a8 100644 --- a/doc/algorithm.rst +++ b/doc/algorithm.rst @@ -49,9 +49,8 @@ Sums and counts ("reduce") Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr* uses the formal values "a" and "b" to indicate two operands of a binary - reduction operation. If you do not specify a *map_expr*, "in[i]" -- and - therefore the presence of only one input argument -- is automatically - assumed. + reduction operation. If you do not specify a *map_expr*, ``in[i]`` is + automatically assumed and treated as the only one input argument. *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is performed and in which the result is returned. *neutral* is specified as @@ -62,10 +61,14 @@ Sums and counts ("reduce") :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that is inserted before the actual kernels. - .. method:: __call__(*args, queue=None, wait_for=None, return_event=False) + .. method:: __call__(*args, queue=None, wait_for=None, return_event=False, out=None) |explain-waitfor| + With *out* the resulting single-entry :class:`pyopencl.array.Array` can + be specified. Because offsets are supported one can store results + anywhere (e.g. ``out=a[3]``). + :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array` if *return_event* is *False*, otherwise a tuple ``(scalar_array, event)``. diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 99ebbc39e620ebea26fb65ef9778f466a8015296..efa210436fdbeb87f5347caf903244bb5ae4399e 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -60,10 +60,11 @@ KERNEL = """//CL// typedef ${out_type} out_type; __kernel void ${name}( - __global out_type *out, ${arguments}, + __global out_type *out__base, long out__offset, ${arguments}, unsigned int seq_count, unsigned int n) { - ${arg_prep} + __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset); + ${arg_prep} __local out_type ldata[GROUP_SIZE]; @@ -269,7 +270,7 @@ def get_reduction_kernel(stage, inf.arg_types = arguments inf.kernel.set_scalar_arg_dtypes( - [None] + [None, np.int64] + get_arg_list_scalar_arg_dtypes(inf.arg_types) + [np.uint32]*2) @@ -334,6 +335,7 @@ class ReductionKernel: queue = kwargs.pop("queue", None) wait_for = kwargs.pop("wait_for", None) return_event = kwargs.pop("return_event", False) + out = kwargs.pop("out", None) if kwargs: raise TypeError("invalid keyword argument to reduction kernel") @@ -375,7 +377,9 @@ class ReductionKernel: macrogroup_size = group_count*stage_inf.group_size seq_count = (sz + macrogroup_size - 1) // macrogroup_size - if group_count == 1: + if group_count == 1 and out is not None: + result = out + elif group_count == 1: result = empty(use_queue, (), self.dtype_out, allocator=repr_vec.allocator) @@ -388,7 +392,7 @@ class ReductionKernel: use_queue, (group_count*stage_inf.group_size,), (stage_inf.group_size,), - *([result.data]+invocation_args+[seq_count, sz]), + *([result.base_data, result.offset] + invocation_args + [seq_count, sz]), **dict(wait_for=wait_for)) wait_for = [last_evt]