diff --git a/doc/source/array.rst b/doc/source/array.rst
index c3c81253446c3e7cb0ad0ed2b75200485a04dad3..eddf91ca43931637d1b94bc33cec61667b0048d5 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -1132,7 +1132,17 @@ Custom Reductions
     unmodified to :class:`pycuda.compiler.SourceModule`.
 
     *preamble* is specified as a string of code.
 
-    .. method __call__(*args, stream=None)
+    .. method:: __call__(*args, stream=None, out=None)
+
+        Invoke the generated reduction kernel. The arguments may either be scalars
+        or :class:`GPUArray` instances. The reduction is carried out over the
+        entries of the first vector argument.
+
+        If *stream* is given, it must be a :class:`pycuda.driver.Stream` object,
+        where the execution will be serialized.
+
+        With *out*, the resulting single-entry :class:`GPUArray` can be specified.
+        Because offsets are supported, the result can be stored anywhere (e.g. ``out=a[3]``).
 
 Here's a usage example::
@@ -1145,6 +1155,19 @@ Here's a usage example::
 
     my_dot_prod = krnl(a, b).get()
 
+Or by specifying the output::
+
+    from pycuda.curandom import rand as curand
+    a = curand((10, 200), dtype=np.float32)
+    red = ReductionKernel(np.float32, neutral=0,
+            reduce_expr="a+b",
+            arguments="float *in")
+    a_sum = gpuarray.empty(10, dtype=np.float32)
+    for i in range(10):
+        red(a[i], out=a_sum[i])
+    assert np.allclose(a_sum.get(), a.get().sum(axis=1))
+
+
 Parallel Scan / Prefix Sum
 --------------------------
 
diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index 942910572fe749be4a0877273718de0999ab26b9..939a006a9617754c4ce24111034f08e4838c2a97 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -228,6 +228,7 @@ class ReductionKernel:
             s2_func = kernel_wrapper(s2_func)
 
         stream = kwargs.get("stream")
+        out = kwargs.pop("out", None)
 
         from .gpuarray import empty
 
@@ -267,7 +268,13 @@
             macroblock_size = block_count*self.block_size
             seq_count = (sz + macroblock_size - 1) // macroblock_size
 
-            if block_count == 1:
+            if block_count == 1 and out is not None:
+                if out.dtype != self.dtype_out:
+                    raise ValueError("out must have the same dtype as dtype_out")
+                if out.size == 0:
+                    raise ValueError("out array is empty")
+                result = out
+            elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
             else:
                 result = empty((block_count,), self.dtype_out, allocator=allocator)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dafe65c17dda652b0e0598354bca8951b9f312d6..98ce1813dc53f537941e0c00564e385745a2f1fd 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -931,6 +931,22 @@ class TestGPUArray:
         assert minmax["cur_min"] == np.min(a)
         assert minmax["cur_max"] == np.max(a)
 
+    @mark_cuda_test
+    def test_reduce_out(self):
+        from pycuda.curandom import rand as curand
+        a_gpu = curand((10, 200), dtype=np.float32)
+        a = a_gpu.get()
+
+        from pycuda.reduction import ReductionKernel
+        red = ReductionKernel(np.float32, neutral=0,
+                reduce_expr="max(a,b)",
+                arguments="float *in")
+        max_gpu = gpuarray.empty(10, dtype=np.float32)
+        for i in range(10):
+            red(a_gpu[i], out=max_gpu[i])
+
+        assert np.all(a.max(axis=1) == max_gpu.get())
+
     @mark_cuda_test
     def test_sum_allocator(self):
         # FIXME
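
A further usage sketch, not part of the patch above: assuming the patch is applied, the new ``out=`` argument can also be combined with the multi-argument dot-product kernel from the existing documentation example, writing each row's result into a preallocated slot. The shapes and the names ``x``, ``y`` and ``dots`` below are illustrative assumptions::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand
    from pycuda.reduction import ReductionKernel

    x = curand((8, 100), dtype=np.float32)
    y = curand((8, 100), dtype=np.float32)

    # Same dot-product kernel as in the documentation example above.
    dot = ReductionKernel(np.float32, neutral="0",
            reduce_expr="a+b", map_expr="x[i]*y[i]",
            arguments="float *x, float *y")

    # One preallocated slot per row; out= makes each reduction write its
    # single-entry result directly into that slot instead of allocating a
    # fresh result array per call.
    dots = gpuarray.empty(8, dtype=np.float32)
    for i in range(8):
        dot(x[i], y[i], out=dots[i])

    assert np.allclose(dots.get(), (x.get() * y.get()).sum(axis=1), rtol=1e-4)

The point of the pattern is the same as in the patch's own doc and test examples: because ``out`` may be an offset view into a larger array, a sequence of reductions can populate a result vector in place, with no per-call temporaries to allocate or copy back.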