From 7acc1a3ab6719c7f42116efce180e67443517ed9 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Thu, 14 May 2020 11:50:13 +0200
Subject: [PATCH 1/3] Add 'out' parameter to ReductionKernel, as in PyOpenCL

---
 pycuda/reduction.py   |  5 ++++-
 test/test_gpuarray.py | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index 94291057..b76d9885 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -228,6 +228,7 @@ class ReductionKernel:
             s2_func = kernel_wrapper(s2_func)

         stream = kwargs.get("stream")
+        out = kwargs.pop("out", None)

         from .gpuarray import empty

@@ -267,7 +268,9 @@ class ReductionKernel:
             macroblock_size = block_count*self.block_size
             seq_count = (sz + macroblock_size - 1) // macroblock_size

-            if block_count == 1:
+            if block_count == 1 and out is not None:
+                result = out
+            elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
             else:
                 result = empty((block_count,), self.dtype_out, allocator=allocator)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dafe65c1..98ce1813 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -931,6 +931,22 @@ class TestGPUArray:
         assert minmax["cur_min"] == np.min(a)
         assert minmax["cur_max"] == np.max(a)

+    @mark_cuda_test
+    def test_reduce_out(self):
+        from pycuda.curandom import rand as curand
+        a_gpu = curand((10, 200), dtype=np.float32)
+        a = a_gpu.get()
+
+        from pycuda.reduction import ReductionKernel
+        red = ReductionKernel(np.float32, neutral=0,
+                              reduce_expr="max(a,b)",
+                              arguments="float *in")
+        max_gpu = gpuarray.empty(10, dtype=np.float32)
+        for i in range(10):
+            red(a_gpu[i], out=max_gpu[i])
+
+        assert np.alltrue(a.max(axis=1) == max_gpu.get())
+
     @mark_cuda_test
     def test_sum_allocator(self):
         # FIXME
--
GitLab

From 67928d53bf1c97b6258c5b166806932c42236054 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Fri, 15 May 2020 10:17:06 +0200
Subject: [PATCH 2/3] ReductionKernel: add __call__ documentation, and check
 that out has the expected dtype

---
 doc/source/array.rst | 25 ++++++++++++++++++++++++-
 pycuda/reduction.py  |  2 ++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/array.rst b/doc/source/array.rst
index c3c81253..eddf91ca 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -1132,7 +1132,17 @@ Custom Reductions
     unmodified to :class:`pycuda.compiler.SourceModule`.
     *preamble* is specified as a string of code.

-    .. method __call__(*args, stream=None)
+    .. method:: __call__(*args, stream=None, out=None)
+
+        Invoke the generated reduction kernel. The arguments may either be scalars
+        or :class:`GPUArray` instances. The reduction is carried out over the
+        entries of the first vector argument.
+
+        If *stream* is given, it must be a :class:`pycuda.driver.Stream` object,
+        in which the execution will be serialized.
+
+        If *out* is given, the result is stored in that single-entry :class:`GPUArray`.
+        Since offsets are supported, the result can be stored anywhere, e.g. ``out=a[3]``.

 Here's a usage example::

@@ -1145,6 +1155,19 @@ Here's a usage example::

     my_dot_prod = krnl(a, b).get()

+Or, by specifying the output array::
+
+    from pycuda.curandom import rand as curand
+    a = curand((10, 200), dtype=np.float32)
+    red = ReductionKernel(np.float32, neutral=0,
+                          reduce_expr="a+b",
+                          arguments="float *in")
+    a_sum = gpuarray.empty(10, dtype=np.float32)
+    for i in range(10):
+        red(a[i], out=a_sum[i])
+    assert np.allclose(a_sum.get(), a.get().sum(axis=1))
+
+
 Parallel Scan / Prefix Sum
 --------------------------

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index b76d9885..dac30982 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -269,6 +269,8 @@ class ReductionKernel:
             seq_count = (sz + macroblock_size - 1) // macroblock_size

             if block_count == 1 and out is not None:
+                assert out.dtype == self.dtype_out, \
+                    "ReductionKernel: out must have the same dtype as the reduction"
                 result = out
             elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
--
GitLab

From 658a57217faee4678c7f9d3efc68ea0a6ba40afe Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Fri, 15 May 2020 15:52:41 +0200
Subject: [PATCH 3/3] ReductionKernel.__call__(): raise ValueError if out has
 an incorrect dtype or is empty

---
 pycuda/reduction.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index dac30982..939a006a 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -269,8 +269,10 @@ class ReductionKernel:
             seq_count = (sz + macroblock_size - 1) // macroblock_size

             if block_count == 1 and out is not None:
-                assert out.dtype == self.dtype_out, \
-                    "ReductionKernel: out must have the same dtype as the reduction"
+                if out.dtype != self.dtype_out:
+                    raise ValueError("out must have the same dtype as dtype_out")
+                if out.size == 0:
+                    raise ValueError("out array is empty")
                 result = out
             elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
--
GitLab
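
For reference (not part of the patches above), here is a minimal usage sketch of the
behaviour this series adds, assuming PyCUDA with a working CUDA context; the kernel
and variable names are illustrative only::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
    import pycuda.gpuarray as gpuarray
    from pycuda.reduction import ReductionKernel

    # Sum-reduce each row of a 2D array, writing each result directly into an
    # offset view of a preallocated vector via the new `out` argument.
    krnl = ReductionKernel(np.float32, neutral="0",
                           reduce_expr="a+b", map_expr="x[i]",
                           arguments="float *x")

    a = gpuarray.to_gpu(np.random.rand(10, 200).astype(np.float32))
    row_sums = gpuarray.empty(10, dtype=np.float32)
    for i in range(10):
        krnl(a[i], out=row_sums[i])   # single-entry view; offsets are supported

    assert np.allclose(row_sums.get(), a.get().sum(axis=1))

    # After PATCH 3/3, a mismatched dtype raises ValueError instead of asserting.
    wrong = gpuarray.empty(10, dtype=np.float64)
    try:
        krnl(a[0], out=wrong[0])
    except ValueError as err:
        print(err)  # "out must have the same dtype as dtype_out"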