From 7acc1a3ab6719c7f42116efce180e67443517ed9 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Thu, 14 May 2020 11:50:13 +0200
Subject: [PATCH 1/3] Add 'out' parameter to ReductionKernel, as in PyOpenCL

---
 pycuda/reduction.py   |  5 ++++-
 test/test_gpuarray.py | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index 94291057..b76d9885 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -228,6 +228,7 @@ class ReductionKernel:
             s2_func = kernel_wrapper(s2_func)

         stream = kwargs.get("stream")
+        out = kwargs.pop("out", None)

         from .gpuarray import empty

@@ -267,7 +268,9 @@ class ReductionKernel:
             macroblock_size = block_count*self.block_size
             seq_count = (sz + macroblock_size - 1) // macroblock_size

-            if block_count == 1:
+            if block_count == 1 and out is not None:
+                result = out
+            elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
             else:
                 result = empty((block_count,), self.dtype_out, allocator=allocator)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dafe65c1..98ce1813 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -931,6 +931,22 @@ class TestGPUArray:
         assert minmax["cur_min"] == np.min(a)
         assert minmax["cur_max"] == np.max(a)

+    @mark_cuda_test
+    def test_reduce_out(self):
+        from pycuda.curandom import rand as curand
+        a_gpu = curand((10, 200), dtype=np.float32)
+        a = a_gpu.get()
+
+        from pycuda.reduction import ReductionKernel
+        red = ReductionKernel(np.float32, neutral=0,
+                              reduce_expr="max(a,b)",
+                              arguments="float *in")
+        max_gpu = gpuarray.empty(10, dtype=np.float32)
+        for i in range(10):
+            red(a_gpu[i], out=max_gpu[i])
+
+        assert np.alltrue(a.max(axis=1) == max_gpu.get())
+
     @mark_cuda_test
     def test_sum_allocator(self):
         # FIXME
--
GitLab

From 67928d53bf1c97b6258c5b166806932c42236054 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Fri, 15 May 2020 10:17:06 +0200
Subject: [PATCH 2/3] ReductionKernel: add __call__ documentation, and check
 that out has the expected dtype

---
 doc/source/array.rst | 25 ++++++++++++++++++++++++-
 pycuda/reduction.py  |  2 ++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/array.rst b/doc/source/array.rst
index c3c81253..eddf91ca 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -1132,7 +1132,17 @@ Custom Reductions
     unmodified to :class:`pycuda.compiler.SourceModule`.
     *preamble* is specified as a string of code.

-    .. method __call__(*args, stream=None)
+    .. method:: __call__(*args, stream=None, out=None)
+
+        Invoke the generated reduction kernel. The arguments may either be scalars
+        or :class:`GPUArray` instances. The reduction is carried out over the
+        entries of the first vector argument.
+
+        If *stream* is given, it must be a :class:`pycuda.driver.Stream` object,
+        in which the execution will be serialized.
+
+        If *out* is given, the result is stored in that single-entry :class:`GPUArray`.
+        Since offsets are supported, the result can be stored anywhere, e.g. ``out=a[3]``.

 Here's a usage example::

@@ -1145,6 +1155,19 @@ Here's a usage example::

     my_dot_prod = krnl(a, b).get()

+Or, by specifying the output array::
+
+    from pycuda.curandom import rand as curand
+    a = curand((10, 200), dtype=np.float32)
+    red = ReductionKernel(np.float32, neutral=0,
+                          reduce_expr="a+b",
+                          arguments="float *in")
+    a_sum = gpuarray.empty(10, dtype=np.float32)
+    for i in range(10):
+        red(a[i], out=a_sum[i])
+    assert np.allclose(a_sum.get(), a.get().sum(axis=1))
+
+
 Parallel Scan / Prefix Sum
 --------------------------

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index b76d9885..dac30982 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -269,6 +269,8 @@ class ReductionKernel:
             seq_count = (sz + macroblock_size - 1) // macroblock_size

             if block_count == 1 and out is not None:
+                assert out.dtype == self.dtype_out, \
+                    "ReductionKernel: out must have the same dtype as the reduction"
                 result = out
             elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
--
GitLab

From 658a57217faee4678c7f9d3efc68ea0a6ba40afe Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin
Date: Fri, 15 May 2020 15:52:41 +0200
Subject: [PATCH 3/3] ReductionKernel.__call__(): raise ValueError if out has
 an incorrect dtype or is empty

---
 pycuda/reduction.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index dac30982..939a006a 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -269,8 +269,10 @@ class ReductionKernel:
             seq_count = (sz + macroblock_size - 1) // macroblock_size

             if block_count == 1 and out is not None:
-                assert out.dtype == self.dtype_out, \
-                    "ReductionKernel: out must have the same dtype as the reduction"
+                if out.dtype != self.dtype_out:
+                    raise ValueError("out must have the same dtype as dtype_out")
+                if out.size == 0:
+                    raise ValueError("out array is empty")
                 result = out
             elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
--
GitLab
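
For reference (not part of the patches above), here is a minimal usage sketch of the
behaviour this series adds, assuming PyCUDA with a working CUDA context; the kernel
and variable names are illustrative only::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
    import pycuda.gpuarray as gpuarray
    from pycuda.reduction import ReductionKernel

    # Sum-reduce each row of a 2D array, writing each result directly into an
    # offset view of a preallocated vector via the new `out` argument.
    krnl = ReductionKernel(np.float32, neutral="0",
                           reduce_expr="a+b", map_expr="x[i]",
                           arguments="float *x")

    a = gpuarray.to_gpu(np.random.rand(10, 200).astype(np.float32))
    row_sums = gpuarray.empty(10, dtype=np.float32)
    for i in range(10):
        krnl(a[i], out=row_sums[i])   # single-entry view; offsets are supported

    assert np.allclose(row_sums.get(), a.get().sum(axis=1))

    # After PATCH 3/3, a mismatched dtype raises ValueError instead of asserting.
    wrong = gpuarray.empty(10, dtype=np.float64)
    try:
        krnl(a[0], out=wrong[0])
    except ValueError as err:
        print(err)  # "out must have the same dtype as dtype_out"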