diff --git a/doc/source/array.rst b/doc/source/array.rst
index c3c81253446c3e7cb0ad0ed2b75200485a04dad3..eddf91ca43931637d1b94bc33cec61667b0048d5 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -1132,7 +1132,17 @@ Custom Reductions
     unmodified to :class:`pycuda.compiler.SourceModule`.
 
     *preamble* is specified as a string of code.
 
-    .. method __call__(*args, stream=None)
+    .. method:: __call__(*args, stream=None, out=None)
+
+        Invoke the generated reduction kernel. The arguments may either be scalars
+        or :class:`GPUArray` instances. The reduction is carried out over the
+        entries of the first vector argument.
+
+        If *stream* is given, it must be a :class:`pycuda.driver.Stream` object,
+        where the execution will be serialized.
+
+        With *out*, the resulting single-entry :class:`GPUArray` can be specified.
+        Because offsets are supported, the result can be stored anywhere (e.g. ``out=a[3]``).
 
 Here's a usage example::
@@ -1145,6 +1155,19 @@ Here's a usage example::
 
     my_dot_prod = krnl(a, b).get()
 
+Or by specifying the output::
+
+    from pycuda.curandom import rand as curand
+    a = curand((10, 200), dtype=np.float32)
+    red = ReductionKernel(np.float32, neutral=0,
+            reduce_expr="a+b",
+            arguments="float *in")
+    a_sum = gpuarray.empty(10, dtype=np.float32)
+    for i in range(10):
+        red(a[i], out=a_sum[i])
+    assert np.allclose(a_sum.get(), a.get().sum(axis=1))
+
+
 Parallel Scan / Prefix Sum
 --------------------------
 
diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index 942910572fe749be4a0877273718de0999ab26b9..939a006a9617754c4ce24111034f08e4838c2a97 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -228,6 +228,7 @@ class ReductionKernel:
             s2_func = kernel_wrapper(s2_func)
 
         stream = kwargs.get("stream")
+        out = kwargs.pop("out", None)
 
         from .gpuarray import empty
 
@@ -267,7 +268,13 @@
             macroblock_size = block_count*self.block_size
             seq_count = (sz + macroblock_size - 1) // macroblock_size
 
-            if block_count == 1:
+            if block_count == 1 and out is not None:
+                if out.dtype != self.dtype_out:
+                    raise ValueError("out must have the same dtype as dtype_out")
+                if out.size == 0:
+                    raise ValueError("out array is empty")
+                result = out
+            elif block_count == 1:
                 result = empty((), self.dtype_out, allocator=allocator)
             else:
                 result = empty((block_count,), self.dtype_out, allocator=allocator)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dafe65c17dda652b0e0598354bca8951b9f312d6..98ce1813dc53f537941e0c00564e385745a2f1fd 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -931,6 +931,22 @@ class TestGPUArray:
         assert minmax["cur_min"] == np.min(a)
         assert minmax["cur_max"] == np.max(a)
 
+    @mark_cuda_test
+    def test_reduce_out(self):
+        from pycuda.curandom import rand as curand
+        a_gpu = curand((10, 200), dtype=np.float32)
+        a = a_gpu.get()
+
+        from pycuda.reduction import ReductionKernel
+        red = ReductionKernel(np.float32, neutral=0,
+                reduce_expr="max(a,b)",
+                arguments="float *in")
+        max_gpu = gpuarray.empty(10, dtype=np.float32)
+        for i in range(10):
+            red(a_gpu[i], out=max_gpu[i])
+
+        assert np.all(a.max(axis=1) == max_gpu.get())
+
     @mark_cuda_test
     def test_sum_allocator(self):
         # FIXME
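
A further usage sketch, not part of the patch above: assuming the patch is applied, the new ``out=`` argument can also be combined with the multi-argument dot-product kernel from the existing documentation example, writing each row's result into a preallocated slot. The shapes and the names ``x``, ``y`` and ``dots`` below are illustrative assumptions::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand
    from pycuda.reduction import ReductionKernel

    x = curand((8, 100), dtype=np.float32)
    y = curand((8, 100), dtype=np.float32)

    # Same dot-product kernel as in the documentation example above.
    dot = ReductionKernel(np.float32, neutral="0",
            reduce_expr="a+b", map_expr="x[i]*y[i]",
            arguments="float *x, float *y")

    # One preallocated slot per row; out= makes each reduction write its
    # single-entry result directly into that slot instead of allocating a
    # fresh result array per call.
    dots = gpuarray.empty(8, dtype=np.float32)
    for i in range(8):
        dot(x[i], y[i], out=dots[i])

    assert np.allclose(dots.get(), (x.get() * y.get()).sum(axis=1), rtol=1e-4)

The point of the pattern is the same as in the patch's own doc and test examples: because ``out`` may be an offset view into a larger array, a sequence of reductions can populate a result vector in place, with no per-call temporaries to allocate or copy back.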