diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 3a2cc1ef85997a36df45815575e7cadb292d51ac..8e7dca6b39ec1b0272582fa1d676d072b656f6a8 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -49,9 +49,8 @@ Sums and counts ("reduce")
 
     Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
     uses the formal values "a" and "b" to indicate two operands of a binary
-    reduction operation. If you do not specify a *map_expr*, "in[i]" -- and
-    therefore the presence of only one input argument -- is automatically
-    assumed.
+    reduction operation. If you do not specify a *map_expr*, ``in[i]`` is
+    automatically assumed and treated as the only one input argument.
 
     *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
     performed and in which the result is returned. *neutral* is specified as
@@ -62,10 +61,14 @@ Sums and counts ("reduce")
     :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
     is inserted before the actual kernels.
 
-    .. method:: __call__(*args, queue=None, wait_for=None, return_event=False)
+    .. method:: __call__(*args, queue=None, wait_for=None, return_event=False, out=None)
 
         |explain-waitfor|
 
+        With *out* the resulting single-entry :class:`pyopencl.array.Array` can
+        be specified. Because offsets are supported one can store results
+        anywhere (e.g. ``out=a[3]``).
+
         :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array`
             if *return_event* is *False*, otherwise a tuple ``(scalar_array, event)``.
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 99ebbc39e620ebea26fb65ef9778f466a8015296..efa210436fdbeb87f5347caf903244bb5ae4399e 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -60,10 +60,11 @@ KERNEL = """//CL//
     typedef ${out_type} out_type;
 
     __kernel void ${name}(
-      __global out_type *out, ${arguments},
+      __global out_type *out__base, long out__offset, ${arguments},
       unsigned int seq_count, unsigned int n)
     {
-       ${arg_prep}
+        __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset);
+        ${arg_prep}
 
         __local out_type ldata[GROUP_SIZE];
 
@@ -269,7 +270,7 @@ def get_reduction_kernel(stage,
     inf.arg_types = arguments
 
     inf.kernel.set_scalar_arg_dtypes(
-            [None]
+            [None, np.int64]
             + get_arg_list_scalar_arg_dtypes(inf.arg_types)
             + [np.uint32]*2)
 
@@ -334,6 +335,7 @@ class ReductionKernel:
         queue = kwargs.pop("queue", None)
         wait_for = kwargs.pop("wait_for", None)
         return_event = kwargs.pop("return_event", False)
+        out = kwargs.pop("out", None)
 
         if kwargs:
             raise TypeError("invalid keyword argument to reduction kernel")
@@ -375,7 +377,9 @@ class ReductionKernel:
                 macrogroup_size = group_count*stage_inf.group_size
                 seq_count = (sz + macrogroup_size - 1) // macrogroup_size
 
-            if group_count == 1:
+            if group_count == 1 and out is not None:
+                result = out
+            elif group_count == 1:
                 result = empty(use_queue,
                         (), self.dtype_out,
                         allocator=repr_vec.allocator)
@@ -388,7 +392,7 @@ class ReductionKernel:
                     use_queue,
                     (group_count*stage_inf.group_size,),
                     (stage_inf.group_size,),
-                    *([result.data]+invocation_args+[seq_count, sz]),
+                    *([result.base_data, result.offset] + invocation_args + [seq_count, sz]),
                     **dict(wait_for=wait_for))
             wait_for = [last_evt]