diff --git a/doc/misc.rst b/doc/misc.rst
index b4e30db755258400525464fdfd4b88d2e082582d..71aea7922305b5fcfaa91850915ffdfd3cb62eca 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -209,7 +209,7 @@ other software to be turned into the corresponding :mod:`pyopencl` objects.
 User-visible Changes
 ====================
 
-Version 2017.2
+Version 2018.2
 --------------
 
 .. note::
@@ -217,6 +217,10 @@ Version 2017.2
     This version is currently under development. You can get snapshots from
     PyOpenCL's `git repository <https://github.com/inducer/pyopencl>`_
 
+* Use pybind11.
+* Many bug fixes.
+* Support arrays with offsets in scan kernels.
+
 Version 2018.1
 --------------
 
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 6a59808137d1ac8ed99b936db1c91c1a0e7d11e3..ac8052e87d3bebd01b3117ae1d8251d0083241aa 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -35,7 +35,8 @@ from pyopencl.tools import (dtype_to_ctype, bitlog2,
         KernelTemplateBase, _process_code_for_macro,
         get_arg_list_scalar_arg_dtypes,
         context_dependent_memoize,
-        _NumpyTypesKeyBuilder)
+        _NumpyTypesKeyBuilder,
+        get_arg_offset_adjuster_code)
 
 import pyopencl._mymako as mako
 from pyopencl._cluda import CLUDA_PREAMBLE
@@ -148,6 +149,8 @@ void ${kernel_name}(
     %endif
     )
 {
+    ${arg_offset_adjustment}
+
     // index K in first dimension used for carry storage
     %if use_bank_conflict_avoidance:
         // Avoid bank conflicts by adding a single 32-bit value to the size of
@@ -618,6 +621,8 @@ void ${name_prefix}_final_update(
     %endif
     )
 {
+    ${arg_offset_adjustment}
+
     %if use_lookbehind_update:
         LOCAL_MEM scan_type ldata[WG_SIZE];
     %endif
@@ -998,7 +1003,7 @@ class _GenericScanKernelBase(object):
             resulting in a C `bool` value that determines whether a new
             scan segments starts at index *i*.  If given, makes the scan a
             segmented scan. Has access to the current index `i`, the result
-            of *input_expr* as a, and in addition may use *arguments* and
+            of *input_expr* as `a`, and in addition may use *arguments* and
             *input_fetch_expr* variables just like *input_expr*.
 
             If it returns true, then previous sums will not spill over into the
@@ -1346,6 +1351,7 @@ class GenericScanKernel(_GenericScanKernelBase):
         final_update_src = str(final_update_tpl.render(
             wg_size=update_wg_size,
             output_statement=self.output_statement,
+            arg_offset_adjustment=get_arg_offset_adjuster_code(self.parsed_args),
             argument_signature=", ".join(
                 arg.declarator() for arg in self.parsed_args),
             is_segment_start_expr=self.is_segment_start_expr,
@@ -1421,6 +1427,7 @@ class GenericScanKernel(_GenericScanKernelBase):
             wg_size=wg_size,
             input_expr=input_expr,
             k_group_size=k_group_size,
+            arg_offset_adjustment=get_arg_offset_adjuster_code(arguments),
             argument_signature=", ".join(arg.declarator() for arg in arguments),
             is_segment_start_expr=is_segment_start_expr,
             input_fetch_exprs=input_fetch_exprs,
@@ -1475,7 +1482,9 @@ class GenericScanKernel(_GenericScanKernelBase):
         from pyopencl.tools import VectorArg
         for arg_descr, arg_val in zip(self.parsed_args, args):
             if isinstance(arg_descr, VectorArg):
-                data_args.append(arg_val.data)
+                data_args.append(arg_val.base_data)
+                if arg_descr.with_offset:
+                    data_args.append(arg_val.offset)
             else:
                 data_args.append(arg_val)
 
@@ -1583,6 +1592,8 @@ void ${name_prefix}_debug_scan(
     scan_type current = ${neutral};
     scan_type prev;
 
+    ${arg_offset_adjustment}
+
     for (index_type i = 0; i < N; ++i)
     {
         %for name, arg_name, ife_offset in input_fetch_exprs:
@@ -1636,6 +1647,7 @@ class GenericDebugScanKernel(_GenericScanKernelBase):
         scan_tpl = _make_template(DEBUG_SCAN_TEMPLATE)
         scan_src = str(scan_tpl.render(
             output_statement=self.output_statement,
+            arg_offset_adjustment=get_arg_offset_adjuster_code(self.parsed_args),
             argument_signature=", ".join(
                 arg.declarator() for arg in self.parsed_args),
             is_segment_start_expr=self.is_segment_start_expr,
@@ -1680,7 +1692,9 @@ class GenericDebugScanKernel(_GenericScanKernelBase):
         from pyopencl.tools import VectorArg
         for arg_descr, arg_val in zip(self.parsed_args, args):
             if isinstance(arg_descr, VectorArg):
-                data_args.append(arg_val.data)
+                data_args.append(arg_val.base_data)
+                if arg_descr.with_offset:
+                    data_args.append(arg_val.offset)
             else:
                 data_args.append(arg_val)
 
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 5264767c4094806fe43fdfea6056237aed20ade4..d63ed7881222aa9ed3a7c4b646bd05ffc4cc6866 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -569,6 +569,38 @@ def test_scan(ctx_factory, dtype, scan_cls):
         collect()
 
 
+@pytest.mark.parametrize("scan_cls", (GenericScanKernel, GenericDebugScanKernel))
+def test_scan_with_vectorargs_with_offsets(ctx_factory, scan_cls):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.tools import VectorArg
+    # Scan kernel whose two vector arguments are both declared with_offset=True.
+    knl = scan_cls(
+            context, float,
+            arguments=[
+                VectorArg(float, "input", with_offset=True),
+                VectorArg(int, "segment", with_offset=True),
+                ],
+            input_expr="input[i]",
+            is_segment_start_expr="segment[i]",
+            scan_expr="a+b", neutral="0",
+            output_statement="""
+                input[i] = item;
+                """)
+
+    n = 20
+    # All-zero segment flags: no segment ever starts, so this is a plain cumsum.
+    host_data = np.random.randint(0, 10, n).astype(float)
+    dev_data = cl.array.to_device(queue, host_data)
+    segment_data = np.zeros(n, dtype=int)
+    dev_segment_data = cl.array.to_device(queue, segment_data)
+    # NOTE(review): presumably both arrays have offset 0 here -- TODO confirm.
+    knl(dev_data, dev_segment_data)
+
+    assert (dev_data.get() == np.cumsum(host_data)).all()
+
+
 def test_copy_if(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -654,7 +686,6 @@ def test_index_preservation(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
     classes = [GenericScanKernel]
 
     dev = context.devices[0]