diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index c877236595d55f02c2357aeae25a62f6c1f957a3..c61515baee605ee680abafaebcb510a07ca58f46 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -407,9 +407,14 @@ class LoopyCCodeMapper(RecursiveMapper):
                                 expr.aggregate.name, expr,
                                 len(index_expr), len(arg.strides)))
 
-                from pymbolic.primitives import Subscript
+                from pymbolic.primitives import Subscript, Variable
+                if arg.offset:
+                    offset = Variable(arg.offset)
+                else:
+                    offset = 0
+
                 return base_impl(
-                        Subscript(expr.aggregate, arg.offset+sum(
+                        Subscript(expr.aggregate, offset+sum(
                             stride*expr_i for stride, expr_i in zip(
                                 arg.strides, index_expr))),
                         enclosing_prec, type_context)
@@ -450,9 +455,14 @@ class LoopyCCodeMapper(RecursiveMapper):
 
             else:
                 # GlobalArg
+                # 'Variable' is only imported inside a different branch further
+                # up (which returns), so it must be imported here before use.
+                from pymbolic.primitives import Variable
+                offset = Variable(arg.offset) if arg.offset else 0
+
                 from pymbolic.primitives import Subscript
                 return base_impl(
-                        Subscript(expr.aggregate, arg.offset+expr.index),
+                        Subscript(expr.aggregate, offset+expr.index),
                         enclosing_prec, type_context)
 
         elif expr.aggregate.name in self.kernel.temporary_variables:
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 59630a9a41d800fd6da654dca0ea8eea60f4bc58..e844cf87ac30ffe32c438e9ed5d15a0073b52918 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -62,6 +62,9 @@ def _arg_matches_spec(arg, val, other_args):
                     "(got: %s, expected: %s)"
                     % (arg.name, val.strides, strides))
 
+        if val.offset != 0 and arg.offset == 0:
+            raise ValueError("argument '%s' does not allow offset" % arg.name)
+
     return True
 
 # }}}
@@ -120,20 +123,39 @@ class CompiledKernel:
         self.options = options
 
     @memoize_method
-    def get_kernel_info(self, dtype_mapping_set):
+    def get_kernel_info(self, arg_to_dtype_set, arg_to_has_offset_set):
         kernel = self.kernel
 
+        import loopy as lp
         from loopy.kernel.tools import (
                 add_argument_dtypes,
                 get_arguments_with_incomplete_dtype)
 
-        if get_arguments_with_incomplete_dtype(kernel):
-            if dtype_mapping_set is not None:
-                kernel = add_argument_dtypes(kernel, dict(dtype_mapping_set))
+        if arg_to_dtype_set:
+            kernel = add_argument_dtypes(kernel, dict(arg_to_dtype_set))
 
             from loopy.preprocess import infer_unknown_types
             kernel = infer_unknown_types(kernel, expect_completion=True)
 
+        # Always resolve 'offset=lp.auto' arguments--also when the caller
+        # supplied no (or only partial) offset information, in which case
+        # "no offset" is assumed--so that lp.auto never reaches codegen.
+        arg_to_has_offset = dict(arg_to_has_offset_set or ())
+        vng = kernel.get_var_name_generator()
+
+        new_args = []
+        for arg in kernel.args:
+            if getattr(arg, "offset", None) is not lp.auto:
+                new_args.append(arg)
+            elif arg_to_has_offset.get(arg.name, False):
+                offset_arg_name = vng(arg.name+"_offset")
+                new_args.append(arg.copy(offset=offset_arg_name))
+                new_args.append(lp.ValueArg(offset_arg_name, kernel.index_dtype))
+            else:
+                new_args.append(arg.copy(offset=0))
+
+        kernel = kernel.copy(args=new_args)
+
         import loopy as lp
         if kernel.schedule is None:
             kernel = _get_kernel_from_iterable(
@@ -156,8 +178,8 @@ class CompiledKernel:
                 )
 
     @memoize_method
-    def get_cl_kernel(self, dtype_mapping_set):
-        kernel_info = self.get_kernel_info(dtype_mapping_set)
+    def get_cl_kernel(self, arg_to_dtype_set, arg_to_has_offset_set):
+        kernel_info = self.get_kernel_info(arg_to_dtype_set, arg_to_has_offset_set)
         kernel = kernel_info.kernel
 
         from loopy.codegen import generate_code
@@ -199,17 +221,19 @@ class CompiledKernel:
 
     # {{{ debugging aids
 
-    def get_code(self, dtype_dict=None):
-        if dtype_dict is not None:
-            dtype_dict = frozenset(dtype_dict.items())
+    def get_code(self, arg_to_dtype=None, arg_to_has_offset=None):
+        if arg_to_dtype is not None:
+            arg_to_dtype = frozenset(arg_to_dtype.iteritems())
+        if arg_to_has_offset is not None:
+            arg_to_has_offset = frozenset(arg_to_has_offset.iteritems())
 
-        kernel_info = self.get_kernel_info(dtype_dict)
+        kernel_info = self.get_kernel_info(arg_to_dtype, arg_to_has_offset)
 
         from loopy.codegen import generate_code
         return generate_code(kernel_info.kernel, **self.codegen_kwargs)
 
-    def get_highlighted_code(self, dtype_dict=None):
-        return get_highlighted_code(self.get_code(dtype_dict))
+    def get_highlighted_code(self, arg_to_dtype=None, arg_to_has_offset=None):
+        return get_highlighted_code(self.get_code(arg_to_dtype, arg_to_has_offset))
 
     @property
     def code(self):
@@ -224,6 +248,10 @@ class CompiledKernel:
     def __call__(self, queue, **kwargs):
         """If all array arguments are :mod:`numpy` arrays, defaults to returning
         numpy arrays as well.
+
+        If you want offset arguments (see
+        :attr:`loopy.kernel.data.GlobalArg.offset`) to be set automatically, they
+        must occur *after* their corresponding array arguments.
         """
 
         allocator = kwargs.pop("allocator", None)
@@ -234,20 +262,32 @@ class CompiledKernel:
 
         # {{{ process arg types, get cl kernel
 
-        dtype_dict = {}
+        import loopy as lp
+
+        arg_to_dtype = {}
+        arg_to_has_offset = {}
         for arg in self.kernel.args:
             val = kwargs.get(arg.name)
-            if val is not None:
+
+            if arg.dtype is None and val is not None:
                 try:
                     dtype = val.dtype
                 except AttributeError:
                     pass
                 else:
-                    dtype_dict[arg.name] = dtype
+                    arg_to_dtype[arg.name] = dtype
 
-        kernel_info, cl_kernel = self.get_cl_kernel(frozenset(dtype_dict.iteritems()))
+            if getattr(arg, "offset", None) is lp.auto:
+                # Host-side numpy arrays (transferred to the device further
+                # down) and missing values have no 'offset' attribute; treat
+                # both as offset-free instead of raising AttributeError.
+                has_offset = getattr(val, "offset", 0) != 0
+                arg_to_has_offset[arg.name] = has_offset
+
+        kernel_info, cl_kernel = self.get_cl_kernel(frozenset(arg_to_dtype.iteritems()),
+                frozenset(arg_to_has_offset.iteritems()))
         kernel = kernel_info.kernel
-        del dtype_dict
+        del arg_to_dtype
 
         # }}}
 
@@ -267,8 +307,22 @@ class CompiledKernel:
 
             val = kwargs_copy.pop(arg.name, None)
 
-            # automatically transfer host-side arrays
             if isinstance(arg, lp.GlobalArg):
+                if arg.offset:
+                    # arg.offset must be a string at this point.
+                    # /!\ Tacit assumption: If you want the offset argument to
+                    # be set automatically, it must occur *after* the
+                    # corresponding array argument.
+
+                    ofs, remdr = divmod(val.offset, val.dtype.itemsize)
+                    if remdr != 0:
+                        raise ValueError("offset of argument '%s' is not a "
+                                "multiple of its item size" % arg.name)
+                    kwargs_copy.setdefault(arg.offset, ofs)
+                    del ofs, remdr
+
+                # {{{ automatically transfer host-side arrays, if needed
+
                 if isinstance(val, np.ndarray):
                     # synchronous, so nothing to worry about
                     val = cl_array.to_device(queue, val, allocator=allocator)
@@ -279,6 +333,8 @@ class CompiledKernel:
                                 "performing implicit transfer" % arg.name,
                                 stacklevel=2)
 
+                # }}}
+
             if val is None:
                 if not is_written:
                     raise TypeError("must supply input argument '%s'" % arg.name)
@@ -311,7 +367,7 @@ class CompiledKernel:
                 outputs.append(val)
 
             if isinstance(arg, lp.GlobalArg):
-                args.append(val.data)
+                args.append(val.base_data)
             else:
                 args.append(val)
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 8d340114294b53dc822d9e2fe0dd0309e32e8ce4..12c7b196ad5919e918e9f68f138fce29697d8859 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -439,7 +439,7 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables, s
             # It's not a temp var, and thereby not a domain parameter--the only
             # other writable type of variable is an argument.
 
-            kernel_args.append(GlobalArg(arg_name, shape=lp.auto))
+            kernel_args.append(GlobalArg(arg_name, shape=lp.auto, offset=lp.auto))
             continue
 
         irank = find_index_rank(arg_name)
@@ -447,7 +447,7 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables, s
             # read-only, no indices
             kernel_args.append(ValueArg(arg_name))
         else:
-            kernel_args.append(GlobalArg(arg_name, shape=lp.auto))
+            kernel_args.append(GlobalArg(arg_name, shape=lp.auto, offset=lp.auto))
 
     return kernel_args
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index fd0c5b6328b845256b69bcf94e999a7bacd87970..c174ca70161e2fe06634aaa374207ba751100fa1 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -161,6 +161,16 @@ class ShapedArg(KernelArgument):
             commas, in which case multiple arguments,
             each with identical properties are created
             for each name.
+        :arg dtype: the :class:`numpy.dtype` of the array.
+            If this is *None*, :mod:`loopy` will try to continue
+            without knowing the type of this array.
+
+            Note that some operations, such as :func:`loopy.add_padding`,
+            require this information to work.
+
+            :class:`loopy.CompiledKernel` will automatically compile a kernel
+            with the right dtype when called with a concrete array on a kernel
+            with an argument whose *dtype* is *None*.
         :arg shape: like :attr:`numpy.ndarray.shape`.
             Also allowed to be :class:`loopy.auto`, in
             which case shape is determined by finding the
@@ -180,8 +190,13 @@ class ShapedArg(KernelArgument):
             string that can be parsed to such an expression.
         :arg order: "F" or "C" for C (row major) or Fortran
             (column major)
-        :arg offset: Offset from the beginning of the vector from which
-            the strides are counted.
+        :arg offset: Offset from the beginning of the buffer to the point from
+            which the strides are counted. May be one of
+
+            * 0
+            * a string (that is interpreted as an argument name).
+            * :class:`loopy.auto`, in which case this information is added at run time
+              by :class:`loopy.CompiledKernel`.
         """
         if dtype is not None:
             dtype = np.dtype(dtype)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6a2aa8676e8eb59bcb3dcfca467dba4eeeb93fcd..90dac0f97d4bfea6f4b76cdfc3988dce8ca5cac6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -28,6 +28,7 @@ THE SOFTWARE.
 import numpy as np
 import loopy as lp
 import pyopencl as cl
+import pyopencl.clrandom
 import logging
 
 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -975,11 +976,6 @@ def test_double_sum(ctx_factory):
                 "a = sum((i,j), i*j)",
                 "b = sum(i, sum(j, i*j))",
                 ],
-            [
-                lp.GlobalArg("a", dtype, shape=()),
-                lp.GlobalArg("b", dtype, shape=()),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
             assumptions="n>=1")
 
     cknl = lp.CompiledKernel(ctx, knl)
@@ -1182,6 +1178,35 @@ def test_triangle_domain(ctx_factory):
 
 
 
+def test_array_with_offset(ctx_factory):
+    """Run a kernel on a slice of a device array and check the result."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 5
+
+    knl = lp.make_kernel(ctx.devices[0], [
+            "{[i,j]: 0<=i<n and 0<=j<m }",
+            ],
+            """
+                b[i,j] = 2*a[i,j]
+                """,
+            assumptions="n>=1 and m>=1")
+
+    cknl = lp.CompiledKernel(ctx, knl)
+
+    a_full = cl.clrandom.rand(queue, (n, n), np.float64)
+    a = a_full[3:10]
+
+    print cknl.get_highlighted_code({"a": a.dtype}, {"a": True, "b": False})
+    evt, (b,) = cknl(queue, a=a, n=a.shape[0], m=a.shape[1])
+
+    import numpy.linalg as la
+    assert la.norm(b.get() - 2*a.get()) < 1e-13
+
+
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1: