diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index b9b65f50dc9c8d296bfec2adc1b10b130a9d0e62..efc1fa3bbfb9f67d2be72f74c8be33679e2b9beb 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -52,7 +52,7 @@ class TestArgInfo(Record):
 
 # {{{ "reference" arguments
 
-def make_ref_args(kernel, cl_arg_info, queue, parameters, fill_value):
+def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value):
     from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
 
     from pymbolic import evaluate
@@ -60,7 +60,7 @@ def make_ref_args(kernel, cl_arg_info, queue, parameters, fill_value):
     ref_args = {}
     ref_arg_data = []
 
-    for arg in cl_arg_info:
+    for arg in impl_arg_info:
         if arg.arg_class is ValueArg:
             if arg.offset_for_name:
                 continue
@@ -157,14 +157,14 @@ def make_ref_args(kernel, cl_arg_info, queue, parameters, fill_value):
 
 # {{{ "full-scale" arguments
 
-def make_args(kernel, cl_arg_info, queue, ref_arg_data, parameters,
+def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters,
         fill_value):
     from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
 
     from pymbolic import evaluate
 
     args = {}
-    for arg, arg_desc in zip(cl_arg_info, ref_arg_data):
+    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
         if arg.arg_class is ValueArg:
             arg_value = parameters[arg.name]
 
@@ -339,7 +339,7 @@ def auto_test_vs_ref(
         message) indicating correctness/acceptability of the result
     """
 
-    from loopy.compiled import CompiledKernel, get_highlighted_code
+    from loopy.compiled import CompiledKernel, get_highlighted_cl_code
 
     if isinstance(op_count, (int, float)):
         from warnings import warn
@@ -395,14 +395,14 @@ def auto_test_vs_ref(
             print 75*"-"
             print "Reference Code:"
             print 75*"-"
-            print get_highlighted_code(ref_compiled.code)
+            print get_highlighted_cl_code(ref_compiled.code)
             print 75*"-"
 
         ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())
 
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel, ref_cl_kernel_info.cl_arg_info,
+                    make_ref_args(ref_sched_kernel, ref_cl_kernel_info.impl_arg_info,
                             ref_queue, parameters,
                             fill_value=fill_value_ref)
             ref_args["out_host"] = False
@@ -481,7 +481,7 @@ def auto_test_vs_ref(
         if args is None:
             cl_kernel_info = compiled.cl_kernel_info(frozenset())
 
-            args = make_args(kernel, cl_kernel_info.cl_arg_info,
+            args = make_args(kernel, cl_kernel_info.impl_arg_info,
                     queue, ref_arg_data, parameters, fill_value=fill_value)
         args["out_host"] = False
 
diff --git a/loopy/check.py b/loopy/check.py
index ae1d2a925fbfceaa01e6a1352d78c71615109dcf..87fcb1e862ec15366c33f09ea0607300e9c4d199 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -452,8 +452,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 print 79*"-"
                 print "CODE:"
                 print 79*"-"
-                from loopy.compiled import get_highlighted_code
-                print get_highlighted_code(code)
+                from loopy.compiled import get_highlighted_cl_code
+                print get_highlighted_cl_code(code)
                 print 79*"-"
 
             raise RuntimeError("sanity check failed--implemented and desired "
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index cdc917743c8866114825e0f56a6f999b2303f59c..3541e8f222c292bd1f90e6b2e0bbcafc9b787f4b 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -26,6 +26,8 @@ THE SOFTWARE.
 from pytools import Record
 import islpy as isl
 
+import numpy as np
+
 
 # {{{ support code for AST wrapper objects
 
@@ -215,21 +217,58 @@ class POD(PODBase):
 # }}}
 
 
-class CLArgumentInfo(Record):
+class ImplementedDataInfo(Record):
     """
     .. attribute:: name
-    .. attribute:: base_name
+
+        The expanded name of the array. Note that, for example
+        in the case of separate-array-tagged axes, multiple
+        implemented arrays may correspond to one user-facing
+        array.
+
     .. attribute:: dtype
+    .. attribute:: cgen_declarator
+
+        Declarator syntax tree as a :mod:`cgen` object.
+
+    .. attribute:: arg_class
+
+    .. attribute:: base_name
+
+        The user-facing name of the underlying array.
+        May be *None* for non-array arguments.
+
     .. attribute:: shape
     .. attribute:: strides
 
         Strides in multiples of ``dtype.itemsize``.
 
     .. attribute:: offset_for_name
+    .. attribute:: stride_for_name_and_axis
+
+        A tuple *(name, axis)* indicating the (implementation-facing)
+        name of the array and axis number for which this argument provides
+        the strides.
+
     .. attribute:: allows_offset
-    .. attribute:: arg_class
     """
 
+    def __init__(self, name, dtype, cgen_declarator, arg_class,
+            base_name=None, shape=None, strides=None,
+            offset_for_name=None, stride_for_name_and_axis=None,
+            allows_offset=None):
+        Record.__init__(self,
+                name=name,
+                dtype=np.dtype(dtype),
+                cgen_declarator=cgen_declarator,
+                arg_class=arg_class,
+                base_name=base_name,
+                shape=shape,
+                strides=strides,
+                offset_for_name=offset_for_name,
+                stride_for_name_and_axis=stride_for_name_and_axis,
+                allows_offset=allows_offset)
+
 
 # {{{ main code generation entrypoint
 
@@ -264,27 +303,20 @@ def generate_code(kernel, with_annotation=False,
     from loopy.kernel.data import ImageArg, ValueArg
     from loopy.kernel.array import ArrayBase
 
-    arg_decls = []
-    cl_arg_info = []
+    impl_arg_info = []
 
     for arg in kernel.args:
         if isinstance(arg, ArrayBase):
-            for cdecl, clai in arg.decl_info(
-                    is_written=arg.name in kernel.get_written_variables(),
-                    index_dtype=kernel.index_dtype):
-                arg_decls.append(cdecl)
-                cl_arg_info.append(clai)
+            impl_arg_info.extend(
+                    arg.decl_info(
+                        is_written=arg.name in kernel.get_written_variables(),
+                        index_dtype=kernel.index_dtype))
 
         elif isinstance(arg, ValueArg):
-            arg_decls.append(Const(POD(arg.dtype, arg.name)))
-            cl_arg_info.append(CLArgumentInfo(
+            impl_arg_info.append(ImplementedDataInfo(
                 name=arg.name,
-                base_name=arg.name,
                 dtype=arg.dtype,
-                shape=None,
-                strides=None,
-                offset_for_name=None,
-                allows_offset=None,
+                cgen_declarator=Const(POD(arg.dtype, arg.name)),
                 arg_class=ValueArg))
 
         else:
@@ -307,10 +339,11 @@ def generate_code(kernel, with_annotation=False,
 
     # {{{ build lmem array declarators for temporary variables
 
-    for tv in kernel.temporary_variables.itervalues():
-        for cdecl, clai in tv.decl_info(
-                is_written=True, index_dtype=kernel.index_dtype):
-            body.append(cdecl)
+    body.extend(
+            idi.cgen_declarator
+            for tv in kernel.temporary_variables.itervalues()
+            for idi in tv.decl_info(
+                is_written=True, index_dtype=kernel.index_dtype))
 
     # }}}
 
@@ -333,7 +366,8 @@ def generate_code(kernel, with_annotation=False,
             CLRequiredWorkGroupSize(
                 kernel.get_grid_sizes_as_exprs()[1],
                 CLKernel(FunctionDeclaration(
-                    Value("void", kernel.name), arg_decls))),
+                    Value("void", kernel.name),
+                    [iai.cgen_declarator for iai in impl_arg_info]))),
             body))
 
     # {{{ handle preambles
@@ -368,12 +402,9 @@ def generate_code(kernel, with_annotation=False,
     assert check_implemented_domains(kernel, gen_code.implemented_domains,
             result)
 
-    return result, cl_arg_info
+    return result, impl_arg_info
 
 # }}}
 
 
-
-
-
 # vim: foldmethod=marker
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index 6baae343efbfedf41641e40ded5c34e88c68ddfc..abe6dfc371f26b2c7f9a8ed6ade0246a0357b60a 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -438,26 +438,26 @@ class LoopyCCodeMapper(RecursiveMapper):
                 if isinstance(ary, GlobalArg):
                     # unsubscripted global args are pointers
                     if vec_member is not None:
-                        return "%s%s->%s" % (
-                                expr.aggregate.name, access_info.array_suffix,
+                        return "%s->%s" % (
+                                access_info.array_name,
                                 vec_member)
                     else:
-                        return "*" + expr.aggregate.name+access_info.array_suffix
+                        return "*" + access_info.array_name
 
                 else:
                     # unsubscripted temp vars are scalars
                     if vec_member is not None:
-                        return "%s%s.%s" % (
-                                expr.aggregate.name, access_info.array_suffix,
+                        return "%s.%s" % (
+                                access_info.array_name,
                                 vec_member)
                     else:
-                        return expr.aggregate.name+access_info.array_suffix
+                        return access_info.array_name
 
             else:
                 subscript, = access_info.subscripts
                 result = self.parenthesize_if_needed(
                         "%s[%s]" % (
-                            expr.aggregate.name+access_info.array_suffix,
+                            access_info.array_name,
                             self.rec(subscript, PREC_NONE, 'i')),
                         enclosing_prec, PREC_CALL)
 
diff --git a/loopy/compiled.py b/loopy/compiled.py
index c7535ccaddaf19024e62a896763676ef31782a21..28481f7a8c6a2129256ac5f34ba543fec24cba70 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -1,4 +1,4 @@
-from __future__ import division
+from __future__ import division, with_statement
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -24,10 +24,8 @@ THE SOFTWARE.
 
 
 import pyopencl as cl
-import pyopencl.array as cl_array
-
+import pyopencl.tools  # noqa
 import numpy as np
-
 from pytools import Record, memoize_method
 
 
@@ -52,16 +50,13 @@ class SeparateArrayPackingController(object):
     passed to the kernel.
 
     It also repacks outgoing arrays of this type back into an object array.
-
-    .. attribute:: arg_name_to_base_arg_name
     """
 
     def __init__(self, kernel):
         # map from arg name
         self.packing_info = {}
-        self.arg_name_to_base_arg_name = {}
 
-        from loopy.kernel.array import ArrayBase, SeparateArrayArrayDimTag
+        from loopy.kernel.array import ArrayBase
         for arg in kernel.args:
             if not isinstance(arg, ArrayBase):
                 continue
@@ -69,32 +64,17 @@ class SeparateArrayPackingController(object):
             if arg.shape is None or arg.dim_tags is None:
                 continue
 
-            sep_shape = []
-            for shape_i, dim_tag in zip(arg.shape, arg.dim_tags):
-                if isinstance(dim_tag, SeparateArrayArrayDimTag):
-                    if not isinstance(shape_i, int):
-                        raise TypeError("argument '%s' has non-fixed-size "
-                                "separate-array axis" % arg.name)
+            subscripts_and_names = arg.subscripts_and_names()
 
-                    sep_shape.append(shape_i)
-
-            if not sep_shape:
+            if subscripts_and_names is None:
                 continue
 
-            from pytools import indices_in_shape
-            subscripts_and_names = [
-                    (i, arg.name + "".join("_s%d" % sub_i for sub_i in i))
-                    for i in indices_in_shape(sep_shape)]
-
             self.packing_info[arg.name] = _PackingInfo(
                     name=arg.name,
-                    sep_shape=sep_shape,
+                    sep_shape=arg.sep_shape(),
                     subscripts_and_names=subscripts_and_names,
                     is_written=arg.name in kernel.get_written_variables())
 
-            for index, sub_arg_name in subscripts_and_names:
-                self.arg_name_to_base_arg_name[sub_arg_name] = arg.name
-
     def unpack(self, kernel_kwargs):
         if not self.packing_info:
             return kernel_kwargs
@@ -131,101 +111,550 @@ class SeparateArrayPackingController(object):
 # }}}
 
 
-# {{{ domain parameter finder
-
-class DomainParameterFinder(object):
-    """Finds domain parameters from shapes of passed arguments."""
-
-    def __init__(self, kernel, cl_arg_info):
-        # a mapping from parameter names to a list of tuples
-        # (arg_name, axis_nr, function), where function is a
-        # unary function of kernel.arg_dict[arg_name].shape[axis_nr]
-        # returning the desired parameter.
-        self.param_to_sources = param_to_sources = {}
-
-        from loopy.kernel.data import GlobalArg
-        from loopy.symbolic import DependencyMapper
-        from pymbolic import compile
-        dep_map = DependencyMapper()
-
-        from pymbolic import var
-        for arg in cl_arg_info:
-            if arg.arg_class is GlobalArg:
-                for axis_nr, shape_i in enumerate(arg.shape):
-                    deps = dep_map(shape_i)
-                    if len(deps) == 1:
-                        dep, = deps
-
-                        if kernel.arg_dict[dep.name].dtype.kind == "i":
-                            from pymbolic.algorithm import solve_affine_equations_for
-                            try:
-                                # friggin' overkill :)
-                                param_expr = solve_affine_equations_for(
-                                        [dep.name], [(shape_i, var("shape_i"))]
-                                        )[dep.name]
-                            except:
-                                # went wrong? oh well
-                                pass
-                            else:
-                                param_func = compile(param_expr, ["shape_i"])
-                                param_to_sources.setdefault(dep.name, []).append(
-                                        (arg.name, axis_nr, param_func))
-
-    def __call__(self, kwargs):
-        result = {}
-
-        for param_name, sources in self.param_to_sources.iteritems():
-            if param_name not in kwargs:
-                for arg_name, axis_nr, shape_func in sources:
-                    if arg_name in kwargs:
-                        try:
-                            shape_axis = kwargs[arg_name].shape[axis_nr]
-                        except IndexError:
-                            raise RuntimeError("Argument '%s' has unexpected shape. "
-                                    "Tried to access axis %d (0-based), only %d "
-                                    "axes present." %
-                                    (arg_name, axis_nr, len(kwargs[arg_name].shape)))
+# {{{ python code generation helpers
+
+class Indentation(object):
+    def __init__(self, generator):
+        self.generator = generator
+
+    def __enter__(self):
+        self.generator.indent()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.generator.dedent()
 
-                        result[param_name] = shape_func(shape_axis)
-                        continue
 
+class PythonCodeGenerator(object):
+    def __init__(self):
+        self.preamble = []
+        self.code = []
+        self.level = 0  # current indentation depth (4 spaces per level)
+
+    def extend(self, sub_generator):
+        self.code.extend(sub_generator.code)
+
+    def extend_indent(self, sub_generator):
+        with Indentation(self):
+            for line in sub_generator.code:
+                self(line)  # emit via __call__; this class has no write()
+
+    def get(self):
+        result = "\n".join(self.code)
+        if self.preamble:
+            result = "\n".join(self.preamble) + "\n" + result
         return result
 
+    def add_to_preamble(self, s):
+        self.preamble.append(s)
+
+    def __call__(self, string):
+        self.code.append(" "*(4*self.level) + string)
+
+    def indent(self):
+        self.level += 1
+
+    def dedent(self):
+        if self.level == 0:
+            raise RuntimeError("internal error in python code generator")
+        self.level -= 1
+
+
+class PythonFunctionGenerator(PythonCodeGenerator):
+    def __init__(self, name, args):
+        PythonCodeGenerator.__init__(self)
+        self.name = name
+
+        self("def %s(%s):" % (name, ", ".join(args)))
+        self.indent()
+
+    def get_function(self):
+        result_dict = {}
+        exec(compile(self.get(), "<generated function %s>" % self.name, "exec"),
+                result_dict)
+        return result_dict[self.name]
+
+
+# }}}
+
+
+# {{{ invoker generation
+
+# /!\ This code runs in a namespace controlled by the user.
+# Prefix all auxiliary variables with "_lpy".
+
+
+def python_dtype_str(dtype):
+    if dtype.isbuiltin:
+        return "_lpy_np."+dtype.name
+    else:
+        return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
+                % cl.tools.dtype_to_ctype(dtype))
+
+
+# {{{ integer arg finding from shapes
+
+def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, flags):
+    # a mapping from integer argument names to a list of tuples
+    # (arg_name, expression), where expression is a symbolic
+    # expression in terms of the shape of the argument *arg_name*
+    # yielding the desired integer argument.
+    iarg_to_sources = {}
+
+    from loopy.kernel.data import GlobalArg
+    from loopy.symbolic import DependencyMapper, StringifyMapper
+    dep_map = DependencyMapper()
+
+    from pymbolic import var
+    for arg in impl_arg_info:
+        if arg.arg_class is GlobalArg:
+            sym_shape = var(arg.name).attr("shape")
+            for axis_nr, shape_i in enumerate(arg.shape):
+                deps = dep_map(shape_i)
+
+                if len(deps) == 1:
+                    integer_arg_var, = deps
+
+                    if kernel.arg_dict[integer_arg_var.name].dtype.kind == "i":
+                        from pymbolic.algorithm import solve_affine_equations_for
+                        try:
+                            # friggin' overkill :)
+                            iarg_expr = solve_affine_equations_for(
+                                    [integer_arg_var.name],
+                                    [(shape_i, sym_shape[axis_nr])]
+                                    )[integer_arg_var]
+                        except:
+                            # went wrong? oh well
+                            pass
+                        else:
+                            iarg_to_sources.setdefault(integer_arg_var.name, []) \
+                                    .append((arg.name, iarg_expr))
+
+    gen("# {{{ find integer arguments from shapes")
+    gen("")
+
+    for iarg_name, sources in iarg_to_sources.iteritems():
+        gen("if %s is None:" % iarg_name)
+        with Indentation(gen):
+            if_stmt = "if"
+            for arg_name, value_expr in sources:
+                gen("%s %s is not None:" % (if_stmt, arg_name))
+                with Indentation(gen):
+                    gen("%s = %s"
+                            % (iarg_name, StringifyMapper()(value_expr)))
+
+                if_stmt = "elif"
+
+        gen("")
+
+    gen("# }}}")
+    gen("")
+
 # }}}
 
 
-# {{{ argument checking
+# {{{ integer arg finding from offsets
+
+def generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, flags):
+    gen("# {{{ find integer arguments from offsets")
+    gen("")
+
+    for arg in impl_arg_info:
+        impl_array_name = arg.offset_for_name
+        if impl_array_name is not None:
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                gen("if %s is None:" % impl_array_name)
+                with Indentation(gen):
+                    gen("# Output variable, we'll be allocating "
+                            "it, with zero offset.")
+                    gen("%s = 0" % arg.name)
+                gen("else:")
+                with Indentation(gen):
+                    if flags.allow_numpy:
+                        gen("_lpy_offset = getattr(%s, \"offset\", 0)"
+                                % impl_array_name)
+                    else:
+                        gen("_lpy_offset = %s.offset" % impl_array_name)
+
+                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                    if flags.paranoid:
+                        gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
+                                % (arg.name, base_arg.dtype.itemsize))
+
+                        gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
+                                "not divisible by its dtype itemsize\""
+                                % impl_array_name)
+                        gen("del _lpy_remdr")
+                    else:
+                        gen("%s = _lpy_offset // %d"
+                                % (arg.name, base_arg.dtype.itemsize))
+
+                    if flags.paranoid:
+                        gen("del _lpy_offset")
+
+    gen("# }}}")
+    gen("")
+
+# }}}
 
-def _arg_matches_spec(arg, val, other_args):
+
+# {{{ integer arg finding from strides
+
+def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, flags):
+    gen("# {{{ find integer arguments from strides")
+    gen("")
+
+    for arg in impl_arg_info:
+        if arg.stride_for_name_and_axis is not None:
+            impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
+
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                if flags.paranoid:
+                    gen("if %s is None:" % impl_array_name)
+                    with Indentation(gen):
+                        gen("raise RuntimeError(\"required stride '%s' for "
+                                "argument '%s' not given or deducible from "
+                                "passed array\")"
+                                % (arg.name, impl_array_name))
+
+                # The stride computation must be emitted in either mode:
+                # otherwise the generated "if ... is None:" has no body.
+                base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                if flags.paranoid:
+                    gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
+                            % (arg.name, impl_array_name, stride_impl_axis,
+                                base_arg.dtype.itemsize))
+
+                    gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is "
+                            "not divisible by its dtype itemsize\""
+                            % (stride_impl_axis, impl_array_name))
+                    gen("del _lpy_remdr")
+                else:
+                    gen("%s = %s.strides[%d] // %d"
+                            % (arg.name, impl_array_name, stride_impl_axis,
+                                base_arg.dtype.itemsize))
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+# {{{ arg setup
+
+def generate_arg_setup(gen, kernel, impl_arg_info, flags):
     import loopy as lp
-    if arg.shape is not None and arg.arg_class is not lp.ImageArg:
-        from pymbolic import evaluate
-
-        if arg.dtype != val.dtype:
-            raise TypeError("dtype mismatch on argument '%s' "
-                    "(got: %s, expected: %s)"
-                    % (arg.name, val.dtype, arg.dtype))
-
-        if arg.shape is not None:
-            shape = evaluate(arg.shape, other_args)
-            if shape != val.shape:
-                raise TypeError("shape mismatch on argument '%s' "
-                        "(got: %s, expected: %s)"
-                        % (arg.name, val.shape, shape))
-
-        itemsize = arg.dtype.itemsize
-        strides = tuple(itemsize*i for i in evaluate(arg.strides, other_args))
-        if strides != tuple(val.strides):
-            raise ValueError("strides mismatch on argument '%s' "
-                    "(got: %s, expected: %s)"
-                    % (arg.name, val.strides, strides))
-
-        if val.offset != 0 and not arg.allows_offset:
-            raise ValueError("Argument '%s' does not allow arrays "
-                    "with offsets. Try passing default_offset=loopy.auto "
-                    "to make_kernel()." % arg.name)
-
-    return True
+
+    gen("# {{{ set up arguments")
+    gen("")
+
+    if flags.allow_numpy:
+        gen("_lpy_encountered_numpy = False")
+        gen("_lpy_encountered_dev = False")
+        gen("")
+
+    from loopy.kernel.array import ArrayBase
+    from loopy.symbolic import StringifyMapper
+    from pymbolic import var
+
+    strify = StringifyMapper()
+
+    for arg_idx, arg in enumerate(impl_arg_info):
+        is_written = arg.base_name in kernel.get_written_variables()
+        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
+
+        gen("# {{{ process %s" % arg.name)
+        gen("")
+
+        if issubclass(arg.arg_class, ArrayBase):
+            if flags.allow_numpy:
+                gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
+                with Indentation(gen):
+                    gen("# synchronous, nothing to worry about")
+                    gen("%s = _lpy_cl_array.to_device("
+                            "queue, %s, allocator=allocator)"
+                            % (arg.name, arg.name))
+                    gen("_lpy_encountered_numpy = True")
+                gen("else:")
+                with Indentation(gen):
+                    gen("_lpy_encountered_dev = True")
+
+                gen("")
+
+        if arg.arg_class is lp.ValueArg:
+            if flags.paranoid:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"input argument '%s' must "
+                            "be supplied\")" % arg.name)
+                    gen("")
+            if arg.dtype.kind == "i":
+                gen("# cast to int to avoid numpy scalar trouble with Boost.Python")
+                gen("%s = int(%s)" % (arg.name, arg.name))
+                gen("")
+
+        else:
+            if flags.paranoid and not is_written:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"input argument '%s' must "
+                            "be supplied\")" % arg.name)
+                    gen("")
+
+            if is_written and arg.arg_class is lp.ImageArg and flags.paranoid:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"written image '%s' must "
+                            "be supplied\")" % arg.name)
+                    gen("")
+
+            if is_written and arg.shape is None and flags.paranoid:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"written argument '%s' has "
+                            "unknown shape and must be supplied\")" % arg.name)
+                    gen("")
+
+            possibly_made_by_loopy = False
+
+            # {{{ allocate written arrays, if needed
+
+            if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                    and arg.shape is not None:
+
+                possibly_made_by_loopy = True
+                gen("_lpy_made_by_loopy = False")
+                gen("")
+
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    num_axes = len(arg.strides)
+                    for i in xrange(num_axes):
+                        gen("_lpy_shape_%d = %s" % (i, strify(arg.shape[i])))
+
+                    itemsize = kernel_arg.dtype.itemsize
+                    for i in xrange(num_axes):
+                        gen("_lpy_strides_%d = %s" % (i, strify(
+                            itemsize*arg.strides[i])))
+
+                    if flags.paranoid:
+                        for i in xrange(num_axes):
+                            gen("assert _lpy_strides_%d > 0, "
+                                    "\"'%s' has negative stride in axis %d\""
+                                    % (i, arg.name, i))
+
+                    sym_strides = tuple(
+                            var("_lpy_strides_%d" % i)
+                            for i in xrange(num_axes))
+                    sym_shape = tuple(
+                            var("_lpy_shape_%d" % i)
+                            for i in xrange(num_axes))
+
+                    alloc_size_expr = (sum(astrd*(alen-1)
+                        for alen, astrd in zip(sym_shape, sym_strides))
+                        + itemsize)
+
+                    gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
+                    gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
+                            "%(dtype)s, strides=%(strides)s, "
+                            "data=allocator(_lpy_alloc_size), allocator=allocator)"
+                            % dict(
+                                name=arg.name,
+                                shape=strify(sym_shape),
+                                strides=strify(sym_strides),
+                                dtype=python_dtype_str(arg.dtype)))
+
+                    if flags.paranoid:
+                        for i in xrange(num_axes):
+                            gen("del _lpy_shape_%d" % i)
+                            gen("del _lpy_strides_%d" % i)
+                        gen("del _lpy_alloc_size")
+                        gen("")
+
+                    gen("_lpy_made_by_loopy = True")
+                    gen("")
+
+            # }}}
+
+            # {{{ argument checking
+
+            if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                    and flags.paranoid:
+                if possibly_made_by_loopy:
+                    gen("if not _lpy_made_by_loopy:")
+                else:
+                    gen("if True:")
+
+                with Indentation(gen):
+                    gen("if %s.dtype != %s:"
+                            % (arg.name, python_dtype_str(arg.dtype)))
+                    with Indentation(gen):
+                        gen("raise TypeError(\"dtype mismatch on argument '%s' "
+                                "(got: %%s, expected: %s)\" %% %s.dtype)"
+                                % (arg.name, arg.dtype, arg.name))
+
+                    if arg.shape is not None:
+                        gen("if %s.shape != %s:"
+                                % (arg.name, strify(arg.shape)))
+                        with Indentation(gen):
+                            gen("raise TypeError(\"shape mismatch on argument '%s' "
+                                    "(got: %%s, expected: %%s)\" "
+                                    "%% (%s.shape, %s))"
+                                    % (arg.name, arg.name, strify(arg.shape)))
+
+                    if arg.strides is not None:
+                        itemsize = kernel_arg.dtype.itemsize
+                        sym_strides = tuple(itemsize*s_i for s_i in arg.strides)
+                        gen("if %s.strides != %s:"
+                                % (arg.name, strify(sym_strides)))
+                        with Indentation(gen):
+                            gen("raise TypeError(\"strides mismatch on "
+                                    "argument '%s' (got: %%s, expected: %%s)\" "
+                                    "%% (%s.strides, %s))"
+                                    % (arg.name, arg.name, strify(sym_strides)))
+
+                    if not arg.allows_offset:
+                        gen("if %s.offset:" % arg.name)
+                        with Indentation(gen):
+                            gen("raise ValueError(\"Argument '%s' does not "
+                                    "allow arrays with offsets. Try passing "
+                                    "default_offset=loopy.auto to make_kernel()."
+                                    "\")" % arg.name)
+                            gen("")
+
+            # }}}
+
+            if possibly_made_by_loopy and flags.paranoid:
+                gen("del _lpy_made_by_loopy")
+                gen("")
+
+        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
+            gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name))
+        elif arg.arg_class is lp.ValueArg:
+            if arg.dtype.char == "V":
+                gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+            else:
+                gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))"
+                        % (arg_idx, arg.dtype.char, arg.name))
+        else:
+            gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+        gen("")
+
+        gen("# }}}")
+        gen("")
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+class InvocationFlags(Record):
+    """Flags steering the wrapper built by :func:`generate_invoker`.
+
+    .. attribute:: paranoid
+    .. attribute:: allow_numpy
+    .. attribute:: return_dict
+    .. attribute:: print_wrapper
+    """
+
+    def __init__(self, paranoid=True, allow_numpy=True, return_dict=False,
+            print_wrapper=False):
+        Record.__init__(self, paranoid=paranoid, allow_numpy=allow_numpy,
+                return_dict=return_dict, print_wrapper=print_wrapper)
+
+
+def generate_invoker(kernel, impl_arg_info, flags):
+    """Generate a Python wrapper that invokes *kernel* and return it.
+
+    Each entry of *impl_arg_info* becomes a keyword argument (default *None*)
+    of the wrapper; *flags* is an :class:`InvocationFlags` instance.
+    """
+    system_args = ["cl_kernel", "queue", "allocator=None", "wait_for=None",
+            "out_host=None"]  # out_host is ignored if not flags.allow_numpy
+
+    gen = PythonFunctionGenerator(
+            "invoke_%s_loopy_kernel" % kernel.name,
+            system_args + ["%s=None" % iai.name for iai in impl_arg_info])
+
+    gen.add_to_preamble("from __future__ import division")
+    gen.add_to_preamble("")
+    gen.add_to_preamble("import pyopencl as _lpy_cl")
+    gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
+    gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
+    gen.add_to_preamble("import numpy as _lpy_np")
+    gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack")
+    gen.add_to_preamble("")
+
+    gen("if allocator is None:")
+    with Indentation(gen):
+        gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
+    gen("")
+
+    generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, flags)
+    generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, flags)
+    generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, flags)
+
+    generate_arg_setup(gen, kernel, impl_arg_info, flags)
+
+    # {{{ generate invocation
+
+    from loopy.symbolic import StringifyMapper
+
+    strify = StringifyMapper()
+    gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs()
+
+    if not gsize_expr:
+        gsize_expr = (1,)
+    if not lsize_expr:
+        lsize_expr = (1,)
+
+    gen("_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, "
+            "%(gsize)s, %(lsize)s,  wait_for=wait_for, g_times_l=True)"
+            % dict(
+                gsize=strify(gsize_expr),
+                lsize=strify(lsize_expr)))
+    gen("")
+
+    # }}}
+
+    # {{{ output
+
+    # look up the written variables once rather than once per argument
+    written_vars = kernel.get_written_variables()
+    output_names = [arg.name for arg in impl_arg_info
+            if arg.base_name in written_vars]
+
+    if flags.allow_numpy:
+        gen("if out_host is None and (_lpy_encountered_numpy "
+                "and not _lpy_encountered_dev):")
+        with Indentation(gen):
+            gen("out_host = True")
+
+        gen("if out_host:")
+        with Indentation(gen):
+            gen("pass")  # keeps the generated block valid without outputs
+            for name in output_names:
+                gen("%s = %s.get(queue=queue)" % (name, name))
+
+        gen("")
+
+    if flags.return_dict:
+        gen("return _lpy_evt, {%s}"
+                % ", ".join("\"%s\": %s" % (n, n) for n in output_names))
+    else:
+        gen("return _lpy_evt, (%s,)" % ", ".join(output_names))
+
+    # }}}
+
+    if flags.print_wrapper:
+        print get_highlighted_python_code(gen.get())
+
+    return gen.get_function()
+
 
 # }}}
 
@@ -251,17 +680,21 @@ def _get_kernel_from_iterable(iterable):
     return result
 
 
-class _KernelInfo(Record):
+class _CLKernelInfo(Record):
     pass
 
 
 class CompiledKernel:
-    def __init__(self, context, kernel, options=[], codegen_kwargs={}):
+    def __init__(self, context, kernel, options=[], codegen_kwargs={},
+            iflags=InvocationFlags()):
         """
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
-          (a warning will be issued if more than one is returned). If the
-          kernel has not yet been loop-scheduled, that is done, too, with no
-          specific arguments.
+            (a warning will be issued if more than one is returned). If the
+            kernel has not yet been loop-scheduled, that is done, too, with no
+            specific arguments.
+        :arg iflags: A :class:`InvocationFlags` instance, or a dictionary
+            of arguments with which a :class:`InvocationFlags` instance
+            can be initialized.
         """
 
         self.context = context
@@ -269,13 +702,17 @@ class CompiledKernel:
         self.codegen_kwargs = codegen_kwargs
         self.options = options
 
+        if not isinstance(iflags, InvocationFlags):
+            iflags = InvocationFlags(**iflags)
+        self.iflags = iflags
+
         self.packing_controller = SeparateArrayPackingController(kernel)
 
         self.output_names = tuple(arg.name for arg in self.kernel.args
                 if arg.name in self.kernel.get_written_variables())
 
     @memoize_method
-    def get_kernel_info(self, arg_to_dtype_set):
+    def get_kernel(self, arg_to_dtype_set):
         kernel = self.kernel
 
         import loopy as lp
@@ -284,8 +721,7 @@ class CompiledKernel:
         if arg_to_dtype_set:
             arg_to_dtype = {}
             for arg, dtype in arg_to_dtype_set:
-                arg_to_dtype[self.packing_controller
-                        .arg_name_to_base_arg_name.get(arg, arg)] = dtype
+                arg_to_dtype[kernel.impl_arg_to_arg[arg].name] = dtype
 
             kernel = add_argument_dtypes(kernel, arg_to_dtype)
 
@@ -296,71 +732,38 @@ class CompiledKernel:
             kernel = _get_kernel_from_iterable(
                     lp.generate_loop_schedules(kernel))
 
-        # {{{ precompile, store grid size functions
-
-        gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs()
-
-        if not gsize_expr:
-            gsize_expr = (1,)
-        if not lsize_expr:
-            lsize_expr = (1,)
-
-        # }}}
-
-        from pymbolic import compile
-        return _KernelInfo(
-                kernel=kernel,
-                global_size_func=compile(gsize_expr, kernel.scalar_loop_args),
-                local_size_func=compile(lsize_expr, kernel.scalar_loop_args),
-                )
+        return kernel
 
     @memoize_method
-    def cl_kernel_info(self,
-            arg_to_dtype_set, code_op=False):
-        kernel_info = self.get_kernel_info(arg_to_dtype_set)
-        kernel = kernel_info.kernel
+    def cl_kernel_info(self, arg_to_dtype_set, code_op=None):
+        kernel = self.get_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code
-        code, cl_arg_info = generate_code(kernel, **self.codegen_kwargs)
+        code, impl_arg_info = generate_code(kernel, **self.codegen_kwargs)
+        # code_op: comma-separated subset of "print", "print_hl", "edit"
+        if code_op is None:
+            code_op = ""
 
-        if code_op == "print":
+        code_op = code_op.split(",")  # each requested operation is applied
+        if "print" in code_op:
             print code
-        elif code_op == "print_hl":
-            print get_highlighted_code(code)
-        elif code_op == "edit":
+        if "print_hl" in code_op:
+            print get_highlighted_cl_code(code)
+        if "edit" in code_op:
             from pytools import invoke_editor
             code = invoke_editor(code, "code.cl")
 
-        try:
-            cl_program = cl.Program(self.context, code)
-            cl_kernel = getattr(
-                    cl_program.build(options=self.options),
-                    kernel.name)
-        except KeyboardInterrupt:
-            raise
-        except:
-            print "[Loopy] "+70*"-"
-            print "[Loopy] build failed, here's the source code:"
-            print "[Loopy] "+70*"-"
-            print code
-            print "[Loopy] "+70*"-"
-            print "[Loopy] end source code"
-            print "[Loopy] "+70*"-"
-            raise
-
-        arg_types = []
-        for arg_info in cl_arg_info:
-            if arg_info.shape is None:
-                arg_types.append(arg_info.dtype)
-            else:
-                arg_types.append(None)
-
-        cl_kernel.set_scalar_arg_dtypes(arg_types)
+        cl_program = cl.Program(self.context, code)
+        cl_kernel = getattr(
+                cl_program.build(options=self.options),
+                kernel.name)
 
-        return kernel_info.copy(
+        return _CLKernelInfo(
+                kernel=kernel,
                 cl_kernel=cl_kernel,
-                cl_arg_info=cl_arg_info,
-                domain_parameter_finder=DomainParameterFinder(kernel, cl_arg_info))
+                impl_arg_info=impl_arg_info,
+                invoker=generate_invoker(
+                    kernel, impl_arg_info, self.iflags))
 
     # {{{ debugging aids
 
@@ -368,14 +771,14 @@ class CompiledKernel:
         if arg_to_dtype is not None:
             arg_to_dtype = frozenset(arg_to_dtype.iteritems())
 
-        kernel_info = self.get_kernel_info(arg_to_dtype)
+        kernel = self.get_kernel(arg_to_dtype)
 
         from loopy.codegen import generate_code
-        code, arg_info = generate_code(kernel_info.kernel, **self.codegen_kwargs)
+        code, arg_info = generate_code(kernel, **self.codegen_kwargs)
         return code
 
     def get_highlighted_code(self, arg_to_dtype=None):
-        return get_highlighted_code(
+        return get_highlighted_cl_code(
                 self.get_code(arg_to_dtype))
 
     @property
@@ -392,36 +795,27 @@ class CompiledKernel:
         """If all array arguments are :mod:`numpy` arrays, defaults to
         returning numpy arrays as well.
 
-        If you want offset arguments (see
-        :attr:`loopy.kernel.data.GlobalArg.offset`) to be set automatically, it
-        must occur *after* the corresponding array argument.
-
         :arg allocator:
         :arg wait_for:
         :arg out_host:
-        :arg warn_numpy:
-        :arg return_dict:
+        :arg code_op:
         """
 
         allocator = kwargs.pop("allocator", None)
         wait_for = kwargs.pop("wait_for", None)
         out_host = kwargs.pop("out_host", None)
-        no_run = kwargs.pop("no_run", None)
         code_op = kwargs.pop("code_op", None)
-        warn_numpy = kwargs.pop("warn_numpy", None)
-        return_dict = kwargs.pop("return_dict", False)
 
         kwargs = self.packing_controller.unpack(kwargs)
 
-        # {{{ process arg types, get cl kernel
-
-        import loopy as lp
-
+        impl_arg_to_arg = self.kernel.impl_arg_to_arg
         arg_to_dtype = {}
         for arg_name, val in kwargs.iteritems():
-            arg_name = self.packing_controller \
-                    .arg_name_to_base_arg_name.get(arg_name, arg_name)
-            arg = self.kernel.arg_dict[arg_name]
+            arg = impl_arg_to_arg.get(arg_name, None)
+
+            if arg is None:
+                # offsets, strides and such
+                continue
 
             if arg.dtype is None and val is not None:
                 try:
@@ -429,148 +823,32 @@ class CompiledKernel:
                 except AttributeError:
                     pass
                 else:
-                    arg_to_dtype[arg.name] = dtype
+                    arg_to_dtype[arg_name] = dtype
 
         kernel_info = self.cl_kernel_info(
                 frozenset(arg_to_dtype.iteritems()),
                 code_op)
-        kernel = kernel_info.kernel
-        cl_kernel = kernel_info.cl_kernel
-        del arg_to_dtype
-
-        # }}}
-
-        kwargs.update(
-                kernel_info.domain_parameter_finder(kwargs))
 
-        domain_parameters = dict((name, int(kwargs[name]))
-                for name in kernel.scalar_loop_args)
+        return kernel_info.invoker(
+                kernel_info.cl_kernel, queue, allocator, wait_for,
+                out_host, **kwargs)
 
-        args = []
-        outputs = {}
-        encountered_numpy = False
-        encountered_cl = False
-
-        kwargs_copy = kwargs.copy()
-
-        for arg in kernel_info.cl_arg_info:
-            is_written = arg.base_name in kernel.get_written_variables()
-
-            val = kwargs_copy.pop(arg.name, None)
-
-            # {{{ if this argument is an offset for another, try to determine it
-
-            if arg.offset_for_name is not None and val is None:
-                try:
-                    array_arg_val = kwargs[arg.offset_for_name]
-                except KeyError:
-                    # Output variable, we'll be allocating it, with zero offset.
-                    offset = 0
-                else:
-                    try:
-                        offset = array_arg_val.offset
-                    except AttributeError:
-                        offset = 0
-
-                if offset:
-                    val, remdr = divmod(offset, array_arg_val.dtype.itemsize)
-                    assert remdr == 0
-                    del remdr
-                else:
-                    val = 0
-
-                del offset
-
-            # }}}
-
-            if arg.shape is not None:
-                # {{{ automatically transfer host-side arrays, if needed
-
-                if isinstance(val, np.ndarray):
-                    # synchronous, so nothing to worry about
-                    val = cl_array.to_device(queue, val, allocator=allocator)
-                    encountered_numpy = True
-                    if warn_numpy:
-                        from warnings import warn
-                        warn("argument '%s' was passed as a numpy array, "
-                                "performing implicit transfer" % arg.name,
-                                stacklevel=2)
-                else:
-                    encountered_cl = True
-
-                # }}}
-
-            if val is None:
-                if not is_written:
-                    raise TypeError(
-                            "must supply input argument '%s'" % arg.name)
-
-                if arg.arg_class is lp.ImageArg:
-                    raise RuntimeError("write-mode image '%s' must "
-                            "be explicitly supplied" % arg.name)
-
-                from pymbolic import evaluate
-                shape = evaluate(arg.shape, kwargs)
-                itemsize = arg.dtype.itemsize
-                numpy_strides = tuple(
-                        i*itemsize for i in evaluate(arg.strides, kwargs))
-
-                from pytools import all
-                assert all(s > 0 for s in numpy_strides)
-                alloc_size = (sum(astrd*(alen-1)
-                        for alen, astrd in zip(shape, numpy_strides))
-                        + itemsize)
-
-                if allocator is None:
-                    storage = cl.Buffer(
-                            queue.context, cl.mem_flags.READ_WRITE, alloc_size)
-                else:
-                    storage = allocator(alloc_size)
-
-                val = cl_array.Array(queue, shape, arg.dtype,
-                        strides=numpy_strides, data=storage,
-                        allocator=allocator)
-            else:
-                assert _arg_matches_spec(arg, val, kwargs)
-
-            if is_written:
-                outputs[arg.name] = val
-
-            if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-                args.append(val.base_data)
-            else:
-                args.append(val)
-
-        assert not kwargs_copy, (
-                "extra arguments: "+", ".join(kwargs_copy.iterkeys()))
-
-        if no_run:
-            evt = cl.enqueue_marker(queue)
-        else:
-            evt = cl_kernel(queue,
-                    kernel_info.global_size_func(**domain_parameters),
-                    kernel_info.local_size_func(**domain_parameters),
-                    *args,
-                    g_times_l=True, wait_for=wait_for)
-
-        if out_host is None and (encountered_numpy and not encountered_cl):
-            out_host = True
-        if out_host:
-            outputs = dict(
-                    (name, o.get(queue=queue))
-                    for name, o in outputs.iteritems())
-
-        outputs = self.packing_controller.pack(outputs)
+# }}}
 
-        if not return_dict:
-            outputs = tuple(outputs[name] for name in self.output_names)
 
-        return evt, outputs
+def get_highlighted_python_code(text):
+    try:  # highlighting is optional and depends on pygments
+        from pygments import highlight
+    except ImportError:
+        return text  # pygments unavailable: return the text unchanged
+    else:
+        from pygments.lexers import PythonLexer
+        from pygments.formatters import TerminalFormatter
 
-# }}}
+        return highlight(text, PythonLexer(), TerminalFormatter())
 
 
-def get_highlighted_code(text):
+def get_highlighted_cl_code(text):
     try:
         from pygments import highlight
     except ImportError:
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 66bb1f56d0f4df4218f9b9ca6889ccfaf8e41159..b2baf97e74e14db92040573dd927c9498f08b267 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -900,6 +900,36 @@ class LoopKernel(Record):
 
     # }}}
 
+    # {{{ implementation arguments
+
+    @property
+    @memoize_method
+    def impl_arg_to_arg(self):
+        """Map each implementation-level argument name to the entry of
+        :attr:`args` it is derived from.
+
+        Arrays for which :meth:`subscripts_and_names` returns names get one
+        entry per such name; every other argument is keyed by its own name.
+        """
+        from loopy.kernel.array import ArrayBase
+
+        name_to_arg = {}
+        for kernel_arg in self.args:
+            sub_names = None
+            if (isinstance(kernel_arg, ArrayBase)
+                    and kernel_arg.shape is not None
+                    and kernel_arg.dim_tags is not None):
+                sub_names = kernel_arg.subscripts_and_names()
+            if sub_names is None:
+                name_to_arg[kernel_arg.name] = kernel_arg
+            else:
+                for _, impl_name in sub_names:
+                    name_to_arg[impl_name] = kernel_arg
+
+        return name_to_arg
+
+    # }}}
+
     # {{{ direct execution
 
     @memoize_method
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 8cb4eee8755d3c4547a9652ce222c115e0425f9a..f2e49a0955a73201fc221b83f66953ac9e0d8f64 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -49,6 +49,17 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase):
 
     The stride is given in units of :attr:`ArrayBase.dtype`.
 
+    .. attribute :: stride
+
+        May be one of the following:
+
+        - A :class:`pymbolic.primitives.Expression`, including an
+          integer, indicating the stride in units of the underlying
+          array's :attr:`ArrayBase.dtype`.
+
+        - :class:`loopy.auto`, indicating that a new kernel argument
+          for this stride should automatically be created.
+
     .. attribute :: target_axis
 
         For objects (such as images) with more than one axis, *target_axis*
@@ -130,10 +141,7 @@ def parse_array_dim_tag(tag, default_target_axis=0):
 
     tag = tag.strip()
 
-    if tag.startswith("stride:"):
-        from loopy.symbolic import parse
-        return FixedStrideArrayDimTag(parse(tag[7:]))
-    elif tag == "sep":
+    if tag == "sep":
         return SeparateArrayArrayDimTag()
     elif tag == "vec":
         return VectorArrayDimTag()
@@ -146,7 +154,16 @@ def parse_array_dim_tag(tag, default_target_axis=0):
     else:
         target_axis = default_target_axis
 
-    if tag in ["c", "C", "f", "F"]:
+    if tag.startswith("stride:"):
+        fixed_stride_descr = tag[7:]
+        if fixed_stride_descr.strip() == "auto":
+            import loopy as lp
+            return FixedStrideArrayDimTag(lp.auto, target_axis)
+        else:
+            from loopy.symbolic import parse
+            return FixedStrideArrayDimTag(parse(fixed_stride_descr), target_axis)
+
+    elif tag in ["c", "C", "f", "F"]:
         return ComputedStrideArrayDimTag(tag, target_axis=target_axis)
     else:
         padded_stride_match = PADDED_STRIDE_TAG.match(tag)
@@ -606,16 +623,26 @@ class ArrayBase(Record):
         return 1
 
     def decl_info(self, is_written, index_dtype):
-        """Return a list of tuples ``(cgen_decl, arg_info)``, where
-        *cgen_decl* is a :mod:`cgen` argument declarations, *arg_info*
-        is a :class:`CLArgumentInfo` instance.
+        """Yield the :class:`loopy.codegen.ImplementedDataInfo` instances
+        corresponding to the argument.
         """
 
-        from loopy.codegen import CLArgumentInfo
+        from loopy.codegen import ImplementedDataInfo
+        from loopy.kernel.data import ValueArg
 
         vector_size = self.vector_size()
 
-        def gen_decls(name_suffix, shape, strides, dtype, user_index):
+        def gen_decls(name_suffix, shape, strides, stride_arg_axes,
+                dtype, user_index):
+            """
+            :arg stride_arg_axes: a tuple of pairs *(user_axis, impl_axis)*
+            :arg user_index: A tuple representing a (user-facing)
+                multi-dimensional subscript. This is filled in with
+                concrete integers when known (such as for separate-array
+                dim tags), and with *None* where the index won't be
+                known until run time.
+            """
+
             if dtype is None:
                 dtype = self.dtype
 
@@ -624,33 +651,59 @@ class ArrayBase(Record):
             num_user_axes = self.num_user_axes(require_answer=False)
 
             if num_user_axes is None or user_axis >= num_user_axes:
-                # implemented by various argument types
+                # {{{ recursion base case
+
                 full_name = self.name + name_suffix
 
-                yield (self.get_arg_decl(name_suffix, shape, dtype, is_written),
-                        CLArgumentInfo(
+                stride_args = []
+                strides = list(strides)
+
+                # generate stride arguments, yielded later to keep array first
+                for stride_user_axis, stride_impl_axis in stride_arg_axes:
+                    from cgen import Const, POD
+                    stride_name = full_name+"_stride%d" % stride_user_axis
+
+                    from pymbolic import var
+                    strides[stride_impl_axis] = var(stride_name)
+
+                    stride_args.append(
+                            ImplementedDataInfo(
+                                name=stride_name,
+                                dtype=index_dtype,
+                                cgen_declarator=Const(POD(index_dtype, stride_name)),
+                                arg_class=ValueArg,
+                                stride_for_name_and_axis=(
+                                    full_name, stride_impl_axis)))
+
+                yield ImplementedDataInfo(
                             name=full_name,
                             base_name=self.name,
+
+                            # implemented by various argument types
+                            cgen_declarator=self.get_arg_decl(
+                                name_suffix, shape, dtype, is_written),
+
+                            arg_class=type(self),
                             dtype=dtype,
                             shape=shape,
-                            strides=strides,
-                            offset_for_name=None,
+                            strides=tuple(strides),
                             allows_offset=bool(self.offset),
-                            arg_class=type(self)))
+                            )
 
                 if self.offset:
                     from cgen import Const, POD
                     offset_name = full_name+"_offset"
-                    yield (Const(POD(index_dtype, offset_name)),
-                            CLArgumentInfo(
+                    yield ImplementedDataInfo(
                                 name=offset_name,
-                                base_name=None,
                                 dtype=index_dtype,
-                                shape=None,
-                                strides=None,
-                                offset_for_name=full_name,
-                                allows_offset=None,
-                                arg_class=None))
+                                cgen_declarator=Const(POD(index_dtype, offset_name)),
+                                arg_class=ValueArg,
+                                offset_for_name=full_name)
+
+                for sa in stride_args:
+                    yield sa
+
+                # }}}
 
                 return
 
@@ -662,8 +715,20 @@ class ArrayBase(Record):
                 else:
                     new_shape = shape + (self.shape[user_axis],)
 
+                import loopy as lp
+                if dim_tag.stride is lp.auto:
+                    new_stride_arg_axes = stride_arg_axes \
+                            + ((user_axis, len(strides)),)
+
+                    # fixed above when final array name is known
+                    new_strides = strides + (None,)
+                else:
+                    new_stride_arg_axes = stride_arg_axes
+                    new_strides = strides + (dim_tag.stride // vector_size,)
+
                 for res in gen_decls(name_suffix, new_shape,
-                        strides + (dim_tag.stride // vector_size,),
+                        new_strides,
+                        new_stride_arg_axes,
                         dtype, user_index + (None,)):
                     yield res
 
@@ -676,7 +741,7 @@ class ArrayBase(Record):
 
                 for i in xrange(shape_i):
                     for res in gen_decls(name_suffix + "_s%d" % i,
-                            shape, strides, dtype,
+                            shape, strides, stride_arg_axes, dtype,
                             user_index + (i,)):
                         yield res
 
@@ -688,6 +753,7 @@ class ArrayBase(Record):
                                 self.name, user_axis))
 
                 for res in gen_decls(name_suffix, shape, strides,
+                        stride_arg_axes,
                         cl.array.vec.types[dtype, shape_i],
                         user_index + (None,)):
                     yield res
@@ -696,9 +762,36 @@ class ArrayBase(Record):
                 raise RuntimeError("unsupported array dim implementation tag '%s' "
                         "in array '%s'" % (dim_tag, self.name))
 
-        for res in gen_decls("", (), (), self.dtype, ()):
+        for res in gen_decls(name_suffix="", shape=(), strides=(),
+                stride_arg_axes=(),
+                dtype=self.dtype, user_index=()):
             yield res
 
+    @memoize_method
+    def sep_shape(self):
+        """Return the tuple of axis lengths of all separate-array axes.
+
+        :raises TypeError: if such an axis length is not an :class:`int`.
+        """
+        lengths = [length for length, tag in zip(self.shape, self.dim_tags)
+                if isinstance(tag, SeparateArrayArrayDimTag)]
+        if not all(isinstance(length, int) for length in lengths):
+            raise TypeError("array '%s' has non-fixed-size "
+                    "separate-array axis" % self.name)
+        return tuple(lengths)
+
+    @memoize_method
+    def subscripts_and_names(self):
+        """Return a list of ``(index, name)`` pairs, one for each multi-index
+        into :meth:`sep_shape`, or *None* if that shape is empty."""
+        shape = self.sep_shape()
+        if not shape:
+            return None
+
+        from pytools import indices_in_shape
+        return [(idx, self.name + "".join("_s%d" % axis_i for axis_i in idx))
+                for idx in indices_in_shape(shape)]
+
 # }}}
 
 
@@ -730,29 +823,47 @@ def get_access_info(ary, index, eval_expr):
 
     num_target_axes = ary.num_target_axes()
 
-    array_suffix = ""
+    array_name = ary.name
     vector_index = None
     subscripts = [0] * num_target_axes
 
     vector_size = ary.vector_size()
 
+    # {{{ process separate-array dim tags first, to find array name
+
+    for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
+        if isinstance(dim_tag, SeparateArrayArrayDimTag):
+            idx = eval_expr(idx)
+            if not isinstance(idx, int):
+                raise RuntimeError("subscript '%s[%s]' has non-constant "
+                        "index for separate-array axis %d (0-based)" % (
+                            ary.name, index, i))
+            array_name += "_s%d" % idx
+
+    # }}}
+
+    # {{{ process remaining dim tags
+
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, FixedStrideArrayDimTag):
-            if isinstance(dim_tag.stride, int):
+            import loopy as lp
+
+            stride = dim_tag.stride
+
+            if isinstance(stride, int):
                 if not dim_tag.stride % vector_size == 0:
                     raise RuntimeError("stride of axis %d of array '%s' "
                             "is not a multiple of the vector axis"
                             % (i, ary.name))
 
-            subscripts[dim_tag.target_axis] += (dim_tag.stride // vector_size)*idx
+            elif stride is lp.auto:
+                from pymbolic import var
+                stride = var(array_name + "_stride%d" % i)
+
+            subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
         elif isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr(idx)
-            if not isinstance(idx, int):
-                raise RuntimeError("subscript '%s[%s]' has non-constant "
-                        "index for separate-array axis %d (0-based)" % (
-                            ary.name, index, i))
-            array_suffix += "_s%d" % idx
+            pass
 
         elif isinstance(dim_tag, VectorArrayDimTag):
             idx = eval_expr(idx)
@@ -776,12 +887,12 @@ def get_access_info(ary, index, eval_expr):
 
         offset_name = ary.offset
         if offset_name is lp.auto:
-            offset_name = ary.name+array_suffix+"_offset"
+            offset_name = array_name+"_offset"
 
         subscripts[0] = var(offset_name) + subscripts[0]
 
     return AccessInfo(
-            array_suffix=array_suffix,
+            array_name=array_name,
             vector_index=vector_index,
             subscripts=subscripts)