diff --git a/.gitignore b/.gitignore
index 7ae71dc52e28413d3da125b4e5e9a64c2c970a19..5c9e73c7b6ffc90a15059b96c59c461ca0157baf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,5 @@ htmlcov
 .ipynb_checkpoints
 lextab.py
 yacctab.py
+
+.cache
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d62a3c85a518ef0cb0cba138755f5c28a90027c2..bb15d7c9fe93a0f0307edb33c62f76e1f8deec5a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -88,3 +88,15 @@ CentOS binary:
     - build-helpers/loopy-centos6
   tags:
   - docker
+  only:
+  - master
+
+Documentation:
+  script:
+  - EXTRA_INSTALL="numpy"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh
+  - ". ./build-docs.sh"
+  tags:
+  - python3.5
+  only:
+  - master
diff --git a/doc/misc.rst b/doc/misc.rst
index a4c174e52da118143dc38f4d66dfa649eb395a50..d7abe80dda7999575b7d0386cce1bdcecb236c55 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -17,7 +17,7 @@ run this beforehand::
     python get-pip.py
 
 For a more manual installation, `download the source
-<http://pypi.python.org/pypi/Loopy>`_, unpack it, and say::
+<http://pypi.python.org/pypi/loo.py>`_, unpack it, and say::
 
     python setup.py install
 
@@ -310,5 +310,14 @@ Here's a Bibtex entry for your convenience::
        doi = "{10.1145/2627373.2627387}",
     }
 
+Acknowledgments
+===============
 
+Andreas Klöckner's work on :mod:`loopy` was supported in part by
 
+* US Navy ONR grant number N00014-14-1-0117
+* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433.
+
+AK also gratefully acknowledges a hardware gift from Nvidia Corporation.  The
+views and opinions expressed herein do not necessarily reflect those of the
+funding agencies.
diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh
index a95f9a68d7b1aed82bf92773422b22eb1336bc8a..a4691aecd31a14163b8f0c9bd30df137558eaa50 100755
--- a/doc/upload-docs.sh
+++ b/doc/upload-docs.sh
@@ -1,12 +1,3 @@
 #! /bin/bash
 
-cat > _build/html/.htaccess <<EOF
-AuthUserFile /home/andreas/htpasswd
-AuthGroupFile /dev/null
-AuthName "Pre-Release Documentation"
-AuthType Basic
-
-require user iliketoast
-EOF
-
-rsync --progress --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy
+rsync --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 778b3d49ef65df162857b8638d56b42ee8dee9d8..8975674442f049f51f724bfcb818fc6e8a441c88 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -66,7 +66,8 @@ from loopy.transform.iname import (
         split_reduction_inward, split_reduction_outward,
         affine_map_inames, find_unused_axis_tag,
         make_reduction_inames_unique,
-        has_schedulable_iname_nesting, get_iname_duplication_options)
+        has_schedulable_iname_nesting, get_iname_duplication_options,
+        add_inames_to_insn)
 
 from loopy.transform.instruction import (
         find_instructions, map_instructions,
@@ -171,6 +172,7 @@ __all__ = [
         "affine_map_inames", "find_unused_axis_tag",
         "make_reduction_inames_unique",
         "has_schedulable_iname_nesting", "get_iname_duplication_options",
+        "add_inames_to_insn",
 
         "add_prefetch", "change_arg_to_image",
         "tag_array_axes", "tag_data_axes",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index b475a40f3580b3b0e3c746a0de90f33cf182aa4e..e992fa3d48a625c3cb982eed1409be847aadde5f 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -444,7 +444,7 @@ def auto_test_vs_ref(
             print(75*"-")
             print("Reference Code:")
             print(75*"-")
-            print(get_highlighted_cl_code(ref_compiled.code))
+            print(get_highlighted_cl_code(ref_compiled.get_code()))
             print(75*"-")
 
         ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())
diff --git a/loopy/check.py b/loopy/check.py
index 76b86a7c3f0b4267a9a41b5be3e6fae3a78a01bb..2f48211da430b42e8000467f070b1633ab3a4f38 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -34,17 +34,41 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+# {{{ sanity checks run before preprocessing
+
+def check_identifiers_in_subst_rules(knl):
+    """Substitution rules may only refer to kernel-global quantities or their
+    own arguments.
+    """
+
+    from loopy.symbolic import get_dependencies
+
+    allowed_identifiers = knl.all_variable_names()
+
+    for rule in six.itervalues(knl.substitutions):
+        deps = get_dependencies(rule.expression)
+        rule_allowed_identifiers = allowed_identifiers | frozenset(rule.arguments)
+
+        if not deps <= rule_allowed_identifiers:
+            raise LoopyError("kernel '%s': substitution rule '%s' refers to "
+                    "identifier(s) '%s' which are neither rule arguments nor "
+                    "kernel-global identifiers"
+                    % (knl.name, rule.name,
+                        ", ".join(deps-rule_allowed_identifiers)))
+
+# }}}
+
+
 # {{{ sanity checks run pre-scheduling
 
 def check_insn_attributes(kernel):
     all_insn_ids = set(insn.id for insn in kernel.instructions)
 
     for insn in kernel.instructions:
-        if not insn.forced_iname_deps <= kernel.all_inames():
+        if not insn.within_inames <= kernel.all_inames():
             raise LoopyError("insn '%s' has unknown forced iname "
                     "dependencies: %s"
                     % (insn.id, ", ".join(
-                        insn.forced_iname_deps - kernel.all_inames())))
+                        insn.within_inames - kernel.all_inames())))
 
         if insn.depends_on is not None and not insn.depends_on <= all_insn_ids:
             raise LoopyError("insn '%s' has unknown instruction "
@@ -278,7 +302,12 @@ def check_bounds(kernel):
             continue
 
         acm = _AccessCheckMapper(kernel, domain, insn.id)
-        insn.with_transformed_expressions(acm)
+
+        def run_acm(expr):
+            acm(expr)
+            return expr
+
+        insn.with_transformed_expressions(run_acm)
 
 
 def check_write_destinations(kernel):
@@ -324,7 +353,7 @@ def check_has_schedulable_iname_nesting(kernel):
 
 def pre_schedule_checks(kernel):
     try:
-        logger.info("pre-schedule check %s: start" % kernel.name)
+        logger.info("%s: pre-schedule check: start" % kernel.name)
 
         check_for_orphaned_user_hardware_axes(kernel)
         check_for_double_use_of_hw_axes(kernel)
@@ -337,7 +366,7 @@ def pre_schedule_checks(kernel):
         check_write_destinations(kernel)
         check_has_schedulable_iname_nesting(kernel)
 
-        logger.info("pre-schedule check %s: done" % kernel.name)
+        logger.info("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
         raise
     except:
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index d95ab928925655148775699e206ebe76429d7463..79d824a44fc04f479139f2797994621f09798297 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -402,7 +402,7 @@ def generate_code_v2(kernel):
         input_kernel = kernel
         try:
             result = code_gen_cache[input_kernel]
-            logger.info("%s: code generation cache hit" % kernel.name)
+            logger.debug("%s: code generation cache hit" % kernel.name)
             return result
         except KeyError:
             pass
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 8531b97a947f3c46b984cf7405b794bd68e2f638..5fa54aeee91b5c41531e0dd5bfd7ab97323b19ae 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -97,7 +97,7 @@ def generate_assignment_instruction_code(codegen_state, insn):
 
     ecm = codegen_state.expression_to_code_mapper
 
-    from loopy.expression import dtype_to_type_context, VectorizabilityChecker
+    from loopy.expression import VectorizabilityChecker
 
     # {{{ vectorization handling
 
@@ -141,52 +141,19 @@ def generate_assignment_instruction_code(codegen_state, insn):
     else:
         raise RuntimeError("invalid lvalue '%s'" % lhs)
 
-    lhs_var = kernel.get_var_descriptor(assignee_var_name)
-    lhs_dtype = lhs_var.dtype
-
-    if insn.atomicity is not None:
-        lhs_atomicity = [
-                a for a in insn.atomicity if a.var_name == assignee_var_name]
-        assert len(lhs_atomicity) <= 1
-        if lhs_atomicity:
-            lhs_atomicity, = lhs_atomicity
-        else:
-            lhs_atomicity = None
-    else:
-        lhs_atomicity = None
-
-    from loopy.kernel.data import AtomicInit, AtomicUpdate
-
-    lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
-    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
-    if lhs_atomicity is None:
-        result = codegen_state.ast_builder.emit_assignment(
-                codegen_state,
-                lhs_code,
-                ecm(insn.expression, prec=PREC_NONE,
-                    type_context=rhs_type_context,
-                    needed_dtype=lhs_dtype))
-
-    elif isinstance(lhs_atomicity, AtomicInit):
-        raise NotImplementedError("atomic init")
-
-    elif isinstance(lhs_atomicity, AtomicUpdate):
-        codegen_state.seen_atomic_dtypes.add(lhs_dtype)
-        result = codegen_state.ast_builder.generate_atomic_update(
-                kernel, codegen_state, lhs_atomicity, lhs_var,
-                insn.assignee, insn.expression,
-                lhs_dtype, rhs_type_context)
-
-    else:
-        raise ValueError("unexpected lhs atomicity type: %s"
-                % type(lhs_atomicity).__name__)
+    result = codegen_state.ast_builder.emit_assignment(codegen_state, insn)
 
     # {{{ tracing
 
+    lhs_dtype = codegen_state.kernel.get_var_descriptor(assignee_var_name).dtype
+
     if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
         if codegen_state.vectorization_info and is_vector:
             raise Unvectorizable("tracing does not support vectorization")
 
+        from pymbolic.mapper.stringifier import PREC_NONE
+        lhs_code = codegen_state.expression_to_code_mapper(insn.assignee, PREC_NONE)
+
         from cgen import Statement as S  # noqa
 
         gs, ls = kernel.get_grid_size_upper_bounds()
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 5782ed7d9b897ad9ca3bf4da165e3a61da31ce40..b3e4fe0589dc9a62d7bdefd7152784560be0ca8a 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -1,6 +1,6 @@
-from __future__ import division, with_statement, absolute_import
+from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -22,898 +22,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
-from six.moves import range, zip
 
-import numpy as np
-from pytools import Record, memoize_method
-from loopy.diagnostic import ParameterFinderWarning
-from pytools.py_codegen import (
-        Indentation, PythonFunctionGenerator)
-from loopy.diagnostic import LoopyError
-from loopy.types import NumpyType
+from loopy.target.pyopencl_execution import (  # noqa
+        PyOpenCLKernelExecutor,
+        get_highlighted_cl_code)
 
-import logging
-logger = logging.getLogger(__name__)
 
+# {{{ compatibility
 
-# {{{ object array argument packing
-
-class _PackingInfo(Record):
-    """
-    .. attribute:: name
-    .. attribute:: sep_shape
-
-    .. attribute:: subscripts_and_names
-
-        A list of type ``[(index, unpacked_name), ...]``.
-    """
-
-
-class SeparateArrayPackingController(object):
-    """For argument arrays with axes tagged to be implemented as separate
-    arrays, this class provides preprocessing of the incoming arguments so that
-    all sub-arrays may be passed in one object array (under the original,
-    un-split argument name) and are unpacked into separate arrays before being
-    passed to the kernel.
-
-    It also repacks outgoing arrays of this type back into an object array.
-    """
-
-    def __init__(self, kernel):
-        # map from arg name
-        self.packing_info = {}
-
-        from loopy.kernel.array import ArrayBase
-        for arg in kernel.args:
-            if not isinstance(arg, ArrayBase):
-                continue
-
-            if arg.shape is None or arg.dim_tags is None:
-                continue
-
-            subscripts_and_names = arg.subscripts_and_names()
-
-            if subscripts_and_names is None:
-                continue
-
-            self.packing_info[arg.name] = _PackingInfo(
-                    name=arg.name,
-                    sep_shape=arg.sep_shape(),
-                    subscripts_and_names=subscripts_and_names,
-                    is_written=arg.name in kernel.get_written_variables())
-
-    def unpack(self, kernel_kwargs):
-        if not self.packing_info:
-            return kernel_kwargs
-
-        kernel_kwargs = kernel_kwargs.copy()
-
-        for packing_info in six.itervalues(self.packing_info):
-            arg_name = packing_info.name
-            if packing_info.name in kernel_kwargs:
-                arg = kernel_kwargs[arg_name]
-                for index, unpacked_name in packing_info.subscripts_and_names:
-                    assert unpacked_name not in kernel_kwargs
-                    kernel_kwargs[unpacked_name] = arg[index]
-                del kernel_kwargs[arg_name]
-
-        return kernel_kwargs
-
-    def pack(self, outputs):
-        if not self.packing_info:
-            return outputs
-
-        for packing_info in six.itervalues(self.packing_info):
-            if not packing_info.is_written:
-                continue
-
-            result = outputs[packing_info.name] = \
-                    np.zeros(packing_info.sep_shape, dtype=np.object)
-
-            for index, unpacked_name in packing_info.subscripts_and_names:
-                result[index] = outputs.pop(unpacked_name)
-
-        return outputs
-
-# }}}
-
-
-# {{{ invoker generation
-
-# /!\ This code runs in a namespace controlled by the user.
-# Prefix all auxiliary variables with "_lpy".
-
-
-def python_dtype_str(dtype):
-    import pyopencl.tools as cl_tools
-    if dtype.isbuiltin:
-        return "_lpy_np."+dtype.name
-    else:
-        return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
-                % cl_tools.dtype_to_ctype(dtype))
-
-
-# {{{ integer arg finding from shapes
-
-def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info):
-    # a mapping from integer argument names to a list of tuples
-    # (arg_name, expression), where expression is a
-    # unary function of kernel.arg_dict[arg_name]
-    # returning the desired integer argument.
-    iarg_to_sources = {}
-
-    from loopy.kernel.data import GlobalArg
-    from loopy.symbolic import DependencyMapper, StringifyMapper
-    dep_map = DependencyMapper()
-
-    from pymbolic import var
-    for arg in implemented_data_info:
-        if arg.arg_class is GlobalArg:
-            sym_shape = var(arg.name).attr("shape")
-            for axis_nr, shape_i in enumerate(arg.shape):
-                if shape_i is None:
-                    continue
-
-                deps = dep_map(shape_i)
-
-                if len(deps) == 1:
-                    integer_arg_var, = deps
-
-                    if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
-                        from pymbolic.algorithm import solve_affine_equations_for
-                        try:
-                            # friggin' overkill :)
-                            iarg_expr = solve_affine_equations_for(
-                                    [integer_arg_var.name],
-                                    [(shape_i, sym_shape.index(axis_nr))]
-                                    )[integer_arg_var]
-                        except Exception as e:
-                            #from traceback import print_exc
-                            #print_exc()
-
-                            # went wrong? oh well
-                            from warnings import warn
-                            warn("Unable to generate code to automatically "
-                                    "find '%s' from the shape of '%s':\n%s"
-                                    % (integer_arg_var.name, arg.name, str(e)),
-                                    ParameterFinderWarning)
-                        else:
-                            iarg_to_sources.setdefault(integer_arg_var.name, []) \
-                                    .append((arg.name, iarg_expr))
-
-    gen("# {{{ find integer arguments from shapes")
-    gen("")
-
-    for iarg_name, sources in six.iteritems(iarg_to_sources):
-        gen("if %s is None:" % iarg_name)
-        with Indentation(gen):
-            if_stmt = "if"
-            for arg_name, value_expr in sources:
-                gen("%s %s is not None:" % (if_stmt, arg_name))
-                with Indentation(gen):
-                    gen("%s = %s"
-                            % (iarg_name, StringifyMapper()(value_expr)))
-
-                if_stmt = "elif"
-
-        gen("")
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ integer arg finding from offsets
-
-def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info):
-    options = kernel.options
-
-    gen("# {{{ find integer arguments from offsets")
-    gen("")
-
-    for arg in implemented_data_info:
-        impl_array_name = arg.offset_for_name
-        if impl_array_name is not None:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("if %s is None:" % impl_array_name)
-                with Indentation(gen):
-                    gen("# Output variable, we'll be allocating "
-                            "it, with zero offset.")
-                    gen("%s = 0" % arg.name)
-                gen("else:")
-                with Indentation(gen):
-                    if not options.no_numpy:
-                        gen("_lpy_offset = getattr(%s, \"offset\", 0)"
-                                % impl_array_name)
-                    else:
-                        gen("_lpy_offset = %s.offset" % impl_array_name)
-
-                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                    if not options.skip_arg_checks:
-                        gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-                        gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
-                                "not divisible by its dtype itemsize\""
-                                % impl_array_name)
-                        gen("del _lpy_remdr")
-                    else:
-                        gen("%s = _lpy_offset // %d"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-                    if not options.skip_arg_checks:
-                        gen("del _lpy_offset")
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ integer arg finding from strides
-
-def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info):
-    options = kernel.options
-
-    gen("# {{{ find integer arguments from strides")
-    gen("")
-
-    for arg in implemented_data_info:
-        if arg.stride_for_name_and_axis is not None:
-            impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
-
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                if not options.skip_arg_checks:
-                    gen("if %s is None:" % impl_array_name)
-                    with Indentation(gen):
-                        gen("raise RuntimeError(\"required stride '%s' for "
-                                "argument '%s' not given or deducible from "
-                                "passed array\")"
-                                % (arg.name, impl_array_name))
-
-                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                    if not options.skip_arg_checks:
-                        gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
-                                % (arg.name, impl_array_name, stride_impl_axis,
-                                    base_arg.dtype.dtype.itemsize))
-
-                        gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is "
-                                "not divisible by its dtype itemsize\""
-                                % (stride_impl_axis, impl_array_name))
-                        gen("del _lpy_remdr")
-                    else:
-                        gen("%s = _lpy_offset // %d"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ check that value args are present
-
-def generate_value_arg_check(gen, kernel, implemented_data_info):
-    if kernel.options.skip_arg_checks:
-        return
-
-    from loopy.kernel.data import ValueArg
-
-    gen("# {{{ check that value args are present")
-    gen("")
-
-    for arg in implemented_data_info:
-        if not issubclass(arg.arg_class, ValueArg):
-            continue
-
-        gen("if %s is None:" % arg.name)
-        with Indentation(gen):
-            gen("raise TypeError(\"value argument '%s' "
-                    "was not given and could not be automatically "
-                    "determined\")" % arg.name)
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ arg setup
-
-def generate_arg_setup(gen, kernel, implemented_data_info, options):
-    import loopy as lp
-
-    from loopy.kernel.data import KernelArgument
-    from loopy.kernel.array import ArrayBase
-    from loopy.symbolic import StringifyMapper
-    from pymbolic import var
-
-    gen("# {{{ set up array arguments")
-    gen("")
-
-    if not options.no_numpy:
-        gen("_lpy_encountered_numpy = False")
-        gen("_lpy_encountered_dev = False")
-        gen("")
-
-    args = []
-
-    strify = StringifyMapper()
-
-    expect_no_more_arguments = False
-
-    for arg_idx, arg in enumerate(implemented_data_info):
-        is_written = arg.base_name in kernel.get_written_variables()
-        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
-
-        if not issubclass(arg.arg_class, KernelArgument):
-            expect_no_more_arguments = True
-            continue
-
-        if expect_no_more_arguments:
-            raise LoopyError("Further arguments encountered after arg info "
-                    "describing a global temporary variable")
-
-        if not issubclass(arg.arg_class, ArrayBase):
-            args.append(arg.name)
-            continue
-
-        gen("# {{{ process %s" % arg.name)
-        gen("")
-
-        if not options.no_numpy:
-            gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
-            with Indentation(gen):
-                gen("# synchronous, nothing to worry about")
-                gen("%s = _lpy_cl_array.to_device("
-                        "queue, %s, allocator=allocator)"
-                        % (arg.name, arg.name))
-                gen("_lpy_encountered_numpy = True")
-            gen("elif %s is not None:" % arg.name)
-            with Indentation(gen):
-                gen("_lpy_encountered_dev = True")
-
-            gen("")
-
-        if not options.skip_arg_checks and not is_written:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"input argument '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if (is_written
-                and arg.arg_class is lp.ImageArg
-                and not options.skip_arg_checks):
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"written image '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if is_written and arg.shape is None and not options.skip_arg_checks:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"written argument '%s' has "
-                        "unknown shape and must be supplied\")" % arg.name)
-                gen("")
-
-        possibly_made_by_loopy = False
-
-        # {{{ allocate written arrays, if needed
-
-        if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
-                and arg.shape is not None:
-
-            if not isinstance(arg.dtype, NumpyType):
-                raise LoopyError("do not know how to pass arg of type '%s'"
-                        % arg.dtype)
-
-            possibly_made_by_loopy = True
-            gen("_lpy_made_by_loopy = False")
-            gen("")
-
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                num_axes = len(arg.strides)
-                for i in range(num_axes):
-                    gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
-
-                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                for i in range(num_axes):
-                    gen("_lpy_strides_%d = %s" % (i, strify(
-                        itemsize*arg.unvec_strides[i])))
-
-                if not options.skip_arg_checks:
-                    for i in range(num_axes):
-                        gen("assert _lpy_strides_%d > 0, "
-                                "\"'%s' has negative stride in axis %d\""
-                                % (i, arg.name, i))
-
-                sym_strides = tuple(
-                        var("_lpy_strides_%d" % i)
-                        for i in range(num_axes))
-                sym_shape = tuple(
-                        var("_lpy_shape_%d" % i)
-                        for i in range(num_axes))
-
-                alloc_size_expr = (sum(astrd*(alen-1)
-                    for alen, astrd in zip(sym_shape, sym_strides))
-                    + itemsize)
-
-                gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
-                gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
-                        "%(dtype)s, strides=%(strides)s, "
-                        "data=allocator(_lpy_alloc_size), allocator=allocator)"
-                        % dict(
-                            name=arg.name,
-                            shape=strify(sym_shape),
-                            strides=strify(sym_strides),
-                            dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))
-
-                if not options.skip_arg_checks:
-                    for i in range(num_axes):
-                        gen("del _lpy_shape_%d" % i)
-                        gen("del _lpy_strides_%d" % i)
-                    gen("del _lpy_alloc_size")
-                    gen("")
-
-                gen("_lpy_made_by_loopy = True")
-                gen("")
-
-        # }}}
-
-        # {{{ argument checking
-
-        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
-                and not options.skip_arg_checks:
-            if possibly_made_by_loopy:
-                gen("if not _lpy_made_by_loopy:")
-            else:
-                gen("if True:")
-
-            with Indentation(gen):
-                gen("if %s.dtype != %s:"
-                        % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
-                with Indentation(gen):
-                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
-                            "(got: %%s, expected: %s)\" %% %s.dtype)"
-                            % (arg.name, arg.dtype, arg.name))
-
-                # {{{ generate shape checking code
-
-                def strify_allowing_none(shape_axis):
-                    if shape_axis is None:
-                        return "None"
-                    else:
-                        return strify(shape_axis)
-
-                def strify_tuple(t):
-                    if len(t) == 0:
-                        return "()"
-                    else:
-                        return "(%s,)" % ", ".join(
-                                strify_allowing_none(sa)
-                                for sa in t)
-
-                shape_mismatch_msg = (
-                        "raise TypeError(\"shape mismatch on argument '%s' "
-                        "(got: %%s, expected: %%s)\" "
-                        "%% (%s.shape, %s))"
-                        % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
-
-                if kernel_arg.shape is None:
-                    pass
-
-                elif any(shape_axis is None for shape_axis in kernel_arg.shape):
-                    gen("if len(%s.shape) != %s:"
-                            % (arg.name, len(arg.unvec_shape)))
-                    with Indentation(gen):
-                        gen(shape_mismatch_msg)
-
-                    for i, shape_axis in enumerate(arg.unvec_shape):
-                        if shape_axis is None:
-                            continue
-
-                        gen("if %s.shape[%d] != %s:"
-                                % (arg.name, i, strify(shape_axis)))
-                        with Indentation(gen):
-                            gen(shape_mismatch_msg)
-
-                else:  # not None, no Nones in tuple
-                    gen("if %s.shape != %s:"
-                            % (arg.name, strify(arg.unvec_shape)))
-                    with Indentation(gen):
-                        gen(shape_mismatch_msg)
-
-                # }}}
-
-                if arg.unvec_strides and kernel_arg.dim_tags:
-                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                    sym_strides = tuple(
-                            itemsize*s_i for s_i in arg.unvec_strides)
-                    gen("if %s.strides != %s:"
-                            % (arg.name, strify(sym_strides)))
-                    with Indentation(gen):
-                        gen("raise TypeError(\"strides mismatch on "
-                                "argument '%s' (got: %%s, expected: %%s)\" "
-                                "%% (%s.strides, %s))"
-                                % (arg.name, arg.name, strify(sym_strides)))
-
-                if not arg.allows_offset:
-                    gen("if %s.offset:" % arg.name)
-                    with Indentation(gen):
-                        gen("raise ValueError(\"Argument '%s' does not "
-                                "allow arrays with offsets. Try passing "
-                                "default_offset=loopy.auto to make_kernel()."
-                                "\")" % arg.name)
-                        gen("")
-
-        # }}}
-
-        if possibly_made_by_loopy and not options.skip_arg_checks:
-            gen("del _lpy_made_by_loopy")
-            gen("")
-
-        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-            args.append("%s.base_data" % arg.name)
-        else:
-            args.append("%s" % arg.name)
-
-        gen("")
-
-        gen("# }}}")
-        gen("")
-
-    gen("# }}}")
-    gen("")
-
-    return args
-
-# }}}
-
-
-def generate_invoker(kernel, codegen_result):
-    options = kernel.options
-    implemented_data_info = codegen_result.implemented_data_info
-    host_code = codegen_result.host_code()
-
-    system_args = [
-            "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
-            # ignored if options.no_numpy
-            "out_host=None"
-            ]
-
-    from loopy.kernel.data import KernelArgument
-    gen = PythonFunctionGenerator(
-            "invoke_%s_loopy_kernel" % kernel.name,
-            system_args + [
-                "%s=None" % idi.name
-                for idi in implemented_data_info
-                if issubclass(idi.arg_class, KernelArgument)
-                ])
-
-    gen.add_to_preamble("from __future__ import division")
-    gen.add_to_preamble("")
-    gen.add_to_preamble("import pyopencl as _lpy_cl")
-    gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
-    gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
-    gen.add_to_preamble("import numpy as _lpy_np")
-    gen.add_to_preamble("")
-    gen.add_to_preamble(host_code)
-    gen.add_to_preamble("")
-
-    gen("if allocator is None:")
-    with Indentation(gen):
-        gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
-    gen("")
-
-    generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info)
-    generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info)
-    generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info)
-    generate_value_arg_check(gen, kernel, implemented_data_info)
-
-    args = generate_arg_setup(gen, kernel, implemented_data_info, options)
-
-    # {{{ generate invocation
-
-    gen("_lpy_evt = {kernel_name}({args})"
-            .format(
-                kernel_name=codegen_result.host_program.name,
-                args=", ".join(
-                    ["_lpy_cl_kernels", "queue"]
-                    + args
-                    + ["wait_for=wait_for"])))
-
-    # }}}
-
-    # {{{ output
-
-    if not options.no_numpy:
-        gen("if out_host is None and (_lpy_encountered_numpy "
-                "and not _lpy_encountered_dev):")
-        with Indentation(gen):
-            gen("out_host = True")
-
-        gen("if out_host:")
-        with Indentation(gen):
-            gen("pass")  # if no outputs (?!)
-            for arg in implemented_data_info:
-                if not issubclass(arg.arg_class, KernelArgument):
-                    continue
-
-                is_written = arg.base_name in kernel.get_written_variables()
-                if is_written:
-                    gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
-
-        gen("")
-
-    if options.return_dict:
-        gen("return _lpy_evt, {%s}"
-                % ", ".join("\"%s\": %s" % (arg.name, arg.name)
-                    for arg in implemented_data_info
-                    if issubclass(arg.arg_class, KernelArgument)
-                    if arg.base_name in kernel.get_written_variables()))
-    else:
-        out_args = [arg
-                for arg in implemented_data_info
-                    if issubclass(arg.arg_class, KernelArgument)
-                if arg.base_name in kernel.get_written_variables()]
-        if out_args:
-            gen("return _lpy_evt, (%s,)"
-                    % ", ".join(arg.name for arg in out_args))
-        else:
-            gen("return _lpy_evt, ()")
-
-    # }}}
-
-    if options.write_wrapper:
-        output = gen.get()
-        if options.highlight_wrapper:
-            output = get_highlighted_python_code(output)
-
-        if options.write_wrapper is True:
-            print(output)
-        else:
-            with open(options.write_wrapper, "w") as outf:
-                outf.write(output)
-
-    return gen.get_function()
-
-
-# }}}
-
-
-# {{{ compiled kernel object
-
-class _CLKernelInfo(Record):
-    pass
-
-
-class _CLKernels(object):
-    pass
-
-
-class CompiledKernel:
-    """An object connecting a kernel to a :class:`pyopencl.Context`
-    for execution.
-
-    .. automethod:: __init__
-    .. automethod:: __call__
-    """
-
+class CompiledKernel(PyOpenCLKernelExecutor):
     def __init__(self, context, kernel):
-        """
-        :arg context: a :class:`pyopencl.Context`
-        :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
-            (a warning will be issued if more than one is returned). If the
-            kernel has not yet been loop-scheduled, that is done, too, with no
-            specific arguments.
-        """
-
-        self.context = context
-
-        from loopy.target.pyopencl import PyOpenCLTarget
-        self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
-
-        self.packing_controller = SeparateArrayPackingController(kernel)
-
-        self.output_names = tuple(arg.name for arg in self.kernel.args
-                if arg.name in self.kernel.get_written_variables())
-
-    @memoize_method
-    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
-        kernel = self.kernel
-
-        from loopy.kernel.tools import add_dtypes
-
-        if var_to_dtype_set:
-            var_to_dtype = {}
-            for var, dtype in var_to_dtype_set:
-                try:
-                    dest_name = kernel.impl_arg_to_arg[var].name
-                except KeyError:
-                    dest_name = var
-
-                try:
-                    var_to_dtype[dest_name] = dtype
-                except KeyError:
-                    raise LoopyError("cannot set type for '%s': "
-                            "no known variable/argument with that name"
-                            % var)
-
-            kernel = add_dtypes(kernel, var_to_dtype)
-
-            from loopy.preprocess import infer_unknown_types
-            kernel = infer_unknown_types(kernel, expect_completion=True)
-
-        if kernel.schedule is None:
-            from loopy.preprocess import preprocess_kernel
-            kernel = preprocess_kernel(kernel)
-
-            from loopy.schedule import get_one_scheduled_kernel
-            kernel = get_one_scheduled_kernel(kernel)
-
-        return kernel
-
-    @memoize_method
-    def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
-        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
-
-        from loopy.codegen import generate_code_v2
-        codegen_result = generate_code_v2(kernel)
-
-        dev_code = codegen_result.device_code()
-
-        if self.kernel.options.write_cl:
-            output = dev_code
-            if self.kernel.options.highlight_cl:
-                output = get_highlighted_cl_code(output)
-
-            if self.kernel.options.write_cl is True:
-                print(output)
-            else:
-                with open(self.kernel.options.write_cl, "w") as outf:
-                    outf.write(output)
-
-        if self.kernel.options.edit_cl:
-            from pytools import invoke_editor
-            dev_code = invoke_editor(dev_code, "code.cl")
-
-        import pyopencl as cl
-
-        logger.info("%s: opencl compilation start" % self.kernel.name)
-
-        cl_program = (
-                cl.Program(self.context, dev_code)
-                .build(options=kernel.options.cl_build_options))
-
-        cl_kernels = _CLKernels()
-        for dp in codegen_result.device_programs:
-            setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
-
-        logger.info("%s: opencl compilation done" % self.kernel.name)
-
-        return _CLKernelInfo(
-                kernel=kernel,
-                cl_kernels=cl_kernels,
-                implemented_data_info=codegen_result.implemented_data_info,
-                invoker=generate_invoker(kernel, codegen_result))
-
-    # {{{ debugging aids
-
-    def get_code(self, arg_to_dtype=None):
-        if arg_to_dtype is not None:
-            arg_to_dtype = frozenset(six.iteritems(arg_to_dtype))
-
-        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
-
-        from loopy.codegen import generate_code_v2
-        code = generate_code_v2(kernel)
-        return code.device_code()
-
-    def get_highlighted_code(self, arg_to_dtype=None):
-        return get_highlighted_cl_code(
-                self.get_code(arg_to_dtype))
-
-    @property
-    def code(self):
         from warnings import warn
-        warn("CompiledKernel.code is deprecated. Use .get_code() instead.",
+        warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.",
                 DeprecationWarning, stacklevel=2)
 
-        return self.get_code()
-
-    # }}}
-
-    def __call__(self, queue, **kwargs):
-        """
-        :arg allocator: a callable passed a byte count and returning
-            a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator
-            maybe.
-        :arg wait_for: A list of :class:`pyopencl.Event` instances
-            for which to wait.
-        :arg out_host: :class:`bool`
-            Decides whether output arguments (i.e. arguments
-            written by the kernel) are to be returned as
-            :mod:`numpy` arrays. *True* for yes, *False* for no.
-
-            For the default value of *None*, if all (input) array
-            arguments are :mod:`numpy` arrays, defaults to
-            returning :mod:`numpy` arrays as well.
-
-        :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event`
-            associated with the execution of the kernel, and
-            output is a tuple of output arguments (arguments that
-            are written as part of the kernel). The order is given
-            by the order of kernel arguments. If this order is unspecified
-            (such as when kernel arguments are inferred automatically),
-            enable :attr:`loopy.Options.return_dict` to make *output* a
-            :class:`dict` instead, with keys of argument names and values
-            of the returned arrays.
-        """
-
-        allocator = kwargs.pop("allocator", None)
-        wait_for = kwargs.pop("wait_for", None)
-        out_host = kwargs.pop("out_host", None)
-
-        kwargs = self.packing_controller.unpack(kwargs)
-
-        impl_arg_to_arg = self.kernel.impl_arg_to_arg
-        arg_to_dtype = {}
-        for arg_name, val in six.iteritems(kwargs):
-            arg = impl_arg_to_arg.get(arg_name, None)
-
-            if arg is None:
-                # offsets, strides and such
-                continue
-
-            if arg.dtype is None and val is not None:
-                try:
-                    dtype = val.dtype
-                except AttributeError:
-                    pass
-                else:
-                    arg_to_dtype[arg_name] = dtype
-
-        kernel_info = self.cl_kernel_info(
-                frozenset(six.iteritems(arg_to_dtype)))
-
-        return kernel_info.invoker(
-                kernel_info.cl_kernels, queue, allocator, wait_for,
-                out_host, **kwargs)
+        super(CompiledKernel, self).__init__(context, kernel)
 
 # }}}
-
-
-def get_highlighted_python_code(text):
-    try:
-        from pygments import highlight
-    except ImportError:
-        return text
-    else:
-        from pygments.lexers import PythonLexer
-        from pygments.formatters import TerminalFormatter
-
-        return highlight(text, PythonLexer(), TerminalFormatter())
-
-
-def get_highlighted_cl_code(text):
-    try:
-        from pygments import highlight
-    except ImportError:
-        return text
-    else:
-        from pygments.lexers import CLexer
-        from pygments.formatters import TerminalFormatter
-
-        return highlight(text, CLexer(), TerminalFormatter())
-
-
-# vim: foldmethod=marker
diff --git a/loopy/execution.py b/loopy/execution.py
new file mode 100644
index 0000000000000000000000000000000000000000..802684247f9f95a4374838ddcfaaae0ddbadec2e
--- /dev/null
+++ b/loopy/execution.py
@@ -0,0 +1,200 @@
+from __future__ import division, with_statement, absolute_import
+
+__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import six
+import numpy as np
+from pytools import Record, memoize_method
+from loopy.diagnostic import LoopyError
+
+
+# {{{ object array argument packing
+
+class _PackingInfo(Record):
+    """
+    .. attribute:: name
+    .. attribute:: sep_shape
+
+    .. attribute:: subscripts_and_names
+
+        A list of type ``[(index, unpacked_name), ...]``.
+    """
+
+
+class SeparateArrayPackingController(object):
+    """For argument arrays with axes tagged to be implemented as separate
+    arrays, this class provides preprocessing of the incoming arguments so that
+    all sub-arrays may be passed in one object array (under the original,
+    un-split argument name) and are unpacked into separate arrays before being
+    passed to the kernel.
+
+    It also repacks outgoing arrays of this type back into an object array.
+    """
+
+    def __init__(self, kernel):
+        # map from arg name
+        self.packing_info = {}
+
+        from loopy.kernel.array import ArrayBase
+        for arg in kernel.args:
+            if not isinstance(arg, ArrayBase):
+                continue
+
+            if arg.shape is None or arg.dim_tags is None:
+                continue
+
+            subscripts_and_names = arg.subscripts_and_names()
+
+            if subscripts_and_names is None:
+                continue
+
+            self.packing_info[arg.name] = _PackingInfo(
+                    name=arg.name,
+                    sep_shape=arg.sep_shape(),
+                    subscripts_and_names=subscripts_and_names,
+                    is_written=arg.name in kernel.get_written_variables())
+
+    def unpack(self, kernel_kwargs):
+        if not self.packing_info:
+            return kernel_kwargs
+
+        kernel_kwargs = kernel_kwargs.copy()
+
+        for packing_info in six.itervalues(self.packing_info):
+            arg_name = packing_info.name
+            if packing_info.name in kernel_kwargs:
+                arg = kernel_kwargs[arg_name]
+                for index, unpacked_name in packing_info.subscripts_and_names:
+                    assert unpacked_name not in kernel_kwargs
+                    kernel_kwargs[unpacked_name] = arg[index]
+                del kernel_kwargs[arg_name]
+
+        return kernel_kwargs
+
+    def pack(self, outputs):
+        if not self.packing_info:
+            return outputs
+
+        for packing_info in six.itervalues(self.packing_info):
+            if not packing_info.is_written:
+                continue
+
+            result = outputs[packing_info.name] = \
+                    np.zeros(packing_info.sep_shape, dtype=object)
+
+            for index, unpacked_name in packing_info.subscripts_and_names:
+                result[index] = outputs.pop(unpacked_name)
+
+        return outputs
+
+# }}}
+
+
+# {{{ KernelExecutorBase
+
+class KernelExecutorBase(object):
+    """An object connecting a kernel to a :class:`pyopencl.Context`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, kernel):
+        """
+        :arg kernel: a loopy.LoopKernel
+        """
+
+        self.kernel = kernel
+
+        self.packing_controller = SeparateArrayPackingController(kernel)
+
+        self.output_names = tuple(arg.name for arg in self.kernel.args
+                if arg.name in self.kernel.get_written_variables())
+
+        self.has_runtime_typed_args = any(
+                arg.dtype is None
+                for arg in kernel.args)
+
+    @memoize_method
+    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
+        kernel = self.kernel
+
+        from loopy.kernel.tools import add_dtypes
+
+        if var_to_dtype_set:
+            var_to_dtype = {}
+            for var, dtype in var_to_dtype_set:
+                try:
+                    dest_name = kernel.impl_arg_to_arg[var].name
+                except KeyError:
+                    dest_name = var
+
+                try:
+                    var_to_dtype[dest_name] = dtype
+                except KeyError:
+                    raise LoopyError("cannot set type for '%s': "
+                            "no known variable/argument with that name"
+                            % var)
+
+            kernel = add_dtypes(kernel, var_to_dtype)
+
+            from loopy.preprocess import infer_unknown_types
+            kernel = infer_unknown_types(kernel, expect_completion=True)
+
+        if kernel.schedule is None:
+            from loopy.preprocess import preprocess_kernel
+            kernel = preprocess_kernel(kernel)
+
+            from loopy.schedule import get_one_scheduled_kernel
+            kernel = get_one_scheduled_kernel(kernel)
+
+        return kernel
+
+    def arg_to_dtype_set(self, kwargs):
+        if not self.has_runtime_typed_args:
+            return None
+
+        impl_arg_to_arg = self.kernel.impl_arg_to_arg
+        arg_to_dtype = {}
+        for arg_name, val in six.iteritems(kwargs):
+            arg = impl_arg_to_arg.get(arg_name, None)
+
+            if arg is None:
+                # offsets, strides and such
+                continue
+
+            if arg.dtype is None and val is not None:
+                try:
+                    dtype = val.dtype
+                except AttributeError:
+                    pass
+                else:
+                    arg_to_dtype[arg_name] = dtype
+
+        return frozenset(six.iteritems(arg_to_dtype))
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index e6e3d07c3a93ab0d3b7a308505cae1549802e364..f0f978c43fec48e24d192f2626baf59e4657aa3d 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -234,7 +234,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         from loopy.kernel.data import Assignment
         insn = Assignment(
                 lhs, rhs,
-                forced_iname_deps=frozenset(
+                within_inames=frozenset(
                     scope.active_loopy_inames),
                 depends_on=depends_on,
                 id=new_id,
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 077bc2dd12aeede3d59abd93ce04f795dad99b3f..c3c543d853b0f21db2ee7b1f5e63952f3cd4e716 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -303,6 +303,8 @@ class LoopKernel(RecordWithoutPickling):
                 state=state,
                 target=target)
 
+        self._kernel_executor_cache = {}
+
     # }}}
 
     # {{{ function mangling
@@ -671,7 +673,7 @@ class LoopKernel(RecordWithoutPickling):
         """
         result = {}
         for insn in self.instructions:
-            result[insn.id] = insn.forced_iname_deps
+            result[insn.id] = insn.within_inames
 
         return result
 
@@ -685,7 +687,7 @@ class LoopKernel(RecordWithoutPickling):
     def insn_inames(self, insn):
         if isinstance(insn, str):
             insn = self.id_to_insn[insn]
-        return insn.forced_iname_deps
+        return insn.within_inames
 
     @memoize_method
     def iname_to_insns(self):
@@ -1262,14 +1264,15 @@ class LoopKernel(RecordWithoutPickling):
 
     # {{{ direct execution
 
-    @memoize_method
-    def get_compiled_kernel(self, ctx):
-        from loopy.compiled import CompiledKernel
-        return CompiledKernel(ctx, self)
+    def __call__(self, *args, **kwargs):
+        key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
+        try:
+            kex = self._kernel_executor_cache[key]
+        except KeyError:
+            kex = self.target.get_kernel_executor(self, *args, **kwargs)
+            self._kernel_executor_cache[key] = kex
 
-    def __call__(self, queue, **kwargs):
-        cknl = self.get_compiled_kernel(queue.context)
-        return cknl(queue, **kwargs)
+        return kex(*args, **kwargs)
 
     # }}}
 
@@ -1363,7 +1366,7 @@ class LoopKernel(RecordWithoutPickling):
                     return False
 
                 for set_a, set_b in zip(self.domains, other.domains):
-                    if not set_a.plain_is_equal(set_b):
+                    if not (set_a.plain_is_equal(set_b) or set_a.is_equal(set_b)):
                         return False
 
             elif field_name == "assumptions":
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index ce8092b346f3a4232e7398e45799cda28a99c08c..cff92016dd4b07f937d443adbf8a3baa5fc5890a 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -782,7 +782,6 @@ class ArrayBase(Record):
             kwargs["strides"] = strides
 
         if dim_names is not None and not isinstance(dim_names, tuple):
-            pu.db
             from warnings import warn
             warn("dim_names is not a tuple when calling ArrayBase constructor",
                     DeprecationWarning, stacklevel=2)
@@ -1173,6 +1172,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
         or *None*.
     """
 
+    import loopy as lp
+    from pymbolic import var
+
     def eval_expr_assert_integer_constant(i, expr):
         from pymbolic.mapper.evaluator import UnknownVariableError
         try:
@@ -1191,6 +1193,16 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
         return result
 
+    def apply_offset(sub):
+        if ary.offset:
+            offset_name = ary.offset
+            if offset_name is lp.auto:
+                offset_name = array_name+"_offset"
+
+            return var(offset_name) + sub
+        else:
+            return sub
+
     if not isinstance(index, tuple):
         index = (index,)
 
@@ -1204,7 +1216,10 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                     "'shape=None'?)"
                     % ary.name)
 
-        return AccessInfo(array_name=array_name, subscripts=index, vector_index=None)
+        return AccessInfo(
+                array_name=array_name,
+                subscripts=(apply_offset(index[0]),),
+                vector_index=None)
 
     if len(ary.dim_tags) != len(index):
         raise LoopyError("subscript to '%s[%s]' has the wrong "
@@ -1231,8 +1246,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, FixedStrideArrayDimTag):
-            import loopy as lp
-
             stride = dim_tag.stride
 
             if is_integer(stride):
@@ -1243,7 +1256,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                             % (ary.name, i, dim_tag.stride, vector_size))
 
             elif stride is lp.auto:
-                from pymbolic import var
                 stride = var(array_name + "_stride%d" % i)
 
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
@@ -1278,11 +1290,7 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
         if num_target_axes > 1:
             raise NotImplementedError("offsets for multiple image axes")
 
-        offset_name = ary.offset
-        if offset_name is lp.auto:
-            offset_name = array_name+"_offset"
-
-        subscripts[0] = var(offset_name) + subscripts[0]
+        subscripts[0] = apply_offset(subscripts[0])
 
     return AccessInfo(
             array_name=array_name,
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 3583fdc689659cca3d240edf9be7f501672d9987..6aa2fcbbf6cabd4b5c8eea98dbbcbed35f0edd89 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -158,8 +158,8 @@ def get_default_insn_options_dict():
         "insn_id": None,
         "inames_to_dup": [],
         "priority": 0,
-        "forced_iname_deps_is_final": False,
-        "forced_iname_deps": frozenset(),
+        "within_inames_is_final": False,
+        "within_inames": frozenset(),
         "predicates": frozenset(),
         "tags": frozenset(),
         "atomicity": (),
@@ -215,10 +215,10 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
             for value in opt_value.split(":"):
                 arrow_idx = value.find("->")
                 if arrow_idx >= 0:
-                    result["inames_to_dup"].append(
+                    result.setdefault("inames_to_dup", []).append(
                             (value[:arrow_idx], value[arrow_idx+2:]))
                 else:
-                    result["inames_to_dup"].append((value, None))
+                    result.setdefault("inames_to_dup", []).append((value, None))
 
         elif opt_key == "dep" and opt_value is not None:
             if is_with_block:
@@ -254,12 +254,12 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
 
         elif opt_key == "inames" and opt_value is not None:
             if opt_value.startswith("+"):
-                result["forced_iname_deps_is_final"] = False
+                result["within_inames_is_final"] = False
                 opt_value = (opt_value[1:]).strip()
             else:
-                result["forced_iname_deps_is_final"] = True
+                result["within_inames_is_final"] = True
 
-            result["forced_iname_deps"] = intern_frozenset_of_ids(
+            result["within_inames"] = intern_frozenset_of_ids(
                     opt_value.split(":"))
 
         elif opt_key == "if" and opt_value is not None:
@@ -316,8 +316,14 @@ WITH_OPTIONS_RE = re.compile(
 
 FOR_RE = re.compile(
         "^"
-        "\s*for\s*"
-        "(?P<inames>[ ,\w]+)"
+        "\s*(for)\s+"
+        "(?P<inames>[ ,\w]*)"
+        "\s*$")
+
+IF_RE = re.compile(
+        "^"
+        "\s*if\s+"
+        "(?P<predicate>\w+)"
         "\s*$")
 
 INSN_RE = re.compile(
@@ -329,6 +335,13 @@ INSN_RE = re.compile(
         "\s*?"
         "(?:\{(?P<options>.+)\}\s*)?$")
 
+EMPTY_LHS_INSN_RE = re.compile(
+        "^"
+        "\s*"
+        "(?P<rhs>.+?)"
+        "\s*?"
+        "(?:\{(?P<options>.+)\}\s*)?$")
+
 SUBST_RE = re.compile(
         r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$")
 
@@ -342,14 +355,17 @@ def parse_insn(groups, insn_options):
     """
 
     import loopy as lp
-
     from loopy.symbolic import parse
-    try:
-        lhs = parse(groups["lhs"])
-    except:
-        print("While parsing left hand side '%s', "
-                "the following error occurred:" % groups["lhs"])
-        raise
+
+    if "lhs" in groups:
+        try:
+            lhs = parse(groups["lhs"])
+        except:
+            print("While parsing left hand side '%s', "
+                    "the following error occurred:" % groups["lhs"])
+            raise
+    else:
+        lhs = ()
 
     try:
         rhs = parse(groups["rhs"])
@@ -508,8 +524,8 @@ def parse_instructions(instructions, defines):
                         groups=frozenset(intern(grp) for grp in insn.groups),
                         conflicts_with_groups=frozenset(
                             intern(grp) for grp in insn.conflicts_with_groups),
-                        forced_iname_deps=frozenset(
-                            intern(iname) for iname in insn.forced_iname_deps),
+                        within_inames=frozenset(
+                            intern(iname) for iname in insn.within_inames),
                         predicates=frozenset(
                             intern(pred) for pred in insn.predicates),
                         ))
@@ -593,27 +609,27 @@ def parse_instructions(instructions, defines):
 
     for insn in instructions:
         if isinstance(insn, InstructionBase):
-            local_fids = insn_options_stack[-1]["forced_iname_deps"]
+            local_w_inames = insn_options_stack[-1]["within_inames"]
 
-            if insn.forced_iname_deps_is_final:
+            if insn.within_inames_is_final:
                 if not (
-                        local_fids <= insn.forced_iname_deps):
+                        local_w_inames <= insn.within_inames):
                     raise LoopyError("non-parsed instruction '%s' without "
                             "inames '%s' (but with final iname dependencies) "
                             "found inside 'for'/'with' block for inames "
                             "'%s'"
                             % (insn.id,
-                                ", ".join(local_fids - insn.forced_iname_deps),
-                                insn_options_stack[-1].forced_iname_deps))
+                                ", ".join(local_w_inames - insn.within_inames),
+                                insn_options_stack[-1]["within_inames"]))
 
             else:
                 # not final, add inames from current scope
                 insn = insn.copy(
-                        forced_iname_deps=insn.forced_iname_deps | local_fids,
-                        forced_iname_deps_is_final=(
+                        within_inames=insn.within_inames | local_w_inames,
+                        within_inames_is_final=(
                             # If it's inside a for/with block, then it's
                             # final now.
-                            bool(local_fids)),
+                            bool(local_w_inames)),
                         tags=(
                             insn.tags
                             | insn_options_stack[-1]["tags"]),
@@ -631,7 +647,7 @@ def parse_instructions(instructions, defines):
             new_instructions.append(insn)
             inames_to_dup.append([])
 
-            del local_fids
+            del local_w_inames
 
             continue
 
@@ -646,12 +662,32 @@ def parse_instructions(instructions, defines):
         for_match = FOR_RE.match(insn)
         if for_match is not None:
             options = insn_options_stack[-1].copy()
-            options["forced_iname_deps"] = (
-                    options.get("forced_iname_deps", frozenset())
-                    | frozenset(
-                        iname.strip()
-                        for iname in for_match.group("inames").split(",")))
-            options["forced_iname_deps_is_final"] = True
+            added_inames = frozenset(
+                    iname.strip()
+                    for iname in for_match.group("inames").split(",")
+                    if iname.strip())
+            if not added_inames:
+                raise LoopyError("'for' without inames encountered")
+
+            options["within_inames"] = (
+                    options.get("within_inames", frozenset())
+                    | added_inames)
+            options["within_inames_is_final"] = True
+
+            insn_options_stack.append(options)
+            del options
+            continue
+
+        if_match = IF_RE.match(insn)
+        if if_match is not None:
+            options = insn_options_stack[-1].copy()
+            predicate = if_match.group("predicate")
+            if not predicate:
+                raise LoopyError("'if' without predicate encountered")
+
+            options["predicates"] = (
+                    options.get("predicates", frozenset())
+                    | frozenset([predicate]))
 
             insn_options_stack.append(options)
             del options
@@ -675,6 +711,14 @@ def parse_instructions(instructions, defines):
             inames_to_dup.append(insn_inames_to_dup)
             continue
 
+        insn_match = EMPTY_LHS_INSN_RE.match(insn)
+        if insn_match is not None:
+            insn, insn_inames_to_dup = parse_insn(
+                    insn_match.groupdict(), insn_options_stack[-1])
+            new_instructions.append(insn)
+            inames_to_dup.append(insn_inames_to_dup)
+            continue
+
         raise LoopyError("instruction parse error: %s" % insn)
 
     if len(insn_options_stack) != 1:
@@ -732,7 +776,7 @@ def parse_domains(domains, defines):
                 parameters = (_gather_isl_identifiers(dom)
                         - _find_inames_in_set(dom)
                         - _find_existentially_quantified_inames(dom))
-                dom = "[%s] -> %s" % (",".join(parameters), dom)
+                dom = "[%s] -> %s" % (",".join(sorted(parameters)), dom)
 
             try:
                 dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom)
@@ -815,17 +859,21 @@ class ArgumentGuesser:
             if isinstance(insn, MultiAssignmentBase):
                 for assignee_var_name in insn.assignee_var_names():
                     self.all_written_names.add(assignee_var_name)
-                    self.all_names.update(get_dependencies(
-                        self.submap(insn.assignees)))
-                    self.all_names.update(get_dependencies(
-                        self.submap(insn.expression)))
+
+                self.all_names.update(get_dependencies(
+                    self.submap(insn.assignees)))
+                self.all_names.update(get_dependencies(
+                    self.submap(insn.expression)))
 
     def find_index_rank(self, name):
         irf = IndexRankFinder(name)
 
+        def run_irf(expr):
+            irf(self.submap(expr))
+            return expr
+
         for insn in self.instructions:
-            insn.with_transformed_expressions(
-                    lambda expr: irf(self.submap(expr)))
+            insn.with_transformed_expressions(run_irf)
 
         if not irf.index_ranks:
             return 0
@@ -941,13 +989,13 @@ def check_for_duplicate_names(knl):
 
 def check_for_nonexistent_iname_deps(knl):
     for insn in knl.instructions:
-        if not set(insn.forced_iname_deps) <= knl.all_inames():
+        if not set(insn.within_inames) <= knl.all_inames():
             raise ValueError("In instruction '%s': "
                     "cannot force dependency on inames '%s'--"
                     "they don't exist" % (
                         insn.id,
                         ",".join(
-                            set(insn.forced_iname_deps)-knl.all_inames())))
+                            set(insn.within_inames)-knl.all_inames())))
 
 
 def check_for_multiple_writes_to_loop_bounds(knl):
@@ -1037,8 +1085,8 @@ def expand_cses(instructions, cse_prefix="cse_expr"):
                 assignee=Variable(new_var_name),
                 expression=expr,
                 predicates=insn.predicates,
-                forced_iname_deps=insn.forced_iname_deps,
-                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                within_inames=insn.within_inames,
+                within_inames_is_final=insn.within_inames_is_final,
                 )
         newly_created_insn_ids.add(new_insn.id)
         new_insns.append(new_insn)
@@ -1352,6 +1400,27 @@ def resolve_wildcard_deps(knl):
 # }}}
 
 
+# {{{ add used inames deps
+
+def add_used_inames(knl):
+    new_insns = []
+
+    for insn in knl.instructions:
+        deps = insn.read_dependency_names() | insn.write_dependency_names()
+        iname_deps = deps & knl.all_inames()
+
+        new_within_inames = insn.within_inames | iname_deps
+
+        if new_within_inames != insn.within_inames:
+            insn = insn.copy(within_inames=new_within_inames)
+
+        new_insns.append(insn)
+
+    return knl.copy(instructions=new_insns)
+
+# }}}
+
+
 # {{{ add inferred iname deps
 
 def add_inferred_inames(knl):
@@ -1359,7 +1428,7 @@ def add_inferred_inames(knl):
     insn_inames = find_all_insn_inames(knl)
 
     return knl.copy(instructions=[
-            insn.copy(forced_iname_deps=insn_inames[insn.id])
+            insn.copy(within_inames=insn_inames[insn.id])
             for insn in knl.instructions])
 
 # }}}
@@ -1549,10 +1618,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
             **kwargs)
 
     from loopy import duplicate_inames
+    from loopy.match import Id
     for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup):
         for old_iname, new_iname in insn_inames_to_dup:
             knl = duplicate_inames(knl, old_iname,
-                    within=insn.id, new_inames=new_iname)
+                    within=Id(insn.id), new_inames=new_iname)
 
     check_for_nonexistent_iname_deps(knl)
 
@@ -1563,6 +1633,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     # Must create temporaries before inferring inames (because those temporaries
     # mediate dependencies that are then used for iname propagation.)
     # -------------------------------------------------------------------------
+    knl = add_used_inames(knl)
     # NOTE: add_inferred_inames will be phased out and throws warnings if it
     # does something.
     knl = add_inferred_inames(knl)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index a884fb0f07f14072c75201b0fe4013e3d72f691c..5af3f0272073243417c3111a847b921a41afec6e 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 
 from six.moves import intern
+from warnings import warn
 import numpy as np  # noqa
 from pytools import Record, memoize_method
 from loopy.kernel.array import ArrayBase
@@ -590,32 +591,13 @@ class InstructionBase(Record):
 
     .. rubric:: Iname dependencies
 
-    .. attribute:: forced_iname_deps_is_final
+    .. attribute:: within_inames
 
-        A :class:`bool` determining whether :attr:`forced_iname_deps` constitutes
-        the *entire* list of iname dependencies.
-
-    .. attribute:: forced_iname_deps
-
-        A :class:`frozenset` of inames that are added to the list of iname
-        dependencies *or* constitute the entire list of iname dependencies,
-        depending on the value of :attr:`forced_iname_deps_is_final`.
+        A :class:`frozenset` of inames identifying the loops within which this
+        instruction will be executed.
 
     .. rubric:: Iname dependencies
 
-    .. attribute:: boostable
-
-        Whether the instruction may safely be executed inside more loops than
-        advertised without changing the meaning of the program. Allowed values
-        are *None* (for unknown), *True*, and *False*.
-
-    .. attribute:: boostable_into
-
-        A :class:`set` of inames into which the instruction
-        may need to be boosted, as a heuristic help for the scheduler.
-        Also allowed to be *None* to indicate that this hasn't been
-        decided yet.
-
     .. rubric:: Tagging
 
     .. attribute:: tags
@@ -623,6 +605,9 @@ class InstructionBase(Record):
         A :class:`frozenset` of string identifiers that can be used to
         identify groups of instructions.
 
+        Tags starting with exclamation marks (``!``) are reserved and may have
+        specific meanings defined by :mod:`loopy` or its targets.
+
     .. automethod:: __init__
     .. automethod:: assignee_var_names
     .. automethod:: assignee_subscript_deps
@@ -632,31 +617,48 @@ class InstructionBase(Record):
     .. automethod:: copy
     """
 
+    # within_inames_is_final, boostable and boostable_into are deprecated and
+    # will be removed in version 2017.x.
+
     fields = set("id depends_on depends_on_is_final "
             "groups conflicts_with_groups "
             "no_sync_with "
             "predicates "
-            "forced_iname_deps_is_final forced_iname_deps "
+            "within_inames_is_final within_inames "
             "priority boostable boostable_into".split())
 
     def __init__(self, id, depends_on, depends_on_is_final,
             groups, conflicts_with_groups,
             no_sync_with,
-            forced_iname_deps_is_final, forced_iname_deps,
+            within_inames_is_final, within_inames,
             priority,
             boostable, boostable_into, predicates, tags,
-            insn_deps=None, insn_deps_is_final=None):
+            insn_deps=None, insn_deps_is_final=None,
+            forced_iname_deps=None, forced_iname_deps_is_final=None):
+
+        # {{{ backwards compatibility goop
 
         if depends_on is not None and insn_deps is not None:
-            raise ValueError("may not specify both insn_deps and depends_on")
+            raise LoopyError("may not specify both insn_deps and depends_on")
         elif insn_deps is not None:
-            from warnings import warn
             warn("insn_deps is deprecated, use depends_on",
                     DeprecationWarning, stacklevel=2)
 
             depends_on = insn_deps
             depends_on_is_final = insn_deps_is_final
 
+        if forced_iname_deps is not None and within_inames is not None:
+            raise LoopyError("may not specify both forced_iname_deps "
+                    "and within_inames")
+        elif forced_iname_deps is not None:
+            warn("forced_iname_deps is deprecated, use within_inames",
+                    DeprecationWarning, stacklevel=2)
+
+            within_inames = forced_iname_deps
+            within_inames_is_final = forced_iname_deps_is_final
+
+        # }}}
+
         if depends_on is None:
             depends_on = frozenset()
 
@@ -669,8 +671,11 @@ class InstructionBase(Record):
         if no_sync_with is None:
             no_sync_with = frozenset()
 
-        if forced_iname_deps_is_final is None:
-            forced_iname_deps_is_final = False
+        if within_inames is None:
+            within_inames = frozenset()
+
+        if within_inames_is_final is None:
+            within_inames_is_final = False
 
         if depends_on_is_final is None:
             depends_on_is_final = False
@@ -694,10 +699,10 @@ class InstructionBase(Record):
         # assert all(is_interned(dep) for dep in depends_on)
         # assert all(is_interned(grp) for grp in groups)
         # assert all(is_interned(grp) for grp in conflicts_with_groups)
-        # assert all(is_interned(iname) for iname in forced_iname_deps)
+        # assert all(is_interned(iname) for iname in within_inames)
         # assert all(is_interned(pred) for pred in predicates)
 
-        assert isinstance(forced_iname_deps, frozenset)
+        assert isinstance(within_inames, frozenset)
         assert isinstance(depends_on, frozenset) or depends_on is None
         assert isinstance(groups, frozenset)
         assert isinstance(conflicts_with_groups, frozenset)
@@ -708,24 +713,45 @@ class InstructionBase(Record):
                 depends_on_is_final=depends_on_is_final,
                 no_sync_with=no_sync_with,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
-                forced_iname_deps_is_final=forced_iname_deps_is_final,
-                forced_iname_deps=forced_iname_deps,
+                within_inames_is_final=within_inames_is_final,
+                within_inames=within_inames,
                 priority=priority,
                 boostable=boostable,
                 boostable_into=boostable_into,
                 predicates=predicates,
                 tags=tags)
 
-    # legacy
+    # {{{ backwards compatibility goop
+
     @property
     def insn_deps(self):
+        warn("insn_deps is deprecated, use depends_on",
+                DeprecationWarning, stacklevel=2)
+
         return self.depends_on
 
     # legacy
     @property
     def insn_deps_is_final(self):
+        warn("insn_deps_is_final is deprecated, use depends_on_is_final",
+                DeprecationWarning, stacklevel=2)
+
         return self.depends_on_is_final
 
+    @property
+    def forced_iname_deps(self):
+        warn("forced_iname_deps is deprecated, use within_inames",
+                DeprecationWarning, stacklevel=2)
+        return self.within_inames
+
+    @property
+    def forced_iname_deps_is_final(self):
+        warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final",
+                DeprecationWarning, stacklevel=2)
+        return self.within_inames_is_final
+
+    # }}}
+
     # {{{ abstract interface
 
     def read_dependency_names(self):
@@ -877,8 +903,8 @@ class InstructionBase(Record):
         self.groups = intern_frozenset_of_ids(self.groups)
         self.conflicts_with_groups = (
                 intern_frozenset_of_ids(self.conflicts_with_groups))
-        self.forced_iname_deps = (
-                intern_frozenset_of_ids(self.forced_iname_deps))
+        self.within_inames = (
+                intern_frozenset_of_ids(self.within_inames))
         self.predicates = (
                 intern_frozenset_of_ids(self.predicates))
 
@@ -1170,12 +1196,13 @@ class Assignment(MultiAssignmentBase):
             groups=None,
             conflicts_with_groups=None,
             no_sync_with=None,
-            forced_iname_deps_is_final=None,
-            forced_iname_deps=frozenset(),
+            within_inames_is_final=None,
+            within_inames=None,
             boostable=None, boostable_into=None, tags=None,
             temp_var_type=None, atomicity=(),
             priority=0, predicates=frozenset(),
-            insn_deps=None, insn_deps_is_final=None):
+            insn_deps=None, insn_deps_is_final=None,
+            forced_iname_deps=None, forced_iname_deps_is_final=None):
 
         super(Assignment, self).__init__(
                 id=id,
@@ -1184,15 +1211,17 @@ class Assignment(MultiAssignmentBase):
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
                 no_sync_with=no_sync_with,
-                forced_iname_deps_is_final=forced_iname_deps_is_final,
-                forced_iname_deps=forced_iname_deps,
+                within_inames_is_final=within_inames_is_final,
+                within_inames=within_inames,
                 boostable=boostable,
                 boostable_into=boostable_into,
                 priority=priority,
                 predicates=predicates,
                 tags=tags,
                 insn_deps=insn_deps,
-                insn_deps_is_final=insn_deps_is_final)
+                insn_deps_is_final=insn_deps_is_final,
+                forced_iname_deps=forced_iname_deps,
+                forced_iname_deps_is_final=forced_iname_deps_is_final)
 
         from loopy.symbolic import parse
         if isinstance(assignee, str):
@@ -1200,13 +1229,10 @@ class Assignment(MultiAssignmentBase):
         if isinstance(expression, str):
             expression = parse(expression)
 
-        # FIXME: It may be worth it to enable this check eventually.
-        # For now, it causes grief with certain 'checky' uses of the
-        # with_transformed_expressions(). (notably the access checker)
-        #
-        # from pymbolic.primitives import Variable, Subscript
-        # if not isinstance(assignee, (Variable, Subscript)):
-        #     raise LoopyError("invalid lvalue '%s'" % assignee)
+        from pymbolic.primitives import Variable, Subscript
+        from loopy.symbolic import LinearSubscript
+        if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
+            raise LoopyError("invalid lvalue '%s'" % assignee)
 
         self.assignee = assignee
         self.expression = expression
@@ -1288,6 +1314,8 @@ class CallInstruction(MultiAssignmentBase):
 
     .. attribute:: assignees
 
+        A :class:`tuple` of left-hand sides for the assignment
+
     .. attribute:: expression
 
     The following attributes are only used until
@@ -1312,12 +1340,14 @@ class CallInstruction(MultiAssignmentBase):
             groups=None,
             conflicts_with_groups=None,
             no_sync_with=None,
-            forced_iname_deps_is_final=None,
-            forced_iname_deps=frozenset(),
+            within_inames_is_final=None,
+            within_inames=None,
             boostable=None, boostable_into=None, tags=None,
             temp_var_types=None,
             priority=0, predicates=frozenset(),
-            insn_deps=None, insn_deps_is_final=None):
+            insn_deps=None, insn_deps_is_final=None,
+            forced_iname_deps=None,
+            forced_iname_deps_is_final=None):
 
         super(CallInstruction, self).__init__(
                 id=id,
@@ -1326,15 +1356,17 @@ class CallInstruction(MultiAssignmentBase):
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
                 no_sync_with=no_sync_with,
-                forced_iname_deps_is_final=forced_iname_deps_is_final,
-                forced_iname_deps=forced_iname_deps,
+                within_inames_is_final=within_inames_is_final,
+                within_inames=within_inames,
                 boostable=boostable,
                 boostable_into=boostable_into,
                 priority=priority,
                 predicates=predicates,
                 tags=tags,
                 insn_deps=insn_deps,
-                insn_deps_is_final=insn_deps_is_final)
+                insn_deps_is_final=insn_deps_is_final,
+                forced_iname_deps=forced_iname_deps,
+                forced_iname_deps_is_final=forced_iname_deps_is_final)
 
         from pymbolic.primitives import Call
         from loopy.symbolic import Reduction
@@ -1345,16 +1377,19 @@ class CallInstruction(MultiAssignmentBase):
         from loopy.symbolic import parse
         if isinstance(assignees, str):
             assignees = parse(assignees)
+        if not isinstance(assignees, tuple):
+            raise LoopyError("'assignees' argument to CallInstruction "
+                    "must be a tuple or a string parseable to a tuple"
+                    "--got '%s'" % type(assignees).__name__)
+
         if isinstance(expression, str):
             expression = parse(expression)
 
-        # FIXME: It may be worth it to enable this check eventually.
-        # For now, it causes grief with certain 'checky' uses of the
-        # with_transformed_expressions(). (notably the access checker)
-        #
-        # from pymbolic.primitives import Variable, Subscript
-        # if not isinstance(assignee, (Variable, Subscript)):
-        #     raise LoopyError("invalid lvalue '%s'" % assignee)
+        from pymbolic.primitives import Variable, Subscript
+        from loopy.symbolic import LinearSubscript
+        for assignee in assignees:
+            if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
+                raise LoopyError("invalid lvalue '%s'" % assignee)
 
         self.assignees = assignees
         self.expression = expression
@@ -1414,9 +1449,7 @@ class CallInstruction(MultiAssignmentBase):
 
 
 def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
-    if len(assignees) < 1:
-        raise LoopyError("every instruction must have a left-hand side")
-    elif len(assignees) > 1:
+    if len(assignees) > 1 or len(assignees) == 0:
         atomicity = kwargs.pop("atomicity", ())
         if atomicity:
             raise LoopyError("atomic operations with more than one "
@@ -1491,7 +1524,7 @@ class CInstruction(InstructionBase):
             id=None, depends_on=None, depends_on_is_final=None,
             groups=None, conflicts_with_groups=None,
             no_sync_with=None,
-            forced_iname_deps_is_final=None, forced_iname_deps=frozenset(),
+            within_inames_is_final=None, within_inames=None,
             priority=0, boostable=None, boostable_into=None,
             predicates=frozenset(), tags=None,
             insn_deps=None, insn_deps_is_final=None):
@@ -1511,8 +1544,8 @@ class CInstruction(InstructionBase):
                 depends_on_is_final=depends_on_is_final,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
                 no_sync_with=no_sync_with,
-                forced_iname_deps_is_final=forced_iname_deps_is_final,
-                forced_iname_deps=forced_iname_deps,
+                within_inames_is_final=within_inames_is_final,
+                within_inames=within_inames,
                 boostable=boostable,
                 boostable_into=boostable_into,
                 priority=priority, predicates=predicates, tags=tags,
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index fa47cd275419c8ca9fbfdabb927ddad8bc25879c..7bf4d786613f5fb3b9d9633e1f810ccbc260d061 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -63,10 +63,11 @@ def _add_dtypes(knl, dtype_dict):
     dtype_dict = dtype_dict.copy()
     new_args = []
 
+    from loopy.types import to_loopy_type
     for arg in knl.args:
         new_dtype = dtype_dict.pop(arg.name, None)
         if new_dtype is not None:
-            new_dtype = np.dtype(new_dtype)
+            new_dtype = to_loopy_type(new_dtype)
             if arg.dtype is not None and arg.dtype != new_dtype:
                 raise RuntimeError(
                         "argument '%s' already has a different dtype "
@@ -142,7 +143,7 @@ def guess_iname_deps_based_on_var_use(kernel, insn, insn_id_to_inames=None):
         for writer_id in writer_map[tv_name]:
             writer_insn = kernel.id_to_insn[writer_id]
             if insn_id_to_inames is None:
-                writer_inames = writer_insn.forced_iname_deps
+                writer_inames = writer_insn.within_inames
             else:
                 writer_inames = insn_id_to_inames[writer_id]
 
@@ -180,12 +181,12 @@ def find_all_insn_inames(kernel):
         all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
         deps = read_deps | write_deps
 
-        if insn.forced_iname_deps_is_final:
-            iname_deps = insn.forced_iname_deps
+        if insn.within_inames_is_final:
+            iname_deps = insn.within_inames
         else:
             iname_deps = (
                     deps & kernel.all_inames()
-                    | insn.forced_iname_deps)
+                    | insn.within_inames)
 
         assert isinstance(read_deps, frozenset), type(insn)
         assert isinstance(write_deps, frozenset), type(insn)
@@ -217,7 +218,7 @@ def find_all_insn_inames(kernel):
         did_something = False
         for insn in kernel.instructions:
 
-            if insn.forced_iname_deps_is_final:
+            if insn.within_inames_is_final:
                 continue
 
             # {{{ depdency-based propagation
@@ -232,12 +233,12 @@ def find_all_insn_inames(kernel):
                 did_something = True
 
                 warn_with_kernel(kernel, "inferred_iname",
-                        "The iname(s) '%s' on instruction '%s' in kernel '%s' "
+                        "The iname(s) '%s' on instruction '%s' "
                         "was/were automatically added. "
                         "This is deprecated. Please add the iname "
                         "to the instruction "
                         "explicitly, e.g. by adding 'for' loops"
-                        % (", ".join(inames_new-inames_old), insn.id, kernel.name))
+                        % (", ".join(inames_new-inames_old), insn.id))
 
             # }}}
 
@@ -273,7 +274,7 @@ def find_all_insn_inames(kernel):
                         "automatically added. "
                         "This is deprecated. Please add the iname "
                         "to the instruction "
-                        "explicitly, e.g. by adding '{inames=...}"
+                        "explicitly, e.g. by adding 'for' loops"
                         % (", ".join(inames_new-inames_old), insn.id))
 
             # }}}
diff --git a/loopy/maxima.py b/loopy/maxima.py
index 738df86c4b94988b57d6dc01bcc22bb0ca62ac21..22d0c085c0edd5a96b6b45b457957f4be50e49d7 100644
--- a/loopy/maxima.py
+++ b/loopy/maxima.py
@@ -25,7 +25,8 @@ THE SOFTWARE.
 """
 
 
-from pymbolic.maxima import MaximaStringifyMapper as MaximaStringifyMapperBase
+from pymbolic.interop.maxima import \
+        MaximaStringifyMapper as MaximaStringifyMapperBase
 
 
 class MaximaStringifyMapper(MaximaStringifyMapperBase):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 9738777ad1fd599732e39d419c3de43827f13470..bdb91d01d7988c1bf5517f699664e0090f97cd34 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -528,8 +528,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         init_insn = make_assignment(
                 id=init_id,
                 assignees=acc_vars,
-                forced_iname_deps=outer_insn_inames - frozenset(expr.inames),
-                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                within_inames=outer_insn_inames - frozenset(expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset(),
                 expression=expr.operation.neutral_element(arg_dtype, expr.inames))
 
@@ -539,8 +539,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
 
         update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
-        if insn.forced_iname_deps_is_final:
-            update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames)
+        if insn.within_inames_is_final:
+            update_insn_iname_deps = insn.within_inames | set(expr.inames)
 
         reduction_insn = make_assignment(
                 id=update_id,
@@ -550,8 +550,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     acc_vars if len(acc_vars) > 1 else acc_vars[0],
                     expr.expr, expr.inames),
                 depends_on=frozenset([init_insn.id]) | insn.depends_on,
-                forced_iname_deps=update_insn_iname_deps,
-                forced_iname_deps_is_final=insn.forced_iname_deps_is_final)
+                within_inames=update_insn_iname_deps,
+                within_inames_is_final=insn.within_inames_is_final)
 
         generated_insns.append(reduction_insn)
 
@@ -644,8 +644,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                     for acc_var in acc_vars),
                 expression=neutral,
-                forced_iname_deps=base_iname_deps | frozenset([base_exec_iname]),
-                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                within_inames=base_iname_deps | frozenset([base_exec_iname]),
+                within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset())
         generated_insns.append(init_insn)
 
@@ -657,10 +657,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     for acc_var in acc_vars),
                 expression=expr.operation(
                     arg_dtype, neutral, expr.expr, expr.inames),
-                forced_iname_deps=(
+                within_inames=(
                     (outer_insn_inames - frozenset(expr.inames))
                     | frozenset([red_iname])),
-                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset([init_id]) | insn.depends_on,
                 no_sync_with=frozenset([init_id]))
         generated_insns.append(transfer_insn)
@@ -706,9 +706,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                                     var(stage_exec_iname) + new_size,)]
                             for acc_var in acc_vars]),
                         expr.inames),
-                    forced_iname_deps=(
+                    within_inames=(
                         base_iname_deps | frozenset([stage_exec_iname])),
-                    forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                    within_inames_is_final=insn.within_inames_is_final,
                     depends_on=frozenset([prev_id]),
                     )
 
@@ -721,7 +721,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         new_insn_add_depends_on.add(prev_id)
         new_insn_add_no_sync_with.add(prev_id)
-        new_insn_add_forced_iname_deps.add(stage_exec_iname or base_exec_iname)
+        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)
 
         if nresults == 1:
             assert len(acc_vars) == 1
@@ -831,7 +831,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
     while insn_queue:
         new_insn_add_depends_on = set()
         new_insn_add_no_sync_with = set()
-        new_insn_add_forced_iname_deps = set()
+        new_insn_add_within_inames = set()
 
         generated_insns = []
 
@@ -860,9 +860,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     | frozenset(new_insn_add_depends_on),
                     no_sync_with=insn.no_sync_with
                     | frozenset(new_insn_add_no_sync_with),
-                    forced_iname_deps=(
+                    within_inames=(
                         temp_kernel.insn_inames(insn)
-                        | new_insn_add_forced_iname_deps))
+                        | new_insn_add_within_inames))
 
             kwargs.pop("id")
             kwargs.pop("expression")
@@ -871,13 +871,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             kwargs.pop("temp_var_type", None)
             kwargs.pop("temp_var_types", None)
 
-            replacement_insns = [
-                    lp.Assignment(
-                        id=insn_id_gen(insn.id),
-                        assignee=assignee,
-                        expression=new_expr,
-                        **kwargs)
-                    for assignee, new_expr in zip(insn.assignees, new_expressions)]
+            if isinstance(insn.expression, Reduction) and nresults > 1:
+                replacement_insns = [
+                        lp.Assignment(
+                            id=insn_id_gen(insn.id),
+                            assignee=assignee,
+                            expression=new_expr,
+                            **kwargs)
+                        for assignee, new_expr in zip(
+                            insn.assignees, new_expressions)]
+
+            else:
+                new_expr, = new_expressions
+                replacement_insns = [
+                        make_assignment(
+                            id=insn_id_gen(insn.id),
+                            assignees=insn.assignees,
+                            expression=new_expr,
+                            **kwargs)
+                        ]
 
             insn_id_replacements[insn.id] = [
                     rinsn.id for rinsn in replacement_insns]
@@ -1054,7 +1066,7 @@ def preprocess_kernel(kernel, device=None):
 
         try:
             result = preprocess_cache[kernel]
-            logger.info("%s: preprocess cache hit" % kernel.name)
+            logger.debug("%s: preprocess cache hit" % kernel.name)
             return result
         except KeyError:
             pass
@@ -1063,6 +1075,9 @@ def preprocess_kernel(kernel, device=None):
 
     logger.info("%s: preprocess start" % kernel.name)
 
+    from loopy.check import check_identifiers_in_subst_rules
+    check_identifiers_in_subst_rules(kernel)
+
     # {{{ check that there are no l.auto-tagged inames
 
     from loopy.kernel.data import AutoLocalIndexTagBase
@@ -1106,6 +1121,8 @@ def preprocess_kernel(kernel, device=None):
     kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel)
 
     kernel = find_temporary_scope(kernel)
+
+    # boostability should be removed in 2017.x.
     kernel = find_idempotence(kernel)
     kernel = limit_boostability(kernel)
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 3437c49177e87288c5f0a14a599618ee0144bf72..04882334f7c5ce9337a690277b16e258eac5327b 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -27,7 +27,7 @@ import six
 from pytools import Record
 import sys
 import islpy as isl
-from loopy.diagnostic import LoopyError  # noqa
+from loopy.diagnostic import warn_with_kernel, LoopyError  # noqa
 
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
@@ -529,6 +529,11 @@ class SchedulerState(Record):
         A mapping from instruction group names to the number of instructions
         in them that are left to schedule. If a group name occurs in this
         mapping, that group is considered active.
+
+    .. attribute:: uses_of_boostability
+
+        Used to produce warnings about deprecated 'boosting' behavior.
+        Should be removed along with boostability in 2017.x.
     """
 
     @property
@@ -624,6 +629,7 @@ def generate_loop_schedules_internal(
         # If insn is boostable, it may be placed inside a more deeply
         # nested loop without harm.
 
+        orig_have = have
         if allow_boost:
             # Note that the inames in 'insn.boostable_into' necessarily won't
             # be contained in 'want'.
@@ -685,12 +691,21 @@ def generate_loop_schedules_internal(
 
             # }}}
 
+            new_uses_of_boostability = []
+            if allow_boost:
+                if orig_have & insn.boostable_into:
+                    new_uses_of_boostability.append(
+                            (insn.id, orig_have & insn.boostable_into))
+
             new_sched_state = sched_state.copy(
                     scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set,
                     unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set,
                     schedule=(
                         sched_state.schedule + (RunInstruction(insn_id=insn.id),)),
                     active_group_counts=new_active_group_counts,
+                    uses_of_boostability=(
+                        sched_state.uses_of_boostability
+                        + new_uses_of_boostability)
                     )
 
             # Don't be eager about entering/leaving loops--if progress has been
@@ -875,6 +890,10 @@ def generate_loop_schedules_internal(
                 writer_insn, = kernel.writer_map()[domain_par]
                 if writer_insn not in sched_state.scheduled_insn_ids:
                     data_dep_written = False
+                    if debug_mode:
+                        print("iname '%s' not scheduled because domain "
+                                "parameter '%s' is not yet available"
+                                % (iname, domain_par))
                     break
 
             if not data_dep_written:
@@ -1000,6 +1019,15 @@ def generate_loop_schedules_internal(
         # if done, yield result
         debug.log_success(sched_state.schedule)
 
+        for boost_insn_id, boost_inames in sched_state.uses_of_boostability:
+            warn_with_kernel(
+                    kernel, "used_boostability",
+                    "instruction '%s' was implicitly nested inside "
+                    "inames '%s' based on an idempotence heuristic. "
+                    "This is deprecated and will stop working in loopy 2017.x."
+                    % (boost_insn_id, ", ".join(boost_inames)),
+                    DeprecationWarning)
+
         yield sched_state.schedule
 
     else:
@@ -1053,7 +1081,7 @@ class DependencyRecord(Record):
 
 
 def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
-    """If there exists a depdency between target and source and the two access
+    """If there exists a dependency between target and source and the two access
     a common variable of *var_kind* in a way that requires a barrier (essentially,
     at least one write), then the function will return a tuple
     ``(target, source, var_name)``. Otherwise, it will return *None*.
@@ -1437,7 +1465,9 @@ def generate_loop_schedules(kernel, debug_args={}):
             parallel_inames=parallel_inames - ilp_inames - vec_inames,
 
             group_insn_counts=group_insn_counts(kernel),
-            active_group_counts={})
+            active_group_counts={},
+
+            uses_of_boostability=[])
 
     generators = [
             generate_loop_schedules_internal(sched_state,
@@ -1546,7 +1576,7 @@ def get_one_scheduled_kernel(kernel):
         try:
             result, ambiguous = schedule_cache[sched_cache_key]
 
-            logger.info("%s: schedule cache hit" % kernel.name)
+            logger.debug("%s: schedule cache hit" % kernel.name)
             from_cache = True
         except KeyError:
             pass
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index c3a7d9b0427b660254c8d8210acaee85909a6546..ca782a3d8ca85ea6250f7c9317ca0947db28d5e8 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -95,11 +95,22 @@ def get_common_hw_inames(kernel, insn_ids):
     # Get the list of hardware inames in which the temporary is defined.
     if len(insn_ids) == 0:
         return set()
-    id_to_insn = kernel.id_to_insn
-    from six.moves import reduce
-    return reduce(
-        set.intersection,
-        (get_hw_inames(kernel, id_to_insn[id]) for id in insn_ids))
+    return set.intersection(
+        *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids))
+
+
+def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel):
+    from loopy.kernel.data import HardwareParallelTag
+    new_schedule = []
+
+    for item in kernel.schedule:
+        if isinstance(item, (EnterLoop, LeaveLoop)):
+            tag = kernel.iname_to_tag.get(item.iname)
+            if isinstance(tag, HardwareParallelTag):
+                continue
+        new_schedule.append(item)
+
+    return kernel.copy(schedule=new_schedule)
 
 # }}}
 
@@ -268,7 +279,7 @@ def compute_live_temporaries(kernel, schedule):
                 live_in[idx] = live_out[idx] = live_in[idx + 1]
                 idx -= 1
             else:
-                raise LoopyError("unexepcted type of schedule item: %s"
+                raise LoopyError("unexpected type of schedule item: %s"
                         % type(sched_item).__name__)
 
     # }}}
@@ -333,6 +344,10 @@ class PromotedTemporary(Record):
 
 def determine_temporaries_to_promote(kernel, temporaries, name_gen):
     """
+    For each temporary in the passed list of temporaries, construct a
+    :class:`PromotedTemporary` which describes how the temporary should
+    get promoted into global storage.
+
     :returns: A :class:`dict` mapping temporary names from `temporaries` to
               :class:`PromotedTemporary` objects
     """
@@ -351,6 +366,18 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen):
         assert temporary.base_storage is None, \
             "Cannot promote temporaries with base_storage to global"
 
+        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
+        # is associated with. This is used for determining the shape of the
+        # global storage needed for saving and restoring the temporary across
+        # kernel calls.
+        #
+        # TODO: Make a policy decision about which dimensions to use. Currently,
+        # the code looks at each instruction that defines or uses the temporary,
+        # and takes the common set of hw-parallel tagged inames associated with
+        # these instructions.
+        #
+        # Furthermore, in the case of local temporaries, inames that are tagged
+        # hw-local do not contribute to the global storage shape.
         hw_inames = get_common_hw_inames(kernel,
             def_lists[temporary.name] + use_lists[temporary.name])
 
@@ -358,6 +385,8 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen):
         hw_inames = sorted(hw_inames,
             key=lambda iname: str(kernel.iname_to_tag[iname]))
 
+        # Calculate the sizes of the dimensions that get added in front for
+        # the global storage of the temporary.
         shape_prefix = []
 
         backing_hw_inames = []
@@ -411,11 +440,15 @@ def augment_domain_for_temporary_promotion(
         new_iname = name_gen("{name}_{mode}_dim_{dim}".
             format(name=orig_temporary.name,
                    mode=mode,
-                   dim=orig_dim + t_idx))
+                   dim=t_idx))
         domain = domain.set_dim_name(
             isl.dim_type.set, orig_dim + t_idx, new_iname)
-        #from loopy.kernel.data import auto
-        #iname_to_tag[new_iname] = auto
+        if orig_temporary.is_local:
+            # If the temporary has local scope, then loads / stores can be
+            # done in parallel.
+            from loopy.kernel.data import AutoFitLocalIndexTag
+            iname_to_tag[new_iname] = AutoFitLocalIndexTag()
+
         dim_inames.append(new_iname)
 
         # Add size information.
@@ -423,7 +456,7 @@ def augment_domain_for_temporary_promotion(
         domain &= aff[0].le_set(aff[new_iname])
         size = orig_temporary.shape[t_idx]
         from loopy.symbolic import aff_from_expr
-        domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size))
+        domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size))
 
     hw_inames = []
 
@@ -538,7 +571,7 @@ def restore_and_save_temporaries(kernel):
             for tval in tvals:
                 from loopy.kernel.tools import DomainChanger
                 tval_hw_inames = new_temporaries[tval].hw_inames
-                dchg = DomainChanger(kernel,
+                dchg = DomainChanger(new_kernel,
                     frozenset(sched_item.extra_inames + tval_hw_inames))
                 domain = dchg.domain
 
@@ -572,7 +605,9 @@ def restore_and_save_temporaries(kernel):
                     args = reversed(args)
 
                 from loopy.kernel.data import Assignment
-                new_insn = Assignment(*args, id=insn_id)
+                new_insn = Assignment(*args, id=insn_id,
+                    within_inames=frozenset(hw_inames + dim_inames),
+                    within_inames_is_final=True)
 
                 new_instructions.append(new_insn)
 
@@ -620,17 +655,25 @@ def restore_and_save_temporaries(kernel):
     # }}}
 
     new_iname_to_tag.update(kernel.iname_to_tag)
-    new_temporary_variables = dict(
+    updated_temporary_variables = dict(
         (t.name, t.as_variable()) for t in new_temporaries.values())
-    new_temporary_variables.update(kernel.temporary_variables)
+    updated_temporary_variables.update(kernel.temporary_variables)
 
     kernel = kernel.copy(
         iname_to_tag=new_iname_to_tag,
-        temporary_variables=new_temporary_variables,
+        temporary_variables=updated_temporary_variables,
         instructions=kernel.instructions + new_instructions,
         schedule=new_schedule
         )
 
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
+    # Once assign_automatic_axes() does its job, loops in the schedule
+    # for newly hardware-tagged inames are no longer necessary (and in
+    # fact illegal), so remove them.
+    kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel)
+
     return kernel
 
 
@@ -722,7 +765,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
                     current_chunk.append(sched_item)
                 i += 1
             else:
-                raise LoopyError("unexepcted type of schedule item: %s"
+                raise LoopyError("unexpected type of schedule item: %s"
                         % type(sched_item).__name__)
 
         if current_chunk and schedule_required_splitting:
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 932acab50b29163345585b10edca0e020f8183b8..6fa4614b9711ce30c44333276970492979554c42 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -308,7 +308,8 @@ class GlobalSubscriptCounter(CombineMapper):
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
 
-            coeffs = CoefficientCollector()(idx)
+            from loopy.symbolic import simplify_using_aff
+            coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx))
             # check if he contains the lid 0 guy
             try:
                 coeff_id0 = coeffs[Variable(local_id0)]
@@ -890,7 +891,10 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
-    kernel = preprocess_kernel(kernel)
+
+    from loopy.kernel import kernel_state
+    if kernel.state < kernel_state.PREPROCESSED:
+        kernel = preprocess_kernel(kernel)
 
     write_footprints = []
     read_footprints = []
@@ -938,6 +942,13 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         data-dependent or nonlinear indices)
     """
 
+    from loopy.preprocess import preprocess_kernel, infer_unknown_types
+    kernel = infer_unknown_types(kernel, expect_completion=True)
+
+    from loopy.kernel import kernel_state
+    if kernel.state < kernel_state.PREPROCESSED:
+        kernel = preprocess_kernel(kernel)
+
     result = {}
     fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 9a753a6630590f05bd4377c1e02863eebb0bf834..1eb7cbed68fc84fdcd5b6a933930373ae6bfb471 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1,8 +1,6 @@
 """Pymbolic mappers for loopy."""
 
 from __future__ import division, absolute_import
-import six
-from six.moves import range, zip, reduce
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -27,13 +25,14 @@ THE SOFTWARE.
 """
 
 
-from six.moves import intern
+import six
+from six.moves import range, zip, reduce, intern
 
 from pytools import memoize, memoize_method, Record
 import pytools.lex
 
 from pymbolic.primitives import (
-        Leaf, AlgebraicLeaf, Expression, Variable, CommonSubexpression)
+        Leaf, Expression, Variable, CommonSubexpression)
 
 from pymbolic.mapper import (
         CombineMapper as CombineMapperBase,
@@ -102,6 +101,8 @@ class IdentityMapperMixin(object):
 
     map_linear_subscript = IdentityMapperBase.map_subscript
 
+    map_rule_argument = map_group_hw_index
+
 
 class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
@@ -132,6 +133,8 @@ class WalkMapper(WalkMapperBase):
 
     map_linear_subscript = WalkMapperBase.map_subscript
 
+    map_rule_argument = map_group_hw_index
+
 
 class CallbackMapper(CallbackMapperBase, IdentityMapper):
     map_reduction = CallbackMapperBase.map_constant
@@ -181,6 +184,9 @@ class StringifyMapper(StringifyMapperBase):
                 type(expr).__name__,
                 ", ".join(str(a) for a in expr.__getinitargs__()))
 
+    def map_rule_argument(self, expr, enclosing_prec):
+        return "<arg%d>" % expr.index
+
 
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
@@ -357,7 +363,7 @@ class TaggedVariable(Variable):
     mapper_method = intern("map_tagged_variable")
 
 
-class Reduction(AlgebraicLeaf):
+class Reduction(Expression):
     """Represents a reduction operation on :attr:`expr`
     across :attr:`inames`.
 
@@ -437,7 +443,7 @@ class Reduction(AlgebraicLeaf):
     mapper_method = intern("map_reduction")
 
 
-class LinearSubscript(AlgebraicLeaf):
+class LinearSubscript(Expression):
     """Represents a linear index into a multi-dimensional array, completely
     ignoring any multi-dimensional layout.
     """
@@ -456,6 +462,26 @@ class LinearSubscript(AlgebraicLeaf):
 
     mapper_method = intern("map_linear_subscript")
 
+
+class RuleArgument(Expression):
+    """Represents a (numbered) argument of a :class:`loopy.SubstitutionRule`.
+    Only used internally in the rule-aware mappers to match subst rules
+    independently of argument names.
+    """
+
+    init_arg_names = ("index",)
+
+    def __init__(self, index):
+        self.index = index
+
+    def __getinitargs__(self):
+        return (self.index,)
+
+    def stringifier(self):
+        return StringifyMapper
+
+    mapper_method = intern("map_rule_argument")
+
 # }}}
 
 
@@ -544,32 +570,47 @@ def rename_subst_rules_in_instructions(insns, renames):
 
 
 class SubstitutionRuleMappingContext(object):
+    def _get_subst_rule_key(self, args, body):
+        subst_dict = dict(
+                (arg, RuleArgument(i))
+                for i, arg in enumerate(args))
+
+        from pymbolic.mapper.substitutor import make_subst_func
+        arg_subst_map = SubstitutionMapper(make_subst_func(subst_dict))
+
+        return arg_subst_map(body)
+
     def __init__(self, old_subst_rules, make_unique_var_name):
         self.old_subst_rules = old_subst_rules
         self.make_unique_var_name = make_unique_var_name
 
         # maps subst rule (args, bodies) to (names, original_name)
         self.subst_rule_registry = dict(
-                ((rule.arguments, rule.expression), (name, name))
+                (self._get_subst_rule_key(rule.arguments, rule.expression),
+                    (name, rule.arguments, rule.expression))
                 for name, rule in six.iteritems(old_subst_rules))
 
-        # maps subst rule (args, bodies) to use counts
-        self.subst_rule_use_count = {}
+        # maps subst rule (args, bodies) to a list of old names,
+        # which doubles as (a) a histogram of uses and (b) a way
+        # to deterministically find the 'lexicographically earliest'
+        # name
+        self.subst_rule_old_names = {}
 
     def register_subst_rule(self, original_name, args, body):
         """Returns a name (as a string) for a newly created substitution
         rule.
         """
-        key = (args, body)
+        key = self._get_subst_rule_key(args, body)
         reg_value = self.subst_rule_registry.get(key)
 
         if reg_value is None:
-            new_name = self.make_unique_var_name(original_name)
-            self.subst_rule_registry[key] = (new_name, original_name)
+            # These names are temporary and won't stick around.
+            new_name = self.make_unique_var_name("_lpy_tmp_"+original_name)
+            self.subst_rule_registry[key] = (new_name, args, body)
         else:
-            new_name, _ = reg_value
+            new_name, _, _ = reg_value
 
-        self.subst_rule_use_count[key] = self.subst_rule_use_count.get(key, 0) + 1
+        self.subst_rule_old_names.setdefault(key, []).append(original_name)
         return new_name
 
     def _get_new_substitutions_and_renames(self):
@@ -591,27 +632,33 @@ class SubstitutionRuleMappingContext(object):
 
         from loopy.kernel.data import SubstitutionRule
 
-        orig_name_histogram = {}
-        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
-            if self.subst_rule_use_count.get(key, 0):
-                orig_name_histogram[orig_name] = \
-                        orig_name_histogram.get(orig_name, 0) + 1
-
         result = {}
         renames = {}
 
-        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
-            args, body = key
+        used_names = set()
+
+        for key, (name, args, body) in six.iteritems(
+                self.subst_rule_registry):
+            orig_names = self.subst_rule_old_names.get(key, [])
+
+            # If no orig_names are found, then this particular
+            # subst rule was never referenced, and so it's fine
+            # to leave out.
+
+            if not orig_names:
+                continue
+
+            new_name = min(orig_names)
+            if new_name in used_names:
+                new_name = self.make_unique_var_name(new_name)
 
-            if self.subst_rule_use_count.get(key, 0):
-                if orig_name_histogram[orig_name] == 1 and name != orig_name:
-                    renames[name] = orig_name
-                    name = orig_name
+            renames[name] = new_name
+            used_names.add(new_name)
 
-                result[name] = SubstitutionRule(
-                        name=name,
-                        arguments=args,
-                        expression=body)
+            result[new_name] = SubstitutionRule(
+                    name=new_name,
+                    arguments=args,
+                    expression=body)
 
         # {{{ perform renames on new substitutions
 
@@ -1005,16 +1052,11 @@ class ArrayAccessFinder(CombineMapper):
 # }}}
 
 
-# {{{ aff <-> expr conversion
+# {{{ (pw)aff to expr conversion
 
-def aff_to_expr(aff, except_name=None, error_on_name=None):
-    if except_name is not None and error_on_name is not None:
-        raise ValueError("except_name and error_on_name may not be specified "
-                "at the same time")
+def aff_to_expr(aff):
     from pymbolic import var
 
-    except_coeff = 0
-
     denom = aff.get_denominator_val().to_python()
 
     result = (aff.get_constant_val()*denom).to_python()
@@ -1023,29 +1065,14 @@ def aff_to_expr(aff, except_name=None, error_on_name=None):
             coeff = (aff.get_coefficient_val(dt, i)*denom).to_python()
             if coeff:
                 dim_name = aff.get_dim_name(dt, i)
-                if dim_name == except_name:
-                    except_coeff += coeff
-                elif dim_name == error_on_name:
-                    raise RuntimeError("'%s' occurred in this subexpression--"
-                            "this is not allowed" % dim_name)
-                else:
-                    result += coeff*var(dim_name)
-
-    error_on_name = error_on_name or except_name
+                result += coeff*var(dim_name)
 
     for i in range(aff.dim(dim_type.div)):
         coeff = (aff.get_coefficient_val(dim_type.div, i)*denom).to_python()
         if coeff:
-            result += coeff*aff_to_expr(aff.get_div(i), error_on_name=error_on_name)
+            result += coeff*aff_to_expr(aff.get_div(i))
 
-    if except_name is not None:
-        if except_coeff % denom != 0:
-            raise RuntimeError("coefficient of '%s' is not divisible by "
-                    "aff denominator" % except_name)
-
-        return result // denom, except_coeff // denom
-    else:
-        return result // denom
+    return result // denom
 
 
 def pw_aff_to_expr(pw_aff, int_ok=False):
@@ -1064,21 +1091,81 @@ def pw_aff_to_expr(pw_aff, int_ok=False):
     (set, aff), = pieces
     return aff_to_expr(aff)
 
+# }}}
+
+
+# {{{ (pw)aff_from_expr
+
+class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
+    def __init__(self, space, vars_to_zero):
+        self.zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
+
+        context = {}
+        for name, (dt, pos) in six.iteritems(space.get_var_dict()):
+            if dt == dim_type.set:
+                dt = dim_type.in_
+
+            context[name] = isl.PwAff.from_aff(
+                    self.zero.set_coefficient_val(dt, pos, 1))
+
+        for v in vars_to_zero:
+            context[v] = self.zero
+
+        self.pw_zero = isl.PwAff.from_aff(self.zero)
 
-def aff_from_expr(space, expr, vars_to_zero=set()):
-    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
-    context = {}
-    for name, (dt, pos) in six.iteritems(space.get_var_dict()):
-        if dt == dim_type.set:
-            dt = dim_type.in_
+        super(PwAffEvaluationMapper, self).__init__(context)
 
-        context[name] = zero.set_coefficient_val(dt, pos, 1)
+    def map_constant(self, expr):
+        return self.pw_zero + expr
+
+    def map_min(self, expr):
+        from functools import reduce
+        return reduce(
+                lambda a, b: a.min(b),
+                (self.rec(ch) for ch in expr.children))
+
+    def map_max(self, expr):
+        from functools import reduce
+        return reduce(
+                lambda a, b: a.max(b),
+                (self.rec(ch) for ch in expr.children))
+
+    def map_quotient(self, expr):
+        raise TypeError("true division in '%s' not supported "
+                "for as-pwaff evaluation" % expr)
+
+    def map_floor_div(self, expr):
+        num = self.rec(expr.numerator)
+        denom = self.rec(expr.denominator)
+        return num.div(denom).floor()
+
+    def map_remainder(self, expr):
+        num = self.rec(expr.numerator)
+        denom = self.rec(expr.denominator)
+        if not denom.is_cst():
+            raise TypeError("modulo non-constant in '%s' not supported "
+                    "for as-pwaff evaluation" % expr)
+
+        (s, denom_aff), = denom.get_pieces()
+        denom = denom_aff.get_constant_val()
+
+        return num.mod_val(denom)
+
+
+def aff_from_expr(space, expr, vars_to_zero=frozenset()):
+    pwaff = pwaff_from_expr(space, expr, vars_to_zero).coalesce()
+
+    pieces = pwaff.get_pieces()
+    if len(pieces) == 1:
+        (s, aff), = pieces
+        return aff
+    else:
+        raise RuntimeError("expression '%s' could not be converted to a "
+                "non-piecewise quasi-affine expression" % expr)
 
-    for name in vars_to_zero:
-        context[name] = zero
 
-    from pymbolic import evaluate
-    return zero + evaluate(expr, context)
+def pwaff_from_expr(space, expr, vars_to_zero=frozenset()):
+    return PwAffEvaluationMapper(space, vars_to_zero)(expr)
 
 # }}}
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 88e656a1e3a4bfeb25a250dbcb3a05d1f805bac8..c1b9f2302c67d56fd16f72170d70c7a869e6aae9 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -124,6 +124,20 @@ class TargetBase(object):
 
     # }}}
 
+    def get_kernel_executor_cache_key(self, *args, **kwargs):
+        """
+        :returns: an immutable type to be used as the cache key for
+            kernel executor caching.
+        """
+        raise NotImplementedError()
+
+    def get_kernel_executor(self, kernel, *args, **kwargs):
+        """
+        :returns: an object that (at least) supports calls to execute the
+            given kernel with the supplied invocation arguments.
+        """
+        raise NotImplementedError()
+
 
 class ASTBuilderBase(object):
     """An interface for generating (host or device) ASTs.
@@ -186,16 +200,12 @@ class ASTBuilderBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
-    def emit_assignment(self, codegen_state, lhs, rhs):
+    def emit_assignment(self, codegen_state, insn):
         raise NotImplementedError()
 
     def emit_multiple_assignment(self, codegen_state, insn):
         raise NotImplementedError()
 
-    def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
-            lhs_expr, rhs_expr, lhs_dtype):
-        raise NotImplementedError("atomic update in target %s" % type(self).__name__)
-
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             static_lbound, static_ubound, inner):
         raise NotImplementedError()
@@ -254,16 +264,12 @@ class DummyHostASTBuilder(ASTBuilderBase):
     def ast_block_class(self):
         return _DummyASTBlock
 
-    def emit_assignment(self, codegen_state, lhs, rhs):
+    def emit_assignment(self, codegen_state, insn):
         return None
 
     def emit_multiple_assignment(self, codegen_state, insn):
         return None
 
-    def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
-            lhs_expr, rhs_expr, lhs_dtype):
-        return None
-
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             static_lbound, static_ubound, inner):
         return None
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 7580038f45e7b098dfddbb8031584ae4be398c44..6e1893b1a866532f1f97722a72d648c893a4e34a 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -495,9 +495,57 @@ class CASTBuilder(ASTBuilderBase):
 
         return arg_decl
 
-    def emit_assignment(self, codegen_state, lhs, rhs):
-        from cgen import Assign
-        return Assign(lhs, rhs)
+    def emit_assignment(self, codegen_state, insn):
+        kernel = codegen_state.kernel
+        ecm = codegen_state.expression_to_code_mapper
+
+        assignee_var_name, = insn.assignee_var_names()
+
+        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
+        lhs_dtype = lhs_var.dtype
+
+        if insn.atomicity is not None:
+            lhs_atomicity = [
+                    a for a in insn.atomicity if a.var_name == assignee_var_name]
+            assert len(lhs_atomicity) <= 1
+            if lhs_atomicity:
+                lhs_atomicity, = lhs_atomicity
+            else:
+                lhs_atomicity = None
+        else:
+            lhs_atomicity = None
+
+        from loopy.kernel.data import AtomicInit, AtomicUpdate
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
+        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
+        if lhs_atomicity is None:
+            from cgen import Assign
+            return Assign(
+                    lhs_code,
+                    ecm(insn.expression, prec=PREC_NONE,
+                        type_context=rhs_type_context,
+                        needed_dtype=lhs_dtype))
+
+        elif isinstance(lhs_atomicity, AtomicInit):
+            raise NotImplementedError("atomic init")
+
+        elif isinstance(lhs_atomicity, AtomicUpdate):
+            codegen_state.seen_atomic_dtypes.add(lhs_dtype)
+            return codegen_state.ast_builder.emit_atomic_update(
+                    codegen_state, lhs_atomicity, lhs_var,
+                    insn.assignee, insn.expression,
+                    lhs_dtype, rhs_type_context)
+
+        else:
+            raise ValueError("unexpected lhs atomicity type: %s"
+                    % type(lhs_atomicity).__name__)
+
+    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
+            lhs_expr, rhs_expr, lhs_dtype):
+        raise NotImplementedError("atomic updates in %s" % type(self).__name__)
 
     def emit_multiple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 51b5f5343b90ac644d9f1d222637bf428d833d6e..0b20c2d08416c931fe88535cbda47c78723c57ae 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -194,10 +194,26 @@ class ExpressionToCMapper(RecursiveMapper):
         from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable
 
         if isinstance(ary, ImageArg):
-            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
-                    % (ary.name, ary.dimensions,
-                        ", ".join(self.rec(idx, PREC_NONE, 'i')
-                            for idx in expr.index[::-1])))
+            extra_axes = 0
+
+            num_target_axes = ary.num_target_axes()
+            if num_target_axes in [1, 2]:
+                idx_vec_type = "float2"
+                extra_axes = 2-num_target_axes
+            elif num_target_axes == 3:
+                idx_vec_type = "float4"
+                extra_axes = 4-num_target_axes
+            else:
+                raise LoopyError("unsupported number (%d) of target axes in image"
+                        % num_target_axes)
+
+            idx_tuple = expr.index_tuple[::-1] + (0,) * extra_axes
+
+            idx_str = ", ".join(self.rec(idx, PREC_NONE, 'i')
+                    for idx in idx_tuple)
+
+            base_access = ("read_imagef(%s, loopy_sampler, (%s) (%s))"
+                    % (ary.name, idx_vec_type, idx_str))
 
             if ary.dtype.numpy_dtype == np.float32:
                 return base_access+".x"
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index acfb176ba0a619cf9ced560baad83f256a9971d1..a3f0fdc63e053e8c73a0e8e61fcf949e72b02a0d 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -327,7 +327,7 @@ class CUDACASTBuilder(CASTBuilder):
 
         return arg_decl
 
-    def get_image_arg_decl(self, name, shape, dtype, is_written):
+    def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError("not yet: texture arguments in CUDA")
 
     def get_constant_arg_decl(self, name, shape, dtype, is_written):
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9..8372abb0373bba7995fc79512df5f4b31631b032 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -356,6 +356,117 @@ class ISPCASTBuilder(CASTBuilder):
         from cgen.ispc import ISPCUniform
         return ISPCUniform(result)
 
+    def emit_assignment(self, codegen_state, insn):
+        kernel = codegen_state.kernel
+        ecm = codegen_state.expression_to_code_mapper
+
+        assignee_var_name, = insn.assignee_var_names()
+
+        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
+        lhs_dtype = lhs_var.dtype
+
+        if insn.atomicity:
+            raise NotImplementedError("atomic ops in ISPC")
+
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
+        rhs_code = ecm(insn.expression, prec=PREC_NONE,
+                    type_context=rhs_type_context,
+                    needed_dtype=lhs_dtype)
+
+        lhs = insn.assignee
+
+        # {{{ handle streaming stores
+
+        if "!streaming_store" in insn.tags:
+            ary = ecm.find_array(lhs)
+
+            from loopy.kernel.array import get_access_info
+            from pymbolic import evaluate
+
+            from loopy.symbolic import simplify_using_aff
+            index_tuple = tuple(
+                    simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
+
+            access_info = get_access_info(kernel.target, ary, index_tuple,
+                    lambda expr: evaluate(expr, codegen_state.var_subst_map),
+                    codegen_state.vectorization_info)
+
+            from loopy.kernel.data import GlobalArg, TemporaryVariable
+
+            if not isinstance(ary, (GlobalArg, TemporaryVariable)):
+                raise LoopyError("array type not supported in ISPC: %s"
+                        % type(ary).__name__)
+
+            if len(access_info.subscripts) != 1:
+                raise LoopyError("streaming stores must have a subscript")
+            subscript, = access_info.subscripts
+
+            from pymbolic.primitives import Sum, flattened_sum, Variable
+            if isinstance(subscript, Sum):
+                terms = subscript.children
+            else:
+                terms = (subscript,)
+
+            new_terms = []
+
+            from loopy.kernel.data import LocalIndexTag
+            from loopy.symbolic import get_dependencies
+
+            saw_l0 = False
+            for term in terms:
+                if (isinstance(term, Variable)
+                        and isinstance(
+                            kernel.iname_to_tag.get(term.name), LocalIndexTag)
+                        and kernel.iname_to_tag.get(term.name).axis == 0):
+                    if saw_l0:
+                        raise LoopyError("streaming store must have stride 1 "
+                                "in local index, got: %s" % subscript)
+                    saw_l0 = True
+                    continue
+                else:
+                    for dep in get_dependencies(term):
+                        if (
+                                isinstance(
+                                    kernel.iname_to_tag.get(dep), LocalIndexTag)
+                                and kernel.iname_to_tag.get(dep).axis == 0):
+                            raise LoopyError("streaming store must have stride 1 "
+                                    "in local index, got: %s" % subscript)
+
+                    new_terms.append(term)
+
+            if not saw_l0:
+                raise LoopyError("streaming store must have stride 1 in "
+                        "local index, got: %s" % subscript)
+
+            if access_info.vector_index is not None:
+                raise LoopyError("streaming store may not use a short-vector "
+                        "data type")
+
+            rhs_has_programindex = any(
+                    isinstance(
+                        kernel.iname_to_tag.get(dep), LocalIndexTag)
+                    and kernel.iname_to_tag.get(dep).axis == 0
+                    for dep in get_dependencies(insn.expression))
+
+            if not rhs_has_programindex:
+                rhs_code = "broadcast(%s, 0)" % rhs_code
+
+            from cgen import Statement
+            return Statement(
+                    "streaming_store(%s + %s, %s)"
+                    % (
+                        access_info.array_name,
+                        ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
+                        rhs_code))
+
+        # }}}
+
+        from cgen import Assign
+        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
+
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             static_lbound, static_ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 8e32c7fa187c384de261ad9f6444c7bd6f3b4171..104f5c654b9ac662e50c642a0135058ed38ef347 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -311,7 +311,7 @@ class OpenCLTarget(CTarget):
     """A target for the OpenCL C heterogeneous compute programming language.
     """
 
-    def __init__(self, atomics_flavor="cl1"):
+    def __init__(self, atomics_flavor=None):
         """
         :arg atomics_flavor: one of ``"cl1"`` (C11-style atomics from OpenCL 2.0),
             ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap
@@ -320,6 +320,9 @@ class OpenCLTarget(CTarget):
         """
         super(OpenCLTarget, self).__init__()
 
+        if atomics_flavor is None:
+            atomics_flavor = "cl1"
+
         if atomics_flavor not in ["cl1", "cl2"]:
             raise ValueError("unsupported atomics flavor: %s" % atomics_flavor)
 
@@ -495,7 +498,7 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     # {{{ code generation for atomic update
 
-    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
+    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
         from pymbolic.mapper.stringifier import PREC_NONE
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index b570cf4ab7e4ed543213b73716b6271e61e6b69b..bdd5773b3f68d631343871adee423144086ca71a 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -275,11 +275,13 @@ class PyOpenCLTarget(OpenCLTarget):
     host_program_name_prefix = "_lpy_host_"
     host_program_name_suffix = ""
 
-    def __init__(self, device=None, pyopencl_module_name="_lpy_cl"):
+    def __init__(self, device=None, pyopencl_module_name="_lpy_cl",
+            atomics_flavor=None):
         # This ensures the dtype registry is populated.
         import pyopencl.tools  # noqa
 
-        super(PyOpenCLTarget, self).__init__()
+        super(PyOpenCLTarget, self).__init__(
+                atomics_flavor=atomics_flavor)
 
         self.device = device
         self.pyopencl_module_name = pyopencl_module_name
@@ -375,6 +377,13 @@ class PyOpenCLTarget(OpenCLTarget):
 
     # }}}
 
+    def get_kernel_executor_cache_key(self, queue, **kwargs):
+        return queue.context
+
+    def get_kernel_executor(self, kernel, queue, **kwargs):
+        from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor
+        return PyOpenCLKernelExecutor(queue.context, kernel)
+
 # }}}
 
 
@@ -615,7 +624,9 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         from genpy import Assign, Comment, Line
 
         def alloc_nbytes(tv):
-            return tv.dtype.numpy_dtype.itemsize
+            from six.moves import reduce
+            from operator import mul
+            return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1)
 
         from loopy.kernel.data import temp_var_scope
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
new file mode 100644
index 0000000000000000000000000000000000000000..540cad00036de046484357826781353a927d7497
--- /dev/null
+++ b/loopy/target/pyopencl_execution.py
@@ -0,0 +1,774 @@
+from __future__ import division, with_statement, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+from six.moves import range, zip
+
+from pytools import Record, memoize_method
+from loopy.diagnostic import ParameterFinderWarning
+from pytools.py_codegen import (
+        Indentation, PythonFunctionGenerator)
+from loopy.diagnostic import LoopyError
+from loopy.types import NumpyType
+from loopy.execution import KernelExecutorBase
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+# {{{ invoker generation
+
+# /!\ This code runs in a namespace controlled by the user.
+# Prefix all auxiliary variables with "_lpy".
+
+
+def python_dtype_str(dtype):
+    import pyopencl.tools as cl_tools
+    if dtype.isbuiltin:
+        return "_lpy_np."+dtype.name
+    else:
+        return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
+                % cl_tools.dtype_to_ctype(dtype))
+
+
+# {{{ integer arg finding from shapes
+
+def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info):
+    # a mapping from integer argument names to a list of tuples
+    # (arg_name, expression), where expression is a
+    # unary function of kernel.arg_dict[arg_name]
+    # returning the desired integer argument.
+    iarg_to_sources = {}
+
+    from loopy.kernel.data import GlobalArg
+    from loopy.symbolic import DependencyMapper, StringifyMapper
+    dep_map = DependencyMapper()
+
+    from pymbolic import var
+    for arg in implemented_data_info:
+        if arg.arg_class is GlobalArg:
+            sym_shape = var(arg.name).attr("shape")
+            for axis_nr, shape_i in enumerate(arg.shape):
+                if shape_i is None:
+                    continue
+
+                deps = dep_map(shape_i)
+
+                if len(deps) == 1:
+                    integer_arg_var, = deps
+
+                    if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
+                        from pymbolic.algorithm import solve_affine_equations_for
+                        try:
+                            # friggin' overkill :)
+                            iarg_expr = solve_affine_equations_for(
+                                    [integer_arg_var.name],
+                                    [(shape_i, sym_shape.index(axis_nr))]
+                                    )[integer_arg_var]
+                        except Exception as e:
+                            #from traceback import print_exc
+                            #print_exc()
+
+                            # went wrong? oh well
+                            from warnings import warn
+                            warn("Unable to generate code to automatically "
+                                    "find '%s' from the shape of '%s':\n%s"
+                                    % (integer_arg_var.name, arg.name, str(e)),
+                                    ParameterFinderWarning)
+                        else:
+                            iarg_to_sources.setdefault(integer_arg_var.name, []) \
+                                    .append((arg.name, iarg_expr))
+
+    gen("# {{{ find integer arguments from shapes")
+    gen("")
+
+    for iarg_name, sources in six.iteritems(iarg_to_sources):
+        gen("if %s is None:" % iarg_name)
+        with Indentation(gen):
+            if_stmt = "if"
+            for arg_name, value_expr in sources:
+                gen("%s %s is not None:" % (if_stmt, arg_name))
+                with Indentation(gen):
+                    gen("%s = %s"
+                            % (iarg_name, StringifyMapper()(value_expr)))
+
+                if_stmt = "elif"
+
+        gen("")
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+# {{{ integer arg finding from offsets
+
+def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info):
+    options = kernel.options
+
+    gen("# {{{ find integer arguments from offsets")
+    gen("")
+
+    for arg in implemented_data_info:
+        impl_array_name = arg.offset_for_name
+        if impl_array_name is not None:
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                gen("if %s is None:" % impl_array_name)
+                with Indentation(gen):
+                    gen("# Output variable, we'll be allocating "
+                            "it, with zero offset.")
+                    gen("%s = 0" % arg.name)
+                gen("else:")
+                with Indentation(gen):
+                    if not options.no_numpy:
+                        gen("_lpy_offset = getattr(%s, \"offset\", 0)"
+                                % impl_array_name)
+                    else:
+                        gen("_lpy_offset = %s.offset" % impl_array_name)
+
+                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                    if not options.skip_arg_checks:
+                        gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
+                                % (arg.name, base_arg.dtype.itemsize))
+
+                        gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
+                                "not divisible by its dtype itemsize\""
+                                % impl_array_name)
+                        gen("del _lpy_remdr")
+                    else:
+                        gen("%s = _lpy_offset // %d"
+                                % (arg.name, base_arg.dtype.itemsize))
+
+                    if not options.skip_arg_checks:
+                        gen("del _lpy_offset")
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+# {{{ integer arg finding from strides
+
+def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info):
+    options = kernel.options
+
+    gen("# {{{ find integer arguments from strides")
+    gen("")
+
+    for arg in implemented_data_info:
+        if arg.stride_for_name_and_axis is not None:
+            impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
+
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                if not options.skip_arg_checks:
+                    gen("if %s is None:" % impl_array_name)
+                    with Indentation(gen):
+                        gen("raise RuntimeError(\"required stride '%s' for "
+                                "argument '%s' not given or deducible from "
+                                "passed array\")"
+                                % (arg.name, impl_array_name))
+
+                base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                if not options.skip_arg_checks:
+                    gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
+                            % (arg.name, impl_array_name, stride_impl_axis,
+                                base_arg.dtype.dtype.itemsize))
+
+                    gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is "
+                            "not divisible by its dtype itemsize\""
+                            % (stride_impl_axis, impl_array_name))
+                    gen("del _lpy_remdr")
+                else:
+                    gen("%s = %s.strides[%d] // %d" % (arg.name,
+                            impl_array_name, stride_impl_axis, base_arg.dtype.dtype.itemsize))
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+# {{{ check that value args are present
+
+def generate_value_arg_check(gen, kernel, implemented_data_info):
+    if kernel.options.skip_arg_checks:
+        return
+
+    from loopy.kernel.data import ValueArg
+
+    gen("# {{{ check that value args are present")
+    gen("")
+
+    for arg in implemented_data_info:
+        if not issubclass(arg.arg_class, ValueArg):
+            continue
+
+        gen("if %s is None:" % arg.name)
+        with Indentation(gen):
+            gen("raise TypeError(\"value argument '%s' "
+                    "was not given and could not be automatically "
+                    "determined\")" % arg.name)
+
+    gen("# }}}")
+    gen("")
+
+# }}}
+
+
+# {{{ arg setup
+
+def generate_arg_setup(gen, kernel, implemented_data_info, options):
+    import loopy as lp
+
+    from loopy.kernel.data import KernelArgument
+    from loopy.kernel.array import ArrayBase
+    from loopy.symbolic import StringifyMapper
+    from pymbolic import var
+
+    gen("# {{{ set up array arguments")
+    gen("")
+
+    if not options.no_numpy:
+        gen("_lpy_encountered_numpy = False")
+        gen("_lpy_encountered_dev = False")
+        gen("")
+
+    args = []
+
+    strify = StringifyMapper()
+
+    expect_no_more_arguments = False
+
+    for arg_idx, arg in enumerate(implemented_data_info):
+        is_written = arg.base_name in kernel.get_written_variables()
+        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
+
+        if not issubclass(arg.arg_class, KernelArgument):
+            expect_no_more_arguments = True
+            continue
+
+        if expect_no_more_arguments:
+            raise LoopyError("Further arguments encountered after arg info "
+                    "describing a global temporary variable")
+
+        if not issubclass(arg.arg_class, ArrayBase):
+            args.append(arg.name)
+            continue
+
+        gen("# {{{ process %s" % arg.name)
+        gen("")
+
+        if not options.no_numpy:
+            gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
+            with Indentation(gen):
+                gen("# synchronous, nothing to worry about")
+                gen("%s = _lpy_cl_array.to_device("
+                        "queue, %s, allocator=allocator)"
+                        % (arg.name, arg.name))
+                gen("_lpy_encountered_numpy = True")
+            gen("elif %s is not None:" % arg.name)
+            with Indentation(gen):
+                gen("_lpy_encountered_dev = True")
+
+            gen("")
+
+        if not options.skip_arg_checks and not is_written:
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                gen("raise RuntimeError(\"input argument '%s' must "
+                        "be supplied\")" % arg.name)
+                gen("")
+
+        if (is_written
+                and arg.arg_class is lp.ImageArg
+                and not options.skip_arg_checks):
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                gen("raise RuntimeError(\"written image '%s' must "
+                        "be supplied\")" % arg.name)
+                gen("")
+
+        if is_written and arg.shape is None and not options.skip_arg_checks:
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                gen("raise RuntimeError(\"written argument '%s' has "
+                        "unknown shape and must be supplied\")" % arg.name)
+                gen("")
+
+        possibly_made_by_loopy = False
+
+        # {{{ allocate written arrays, if needed
+
+        if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                and arg.shape is not None:
+
+            if not isinstance(arg.dtype, NumpyType):
+                raise LoopyError("do not know how to pass arg of type '%s'"
+                        % arg.dtype)
+
+            possibly_made_by_loopy = True
+            gen("_lpy_made_by_loopy = False")
+            gen("")
+
+            gen("if %s is None:" % arg.name)
+            with Indentation(gen):
+                num_axes = len(arg.strides)
+                for i in range(num_axes):
+                    gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
+
+                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+                for i in range(num_axes):
+                    gen("_lpy_strides_%d = %s" % (i, strify(
+                        itemsize*arg.unvec_strides[i])))
+
+                if not options.skip_arg_checks:
+                    for i in range(num_axes):
+                        gen("assert _lpy_strides_%d > 0, "
+                                "\"'%s' has negative stride in axis %d\""
+                                % (i, arg.name, i))
+
+                sym_strides = tuple(
+                        var("_lpy_strides_%d" % i)
+                        for i in range(num_axes))
+                sym_shape = tuple(
+                        var("_lpy_shape_%d" % i)
+                        for i in range(num_axes))
+
+                alloc_size_expr = (sum(astrd*(alen-1)
+                    for alen, astrd in zip(sym_shape, sym_strides))
+                    + itemsize)
+
+                gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
+                gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
+                        "%(dtype)s, strides=%(strides)s, "
+                        "data=allocator(_lpy_alloc_size), allocator=allocator)"
+                        % dict(
+                            name=arg.name,
+                            shape=strify(sym_shape),
+                            strides=strify(sym_strides),
+                            dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))
+
+                if not options.skip_arg_checks:
+                    for i in range(num_axes):
+                        gen("del _lpy_shape_%d" % i)
+                        gen("del _lpy_strides_%d" % i)
+                    gen("del _lpy_alloc_size")
+                    gen("")
+
+                gen("_lpy_made_by_loopy = True")
+                gen("")
+
+        # }}}
+
+        # {{{ argument checking
+
+        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                and not options.skip_arg_checks:
+            if possibly_made_by_loopy:
+                gen("if not _lpy_made_by_loopy:")
+            else:
+                gen("if True:")
+
+            with Indentation(gen):
+                gen("if %s.dtype != %s:"
+                        % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
+                with Indentation(gen):
+                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
+                            "(got: %%s, expected: %s)\" %% %s.dtype)"
+                            % (arg.name, arg.dtype, arg.name))
+
+                # {{{ generate shape checking code
+
+                def strify_allowing_none(shape_axis):
+                    if shape_axis is None:
+                        return "None"
+                    else:
+                        return strify(shape_axis)
+
+                def strify_tuple(t):
+                    if len(t) == 0:
+                        return "()"
+                    else:
+                        return "(%s,)" % ", ".join(
+                                strify_allowing_none(sa)
+                                for sa in t)
+
+                shape_mismatch_msg = (
+                        "raise TypeError(\"shape mismatch on argument '%s' "
+                        "(got: %%s, expected: %%s)\" "
+                        "%% (%s.shape, %s))"
+                        % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
+
+                if kernel_arg.shape is None:
+                    pass
+
+                elif any(shape_axis is None for shape_axis in kernel_arg.shape):
+                    gen("if len(%s.shape) != %s:"
+                            % (arg.name, len(arg.unvec_shape)))
+                    with Indentation(gen):
+                        gen(shape_mismatch_msg)
+
+                    for i, shape_axis in enumerate(arg.unvec_shape):
+                        if shape_axis is None:
+                            continue
+
+                        gen("if %s.shape[%d] != %s:"
+                                % (arg.name, i, strify(shape_axis)))
+                        with Indentation(gen):
+                            gen(shape_mismatch_msg)
+
+                else:  # not None, no Nones in tuple
+                    gen("if %s.shape != %s:"
+                            % (arg.name, strify(arg.unvec_shape)))
+                    with Indentation(gen):
+                        gen(shape_mismatch_msg)
+
+                # }}}
+
+                if arg.unvec_strides and kernel_arg.dim_tags:
+                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+                    sym_strides = tuple(
+                            itemsize*s_i for s_i in arg.unvec_strides)
+                    gen("if %s.strides != %s:"
+                            % (arg.name, strify(sym_strides)))
+                    with Indentation(gen):
+                        gen("raise TypeError(\"strides mismatch on "
+                                "argument '%s' (got: %%s, expected: %%s)\" "
+                                "%% (%s.strides, %s))"
+                                % (arg.name, arg.name, strify(sym_strides)))
+
+                if not arg.allows_offset:
+                    gen("if %s.offset:" % arg.name)
+                    with Indentation(gen):
+                        gen("raise ValueError(\"Argument '%s' does not "
+                                "allow arrays with offsets. Try passing "
+                                "default_offset=loopy.auto to make_kernel()."
+                                "\")" % arg.name)
+                        gen("")
+
+        # }}}
+
+        if possibly_made_by_loopy and not options.skip_arg_checks:
+            gen("del _lpy_made_by_loopy")
+            gen("")
+
+        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
+            args.append("%s.base_data" % arg.name)
+        else:
+            args.append("%s" % arg.name)
+
+        gen("")
+
+        gen("# }}}")
+        gen("")
+
+    gen("# }}}")
+    gen("")
+
+    return args
+
+# }}}
+
+
+def generate_invoker(kernel, codegen_result):
+    options = kernel.options
+    implemented_data_info = codegen_result.implemented_data_info
+    host_code = codegen_result.host_code()
+
+    system_args = [
+            "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
+            # ignored if options.no_numpy
+            "out_host=None"
+            ]
+
+    from loopy.kernel.data import KernelArgument
+    gen = PythonFunctionGenerator(
+            "invoke_%s_loopy_kernel" % kernel.name,
+            system_args + [
+                "%s=None" % idi.name
+                for idi in implemented_data_info
+                if issubclass(idi.arg_class, KernelArgument)
+                ])
+
+    gen.add_to_preamble("from __future__ import division")
+    gen.add_to_preamble("")
+    gen.add_to_preamble("import pyopencl as _lpy_cl")
+    gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
+    gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
+    gen.add_to_preamble("import numpy as _lpy_np")
+    gen.add_to_preamble("")
+    gen.add_to_preamble(host_code)
+    gen.add_to_preamble("")
+
+    gen("if allocator is None:")
+    with Indentation(gen):
+        gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
+    gen("")
+
+    generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info)
+    generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info)
+    generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info)
+    generate_value_arg_check(gen, kernel, implemented_data_info)
+
+    args = generate_arg_setup(gen, kernel, implemented_data_info, options)
+
+    # {{{ generate invocation
+
+    gen("_lpy_evt = {kernel_name}({args})"
+            .format(
+                kernel_name=codegen_result.host_program.name,
+                args=", ".join(
+                    ["_lpy_cl_kernels", "queue"]
+                    + args
+                    + ["wait_for=wait_for"])))
+
+    # }}}
+
+    # {{{ output
+
+    if not options.no_numpy:
+        gen("if out_host is None and (_lpy_encountered_numpy "
+                "and not _lpy_encountered_dev):")
+        with Indentation(gen):
+            gen("out_host = True")
+
+        gen("if out_host:")
+        with Indentation(gen):
+            gen("pass")  # if no outputs (?!)
+            for arg in implemented_data_info:
+                if not issubclass(arg.arg_class, KernelArgument):
+                    continue
+
+                is_written = arg.base_name in kernel.get_written_variables()
+                if is_written:
+                    gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
+
+        gen("")
+
+    if options.return_dict:
+        gen("return _lpy_evt, {%s}"
+                % ", ".join("\"%s\": %s" % (arg.name, arg.name)
+                    for arg in implemented_data_info
+                    if issubclass(arg.arg_class, KernelArgument)
+                    if arg.base_name in kernel.get_written_variables()))
+    else:
+        out_args = [arg
+                for arg in implemented_data_info
+                    if issubclass(arg.arg_class, KernelArgument)
+                if arg.base_name in kernel.get_written_variables()]
+        if out_args:
+            gen("return _lpy_evt, (%s,)"
+                    % ", ".join(arg.name for arg in out_args))
+        else:
+            gen("return _lpy_evt, ()")
+
+    # }}}
+
+    if options.write_wrapper:
+        output = gen.get()
+        if options.highlight_wrapper:
+            output = get_highlighted_python_code(output)
+
+        if options.write_wrapper is True:
+            print(output)
+        else:
+            with open(options.write_wrapper, "w") as outf:
+                outf.write(output)
+
+    return gen.get_function()
+
+
+# }}}
+
+
+# {{{ kernel executor
+
+class _CLKernelInfo(Record):
+    pass
+
+
+class _CLKernels(object):
+    pass
+
+
+class PyOpenCLKernelExecutor(KernelExecutorBase):
+    """An object connecting a kernel to a :class:`pyopencl.Context`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, context, kernel):
+        """
+        :arg context: a :class:`pyopencl.Context`
+        :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
+            (a warning will be issued if more than one is returned). If the
+            kernel has not yet been loop-scheduled, that is done, too, with no
+            specific arguments.
+        """
+
+        super(PyOpenCLKernelExecutor, self).__init__(kernel)
+
+        self.context = context
+
+        from loopy.target.pyopencl import PyOpenCLTarget
+        if isinstance(kernel.target, PyOpenCLTarget):
+            self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
+
+    @memoize_method
+    def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
+
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+        dev_code = codegen_result.device_code()
+
+        if self.kernel.options.write_cl:
+            output = dev_code
+            if self.kernel.options.highlight_cl:
+                output = get_highlighted_cl_code(output)
+
+            if self.kernel.options.write_cl is True:
+                print(output)
+            else:
+                with open(self.kernel.options.write_cl, "w") as outf:
+                    outf.write(output)
+
+        if self.kernel.options.edit_cl:
+            from pytools import invoke_editor
+            dev_code = invoke_editor(dev_code, "code.cl")
+
+        import pyopencl as cl
+
+        logger.info("%s: opencl compilation start" % self.kernel.name)
+
+        cl_program = (
+                cl.Program(self.context, dev_code)
+                .build(options=kernel.options.cl_build_options))
+
+        cl_kernels = _CLKernels()
+        for dp in codegen_result.device_programs:
+            setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
+
+        logger.info("%s: opencl compilation done" % self.kernel.name)
+
+        return _CLKernelInfo(
+                kernel=kernel,
+                cl_kernels=cl_kernels,
+                implemented_data_info=codegen_result.implemented_data_info,
+                invoker=generate_invoker(kernel, codegen_result))
+
+    # {{{ debugging aids
+
+    def get_code(self, arg_to_dtype=None):
+        if arg_to_dtype is not None:
+            arg_to_dtype = frozenset(six.iteritems(arg_to_dtype))
+
+        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
+
+        from loopy.codegen import generate_code_v2
+        code = generate_code_v2(kernel)
+        return code.device_code()
+
+    def get_highlighted_code(self, arg_to_dtype=None):
+        return get_highlighted_cl_code(
+                self.get_code(arg_to_dtype))
+
+    # }}}
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg allocator: a callable passed a byte count and returning
+            a :class:`pyopencl.Buffer`. A :mod:`pyopencl` allocator
+            may also be used.
+        :arg wait_for: A list of :class:`pyopencl.Event` instances
+            for which to wait.
+        :arg out_host: :class:`bool`
+            Decides whether output arguments (i.e. arguments
+            written by the kernel) are to be returned as
+            :mod:`numpy` arrays. *True* for yes, *False* for no.
+
+            For the default value of *None*, if all (input) array
+            arguments are :mod:`numpy` arrays, defaults to
+            returning :mod:`numpy` arrays as well.
+
+        :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event`
+            associated with the execution of the kernel, and
+            output is a tuple of output arguments (arguments that
+            are written as part of the kernel). The order is given
+            by the order of kernel arguments. If this order is unspecified
+            (such as when kernel arguments are inferred automatically),
+            enable :attr:`loopy.Options.return_dict` to make *output* a
+            :class:`dict` instead, with keys of argument names and values
+            of the returned arrays.
+        """
+
+        allocator = kwargs.pop("allocator", None)
+        wait_for = kwargs.pop("wait_for", None)
+        out_host = kwargs.pop("out_host", None)
+
+        kwargs = self.packing_controller.unpack(kwargs)
+
+        kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs))
+
+        return kernel_info.invoker(
+                kernel_info.cl_kernels, queue, allocator, wait_for,
+                out_host, **kwargs)
+
+# }}}
+
+
+def get_highlighted_python_code(text):
+    try:
+        from pygments import highlight
+    except ImportError:
+        return text
+    else:
+        from pygments.lexers import PythonLexer
+        from pygments.formatters import TerminalFormatter
+
+        return highlight(text, PythonLexer(), TerminalFormatter())
+
+
+def get_highlighted_cl_code(text):
+    try:
+        from pygments import highlight
+    except ImportError:
+        return text
+    else:
+        from pygments.lexers import CLexer
+        from pygments.formatters import TerminalFormatter
+
+        return highlight(text, CLexer(), TerminalFormatter())
+
+
+# vim: foldmethod=marker
diff --git a/loopy/target/python.py b/loopy/target/python.py
index f2a529c34bc4c56f69db1693058308769919e27a..591161d818bf6691a0412b3a00d624f8b02dde5b 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -256,9 +256,18 @@ class PythonASTBuilderBase(ASTBuilderBase):
         from genpy import If
         return If(condition_str, ast)
 
-    def emit_assignment(self, codegen_state, lhs, rhs):
+    def emit_assignment(self, codegen_state, insn):
+        ecm = codegen_state.expression_to_code_mapper
+
+        if insn.atomicity:
+            raise NotImplementedError("atomic ops in Python")
+
+        from pymbolic.mapper.stringifier import PREC_NONE
         from genpy import Assign
-        return Assign(lhs, rhs)
+
+        return Assign(
+                ecm(insn.assignee, prec=PREC_NONE, type_context=None),
+                ecm(insn.expression, prec=PREC_NONE, type_context=None))
 
     # }}}
 
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index 181f5241137cc1abe3256506f3ebd59327e4c4f8..b7f47c38a6a0daf8e4495c16791ef2f955019649 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -226,6 +226,13 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
 
     # }}}
 
+    common_factors = [
+            (ik, cf) for ik, cf in common_factors
+            if cf]
+
+    if not common_factors:
+        raise LoopyError("no common factors found")
+
     # {{{ remove common factors
 
     new_insns = []
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 40c2b66c05dd11bd69bfa4ff682c45a68d71d623..88e3898e2cceeeb62edea306283fcb718c3b088d 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -169,7 +169,7 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
     batch_iname_set = frozenset([batch_iname])
     kernel = kernel.copy(
             instructions=[
-                insn.copy(forced_iname_deps=insn.forced_iname_deps | batch_iname_set)
+                insn.copy(within_inames=insn.within_inames | batch_iname_set)
                 for insn in kernel.instructions])
 
     return kernel
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 0c25b63930f99ebcb8b2b817367ec1572b15a1d4..2210530e997fae4898c113b88fd141bc07f56163 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -400,7 +400,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     init_instruction = Assignment(id=init_insn_id,
                 assignee=buf_var_init,
                 expression=init_expression,
-                forced_iname_deps=(
+                within_inames=(
                     frozenset(within_inames)
                     | frozenset(non1_init_inames)),
                 depends_on=frozenset(),
@@ -480,7 +480,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
                     no_sync_with=frozenset([init_insn_id]),
                     assignee=store_target,
                     expression=store_expression,
-                    forced_iname_deps=(
+                    within_inames=(
                         frozenset(within_inames)
                         | frozenset(non1_store_inames)))
     else:
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index a8c3b5849d90466d1163f4cc47777e1253cae079..d972f44cf6cdd4497c03b2725c4fe607f0e35842 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -311,8 +311,8 @@ class DifferentiationContext(object):
                 assignee=var(new_var_name)[
                     lhs_ind + diff_iname_exprs],
                 expression=diff_expr,
-                forced_iname_deps=(
-                    orig_writer_insn.forced_iname_deps | frozenset(diff_inames)))
+                within_inames=(
+                    orig_writer_insn.within_inames | frozenset(diff_inames)))
 
         self.new_instructions.append(insn)
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 3a9941fa83fdad7cf9b11dee54e76499b6bacd6a..1914b8d677af4cd0a5683d90ff5ac1168a660063 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -70,6 +70,8 @@ __doc__ = """
 
 .. autofunction:: make_reduction_inames_unique
 
+.. autofunction:: add_inames_to_insn
+
 """
 
 
@@ -86,7 +88,7 @@ def set_loop_priority(kernel, loop_priority):
     """
 
     if isinstance(loop_priority, str):
-        loop_priority = [s.strip() for s in loop_priority.split(",")]
+        loop_priority = [s.strip() for s in loop_priority.split(",") if s.strip()]
 
     return kernel.copy(loop_priority=loop_priority)
 
@@ -225,16 +227,16 @@ def _split_iname_backend(kernel, split_iname,
 
     new_insns = []
     for insn in kernel.instructions:
-        if split_iname in insn.forced_iname_deps:
-            new_forced_iname_deps = (
-                    (insn.forced_iname_deps.copy()
+        if split_iname in insn.within_inames:
+            new_within_inames = (
+                    (insn.within_inames.copy()
                     - frozenset([split_iname]))
                     | frozenset([outer_iname, inner_iname]))
         else:
-            new_forced_iname_deps = insn.forced_iname_deps
+            new_within_inames = insn.within_inames
 
         insn = insn.copy(
-                forced_iname_deps=new_forced_iname_deps)
+                within_inames=new_within_inames)
 
         new_insns.append(insn)
 
@@ -528,7 +530,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
         if within is None:
             new_domain = new_domain.project_out(iname_dt, iname_idx, 1)
 
-    def subst_forced_iname_deps(fid):
+    def subst_within_inames(fid):
         result = set()
         for iname in fid:
             if iname in inames:
@@ -540,7 +542,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
 
     new_insns = [
             insn.copy(
-                forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
+                within_inames=subst_within_inames(insn.within_inames))
             for insn in kernel.instructions]
 
     kernel = (kernel
@@ -579,7 +581,12 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
     :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given
         as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or
         as a string as shown in :ref:`iname-tags`. May also be a dictionary
-        for backwards compatibility.
+        for backwards compatibility. *iname* may also be a wildcard using ``*``
+        and ``?``.
+
+    .. versionchanged:: 2016.3
+
+        Added wildcards.
     """
 
     if isinstance(iname_to_tag, dict):
@@ -614,14 +621,35 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
     from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase,
             ForceSequentialTag)
 
-    new_iname_to_tag = kernel.iname_to_tag.copy()
+    # {{{ globbing
+
+    all_inames = kernel.all_inames()
+
+    from loopy.match import re_from_glob
+    new_iname_to_tag = {}
     for iname, new_tag in iname_to_tag:
-        if iname not in kernel.all_inames():
-            if ignore_nonexistent:
-                continue
-            else:
-                raise LoopyError("iname '%s' does not exist" % iname)
+        if '*' in iname or '?' in iname:
+            match_re = re_from_glob(iname)
+            for sub_iname in all_inames:
+                if match_re.match(sub_iname):
+                    new_iname_to_tag[sub_iname] = new_tag
+
+        else:
+            if iname not in all_inames:
+                if ignore_nonexistent:
+                    continue
+                else:
+                    raise LoopyError("iname '%s' does not exist" % iname)
 
+            new_iname_to_tag[iname] = new_tag
+
+    iname_to_tag = new_iname_to_tag
+    del new_iname_to_tag
+
+    # }}}
+
+    knl_iname_to_tag = kernel.iname_to_tag.copy()
+    for iname, new_tag in six.iteritems(iname_to_tag):
         old_tag = kernel.iname_to_tag.get(iname)
 
         retag_ok = False
@@ -652,9 +680,9 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
             raise LoopyError("'%s' is already tagged '%s'--cannot retag"
                     % (iname, old_tag))
 
-        new_iname_to_tag[iname] = new_tag
+        knl_iname_to_tag[iname] = new_tag
 
-    return kernel.copy(iname_to_tag=new_iname_to_tag)
+    return kernel.copy(iname_to_tag=knl_iname_to_tag)
 
 # }}}
 
@@ -709,8 +737,8 @@ class _InameDuplicator(RuleAwareIdentityMapper):
 
         new_fid = frozenset(
                 self.old_to_new.get(iname, iname)
-                for iname in insn.forced_iname_deps)
-        return insn.copy(forced_iname_deps=new_fid)
+                for iname in insn.within_inames)
+        return insn.copy(within_inames=new_fid)
 
 
 def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
@@ -1034,11 +1062,11 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
 
         new_instructions = []
         for insn in knl.instructions:
-            if (old_iname in insn.forced_iname_deps
+            if (old_iname in insn.within_inames
                     and within(knl, insn, ())):
                 insn = insn.copy(
-                        forced_iname_deps=(
-                            (insn.forced_iname_deps - frozenset([old_iname]))
+                        within_inames=(
+                            (insn.within_inames - frozenset([old_iname]))
                             | frozenset([new_iname])))
 
             new_instructions.append(insn)
@@ -1410,8 +1438,8 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
             return inames
 
     new_instructions = [
-            insn.copy(forced_iname_deps=fix_iname_set(
-                insn.id, insn.forced_iname_deps))
+            insn.copy(within_inames=fix_iname_set(
+                insn.id, insn.within_inames))
             for insn in kernel.instructions]
 
     # }}}
@@ -1590,4 +1618,45 @@ def make_reduction_inames_unique(kernel, inames=None, within=None):
 
 # }}}
 
+
+# {{{ add_inames_to_insn
+
+def add_inames_to_insn(knl, inames, insn_match):
+    """
+    :arg inames: a frozenset of inames that will be added to the
+        instructions matched by *insn_match*, or a comma-separated
+        string that parses to such a frozenset.
+    :arg insn_match: An instruction match as understood by
+        :func:`loopy.match.parse_match`.
+
+    :returns: a copy of *knl* in which *inames* have been added to
+        the within-inames set of every instruction matched by
+        *insn_match*.
+
+    .. versionadded:: 2016.3
+    """
+
+    if isinstance(inames, str):
+        inames = frozenset(s.strip() for s in inames.split(","))
+
+    if not isinstance(inames, frozenset):
+        raise TypeError("'inames' must be a frozenset")
+
+    from loopy.match import parse_match
+    match = parse_match(insn_match)
+
+    new_instructions = []
+
+    for insn in knl.instructions:
+        if match(knl, insn):
+            new_instructions.append(
+                    insn.copy(within_inames=insn.within_inames | inames))
+        else:
+            new_instructions.append(insn)
+
+    return knl.copy(instructions=new_instructions)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 2828b19f9e1c824be6b81ee92269bb317ad9b620..27fad67f8838622d24e276defc7b8c66eaf11306 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -307,8 +307,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         tuple, in which case names will be automatically created.
         May also equivalently be a comma-separated string.
 
-    :arg precompute_outer_inames: The inames within which the compute
-        instruction is nested. If *None*, make an educated guess.
+    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
+        the compute instruction is nested. If *None*, make an educated guess.
+        May also be specified as a comma-separated string.
 
     :arg compute_insn_id: The ID of the instruction generated to perform the
         precomputation.
@@ -361,6 +362,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     if isinstance(precompute_inames, str):
         precompute_inames = [iname.strip() for iname in precompute_inames.split(",")]
 
+    if isinstance(precompute_outer_inames, str):
+        precompute_outer_inames = frozenset(
+                iname.strip() for iname in precompute_outer_inames.split(","))
+
     if isinstance(subst_use, str):
         subst_use = [subst_use]
 
@@ -770,7 +775,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             id=compute_insn_id,
             assignee=assignee,
             expression=compute_expression,
-            # forced_iname_deps determined below
+            # within_inames determined below
             )
 
     # }}}
@@ -805,9 +810,12 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         if not isinstance(precompute_outer_inames, frozenset):
             raise TypeError("precompute_outer_inames must be a frozenset")
 
+        precompute_outer_inames = precompute_outer_inames \
+                | frozenset(non1_storage_axis_names)
+
     kernel = kernel.copy(
             instructions=[
-                insn.copy(forced_iname_deps=precompute_outer_inames)
+                insn.copy(within_inames=precompute_outer_inames)
                 if insn.id == compute_insn_id
                 else insn
                 for insn in kernel.instructions])
diff --git a/loopy/version.py b/loopy/version.py
index 026ff2e4e5090005e9ce80c1894861048492dc61..12b2fedbec2669205de2a82a8d5eca42678e9353 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v40-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v42-islpy%s" % _islpy_version
diff --git a/test/gnuma_loopy_transforms.py b/test/gnuma_loopy_transforms.py
index 2107d50b03bdf99dad1b8c5bd7f31dfa7062579c..49a2a99bb8d4d518ac23134ba9d1a1ff3d6e8053 100644
--- a/test/gnuma_loopy_transforms.py
+++ b/test/gnuma_loopy_transforms.py
@@ -23,17 +23,17 @@ def fix_euler_parameters(kernel, p_p0, p_Gamma, p_R):
 
 
 def set_q_storage_format(kernel, name):
-    kernel = lp.set_array_dim_names(kernel, name, "i,j,k,field,e")
+    kernel = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
 
     kernel = lp.split_array_dim(
         kernel, (name, 3, "F"), 4, auto_split_inames=False)
-    kernel = lp.tag_data_axes(kernel, name, "N0,N1,N2,vec,N4,N3")
+    kernel = lp.tag_array_axes(kernel, name, "N0,N1,N2,vec,N4,N3")
 
     return kernel
 
 
 def set_D_storage_format(kernel):
-    return lp.tag_data_axes(kernel, "D", "f,f")
+    return lp.tag_array_axes(kernel, "D", "f,f")
 
 
 def set_up_volume_loop(kernel, Nq):
diff --git a/test/test_diff.py b/test/test_diff.py
index d6e2ae156f23259d16cb06040f48fefc0d565661..95471f9b126fd6b763530d115c21509d14d2ba47 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -65,6 +65,8 @@ def test_diff(ctx_factory):
     dknl, diff_map = diff_kernel(knl, "z", "x")
     dknl = lp.remove_unused_arguments(dknl)
 
+    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
+
     print(dknl)
 
     n = 50
diff --git a/test/test_linalg.py b/test/test_linalg.py
index b7a1e059cb4499fe136f962b5d5fdcd539fc05f1..7db8566084423ab85cc3870448ddefd60b0bc27b 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -286,6 +286,7 @@ def test_rank_one(ctx_factory):
         knl = lp.add_prefetch(knl, "a")
         knl = lp.add_prefetch(knl, "b")
         knl = lp.set_loop_priority(knl, ["i", "j"])
+        knl = lp.add_inames_to_insn(knl, "i", "writes:b_fetch")
         return knl
 
     def variant_2(knl):
@@ -330,7 +331,12 @@ def test_rank_one(ctx_factory):
 
     seq_knl = knl
 
-    for variant in [variant_1, variant_2, variant_3, variant_4]:
+    for variant in [
+            variant_1,
+            variant_2,
+            variant_3,
+            variant_4
+            ]:
         lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                 op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"],
                 parameters={"n": n})
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 078e89ff4911c2ff8cb3a71e5d394cf4f1705b59..dfc50fca04c7c4e26c11dbdc7a20b6a1ec96d729 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -989,7 +989,7 @@ def test_atomic(ctx_factory, dtype):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
 
 
-def test_forced_iname_deps_and_reduction():
+def test_within_inames_and_reduction():
     # See https://github.com/inducer/loopy/issues/24
 
     # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
@@ -1005,8 +1005,8 @@ def test_forced_iname_deps_and_reduction():
     from pymbolic.primitives import Subscript, Variable
     i2 = lp.Assignment("a",
             lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
-            forced_iname_deps=frozenset(),
-            forced_iname_deps_is_final=True)
+            within_inames=frozenset(),
+            within_inames_is_final=True)
 
     k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
             [i1, i2],
@@ -1106,13 +1106,16 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
     knl = lp.make_kernel(
             "{ [i,k]: 0<=i<n and 0<=k<3 }",
             """
-            <> t_private = a[k,i+1]
+            <> t_private_scalar = a[k,i+1]
+            <> t_private_array[i % 2] = a[k,i+1]
             c[k,i] = a[k,i+1]
-            out[k,i] = c[k,i] + t_private
+            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
             """)
 
     knl = lp.add_and_infer_dtypes(knl,
             {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
+    knl = lp.set_temporary_scope(knl, "t_private_array", "private")
 
     ref_knl = knl
 
@@ -1138,6 +1141,46 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
+def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i,k]: 0<=i<n and 0<=k<3 }",
+            """
+            <> t_local[i % 8,k] = i % 8
+            c[k,i] = a[k,i+1]
+            out[k,i] = c[k,i] + t_local[i % 8,k]
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl,
+            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+    knl = lp.set_temporary_scope(knl, "t_local", "local")
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
+
+    # schedule
+    from loopy.preprocess import preprocess_kernel
+    knl = preprocess_kernel(knl)
+
+    from loopy.schedule import get_one_scheduled_kernel
+    knl = get_one_scheduled_kernel(knl)
+
+    # map schedule onto host or device
+    print(knl)
+
+    cgr = lp.generate_code_v2(knl)
+
+    assert len(cgr.device_programs) == 2
+
+    print(cgr.device_code())
+    print(cgr.host_code())
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8))
+
+
 def test_global_temporary(ctx_factory):
     ctx = ctx_factory()
 
@@ -1269,7 +1312,7 @@ def test_call_with_no_returned_value(ctx_factory):
 
     knl = lp.make_kernel(
         "{:}",
-        [lp.CallInstruction([], p.Call(p.Variable("f"), ()))]
+        [lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
         )
 
     knl = lp.register_function_manglers(knl, [_f_mangler])
@@ -1310,6 +1353,16 @@ def test_unschedulable_kernel_detection():
     assert len(list(lp.get_iname_duplication_options(knl))) == 10
 
 
+def test_regression_no_ret_call_removal(ctx_factory):
+    # https://github.com/inducer/loopy/issues/32
+    knl = lp.make_kernel(
+            "{[i] : 0<=i<n}",
+            "f(sum(i, x[i]))")
+    knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
+    knl = lp.preprocess_kernel(knl)
+    assert len(knl.instructions) == 3
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 389151f173cc835c0e01f2cbc3c640952cf3c575..e4f303f78bd6442ce7d956b66b3d99bffc1d6252 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -140,9 +140,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
                 precompute_inames=flux_precomp_inames + flux_ilp_inames,
                 default_tag=None)
             if flux_var.endswith("_s"):
-                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
+                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
             else:
-                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")
+                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")
 
             n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
             if n_iname.endswith("_0"):
@@ -190,9 +190,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
               Q_dim_field_outer="unr"))
 
     # buffer axes need to be vectorized in order for this to work
-    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
-    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
-    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
+    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
+    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
+    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
     hsv = lp.tag_inames(hsv,
             {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
             ignore_nonexistent=True)
diff --git a/test/test_reduction.py b/test/test_reduction.py
index b751dd515b5978ba3c9ccedab06016d19e746546..29dda7419ae870e9a3d773aefeafb540567e94a1 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -126,10 +126,14 @@ def test_multi_nested_dependent_reduction(ctx_factory):
                 "{[isrc_box]: 0 <= isrc_box < nboxes}",
                 "{[isrc]: 0 <= isrc < npart}"
                 ],
-            [
-                "<> npart = nparticles_per_box[isrc_box]",
-                "a[itgt] = sum((isrc_box, isrc), 1)",
-                ],
+            """
+            for itgt
+                for isrc_box
+                    <> npart = nparticles_per_box[isrc_box]
+                end
+                a[itgt] = sum((isrc_box, isrc), 1)
+            end
+            """,
             [
                 lp.ValueArg("n", np.int32),
                 lp.GlobalArg("a", dtype, ("n",)),
@@ -154,11 +158,15 @@ def test_recursive_nested_dependent_reduction(ctx_factory):
                 "{[isrc_box]: 0 <= isrc_box < nboxes}",
                 "{[isrc]: 0 <= isrc < npart}"
                 ],
-            [
-                "<> npart = nparticles_per_box[isrc_box]",
-                "<> boxsum = sum(isrc, isrc+isrc_box+itgt)",
-                "a[itgt] = sum(isrc_box, boxsum)",
-                ],
+            """
+            for itgt
+                for isrc_box
+                    <> npart = nparticles_per_box[isrc_box]
+                    <> boxsum = sum(isrc, isrc+isrc_box+itgt)
+                end
+                a[itgt] = sum(isrc_box, boxsum)
+            end
+            """,
             [
                 lp.ValueArg("n", np.int32),
                 lp.GlobalArg("a", dtype, ("n",)),