diff --git a/MEMO b/MEMO
index d650e7ea46aea8b01def996336810a2fe97a61fa..f4e5c34e48e62d5c951d01fcb212a9117e361def 100644
--- a/MEMO
+++ b/MEMO
@@ -142,7 +142,7 @@ Dealt with
 - How can one automatically generate something like microblocks?
   -> Some sort of axis-adding transform?
 
-- ExpandingIdentityMapper
+- RuleAwareIdentityMapper
   extract_subst -> needs WalkMapper [actually fine as is]
   padding [DONE]
   replace make_unique_var_name [DONE]
diff --git a/doc/reference.rst b/doc/reference.rst
index 595fd4b5fa5bac3ccd69cfb1b7a05ebb1c7b75f9..9d75a924b71847d04632b4945f9228c80c2e01f4 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -402,6 +402,8 @@ Caching, Precomputation and Prefetching
 
 .. autofunction:: add_prefetch
 
+.. autofunction:: buffer_array
+
 Influencing data access
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 3027e10d38a98b4c86737502b7fd20f41c3cc3c3..450aa4ac45272a580afd7dc5fb933355b5a8dcfd 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -30,7 +30,8 @@ THE SOFTWARE.
 import islpy as isl
 from islpy import dim_type
 
-from loopy.symbolic import ExpandingIdentityMapper, ExpandingSubstitutionMapper
+from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
+        SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
 
 
@@ -55,6 +56,7 @@ from loopy.kernel.creation import make_kernel, UniqueName
 from loopy.library.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst, temporary_to_subst
 from loopy.precompute import precompute
+from loopy.buffer import buffer_array
 from loopy.padding import (split_arg_axis, find_padding_multiple,
         add_padding)
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
@@ -82,7 +84,7 @@ __all__ = [
         "register_reduction_parser",
 
         "extract_subst", "expand_subst", "temporary_to_subst",
-        "precompute",
+        "precompute", "buffer_array",
         "split_arg_axis", "find_padding_multiple", "add_padding",
 
         "get_dot_dependency_graph",
@@ -125,11 +127,10 @@ __all__ = [
 
 # {{{ split inames
 
-class _InameSplitter(ExpandingIdentityMapper):
-    def __init__(self, kernel, within,
+class _InameSplitter(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, within,
             split_iname, outer_iname, inner_iname, replacement_index):
-        ExpandingIdentityMapper.__init__(self,
-                kernel.substitutions, kernel.get_var_name_generator())
+        super(_InameSplitter, self).__init__(rule_mapping_context)
 
         self.within = within
 
@@ -140,7 +141,9 @@ class _InameSplitter(ExpandingIdentityMapper):
         self.replacement_index = replacement_index
 
     def map_reduction(self, expr, expn_state):
-        if self.split_iname in expr.inames and self.within(expn_state.stack):
+        if (self.split_iname in expr.inames
+                and self.split_iname not in expn_state.arg_context
+                and self.within(expn_state.stack)):
             new_inames = list(expr.inames)
             new_inames.remove(self.split_iname)
             new_inames.extend([self.outer_iname, self.inner_iname])
@@ -149,13 +152,15 @@ class _InameSplitter(ExpandingIdentityMapper):
             return Reduction(expr.operation, tuple(new_inames),
                         self.rec(expr.expr, expn_state))
         else:
-            return ExpandingIdentityMapper.map_reduction(self, expr, expn_state)
+            return super(_InameSplitter, self).map_reduction(expr, expn_state)
 
     def map_variable(self, expr, expn_state):
-        if expr.name == self.split_iname and self.within(expn_state.stack):
+        if (expr.name == self.split_iname
+                and self.split_iname not in expn_state.arg_context
+                and self.within(expn_state.stack)):
             return self.replacement_index
         else:
-            return ExpandingIdentityMapper.map_variable(self, expr, expn_state)
+            return super(_InameSplitter, self).map_variable(expr, expn_state)
 
 
 def split_iname(kernel, split_iname, inner_length,
@@ -272,10 +277,13 @@ def split_iname(kernel, split_iname, inner_length,
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
 
-    ins = _InameSplitter(kernel, within,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    ins = _InameSplitter(rule_mapping_context, within,
             split_iname, outer_iname, inner_iname, new_loop_index)
 
     kernel = ins.map_kernel(kernel)
+    kernel = rule_mapping_context.finish_kernel(kernel)
 
     if existing_tag is not None:
         kernel = tag_inames(kernel,
@@ -288,10 +296,10 @@ def split_iname(kernel, split_iname, inner_length,
 
 # {{{ join inames
 
-class _InameJoiner(ExpandingSubstitutionMapper):
-    def __init__(self, kernel, within, subst_func, joined_inames, new_iname):
-        ExpandingSubstitutionMapper.__init__(self,
-                kernel.substitutions, kernel.get_var_name_generator(),
+class _InameJoiner(RuleAwareSubstitutionMapper):
+    def __init__(self, rule_mapping_context, within, subst_func,
+            joined_inames, new_iname):
+        super(_InameJoiner, self).__init__(rule_mapping_context,
                 subst_func, within)
 
         self.joined_inames = set(joined_inames)
@@ -299,7 +307,8 @@ class _InameJoiner(ExpandingSubstitutionMapper):
 
     def map_reduction(self, expr, expn_state):
         expr_inames = set(expr.inames)
-        overlap = self.join_inames & expr_inames
+        overlap = (self.joined_inames & expr_inames
+                - set(expn_state.arg_context))
         if overlap and self.within(expn_state.stack):
             if overlap != expr_inames:
                 raise LoopyError(
@@ -317,7 +326,7 @@ class _InameJoiner(ExpandingSubstitutionMapper):
             return Reduction(expr.operation, tuple(new_inames),
                         self.rec(expr.expr, expn_state))
         else:
-            return ExpandingIdentityMapper.map_reduction(self, expr, expn_state)
+            return super(_InameJoiner, self).map_reduction(expr, expn_state)
 
 
 def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
@@ -419,11 +428,14 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
     within = parse_stack_match(within)
 
     from pymbolic.mapper.substitutor import make_subst_func
-    ijoin = _InameJoiner(kernel, within,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    ijoin = _InameJoiner(rule_mapping_context, within,
             make_subst_func(subst_dict),
             inames, new_iname)
 
-    kernel = ijoin.map_kernel(kernel)
+    kernel = rule_mapping_context.finish_kernel(
+            ijoin.map_kernel(kernel))
 
     if tag is not None:
         kernel = tag_inames(kernel, {new_iname: tag})
@@ -488,33 +500,37 @@ def tag_inames(kernel, iname_to_tag, force=False):
 
 # {{{ duplicate inames
 
-class _InameDuplicator(ExpandingIdentityMapper):
-    def __init__(self, rules, make_unique_var_name,
+class _InameDuplicator(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context,
             old_to_new, within):
-        ExpandingIdentityMapper.__init__(self,
-                rules, make_unique_var_name)
+        super(_InameDuplicator, self).__init__(rule_mapping_context)
 
         self.old_to_new = old_to_new
         self.old_inames_set = set(six.iterkeys(old_to_new))
         self.within = within
 
     def map_reduction(self, expr, expn_state):
-        if set(expr.inames) & self.old_inames_set and self.within(expn_state.stack):
+        if (set(expr.inames) & self.old_inames_set
+                and self.within(expn_state.stack)):
             new_inames = tuple(
                     self.old_to_new.get(iname, iname)
+                    if iname not in expn_state.arg_context
+                    else iname
                     for iname in expr.inames)
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, new_inames,
                         self.rec(expr.expr, expn_state))
         else:
-            return ExpandingIdentityMapper.map_reduction(self, expr, expn_state)
+            return super(_InameDuplicator, self).map_reduction(expr, expn_state)
 
     def map_variable(self, expr, expn_state):
         new_name = self.old_to_new.get(expr.name)
 
-        if new_name is None or not self.within(expn_state.stack):
-            return ExpandingIdentityMapper.map_variable(self, expr, expn_state)
+        if (new_name is None
+                or expr.name in expn_state.arg_context
+                or not self.within(expn_state.stack)):
+            return super(_InameDuplicator, self).map_variable(expr, expn_state)
         else:
             from pymbolic import var
             return var(new_name)
@@ -591,11 +607,14 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
 
     # {{{ change the inames in the code
 
-    indup = _InameDuplicator(knl.substitutions, name_gen,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            knl.substitutions, name_gen)
+    indup = _InameDuplicator(rule_mapping_context,
             old_to_new=dict(list(zip(inames, new_inames))),
             within=within)
 
-    knl = indup.map_kernel(knl)
+    knl = rule_mapping_context.finish_kernel(
+            indup.map_kernel(knl))
 
     # }}}
 
@@ -694,8 +713,6 @@ def link_inames(knl, inames, new_iname, within=None, tag=None):
     all_equal = True
     first_proj = projections[0]
     for proj in projections[1:]:
-        print(proj.gist(first_proj))
-        print(first_proj.gist(proj))
         all_equal = all_equal and (proj <= first_proj and first_proj <= proj)
 
     if not all_equal:
@@ -721,10 +738,13 @@ def link_inames(knl, inames, new_iname, within=None, tag=None):
     within = parse_stack_match(within)
 
     from pymbolic.mapper.substitutor import make_subst_func
-    ijoin = ExpandingSubstitutionMapper(knl.substitutions, var_name_gen,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            knl.substitutions, var_name_gen)
+    ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
                     make_subst_func(subst_dict), within)
 
-    knl = ijoin.map_kernel(knl)
+    knl = rule_mapping_context.finish_kernel(
+            ijoin.map_kernel(knl))
 
     # }}}
 
@@ -1176,16 +1196,20 @@ def tag_data_axes(knl, ary_names, dim_tags):
 
 # {{{ split_reduction
 
-class _ReductionSplitter(ExpandingIdentityMapper):
-    def __init__(self, kernel, within, inames, direction):
-        ExpandingIdentityMapper.__init__(self,
-                kernel.substitutions, kernel.get_var_name_generator())
+class _ReductionSplitter(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, within, inames, direction):
+        super(_ReductionSplitter, self).__init__(
+                rule_mapping_context)
 
         self.within = within
         self.inames = inames
         self.direction = direction
 
     def map_reduction(self, expr, expn_state):
+        if set(expr.inames) & set(expn_state.arg_context):
+            # FIXME
+            raise NotImplementedError()
+
         if self.inames <= set(expr.inames) and self.within(expn_state.stack):
             leftover_inames = set(expr.inames) - self.inames
 
@@ -1201,7 +1225,7 @@ class _ReductionSplitter(ExpandingIdentityMapper):
             else:
                 assert False
         else:
-            return ExpandingIdentityMapper.map_reduction(self, expr, expn_state)
+            return super(_ReductionSplitter, self).map_reduction(expr, expn_state)
 
 
 def _split_reduction(kernel, inames, direction, within=None):
@@ -1215,8 +1239,12 @@ def _split_reduction(kernel, inames, direction, within=None):
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
 
-    rsplit = _ReductionSplitter(kernel, within, inames, direction)
-    return rsplit.map_kernel(kernel)
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    rsplit = _ReductionSplitter(rule_mapping_context,
+            within, inames, direction)
+    return rule_mapping_context.finish_kernel(
+            rsplit.map_kernel(kernel))
 
 
 def split_reduction_inward(kernel, inames, within=None):
@@ -1313,11 +1341,13 @@ def _fix_parameter(kernel, name, value):
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(None)
 
-    from loopy.symbolic import ExpandingSubstitutionMapper
-    esubst_map = ExpandingSubstitutionMapper(
-            kernel.substitutions, kernel.get_var_name_generator(),
-            subst_func, within=within)
-    return (esubst_map.map_kernel(kernel)
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    esubst_map = RuleAwareSubstitutionMapper(
+            rule_mapping_context, subst_func, within=within)
+    return (
+            rule_mapping_context.finish_kernel(
+                esubst_map.map_kernel(kernel))
             .copy(
                 domains=new_domains,
                 args=new_args,
@@ -1570,11 +1600,11 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
         equations = [equations]
 
     import re
-    EQN_RE = re.compile(r"^([^=]+)=([^=]+)$")
+    eqn_re = re.compile(r"^([^=]+)=([^=]+)$")
 
     def parse_equation(eqn):
         if isinstance(eqn, str):
-            eqn_match = EQN_RE.match(eqn)
+            eqn_match = eqn_re.match(eqn)
             if not eqn_match:
                 raise ValueError("invalid equation: %s" % eqn)
 
@@ -1618,10 +1648,14 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
     var_name_gen = kernel.get_var_name_generator()
 
     from pymbolic.mapper.substitutor import make_subst_func
-    old_to_new = ExpandingSubstitutionMapper(kernel.substitutions, var_name_gen,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, var_name_gen)
+    old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context,
             make_subst_func(subst_dict), within=lambda stack: True)
 
-    kernel = (old_to_new.map_kernel(kernel)
+    kernel = (
+            rule_mapping_context.finish_kernel(
+                old_to_new.map_kernel(kernel))
             .copy(
                 applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                 ))
diff --git a/loopy/array_buffer.py b/loopy/array_buffer_map.py
similarity index 96%
rename from loopy/array_buffer.py
rename to loopy/array_buffer_map.py
index c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c..9189421271ea28c95be861c785243aabbd7bd934 100644
--- a/loopy/array_buffer.py
+++ b/loopy/array_buffer_map.py
@@ -42,7 +42,6 @@ class AccessDescriptor(Record):
 
     __slots__ = [
             "identifier",
-            "expands_footprint",
             "storage_axis_exprs",
             ]
 
@@ -138,16 +137,15 @@ def build_global_storage_to_sweep_map(kernel, access_descriptors,
 
     # build footprint
     for accdesc in access_descriptors:
-        if accdesc.expands_footprint:
-            stor2sweep = build_per_access_storage_to_domain_map(
-                    accdesc, domain_dup_sweep,
-                    storage_axis_names,
-                    prime_sweep_inames)
+        stor2sweep = build_per_access_storage_to_domain_map(
+                accdesc, domain_dup_sweep,
+                storage_axis_names,
+                prime_sweep_inames)
 
-            if global_stor2sweep is None:
-                global_stor2sweep = stor2sweep
-            else:
-                global_stor2sweep = global_stor2sweep.union(stor2sweep)
+        if global_stor2sweep is None:
+            global_stor2sweep = stor2sweep
+        else:
+            global_stor2sweep = global_stor2sweep.union(stor2sweep)
 
     if isinstance(global_stor2sweep, isl.BasicMap):
         global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep)
@@ -336,9 +334,6 @@ class ArrayToBufferMap(object):
             return convexify(domain)
 
     def is_access_descriptor_in_footprint(self, accdesc):
-        if accdesc.expands_footprint:
-            return True
-
         # Make all inames except the sweep parameters. (The footprint may depend on
         # those.) (I.e. only leave sweep inames as out parameters.)
 
@@ -350,7 +345,7 @@ class ArrayToBufferMap(object):
                 set(global_s2s_par_dom.get_var_names(dim_type.param))
                 & self.kernel.all_inames())
 
-        for arg in accdesc.args:
+        for arg in accdesc.storage_axis_exprs:
             arg_inames.update(get_dependencies(arg))
         arg_inames = frozenset(arg_inames)
 
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index ecd74c1a6b3d9ee793889d44976dce1f4bdc0a65..774e1f86e672cbf62eefb8bdffaeb3da2b6cc76f 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -26,11 +26,12 @@ THE SOFTWARE.
 """
 
 from pytools import Record
+from warnings import warn
 
 import numpy as np
 
 import loopy as lp
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, AutomaticTestFailure
 
 
 AUTO_TEST_SKIP_RUN = False
@@ -76,7 +77,7 @@ class TestArgInfo(Record):
 
 # {{{ "reference" arguments
 
-def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value):
+def make_ref_args(kernel, impl_arg_info, queue, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
@@ -143,36 +144,36 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value):
                 numpy_strides = [itemsize*s for s in strides]
 
                 storage_array = cl_array.empty(queue, alloc_size, dtype)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
 
-            if is_output:
-                if arg.arg_class is ImageArg:
-                    raise LoopyError("write-mode images not supported in "
-                            "automatic testing")
+            if is_output and arg.arg_class is ImageArg:
+                raise LoopyError("write-mode images not supported in "
+                        "automatic testing")
 
-                if is_dtype_supported(dtype):
-                    storage_array.fill(fill_value)
-                else:
-                    from warnings import warn
-                    warn("Cannot pre-fill array of dtype '%s' with set "
-                            "value--zeroing instead" % dtype)
-                    storage_array.view(np.uint8).fill(0)
+            fill_rand(storage_array)
 
-                ref_args[arg.name] = ary
+            if arg.arg_class is ImageArg:
+                # must be contiguous
+                pre_run_ary = pre_run_storage_array = storage_array.copy()
+
+                ref_args[arg.name] = cl.image_from_array(
+                        queue.context, ary.get())
             else:
-                fill_rand(storage_array)
-                if arg.arg_class is ImageArg:
-                    # must be contiguous
-                    ref_args[arg.name] = cl.image_from_array(
-                            queue.context, ary.get())
-                else:
-                    ref_args[arg.name] = ary
+                pre_run_storage_array = storage_array.copy()
+
+                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+                pre_run_ary = cl_array.as_strided(
+                        pre_run_storage_array, shape, numpy_strides)
+                ref_args[arg.name] = ary
 
             ref_arg_data.append(
                     TestArgInfo(
                         name=arg.name,
                         ref_array=ary,
                         ref_storage_array=storage_array,
+
+                        ref_pre_run_array=pre_run_ary,
+                        ref_pre_run_storage_array=pre_run_storage_array,
+
                         ref_shape=shape,
                         ref_strides=strides,
                         ref_alloc_size=alloc_size,
@@ -188,8 +189,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value):
 
 # {{{ "full-scale" arguments
 
-def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters,
-        fill_value):
+def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
@@ -224,7 +224,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters,
 
             # must be contiguous
             args[arg.name] = cl.image_from_array(
-                    queue.context, arg_desc.ref_array.get())
+                    queue.context, arg_desc.ref_pre_run_array.get())
 
         elif arg.arg_class is GlobalArg:
             shape = evaluate(arg.unvec_shape, parameters)
@@ -238,50 +238,37 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters,
             alloc_size = sum(astrd*(alen-1)
                     for alen, astrd in zip(shape, strides)) + 1
 
-            if arg.base_name in kernel.get_written_variables():
-                storage_array = cl_array.empty(queue, alloc_size, dtype)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+            # use contiguous array to transfer to host
+            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()
 
-                if is_dtype_supported(dtype):
-                    storage_array.fill(fill_value)
-                else:
-                    from warnings import warn
-                    warn("Cannot pre-fill array of dtype '%s'" % dtype)
-                    storage_array.view(np.uint8).fill(0)
+            # use device shape/strides
+            from pyopencl.compyte.array import as_strided
+            host_ref_array = as_strided(host_ref_contig_array,
+                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)
 
-                args[arg.name] = ary
-            else:
-                # use contiguous array to transfer to host
-                host_ref_contig_array = arg_desc.ref_storage_array.get()
-
-                # use device shape/strides
-                from pyopencl.compyte.array import as_strided
-                host_ref_array = as_strided(host_ref_contig_array,
-                        arg_desc.ref_shape, arg_desc.ref_numpy_strides)
-
-                # flatten the thing
-                host_ref_flat_array = host_ref_array.flatten()
-
-                # create host array with test shape (but not strides)
-                host_contig_array = np.empty(shape, dtype=dtype)
-
-                common_len = min(
-                        len(host_ref_flat_array),
-                        len(host_contig_array.ravel()))
-                host_contig_array.ravel()[:common_len] = \
-                        host_ref_flat_array[:common_len]
-
-                # create host array with test shape and storage layout
-                host_storage_array = np.empty(alloc_size, dtype)
-                host_array = as_strided(
-                        host_storage_array, shape, numpy_strides)
-                host_array[:] = host_contig_array
-
-                host_contig_array = arg_desc.ref_storage_array.get()
-                storage_array = cl_array.to_device(queue, host_storage_array)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+            # flatten the thing
+            host_ref_flat_array = host_ref_array.flatten()
+
+            # create host array with test shape (but not strides)
+            host_contig_array = np.empty(shape, dtype=dtype)
+
+            common_len = min(
+                    len(host_ref_flat_array),
+                    len(host_contig_array.ravel()))
+            host_contig_array.ravel()[:common_len] = \
+                    host_ref_flat_array[:common_len]
+
+            # create host array with test shape and storage layout
+            host_storage_array = np.empty(alloc_size, dtype)
+            host_array = as_strided(
+                    host_storage_array, shape, numpy_strides)
+            host_array[:] = host_contig_array
 
-                args[arg.name] = ary
+            host_contig_array = arg_desc.ref_storage_array.get()
+            storage_array = cl_array.to_device(queue, host_storage_array)
+            ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+
+            args[arg.name] = ary
 
             arg_desc.test_storage_array = storage_array
             arg_desc.test_array = ary
@@ -324,7 +311,7 @@ def _default_check_result(result, ref_result):
 # }}}
 
 
-# {{{ ref device finder
+# {{{ find device for reference test
 
 def _enumerate_cl_devices_for_ref_test():
     import pyopencl as cl
@@ -332,8 +319,6 @@ def _enumerate_cl_devices_for_ref_test():
     noncpu_devs = []
     cpu_devs = []
 
-    from warnings import warn
-
     for pf in cl.get_platforms():
         if pf.name == "Portable Computing Language":
             # pocl not mature enough yet, sadly
@@ -368,7 +353,7 @@ def auto_test_vs_ref(
         ref_knl, ctx, test_knl, op_count=[], op_label=[], parameters={},
         print_ref_code=False, print_code=True, warmup_rounds=2,
         dump_binary=False,
-        fills_entire_output=True, do_check=True, check_result=None
+        fills_entire_output=None, do_check=True, check_result=None
         ):
     """Compare results of `ref_knl` to the kernels generated by
     scheduling *test_knl*.
@@ -386,46 +371,29 @@ def auto_test_vs_ref(
 
     for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
         if ref_arg.name != test_arg.name:
-            raise LoopyError("ref_knl and test_knl argument lists disagee at index "
+            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                     "%d (1-based)" % (i+1))
 
         if ref_arg.dtype != test_arg.dtype:
-            raise LoopyError("ref_knl and test_knl argument lists disagee at index "
+            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                     "%d (1-based)" % (i+1))
 
     from loopy.compiled import CompiledKernel, get_highlighted_cl_code
 
     if isinstance(op_count, (int, float)):
-        from warnings import warn
         warn("op_count should be a list", stacklevel=2)
         op_count = [op_count]
     if isinstance(op_label, str):
-        from warnings import warn
         warn("op_label should be a list", stacklevel=2)
         op_label = [op_label]
 
-    read_and_written_args = (
-            ref_knl.get_read_variables()
-            & ref_knl.get_written_variables()
-            & set(ref_knl.arg_dict))
-
-    if read_and_written_args:
-        # FIXME: In principle, that's possible to test
-        raise LoopyError("kernel reads *and* writes argument(s) '%s' "
-                "and therefore cannot be automatically tested"
-                % ", ".join(read_and_written_args))
-
     from time import time
 
     if check_result is None:
         check_result = _default_check_result
 
-    if fills_entire_output:
-        fill_value_ref = -17
-        fill_value = -18
-    else:
-        fill_value_ref = -17
-        fill_value = fill_value_ref
+    if fills_entire_output is not None:
+        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)
 
     # {{{ compile and run reference code
 
@@ -460,8 +428,7 @@ def auto_test_vs_ref(
         try:
             ref_args, ref_arg_data = \
                     make_ref_args(ref_sched_kernel, ref_cl_kernel_info.impl_arg_info,
-                            ref_queue, parameters,
-                            fill_value=fill_value_ref)
+                            ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
             if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
@@ -523,7 +490,6 @@ def auto_test_vs_ref(
     args = None
     from loopy.kernel import LoopKernel
     if not isinstance(test_knl, LoopKernel):
-        from warnings import warn
         warn("Passing an iterable of kernels to auto_test_vs_ref "
                 "is deprecated--just pass the kernel instead. "
                 "Scheduling will be performed in auto_test_vs_ref.",
@@ -552,7 +518,7 @@ def auto_test_vs_ref(
             cl_kernel_info = compiled.cl_kernel_info(frozenset())
 
             args = make_args(kernel, cl_kernel_info.impl_arg_info,
-                    queue, ref_arg_data, parameters, fill_value=fill_value)
+                    queue, ref_arg_data, parameters)
         args["out_host"] = False
 
         print(75*"-")
@@ -593,7 +559,9 @@ def auto_test_vs_ref(
                     test_ary = test_ary[:common_len]
 
                     error_is_small, error = check_result(test_ary, ref_ary)
-                    assert error_is_small, error
+                    if not error_is_small:
+                        raise AutomaticTestFailure(error)
+
                     need_check = False
 
         events = []
diff --git a/loopy/buffer.py b/loopy/buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d155dba7e852e6a3978c3c92d4f4a7cba29bc4f0
--- /dev/null
+++ b/loopy/buffer.py
@@ -0,0 +1,412 @@
+from __future__ import division, absolute_import
+from six.moves import range
+
+__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap,
+        AccessDescriptor)
+from loopy.symbolic import (get_dependencies,
+        RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
+        SubstitutionMapper)
+from pymbolic.mapper.substitutor import make_subst_func
+
+from pymbolic import var
+
+
+# {{{ replace array access
+
+class ArrayAccessReplacer(RuleAwareIdentityMapper):
+    """Rewrite in-footprint accesses to one array variable so that they
+    read/write the buffer temporary instead.
+
+    Accesses outside the footprint (as judged by *array_base_map*) are
+    left untouched.  Instruction ids whose expressions were modified are
+    collected in :attr:`modified_insn_ids`.
+    """
+
+    def __init__(self, rule_mapping_context,
+            var_name, within, array_base_map, buf_var):
+        super(ArrayAccessReplacer, self).__init__(rule_mapping_context)
+
+        # stack-match predicate deciding which expansion states are affected
+        self.within = within
+
+        # ArrayToBufferMap describing the buffered footprint and its base
+        # indices
+        self.array_base_map = array_base_map
+
+        # name of the array being buffered
+        self.var_name = var_name
+        # ids of instructions in which at least one access was replaced
+        self.modified_insn_ids = set()
+
+        # pymbolic Variable naming the buffer temporary
+        self.buf_var = buf_var
+
+    def map_variable(self, expr, expn_state):
+        # A bare variable reference is a zero-index access.
+        result = None
+        if expr.name == self.var_name and self.within(expn_state):
+            result = self.map_array_access((), expn_state)
+
+        if result is None:
+            # not our variable, not within the match, or out of footprint
+            return super(ArrayAccessReplacer, self).map_variable(expr, expn_state)
+        else:
+            self.modified_insn_ids.add(expn_state.insn_id)
+            return result
+
+    def map_subscript(self, expr, expn_state):
+        result = None
+        if expr.aggregate.name == self.var_name and self.within(expn_state):
+            result = self.map_array_access(expr.index, expn_state)
+
+        if result is None:
+            # not our variable, not within the match, or out of footprint
+            return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state)
+        else:
+            self.modified_insn_ids.add(expn_state.insn_id)
+            return result
+
+    def map_array_access(self, index, expn_state):
+        """Return the buffer-relative access replacing *index*, or *None*
+        if the access is not inside the buffered footprint.
+        """
+        accdesc = AccessDescriptor(
+            identifier=None,
+            storage_axis_exprs=index)
+
+        if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
+            return None
+
+        abm = self.array_base_map
+
+        # resolve substitution-rule arguments occurring in the index
+        index = expn_state.apply_arg_context(index)
+
+        assert len(index) == len(abm.non1_storage_axis_flags)
+
+        access_subscript = []
+        for i in range(len(index)):
+            # length-1 storage axes are dropped from the buffer's shape
+            if not abm.non1_storage_axis_flags[i]:
+                continue
+
+            # shift the index to be relative to the buffer's base
+            ax_index = index[i]
+            from loopy.isl_helpers import simplify_via_aff
+            ax_index = simplify_via_aff(
+                    ax_index - abm.storage_base_indices[i])
+
+            access_subscript.append(ax_index)
+
+        result = self.buf_var
+        if access_subscript:
+            result = result.index(tuple(access_subscript))
+
+        # Can't possibly be nested, but recurse anyway to
+        # make sure substitution rules referenced below here
+        # do not get thrown away.
+        self.rec(result, expn_state.copy(arg_context={}))
+
+        return result
+
+# }}}
+
+
+def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
+        store_expression=None, within=None, default_tag="l.auto",
+        temporary_is_local=None, fetch_bounding_box=False):
+    """Replace accesses to *var_name* inside *buffer_inames* with accesses
+    to an automatically created temporary buffer, adding an instruction
+    that initializes the buffer and--if the buffer is written--one that
+    stores it back.
+
+    :arg kernel: the kernel to transform
+    :arg var_name: name of a kernel argument or temporary to be buffered
+    :arg buffer_inames: iname(s) (comma-separated string or iterable)
+        over which the buffer is swept
+    :arg init_expression: Either *None* (indicating the prior value of the buffered
+        array should be read) or an expression optionally involving the
+        variable 'base' (which references the associated location in the array
+        being buffered).
+    :arg store_expression: Either *None* or an expression involving
+        variables 'base' and 'buffer' (without array indices).
+    :arg within: a stack match (as understood by
+        :func:`loopy.context_matching.parse_stack_match`) restricting which
+        instructions are affected
+    :arg default_tag: iname tag applied to the generated init/store inames
+    :arg temporary_is_local: *is_local* flag for the generated temporary;
+        defaults to :class:`loopy.auto`
+    :arg fetch_bounding_box: if *True*, sweep over a bounding box of the
+        accessed footprint rather than the exact footprint
+    """
+
+    # {{{ process arguments
+
+    if isinstance(init_expression, str):
+        from loopy.symbolic import parse
+        init_expression = parse(init_expression)
+
+    if isinstance(store_expression, str):
+        from loopy.symbolic import parse
+        store_expression = parse(store_expression)
+
+    if isinstance(buffer_inames, str):
+        buffer_inames = [s.strip()
+                for s in buffer_inames.split(",") if s.strip()]
+
+    for iname in buffer_inames:
+        if iname not in kernel.all_inames():
+            raise RuntimeError("buffer iname '%s' is not a known iname"
+                    % iname)
+
+    buffer_inames = list(buffer_inames)
+    buffer_inames_set = frozenset(buffer_inames)
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    if var_name in kernel.arg_dict:
+        var_descr = kernel.arg_dict[var_name]
+    elif var_name in kernel.temporary_variables:
+        var_descr = kernel.temporary_variables[var_name]
+    else:
+        raise ValueError("variable '%s' not found" % var_name)
+
+    from loopy.kernel.data import ArrayBase
+    if isinstance(var_descr, ArrayBase):
+        var_shape = var_descr.shape
+    else:
+        # scalar: no index axes to buffer
+        var_shape = ()
+
+    if temporary_is_local is None:
+        import loopy as lp
+        temporary_is_local = lp.auto
+
+    # }}}
+
+    var_name_gen = kernel.get_var_name_generator()
+    within_inames = set()
+
+    # Gather access descriptors from *write* accesses to var_name; these
+    # determine the footprint the buffer must cover.  (Reads are handled
+    # per-access by ArrayAccessReplacer's footprint check below.)
+    access_descriptors = []
+    for insn in kernel.instructions:
+        if not within((insn.id, insn.tags)):
+            continue
+
+        for assignee, index in insn.assignees_and_indices():
+            if assignee == var_name:
+                within_inames.update(
+                        (get_dependencies(index) & kernel.all_inames())
+                        - buffer_inames_set)
+                access_descriptors.append(
+                        AccessDescriptor(
+                            identifier=insn.id,
+                            storage_axis_exprs=index))
+
+    # {{{ find fetch/store inames
+
+    init_inames = []
+    store_inames = []
+    new_iname_to_tag = {}
+
+    for i in range(len(var_shape)):
+        init_iname = var_name_gen("%s_init_%d" % (var_name, i))
+        store_iname = var_name_gen("%s_store_%d" % (var_name, i))
+
+        new_iname_to_tag[init_iname] = default_tag
+        new_iname_to_tag[store_iname] = default_tag
+
+        init_inames.append(init_iname)
+        store_inames.append(store_iname)
+
+    # }}}
+
+    # {{{ modify loop domain
+
+    non1_init_inames = []
+    non1_store_inames = []
+
+    if var_shape:
+        # {{{ find domain to be changed
+
+        from loopy.kernel.tools import DomainChanger
+        domch = DomainChanger(kernel, buffer_inames_set | within_inames)
+
+        if domch.leaf_domain_index is not None:
+            # If the sweep inames are at home in parent domains, then we'll add
+            # fetches with loops over copies of these parent inames that will end
+            # up being scheduled *within* loops over these parents.
+
+            for iname in buffer_inames_set:
+                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
+                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
+                            "sweep's leaf domain" % iname)
+
+        # }}}
+
+        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
+                access_descriptors, len(var_shape))
+
+        # Drop length-1 storage axes: they need no sweep iname.
+        for i in range(len(var_shape)):
+            if abm.non1_storage_axis_flags[i]:
+                non1_init_inames.append(init_inames[i])
+                non1_store_inames.append(store_inames[i])
+            else:
+                del new_iname_to_tag[init_inames[i]]
+                del new_iname_to_tag[store_inames[i]]
+
+        new_domain = domch.domain
+        new_domain = abm.augment_domain_with_sweep(
+                    new_domain, non1_init_inames,
+                    boxify_sweep=fetch_bounding_box)
+        new_domain = abm.augment_domain_with_sweep(
+                    new_domain, non1_store_inames,
+                    boxify_sweep=fetch_bounding_box)
+        new_kernel_domains = domch.get_domains_with(new_domain)
+        del new_domain
+
+    else:
+        # leave kernel domains unchanged
+        new_kernel_domains = kernel.domains
+
+        abm = NoOpArrayToBufferMap()
+
+    # }}}
+
+    # {{{ set up temp variable
+
+    import loopy as lp
+
+    buf_var_name = var_name_gen(based_on=var_name+"_buf")
+
+    new_temporary_variables = kernel.temporary_variables.copy()
+    temp_var = lp.TemporaryVariable(
+            name=buf_var_name,
+            dtype=var_descr.dtype,
+            base_indices=(0,)*len(abm.non1_storage_shape),
+            shape=tuple(abm.non1_storage_shape),
+            is_local=temporary_is_local)
+
+    new_temporary_variables[buf_var_name] = temp_var
+
+    # }}}
+
+    buf_var = var(buf_var_name)
+
+    # {{{ generate init instruction
+
+    buf_var_init = buf_var
+    if non1_init_inames:
+        buf_var_init = buf_var_init.index(
+                tuple(var(iname) for iname in non1_init_inames))
+
+    init_base = var(var_name)
+
+    init_subscript = []
+    init_iname_idx = 0
+    if var_shape:
+        for i in range(len(var_shape)):
+            # base index, plus the init iname for non-length-1 axes
+            ax_subscript = abm.storage_base_indices[i]
+            if abm.non1_storage_axis_flags[i]:
+                ax_subscript += var(non1_init_inames[init_iname_idx])
+                init_iname_idx += 1
+            init_subscript.append(ax_subscript)
+
+    if init_subscript:
+        init_base = init_base.index(tuple(init_subscript))
+
+    if init_expression is None:
+        # default: initialize the buffer from the array's prior contents
+        init_expression = init_base
+    else:
+        # substitute 'base' in the user-provided init expression
+        init_expression = SubstitutionMapper(
+                make_subst_func({
+                    "base": init_base,
+                    }))(init_expression)
+
+    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
+    from loopy.kernel.data import ExpressionInstruction
+    init_instruction = ExpressionInstruction(id=init_insn_id,
+                assignee=buf_var_init,
+                expression=init_expression,
+                forced_iname_deps=frozenset(within_inames),
+                insn_deps=frozenset(),
+                insn_deps_is_final=True,
+                )
+
+    # }}}
+
+    # Rewrite all in-footprint accesses to var_name through the buffer.
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
+            within, abm, buf_var)
+    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))
+
+    # A store-back instruction is only needed if some rewritten
+    # instruction actually writes the buffer.
+    did_write = False
+    for insn_id in aar.modified_insn_ids:
+        insn = kernel.id_to_insn[insn_id]
+        if any(assignee_name == buf_var_name
+                for assignee_name, _ in insn.assignees_and_indices()):
+            did_write = True
+
+    # {{{ add init_insn_id to insn_deps
+
+    new_insns = []
+
+    def none_to_empty_set(s):
+        # insn_deps may be None on untouched instructions
+        if s is None:
+            return frozenset()
+        else:
+            return s
+
+    for insn in kernel.instructions:
+        if insn.id in aar.modified_insn_ids:
+            new_insns.append(
+                    insn.copy(
+                        insn_deps=(
+                            none_to_empty_set(insn.insn_deps)
+                            | frozenset([init_insn_id]))))
+        else:
+            new_insns.append(insn)
+
+    # }}}
+
+    # {{{ generate store instruction
+
+    buf_var_store = buf_var
+    if non1_store_inames:
+        buf_var_store = buf_var_store.index(
+                tuple(var(iname) for iname in non1_store_inames))
+
+    store_subscript = []
+    store_iname_idx = 0
+    if var_shape:
+        for i in range(len(var_shape)):
+            # base index, plus the store iname for non-length-1 axes
+            ax_subscript = abm.storage_base_indices[i]
+            if abm.non1_storage_axis_flags[i]:
+                ax_subscript += var(non1_store_inames[store_iname_idx])
+                store_iname_idx += 1
+            store_subscript.append(ax_subscript)
+
+    store_target = var(var_name)
+    if store_subscript:
+        store_target = store_target.index(tuple(store_subscript))
+
+    if store_expression is None:
+        store_expression = buf_var_store
+    else:
+        # substitute 'base' and 'buffer' in the user-provided expression
+        store_expression = SubstitutionMapper(
+                make_subst_func({
+                    "base": store_target,
+                    "buffer": buf_var_store,
+                    }))(store_expression)
+
+    store_instruction = ExpressionInstruction(
+                id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
+                insn_deps=frozenset(aar.modified_insn_ids),
+                assignee=store_target,
+                expression=store_expression,
+                forced_iname_deps=frozenset(within_inames))
+
+    # }}}
+
+    new_insns.append(init_instruction)
+    if did_write:
+        new_insns.append(store_instruction)
+
+    kernel = kernel.copy(
+            domains=new_kernel_domains,
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables)
+
+    from loopy import tag_inames
+    kernel = tag_inames(kernel, new_iname_to_tag)
+
+    return kernel
+
+# vim: foldmethod=marker
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index dc0d0d4538c618bf1115b8636e5b50e826882b11..56d7f6706c8932a9a97202768bdd0053c5c36d10 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -74,6 +74,10 @@ class TypeInferenceFailure(LoopyError):
     pass
 
 
+class AutomaticTestFailure(LoopyError):
+    """Raised when an automatic comparison of computed results against
+    reference results fails (replaces a bare assert in auto-testing)."""
+    pass
+
+
 class DependencyTypeInferenceFailure(TypeInferenceFailure):
     def __init__(self, message, symbol):
         TypeInferenceFailure.__init__(self, message)
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 93be120d2cca92e9e887ffb7908816da82b2c925..fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -354,4 +354,13 @@ def boxify(cache_manager, domain, box_inames, context):
 
     return convexify(result)
 
+
+def simplify_via_aff(expr):
+    """Simplify *expr* by round-tripping it through an isl affine
+    expression built on a space containing exactly *expr*'s dependencies.
+
+    NOTE(review): presumably only valid for quasi-affine *expr* --
+    non-affine input would be rejected by aff_from_expr; confirm.
+    """
+    from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies
+    deps = get_dependencies(expr)
+    return aff_to_expr(aff_from_expr(
+        isl.Space.create_from_names(isl.Context(), list(deps)),
+        expr))
+
+
 # vim: foldmethod=marker
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 68f7f131c4c40d2f647fb6c6095959b4884a3c41..14562b384a89f1455a3a29ac161682889f1d3e01 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -79,7 +79,7 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):
 
 # {{{ loop kernel object
 
-class kernel_state:
+class kernel_state:  # noqa
     INITIAL = 0
     PREPROCESSED = 1
     SCHEDULED = 2
@@ -922,12 +922,6 @@ class LoopKernel(RecordWithoutPickling):
             line = "%s: %s" % (iname, self.iname_to_tag.get(iname))
             lines.append(line)
 
-        if self.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
-            for rule_name in sorted(six.iterkeys(self.substitutions)):
-                lines.append(str(self.substitutions[rule_name]))
-
         if self.temporary_variables:
             lines.append(sep)
             lines.append("TEMPORARIES:")
@@ -935,6 +929,12 @@ class LoopKernel(RecordWithoutPickling):
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
+        if self.substitutions:
+            lines.append(sep)
+            lines.append("SUBSTIUTION RULES:")
+            for rule_name in sorted(six.iterkeys(self.substitutions)):
+                lines.append(str(self.substitutions[rule_name]))
+
         lines.append(sep)
         lines.append("INSTRUCTIONS:")
         loop_list_width = 35
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index ee14042d11263b6711b022fc546e7d1979f0da6b..3f5340d645495de42e8e384dc07c34d138be87f4 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -342,6 +342,15 @@ def parse_array_dim_tags(dim_tags, use_increasing_target_axes=False):
                             for nl in ta_nesting_levels),
                         target_axis))
 
+        ta_nesting_level_increment = -min(ta_nesting_levels)
+        for i in range(len(result)):
+            if (isinstance(result[i], _StrideArrayDimTagBase)
+                    and result[i].target_axis == target_axis
+                    and result[i].layout_nesting_level is not None):
+                result[i] = result[i].copy(
+                        layout_nesting_level=result[i].layout_nesting_level
+                        + ta_nesting_level_increment)
+
     # }}}
 
     return result
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index a21d983d7059499a16242f6c4857a2a7332a4f82..e99dedbf9b4ec36dfebe082da970ff260e2fb27e 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -459,16 +459,16 @@ class ArgumentGuesser:
                 (assignee_var_name, _), = insn.assignees_and_indices()
                 self.all_written_names.add(assignee_var_name)
                 self.all_names.update(get_dependencies(
-                    self.submap(insn.assignee, insn.id, insn.tags)))
+                    self.submap(insn.assignee)))
                 self.all_names.update(get_dependencies(
-                    self.submap(insn.expression, insn.id, insn.tags)))
+                    self.submap(insn.expression)))
 
     def find_index_rank(self, name):
         irf = IndexRankFinder(name)
 
         for insn in self.instructions:
             insn.with_transformed_expressions(
-                    lambda expr: irf(self.submap(expr, insn.id, insn.tags)))
+                    lambda expr: irf(self.submap(expr)))
 
         if not irf.index_ranks:
             return 0
@@ -859,8 +859,7 @@ def guess_arg_shape_if_requested(kernel, default_order):
     from loopy.kernel.array import ArrayBase
     from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper
 
-    submap = SubstitutionRuleExpander(kernel.substitutions,
-            kernel.get_var_name_generator())
+    submap = SubstitutionRuleExpander(kernel.substitutions)
 
     for arg in kernel.args:
         if isinstance(arg, ArrayBase) and arg.shape is lp.auto:
@@ -869,11 +868,12 @@ def guess_arg_shape_if_requested(kernel, default_order):
             try:
                 for insn in kernel.instructions:
                     if isinstance(insn, lp.ExpressionInstruction):
-                        armap(submap(insn.assignee, insn.id, insn.tags),
-                                kernel.insn_inames(insn))
-                        armap(submap(insn.expression, insn.id, insn.tags),
-                                kernel.insn_inames(insn))
+                        armap(submap(insn.assignee), kernel.insn_inames(insn))
+                        armap(submap(insn.expression), kernel.insn_inames(insn))
             except TypeError as e:
+                from traceback import print_exc
+                print_exc()
+
                 from loopy.diagnostic import LoopyError
                 raise LoopyError(
                         "Failed to (automatically, as requested) find "
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 871f5359b925703a8e71729af7585e51d051f7ca..ab747b9a66f82044e0fe7dc4591867fdd34d3519 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -28,10 +28,10 @@ THE SOFTWARE.
 import numpy as np
 from pytools import Record, memoize_method
 from loopy.kernel.array import ArrayBase
-from loopy.diagnostic import LoopyError  # noqa
+from loopy.diagnostic import LoopyError
 
 
-class auto(object):
+class auto(object):  # noqa
     """A generic placeholder object for something that should be automatically
     detected.  See, for example, the *shape* or *strides* argument of
     :class:`GlobalArg`.
@@ -484,6 +484,10 @@ class InstructionBase(Record):
         if insn_deps_is_final is None:
             insn_deps_is_final = False
 
+        if insn_deps_is_final and not isinstance(insn_deps, frozenset):
+            raise LoopyError("Setting insn_deps_is_final to True requires "
+                    "actually specifying insn_deps")
+
         if tags is None:
             tags = ()
 
diff --git a/loopy/padding.py b/loopy/padding.py
index 8f747d413e0182f2a863bbe13d745362b993d1ee..4951c68543bc143f25a2c376b183d0f76e5d6c18 100644
--- a/loopy/padding.py
+++ b/loopy/padding.py
@@ -25,12 +25,12 @@ THE SOFTWARE.
 """
 
 
-from loopy.symbolic import ExpandingIdentityMapper
+from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
 
 
-class ArgAxisSplitHelper(ExpandingIdentityMapper):
-    def __init__(self, rules, var_name_gen, arg_names, handler):
-        ExpandingIdentityMapper.__init__(self, rules, var_name_gen)
+class ArgAxisSplitHelper(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, arg_names, handler):
+        super(ArgAxisSplitHelper, self).__init__(rule_mapping_context)
         self.arg_names = arg_names
         self.handler = handler
 
@@ -38,7 +38,7 @@ class ArgAxisSplitHelper(ExpandingIdentityMapper):
         if expr.aggregate.name in self.arg_names:
             return self.handler(expr)
         else:
-            return ExpandingIdentityMapper.map_subscript(self, expr, expn_state)
+            return super(ArgAxisSplitHelper, self).map_subscript(expr, expn_state)
 
 
 def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True):
@@ -205,9 +205,11 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True):
 
         return expr.aggregate.index(tuple(idx))
 
-    aash = ArgAxisSplitHelper(kernel.substitutions, var_name_gen,
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, var_name_gen)
+    aash = ArgAxisSplitHelper(rule_mapping_context,
             set(six.iterkeys(arg_to_rest)), split_access_axis)
-    kernel = aash.map_kernel(kernel)
+    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))
 
     kernel = kernel.copy(args=new_args)
 
diff --git a/loopy/precompute.py b/loopy/precompute.py
index de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c..e516dde139d370f60b05fe3d6740d6fbd9d833e8 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -25,15 +25,15 @@ THE SOFTWARE.
 """
 
 
-import islpy as isl
-from loopy.symbolic import (get_dependencies, SubstitutionMapper,
-        ExpandingIdentityMapper)
+from loopy.symbolic import (get_dependencies,
+        RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
+        SubstitutionRuleMappingContext)
 from pymbolic.mapper.substitutor import make_subst_func
 import numpy as np
 
 from pymbolic import var
 
-from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap,
+from loopy.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap,
         AccessDescriptor)
 
 
@@ -57,18 +57,11 @@ def storage_axis_exprs(storage_axis_sources, args):
     return result
 
 
-def simplify_via_aff(expr):
-    from loopy.symbolic import aff_from_expr, aff_to_expr
-    deps = get_dependencies(expr)
-    return aff_to_expr(aff_from_expr(
-        isl.Space.create_from_names(isl.Context(), list(deps)),
-        expr))
+# {{{ gather rule invocations
 
-
-class RuleInvocationGatherer(ExpandingIdentityMapper):
-    def __init__(self, kernel, subst_name, subst_tag, within):
-        ExpandingIdentityMapper.__init__(self,
-                kernel.substitutions, kernel.get_var_name_generator())
+class RuleInvocationGatherer(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within):
+        super(RuleInvocationGatherer, self).__init__(rule_mapping_context)
 
         from loopy.symbolic import SubstitutionRuleExpander
         self.subst_expander = SubstitutionRuleExpander(
@@ -90,18 +83,17 @@ class RuleInvocationGatherer(ExpandingIdentityMapper):
         process_me = process_me and self.within(expn_state.stack)
 
         if not process_me:
-            return ExpandingIdentityMapper.map_substitution(
-                    self, name, tag, arguments, expn_state)
+            return super(RuleInvocationGatherer, self).map_substitution(
+                    name, tag, arguments, expn_state)
 
-        rule = self.old_subst_rules[name]
+        rule = self.rule_mapping_context.old_subst_rules[name]
         arg_context = self.make_new_arg_context(
                     name, rule.arguments, arguments, expn_state.arg_context)
 
         arg_deps = set()
         for arg_val in six.itervalues(arg_context):
             arg_deps = (arg_deps
-                    | get_dependencies(self.subst_expander(
-                        arg_val, insn_id=None, insn_tags=None)))
+                    | get_dependencies(self.subst_expander(arg_val)))
 
         # FIXME: This is too strict--and the footprint machinery
         # needs to be taught how to deal with locally constant
@@ -116,12 +108,11 @@ class RuleInvocationGatherer(ExpandingIdentityMapper):
                         ", ".join(arg_deps - self.kernel.all_inames()),
                         ))
 
-            return ExpandingIdentityMapper.map_substitution(
-                    self, name, tag, arguments, expn_state)
+            return super(RuleInvocationGatherer, self).map_substitution(
+                    name, tag, arguments, expn_state)
 
         args = [arg_context[arg_name] for arg_name in rule.arguments]
 
-        # Do not set expands_footprint here, it is set below.
         self.access_descriptors.append(
                 RuleAccessDescriptor(
                     identifier=access_descriptor_id(args, expn_state.stack),
@@ -130,21 +121,19 @@ class RuleInvocationGatherer(ExpandingIdentityMapper):
 
         return 0  # exact value irrelevant
 
+# }}}
+
+
+# {{{ replace rule invocation
 
-class RuleInvocationReplacer(ExpandingIdentityMapper):
-    def __init__(self, kernel, subst_name, subst_tag, within,
+class RuleInvocationReplacer(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, subst_name, subst_tag, within,
             access_descriptors, array_base_map,
             storage_axis_names, storage_axis_sources,
-            storage_base_indices, non1_storage_axis_names,
+            non1_storage_axis_names,
             target_var_name):
-        ExpandingIdentityMapper.__init__(self,
-                kernel.substitutions, kernel.get_var_name_generator())
-
-        from loopy.symbolic import SubstitutionRuleExpander
-        self.subst_expander = SubstitutionRuleExpander(
-                kernel.substitutions, kernel.get_var_name_generator())
+        super(RuleInvocationReplacer, self).__init__(rule_mapping_context)
 
-        self.kernel = kernel
         self.subst_name = subst_name
         self.subst_tag = subst_tag
         self.within = within
@@ -154,55 +143,44 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
 
         self.storage_axis_names = storage_axis_names
         self.storage_axis_sources = storage_axis_sources
-        self.storage_base_indices = storage_base_indices
         self.non1_storage_axis_names = non1_storage_axis_names
 
         self.target_var_name = target_var_name
 
     def map_substitution(self, name, tag, arguments, expn_state):
-        process_me = name == self.subst_name
+        if not (
+                name == self.subst_name
+                and self.within(expn_state.stack)
+                and (self.subst_tag is None or self.subst_tag == tag)):
+            return super(RuleInvocationReplacer, self).map_substitution(
+                    name, tag, arguments, expn_state)
 
-        if self.subst_tag is not None and self.subst_tag != tag:
-            process_me = False
+        # {{{ check if in footprint
 
-        process_me = process_me and self.within(expn_state.stack)
-
-        # {{{ find matching invocation descriptor
-
-        rule = self.old_subst_rules[name]
+        rule = self.rule_mapping_context.old_subst_rules[name]
         arg_context = self.make_new_arg_context(
                     name, rule.arguments, arguments, expn_state.arg_context)
         args = [arg_context[arg_name] for arg_name in rule.arguments]
 
-        if not process_me:
-            return ExpandingIdentityMapper.map_substitution(
-                    self, name, tag, arguments, expn_state)
-
-        matching_accdesc = None
-        for accdesc in self.access_descriptors:
-            if accdesc.identifier == access_descriptor_id(args, expn_state.stack):
-                # Could be more than one, that's fine.
-                matching_accdesc = accdesc
-                break
-
-        assert matching_accdesc is not None
+        accdesc = AccessDescriptor(
+                storage_axis_exprs=storage_axis_exprs(
+                    self.storage_axis_sources, args))
 
-        accdesc = matching_accdesc
-        del matching_accdesc
+        if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
+            return super(RuleInvocationReplacer, self).map_substitution(
+                    name, tag, arguments, expn_state)
 
         # }}}
 
-        if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
-            return ExpandingIdentityMapper.map_substitution(
-                    self, name, tag, arguments, expn_state)
-
         assert len(arguments) == len(rule.arguments)
 
+        abm = self.array_base_map
+
         stor_subscript = []
         for sax_name, sax_source, sax_base_idx in zip(
                 self.storage_axis_names,
                 self.storage_axis_sources,
-                self.storage_base_indices):
+                abm.storage_base_indices):
             if sax_name not in self.non1_storage_axis_names:
                 continue
 
@@ -213,6 +191,7 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
                 # an iname
                 ax_index = var(sax_source)
 
+            from loopy.isl_helpers import simplify_via_aff
             ax_index = simplify_via_aff(ax_index - sax_base_idx)
             stor_subscript.append(ax_index)
 
@@ -220,13 +199,14 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
         if stor_subscript:
             new_outer_expr = new_outer_expr.index(tuple(stor_subscript))
 
-        # Can't possibly be nested, but recurse anyway to
-        # make sure substitution rules referenced below here
-        # do not get thrown away.
-        self.rec(rule.expression, expn_state.copy(arg_context={}))
+        # Can't possibly be nested, and no need to traverse
+        # further as compute expression has already been seen
+        # by rule_mapping_context.
 
         return new_outer_expr
 
+# }}}
+
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={},
@@ -301,6 +281,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             raise RuntimeError("sweep iname '%s' is not a known iname"
                     % iname)
 
+    sweep_inames = list(sweep_inames)
+    sweep_inames_set = frozenset(sweep_inames)
+
     if isinstance(storage_axes, str):
         storage_axes = storage_axes.split(",")
 
@@ -357,11 +340,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ process invocations in footprint generators, start access_descriptors
 
-    access_descriptors = []
-
     if footprint_generators:
         from pymbolic.primitives import Variable, Call
 
+        access_descriptors = []
+
         for fpg in footprint_generators:
             if isinstance(fpg, Variable):
                 args = ()
@@ -374,7 +357,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             access_descriptors.append(
                     RuleAccessDescriptor(
                         identifier=access_descriptor_id(args, None),
-                        expands_footprint=True,
                         args=args
                         ))
 
@@ -382,34 +364,33 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ gather up invocations in kernel code, finish access_descriptors
 
-    invg = RuleInvocationGatherer(kernel, subst_name, subst_tag, within)
-
-    import loopy as lp
-    for insn in kernel.instructions:
-        if isinstance(insn, lp.ExpressionInstruction):
-            invg(insn.expression, insn.id, insn.tags)
+    if not footprint_generators:
+        rule_mapping_context = SubstitutionRuleMappingContext(
+                kernel.substitutions, kernel.get_var_name_generator())
+        invg = RuleInvocationGatherer(
+                rule_mapping_context, kernel, subst_name, subst_tag, within)
+        del rule_mapping_context
 
-    for accdesc in invg.access_descriptors:
-        access_descriptors.append(
-                accdesc.copy(expands_footprint=footprint_generators is None))
+        import loopy as lp
+        for insn in kernel.instructions:
+            if isinstance(insn, lp.ExpressionInstruction):
+                invg(insn.assignee, insn.id, insn.tags)
+                invg(insn.expression, insn.id, insn.tags)
 
-    if not access_descriptors:
-        raise RuntimeError("no invocations of '%s' found" % subst_name)
+        access_descriptors = invg.access_descriptors
+        if not access_descriptors:
+            raise RuntimeError("no invocations of '%s' found" % subst_name)
 
     # }}}
 
-    sweep_inames = list(sweep_inames)
-    sweep_inames_set = frozenset(sweep_inames)
-
     # {{{ find inames used in arguments
 
     expanding_usage_arg_deps = set()
 
     for accdesc in access_descriptors:
-        if accdesc.expands_footprint:
-            for arg in accdesc.args:
-                expanding_usage_arg_deps.update(
-                        get_dependencies(arg) & kernel.all_inames())
+        for arg in accdesc.args:
+            expanding_usage_arg_deps.update(
+                    get_dependencies(arg) & kernel.all_inames())
 
     # }}}
 
@@ -424,7 +405,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     submap = SubstitutionRuleExpander(kernel.substitutions)
 
     value_inames = get_dependencies(
-            submap(subst.expression, insn_id=None, insn_tags=None)
+            submap(subst.expression)
             ) & kernel.all_inames()
     if value_inames - expanding_usage_arg_deps < extra_storage_axes:
         raise RuntimeError("unreferenced sweep inames specified: "
@@ -438,7 +419,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 list(extra_storage_axes)
                 + list(range(len(subst.arguments))))
 
-    expr_subst_dict = {}
+    prior_storage_axis_name_dict = {}
 
     storage_axis_names = []
     storage_axis_sources = []  # number for arg#, or iname
@@ -471,18 +452,12 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         new_iname_to_tag[name] = storage_axis_to_tag.get(
                 tag_lookup_saxis, default_tag)
 
-        expr_subst_dict[old_name] = var(name)
+        prior_storage_axis_name_dict[name] = old_name
 
     del storage_axis_to_tag
     del storage_axes
     del new_storage_axis_names
 
-    compute_expr = (
-            SubstitutionMapper(make_subst_func(expr_subst_dict))
-            (subst.expression))
-
-    del expr_subst_dict
-
     # }}}
 
     # {{{ fill out access_descriptors[...].storage_axis_exprs
@@ -535,48 +510,68 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         # leave kernel domains unchanged
         new_kernel_domains = kernel.domains
 
+        non1_storage_axis_names = []
         abm = NoOpArrayToBufferMap()
 
+    kernel = kernel.copy(domains=new_kernel_domains)
+
     # {{{ set up compute insn
 
     target_var_name = var_name_gen(based_on=c_subst_name)
-
     assignee = var(target_var_name)
 
     if non1_storage_axis_names:
         assignee = assignee.index(
                 tuple(var(iname) for iname in non1_storage_axis_names))
 
-    def zero_length_1_arg(arg_name):
+    # {{{ process substitutions on compute instruction
+
+    storage_axis_subst_dict = {}
+
+    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
         if arg_name in non1_storage_axis_names:
-            return var(arg_name)
+            arg = var(arg_name)
         else:
-            return 0
+            arg = 0
+
+        storage_axis_subst_dict[
+                prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi
 
-    compute_expr = (SubstitutionMapper(
-        make_subst_func(dict(
-            (arg_name, zero_length_1_arg(arg_name)+bi)
-            for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices)
-            )))
-        (compute_expr))
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+
+    from loopy.context_matching import AllStackMatch
+    expr_subst_map = RuleAwareSubstitutionMapper(
+            rule_mapping_context,
+            make_subst_func(storage_axis_subst_dict),
+            within=AllStackMatch())
+
+    compute_expression = expr_subst_map(subst.expression, None, None)
+
+    # }}}
 
     from loopy.kernel.data import ExpressionInstruction
+    compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name)
     compute_insn = ExpressionInstruction(
-            id=kernel.make_unique_instruction_id(based_on=c_subst_name),
+            id=compute_insn_id,
             assignee=assignee,
-            expression=compute_expr)
+            expression=compute_expression)
 
     # }}}
 
     # {{{ substitute rule into expressions in kernel (if within footprint)
 
-    invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within,
+    invr = RuleInvocationReplacer(rule_mapping_context,
+            subst_name, subst_tag, within,
             access_descriptors, abm,
             storage_axis_names, storage_axis_sources,
-            abm.storage_base_indices, non1_storage_axis_names,
+            non1_storage_axis_names,
             target_var_name)
 
     kernel = invr.map_kernel(kernel)
+    kernel = kernel.copy(
+            instructions=[compute_insn] + kernel.instructions)
+    kernel = rule_mapping_context.finish_kernel(kernel)
 
     # }}}
 
@@ -603,13 +598,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     new_temporary_variables[target_var_name] = temp_var
 
-    # }}}
-
     kernel = kernel.copy(
-            domains=new_kernel_domains,
-            instructions=[compute_insn] + kernel.instructions,
             temporary_variables=new_temporary_variables)
 
+    # }}}
+
     from loopy import tag_inames
     return tag_inames(kernel, new_iname_to_tag)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 64736a40a210d526ddbfa01854daac76b5daa1e7..f9cd3f3eb2140354decf61b475ba1742d5ea6832 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -26,8 +26,8 @@ THE SOFTWARE.
 import six
 import numpy as np
 from loopy.diagnostic import (
-        LoopyError, LoopyWarning, WriteRaceConditionWarning, warn,
-        LoopyAdvisory)
+        LoopyError, WriteRaceConditionWarning, warn,
+        LoopyAdvisory, DependencyTypeInferenceFailure)
 
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
@@ -88,8 +88,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         if not isinstance(writer_insn, lp.ExpressionInstruction):
             continue
 
-        expr = subst_expander(writer_insn.expression,
-                insn_id=writer_insn_id, insn_tags=writer_insn.tags)
+        expr = subst_expander(writer_insn.expression)
 
         try:
             debug("             via expr %s" % expr)
@@ -139,12 +138,8 @@ def infer_unknown_types(kernel, expect_completion=False):
     def debug(s):
         logger.debug("%s: %s" % (kernel.name, s))
 
+    unexpanded_kernel = kernel
     if kernel.substitutions:
-        from warnings import warn as py_warn
-        py_warn("type inference called when substitution "
-                "rules are still unexpanded, expanding",
-                LoopyWarning, stacklevel=2)
-
         from loopy.subst import expand_subst
         kernel = expand_subst(kernel)
 
@@ -175,8 +170,7 @@ def infer_unknown_types(kernel, expect_completion=False):
                 ]))
 
     from loopy.symbolic import SubstitutionRuleExpander
-    subst_expander = SubstitutionRuleExpander(kernel.substitutions,
-            kernel.get_var_name_generator())
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
 
     # {{{ work on type inference queue
 
@@ -240,7 +234,7 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     # }}}
 
-    return kernel.copy(
+    return unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
@@ -419,7 +413,11 @@ def realize_reduction(kernel, insn_id_filter=None):
         target_var_name = var_name_gen("acc_"+"_".join(expr.inames))
         target_var = var(target_var_name)
 
-        arg_dtype = type_inf_mapper(expr.expr)
+        try:
+            arg_dtype = type_inf_mapper(expr.expr)
+        except DependencyTypeInferenceFailure:
+            raise LoopyError("failed to determine type of accumulator for "
+                    "reduction '%s'" % expr)
 
         from loopy.kernel.data import ExpressionInstruction, TemporaryVariable
 
@@ -845,8 +843,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
                 continue
             coeffs = CoefficientCollector()(iexpr_i)
             for var, coeff in six.iteritems(coeffs):
-                assert isinstance(var, Variable)
-                if var.name in auto_axis_inames:  # excludes '1', i.e.  the constant
+                if (isinstance(var, Variable)
+                        and var.name in auto_axis_inames):
+                    # excludes '1', i.e.  the constant
                     new_stride = coeff*stride
                     old_stride = iname_to_stride_expr.get(var.name, None)
                     if old_stride is None or new_stride < old_stride:
diff --git a/loopy/subst.py b/loopy/subst.py
index d4dba4b5dfd9918926adaad2af9179813914e8be..3b112a4fd44880f6cd66019aff1023ee426ae125 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 
 from loopy.symbolic import (
         get_dependencies, SubstitutionMapper,
-        ExpandingIdentityMapper)
+        RuleAwareIdentityMapper, SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
 from pymbolic.mapper.substitutor import make_subst_func
 
@@ -200,19 +200,20 @@ def extract_subst(kernel, subst_name, template, parameters=()):
 
 # {{{ temporary_to_subst
 
-class TemporaryToSubstChanger(ExpandingIdentityMapper):
-    def __init__(self, kernel, temp_name, definition_insn_ids,
-            usage_to_definition, within):
-        self.var_name_gen = kernel.get_var_name_generator()
+class TemporaryToSubstChanger(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, temp_name, definition_insn_ids,
+            usage_to_definition, extra_arguments, within):
+        self.var_name_gen = rule_mapping_context.make_unique_var_name
 
-        super(TemporaryToSubstChanger, self).__init__(
-                kernel.substitutions, self.var_name_gen)
+        super(TemporaryToSubstChanger, self).__init__(rule_mapping_context)
 
-        self.kernel = kernel
         self.temp_name = temp_name
         self.definition_insn_ids = definition_insn_ids
         self.usage_to_definition = usage_to_definition
 
+        from pymbolic import var
+        self.extra_arguments = tuple(var(arg) for arg in extra_arguments)
+
         self.within = within
 
         self.definition_insn_id_to_subst_name = {}
@@ -230,7 +231,8 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper):
             return subst_name
 
     def map_variable(self, expr, expn_state):
-        if expr.name == self.temp_name:
+        if (expr.name == self.temp_name
+                and expr.name not in expn_state.arg_context):
             result = self.transform_access(None, expn_state)
             if result is not None:
                 return result
@@ -239,7 +241,8 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper):
                 expr, expn_state)
 
     def map_subscript(self, expr, expn_state):
-        if expr.aggregate.name == self.temp_name:
+        if (expr.aggregate.name == self.temp_name
+                and expr.aggregate.name not in expn_state.arg_context):
             result = self.transform_access(expr.index, expn_state)
             if result is not None:
                 return result
@@ -248,7 +251,7 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper):
                 expr, expn_state)
 
     def transform_access(self, index, expn_state):
-        my_insn_id = expn_state.stack[0][0]
+        my_insn_id = expn_state.insn_id
 
         if my_insn_id in self.definition_insn_ids:
             return None
@@ -259,10 +262,14 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper):
             self.saw_unmatched_usage_sites[my_def_id] = True
             return None
 
-        my_insn_id = expn_state.stack[0][0]
-
         subst_name = self.get_subst_name(my_def_id)
 
+        if self.extra_arguments:
+            if index is None:
+                index = self.extra_arguments
+            else:
+                index = index + self.extra_arguments
+
         from pymbolic import var
         if index is None:
             return var(subst_name)
@@ -270,9 +277,11 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper):
             return var(subst_name)(*index)
 
 
-def temporary_to_subst(kernel, temp_name, within=None):
+def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
     """Extract an assignment to a temporary variable
-    as a :ref:`substituion-rule`. The temporary may
+    as a :ref:`substitution-rule`. The temporary may be an array, in
+    which case the array indices will become arguments to the substitution
+    rule.
 
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
@@ -284,6 +293,9 @@ def temporary_to_subst(kernel, temp_name, within=None):
     as the temporary variable is left in place.
     """
 
+    if isinstance(extra_arguments, str):
+        extra_arguments = tuple(s.strip() for s in extra_arguments.split(","))
+
     # {{{ establish the relevant definition of temp_name for each usage site
 
     dep_kernel = expand_subst(kernel)
@@ -347,10 +359,13 @@ def temporary_to_subst(kernel, temp_name, within=None):
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
 
-    tts = TemporaryToSubstChanger(kernel, temp_name, definition_insn_ids,
-            usage_to_definition, within)
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    tts = TemporaryToSubstChanger(rule_mapping_context,
+            temp_name, definition_insn_ids,
+            usage_to_definition, extra_arguments, within)
 
-    kernel = tts.map_kernel(kernel)
+    kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel))
 
     from loopy.kernel.data import SubstitutionRule
 
@@ -376,7 +391,7 @@ def temporary_to_subst(kernel, temp_name, within=None):
 
         new_substs[subst_name] = SubstitutionRule(
                 name=subst_name,
-                arguments=tuple(arguments),
+                arguments=tuple(arguments) + extra_arguments,
                 expression=def_insn.expression)
 
     # }}}
@@ -410,19 +425,18 @@ def temporary_to_subst(kernel, temp_name, within=None):
 # }}}
 
 
-def expand_subst(kernel, ctx_match=None):
+def expand_subst(kernel, within=None):
     logger.debug("%s: expand subst" % kernel.name)
 
-    from loopy.symbolic import SubstitutionRuleExpander
+    from loopy.symbolic import RuleAwareSubstitutionRuleExpander
     from loopy.context_matching import parse_stack_match
-    submap = SubstitutionRuleExpander(kernel.substitutions,
-            kernel.get_var_name_generator(),
-            parse_stack_match(ctx_match))
-
-    kernel = submap.map_kernel(kernel)
-    if ctx_match is None:
-        return kernel.copy(substitutions={})
-    else:
-        return kernel
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    submap = RuleAwareSubstitutionRuleExpander(
+            rule_mapping_context,
+            kernel.substitutions,
+            parse_stack_match(within))
+
+    return rule_mapping_context.finish_kernel(submap.map_kernel(kernel))
 
 # vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 4492f032d0d355e9268dab049b55c0613b22b47b..8897f35938ee29e50d70edfdc52945bc2ad856ac 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1,11 +1,8 @@
 """Pymbolic mappers for loopy."""
 
-from __future__ import division
-from __future__ import absolute_import
+from __future__ import division, absolute_import
 import six
-from six.moves import range
-from six.moves import zip
-from functools import reduce
+from six.moves import range, zip, reduce
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -197,6 +194,41 @@ class DependencyMapper(DependencyMapperBase):
 
     map_linear_subscript = DependencyMapperBase.map_subscript
 
+
+class SubstitutionRuleExpander(IdentityMapper):
+    def __init__(self, rules):
+        self.rules = rules
+
+    def map_variable(self, expr):
+        if expr.name in self.rules:
+            return self.map_substitution(expr.name, self.rules[expr.name], ())
+        else:
+            return super(SubstitutionRuleExpander, self).map_variable(expr)
+
+    def map_call(self, expr):
+        if expr.function.name in self.rules:
+            return self.map_substitution(
+                    expr.function.name,
+                    self.rules[expr.function.name],
+                    expr.parameters)
+        else:
+            return super(SubstitutionRuleExpander, self).map_call(expr)
+
+    def map_substitution(self, name, rule, arguments):
+        if len(rule.arguments) != len(arguments):
+            from loopy.diagnostic import LoopyError
+            raise LoopyError("number of arguments to '%s' does not match "
+                    "definition" % name)
+
+        from pymbolic.mapper.substitutor import make_subst_func
+        submap = SubstitutionMapper(
+                make_subst_func(
+                    dict(zip(rule.arguments, arguments))))
+
+        expr = submap(rule.expression)
+
+        return self.rec(expr)
+
 # }}}
 
 
@@ -333,7 +365,7 @@ def get_dependencies(expr):
     return frozenset(dep.name for dep in dep_mapper(expr))
 
 
-# {{{ identity mapper that expands subst rules on the fly
+# {{{ rule-aware mappers
 
 def parse_tagged_name(expr):
     if isinstance(expr, TaggedVariable):
@@ -346,12 +378,26 @@ def parse_tagged_name(expr):
 
 class ExpansionState(Record):
     """
-    :ivar stack: a tuple representing the current expansion stack, as a tuple
+    .. attribute:: stack
+
+        a tuple representing the current expansion stack, as a tuple
         of (name, tag) pairs. At the top level, this should be initialized to a
         tuple with the id of the calling instruction.
-    :ivar arg_context: a dict representing current argument values
+
+    .. attribute:: arg_context
+
+        a dict representing current argument values
     """
 
+    @property
+    def insn_id(self):
+        return self.stack[0][0]
+
+    def apply_arg_context(self, expr):
+        from pymbolic.mapper.substitutor import make_subst_func
+        return SubstitutionMapper(
+                make_subst_func(self.arg_context))(expr)
+
 
 class SubstitutionRuleRenamer(IdentityMapper):
     def __init__(self, renames):
@@ -395,11 +441,7 @@ def rename_subst_rules_in_instructions(insns, renames):
             for insn in insns]
 
 
-class ExpandingIdentityMapper(IdentityMapper):
-    """Note: the third argument dragged around by this mapper is the
-    current :class:`ExpansionState`.
-    """
-
+class SubstitutionRuleMappingContext(object):
     def __init__(self, old_subst_rules, make_unique_var_name):
         self.old_subst_rules = old_subst_rules
         self.make_unique_var_name = make_unique_var_name
@@ -428,9 +470,84 @@ class ExpandingIdentityMapper(IdentityMapper):
         self.subst_rule_use_count[key] = self.subst_rule_use_count.get(key, 0) + 1
         return new_name
 
+    def _get_new_substitutions_and_renames(self):
+        """This makes a new dictionary of substitutions from the ones
+        encountered in mapping all the encountered expressions.
+        It tries hard to keep substitution names the same--i.e.
+        if all derivative versions of a substitution rule ended
+        up with the same mapped version, then this version should
+        retain the name that the substitution rule had previously.
+        Unfortunately, this can't be done in a single pass, and so
+        the routine returns an additional dictionary *subst_renames*
+        of renamings to be performed on the processed expressions.
+
+        The returned substitutions already have the rename applied
+        to them.
+
+        :returns: (new_substitutions, subst_renames)
+        """
+
+        from loopy.kernel.data import SubstitutionRule
+
+        orig_name_histogram = {}
+        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
+            if self.subst_rule_use_count.get(key, 0):
+                orig_name_histogram[orig_name] = \
+                        orig_name_histogram.get(orig_name, 0) + 1
+
+        result = {}
+        renames = {}
+
+        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
+            args, body = key
+
+            if self.subst_rule_use_count.get(key, 0):
+                if orig_name_histogram[orig_name] == 1 and name != orig_name:
+                    renames[name] = orig_name
+                    name = orig_name
+
+                result[name] = SubstitutionRule(
+                        name=name,
+                        arguments=args,
+                        expression=body)
+
+        # {{{ perform renames on new substitutions
+
+        subst_renamer = SubstitutionRuleRenamer(renames)
+
+        renamed_result = {}
+        for name, rule in six.iteritems(result):
+            renamed_result[name] = rule.copy(
+                    expression=subst_renamer(rule.expression))
+
+        # }}}
+
+        return renamed_result, renames
+
+    def finish_kernel(self, kernel):
+        new_substs, renames = self._get_new_substitutions_and_renames()
+
+        new_insns = rename_subst_rules_in_instructions(kernel.instructions, renames)
+
+        return kernel.copy(
+            substitutions=new_substs,
+            instructions=new_insns)
+
+
+class RuleAwareIdentityMapper(IdentityMapper):
+    """Note: the third argument dragged around by this mapper is the
+    current :class:`ExpansionState`.
+
+    Subclasses of this must be careful not to touch identifiers that
+    are in :attr:`ExpansionState.arg_context`.
+    """
+
+    def __init__(self, rule_mapping_context):
+        self.rule_mapping_context = rule_mapping_context
+
     def map_variable(self, expr, expn_state):
         name, tag = parse_tagged_name(expr)
-        if name not in self.old_subst_rules:
+        if name not in self.rule_mapping_context.old_subst_rules:
             return IdentityMapper.map_variable(self, expr, expn_state)
         else:
             return self.map_substitution(name, tag, (), expn_state)
@@ -441,8 +558,8 @@ class ExpandingIdentityMapper(IdentityMapper):
 
         name, tag = parse_tagged_name(expr.function)
 
-        if name not in self.old_subst_rules:
-            return IdentityMapper.map_call(self, expr, expn_state)
+        if name not in self.rule_mapping_context.old_subst_rules:
+            return super(RuleAwareIdentityMapper, self).map_call(expr, expn_state)
         else:
             return self.map_substitution(name, tag, expr.parameters, expn_state)
 
@@ -459,7 +576,7 @@ class ExpandingIdentityMapper(IdentityMapper):
                 for formal_arg_name, arg_value in zip(arg_names, arguments))
 
     def map_substitution(self, name, tag, arguments, expn_state):
-        rule = self.old_subst_rules[name]
+        rule = self.rule_mapping_context.old_subst_rules[name]
 
         rec_arguments = self.rec(arguments, expn_state)
 
@@ -475,7 +592,8 @@ class ExpandingIdentityMapper(IdentityMapper):
 
         result = self.rec(rule.expression, new_expn_state)
 
-        new_name = self.register_subst_rule(name, rule.arguments, result)
+        new_name = self.rule_mapping_context.register_subst_rule(
+                name, rule.arguments, result)
 
         if tag is None:
             sym = Variable(new_name)
@@ -496,60 +614,6 @@ class ExpandingIdentityMapper(IdentityMapper):
         return IdentityMapper.__call__(self, expr, ExpansionState(
             stack=stack, arg_context={}))
 
-    def _get_new_substitutions_and_renames(self):
-        """This makes a new dictionary of substitutions from the ones
-        encountered in mapping all the encountered expressions.
-        It tries hard to keep substitution names the same--i.e.
-        if all derivative versions of a substitution rule ended
-        up with the same mapped version, then this version should
-        retain the name that the substitution rule had previously.
-        Unfortunately, this can't be done in a single pass, and so
-        the routine returns an additional dictionary *subst_renames*
-        of renamings to be performed on the processed expressions.
-
-        The returned substitutions already have the rename applied
-        to them.
-
-        :returns: (new_substitutions, subst_renames)
-        """
-
-        from loopy.kernel.data import SubstitutionRule
-
-        orig_name_histogram = {}
-        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
-            if self.subst_rule_use_count.get(key, 0):
-                orig_name_histogram[orig_name] = \
-                        orig_name_histogram.get(orig_name, 0) + 1
-
-        result = {}
-        renames = {}
-
-        for key, (name, orig_name) in six.iteritems(self.subst_rule_registry):
-            args, body = key
-
-            if self.subst_rule_use_count.get(key, 0):
-                if orig_name_histogram[orig_name] == 1 and name != orig_name:
-                    renames[name] = orig_name
-                    name = orig_name
-
-                result[name] = SubstitutionRule(
-                        name=name,
-                        arguments=args,
-                        expression=body)
-
-        # {{{ perform renames on new substitutions
-
-        subst_renamer = SubstitutionRuleRenamer(renames)
-
-        renamed_result = {}
-        for name, rule in six.iteritems(result):
-            renamed_result[name] = rule.copy(
-                    expression=subst_renamer(rule.expression))
-
-        # }}}
-
-        return renamed_result, renames
-
     def map_instruction(self, insn):
         return insn
 
@@ -558,49 +622,40 @@ class ExpandingIdentityMapper(IdentityMapper):
                 # While subst rules are not allowed in assignees, the mapper
                 # may perform tasks entirely unrelated to subst rules, so
                 # we must map assignees, too.
-
-                insn.with_transformed_expressions(self, insn.id, insn.tags)
+                self.map_instruction(
+                    insn.with_transformed_expressions(self, insn.id, insn.tags))
                 for insn in kernel.instructions]
 
-        new_substs, renames = self._get_new_substitutions_and_renames()
-
-        new_insns = [self.map_instruction(insn)
-                for insn in rename_subst_rules_in_instructions(
-                    new_insns, renames)]
+        return kernel.copy(instructions=new_insns)
 
-        return kernel.copy(
-            substitutions=new_substs,
-            instructions=new_insns)
 
-
-class ExpandingSubstitutionMapper(ExpandingIdentityMapper):
-    def __init__(self, rules, make_unique_var_name, subst_func, within):
-        ExpandingIdentityMapper.__init__(self, rules, make_unique_var_name)
+class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, subst_func, within):
+        super(RuleAwareSubstitutionMapper, self).__init__(rule_mapping_context)
 
         self.subst_func = subst_func
         self.within = within
 
     def map_variable(self, expr, expn_state):
+        if (expr.name in expn_state.arg_context
+                or not self.within(expn_state.stack)):
+            return super(RuleAwareSubstitutionMapper, self).map_variable(
+                    expr, expn_state)
+
         result = self.subst_func(expr)
-        if result is not None or not self.within(expn_state.stack):
+        if result is not None:
             return result
         else:
-            return ExpandingIdentityMapper.map_variable(self, expr, expn_state)
+            return super(RuleAwareSubstitutionMapper, self).map_variable(
+                    expr, expn_state)
 
-# }}}
-
-
-# {{{ substitution rule expander
 
-class SubstitutionRuleExpander(ExpandingIdentityMapper):
-    def __init__(self, rules, make_unique_var=None, ctx_match=None):
-        ExpandingIdentityMapper.__init__(self, rules, make_unique_var)
+class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, rules, within):
+        super(RuleAwareSubstitutionRuleExpander, self).__init__(rule_mapping_context)
 
-        if ctx_match is None:
-            from loopy.context_matching import AllStackMatch
-            ctx_match = AllStackMatch()
-
-        self.ctx_match = ctx_match
+        self.rules = rules
+        self.within = within
 
     def map_substitution(self, name, tag, arguments, expn_state):
         if tag is None:
@@ -610,9 +665,9 @@ class SubstitutionRuleExpander(ExpandingIdentityMapper):
 
         new_stack = expn_state.stack + ((name, tags),)
 
-        if self.ctx_match(new_stack):
+        if self.within(new_stack):
             # expand
-            rule = self.old_subst_rules[name]
+            rule = self.rules[name]
 
             new_expn_state = expn_state.copy(
                     stack=new_stack,
@@ -630,8 +685,8 @@ class SubstitutionRuleExpander(ExpandingIdentityMapper):
 
         else:
             # do not expand
-            return ExpandingIdentityMapper.map_substitution(
-                    self, name, tag, arguments, expn_state)
+            return super(RuleAwareSubstitutionRuleExpander, self).map_substitution(
+                    name, tag, arguments, expn_state)
 
 # }}}
 
diff --git a/loopy/tools.py b/loopy/tools.py
index 29005cfbb6aca798d6b0f51c3dabf9a2b75e53a4..0c8898bb1ce20a732dbd0d64a679e18c98609edb 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -68,7 +68,7 @@ class LoopyKeyBuilder(KeyBuilderBase):
         for dict_key in sorted(six.iterkeys(key)):
             self.rec(key_hash, (dict_key, key[dict_key]))
 
-    def update_for_BasicSet(self, key_hash, key):
+    def update_for_BasicSet(self, key_hash, key):  # noqa
         from islpy import Printer
         prn = Printer.to_str(key.get_ctx())
         getattr(prn, "print_"+key._base_name)(key)
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 33826e4063190f7d7d48b1bfcb4122931868bd1e..d51411a6e65ba750ddf8cfc2ee844c4bc167ab82 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -130,7 +130,7 @@ def test_temporary_to_subst(ctx_factory):
           integer n
 
           do i = 1, n
-            a = inp(n)
+            a = inp(i)
             out(i) = 5*a
             out2(i) = 6*a
           end do
@@ -142,7 +142,7 @@ def test_temporary_to_subst(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.temporary_to_subst(knl, "a")
+    knl = lp.temporary_to_subst(knl, "a", "i")
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
@@ -273,26 +273,25 @@ def test_tagged(ctx_factory):
     assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2
 
 
-def test_matmul(ctx_factory):
+@pytest.mark.parametrize("buffer_inames", [
+    "",
+    "i_inner,j_inner",
+    ])
+def test_matmul(ctx_factory, buffer_inames):
     fortran_src = """
         subroutine dgemm(m,n,l,a,b,c)
           implicit none
-          real*8 temp, a(m,l),b(l,n),c(m,n)
+          real*8 a(m,l),b(l,n),c(m,n)
           integer m,n,k,i,j,l
 
           do j = 1,n
             do i = 1,m
-              temp = 0
               do k = 1,l
-                temp = temp + b(k,j)*a(i,k)
+                c(i,j) = c(i,j) + b(k,j)*a(i,k)
               end do
-              c(i,j) = temp
             end do
           end do
         end subroutine
-
-        !$loopy begin transform
-        !$loopy end transform
         """
 
     from loopy.frontend.fortran import f2loopy
@@ -307,14 +306,71 @@ def test_matmul(ctx_factory):
     knl = lp.split_iname(knl, "j", 8,
             outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 32)
+    knl = lp.assume(knl, "n mod 32 = 0")
+    knl = lp.assume(knl, "m mod 32 = 0")
+    knl = lp.assume(knl, "l mod 16 = 0")
 
     knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
     knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
     knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
     knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")
 
+    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
+            init_expression="0", store_expression="base+buffer")
+
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+
+    # # FIXME: Make r/w tests possible, reactivate the above
+    # knl = lp.preprocess_kernel(knl)
+    # for k in lp.generate_loop_schedules(knl):
+    #     code, _ = lp.generate_code(k)
+    #     print(code)
+
+
+@pytest.mark.xfail
+def test_batched_sparse():
+    fortran_src = """
+        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
+          implicit none
+
+          integer rowstarts(m+1), colindices(nvals)
+          real*8 values(nvals)
+          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)
+
+          integer m, n, rowstart, rowend, length, nvals, nvecs
+
+          do i = 1, m
+            rowstart = rowstarts(i)
+            rowend = rowstarts(i+1)
+            length = rowend - rowstart
+
+            do k = 1, nvecs
+              rowsum(k) = 0
+            enddo
+            do k = 1, nvecs
+              do j = 1, length
+                rowsum(k) = rowsum(k) + &
+                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
+              end do
+            end do
+            do k = 1, nvecs
+              y(i,k) = rowsum(k)
+            end do
+          end do
+        end
+
+        """
+
+    from loopy.frontend.fortran import f2loopy
+    knl, = f2loopy(fortran_src)
+
+    knl = lp.split_iname(knl, "i", 128)
+    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
+    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
+    knl = lp.add_prefetch(knl, "values")
+    knl = lp.add_prefetch(knl, "colindices")
+    knl = lp.fix_parameters(knl, nvecs=4)
 
 
 if __name__ == "__main__":
diff --git a/test/test_linalg.py b/test/test_linalg.py
index d1a995b19fe22c818f03cadc7131e7732d3e1d58..9dbedf320eec358d29245e35ab009ded910f1b5c 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -615,8 +615,8 @@ def test_small_batched_matvec(ctx_factory):
 
     order = "C"
 
-    K = 9997
-    Np = 36
+    K = 9997  # noqa
+    Np = 36  # noqa
 
     knl = lp.make_kernel(
             "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6a59eb7388afedcc65bb6aa3d42f57b2e7152ac3..f99c0b3b3638d024530c339e4340e27d71043c4e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1816,6 +1816,69 @@ def test_affine_map_inames():
     print(knl)
 
 
+def test_precompute_confusing_subst_arguments(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<5}",
+        """
+        D(i):=a[i+1]-a[i]
+        b[i,j] = D(j)
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.tag_inames(knl, dict(j="g.1"))
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    from loopy.symbolic import get_dependencies
+    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
+    knl = lp.precompute(knl, "D")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=12345))
+
+
+def test_precompute_nested_subst(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<5}",
+        """
+        E:=a[i]
+        D:=E*E
+        b[i] = D
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.tag_inames(knl, dict(j="g.1"))
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    from loopy.symbolic import get_dependencies
+    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
+    knl = lp.precompute(knl, "D", "i_inner")
+
+    # There's only one surviving 'E' rule.
+    assert len([
+        rule_name
+        for rule_name in knl.substitutions
+        if rule_name.startswith("E")]) == 1
+
+    # That rule should use the newly created prefetch inames,
+    # not the prior 'i_inner'
+    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=12345))
+
+
 def test_poisson(ctx_factory):
     # Stolen from Peter Coogan and Rob Kirby for FEM assembly
     ctx = ctx_factory()
@@ -1873,6 +1936,28 @@ def test_poisson(ctx_factory):
                 parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
 
 
+def test_auto_test_can_detect_problems(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i,j<n}",
+        """
+        a[i,j] = 25
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.link_inames(knl, "i,j", "i0")
+
+    from loopy.diagnostic import AutomaticTestFailure
+    with pytest.raises(AutomaticTestFailure):
+        lp.auto_test_vs_ref(
+                ref_knl, ctx, knl,
+                parameters=dict(n=123))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])