diff --git a/TODO b/TODO
new file mode 100644
index 0000000000000000000000000000000000000000..9f0bf2ecf68f28a330012ff1c77c9931f81144ef
--- /dev/null
+++ b/TODO
@@ -0,0 +1,4 @@
+FORTRAN:
+do/continue
+case sensitivity
+
diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy
new file mode 100644
index 0000000000000000000000000000000000000000..ea85b0c6b198c8bac6270098696504fc9006fd14
--- /dev/null
+++ b/examples/fortran/matmul.floopy
@@ -0,0 +1,26 @@
+subroutine dgemm(m,n,l,alpha,a,b,c)
+  implicit none
+  real*8 temp, a(m,l),b(l,n),c(m,n), alpha
+  integer m,n,k,i,j,l
+
+  do j = 1,n
+    do k = 1,l
+      do i = 1,m
+        c(i,j) = c(i,j) + alpha*b(k,j)*a(i,k)
+      end do
+    end do
+  end do
+end subroutine
+
+!$loopy begin transform
+! dgemm = lp.split_iname(dgemm, "i", 16,
+!         outer_tag="g.0", inner_tag="l.1")
+! dgemm = lp.split_iname(dgemm, "j", 8,
+!         outer_tag="g.1", inner_tag="l.0")
+! dgemm = lp.split_iname(dgemm, "k", 32)
+!
+! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2")
+! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
+! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner")
+! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner")
+!$loopy end transform
diff --git a/loopy/array_buffer.py b/loopy/array_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c
--- /dev/null
+++ b/loopy/array_buffer.py
@@ -0,0 +1,408 @@
+from __future__ import division, absolute_import
+from six.moves import range, zip
+
+__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import islpy as isl
+from islpy import dim_type
+from loopy.symbolic import (get_dependencies, SubstitutionMapper)
+from pymbolic.mapper.substitutor import make_subst_func
+
+from pytools import Record
+from pymbolic import var
+
+
+class AccessDescriptor(Record):
+    """
+    .. attribute:: identifier
+
+        An identifier under user control, used to connect this access descriptor
+        to the access that generated it. Any Python value.
+    """
+
+    __slots__ = [
+            "identifier",
+            "expands_footprint",
+            "storage_axis_exprs",
+            ]
+
+
+def to_parameters_or_project_out(param_inames, set_inames, set):
+    for iname in list(set.get_space().get_var_dict().keys()):
+        if iname in param_inames:
+            dt, idx = set.get_space().get_var_dict()[iname]
+            set = set.move_dims(
+                    dim_type.param, set.dim(dim_type.param),
+                    dt, idx, 1)
+        elif iname in set_inames:
+            pass
+        else:
+            dt, idx = set.get_space().get_var_dict()[iname]
+            set = set.project_out(dt, idx, 1)
+
+    return set
+
+
+# {{{ construct storage->sweep map
+
+def build_per_access_storage_to_domain_map(accdesc, domain,
+        storage_axis_names,
+        prime_sweep_inames):
+
+    map_space = domain.space
+    stor_dim = len(storage_axis_names)
+    rn = map_space.dim(dim_type.out)
+
+    map_space = map_space.add_dims(dim_type.in_, stor_dim)
+    for i, saxis in enumerate(storage_axis_names):
+        # arg names are initially primed, to be replaced with unprimed
+        # base-0 versions below
+
+        map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'")
+
+    # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn)
+
+    set_space = map_space.move_dims(
+            dim_type.out, rn,
+            dim_type.in_, 0, stor_dim).range()
+
+    # set_space: [domain](dup_sweep_index)[dup_sweep](rn)[stor_axes']
+
+    stor2sweep = None
+
+    from loopy.symbolic import aff_from_expr
+
+    for saxis, sa_expr in zip(storage_axis_names, accdesc.storage_axis_exprs):
+        cns = isl.Constraint.equality_from_aff(
+                aff_from_expr(set_space,
+                    var(saxis+"'") - prime_sweep_inames(sa_expr)))
+
+        cns_map = isl.BasicMap.from_constraint(cns)
+        if stor2sweep is None:
+            stor2sweep = cns_map
+        else:
+            stor2sweep = stor2sweep.intersect(cns_map)
+
+    if stor2sweep is not None:
+        stor2sweep = stor2sweep.move_dims(
+                dim_type.in_, 0,
+                dim_type.out, rn, stor_dim)
+
+    # stor2sweep is back in map_space
+    return stor2sweep
+
+
+def move_to_par_from_out(s2smap, except_inames):
+    while True:
+        var_dict = s2smap.get_var_dict(dim_type.out)
+        todo_inames = set(var_dict) - except_inames
+        if todo_inames:
+            iname = todo_inames.pop()
+
+            _, dim_idx = var_dict[iname]
+            s2smap = s2smap.move_dims(
+                    dim_type.param, s2smap.dim(dim_type.param),
+                    dim_type.out, dim_idx, 1)
+        else:
+            return s2smap
+
+
+def build_global_storage_to_sweep_map(kernel, access_descriptors,
+        domain_dup_sweep, dup_sweep_index,
+        storage_axis_names,
+        sweep_inames, primed_sweep_inames, prime_sweep_inames):
+    # The storage map goes from storage axes to the domain.
+    # The first len(arg_names) storage dimensions are the rule's arguments.
+
+    global_stor2sweep = None
+
+    # build footprint
+    for accdesc in access_descriptors:
+        if accdesc.expands_footprint:
+            stor2sweep = build_per_access_storage_to_domain_map(
+                    accdesc, domain_dup_sweep,
+                    storage_axis_names,
+                    prime_sweep_inames)
+
+            if global_stor2sweep is None:
+                global_stor2sweep = stor2sweep
+            else:
+                global_stor2sweep = global_stor2sweep.union(stor2sweep)
+
+    if isinstance(global_stor2sweep, isl.BasicMap):
+        global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep)
+    global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep)
+
+    # space for global_stor2sweep:
+    # [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn)
+
+    return global_stor2sweep
+
+# }}}
+
+
+# {{{ compute storage bounds
+
+def find_var_base_indices_and_shape_from_inames(
+        domain, inames, cache_manager, context=None):
+    base_indices_and_sizes = [
+            cache_manager.base_index_and_length(domain, iname, context)
+            for iname in inames]
+    return list(zip(*base_indices_and_sizes))
+
+
+def compute_bounds(kernel, domain, stor2sweep,
+        primed_sweep_inames, storage_axis_names):
+
+    bounds_footprint_map = move_to_par_from_out(
+            stor2sweep, except_inames=frozenset(primed_sweep_inames))
+
+    # compute bounds for each storage axis
+    storage_domain = bounds_footprint_map.domain().coalesce()
+
+    if not storage_domain.is_bounded():
+        raise RuntimeError("sweep did not result in a bounded storage domain")
+
+    return find_var_base_indices_and_shape_from_inames(
+            storage_domain, [saxis+"'" for saxis in storage_axis_names],
+            kernel.cache_manager, context=kernel.assumptions)
+
+# }}}
+
+
+# {{{ array-to-buffer map
+
+class ArrayToBufferMap(object):
+    def __init__(self, kernel, domain, sweep_inames, access_descriptors,
+            storage_axis_count):
+        self.kernel = kernel
+        self.sweep_inames = sweep_inames
+
+        storage_axis_names = self.storage_axis_names = [
+                "_loopy_storage_%d" % i for i in range(storage_axis_count)]
+
+        # {{{ duplicate sweep inames
+
+        # The duplication is necessary, otherwise the storage fetch
+        # inames remain weirdly tied to the original sweep inames.
+
+        self.primed_sweep_inames = [psin+"'" for psin in sweep_inames]
+
+        from loopy.isl_helpers import duplicate_axes
+        dup_sweep_index = domain.space.dim(dim_type.out)
+        domain_dup_sweep = duplicate_axes(
+                domain, sweep_inames,
+                self.primed_sweep_inames)
+
+        self.prime_sweep_inames = SubstitutionMapper(make_subst_func(
+            dict((sin, var(psin))
+                for sin, psin in zip(sweep_inames, self.primed_sweep_inames))))
+
+        # }}}
+
+        self.stor2sweep = build_global_storage_to_sweep_map(
+                kernel, access_descriptors,
+                domain_dup_sweep, dup_sweep_index,
+                storage_axis_names,
+                sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames)
+
+        storage_base_indices, storage_shape = compute_bounds(
+                kernel, domain, self.stor2sweep, self.primed_sweep_inames,
+                storage_axis_names)
+
+        # compute augmented domain
+
+        # {{{ filter out unit-length dimensions
+
+        non1_storage_axis_flags = []
+        non1_storage_shape = []
+
+        for saxis, bi, l in zip(
+                storage_axis_names, storage_base_indices, storage_shape):
+            has_length_non1 = l != 1
+
+            non1_storage_axis_flags.append(has_length_non1)
+
+            if has_length_non1:
+                non1_storage_shape.append(l)
+
+        # }}}
+
+        # {{{ subtract off the base indices
+        # add the new, base-0 indices as new in dimensions
+
+        sp = self.stor2sweep.get_space()
+        stor_idx = sp.dim(dim_type.out)
+
+        n_stor = storage_axis_count
+        nn1_stor = len(non1_storage_shape)
+
+        aug_domain = self.stor2sweep.move_dims(
+                dim_type.out, stor_idx,
+                dim_type.in_, 0,
+                n_stor).range()
+
+        # aug_domain space now:
+        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes']
+
+        aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor)
+
+        inew = 0
+        for i, name in enumerate(storage_axis_names):
+            if non1_storage_axis_flags[i]:
+                aug_domain = aug_domain.set_dim_name(
+                        dim_type.set, stor_idx + inew, name)
+                inew += 1
+
+        # aug_domain space now:
+        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes]
+
+        from loopy.symbolic import aff_from_expr
+        for saxis, bi, s in zip(storage_axis_names, storage_base_indices,
+                storage_shape):
+            if s != 1:
+                cns = isl.Constraint.equality_from_aff(
+                        aff_from_expr(aug_domain.get_space(),
+                            var(saxis) - (var(saxis+"'") - bi)))
+
+                aug_domain = aug_domain.add_constraint(cns)
+
+        # }}}
+
+        # eliminate (primed) storage axes with non-zero base indices
+        aug_domain = aug_domain.project_out(dim_type.set, stor_idx+nn1_stor, n_stor)
+
+        # eliminate duplicated sweep_inames
+        nsweep = len(sweep_inames)
+        aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index, nsweep)
+
+        self.non1_storage_axis_flags = non1_storage_axis_flags
+        self.aug_domain = aug_domain
+        self.storage_base_indices = storage_base_indices
+        self.non1_storage_shape = non1_storage_shape
+
+    def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
+            boxify_sweep=False):
+
+        renamed_aug_domain = self.aug_domain
+        first_storage_index = (
+                renamed_aug_domain.dim(dim_type.set)
+                - len(self.non1_storage_shape))
+
+        inon1 = 0
+        for i, old_name in enumerate(self.storage_axis_names):
+            if not self.non1_storage_axis_flags[i]:
+                continue
+
+            new_name = new_non1_storage_axis_names[inon1]
+
+            assert (
+                    renamed_aug_domain.get_dim_name(
+                        dim_type.set, first_storage_index+inon1)
+                    == old_name)
+            renamed_aug_domain = renamed_aug_domain.set_dim_name(
+                    dim_type.set, first_storage_index+inon1, new_name)
+
+            inon1 += 1
+
+        domain, renamed_aug_domain = isl.align_two(domain, renamed_aug_domain)
+        domain = domain & renamed_aug_domain
+
+        from loopy.isl_helpers import convexify, boxify
+        if boxify_sweep:
+            return boxify(self.kernel.cache_manager, domain,
+                    new_non1_storage_axis_names, self.kernel.assumptions)
+        else:
+            return convexify(domain)
+
+    def is_access_descriptor_in_footprint(self, accdesc):
+        if accdesc.expands_footprint:
+            return True
+
+        # Make all inames, except the (primed) sweep inames, into parameters.
+        # (The footprint may depend on those.) Only the sweep inames remain 'out' dims.
+
+        global_s2s_par_dom = move_to_par_from_out(
+                self.stor2sweep,
+                except_inames=frozenset(self.primed_sweep_inames)).domain()
+
+        arg_inames = (
+                set(global_s2s_par_dom.get_var_names(dim_type.param))
+                & self.kernel.all_inames())
+
+        for arg in accdesc.args:
+            arg_inames.update(get_dependencies(arg))
+        arg_inames = frozenset(arg_inames)
+
+        from loopy.kernel import CannotBranchDomainTree
+        try:
+            usage_domain = self.kernel.get_inames_domain(arg_inames)
+        except CannotBranchDomainTree:
+            return False
+
+        for i in range(usage_domain.dim(dim_type.set)):
+            iname = usage_domain.get_dim_name(dim_type.set, i)
+            if iname in self.sweep_inames:
+                usage_domain = usage_domain.set_dim_name(
+                        dim_type.set, i, iname+"'")
+
+        stor2sweep = build_per_access_storage_to_domain_map(accdesc,
+                usage_domain, self.storage_axis_names,
+                self.prime_sweep_inames)
+
+        if isinstance(stor2sweep, isl.BasicMap):
+            stor2sweep = isl.Map.from_basic_map(stor2sweep)
+
+        stor2sweep = stor2sweep.intersect_range(usage_domain)
+
+        stor2sweep = move_to_par_from_out(stor2sweep,
+                except_inames=frozenset(self.primed_sweep_inames))
+
+        s2s_domain = stor2sweep.domain()
+        s2s_domain, aligned_g_s2s_parm_dom = isl.align_two(
+                s2s_domain, global_s2s_par_dom)
+
+        arg_restrictions = (
+                aligned_g_s2s_parm_dom
+                .eliminate(dim_type.set, 0,
+                    aligned_g_s2s_parm_dom.dim(dim_type.set))
+                .remove_divs())
+
+        return (arg_restrictions & s2s_domain).is_subset(
+                aligned_g_s2s_parm_dom)
+
+
+class NoOpArrayToBufferMap(object):
+    non1_storage_axis_names = ()
+    storage_base_indices = ()
+    non1_storage_shape = ()
+
+    def is_access_descriptor_in_footprint(self, accdesc):
+        # no index dependencies--every reference to the subst rule
+        # is necessarily in the footprint.
+
+        return True
+
+# }}}
+
+# vim: foldmethod=marker
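
A minimal sketch of what the to_parameters_or_project_out helper above does to
an isl set; it assumes only that islpy and the new loopy.array_buffer module
are importable, and the set and iname names are made up for illustration:

    import islpy as isl
    from islpy import dim_type
    from loopy.array_buffer import to_parameters_or_project_out

    s = isl.BasicSet(
            "[n] -> { [i, j, k] : 0 <= i < n and 0 <= j < n and 0 <= k < n }")

    # "j" is turned into a parameter, "i" stays a set dimension,
    # and "k" is projected out.
    s2 = to_parameters_or_project_out(
            param_inames=["j"], set_inames=["i"], set=s)

    assert "j" in s2.get_var_dict(dim_type.param)
    assert s2.dim(dim_type.set) == 1
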
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 44426c3519e03c9f6cbfe5990324d824c43132b7..3927293ae82dc521327a8c17afae5e93be1e3aa9 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -195,11 +195,11 @@ def build_loop_nest(kernel, sched_index, codegen_state):
     sched_index_info_entries = [
             ScheduleIndexInfo(
                 schedule_index=i,
-                admissible_cond_inames=
-                get_admissible_conditional_inames_for(kernel, i),
+                admissible_cond_inames=(
+                    get_admissible_conditional_inames_for(kernel, i)),
                 required_predicates=get_required_predicates(kernel, i)
                 )
-        for i in my_sched_indices]
+            for i in my_sched_indices]
 
     # }}}
 
@@ -279,11 +279,11 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             current_iname_set = (
                     current_iname_set
                     & sched_index_info_entries[candidate_group_length-1]
-                        .admissible_cond_inames)
+                    .admissible_cond_inames)
             current_pred_set = (
                     current_pred_set
                     & sched_index_info_entries[candidate_group_length-1]
-                        .required_predicates)
+                    .required_predicates)
 
             # {{{ see which inames are actually used in group
 
@@ -376,6 +376,4 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             build_insn_group(sched_index_info_entries, codegen_state))
 
 
-
-
 # vim: foldmethod=marker
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index ca1cce69c4cf1f87233c35e3dfc7987940f132fa..1bd977f3ea2e8e15e63424e89b1e347063da70e2 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -36,7 +36,6 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st
     bounds_checks = get_bounds_checks(
             domain, check_inames,
             codegen_state.implemented_domain, overapproximate=False)
-
     bounds_check_set = isl.Set.universe(domain.get_space()) \
             .add_constraints(bounds_checks)
     bounds_check_set, new_implemented_domain = isl.align_two(
@@ -70,6 +69,7 @@ def generate_instruction_code(kernel, insn, codegen_state):
         raise RuntimeError("unexpected instruction type")
 
     insn_inames = kernel.insn_inames(insn)
+
     insn_code, impl_domain = wrap_in_conditionals(
             codegen_state,
             kernel.get_inames_domain(insn_inames), insn_inames,
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 783dc32313883f092f15baacccf3e68d75af3b93..a6b5b422bcaa413033ecce4808be581139394876 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -221,6 +221,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         self.isl_context = isl.Context()
 
         self.insn_id_counter = 0
+        self.condition_id_counter = 0
 
         self.kernels = []
 
@@ -229,9 +230,34 @@ class F2LoopyTranslator(FTreeWalkerBase):
         self.in_transform_code = False
 
         self.instruction_tags = []
+        self.conditions = []
 
         self.transform_code_lines = []
 
+    def add_expression_instruction(self, lhs, rhs):
+        scope = self.scope_stack[-1]
+
+        new_id = "insn%d" % self.insn_id_counter
+        self.insn_id_counter += 1
+
+        if scope.previous_instruction_id:
+            insn_deps = frozenset([scope.previous_instruction_id])
+        else:
+            insn_deps = frozenset()
+
+        from loopy.kernel.data import ExpressionInstruction
+        insn = ExpressionInstruction(
+                lhs, rhs,
+                forced_iname_deps=frozenset(
+                    scope.active_loopy_inames),
+                insn_deps=insn_deps,
+                id=new_id,
+                predicates=frozenset(self.conditions),
+                tags=tuple(self.instruction_tags))
+
+        scope.previous_instruction_id = new_id
+        scope.instructions.append(insn)
+
     # {{{ map_XXX functions
 
     def map_BeginSource(self, node):
@@ -385,28 +411,9 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         scope.use_name(lhs_name)
 
-        from loopy.kernel.data import ExpressionInstruction
-
         rhs = scope.process_expression_for_loopy(self.parse_expr(node.expr))
 
-        new_id = "insn%d" % self.insn_id_counter
-        self.insn_id_counter += 1
-
-        if scope.previous_instruction_id:
-            insn_deps = frozenset([scope.previous_instruction_id])
-        else:
-            insn_deps = frozenset()
-
-        insn = ExpressionInstruction(
-                lhs, rhs,
-                forced_iname_deps=frozenset(
-                    scope.active_loopy_inames),
-                insn_deps=insn_deps,
-                id=new_id,
-                tags=tuple(self.instruction_tags))
-
-        scope.previous_instruction_id = new_id
-        scope.instructions.append(insn)
+        self.add_expression_instruction(lhs, rhs)
 
     def map_Allocate(self, node):
         raise NotImplementedError("allocate")
@@ -448,10 +455,31 @@ class F2LoopyTranslator(FTreeWalkerBase):
         # node.content[0]
 
     def map_IfThen(self, node):
-        raise NotImplementedError("if-then")
+        scope = self.scope_stack[-1]
+
+        cond_name = "loopy_cond%d" % self.condition_id_counter
+        self.condition_id_counter += 1
+        assert cond_name not in scope.type_map
+
+        scope.type_map[cond_name] = np.int32
+
+        from pymbolic import var
+        cond_var = var(cond_name)
+
+        self.add_expression_instruction(
+                cond_var, self.parse_expr(node.expr))
+
+        self.conditions.append(cond_name)
+
+        for c in node.content:
+            self.rec(c)
+
+    def map_Else(self, node):
+        cond_name = self.conditions.pop()
+        self.conditions.append("!" + cond_name)
 
     def map_EndIfThen(self, node):
-        return []
+        self.conditions.pop()
 
     def map_Do(self, node):
         scope = self.scope_stack[-1]
@@ -460,7 +488,8 @@ class F2LoopyTranslator(FTreeWalkerBase):
             loop_var, loop_bounds = node.loopcontrol.split("=")
             loop_var = loop_var.strip()
             scope.use_name(loop_var)
-            loop_bounds = [self.parse_expr(s) for s in loop_bounds.split(",")]
+            loop_bounds = self.parse_expr(
+                    loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS)
 
             if len(loop_bounds) == 2:
                 start, stop = loop_bounds
@@ -560,7 +589,8 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         begin_tag_match = self.begin_tag_re.match(stripped_comment_line)
         end_tag_match = self.end_tag_re.match(stripped_comment_line)
-        faulty_loopy_pragma_match = self.faulty_loopy_pragma.match(stripped_comment_line)
+        faulty_loopy_pragma_match = self.faulty_loopy_pragma.match(
+                stripped_comment_line)
 
         if stripped_comment_line == "$loopy begin transform":
             if self.in_transform_code:
@@ -653,6 +683,9 @@ class F2LoopyTranslator(FTreeWalkerBase):
                     default_order="F"
                     )
 
+            from loopy.loop import fuse_loop_domains
+            knl = fuse_loop_domains(knl)
+
             proc_dict[sub.subprogram_name] = lp.fold_constants(knl)
 
         transform_code = remove_common_indentation(
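
A minimal, self-contained sketch of the condition-stack bookkeeping that the
map_IfThen / map_Else / map_EndIfThen handlers above implement (the names are
hypothetical; no Fortran parsing is involved):

    conditions = []

    conditions.append("loopy_cond0")            # map_IfThen: push the condition
    then_predicates = frozenset(conditions)     # predicates of the "then" branch

    conditions.append("!" + conditions.pop())   # map_Else: negate the top condition
    else_predicates = frozenset(conditions)     # predicates of the "else" branch

    conditions.pop()                            # map_EndIfThen: leave the if block

    assert then_predicates == frozenset(["loopy_cond0"])
    assert else_predicates == frozenset(["!loopy_cond0"])
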
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index fe25435b79d73d89f84f3a7dc1e5e9a474ef8b6a..4291d98749cb84bf743c3e8c745052190000cd03 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -81,8 +81,8 @@ class FTreeWalkerBase(object):
 
     # {{{ expressions
 
-    def parse_expr(self, expr_str):
-        return self.expr_parser(expr_str)
+    def parse_expr(self, expr_str, **kwargs):
+        return self.expr_parser(expr_str, **kwargs)
 
     # }}}
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d3089fc6a6baad741c1cfc2d1843aa541192cfb3..68f7f131c4c40d2f647fb6c6095959b4884a3c41 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -392,10 +392,9 @@ class LoopKernel(RecordWithoutPickling):
         iname_set_stack = []
         result = []
 
-        writer_map = self.writer_map()
+        from loopy.kernel.tools import is_domain_dependent_on_inames
 
-        for dom in self.domains:
-            parameters = set(dom.get_var_names(dim_type.param))
+        for dom_idx, dom in enumerate(self.domains):
             inames = set(dom.get_var_names(dim_type.set))
 
             # This next domain may be nested inside the previous domain.
@@ -405,37 +404,11 @@ class LoopKernel(RecordWithoutPickling):
 
             discard_level_count = 0
             while discard_level_count < len(iname_set_stack):
-                # {{{ check for parenthood by loop bound iname
-
                 last_inames = iname_set_stack[-1-discard_level_count]
-                if last_inames & parameters:
-                    break
-
-                # }}}
-
-                # {{{ check for parenthood by written variable
 
-                is_parent_by_variable = False
-                for par in parameters:
-                    if par in self.temporary_variables:
-                        writer_insns = writer_map[par]
-
-                        if len(writer_insns) > 1:
-                            raise RuntimeError("loop bound '%s' "
-                                    "may only be written to once" % par)
-
-                        writer_insn, = writer_insns
-                        writer_inames = self.insn_inames(writer_insn)
-
-                        if writer_inames & last_inames:
-                            is_parent_by_variable = True
-                            break
-
-                if is_parent_by_variable:
+                if is_domain_dependent_on_inames(self, dom_idx, last_inames):
                     break
 
-                # }}}
-
                 discard_level_count += 1
 
             if discard_level_count:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index ba405c15af18321cd8cae8fbd41816c6b9b14927..871f5359b925703a8e71729af7585e51d051f7ca 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -431,8 +431,10 @@ class InstructionBase(Record):
 
     .. attribute:: predicates
 
-        a :class:`frozenset` of variable names whose truth values (as defined
-        by C) determine whether this instruction should be run
+        a :class:`frozenset` of variable names whose truth values (as defined
+        by C), combined by conjunction (logical and), determine whether this
+        instruction should be run. Each variable name may optionally be
+        preceded by an exclamation point, indicating negation.
 
     .. attribute:: forced_iname_deps_is_final
 
@@ -678,7 +680,10 @@ class ExpressionInstruction(InstructionBase):
         for _, subscript in self.assignees_and_indices():
             result = result | get_dependencies(subscript)
 
-        result = result | self.predicates
+        processed_predicates = frozenset(
+                pred.lstrip("!") for pred in self.predicates)
+
+        result = result | processed_predicates
 
         return result
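
A small sketch of the predicate convention documented above, including the
"!"-stripping done in the dependency-gathering change; the condition names are
made up:

    predicates = frozenset(["loopy_cond0", "!loopy_cond1"])
    # The instruction runs only if loopy_cond0 is true and loopy_cond1 is false.

    # Dependency tracking strips the negation marker, as in the change above:
    dep_names = frozenset(pred.lstrip("!") for pred in predicates)
    assert dep_names == frozenset(["loopy_cond0", "loopy_cond1"])
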
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index bd1fcb449320b5bab98b5d69cd0239621a61076a..2973cd9e69e46c875e15e6c8674150373be8be1e 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -539,4 +539,41 @@ class DomainParameterFinder(object):
 
 # }}}
 
+
+# {{{ is domain dependent on inames
+
+def is_domain_dependent_on_inames(kernel, domain_index, inames):
+    dom = kernel.domains[domain_index]
+    dom_parameters = set(dom.get_var_names(dim_type.param))
+
+    # {{{ check for parenthood by loop bound iname
+
+    if inames & dom_parameters:
+        return True
+
+    # }}}
+
+    # {{{ check for parenthood by written variable
+
+    for par in dom_parameters:
+        if par in kernel.temporary_variables:
+            writer_insns = kernel.writer_map()[par]
+
+            if len(writer_insns) > 1:
+                raise RuntimeError("loop bound '%s' "
+                        "may only be written to once" % par)
+
+            writer_insn, = writer_insns
+            writer_inames = kernel.insn_inames(writer_insn)
+
+            if writer_inames & inames:
+                return True
+
+    # }}}
+
+    return False
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/loop.py b/loopy/loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..4592463822a2321745aaf48a316d16c98d4efca3
--- /dev/null
+++ b/loopy/loop.py
@@ -0,0 +1,122 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import islpy as isl
+import six
+
+
+def potential_loop_nest_map(kernel):
+    """Returns a dictionary mapping inames to other inames that *could*
+    be nested around them.
+
+    :seealso: :func:`loopy.schedule.loop_nest_map`
+    """
+
+    result = {}
+
+    all_inames = kernel.all_inames()
+    iname_to_insns = kernel.iname_to_insns()
+
+    # examine pairs of all inames--O(n**2), I know.
+    for inner_iname in all_inames:
+        inner_result = set()
+        for outer_iname in all_inames:
+            if inner_iname == outer_iname:
+                continue
+
+            if iname_to_insns[inner_iname] <= iname_to_insns[outer_iname]:
+                inner_result.add(outer_iname)
+
+        if inner_result:
+            result[inner_iname] = inner_result
+
+    return result
+
+
+def fuse_loop_domains(kernel):
+    from loopy.kernel.tools import is_domain_dependent_on_inames
+
+    while True:
+        lnm = potential_loop_nest_map(kernel)
+        parents_per_domain = kernel.parents_per_domain()
+        all_parents_per_domain = kernel.all_parents_per_domain()
+
+        new_domains = None
+
+        for inner_iname, outer_inames in six.iteritems(lnm):
+            for outer_iname in outer_inames:
+                # {{{ check if it's safe to fuse
+
+                inner_domain_idx = kernel.get_home_domain_index(inner_iname)
+                outer_domain_idx = kernel.get_home_domain_index(outer_iname)
+
+                if inner_domain_idx == outer_domain_idx:
+                    break
+
+                if (
+                        outer_domain_idx in all_parents_per_domain[inner_domain_idx]
+                        and not
+                        outer_domain_idx == parents_per_domain[inner_domain_idx]):
+                    # Outer domain is not a direct parent of the inner
+                    # domain. Unable to fuse.
+                    continue
+
+                outer_dom = kernel.domains[outer_domain_idx]
+                inner_dom = kernel.domains[inner_domain_idx]
+
+                outer_inames = set(outer_dom.get_var_names(isl.dim_type.set))
+                if is_domain_dependent_on_inames(kernel, inner_domain_idx,
+                        outer_inames):
+                    # Bounds of inner domain depend on outer domain.
+                    # Unable to fuse.
+                    continue
+
+                # }}}
+
+                new_domains = kernel.domains[:]
+                min_idx = min(inner_domain_idx, outer_domain_idx)
+                max_idx = max(inner_domain_idx, outer_domain_idx)
+
+                del new_domains[max_idx]
+                del new_domains[min_idx]
+
+                outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom)
+
+                new_domains.insert(min_idx, inner_dom & outer_dom)
+                break
+
+            if new_domains:
+                break
+
+        if not new_domains:
+            # Nothing was accomplished in the last loop trip, time to quit.
+            break
+
+        kernel = kernel.copy(domains=new_domains)
+
+    return kernel
+
+
+# vim: fdm=marker
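
A toy illustration of the nesting criterion used by potential_loop_nest_map
above (the iname-to-instruction mapping is made up; the real function takes a
loopy kernel and builds this mapping itself):

    iname_to_insns = {
        "i": {"insn0", "insn1"},
        "j": {"insn1"},
    }

    # "i" could be nested around "j": every instruction using "j" also uses "i".
    assert iname_to_insns["j"] <= iname_to_insns["i"]

    # The converse does not hold, so "j" could not be nested around "i".
    assert not (iname_to_insns["i"] <= iname_to_insns["j"])
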
diff --git a/loopy/precompute.py b/loopy/precompute.py
index 984e24d1f89b5ea036636513acee7636bbec58dd..de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -1,8 +1,6 @@
-from __future__ import division
-from __future__ import absolute_import
+from __future__ import division, absolute_import
 import six
-from six.moves import range
-from six.moves import zip
+from six.moves import range, zip
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -28,337 +26,35 @@ THE SOFTWARE.
 
 
 import islpy as isl
-from islpy import dim_type
 from loopy.symbolic import (get_dependencies, SubstitutionMapper,
         ExpandingIdentityMapper)
 from pymbolic.mapper.substitutor import make_subst_func
 import numpy as np
 
-from pytools import Record
 from pymbolic import var
 
+from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap,
+        AccessDescriptor)
 
-class InvocationDescriptor(Record):
-    __slots__ = [
-            "args",
-            "expands_footprint",
-            "is_in_footprint",
 
-            # Remember where the invocation happened, in terms of the expansion
-            # call stack.
-            "expansion_stack",
-            ]
+class RuleAccessDescriptor(AccessDescriptor):
+    __slots__ = ["args", "expansion_stack"]
 
 
-def to_parameters_or_project_out(param_inames, set_inames, set):
-    for iname in list(set.get_space().get_var_dict().keys()):
-        if iname in param_inames:
-            dt, idx = set.get_space().get_var_dict()[iname]
-            set = set.move_dims(
-                    dim_type.param, set.dim(dim_type.param),
-                    dt, idx, 1)
-        elif iname in set_inames:
-            pass
-        else:
-            dt, idx = set.get_space().get_var_dict()[iname]
-            set = set.project_out(dt, idx, 1)
-
-    return set
-
-
-# {{{ construct storage->sweep map
-
-def build_per_access_storage_to_domain_map(invdesc, domain,
-        storage_axis_names, storage_axis_sources,
-        prime_sweep_inames):
-
-    map_space = domain.get_space()
-    stor_dim = len(storage_axis_names)
-    rn = map_space.dim(dim_type.out)
-
-    map_space = map_space.add_dims(dim_type.in_, stor_dim)
-    for i, saxis in enumerate(storage_axis_names):
-        # arg names are initially primed, to be replaced with unprimed
-        # base-0 versions below
-
-        map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'")
+def access_descriptor_id(args, expansion_stack):
+    return (args, expansion_stack)
 
-    # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn)
 
-    set_space = map_space.move_dims(
-            dim_type.out, rn,
-            dim_type.in_, 0, stor_dim).range()
+def storage_axis_exprs(storage_axis_sources, args):
+    result = []
 
-    # set_space: [domain](dup_sweep_index)[dup_sweep](rn)[stor_axes']
-
-    stor2sweep = None
-
-    from loopy.symbolic import aff_from_expr
-
-    for saxis, saxis_source in zip(storage_axis_names, storage_axis_sources):
+    for saxis_source in storage_axis_sources:
         if isinstance(saxis_source, int):
-            # an argument
-            cns = isl.Constraint.equality_from_aff(
-                    aff_from_expr(set_space,
-                        var(saxis+"'")
-                        - prime_sweep_inames(invdesc.args[saxis_source])))
-        else:
-            # a 'bare' sweep iname
-            cns = isl.Constraint.equality_from_aff(
-                    aff_from_expr(set_space,
-                        var(saxis+"'")
-                        - prime_sweep_inames(var(saxis_source))))
-
-        cns_map = isl.BasicMap.from_constraint(cns)
-        if stor2sweep is None:
-            stor2sweep = cns_map
-        else:
-            stor2sweep = stor2sweep.intersect(cns_map)
-
-    if stor2sweep is not None:
-        stor2sweep = stor2sweep.move_dims(
-                dim_type.in_, 0,
-                dim_type.out, rn, stor_dim)
-
-    # stor2sweep is back in map_space
-    return stor2sweep
-
-
-def move_to_par_from_out(s2smap, except_inames):
-    while True:
-        var_dict = s2smap.get_var_dict(dim_type.out)
-        todo_inames = set(var_dict) - except_inames
-        if todo_inames:
-            iname = todo_inames.pop()
-
-            _, dim_idx = var_dict[iname]
-            s2smap = s2smap.move_dims(
-                    dim_type.param, s2smap.dim(dim_type.param),
-                    dim_type.out, dim_idx, 1)
+            result.append(args[saxis_source])
         else:
-            return s2smap
-
-
-def build_global_storage_to_sweep_map(kernel, invocation_descriptors,
-        domain_dup_sweep, dup_sweep_index,
-        storage_axis_names, storage_axis_sources,
-        sweep_inames, primed_sweep_inames, prime_sweep_inames):
-    """
-    As a side effect, this fills out is_in_footprint in the
-    invocation descriptors.
-    """
-
-    # The storage map goes from storage axes to the domain.
-    # The first len(arg_names) storage dimensions are the rule's arguments.
-
-    global_stor2sweep = None
-
-    # build footprint
-    for invdesc in invocation_descriptors:
-        if invdesc.expands_footprint:
-            stor2sweep = build_per_access_storage_to_domain_map(
-                    invdesc, domain_dup_sweep,
-                    storage_axis_names, storage_axis_sources,
-                    prime_sweep_inames)
-
-            if global_stor2sweep is None:
-                global_stor2sweep = stor2sweep
-            else:
-                global_stor2sweep = global_stor2sweep.union(stor2sweep)
-
-            invdesc.is_in_footprint = True
-
-    if isinstance(global_stor2sweep, isl.BasicMap):
-        global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep)
-    global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep)
-
-    # space for global_stor2sweep:
-    # [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn)
-
-    # {{{ check if non-footprint-building invocation descriptors fall into footprint
-
-    # Make all inames except the sweep parameters. (The footprint may depend on
-    # those.) (I.e. only leave sweep inames as out parameters.)
-    global_s2s_par_dom = move_to_par_from_out(
-            global_stor2sweep, except_inames=frozenset(primed_sweep_inames)).domain()
-
-    for invdesc in invocation_descriptors:
-        if not invdesc.expands_footprint:
-            arg_inames = (
-                    set(global_s2s_par_dom.get_var_names(dim_type.param))
-                    & kernel.all_inames())
-
-            for arg in invdesc.args:
-                arg_inames.update(get_dependencies(arg))
-            arg_inames = frozenset(arg_inames)
-
-            from loopy.kernel import CannotBranchDomainTree
-            try:
-                usage_domain = kernel.get_inames_domain(arg_inames)
-            except CannotBranchDomainTree:
-                # and that's the end of that.
-                invdesc.is_in_footprint = False
-                continue
-
-            for i in range(usage_domain.dim(dim_type.set)):
-                iname = usage_domain.get_dim_name(dim_type.set, i)
-                if iname in sweep_inames:
-                    usage_domain = usage_domain.set_dim_name(
-                            dim_type.set, i, iname+"'")
-
-            stor2sweep = build_per_access_storage_to_domain_map(invdesc,
-                    usage_domain, storage_axis_names, storage_axis_sources,
-                    prime_sweep_inames)
-
-            if isinstance(stor2sweep, isl.BasicMap):
-                stor2sweep = isl.Map.from_basic_map(stor2sweep)
-
-            stor2sweep = stor2sweep.intersect_range(usage_domain)
-
-            stor2sweep = move_to_par_from_out(stor2sweep,
-                    except_inames=frozenset(primed_sweep_inames))
-
-            s2s_domain = stor2sweep.domain()
-            s2s_domain, aligned_g_s2s_parm_dom = isl.align_two(
-                    s2s_domain, global_s2s_par_dom)
-
-            arg_restrictions = (
-                    aligned_g_s2s_parm_dom
-                    .eliminate(dim_type.set, 0,
-                        aligned_g_s2s_parm_dom.dim(dim_type.set))
-                    .remove_divs())
-
-            is_in_footprint = (arg_restrictions & s2s_domain).is_subset(
-                    aligned_g_s2s_parm_dom)
-
-            invdesc.is_in_footprint = is_in_footprint
-
-    # }}}
-
-    return global_stor2sweep
-
-# }}}
-
-
-# {{{ compute storage bounds
-
-def find_var_base_indices_and_shape_from_inames(
-        domain, inames, cache_manager, context=None):
-    base_indices_and_sizes = [
-            cache_manager.base_index_and_length(domain, iname, context)
-            for iname in inames]
-    return list(zip(*base_indices_and_sizes))
-
-
-def compute_bounds(kernel, domain, stor2sweep,
-        primed_sweep_inames, storage_axis_names):
-
-    bounds_footprint_map = move_to_par_from_out(
-            stor2sweep, except_inames=frozenset(primed_sweep_inames))
-
-    # compute bounds for each storage axis
-    storage_domain = bounds_footprint_map.domain().coalesce()
-
-    if not storage_domain.is_bounded():
-        raise RuntimeError("sweep did not result in a bounded storage domain")
-
-    return find_var_base_indices_and_shape_from_inames(
-            storage_domain, [saxis+"'" for saxis in storage_axis_names],
-            kernel.cache_manager, context=kernel.assumptions)
-
-# }}}
+            result.append(var(saxis_source))
 
-
-def get_access_info(kernel, domain,
-        storage_axis_names, storage_axis_sources,
-        sweep_inames, invocation_descriptors):
-
-    # {{{ duplicate sweep inames
-
-    # The duplication is necessary, otherwise the storage fetch
-    # inames remain weirdly tied to the original sweep inames.
-
-    primed_sweep_inames = [psin+"'" for psin in sweep_inames]
-    from loopy.isl_helpers import duplicate_axes
-    dup_sweep_index = domain.space.dim(dim_type.out)
-    domain_dup_sweep = duplicate_axes(
-            domain, sweep_inames,
-            primed_sweep_inames)
-
-    prime_sweep_inames = SubstitutionMapper(make_subst_func(
-        dict((sin, var(psin))
-            for sin, psin in zip(sweep_inames, primed_sweep_inames))))
-
-    # }}}
-
-    stor2sweep = build_global_storage_to_sweep_map(
-            kernel, invocation_descriptors,
-            domain_dup_sweep, dup_sweep_index,
-            storage_axis_names, storage_axis_sources,
-            sweep_inames, primed_sweep_inames, prime_sweep_inames)
-
-    storage_base_indices, storage_shape = compute_bounds(
-            kernel, domain, stor2sweep, primed_sweep_inames,
-            storage_axis_names)
-
-    # compute augmented domain
-
-    # {{{ filter out unit-length dimensions
-
-    non1_storage_axis_names = []
-    non1_storage_shape = []
-
-    for saxis, bi, l in zip(storage_axis_names, storage_base_indices, storage_shape):
-        if l != 1:
-            non1_storage_axis_names.append(saxis)
-            non1_storage_shape.append(l)
-
-    # }}}
-
-    # {{{ subtract off the base indices
-    # add the new, base-0 indices as new in dimensions
-
-    sp = stor2sweep.get_space()
-    stor_idx = sp.dim(dim_type.out)
-
-    n_stor = len(storage_axis_names)
-    nn1_stor = len(non1_storage_axis_names)
-
-    aug_domain = stor2sweep.move_dims(
-            dim_type.out, stor_idx,
-            dim_type.in_, 0,
-            n_stor).range()
-
-    # aug_domain space now:
-    # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes']
-
-    aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor)
-    for i, name in enumerate(non1_storage_axis_names):
-        aug_domain = aug_domain.set_dim_name(dim_type.set, stor_idx+i, name)
-
-    # aug_domain space now:
-    # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes]
-
-    from loopy.symbolic import aff_from_expr
-    for saxis, bi, s in zip(storage_axis_names, storage_base_indices, storage_shape):
-        if s != 1:
-            cns = isl.Constraint.equality_from_aff(
-                    aff_from_expr(aug_domain.get_space(),
-                        var(saxis) - (var(saxis+"'") - bi)))
-
-            aug_domain = aug_domain.add_constraint(cns)
-
-    # }}}
-
-    # eliminate (primed) storage axes with non-zero base indices
-    aug_domain = aug_domain.project_out(dim_type.set, stor_idx+nn1_stor, n_stor)
-
-    # eliminate duplicated sweep_inames
-    nsweep = len(sweep_inames)
-    aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index, nsweep)
-
-    return (non1_storage_axis_names, aug_domain,
-            storage_base_indices, non1_storage_shape)
+    return result
 
 
 def simplify_via_aff(expr):
@@ -369,7 +65,7 @@ def simplify_via_aff(expr):
         expr))
 
 
-class InvocationGatherer(ExpandingIdentityMapper):
+class RuleInvocationGatherer(ExpandingIdentityMapper):
     def __init__(self, kernel, subst_name, subst_tag, within):
         ExpandingIdentityMapper.__init__(self,
                 kernel.substitutions, kernel.get_var_name_generator())
@@ -383,7 +79,7 @@ class InvocationGatherer(ExpandingIdentityMapper):
         self.subst_tag = subst_tag
         self.within = within
 
-        self.invocation_descriptors = []
+        self.access_descriptors = []
 
     def map_substitution(self, name, tag, arguments, expn_state):
         process_me = name == self.subst_name
@@ -407,6 +103,9 @@ class InvocationGatherer(ExpandingIdentityMapper):
                     | get_dependencies(self.subst_expander(
                         arg_val, insn_id=None, insn_tags=None)))
 
+        # FIXME: This is too strict--and the footprint machinery
+        # needs to be taught how to deal with locally constant
+        # variables.
         if not arg_deps <= self.kernel.all_inames():
             from warnings import warn
             warn("Precompute arguments in '%s(%s)' do not consist exclusively "
@@ -420,17 +119,21 @@ class InvocationGatherer(ExpandingIdentityMapper):
             return ExpandingIdentityMapper.map_substitution(
                     self, name, tag, arguments, expn_state)
 
-        self.invocation_descriptors.append(
-                InvocationDescriptor(
-                    args=[arg_context[arg_name] for arg_name in rule.arguments],
-                    expansion_stack=expn_state.stack))
+        args = [arg_context[arg_name] for arg_name in rule.arguments]
+
+        # Do not set expands_footprint here; it is set below.
+        self.access_descriptors.append(
+                RuleAccessDescriptor(
+                    identifier=access_descriptor_id(args, expn_state.stack),
+                    args=args,
+                    ))
 
         return 0  # exact value irrelevant
 
 
-class InvocationReplacer(ExpandingIdentityMapper):
+class RuleInvocationReplacer(ExpandingIdentityMapper):
     def __init__(self, kernel, subst_name, subst_tag, within,
-            invocation_descriptors,
+            access_descriptors, array_base_map,
             storage_axis_names, storage_axis_sources,
             storage_base_indices, non1_storage_axis_names,
             target_var_name):
@@ -446,7 +149,8 @@ class InvocationReplacer(ExpandingIdentityMapper):
         self.subst_tag = subst_tag
         self.within = within
 
-        self.invocation_descriptors = invocation_descriptors
+        self.access_descriptors = access_descriptors
+        self.array_base_map = array_base_map
 
         self.storage_axis_names = storage_axis_names
         self.storage_axis_sources = storage_axis_sources
@@ -474,21 +178,21 @@ class InvocationReplacer(ExpandingIdentityMapper):
             return ExpandingIdentityMapper.map_substitution(
                     self, name, tag, arguments, expn_state)
 
-        matching_invdesc = None
-        for invdesc in self.invocation_descriptors:
-            if invdesc.args == args and expn_state.stack:
+        matching_accdesc = None
+        for accdesc in self.access_descriptors:
+            if accdesc.identifier == access_descriptor_id(args, expn_state.stack):
                 # Could be more than one, that's fine.
-                matching_invdesc = invdesc
+                matching_accdesc = accdesc
                 break
 
-        assert matching_invdesc is not None
+        assert matching_accdesc is not None
 
-        invdesc = matching_invdesc
-        del matching_invdesc
+        accdesc = matching_accdesc
+        del matching_accdesc
 
         # }}}
 
-        if not invdesc.is_in_footprint:
+        if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
             return ExpandingIdentityMapper.map_substitution(
                     self, name, tag, arguments, expn_state)
 
@@ -526,7 +230,8 @@ class InvocationReplacer(ExpandingIdentityMapper):
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={},
-        default_tag="l.auto", dtype=None, fetch_bounding_box=False):
+        default_tag="l.auto", dtype=None, fetch_bounding_box=False,
+        temporary_is_local=None):
     """Precompute the expression described in the substitution rule determined by
     *subst_use* and store it in a temporary array. A precomputation needs two
     things to operate, a list of *sweep_inames* (order irrelevant) and an
@@ -642,11 +347,17 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
 
+    from loopy.kernel.data import parse_tag
+    default_tag = parse_tag(default_tag)
+
+    subst = kernel.substitutions[subst_name]
+    c_subst_name = subst_name.replace(".", "_")
+
     # }}}
 
-    # {{{ process invocations in footprint generators, start invocation_descriptors
+    # {{{ process invocations in footprint generators, start access_descriptors
 
-    invocation_descriptors = []
+    access_descriptors = []
 
     if footprint_generators:
         from pymbolic.primitives import Variable, Call
@@ -660,35 +371,29 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 raise ValueError("footprint generator must "
                         "be substitution rule invocation")
 
-            invocation_descriptors.append(
-                    InvocationDescriptor(args=args,
+            access_descriptors.append(
+                    RuleAccessDescriptor(
+                        identifier=access_descriptor_id(args, None),
                         expands_footprint=True,
-                        expansion_stack=None))
+                        args=args
+                        ))
 
     # }}}
 
-    c_subst_name = subst_name.replace(".", "_")
+    # {{{ gather up invocations in kernel code, finish access_descriptors
 
-    from loopy.kernel.data import parse_tag
-    default_tag = parse_tag(default_tag)
-
-    subst = kernel.substitutions[subst_name]
-    arg_names = subst.arguments
-
-    # {{{ gather up invocations in kernel code, finish invocation_descriptors
-
-    invg = InvocationGatherer(kernel, subst_name, subst_tag, within)
+    invg = RuleInvocationGatherer(kernel, subst_name, subst_tag, within)
 
     import loopy as lp
     for insn in kernel.instructions:
         if isinstance(insn, lp.ExpressionInstruction):
             invg(insn.expression, insn.id, insn.tags)
 
-    for invdesc in invg.invocation_descriptors:
-        invocation_descriptors.append(
-                invdesc.copy(expands_footprint=footprint_generators is None))
+    for accdesc in invg.access_descriptors:
+        access_descriptors.append(
+                accdesc.copy(expands_footprint=footprint_generators is None))
 
-    if not invocation_descriptors:
+    if not access_descriptors:
         raise RuntimeError("no invocations of '%s' found" % subst_name)
 
     # }}}
@@ -700,9 +405,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     expanding_usage_arg_deps = set()
 
-    for invdesc in invocation_descriptors:
-        if invdesc.expands_footprint:
-            for arg in invdesc.args:
+    for accdesc in access_descriptors:
+        if accdesc.expands_footprint:
+            for arg in accdesc.args:
                 expanding_usage_arg_deps.update(
                         get_dependencies(arg) & kernel.all_inames())
 
@@ -731,7 +436,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     if storage_axes is None:
         storage_axes = (
                 list(extra_storage_axes)
-                + list(range(len(arg_names))))
+                + list(range(len(subst.arguments))))
 
     expr_subst_dict = {}
 
@@ -780,6 +485,16 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
+    # {{{ fill out access_descriptors[...].storage_axis_exprs
+
+    access_descriptors = [
+            accdesc.copy(
+                storage_axis_exprs=storage_axis_exprs(
+                    storage_axis_sources, accdesc.args))
+            for accdesc in access_descriptors]
+
+    # }}}
+
     expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
     assert expanding_inames <= kernel.all_inames()
 
@@ -801,37 +516,26 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         # }}}
 
-        (non1_storage_axis_names, new_domain,
-                storage_base_indices, non1_storage_shape) = \
-                        get_access_info(kernel, domch.domain,
-                                storage_axis_names, storage_axis_sources,
-                                sweep_inames, invocation_descriptors)
-
-        from loopy.isl_helpers import convexify, boxify
-        if fetch_bounding_box:
-            new_domain = boxify(kernel.cache_manager, new_domain,
-                    non1_storage_axis_names, kernel.assumptions)
-        else:
-            new_domain = convexify(new_domain)
+        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
+                access_descriptors, len(storage_axis_names))
 
-        for saxis in storage_axis_names:
-            if saxis not in non1_storage_axis_names:
+        non1_storage_axis_names = []
+        for i, saxis in enumerate(storage_axis_names):
+            if abm.non1_storage_axis_flags[i]:
+                non1_storage_axis_names.append(saxis)
+            else:
                 del new_iname_to_tag[saxis]
 
-        new_kernel_domains = domch.get_domains_with(new_domain)
+        new_kernel_domains = domch.get_domains_with(
+                abm.augment_domain_with_sweep(
+                    domch.domain, non1_storage_axis_names,
+                    boxify_sweep=fetch_bounding_box))
+
     else:
         # leave kernel domains unchanged
         new_kernel_domains = kernel.domains
 
-        non1_storage_axis_names = ()
-        storage_base_indices = ()
-        non1_storage_shape = ()
-
-        # no index dependencies--every reference to the subst rule
-        # is necessarily in the footprint.
-
-        for invdesc in invocation_descriptors:
-            invdesc.is_in_footprint = True
+        abm = NoOpArrayToBufferMap()
 
     # {{{ set up compute insn
 
@@ -852,7 +556,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     compute_expr = (SubstitutionMapper(
         make_subst_func(dict(
             (arg_name, zero_length_1_arg(arg_name)+bi)
-            for arg_name, bi in zip(storage_axis_names, storage_base_indices)
+            for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices)
             )))
         (compute_expr))
 
@@ -866,10 +570,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ substitute rule into expressions in kernel (if within footprint)
 
-    invr = InvocationReplacer(kernel, subst_name, subst_tag, within,
-            invocation_descriptors,
+    invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within,
+            access_descriptors, abm,
             storage_axis_names, storage_axis_sources,
-            storage_base_indices, non1_storage_axis_names,
+            abm.storage_base_indices, non1_storage_axis_names,
             target_var_name)
 
     kernel = invr.map_kernel(kernel)
@@ -886,13 +590,16 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     import loopy as lp
 
+    if temporary_is_local is None:
+        temporary_is_local = lp.auto
+
     new_temporary_variables = kernel.temporary_variables.copy()
     temp_var = lp.TemporaryVariable(
             name=target_var_name,
             dtype=dtype,
-            base_indices=(0,)*len(non1_storage_shape),
-            shape=tuple(non1_storage_shape),
-            is_local=lp.auto)
+            base_indices=(0,)*len(abm.non1_storage_shape),
+            shape=tuple(abm.non1_storage_shape),
+            is_local=temporary_is_local)
 
     new_temporary_variables[target_var_name] = temp_var
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 6bba764842cc19a55f8e1bd02cccdfb71bacccc8..6ef5378d6456ca022f9dd27ee87e593810981e98 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -638,7 +638,7 @@ def generate_loop_schedules_internal(sched_state, loop_priority, schedule=[],
 
     if debug_mode:
         print(75*"=")
-        six.input("Hit Enter for next schedule:")
+        six.moves.input("Hit Enter for next schedule:")
 
     if not active_inames and not unscheduled_insn_ids:
         # if done, yield result
@@ -1025,8 +1025,6 @@ def generate_loop_schedules(kernel, debug_args={}):
     from loopy.check import pre_schedule_checks
     pre_schedule_checks(kernel)
 
-    logger.info("%s: schedule start" % kernel.name)
-
     schedule_count = 0
 
     debug = ScheduleDebugger(**debug_args)
@@ -1094,7 +1092,7 @@ def generate_loop_schedules(kernel, debug_args={}):
             print("  debug_args=dict(interactive=False)")
             print("to generate_loop_schedules().")
             print(75*"-")
-            six.input("Enter:")
+            six.moves.input("Enter:")
             print()
             print()
 
@@ -1134,6 +1132,11 @@ def get_one_scheduled_kernel(kernel):
 
         kernel_count = 0
 
+        from time import time
+        start_time = time()
+
+        logger.info("%s: schedule start" % kernel.name)
+
         for scheduled_kernel in generate_loop_schedules(kernel):
             kernel_count += 1
 
@@ -1145,6 +1148,9 @@ def get_one_scheduled_kernel(kernel):
                 ambiguous = True
                 break
 
+        logger.info("%s: scheduling done after %.2f s" % (
+            kernel.name, time()-start_time))
+
     if ambiguous:
         from warnings import warn
         from loopy.diagnostic import LoopyWarning
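Since the start/stop messages above are emitted at INFO level, here is a small sketch of surfacing the new per-kernel scheduling time (assuming get_one_scheduled_kernel is reachable under the top-level loopy namespace):

    import logging
    import numpy as np
    import loopy as lp

    # Show loopy's INFO-level log output, including
    # "<name>: schedule start" and "<name>: scheduling done after X.XX s".
    logging.basicConfig(level=logging.INFO)

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)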
diff --git a/loopy/subst.py b/loopy/subst.py
index 90c0a79162c77ac1cbd97ac8ffaf12fd8c8c36d0..d4dba4b5dfd9918926adaad2af9179813914e8be 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -43,10 +43,12 @@ class ExprDescriptor(Record):
     __slots__ = ["insn", "expr", "unif_var_dict"]
 
 
-def extract_subst(kernel, subst_name, template, parameters):
+def extract_subst(kernel, subst_name, template, parameters=()):
     """
     :arg subst_name: The name of the substitution rule to be created.
     :arg template: Unification template expression.
+    :arg parameters: An iterable of parameters used in
+        *template*, or a comma-separated string of the same.
 
     All targeted subexpressions must match ('unify with') *template*
     The template may contain '*' wildcards that will have to match exactly across all
@@ -57,6 +59,10 @@ def extract_subst(kernel, subst_name, template, parameters):
         from pymbolic import parse
         template = parse(template)
 
+    if isinstance(parameters, str):
+        parameters = tuple(
+                s.strip() for s in parameters.split(","))
+
     var_name_gen = kernel.get_var_name_generator()
 
     # {{{ replace any wildcards in template with new variables
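A minimal sketch of the relaxed *parameters* argument (kernel and rule names here mirror the test added in test_loopy.py below): a comma-separated string and an explicit tuple are intended to be interchangeable.

    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = 23*b[i]**2 + 25*b[i]**2")

    # parameters given as a comma-separated string...
    knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", parameters="alpha")
    # ...is intended to behave the same as an explicit tuple:
    # knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2",
    #         parameters=("alpha",))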
@@ -268,7 +274,6 @@ def temporary_to_subst(kernel, temp_name, within=None):
     """Extract an assignment to a temporary variable
     as a :ref:`substituion-rule`. The temporary may
 
-
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
 
@@ -319,7 +324,6 @@ def temporary_to_subst(kernel, temp_name, within=None):
         return def_id
 
     usage_to_definition = {}
-    definition_insn_ids = set()
 
     for insn in kernel.instructions:
         if temp_name not in insn.read_dependency_names():
@@ -332,7 +336,11 @@ def temporary_to_subst(kernel, temp_name, within=None):
                     % (temp_name, insn.id))
 
         usage_to_definition[insn.id] = def_id
-        definition_insn_ids.add(def_id)
+
+    definition_insn_ids = set()
+    for insn in kernel.instructions:
+        if temp_name in insn.write_dependency_names():
+            definition_insn_ids.add(insn.id)
 
     # }}}
 
@@ -350,7 +358,7 @@ def temporary_to_subst(kernel, temp_name, within=None):
 
     new_substs = kernel.substitutions.copy()
     for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name):
-        def_insn = id_to_insn[def_id]
+        def_insn = kernel.id_to_insn[def_id]
 
         (_, indices), = def_insn.assignees_and_indices()
 
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index f1d3bfdde83735a34b9f86f14c561e9aa1bb2124..a1b323d7cf0deaef308952d87b1523b0a020ad68 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -34,6 +34,9 @@ from loopy.target.opencl import OpenCLTarget
 import pyopencl as cl
 import pyopencl.characterize as cl_char
 
+# This ensures the dtype registry is populated.
+import pyopencl.tools  # noqa
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -228,7 +231,7 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
 # }}}
 
 
-# {{{
+# {{{ pyopencl target
 
 class PyOpenCLTarget(OpenCLTarget):
     def __init__(self, device=None):
@@ -236,9 +239,6 @@ class PyOpenCLTarget(OpenCLTarget):
 
         self.device = device
 
-        # This ensures the dtype registry is populated.
-        import pyopencl.tools  # noqa
-
     hash_fields = ["device"]
     comparison_fields = ["device"]
 
diff --git a/test/test_dg.py b/test/test_dg.py
index 291da44846f551e4576ec81326836191942dbfd8..581562da89210ea476700191c6d21ad2dbe7fd3d 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 import numpy as np
 import pyopencl as cl
+import pyopencl.array  # noqa
 import loopy as lp
 
 import logging  # noqa
diff --git a/test/test_fortran.py b/test/test_fortran.py
index bf6ffe140289c13e5c04df02b0c226f44832d41b..33826e4063190f7d7d48b1bfcb4122931868bd1e 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -201,12 +201,122 @@ def test_temporary_to_subst_indices(ctx_factory):
 
     ref_knl = knl
 
+    assert "a" in knl.temporary_variables
     knl = lp.temporary_to_subst(knl, "a")
+    assert "a" not in knl.temporary_variables
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
+def test_if(ctx_factory):
+    fortran_src = """
+        subroutine fill(out, out2, inp, n)
+          implicit none
+
+          real*8 a, b, out(n), out2(n), inp(n)
+          integer n, i, j
+
+          do i = 1, n
+            a = inp(i)
+            if (a.ge.3) then
+                b = 2*a
+                do j = 1,3
+                    b = 3 * b
+                end do
+                out(i) = 5*b
+            else
+                out(i) = 4*a
+            endif
+          end do
+        end
+        """
+
+    from loopy.frontend.fortran import f2loopy
+    knl, = f2loopy(fortran_src)
+
+    ref_knl = knl
+
+    knl = lp.temporary_to_subst(knl, "a")
+
+    ctx = ctx_factory()
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
+def test_tagged(ctx_factory):
+    fortran_src = """
+        subroutine rot_norm(out, alpha, out2, inp, inp2, n)
+          implicit none
+          real*8 a, b, r, out(n), out2(n), inp(n), inp2(n)
+          real*8 alpha
+          integer n, i
+
+          do i = 1, n
+            !$loopy begin tagged: input
+            a = cos(alpha)*inp(i) + sin(alpha)*inp2(i)
+            b = -sin(alpha)*inp(i) + cos(alpha)*inp2(i)
+            !$loopy end tagged: input
+
+            r = sqrt(a**2 + b**2)
+            a = a/r
+            b = b/r
+
+            out(i) = a
+            out2(i) = b
+          end do
+        end
+        """
+
+    from loopy.frontend.fortran import f2loopy
+    knl, = f2loopy(fortran_src)
+
+    assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2
+
+
+def test_matmul(ctx_factory):
+    fortran_src = """
+        subroutine dgemm(m,n,l,a,b,c)
+          implicit none
+          real*8 temp, a(m,l),b(l,n),c(m,n)
+          integer m,n,k,i,j,l
+
+          do j = 1,n
+            do i = 1,m
+              temp = 0
+              do k = 1,l
+                temp = temp + b(k,j)*a(i,k)
+              end do
+              c(i,j) = temp
+            end do
+          end do
+        end subroutine
+
+        !$loopy begin transform
+        !$loopy end transform
+        """
+
+    from loopy.frontend.fortran import f2loopy
+    knl, = f2loopy(fortran_src)
+
+    assert len(knl.domains) == 1
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16,
+            outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", 8,
+            outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 32)
+
+    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
+    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
+    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
+    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")
+
+    ctx = ctx_factory()
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_linalg.py b/test/test_linalg.py
index b1ebacb1e477e2cb2529fed263b94faa787e334d..d1a995b19fe22c818f03cadc7131e7732d3e1d58 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -183,32 +183,28 @@ def test_plain_matrix_mul(ctx_factory):
 
 
 def test_variable_size_matrix_mul(ctx_factory):
-    dtype = np.float32
     ctx = ctx_factory()
-    order = "C"
 
     n = get_suitable_size(ctx)
 
     knl = lp.make_kernel(
-            "[n] -> {[i,j,k]: 0<=i,j,k<n}",
-            [
-                "c[i, j] = sum(k, a[i, k]*b[k, j]) {id=labl}"
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape="n, n", order=order),
-                lp.GlobalArg("b", dtype, shape="n, n", order=order),
-                lp.GlobalArg("c", dtype, shape="n, n", order=order),
-                lp.ValueArg("n", np.int32, approximately=n),
-                ],
-            name="matmul", assumptions="n >= 16")
+            "{[i,j,k]: 0<=i,j,k<n}",
+            "c[i, j] = sum(k, a[i, k]*b[k, j])")
+
+    knl = lp.add_dtypes(knl, {
+        "a": np.float32,
+        "b": np.float32,
+        })
 
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 16,
-            outer_tag="g.0", inner_tag="l.1")
-    knl = lp.split_iname(knl, "j", 8,
-            outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_iname(knl, "k", 32)
+            outer_tag="g.0", inner_tag="l.1",
+            slabs=(0, 1))
+    knl = lp.split_iname(knl, "j", 16,
+            outer_tag="g.1", inner_tag="l.0",
+            slabs=(0, 1))
+    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
 
     knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
@@ -226,7 +222,7 @@ def test_funny_shape_matrix_mul(ctx_factory):
     l = m+12
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
                 ],
@@ -245,8 +241,12 @@ def test_funny_shape_matrix_mul(ctx_factory):
             outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 32)
 
-    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
-    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
+    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
+    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
+    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
+    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
+    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             op_count=[2*n**3/1e9], op_label=["GFlops"],
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 17e28af6a05ef94abb5ba061271d9f7f5d4ec617..6a59eb7388afedcc65bb6aa3d42f57b2e7152ac3 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -63,14 +63,9 @@ def test_complicated_subst(ctx_factory):
                 h(x) := 1 + g(x) + 20*g$two(x)
 
                 a[i] = h$one(i) * h$two(i)
-                """,
-            [
-                lp.GlobalArg("a", np.float32, shape=("n",)),
-                lp.ValueArg("n", np.int32),
-                ])
+                """)
 
-    from loopy.subst import expand_subst
-    knl = expand_subst(knl, "g$two < h$two")
+    knl = lp.expand_subst(knl, "g$two < h$two")
 
     print(knl)
 
@@ -84,6 +79,23 @@ def test_complicated_subst(ctx_factory):
         assert substs_with_letter == how_many
 
 
+def test_extract_subst(ctx_factory):
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+                a[i] = 23*b[i]**2 + 25*b[i]**2
+                """)
+
+    knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha")
+
+    print(knl)
+
+    from loopy.symbolic import parse
+
+    insn, = knl.instructions
+    assert insn.expression == parse("bsquare(23) + bsquare(25)")
+
+
 def test_type_inference_no_artificial_doubles(ctx_factory):
     ctx = ctx_factory()
 
@@ -1694,13 +1706,11 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory):
         assert (a_ref == a_knl).get().all()
 
 
-def test_multiple_writes_to_local_temporary(ctx_factory):
+def test_multiple_writes_to_local_temporary():
     # Loopy would previously only handle barrier insertion correctly if exactly
     # one instruction wrote to each local temporary. This tests that multiple
     # writes are OK.
 
-    ctx = ctx_factory()
-
     knl = lp.make_kernel(
         "{[i,e]: 0<=i<5 and 0<=e<nelements}",
         """
@@ -1709,13 +1719,13 @@ def test_multiple_writes_to_local_temporary(ctx_factory):
         """)
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    knl = lp.preprocess_kernel(knl)
     for k in lp.generate_loop_schedules(knl):
         code, _ = lp.generate_code(k)
         print(code)
 
 
-def test_fd_demo(ctx_factory):
+def test_fd_demo():
     knl = lp.make_kernel(
         "{[i,j]: 0<=i,j<n}",
         "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \
@@ -1743,6 +1753,26 @@ def test_fd_demo(ctx_factory):
     assert "double" not in code
 
 
+def test_fd_1d(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        "result[i] = u[i+1]-u[i]")
+
+    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16)
+    knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j")
+    knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for")
+    knl = lp.assume(knl, "n mod 16 = 0")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=2048))
+
+
 def test_make_copy_kernel(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)