diff --git a/loopy/__init__.py b/loopy/__init__.py
index e7d8ce637967a674f73aa1ada6bae6b6f098cef4..4d296fe4b0a6898040b564a59aacf3101cc4375b 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -23,20 +23,23 @@ class LoopyAdvisory(UserWarning):
 from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg
 
 from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph
-from loopy.cse import realize_cse
+from loopy.subst import extract_subst, apply_subst
+from loopy.cse import precompute
 from loopy.preprocess import preprocess_kernel
 from loopy.schedule import generate_loop_schedules
 from loopy.codegen import generate_code
 from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq
 from loopy.check import check_kernels
 
-__all__ = ["ScalarArg", "ArrayArg", "ImageArg",
+__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg",
         "get_dot_dependency_graph",
         "preprocess_kernel", "generate_loop_schedules",
         "generate_code",
-        "CompiledKernel", "drive_timing_run", "check_kernels",
+        "CompiledKernel", "drive_timing_run", "auto_test_vs_seq", "check_kernels",
         "make_kernel", "split_dimension", "join_dimensions",
-        "tag_dimensions", "realize_cse", "add_prefetch"
+        "tag_dimensions",
+        "extract_subst", "apply_subst",
+        "precompute", "add_prefetch"
         ]
 
 # }}}
@@ -62,12 +65,7 @@ def make_kernel(*args, **kwargs):
 
     newly_created_vars = set()
 
-    from loopy.symbolic import ParametrizedSubstitutor
-    cse_sub = ParametrizedSubstitutor(knl.cses, wrap_cse=True)
-    subst_sub = ParametrizedSubstitutor(knl.substitutions, wrap_cse=False)
-
     for insn in knl.instructions:
-        insn = insn.copy(expression=subst_sub(cse_sub(insn.expression)))
 
         # {{{ sanity checking
 
@@ -190,8 +188,7 @@ def make_kernel(*args, **kwargs):
             domain=new_domain,
             temporary_variables=new_temp_vars,
             iname_to_tag=new_iname_to_tag,
-            iname_to_tag_requests=[],
-            cses={})
+            iname_to_tag_requests=[])
 
 # }}}
 
@@ -276,6 +273,7 @@ def split_dimension(kernel, split_iname, inner_length,
     iname_slab_increments = kernel.iname_slab_increments.copy()
     iname_slab_increments[outer_iname] = slabs
     result = (kernel
+            .map_expressions(subst_mapper, exclude_instructions=True)
             .copy(domain=new_domain,
                 iname_slab_increments=iname_slab_increments,
                 instructions=new_insns,
@@ -367,9 +365,9 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()):
                 forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
             for insn in kernel.instructions]
 
-    result = kernel.copy(
-            instructions=new_insns,
-            domain=new_domain)
+    result = (kernel
+            .map_expressions(subst_map, exclude_instructions=True)
+            .copy(instructions=new_insns, domain=new_domain))
 
     return tag_dimensions(result, {new_iname: tag})
 
@@ -419,53 +417,47 @@ def tag_dimensions(kernel, iname_to_tag, force=False):
 
 # {{{ convenience: add_prefetch
 
-def add_prefetch(kernel, var_name, fetch_dims=[], uni_template=None,
-        new_inames=None, default_tag="l.auto"):
-    used_cse_tags = set()
-    def map_cse(expr, rec):
-        used_cse_tags.add(expr.tag)
-        rec(expr.child)
+def add_prefetch(kernel, var_name, sweep_dims, dim_args=None,
+        new_inames=None, default_tag="l.auto", rule_name=None):
 
-    def get_unique_cse_tag():
-        from loopy.tools import generate_unique_possibilities
-        for cse_tag in generate_unique_possibilities(prefix="fetch_"+var_name):
-            if cse_tag not in used_cse_tags:
-                used_cse_tags.add(cse_tag)
-                return cse_tag
+    if rule_name is None:
+        rule_name = kernel.make_unique_subst_rule_name("%s_fetch" % var_name)
 
-    cse_tag = get_unique_cse_tag()
+    arg = kernel.arg_dict[var_name]
 
-    from loopy.symbolic import VariableFetchCSEMapper
-    vf_cse_mapper = VariableFetchCSEMapper(var_name, lambda: cse_tag)
-    kernel = kernel.copy(instructions=[
-            insn.copy(expression=vf_cse_mapper(insn.expression))
-            for insn in kernel.instructions])
+    newly_created_vars = set()
+    parameters = []
+    for i in range(len(arg.shape)):
+        based_on = "%s_i%d" % (var_name, i)
+        if dim_args is not None and i < len(dim_args):
+            based_on = dim_args[i]
 
-    if var_name in kernel.arg_dict:
-        dtype = kernel.arg_dict[var_name].dtype
-    else:
-        dtype = kernel.temporary_variables[var_name].dtype
+        par_name = kernel.make_unique_var_name(based_on=based_on,
+                extra_used_vars=newly_created_vars)
+        newly_created_vars.add(par_name)
+        parameters.append(par_name)
 
-    kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims, uni_template=uni_template,
-            new_inames=new_inames, default_tag=default_tag)
+    from pymbolic import var
+    uni_template = var(var_name)
+    if len(parameters) > 1:
+        uni_template = uni_template[tuple(var(par_name) for par_name in parameters)]
+    elif len(parameters) == 1:
+        uni_template = uni_template[var(parameters[0])]
+
+    kernel = extract_subst(kernel, rule_name, uni_template, parameters)
+
+    new_fetch_dims = []
+    for fd in sweep_dims:
+        if isinstance(fd, int):
+            new_fetch_dims.append(parameters[fd])
+        else:
+            new_fetch_dims.append(fd)
 
-    return kernel
+    return precompute(kernel, rule_name, arg.dtype, sweep_dims, new_arg_names=dim_args,
+            default_tag=default_tag)
 
 # }}}
 
-def remove_cses(kernel):
-    from loopy.symbolic import CSECallbackMapper
-
-    def map_cse(expr, rec):
-        return expr.child
-
-    new_insns = []
-    for insn in kernel.instructions:
-        new_insns.append(
-                insn.copy(
-                    expression=CSECallbackMapper(map_cse)(insn.expression)))
-
-    return kernel.copy(instructions=new_insns)
 
 
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 0beaf47dbaf0958ee1c3e663151a165925a33a79..fb94fbecaf819537d829249197bf1c9756914207 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -62,7 +62,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 
         if lower_incr:
             assert lower_incr > 0
-            lower_slab = ("initial", isl.Set.universe(kernel.space)
+            lower_slab = ("initial", isl.BasicSet.universe(kernel.space)
                     .add_constraint(lb_cns_orig)
                     .add_constraint(ub_cns_orig)
                     .add_constraint(
@@ -78,7 +78,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 
         if upper_incr:
             assert upper_incr > 0
-            upper_slab = ("final", isl.Set.universe(kernel.space)
+            upper_slab = ("final", isl.BasicSet.universe(kernel.space)
                     .add_constraint(lb_cns_orig)
                     .add_constraint(ub_cns_orig)
                     .add_constraint(
@@ -98,7 +98,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
             slabs.append(lower_slab)
         slabs.append((
             ("bulk",
-                (isl.Set.universe(kernel.space)
+                (isl.BasicSet.universe(kernel.space)
                     .add_constraint(lower_bulk_bound)
                     .add_constraint(upper_bulk_bound)))))
         if upper_slab:
@@ -108,7 +108,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 
     else:
         return [("bulk",
-            (isl.Set.universe(kernel.space)
+            (isl.BasicSet.universe(kernel.space)
             .add_constraint(lb_cns_orig)
             .add_constraint(ub_cns_orig)))]
 
diff --git a/loopy/cse.py b/loopy/cse.py
index ee22a8b8fcbe79f3943783dbefffba51db04515f..748ef458b443be1e154a9d8751805337376b2cf1 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -12,50 +12,8 @@ from pymbolic import var
 
 
 
-def check_cse_iname_deps(iname, duplicate_inames, tag, dependencies, cse_tag, uni_template):
-    from loopy.kernel import (LocalIndexTagBase, GroupIndexTag, IlpTag)
-
-    if isinstance(tag, LocalIndexTagBase):
-        kind = "l"
-    elif isinstance(tag, GroupIndexTag):
-        kind = "g"
-    elif isinstance(tag, IlpTag):
-        kind = "i"
-    else:
-        kind = "o"
-
-    if iname not in duplicate_inames and iname in dependencies:
-        if kind == "i":
-            raise RuntimeError(
-                    "When realizing CSE with tag '%s', encountered iname "
-                    "'%s' which is depended upon by the CSE and tagged "
-                    "'%s', but not duplicated. The CSE would "
-                    "inherit this iname, which would lead to a write race. "
-                    "A likely solution of this problem is to also duplicate this "
-                    "iname."
-                    % (cse_tag, iname, tag))
-
-    if iname in duplicate_inames and kind == "g":
-        raise RuntimeError("duplicating the iname '%s' into "
-                "group index axes is not helpful, as they cannot "
-                "collaborate in computing a local/private variable"
-                %iname)
-
-    if iname in dependencies:
-        return
-
-    # the iname is *not* a dependency of the fetch expression
-    if iname in duplicate_inames:
-        raise RuntimeError("duplicating an iname ('%s') "
-                "that the CSE ('%s') does not depend on "
-                "does not make sense" % (iname, uni_template))
-
-
-
-
-class CSEDescriptor(Record):
-    __slots__ = ["insn", "cse", "independent_inames",
-            "unif_var_dict"]
+class InvocationDescriptor(Record):
+    __slots__ = ["expr", "args", ]
 
 
 
@@ -78,366 +36,233 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 
 
 
-def process_cses(kernel, uni_template,
-        independent_inames, matching_vars, cse_descriptors):
-    from loopy.symbolic import UnidirectionalUnifier
-
-    ind_inames_set = set(independent_inames)
-
-    uni_iname_list = independent_inames + matching_vars
-    footprint = None
-
-    uni_recs = []
-    matching_var_values = {}
-
-    for csed in cse_descriptors:
-        # {{{ find unifier
-
-        unif = UnidirectionalUnifier(
-                lhs_mapping_candidates=ind_inames_set | set(matching_vars))
-        unifiers = unif(uni_template, csed.cse.child)
-        if not unifiers:
-            raise RuntimeError("Unable to unify  "
-            "CSEs '%s' and '%s' (with lhs candidates '%s')" % (
-                uni_template, csed.cse.child,
-                ",".join(unif.lhs_mapping_candidates),
-                ))
-
-        # }}}
-
-        found_good_unifier = False
-
-        for unifier in unifiers:
-            # {{{ construct, check mapping
-
-            map_space = kernel.space
-            ln = len(uni_iname_list)
-            rn = kernel.space.dim(dim_type.out)
-
-            map_space = map_space.add_dims(dim_type.in_, ln)
-            for i, iname in enumerate(uni_iname_list):
-                map_space = map_space.set_dim_name(dim_type.in_, i, iname)
-
-            set_space = map_space.move_dims(
-                    dim_type.out, rn,
-                    dim_type.in_, 0, ln).range()
-
-            var_map = None
+def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
+        sweep_inames, invocation_descriptors):
+    global_footprint_map = None
 
-            rhs_deps = set()
+    processed_sweep_inames = set()
 
-            from loopy.symbolic import aff_from_expr
-            for lhs, rhs in unifier.equations:
-                cns = isl.Constraint.equality_from_aff(
-                        aff_from_expr(set_space, lhs - rhs))
+    for invdesc in invocation_descriptors:
 
-                rhs_deps.update(get_dependencies(rhs))
-
-                cns_map = isl.BasicMap.from_constraint(cns)
-                if var_map is None:
-                    var_map = cns_map
-                else:
-                    var_map = var_map.intersect(cns_map)
+        for iname in sweep_inames:
+            if iname in arg_names:
+                arg_idx = arg_names.index(iname)
+                processed_sweep_inames.add(
+                        get_dependencies(invdesc.args[arg_idx]))
+            else:
+                processed_sweep_inames.add(iname)
 
-            var_map = var_map.move_dims(
-                    dim_type.in_, 0,
-                    dim_type.out, rn, ln)
+        # {{{ construct, check mapping
 
-            restr_rhs_map = (
-                    isl.Map.from_basic_map(var_map)
-                    .intersect_range(kernel.domain))
+        map_space = kernel.space
+        ln = len(unique_new_arg_names)
+        rn = kernel.space.dim(dim_type.out)
 
-            # Sanity check: If the range of the map does not recover the
-            # domain of the expression, the unifier must have been no
-            # good.
-            if restr_rhs_map.range() != kernel.domain:
-                continue
+        map_space = map_space.add_dims(dim_type.in_, ln)
+        for i, iname in enumerate(unique_new_arg_names):
+            map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'")
 
-            restr_rhs_map = restr_rhs_map.project_out_except(
-                    rhs_deps, [dim_type.out])
+        set_space = map_space.move_dims(
+                dim_type.out, rn,
+                dim_type.in_, 0, ln).range()
 
-            # Sanity check: Injectivity here means that unique lead indices
-            # can be found for each
+        footprint_map = None
 
-            if not restr_rhs_map.is_injective():
-                raise RuntimeError("In CSEs '%s' and '%s': "
-                        "cannot find lead indices uniquely"
-                        % (uni_template, csed.cse.child))
+        from loopy.symbolic import aff_from_expr
+        for uarg_name, arg_val in zip(unique_new_arg_names, invdesc.args):
+            cns = isl.Constraint.equality_from_aff(
+                    aff_from_expr(set_space, var(uarg_name+"'") - arg_val))
 
-            footprint_contrib = restr_rhs_map.domain()
-            if footprint is None:
-                footprint = footprint_contrib
+            cns_map = isl.BasicMap.from_constraint(cns)
+            if footprint_map is None:
+                footprint_map = cns_map
             else:
-                footprint = footprint.union(footprint_contrib)
-
-            found_good_unifier = True
-
-            # }}}
-
-        if not found_good_unifier:
-            raise RuntimeError("No valid unifier for '%s' and '%s'"
-                    % (csed.cse.child, uni_template))
-
-        uni_recs.append(unifier)
+                footprint_map = footprint_map.intersect(cns_map)
 
-        # {{{ check that matching_vars have a unique_value
+        footprint_map = footprint_map.move_dims(
+                dim_type.in_, 0,
+                dim_type.out, rn, ln)
 
-        csed.unif_var_dict = dict((lhs.name, rhs)
-                for lhs, rhs in unifier.equations)
-        for mv_name in matching_vars:
-            if mv_name in matching_var_values:
-                if matching_var_values[mv_name] != csed.unif_var_dict[mv_name]:
-                    raise RuntimeError("two different expressions encountered "
-                            "for matching variable '%s' in unification template '%s':"
-                            "'%s' and '%s'" % (
-                                mv_name, uni_template,
-                                matching_var_values[mv_name], csed.unif_var_dict[mv_name]))
-            else:
-                matching_var_values[mv_name] = csed.unif_var_dict[mv_name]
+        if global_footprint_map is None:
+            global_footprint_map = footprint_map
+        else:
+            global_footprint_map = global_footprint_map.union(footprint_map)
 
         # }}}
 
-    assert (footprint
-            .project_out_except(independent_inames, [dim_type.set])
-            .is_bounded())
+    processed_sweep_inames = list(processed_sweep_inames)
 
-    return footprint, matching_var_values,
+    global_footprint_map = global_footprint_map.intersect_range(kernel.domain)
 
+    # move non-sweep-dimensions into parameter space
+    sweep_footprint_map = global_footprint_map.coalesce()
 
+    for iname in kernel.all_inames():
+        if iname not in processed_sweep_inames:
+            sp = sweep_footprint_map.get_space()
+            dt, idx = sp.get_var_dict()[iname]
+            sweep_footprint_map = sweep_footprint_map.move_dims(
+                    dim_type.param, sp.dim(dim_type.param),
+                    dt, idx, 1)
 
+    # compute bounding boxes to each set of parameters
+    sfm_dom = sweep_footprint_map.domain().coalesce()
 
+    if not sfm_dom.is_bounded():
+        raise RuntimeError("In precomputation of substitution '%s': "
+                "sweep did not result in a bounded footprint"
+                % subst_name)
 
-def make_compute_insn(kernel, cse_tag, uni_template,
-        target_var_name, target_var_base_indices,
-        independent_inames, ind_iname_to_tag, insn):
+    from loopy.kernel import find_var_base_indices_and_shape_from_inames
+    base_indices, shape = find_var_base_indices_and_shape_from_inames(
+            sfm_dom, [uarg+"'" for uarg in unique_new_arg_names],
+            kernel.cache_manager)
 
-    # {{{ decide whether to force a dep
+    # compute augmented domain
 
-    from loopy.symbolic import IndexVariableFinder
-    dependencies = IndexVariableFinder(
-            include_reduction_inames=False)(uni_template)
+    # {{{ subtract off the base indices
+    # add the new, base-0 as new in dimensions
 
-    parent_inames = kernel.insn_inames(insn) | insn.reduction_inames()
-    #print dependencies, parent_inames
-    #assert dependencies <= parent_inames
+    sp = global_footprint_map.get_space()
+    tgt_idx = sp.dim(dim_type.out)
 
-    for iname in parent_inames:
-        if iname in independent_inames:
-            tag = ind_iname_to_tag[iname]
-        else:
-            tag = kernel.iname_to_tag.get(iname)
+    n_args = len(unique_new_arg_names)
 
-        check_cse_iname_deps(
-                iname, independent_inames, tag, dependencies, cse_tag, uni_template)
+    aug_domain = global_footprint_map.move_dims(
+            dim_type.out, tgt_idx,
+            dim_type.in_, 0,
+            n_args).range().coalesce()
 
-    # }}}
+    aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, n_args)
+    for i, name in enumerate(unique_new_arg_names):
+        aug_domain = aug_domain.set_dim_name(dim_type.set, tgt_idx+i, name)
 
-    assignee = var(target_var_name)
+    # index layout now:
+    # <....out.....> (tgt_idx) <base-0 args> <args>
 
-    if independent_inames:
-        assignee = assignee[tuple(
-            var(iname)-bi
-            for iname, bi in zip(independent_inames, target_var_base_indices)
-            )]
+    from loopy.symbolic import aff_from_expr
+    for uarg_name, bi in zip(unique_new_arg_names, base_indices):
+        cns = isl.Constraint.equality_from_aff(
+                aff_from_expr(aug_domain.get_space(),
+                    var(uarg_name) - (var(uarg_name+"'") - bi)))
 
-    insn_prefix = cse_tag
-    if insn_prefix is None:
-        insn_prefix = "cse"
-    from loopy.kernel import Instruction
-    return Instruction(
-            id=kernel.make_unique_instruction_id(based_on=insn_prefix+"_compute"),
-            assignee=assignee,
-            expression=uni_template)
+        aug_domain = aug_domain.add_constraint(cns)
 
+    aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+n_args, n_args)
+    aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+n_args, n_args)
 
+    base_indices_2, shape_2 = find_var_base_indices_and_shape_from_inames(
+            aug_domain, unique_new_arg_names,
+            kernel.cache_manager)
 
+    assert base_indices_2 == [0] * n_args
+    assert shape_2 == shape
 
-def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
-        uni_template=None, ind_iname_to_tag={}, new_inames=None, default_tag="l.auto"):
-    """
-    :arg independent_inames: which inames are supposed to be separate loops
-        in the CSE. Also determines index order of temporary array.
-        The variables in independent_inames refer to the unification
-        template.
-    :arg uni_template: An expression against which all targeted subexpressions
-        must unify
+    return aug_domain, base_indices, shape
 
-        If None, a unification template will be chosen from among the targeted
-        CSEs. That CSE is chosen to depend on all the variables in
-        *independent_inames*.  It is an error if no such expression can be
-        found.
 
-        May contain '*' wildcards that will have to match exactly across all
-        unifications.
 
-    Process:
 
-    - Find all targeted CSEs.
 
-    - Find *uni_template* as described above.
+def simplify_via_aff(space, expr):
+    from loopy.symbolic import aff_from_expr, aff_to_expr
+    return aff_to_expr(aff_from_expr(space, expr))
 
-    - Turn all wildcards in *uni_template* into matching-relevant (but not
-      independent, in the sense of *independent_inames*) variables.
 
-    - Unify the CSEs with the unification template, detecting mappings
-      of template variables to variables used in the CSE.
 
-    - Find the (union) footprint of the CSEs in terms of the
-      *independent_inames*.
 
-    - Augment the kernel domain by that footprint and generate the fetch
-      instruction.
+def precompute(kernel, subst_name, dtype, sweep_inames=[],
+        new_arg_names=None, arg_name_to_tag={}, default_tag="l.auto"):
 
-    - Replace the CSEs according to the mapping detected in unification.
-    """
+    subst = kernel.substitutions[subst_name]
+    arg_names = subst.arguments
+    subst_expr = subst.expression
 
-    newly_created_var_names = set()
+    # {{{ gather up invocations
 
-    # {{{ replace any wildcards in uni_template with new variables
+    invocation_descriptors = []
+    invocation_arg_deps = set()
 
-    if isinstance(uni_template, str):
-        from pymbolic import parse
-        uni_template = parse(uni_template)
+    def gather_substs(expr, name, args, rec):
+        arg_deps = get_dependencies(args)
+        if not arg_deps <= kernel.all_inames():
+            raise RuntimeError("CSE arguments in '%s' do not consist "
+                    "exclusively of inames" % expr)
 
-    def get_unique_var_name():
-        if cse_tag is None:
-            based_on = "cse_wc"
-        else:
-            based_on = cse_tag+"_wc"
+        invocation_arg_deps.update(arg_deps)
 
-        result = kernel.make_unique_var_name(
-                based_on=based_on, extra_used_vars=newly_created_var_names)
-        newly_created_var_names.add(result)
-        return result
+        invocation_descriptors.append(
+                InvocationDescriptor(expr=expr, args=args))
+        return expr
 
-    if uni_template is not None:
-        from loopy.symbolic import WildcardToUniqueVariableMapper
-        wc_map = WildcardToUniqueVariableMapper(get_unique_var_name)
-        uni_template = wc_map(uni_template)
+    from loopy.symbolic import SubstitutionCallbackMapper
+    scm = SubstitutionCallbackMapper([subst_name], gather_substs)
+    for insn in kernel.instructions:
+        scm(insn.expression)
+    for s in kernel.substitutions.itervalues():
+        if s is not subst:
+            scm(s.expression)
+
+    allowable_sweep_inames = invocation_arg_deps | set(arg_names)
+    if not set(sweep_inames) <= allowable_sweep_inames:
+        raise RuntimeError("independent iname(s) '%s' do not occur as arg names "
+                "of subsitution rule or in arguments of invocation" % (",".join(
+                    set(sweep_inames)-allowable_sweep_inames)))
 
     # }}}
 
     # {{{ process ind_iname_to_tag argument
 
-    ind_iname_to_tag = ind_iname_to_tag.copy()
+    arg_name_to_tag = arg_name_to_tag.copy()
 
     from loopy.kernel import parse_tag
     default_tag = parse_tag(default_tag)
-    for iname in independent_inames:
-        ind_iname_to_tag.setdefault(iname, default_tag)
+    for iname in arg_names:
+        arg_name_to_tag.setdefault(iname, default_tag)
 
-    if not set(ind_iname_to_tag.iterkeys()) <= set(independent_inames):
-        raise RuntimeError("tags for non-new inames may not be passed")
+    if not set(arg_name_to_tag.iterkeys()) <= set(arg_names):
+        raise RuntimeError("tags for non-argument names may not be passed")
 
     # here, all information is consolidated into ind_iname_to_tag
 
     # }}}
 
-    # {{{ gather cse descriptors
-
-    cse_descriptors = []
-
-    def gather_cses(cse, rec):
-        if cse.prefix != cse_tag:
-            rec(cse.child)
-            return
-
-        cse_descriptors.append(
-                CSEDescriptor(insn=insn, cse=cse))
-        # can't nest, don't recurse
-
-    from loopy.symbolic import CSECallbackMapper
-    cse_cb_mapper = CSECallbackMapper(gather_cses)
-
-    for insn in kernel.instructions:
-        cse_cb_mapper(insn.expression)
-
-    # }}}
-
-    # {{{ find/pick a unification template
-
-    if not cse_descriptors:
-        raise RuntimeError("no CSEs tagged '%s' found" % cse_tag)
-
-    if uni_template is None:
-        for csed in cse_descriptors:
-            if set(independent_inames) <= get_dependencies(csed.cse.child):
-                # pick the first cse that has the required inames as the unification template
-                uni_template = csed.cse.child
-                break
-
-        if uni_template is None:
-            raise RuntimeError("could not find a suitable unification template that depends on "
-                    "inames '%s'" % ",".join(independent_inames))
-
-    # }}}
-
-    # {{{ make sure that independent inames and kernel inames do not overlap
-
-    # (and substitute in uni_template if any variable name changes are necessary)
-
-    if set(independent_inames) & kernel.all_inames():
-        old_to_new = {}
-
-        new_independent_inames = []
-        new_ind_iname_to_tag = {}
-        for i, iname in enumerate(independent_inames):
-            if iname in kernel.all_inames():
-                based_on = iname
-                if new_inames is not None and i < len(new_inames):
-                    based_on = new_inames[i]
-                elif cse_tag is not None:
-                    based_on = "%s_%s" % (iname, cse_tag)
-
-                new_iname = kernel.make_unique_var_name(
-                        based_on=based_on, extra_used_vars=newly_created_var_names)
-                old_to_new[iname] = var(new_iname)
-                newly_created_var_names.add(new_iname)
-                new_independent_inames.append(new_iname)
-                new_ind_iname_to_tag[new_iname] = ind_iname_to_tag[iname]
-            else:
-                new_independent_inames.append(iname)
-                new_ind_iname_to_tag[iname] = ind_iname_to_tag[iname]
-
-        independent_inames = new_independent_inames
-        ind_iname_to_tag = new_ind_iname_to_tag
-        uni_template = (
-                SubstitutionMapper(make_subst_func(old_to_new))
-                (uni_template))
-
-    # }}}
-
-    if not set(independent_inames) <= get_dependencies(uni_template):
-        raise RuntimeError("independent iname(s) '%s' do not occur in unification "
-                "template" % (",".join(
-                    set(independent_inames)-get_dependencies(uni_template))))
+    newly_created_var_names = set()
 
-    # {{{ deal with iname deps of uni_template that are not independent_inames
+    # {{{ make sure that new
 
-    # (We call these 'matching_vars', because they have to match exactly in
-    # every CSE. As above, they might need to be renamed to make them unique
-    # within the kernel.)
+    # (and substitute in subst_expressions if any variable name changes are necessary)
 
-    matching_vars = []
     old_to_new = {}
 
-    for iname in (get_dependencies(uni_template)
-            - set(independent_inames)
-            - kernel.non_iname_variable_names()):
-        if iname in kernel.all_inames():
-            # need to rename to be unique
-            new_iname = kernel.make_unique_var_name(
-                    based_on=iname, extra_used_vars=newly_created_var_names)
-            old_to_new[iname] = var(new_iname)
-            newly_created_var_names.add(new_iname)
-            matching_vars.append(new_iname)
+    unique_new_arg_names = []
+    new_arg_name_to_tag = {}
+    for i, name in enumerate(arg_names):
+        new_name = None
+
+        if new_arg_names is not None and i < len(new_arg_names):
+            new_name = new_arg_names[i]
+            if new_name in kernel.all_variable_names():
+                raise RuntimeError("new name '%s' already exists" % new_name)
+
+        if name in kernel.all_variable_names():
+            based_on = "%s_%s" % (name, subst_name)
+            new_name = kernel.make_unique_var_name(
+                    based_on=based_on, extra_used_vars=newly_created_var_names)
+
+        if new_name is not None:
+            old_to_new[name] = var(new_name)
+            newly_created_var_names.add(new_name)
+            unique_new_arg_names.append(new_name)
+            new_arg_name_to_tag[new_name] = arg_name_to_tag[name]
         else:
-            matching_vars.append(iname)
+            unique_new_arg_names.append(name)
+            new_arg_name_to_tag[name] = arg_name_to_tag[name]
 
-    if old_to_new:
-        uni_template = (
-                SubstitutionMapper(make_subst_func(old_to_new))
-                (uni_template))
+    arg_name_to_tag = new_arg_name_to_tag
+    subst_expr = (
+            SubstitutionMapper(make_subst_func(old_to_new))
+            (subst_expr))
 
     # }}}
 
@@ -445,61 +270,27 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
 
     # (If there are independent inames, this adds extra dimensions to the domain.)
 
-    footprint, matching_var_values = process_cses(kernel, uni_template,
-            independent_inames, matching_vars,
-            cse_descriptors)
-
-    if isinstance(footprint, isl.Set):
-        footprint = footprint.coalesce()
-        footprint_bsets = footprint.get_basic_sets()
-        if len(footprint_bsets) > 1:
-            raise NotImplementedError("CSE '%s' yielded a non-convex footprint"
-                    % cse_tag)
+    new_domain, target_var_base_indices, target_var_shape = \
+            get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
+                    sweep_inames, invocation_descriptors)
 
-        footprint, = footprint_bsets
+    new_domain = new_domain.coalesce()
+    if isinstance(new_domain, isl.Set):
+        dom_bsets = new_domain.get_basic_sets()
+        if len(dom_bsets) > 1:
+            raise NotImplementedError("Substitution '%s' yielded a non-convex footprint"
+                    % subst_name)
 
-    ndim = kernel.space.dim(dim_type.set)
-    footprint = footprint.insert_dims(dim_type.set, 0, ndim)
-    for i in range(ndim):
-        footprint = footprint.set_dim_name(dim_type.set, i,
-                kernel.space.get_dim_name(dim_type.set, i))
-
-    from islpy import align_spaces
-    new_domain = align_spaces(kernel.domain, footprint).intersect(footprint)
-
-    # set matching vars equal to their unified value, eliminate them
-    from loopy.symbolic import aff_from_expr
-
-    assert set(matching_var_values) == set(matching_vars)
-
-    for var_name, value in matching_var_values.iteritems():
-        cns = isl.Constraint.equality_from_aff(
-                aff_from_expr(new_domain.get_space(), var(var_name) - value))
-        new_domain = new_domain.add_constraint(cns)
-
-    new_domain = (new_domain
-            .eliminate(dim_type.set,
-                new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars))
-            .remove_dims(dim_type.set,
-                new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars)))
-    new_domain = new_domain.remove_redundancies()
+        new_domain, = dom_bsets
 
     # }}}
 
     # {{{ set up temp variable
 
-    var_base = cse_tag
-    if var_base is None:
-        var_base = "cse"
-    target_var_name = kernel.make_unique_var_name(var_base)
-
-    from loopy.kernel import (TemporaryVariable,
-            find_var_base_indices_and_shape_from_inames)
+    target_var_name = kernel.make_unique_var_name(based_on=subst_name,
+            extra_used_vars=newly_created_var_names)
 
-    target_var_base_indices, target_var_shape = \
-            find_var_base_indices_and_shape_from_inames(
-                    new_domain, independent_inames,
-                    kernel.cache_manager)
+    from loopy.kernel import TemporaryVariable
 
     new_temporary_variables = kernel.temporary_variables.copy()
     new_temporary_variables[target_var_name] = TemporaryVariable(
@@ -511,56 +302,63 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
 
     # }}}
 
-    mv_subst = SubstitutionMapper(make_subst_func(
-        dict((mv, matching_var_values[mv]) for mv in matching_vars)))
+    # {{{ set up compute insn
 
-    compute_insn = make_compute_insn(
-            kernel, cse_tag, mv_subst(uni_template),
-            target_var_name, target_var_base_indices,
-            independent_inames, ind_iname_to_tag,
-            # pick one insn at random for dep check
-            cse_descriptors[0].insn)
+    assignee = var(target_var_name)
+
+    if unique_new_arg_names:
+        assignee = assignee[tuple(var(iname) for iname in unique_new_arg_names)]
+
+    from loopy.kernel import Instruction
+    compute_insn = Instruction(
+            id=kernel.make_unique_instruction_id(based_on=subst_name),
+            assignee=assignee,
+            expression=subst_expr)
 
-    # {{{ substitute variable references into instructions
+    # }}}
+
+    # {{{ substitute rule into instructions
 
-    def subst_cses(cse, rec):
+    def do_substs(expr, name, args, rec):
         found = False
-        for csed in cse_descriptors:
-            if cse is csed.cse:
+        for invdesc in invocation_descriptors:
+            if expr is invdesc.expr:
                 found = True
                 break
 
         if not found:
-            from pymbolic.primitives import CommonSubexpression
-            return CommonSubexpression(
-                    rec(cse.child), cse.prefix)
+            return
 
-        indices = [csed.unif_var_dict[iname]-bi
-                for iname, bi in zip(independent_inames, target_var_base_indices)]
+        args = [simplify_via_aff(new_domain.get_space(), arg-bi)
+                for arg, bi in zip(args, target_var_base_indices)]
 
         new_outer_expr = var(target_var_name)
-        if indices:
-            new_outer_expr = new_outer_expr[tuple(indices)]
+        if args:
+            new_outer_expr = new_outer_expr[tuple(args)]
 
         return new_outer_expr
         # can't nest, don't recurse
 
-    cse_cb_mapper = CSECallbackMapper(subst_cses)
-
     new_insns = [compute_insn]
 
+    sub_map = SubstitutionCallbackMapper([subst_name], do_substs)
     for insn in kernel.instructions:
-        new_expr = cse_cb_mapper(insn.expression)
-        new_insns.append(insn.copy(expression=new_expr))
+        new_insns.append(insn.copy(expression=sub_map(insn.expression)))
 
     # }}}
 
     new_iname_to_tag = kernel.iname_to_tag.copy()
-    new_iname_to_tag.update(ind_iname_to_tag)
+    new_iname_to_tag.update(arg_name_to_tag)
+
+    new_substs = dict(
+            (s.name, s.copy(expression=sub_map(subst.expression)))
+            for s in kernel.substitutions.itervalues())
+    del new_substs[subst_name]
 
     return kernel.copy(
             domain=new_domain,
             instructions=new_insns,
+            substitutions=new_substs,
             temporary_variables=new_temporary_variables,
             iname_to_tag=new_iname_to_tag)
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index dfdae050229367911eece85980090724bede98ef..e8c80d36a2c24d952a97011570cb3d6ee847f4ec 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -217,6 +217,26 @@ class TemporaryVariable(Record):
 
 # }}}
 
+# {{{ subsitution rule
+
+class SubstitutionRule(Record):
+    """
+    :ivar name:
+    :ivar arguments:
+    :ivar expression:
+    """
+
+    def __init__(self, name, arguments, expression):
+        Record.__init__(self,
+                name=name, arguments=arguments, expression=expression)
+
+    def __str__(self):
+        return "%s(%s) := %s" % (
+                self.name, ", ".join(self.arguments), self.expression)
+
+
+# }}}
+
 # {{{ instruction
 
 class Instruction(Record):
@@ -451,6 +471,8 @@ class LoopKernel(Record):
     :ivar local_sizes: A dictionary from integers to integers, mapping
         workgroup axes to ther sizes, e.g. *{0: 16}* forces axis 0 to be
         length 16.
+    :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
+        objects
 
     :ivar cache_manager:
 
@@ -458,8 +480,6 @@ class LoopKernel(Record):
     finished:
 
     :ivar iname_to_tag_requests:
-    :ivar cses: a mapping from CSE names to tuples (arg_names, expr).
-    :ivar substitutions: a mapping from CSE names to tuples (arg_names, expr).
     """
 
     def __init__(self, device, domain, instructions, args=None, schedule=None,
@@ -468,7 +488,7 @@ class LoopKernel(Record):
             iname_slab_increments={},
             temporary_variables={},
             local_sizes={},
-            iname_to_tag={}, iname_to_tag_requests=None, cses={}, substitutions={},
+            iname_to_tag={}, iname_to_tag_requests=None, substitutions={},
             cache_manager=None):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
@@ -489,7 +509,7 @@ class LoopKernel(Record):
 
         INAME_ENTRY_RE = re.compile(
                 r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$")
-        LABEL_DEP_RE = re.compile(
+        INSN_RE = re.compile(
                 r"^\s*(?:(?P<label>\w+):)?"
                 "\s*(?:\["
                     "(?P<iname_deps_and_tags>[\s\w,:.]*)"
@@ -499,6 +519,9 @@ class LoopKernel(Record):
                 "\s*(?P<lhs>.+)\s*=\s*(?P<rhs>.+?)"
                 "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$"
                 )
+        SUBST_RE = re.compile(
+                r"^\s*(?P<lhs>.+)\s*:=\s*(?P<rhs>.+?)\s*$"
+                )
 
         def parse_iname_and_tag_list(s):
             dup_entries = [
@@ -540,21 +563,28 @@ class LoopKernel(Record):
                         "instance or a parseable string. got '%s' instead."
                         % type(insn))
 
-            label_dep_match = LABEL_DEP_RE.match(insn)
-            if label_dep_match is None:
+            insn_match = INSN_RE.match(insn)
+            subst_match = SUBST_RE.match(insn)
+            if insn_match is not None and subst_match is not None:
                 raise RuntimeError("insn parse error")
 
-            groups = label_dep_match.groupdict()
-            if groups["label"] is not None:
-                label = groups["label"]
+            if insn_match is not None:
+                groups = insn_match.groupdict()
+            elif subst_match is not None:
+                groups = subst_match.groupdict()
             else:
-                label = "insn"
+                raise RuntimeError("insn parse error")
 
             lhs = parse(groups["lhs"])
             from loopy.symbolic import FunctionToPrimitiveMapper
             rhs = FunctionToPrimitiveMapper()(parse(groups["rhs"]))
 
-            if label.lower() not in ["cse", "subst"]:
+            if insn_match is not None:
+                if groups["label"] is not None:
+                    label = groups["label"]
+                else:
+                    label = "insn"
+
                 if groups["insn_deps"] is not None:
                     insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(","))
                 else:
@@ -587,43 +617,35 @@ class LoopKernel(Record):
                             assignee=lhs, expression=rhs,
                             temp_var_type=temp_var_type,
                             duplicate_inames_and_tags=duplicate_inames_and_tags))
-            else:
-                if groups["iname_deps_and_tags"] is not None:
-                    raise RuntimeError("CSEs/substitutions cannot declare iname dependencies")
-                if groups["insn_deps"] is not None:
-                    raise RuntimeError("CSEs/substitutions cannot declare instruction dependencies")
-                if groups["temp_var_type"] is not None:
-                    raise RuntimeError("CSEs/substitutions cannot declare temporary storage")
-
+            elif subst_match is not None:
                 from pymbolic.primitives import Variable, Call
 
                 if isinstance(lhs, Variable):
-                    cse_name = lhs.name
+                    subst_name = lhs.name
                     arg_names = []
                 elif isinstance(lhs, Call):
                     if not isinstance(lhs.function, Variable):
-                        raise RuntimeError("Invalid CSE left-hand side")
-                    cse_name = lhs.function.name
+                        raise RuntimeError("Invalid substitution rule left-hand side")
+                    subst_name = lhs.function.name
                     arg_names = []
 
                     for arg in lhs.parameters:
                         if not isinstance(arg, Variable):
-                            raise RuntimeError("Invalid CSE left-hand side")
+                            raise RuntimeError("Invalid substitution rule left-hand side")
                         arg_names.append(arg.name)
                 else:
-                    raise RuntimeError("CSEs cannot declare temporary storage")
+                    raise RuntimeError("Invalid substitution rule left-hand side")
 
-                if label.lower() == "cse":
-                    cses[cse_name] = (arg_names, rhs)
-                else:
-                    substitutions[cse_name] = (arg_names, rhs)
+                substitutions[subst_name] = SubstitutionRule(
+                        name=subst_name,
+                        arguments=arg_names,
+                        expression=rhs)
 
         # }}}
 
         insns = []
 
-        cses = cses.copy()
-        substituions = substitutions.copy()
+        substitutions = substitutions.copy()
 
         for insn in instructions:
             # must construct list one-by-one to facilitate unique id generation
@@ -656,7 +678,7 @@ class LoopKernel(Record):
                 local_sizes=local_sizes,
                 iname_to_tag=iname_to_tag,
                 iname_to_tag_requests=iname_to_tag_requests,
-                cses=cses, substitutions=substitutions,
+                substitutions=substitutions,
                 cache_manager=cache_manager)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
@@ -670,6 +692,12 @@ class LoopKernel(Record):
             if id_str not in used_ids:
                 return id_str
 
+    def make_unique_subst_rule_name(self, based_on="subst"):
+        from loopy.tools import generate_unique_possibilities
+        for id_str in generate_unique_possibilities(based_on):
+            if id_str not in self.substitutions:
+                return id_str
+
     @memoize_method
     def all_inames(self):
         from islpy import dim_type
@@ -826,12 +854,16 @@ class LoopKernel(Record):
             insn.get_assignee_var_name()
             for insn in self.instructions)
 
-    def make_unique_var_name(self, based_on="var", extra_used_vars=set()):
-        used_vars = (
+    @memoize_method
+    def all_variable_names(self):
+        return (
                 set(self.temporary_variables.iterkeys())
+                | set(self.substitutions.iterkeys())
                 | set(arg.name for arg in self.args)
-                | set(self.iname_to_dim.keys())
-                | extra_used_vars)
+                | set(self.iname_to_dim.keys()))
+
+    def make_unique_var_name(self, based_on="var", extra_used_vars=set()):
+        used_vars = self.all_variable_names() | extra_used_vars
 
         from loopy.tools import generate_unique_possibilities
         for var_name in generate_unique_possibilities(based_on):
@@ -1025,17 +1057,44 @@ class LoopKernel(Record):
 
         return result
 
+    def map_expressions(self, func, exclude_instructions=False):
+        if exclude_instructions:
+            new_insns = self.instructions
+        else:
+            new_insns = [insn.copy(expression=func(insn.expression))
+                    for insn in self.instructions]
+
+        return self.copy(
+                instructions=new_insns,
+                substitutions=dict(
+                    (subst.name, subst.copy(expression=func(subst.expression)))
+                    for subst in self.substitutions.itervalues()))
+
     def __str__(self):
         lines = []
 
+        sep = 75*"-"
+        lines.append(sep)
+        lines.append("INAME-TO-TAG MAP:")
         for iname in sorted(self.all_inames()):
             lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname)))
-        lines.append("")
+
+        lines.append(sep)
+        lines.append("DOMAIN:")
         lines.append(str(self.domain))
-        lines.append("")
+
+        if self.substitutions:
+            lines.append(sep)
+            lines.append("SUBSTIUTION RULES:")
+            for rule in self.substitutions.itervalues():
+                lines.append(str(rule))
+
+        lines.append(sep)
+        lines.append("INSTRUCTIONS:")
         for insn in self.instructions:
             lines.append(str(insn))
             lines.append("    [%s]" % ",".join(sorted(self.insn_inames(insn))))
+        lines.append(sep)
 
         return "\n".join(lines)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 3c84d835c832b3ce67fdcec01bd1d068429ada64..8419b9f217f2d159be69060aa76787104745cd00 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -642,6 +642,9 @@ def adjust_local_temp_var_storage(kernel):
 
 
 def preprocess_kernel(kernel):
+    from loopy.subst import apply_subst
+    kernel = apply_subst(kernel)
+
     kernel = mark_local_temporaries(kernel)
     kernel = duplicate_reduction_inames(kernel)
     kernel = realize_reduction(kernel)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 0b1fb08fbe8d82a11f3f03d703e8a76eb03206b3..c1a1fdea0bd84db05bb08168c78c73f28809d405 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -604,18 +604,6 @@ def insert_barriers(kernel, schedule, level=0):
 # {{{ main scheduling entrypoint
 
 def generate_loop_schedules(kernel, loop_priority=[], debug=None):
-    # {{{ check that all CSEs have been realized
-
-    from loopy.symbolic import CSECallbackMapper
-
-    def map_cse(expr, rec):
-        raise RuntimeError("all CSEs must be realized before scheduling")
-
-    for insn in kernel.instructions:
-        CSECallbackMapper(map_cse)(insn.expression)
-
-    # }}}
-
     from loopy.preprocess import preprocess_kernel
     kernel = preprocess_kernel(kernel)
 
diff --git a/loopy/subst.py b/loopy/subst.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b56a9a3acbce2578f459d84d85e2de530bdd396
--- /dev/null
+++ b/loopy/subst.py
@@ -0,0 +1,203 @@
+from __future__ import division
+
+from loopy.symbolic import get_dependencies, SubstitutionMapper
+from pymbolic.mapper.substitutor import make_subst_func
+
+from pytools import Record
+from pymbolic import var
+
+
+
+
+
+class ExprDescriptor(Record):
+    __slots__ = ["insn", "expr", "unif_var_dict"]
+
+
+
+
+
+def extract_subst(kernel, subst_name, template, parameters):
+    """
+    :arg template: An expression against which all targeted subexpressions
+        must unify
+
+        If None, a unification template will be chosen from among the targeted
+        CSEs. That CSE is chosen to depend on all the variables in
+        *parameters*.  It is an error if no such expression can be
+        found.
+
+        May contain '*' wildcards that will have to match exactly across all
+        unifications.
+    """
+
+    newly_created_var_names = set()
+
+    if isinstance(template, str):
+        from pymbolic import parse
+        template = parse(template)
+
+    # {{{ replace any wildcards in template with new variables
+
+    def get_unique_var_name():
+        based_on = subst_name+"_wc"
+
+        result = kernel.make_unique_var_name(
+                based_on=based_on, extra_used_vars=newly_created_var_names)
+        newly_created_var_names.add(result)
+        return result
+
+    from loopy.symbolic import WildcardToUniqueVariableMapper
+    wc_map = WildcardToUniqueVariableMapper(get_unique_var_name)
+    template = wc_map(template)
+
+    # }}}
+
+    # {{{ deal with iname deps of template that are not independent_inames
+
+    # (We call these 'matching_vars', because they have to match exactly in
+    # every CSE. As above, they might need to be renamed to make them unique
+    # within the kernel.)
+
+    matching_vars = []
+    old_to_new = {}
+
+    for iname in (get_dependencies(template)
+            - set(parameters)
+            - kernel.non_iname_variable_names()):
+        if iname in kernel.all_inames():
+            # need to rename to be unique
+            new_iname = kernel.make_unique_var_name(
+                    based_on=iname, extra_used_vars=newly_created_var_names)
+            old_to_new[iname] = var(new_iname)
+            newly_created_var_names.add(new_iname)
+            matching_vars.append(new_iname)
+        else:
+            matching_vars.append(iname)
+
+    if old_to_new:
+        template = (
+                SubstitutionMapper(make_subst_func(old_to_new))
+                (template))
+
+    # }}}
+
+    # {{{ gather up expressions
+
+    expr_descriptors = []
+
+    from loopy.symbolic import UnidirectionalUnifier
+    unif = UnidirectionalUnifier(
+            lhs_mapping_candidates=set(parameters) | set(matching_vars))
+
+    def gather_exprs(expr, mapper):
+        print expr
+        urecs = unif(template, expr)
+
+        if urecs:
+            if len(urecs) > 1:
+                raise RuntimeError("ambiguous unification of '%s' with template '%s'" 
+                        % (expr, template))
+
+            urec, = urecs
+
+            expr_descriptors.append(
+                    ExprDescriptor(
+                        insn=insn,
+                        expr=expr,
+                        unif_var_dict = dict((lhs.name, rhs)
+                            for lhs, rhs in urec.equations)))
+        else:
+            mapper.fallback_mapper(expr)
+            # can't nest, don't recurse
+
+    from loopy.symbolic import (
+            CallbackMapper, WalkMapper, IdentityMapper)
+    dfmapper = CallbackMapper(gather_exprs, WalkMapper())
+
+    for insn in kernel.instructions:
+        dfmapper(insn.expression)
+
+    for sr in kernel.substitutions.itervalues():
+        dfmapper(sr.expression)
+
+    # }}}
+
+    if not expr_descriptors:
+        raise RuntimeError("no expressions matching '%s'" % template)
+
+    # {{{ substitute rule into instructions
+
+    def replace_exprs(expr, mapper):
+        found = False
+        for exprd in expr_descriptors:
+            if expr is exprd.expr:
+                found = True
+                break
+
+        if not found:
+            return mapper.fallback_mapper(expr)
+
+        args = [exprd.unif_var_dict[arg_name]
+                for arg_name in parameters]
+
+        result = var(subst_name)
+        if args:
+            result = result(*args)
+
+        return result
+        # can't nest, don't recurse
+
+    cbmapper = CallbackMapper(replace_exprs, IdentityMapper())
+
+    new_insns = []
+
+    for insn in kernel.instructions:
+        new_expr = cbmapper(insn.expression)
+        new_insns.append(insn.copy(expression=new_expr))
+
+    from loopy.kernel import SubstitutionRule
+    new_substs = {
+            subst_name: SubstitutionRule(
+                name=subst_name,
+                arguments=parameters,
+                expression=template,
+                )}
+
+    for subst in kernel.substitutions.itervalues():
+        new_substs[subst.name] = subst.copy(
+                expression=cbmapper(insn.expression))
+
+    # }}}
+
+    return kernel.copy(
+            instructions=new_insns,
+            substitutions=new_substs)
+
+
+
+
+def apply_subst(kernel, subst_name=None):
+    if subst_name is None:
+        rules = kernel.substitutions
+    else:
+        rule = kernel.substitutions[subst_name]
+        rules = {rule.name: rule}
+
+    from loopy.symbolic import ParametrizedSubstitutor
+    submap = ParametrizedSubstitutor(rules)
+
+    if subst_name:
+        new_substs = kernel.substitutions.copy()
+        del new_substs[subst_name]
+    else:
+        new_substs = {}
+
+    return (kernel
+            .copy(substitutions=new_substs)
+            .map_expressions(submap))
+
+
+
+
+# vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 63a564eaf43f01648ae4d21ab30ee54cc2cb2a68..d980de74a254ad8f3f3f05ad2569bbc61603c879 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -8,7 +8,10 @@ from pymbolic.primitives import AlgebraicLeaf
 from pymbolic.mapper import (
         CombineMapper as CombineMapperBase,
         IdentityMapper as IdentityMapperBase,
-        RecursiveMapper)
+        RecursiveMapper,
+        WalkMapper as WalkMapperBase,
+        CallbackMapper as CallbackMapperBase,
+        )
 from pymbolic.mapper.c_code import CCodeMapper
 from pymbolic.mapper.stringifier import PREC_NONE
 from pymbolic.mapper.substitutor import \
@@ -80,6 +83,13 @@ class IdentityMapperMixin(object):
 class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
 
+class WalkMapper(WalkMapperBase):
+    def map_reduction(self, expr):
+        self.rec(expr.expression)
+
+class CallbackMapper(CallbackMapperBase, IdentityMapper):
+    map_reduction = CallbackMapperBase.map_constant
+
 class CombineMapper(CombineMapperBase):
     def map_reduction(self, expr):
         return self.rec(expr.expr)
@@ -510,20 +520,6 @@ def constraint_to_expr(cns, except_name=None):
 
 # }}}
 
-# {{{ CSE callback mapper
-
-class CSECallbackMapper(IdentityMapper):
-    def __init__(self, callback):
-        self.callback = callback
-
-    def map_common_subexpression(self, expr):
-        result = self.callback(expr, self.rec)
-        if result is None:
-            return IdentityMapper.map_common_subexpression(self, expr)
-        return result
-
-# }}}
-
 # {{{ Reduction callback mapper
 
 class ReductionCallbackMapper(IdentityMapper):
@@ -579,82 +575,71 @@ class IndexVariableFinder(CombineMapper):
 
 # }}}
 
-# {{{ variable-fetch CSE mapper
-
-class VariableFetchCSEMapper(IdentityMapper):
-    """Turns fetches of a given variable names into CSEs."""
-    def __init__(self, var_name, cse_tag_getter):
-        self.var_name = var_name
-        self.cse_tag_getter = cse_tag_getter
-
-    def map_variable(self, expr):
-        from pymbolic.primitives import CommonSubexpression
-        if expr.name == self.var_name:
-            return CommonSubexpression(expr, self.cse_tag_getter())
-        else:
-            return IdentityMapper.map_variable(self, expr)
-
-    def map_subscript(self, expr):
-        from pymbolic.primitives import CommonSubexpression, Variable, Subscript
-        if (isinstance(expr.aggregate, Variable)
-                and expr.aggregate.name == self.var_name):
-            return CommonSubexpression(
-                    Subscript(expr.aggregate, self.rec(expr.index)), self.cse_tag_getter())
-        else:
-            return IdentityMapper.map_subscript(self, expr)
-
-# }}}
-
 # {{{ parametrized substitutor
 
 class ParametrizedSubstitutor(IdentityMapper):
-    def __init__(self, cses, wrap_cse):
-        """
-        :arg cses: a mapping from CSE names to tuples (arg_names, expr).
-        :arg wrap_cse: flag: wrap substituted expressions in CSEs
-        """
-        self.cses = cses
-        self.wrap_cse = wrap_cse
+    def __init__(self, rules):
+        self.rules = rules
 
     def map_variable(self, expr):
-        if expr.name not in self.cses:
+        if expr.name not in self.rules:
             return IdentityMapper.map_variable(self, expr)
 
-        arg_names, cse_expr = self.cse[expr.name]
-        if len(arg_names) != 0:
+        rule = self.rules[expr.name]
+        if rule.arguments:
             raise RuntimeError("CSE '%s' must be invoked with %d arguments"
-                    % (expr.name, len(arg_names)))
+                    % (expr.name, len(rule.arguments)))
 
-        cse_expr = self.rec(cse_expr)
-
-        if self.wrap_cse:
-            from pymbolic.primitives import CommonSubexpression
-            return CommonSubexpression(cse_expr, expr.name)
-        else:
-            return cse_expr
+        return self.rec(rule.expression)
 
     def map_call(self, expr):
-        from pymbolic.primitives import Variable, CommonSubexpression
+        from pymbolic.primitives import Variable
         if (not isinstance(expr.function, Variable)
-                or expr.function.name not in self.cses):
+                or expr.function.name not in self.rules):
             return IdentityMapper.map_variable(self, expr)
 
         cse_name = expr.function.name
-        arg_names, cse_expr = self.cses[cse_name]
-        if len(arg_names) != len(expr.parameters):
-            raise RuntimeError("CSE '%s' invoked with %d arguments (needs %d)"
-                    % (cse_name, len(arg_names), len(expr.parameters)))
+        rule = self.rules[cse_name]
+        if len(rule.arguments) != len(expr.parameters):
+            raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)"
+                    % (cse_name, len(expr.parameters), len(rule.arguments), ))
 
         from pymbolic.mapper.substitutor import make_subst_func
         subst_map = SubstitutionMapper(make_subst_func(
-            dict(zip(arg_names, expr.parameters))))
+            dict(zip(rule.arguments, expr.parameters))))
+
+        return self.rec(subst_map(rule.expression))
+
+# }}}
+
+# {{{ substitution callback mapper
+
+class SubstitutionCallbackMapper(IdentityMapper):
+    def __init__(self, names, func):
+        self.names = names
+        self.func = func
+
+    def map_variable(self, expr):
+        if expr.name not in self.names:
+            return IdentityMapper.map_variable(self, expr)
 
-        cse_expr = self.rec(subst_map(cse_expr))
+        result = self.func(expr, expr.name, (), self.rec)
+        if result is None:
+            return IdentityMapper.map_variable(self, expr)
+        else:
+            return result
+
+    def map_call(self, expr):
+        from pymbolic.primitives import Variable
+        if (not isinstance(expr.function, Variable)
+                or expr.function.name not in self.names):
+            return IdentityMapper.map_variable(self, expr)
 
-        if self.wrap_cse:
-            return CommonSubexpression(cse_expr, cse_name)
+        result = self.func(expr, expr.function.name, expr.parameters, self.rec)
+        if result is None:
+            return IdentityMapper.map_call(self, expr)
         else:
-            return cse_expr
+            return result
 
 # }}}
 
diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py
index 4a580725e3bceae5158d058a21ea3ee0e32d5335..7e57d1508a68f8983c4e2982ec02e5062517a0cb 100644
--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
@@ -34,7 +34,7 @@ def test_laplacian_stiffness(ctx_factory):
             "[Nc] -> {[K,i,j,q]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d}" 
             % dict(Nb=Nb, Nq=Nq),
             [
-                "SUBST: dPsi(a,dxi) = jacInv[K,q,0,dxi] * DPsi[a,q,0] "
+                "CSE: dPsi(a,dxi) = jacInv[K,q,0,dxi] * DPsi[a,q,0] "
                     "+ jacInv[K,q,1,dxi] * DPsi[a,q,1]",
                 "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                     "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))"
@@ -52,20 +52,43 @@ def test_laplacian_stiffness(ctx_factory):
 
     seq_knl = knl
 
-    knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
-    knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
-    knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-    knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"],
-            uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]")
-
-    kernel_gen = lp.generate_loop_schedules(knl,
-            loop_priority=["K", "i", "j"])
-    kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
-
-    lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
-            op_count=0, op_label="GFlops",
-            parameters={"Nc": Nc}, print_seq_code=True,
-            timing_rounds=30)
+    def variant_1(knl):
+        # no ILP across elements
+        knl= lp.remove_cse_by_tag(knl, "dPsi")
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"],
+                uni_template="jacInv[Kii +16*Ko,q,x,y]")
+        return knl
+
+    def variant_2(knl):
+        # with ILP across elements
+        knl= lp.remove_cse_by_tag(knl, "dPsi")
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"],
+                uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]")
+        return knl
+
+    def variant_1(knl):
+        # no ILP across elements, precompute dPsiTransf
+        knl= lp.realize_cse(knl, "dPsi", ["a", "dxi", ""])
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"],
+                uni_template="jacInv[Kii +16*Ko,q,x,y]")
+        return knl
+
+    for variant in [variant_1, variant_2]:
+        kernel_gen = lp.generate_loop_schedules(variant(knl),
+                loop_priority=["K", "i", "j"])
+        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
+
+        lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
+                op_count=0, op_label="GFlops",
+                parameters={"Nc": Nc}, print_seq_code=True,
+                timing_rounds=30)
 
 
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index f5d9c2f00e42a78faee6396bde0101d7bb1c1c35..0c09032b803e4765a055f7cc775b08b1e748a3ea 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -348,9 +348,9 @@ def test_rank_one(ctx_factory):
 
     def variant_4(knl):
         knl = lp.split_dimension(knl, "i", 256,
-                outer_tag="g.0")#, slabs=(0, 1))
+                outer_tag="g.0", slabs=(0, 1))
         knl = lp.split_dimension(knl, "j", 256,
-                outer_tag="g.1")#, slabs=(0, 1))
+                outer_tag="g.1", slabs=(0, 1))
 
         knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
         knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index cc387b017c579054670e2bf61dc2c10a24a85b79..6bc77e6ad9353207dd3f4cd9da71d3df3936dbae 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -73,7 +73,8 @@ def test_multi_cse(ctx_factory):
             local_sizes={0: 16})
 
     knl = lp.split_dimension(knl, "i", 16, inner_tag="l.0")
-    knl = lp.realize_cse(knl, None, np.float32, ["i_inner"])
+    knl = lp.add_prefetch(knl, "a", [])
+    #knl = lp.realize_cse(knl, None, np.float32, ["i_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen)
@@ -124,23 +125,29 @@ def test_stencil(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j]: 0<= i,j < 32}",
             [
-                "[i] <float32> z[i,j] = -2*cse(a[i,j])"
-                    " + cse(a[i,j-1])"
-                    " + cse(a[i,j+1])"
-                    " + cse(a[i-1,j])"
-                    " + cse(a[i+1,j])"
+                "[i] <float32> z[i,j] = -2*a[i,j]"
+                    " + a[i,j-1]"
+                    " + a[i,j+1]"
+                    " + a[i-1,j]"
+                    " + a[i+1,j]"
                 ],
             [
                 lp.ArrayArg("a", np.float32, shape=(32,32,))
                 ])
 
-    def variant_3(knl):
+
+    def variant_1(knl):
+        knl = lp.add_prefetch(knl, "a", [0, 1])
+        return knl
+
+    def variant_2(knl):
         knl = lp.split_dimension(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
         knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
-        knl = lp.realize_cse(knl, None, np.float32, ["i_inner", "j_inner"])
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
         return knl
 
-    for variant in [variant_3]:
+    #for variant in [variant_1, variant_2]:
+    for variant in [variant_2]:
         kernel_gen = lp.generate_loop_schedules(variant(knl),
                 loop_priority=["i_outer", "i_inner_0", "j_0"])
         kernel_gen = lp.check_kernels(kernel_gen)