diff --git a/MEMO b/MEMO
index 23865403a6a5b91d6b9e85b76f177ecceb38b09b..63851b21f95d44cf5726fa8dbc90e577db0a33b7 100644
--- a/MEMO
+++ b/MEMO
@@ -46,15 +46,13 @@ Things to consider
 To-do
 ^^^^^
 
-- Expose iname-duplicate-and-rename as a primitive.
-
 - Kernel fusion
 
 - ExpandingIdentityMapper
   extract_subst -> needs WalkMapper
-  duplicate_inames
   join_inames
   padding
+  duplicate_inames [DONE]
   split_iname [DONE]
   CSE [DONE]
 
@@ -142,6 +140,8 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- Expose iname-duplicate-and-rename as a primitive.
+
 - make sure simple side effects work
 
 - Loop bounds currently may not depend on parallel dimensions
diff --git a/doc/reference.rst b/doc/reference.rst
index 0131a14942ff53888468dddcb7728105e6c1aa9b..af2ea1bbeb6e7c1a90432753349b46aaa86e3750 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -161,6 +161,8 @@ Wrangling inames
 
 .. autofunction:: tag_inames
 
+.. autofunction:: duplicate_inames
+
 Dealing with Substitution Rules
 -------------------------------
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index e06ffdbe3be8253b704c5fdd218d1442f45d690c..80afc44a7055a5daba827c17ca271c7022c26780 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -81,7 +81,7 @@ __all__ = ["ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "Ima
         "generate_code",
         "CompiledKernel", "auto_test_vs_ref", "check_kernels",
         "make_kernel", 
-        "split_iname", "join_inames", "tag_inames",
+        "split_iname", "join_inames", "tag_inames", "duplicate_inames",
         "split_dimension", "join_dimensions", "tag_dimensions",
         "extract_subst", "expand_subst",
         "precompute", "add_prefetch",
@@ -132,6 +132,9 @@ def split_iname(kernel, split_iname, inner_length,
         outer_tag=None, inner_tag=None,
         slabs=(0, 0), do_tagged_check=True,
         within=None):
+    """
+    :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`.
+    """
 
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
@@ -197,9 +200,6 @@ def split_iname(kernel, split_iname, inner_length,
 
     # {{{ actually modify instructions
 
-    ins = _InameSplitter(kernel, within,
-            split_iname, outer_iname, inner_iname, new_loop_index)
-
     new_insns = []
     for insn in kernel.instructions:
         if split_iname in insn.forced_iname_deps:
@@ -211,8 +211,6 @@ def split_iname(kernel, split_iname, inner_length,
             new_forced_iname_deps = insn.forced_iname_deps
 
         insn = insn.copy(
-                assignee=ins(insn.assignee, insn.id),
-                expression=ins(insn.expression, insn.id),
                 forced_iname_deps=new_forced_iname_deps)
 
         new_insns.append(insn)
@@ -222,19 +220,23 @@ def split_iname(kernel, split_iname, inner_length,
     iname_slab_increments = kernel.iname_slab_increments.copy()
     iname_slab_increments[outer_iname] = slabs
 
-    result = (kernel
+    kernel = (kernel
             .copy(domains=new_domains,
                 iname_slab_increments=iname_slab_increments,
-                substitutions=ins.get_new_substitutions(),
                 instructions=new_insns,
                 applied_iname_rewrites=applied_iname_rewrites,
                 ))
 
+    ins = _InameSplitter(kernel, within,
+            split_iname, outer_iname, inner_iname, new_loop_index)
+
+    kernel = ins.map_kernel(kernel)
+
     if existing_tag is not None:
-        result = tag_inames(result,
+        kernel = tag_inames(kernel,
                 {outer_iname: existing_tag, inner_iname: existing_tag})
 
-    return tag_inames(result, {outer_iname: outer_tag, inner_iname: inner_tag})
+    return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
 
 split_dimension = MovedFunctionDeprecationWrapper(split_iname)
 
@@ -394,6 +396,118 @@ tag_dimensions = MovedFunctionDeprecationWrapper(tag_inames)
 
 # }}}
 
+# {{{ duplicate inames
+
+class _InameDuplicator(ExpandingIdentityMapper):
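+    """Renames the inames being duplicated to their new names wherever they
+    occur, both as plain variable references and in reduction iname lists,
+    but only within the instructions and substitution rules matched by
+    *within*.
+    """
+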
+    def __init__(self, rules, make_unique_var_name,
+            old_to_new, within):
+        ExpandingIdentityMapper.__init__(self,
+                rules, make_unique_var_name)
+
+        self.old_to_new = old_to_new
+        self.old_inames_set = set(old_to_new.iterkeys())
+        self.within = within
+
+    def map_reduction(self, expr, expn_state):
+        if set(expr.inames) & self.old_inames_set and self.within(expn_state.stack):
+            new_inames = tuple(
+                    self.old_to_new.get(iname, iname)
+                    for iname in expr.inames)
+
+            from loopy.symbolic import Reduction
+            return Reduction(expr.operation, new_inames,
+                        self.rec(expr.expr, expn_state))
+        else:
+            return ExpandingIdentityMapper.map_reduction(self, expr, expn_state)
+
+    def map_variable(self, expr, expn_state):
+        new_name = self.old_to_new.get(expr.name)
+
+        if new_name is None or not self.within(expn_state.stack):
+            return ExpandingIdentityMapper.map_variable(self, expr, expn_state)
+        else:
+            from pymbolic import var
+            return var(new_name)
+
+def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
+        tags={}):
+    """
+    :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`.
+    """
+
+    # {{{ normalize arguments, find unique new_inames
+
+    if isinstance(inames, str):
+        inames = inames.split(",")
+    if isinstance(new_inames, str):
+        new_inames = new_inames.split(",")
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    if new_inames is None:
+        new_inames = [None] * len(inames)
+
+    if len(new_inames) != len(inames):
+        raise ValueError("new_inames must have the same number of entries as inames")
+
+    name_gen = knl.get_var_name_generator()
+
+    for i, iname in enumerate(inames):
+        new_iname = new_inames[i]
+
+        if new_iname is None:
+            new_iname = iname
+
+            if suffix is not None:
+                new_iname += suffix
+
+            new_iname = name_gen(new_iname)
+
+        else:
+            if name_gen.is_name_conflicting(new_iname):
+                raise ValueError("new iname '%s' conflicts with existing names"
+                        % new_iname)
+
+            name_gen.add_name(new_iname)
+
+        new_inames[i] = new_iname
+
+    # }}}
+
+    # {{{ duplicate the inames
+
+    for old_iname, new_iname in zip(inames, new_inames):
+        from loopy.kernel import DomainChanger
+        domch = DomainChanger(knl, frozenset([old_iname]))
+
+        from loopy.isl_helpers import duplicate_axes
+        knl = knl.copy(
+                domains=domch.get_domains_with(
+                    duplicate_axes(domch.domain, [old_iname], [new_iname])))
+    # }}}
+
+    # {{{ change the inames in the code
+
+    indup = _InameDuplicator(knl.substitutions, name_gen,
+            old_to_new=dict(zip(inames, new_inames)),
+            within=within)
+
+    knl = indup.map_kernel(knl)
+
+    # }}}
+
+    # {{{ realize tags
+
+    for old_iname, new_iname in zip(inames, new_inames):
+        new_tag = tags.get(old_iname)
+        if new_tag is not None:
+            knl = tag_inames(knl, {new_iname: new_tag})
+
+    # }}}
+
+    return knl
+
+# }}}
+
 # {{{ convenience: add_prefetch
 
 # {{{ process footprint_subscripts
@@ -572,7 +686,6 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
             _process_footprint_subscripts(
                     kernel,  rule_name, sweep_inames,
                     footprint_subscripts, arg)
-
     new_kernel = precompute(kernel, subst_use, sweep_inames,
             new_storage_axis_names=dim_arg_names,
             default_tag=default_tag, dtype=arg.dtype)
@@ -663,13 +776,6 @@ def change_arg_to_image(knl, name):
 
 # }}}
 
-# {{{ duplicate inames
-
-def duplicate_inames(knl, inames):
-    pass
-
-# }}}
-
 
 
 
diff --git a/loopy/cse.py b/loopy/cse.py
index 48ef6e3566c55837e9a7c4194cdaa3be064adae8..28518cb230372279e829f0fc30da582ceb45303f 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -173,6 +173,9 @@ def build_global_storage_to_sweep_map(kernel, invocation_descriptors,
         global_stor2sweep = isl.Map.from_basic_map(stor2sweep)
     global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep)
 
+    # space for global_stor2sweep:
+    # [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn)
+
     # {{{ check if non-footprint-building invocation descriptors fall into footprint
 
     # Make all inames except the sweep into parameters. (The footprint may depend on those.)
@@ -506,7 +509,7 @@ class InvocationReplacer(ExpandingIdentityMapper):
         stor_subscript = []
         for sax_name, sax_source, sax_base_idx in zip(
                 self.storage_axis_names,
-                self.storage_axis_sources, 
+                self.storage_axis_sources,
                 self.storage_base_indices):
             if sax_name not in self.non1_storage_axis_names:
                 continue
@@ -640,10 +643,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
 
-    from loopy import infer_type
-    if dtype is None:
-        dtype = infer_type
-
     # }}}
 
     # {{{ process invocations in footprint generators, start invocation_descriptors
@@ -664,6 +663,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             invocation_descriptors.append(
                     InvocationDescriptor(args=args,
                         expands_footprint=True,
+                        expansion_stack=None,
                         ))
 
     # }}}
@@ -858,12 +858,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ set up temp variable
 
+    from loopy import infer_type
+    if dtype is None:
+        dtype = infer_type
+    else:
+        dtype = np.dtype(dtype)
+
     from loopy.kernel import TemporaryVariable
 
     new_temporary_variables = kernel.temporary_variables.copy()
     temp_var = TemporaryVariable(
             name=target_var_name,
-            dtype=np.dtype(dtype),
+            dtype=dtype,
             base_indices=(0,)*len(non1_storage_shape),
             shape=non1_storage_shape,
             is_local=None)
@@ -872,13 +878,13 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
-    result =  kernel.copy(
+    kernel = kernel.copy(
             domains=domch.get_domains_with(new_domain),
             instructions=[compute_insn] + kernel.instructions,
             temporary_variables=new_temporary_variables)
 
     from loopy import tag_inames
-    return tag_inames(result, new_iname_to_tag)
+    return tag_inames(kernel, new_iname_to_tag)
 
 
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 014a4f6e302f7228b7cc7a9e3836ac730a8cf82e..c00c46d3c7c9c0c51621b4f05939c86ccbda28f4 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -637,12 +637,13 @@ class _UniqueNameGenerator:
         return name in self.existing_names
 
     def add_name(self, name):
-        assert name not in self.existing_names
+        if self.is_name_conflicting(name):
+            raise ValueError("name '%s' conflicts with existing names")
         self.existing_names.add(name)
 
     def add_names(self, names):
-        assert not frozenset(names) & self.existing_names
-        self.existing_names.update(names)
+        for name in names:
+            self.add_name(name)
 
     def __call__(self, based_on="var"):
         for var_name in _generate_unique_possibilities(based_on):
@@ -1288,15 +1289,16 @@ class LoopKernel(Record):
         return self._get_inames_domain_backend(inames)
 
     @memoize_method
-    def get_leaf_domain_index(self, inames):
-        """Find the leaf of the domain tree needed to cover all inames."""
+    def get_leaf_domain_indices(self, inames):
+        """Find the leaves of the domain tree needed to cover all inames."""
 
         hdm = self._get_home_domain_map()
         ppd = self.all_parents_per_domain()
 
         domain_indices = set()
 
-        leaf_domain_index = None
+        # map root -> leaf
+        root_to_leaf = {}
 
         for iname in inames:
             home_domain_index = hdm[iname]
@@ -1304,27 +1306,39 @@ class LoopKernel(Record):
                 # nothin' new
                 continue
 
-            leaf_domain_index = home_domain_index
-
-            all_parents = set(ppd[home_domain_index])
-            if not domain_indices <= all_parents:
-                raise CannotBranchDomainTree("iname set '%s' requires "
-                        "branch in domain tree (when adding '%s')"
-                        % (", ".join(inames), iname))
+            domain_parents = [home_domain_index] + ppd[home_domain_index]
+            current_root = domain_parents[-1]
+            previous_leaf = root_to_leaf.get(current_root)
+
+            if previous_leaf is not None:
+                # Check that we don't branch the domain tree.
+                #
+                # Branching the domain tree is dangerous/ill-formed because
+                # it can introduce artificial restrictions on variables
+                # further up the tree.
+
+                prev_parents = set(ppd[previous_leaf])
+                if not prev_parents <= set(domain_parents):
+                    raise CannotBranchDomainTree("iname set '%s' requires "
+                            "branch in domain tree (when adding '%s')"
+                            % (", ".join(inames), iname))
+            else:
+                # We're adding a new root. That's fine.
+                pass
 
-            domain_indices.add(home_domain_index)
-            domain_indices.update(all_parents)
+            root_to_leaf[current_root] = home_domain_index
+            domain_indices.update(domain_parents)
 
-        return leaf_domain_index
+        return root_to_leaf.values()
 
     @memoize_method
     def _get_inames_domain_backend(self, inames):
-        leaf_dom_idx = self.get_leaf_domain_index(inames)
+        domain_indices = set()
+        for leaf_dom_idx in self.get_leaf_domain_indices(inames):
+            domain_indices.add(leaf_dom_idx)
+            domain_indices.update(self.all_parents_per_domain()[leaf_dom_idx])
 
-        return self.combine_domains(tuple(sorted(
-            self.all_parents_per_domain()[leaf_dom_idx]
-            + [leaf_dom_idx]
-            )))
+        return self.combine_domains(tuple(sorted(domain_indices)))
 
     # }}}
 
@@ -1824,7 +1838,13 @@ class DomainChanger:
     def __init__(self, kernel, inames):
         self.kernel = kernel
         if inames:
-            self.leaf_domain_index = kernel.get_leaf_domain_index(inames)
+            ldi = kernel.get_leaf_domain_indices(inames)
+            if len(ldi) > 1:
+                raise RuntimeError("Inames '%s' require more than one leaf "
+                        "domain, which makes the domain change that is part "
+                        "of your current operation ambiguous." % ", ".join(inames))
+
+            self.leaf_domain_index, = ldi
             self.domain = kernel.domains[self.leaf_domain_index]
 
         else:
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 860c9ec18692e6c8f75abac4e60f60e30edd8a06..fe7b458d8469c5c6a1a3c09470e34c43c5cd1e7a 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -269,6 +269,45 @@ class ExpansionState(Record):
     :ivar arg_context: a dict representing current argument values
     """
 
+class SubstitutionRuleRenamer(IdentityMapper):
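+    """Renames references to substitution rules, both calls like ``rule(...)``
+    and bare mentions of ``rule`` (with or without a ``$tag``), according to a
+    *renames* dict mapping current rule names to their replacements.
+    """
+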
+    def __init__(self, renames):
+        self.renames = renames
+
+    def map_call(self, expr):
+        if not isinstance(expr.function, Variable):
+            return IdentityMapper.map_call(self, expr)
+
+        name, tag = parse_tagged_name(expr.function)
+
+        new_name = self.renames.get(name)
+        if new_name is None:
+            return IdentityMapper.map_call(self, expr)
+
+        if tag is None:
+            sym = Variable(new_name)
+        else:
+            sym = TaggedVariable(new_name, tag)
+
+        return type(expr)(sym, tuple(self.rec(child) for child in expr.parameters))
+
+    def map_variable(self, expr):
+        name, tag = parse_tagged_name(expr)
+
+        new_name = self.renames.get(name)
+        if new_name is None:
+            return IdentityMapper.map_variable(self, expr)
+
+        if tag is None:
+            return Variable(new_name)
+        else:
+            return TaggedVariable(new_name, tag)
+
+def rename_subst_rules_in_instructions(insns, renames):
+    subst_renamer = SubstitutionRuleRenamer(renames)
+    return [
+            insn.copy(expression=subst_renamer(insn.expression))
+            for insn in insns]
+
 class ExpandingIdentityMapper(IdentityMapper):
     """Note: the third argument dragged around by this mapper is the
     current expansion state.
@@ -278,26 +317,26 @@ class ExpandingIdentityMapper(IdentityMapper):
         self.old_subst_rules = old_subst_rules
         self.make_unique_var_name = make_unique_var_name
 
-        # maps subst rule (args, bodies) to names
+        # maps subst rule (args, bodies) to (name, original_name)
         self.subst_rule_registry = dict(
-                ((rule.arguments, rule.expression), name)
+                ((rule.arguments, rule.expression), (name, name))
                 for name, rule in old_subst_rules.iteritems())
 
         # maps subst rule (args, bodies) to use counts
         self.subst_rule_use_count = {}
 
-    def register_subst_rule(self, name, args, body):
+    def register_subst_rule(self, original_name, args, body):
         """Returns a name (as a string) for a newly created substitution
         rule.
         """
         key = (args, body)
-        existing_name = self.subst_rule_registry.get(key)
+        reg_value = self.subst_rule_registry.get(key)
 
-        if existing_name is None:
-            new_name = self.make_unique_var_name(name)
-            self.subst_rule_registry[key] = new_name
+        if reg_value is None:
+            new_name = self.make_unique_var_name(original_name)
+            self.subst_rule_registry[key] = (new_name, original_name)
         else:
-            new_name = existing_name
+            new_name, _ = reg_value
 
         self.subst_rule_use_count[key] = self.subst_rule_use_count.get(key, 0) + 1
         return new_name
@@ -364,32 +403,76 @@ class ExpandingIdentityMapper(IdentityMapper):
         return IdentityMapper.__call__(self, expr, ExpansionState(
             stack=stack, arg_context={}))
 
-    def get_new_substitutions(self):
+    def _get_new_substitutions_and_renames(self):
+        """This makes a new dictionary of substitutions from the ones
+        encountered in mapping all the encountered expressions.
+        It tries hard to keep substitution names the same--i.e.
+        if all derivative versions of a substitution rule ended
+        up with the same mapped version, then this version should
+        retain the name that the substitution rule had previously.
+        Unfortunately, this can't be done in a single pass, and so
+        the routine returns an additional dictionary *subst_renames*
+        of renamings to be performed on the processed expressions.
+
+        The returned substitutions already have the rename applied
+        to them.
+
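+        For example (with hypothetical names): if the original rule ``cse``
+        was re-registered under the derived name ``cse_0`` during mapping and
+        no other derived version of ``cse`` remains in use, the result keeps
+        the rule under its old name ``cse`` and *subst_renames* is
+        ``{"cse_0": "cse"}``.
+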
+        :returns: (new_substitutions, subst_renames)
+        """
+
         from loopy.kernel import SubstitutionRule
 
+        orig_name_histogram = {}
+        for key, (name, orig_name) in self.subst_rule_registry.iteritems():
+            if self.subst_rule_use_count.get(key, 0):
+                orig_name_histogram[orig_name] = \
+                        orig_name_histogram.get(orig_name, 0) + 1
+
         result = {}
-        for key, name in self.subst_rule_registry.iteritems():
+        renames = {}
+
+        for key, (name, orig_name) in self.subst_rule_registry.iteritems():
             args, body = key
 
             if self.subst_rule_use_count.get(key, 0):
+                if orig_name_histogram[orig_name] == 1 and name != orig_name:
+                    renames[name] = orig_name
+                    name = orig_name
+
                 result[name] = SubstitutionRule(
                         name=name,
                         arguments=args,
                         expression=body)
 
-        return result
+        # {{{ perform renames on new substitutions
+
+        subst_renamer = SubstitutionRuleRenamer(renames)
+
+        renamed_result = {}
+        for name, rule in result.iteritems():
+            renamed_result[name] = rule.copy(
+                    expression=subst_renamer(rule.expression))
+
+        # }}}
+
+        return renamed_result, renames
 
     def map_kernel(self, kernel):
         new_insns = [
                 insn.copy(
+                    # While subst rules are not allowed in assignees, the mapper
+                    # may perform tasks entirely unrelated to subst rules, so
+                    # we must map assignees, too.
                     assignee=self(insn.assignee, insn.id),
+
                     expression=self(insn.expression, insn.id))
+
                 for insn in kernel.instructions]
 
+        new_substs, renames = self._get_new_substitutions_and_renames()
         return kernel.copy(
-            substitutions=self.get_new_substitutions(),
-            instructions=new_insns)
-
+            substitutions=new_substs,
+            instructions=rename_subst_rules_in_instructions(new_insns, renames))
 
 # }}}
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index b536cc94a86cfba377210fa742830d96e3f69a2f..8ff5bf8acfe1c0197f3177d892b0193155cbb2f4 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -51,8 +51,8 @@ def test_tim2d(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
-            "ur(a,b) := sum(@o, D[a,o]*u[e,o,b])",
-            "us(a,b) := sum(@o, D[b,o]*u[e,a,o])",
+            "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
+            "us(a,b) := sum(o, D[b,o]*u[e,a,o])",
 
             #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",
 
@@ -74,6 +74,9 @@ def test_tim2d(ctx_factory):
             ],
             name="semlap2D", assumptions="K>=1")
 
+    # Give "ur" and "us" each their own copy of the reduction iname "o"
+    # (replacing the former "@o" duplication markers above), so that the two
+    # reductions remain independent when they are precomputed below.
+    knl = lp.duplicate_inames(knl, "o", within="ur")
+    knl = lp.duplicate_inames(knl, "o", within="us")
+
     seq_knl = knl
 
     def variant_orig(knl):
@@ -82,11 +85,11 @@ def test_tim2d(ctx_factory):
         knl = lp.add_prefetch(knl, "D[:,:]")
         knl = lp.add_prefetch(knl, "u[e, :, :]")
 
-        knl = lp.precompute(knl, "ur(m,j)", np.float32, ["m", "j"])
-        knl = lp.precompute(knl, "us(i,m)", np.float32, ["i", "m"])
+        knl = lp.precompute(knl, "ur(m,j)", ["m", "j"])
+        knl = lp.precompute(knl, "us(i,m)", ["i", "m"])
 
-        knl = lp.precompute(knl, "Gux(m,j)", np.float32, ["m", "j"])
-        knl = lp.precompute(knl, "Guy(i,m)", np.float32, ["i", "m"])
+        knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"])
+        knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"])
 
         knl = lp.add_prefetch(knl, "G$x[:,e,:,:]")
         knl = lp.add_prefetch(knl, "G$y[:,e,:,:]")