diff --git a/MEMO b/MEMO
index 3914782d73167efc53ccbae5a8de12352d7de25e..225dfffd985865d135617e890cdbdde8d4cfc102 100644
--- a/MEMO
+++ b/MEMO
@@ -42,13 +42,10 @@ Things to consider
 To-do
 ^^^^^
 
-- variable shuffle detection
-  -> will need unification
-
 - Automatically generate testing code vs. sequential.
 
 - For forced workgroup sizes: check that at least one iname
-  maps to it.
+  maps to them.
 
 - If isl can prove that all operands are positive, may use '/' instead of
   'floor_div'.
@@ -95,6 +92,9 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- variable shuffle detection
+  -> will need unification
+
 - Dimension joining
 
 - user interface for dim length prescription
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 4eb0e2c55bfd7a36c42edfda26438d407f3527c3..3b80b4689d5e80c340e3a25b74722331ccad7de2 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -235,9 +235,9 @@ def make_kernel(*args, **kwargs):
 def split_dimension(kernel, iname, inner_length,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
-        slabs=(0, 0)):
+        slabs=(0, 0), do_tagged_check=True):
 
-    if kernel.iname_to_tag.get(iname) is not None:
+    if do_tagged_check and kernel.iname_to_tag.get(iname) is not None:
         raise RuntimeError("cannot split already tagged iname '%s'" % iname)
 
     if iname not in kernel.all_inames():
diff --git a/loopy/cse.py b/loopy/cse.py
index d169d0d5bdb9534b9d6cb542778811f5934b9232..6a1ab747e8556eeab38e10cc1e6e45ca8b5c4647 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -295,7 +295,7 @@ def process_cses(kernel, lead_csed, cse_descriptors):
 
 
 def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local,
-        new_inames, ind_iname_to_tag):
+        independent_inames, new_inames, ind_iname_to_tag):
     insn = lead_csed.insn
 
     # {{{ decide whether to force a dep
@@ -310,13 +310,13 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local,
     assert dependencies <= parent_inames
 
     for iname in parent_inames:
-        if iname in lead_csed.independent_inames:
+        if iname in independent_inames:
             tag = ind_iname_to_tag[iname]
         else:
             tag = kernel.iname_to_tag.get(iname)
 
         if should_cse_force_iname_dep(
-                iname, lead_csed.independent_inames, tag, dependencies,
+                iname, independent_inames, tag, dependencies,
                 target_var_is_local, lead_csed.cse):
             forced_iname_deps.add(iname)
 
@@ -324,7 +324,7 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local,
 
     assignee = var(target_var_name)
 
-    if lead_csed.independent_inames:
+    if new_inames:
         assignee = assignee[tuple(
             var(iname) for iname in new_inames
             )]
@@ -334,8 +334,7 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local,
     subst_map = SubstitutionMapper(make_subst_func(
         dict(
             (old_iname, var(new_iname))
-            for old_iname, new_iname in zip(lead_csed.independent_inames,
-                new_inames))))
+            for old_iname, new_iname in zip(independent_inames, new_inames))))
     new_inner_expr = subst_map(lead_csed.cse.child)
 
     insn_prefix = lead_csed.cse.prefix
@@ -483,7 +482,7 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
 
     compute_insn = make_compute_insn(
             kernel, lead_csed, target_var_name, target_var_is_local,
-            new_inames, ind_iname_to_tag)
+            independent_inames, new_inames, ind_iname_to_tag)
 
     # {{{ substitute variable references into instructions
 
@@ -493,12 +492,16 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
 
             lead_indices = [var(iname) for iname in independent_inames]
         else:
+            found = False
             for csed in cse_descriptors:
                 if cse is csed.cse:
+                    found = True
                     break
 
-            if cse is not csed.cse:
-                return rec(cse.child)
+            if not found:
+                from pymbolic.primitives import CommonSubexpression
+                return CommonSubexpression(
+                        rec(cse.child), cse.prefix)
 
             lead_indices = csed.lead_index_exprs
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index d7d02effe36ccb29720bb1388922105f748b39a4..2bed109f204ce3668444c3d1a7de2a2bbb1fc2b1 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -359,7 +359,8 @@ def assign_automatic_axes(kernel, only_axis_0=True):
                 from loopy import split_dimension
                 return assign_automatic_axes(
                         split_dimension(kernel, iname, inner_length=local_size[axis],
-                            outer_tag=UnrollTag(), inner_tag=new_tag),
+                            outer_tag=UnrollTag(), inner_tag=new_tag,
+                            do_tagged_check=False),
                         only_axis_0=only_axis_0)
 
         new_iname_to_tag = kernel.iname_to_tag.copy()
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 23370ced982b89f4e02d7b58d9db025c64bbf661..f5e88286a9ee8feb2868423ccb41382bbc8d52d8 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -175,7 +175,6 @@ def test_transpose(ctx_factory):
     knl = lp.split_dimension(knl, "j", 16,
             outer_tag="g.1", inner_tag="l.0")
     knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])
-    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
 
     kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen, {})