diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8633d71defa850293b2e6c8a1dfd80b812275800..e1745099d236a0a4fd94b04d33b973dda3acc731 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -388,8 +388,7 @@ def tag_dimensions(kernel, iname_to_tag, force=False):
     iname_to_tag = dict((iname, parse_tag(tag))
             for iname, tag in iname_to_tag.iteritems())
 
-    from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase,
-            LocalIndexTag)
+    from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase)
 
     new_iname_to_tag = kernel.iname_to_tag.copy()
     for iname, new_tag in iname_to_tag.iteritems():
@@ -397,9 +396,7 @@ def tag_dimensions(kernel, iname_to_tag, force=False):
 
         retag_ok = False
 
-        if (isinstance(old_tag, AutoLocalIndexTagBase)
-                and (new_tag is None
-                    or isinstance(new_tag, LocalIndexTag))):
+        if isinstance(old_tag, AutoLocalIndexTagBase):
             retag_ok = True
 
         if not retag_ok and old_tag is not None and new_tag is None:
diff --git a/loopy/cse.py b/loopy/cse.py
index b16983f5e3bbf8eeddcc2d7c84542065131b5c54..db1da2d19ae46f19b1b2299573d3bae490a2e44a 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -476,8 +476,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
         if new_storage_axis_names is not None and i < len(new_storage_axis_names):
             name = new_storage_axis_names[i]
             tag_lookup_saxis = name
-            if new_name in (kernel.all_variable_names() | newly_created_var_names):
-                raise RuntimeError("new storage axis name '%s' already exists" % new_name)
+            if name in (kernel.all_variable_names() | newly_created_var_names):
+                raise RuntimeError("new storage axis name '%s' already exists" % name)
 
         if name in (kernel.all_variable_names()
                 | newly_created_var_names):
@@ -488,7 +488,7 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
         storage_axis_name_to_tag[name] = storage_axis_to_tag.get(
                 tag_lookup_saxis, default_tag)
 
-        newly_created_var_names.add(new_name)
+        newly_created_var_names.add(name)
         expr_subst_dict[old_name] = var(name)
 
     del storage_axis_to_tag
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 0f9db5f724a46f3ff53a6274b4a4ed04ca68469a..fb2cee8c26e8585c3ab05e9d855a4807805a2d53 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -40,6 +40,7 @@ def test_nbody(ctx_factory):
     def variant_cpu(knl):
         knl = lp.split_dimension(knl, "i", 1024,
                 outer_tag="g.0", slabs=(0,1))
+        knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
         return knl, []
 
     def variant_gpu(knl):
@@ -47,10 +48,12 @@ def test_nbody(ctx_factory):
                 outer_tag="g.0", inner_tag="l.0", slabs=(0,1))
         knl = lp.split_dimension(knl, "j", 256, slabs=(0,1))
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
-        knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"])
+        knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
+                ["x_fetch_j", "x_fetch_k"])
+        knl = lp.tag_dimensions(knl, dict(x_fetch_k="unr"))
         return knl, ["j_outer", "j_inner"]
 
-    n = 100
+    n = 3000
 
     for variant in [variant_gpu]:
         variant_knl, loop_prio = variant(knl)
@@ -59,7 +62,7 @@ def test_nbody(ctx_factory):
         kernel_gen = lp.check_kernels(kernel_gen, dict(N=n))
 
         lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
-                op_count=4*n**2*1e-9, op_label="GOps/s",
+                op_count=n**2*1e-6, op_label="M particle pairs",
                 parameters={"N": n}, print_ref_code=True)