diff --git a/loopy/__init__.py b/loopy/__init__.py
index 4d296fe4b0a6898040b564a59aacf3101cc4375b..c899824dc3d720fa8850bf075e5ab82d78f87934 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -428,7 +428,7 @@ def add_prefetch(kernel, var_name, sweep_dims, dim_args=None,
     newly_created_vars = set()
     parameters = []
     for i in range(len(arg.shape)):
-        based_on = "%s_i%d" % (var_name, i)
+        based_on = "%s_fetch_%d" % (var_name, i)
         if dim_args is not None and i < len(dim_args):
             based_on = dim_args[i]
 
diff --git a/loopy/check.py b/loopy/check.py
index f490f85512817ed13710732f20973eae562bcf49..16b4c72845d490ce33c852334bfb0f4ec76f523d 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -58,7 +58,7 @@ def check_for_double_use_of_hw_axes(kernel):
             if isinstance(tag, UniqueTag):
                 key = tag.key
                 if key in insn_tag_keys:
-                    raise RuntimeError("instruction '%s' has two "
+                    raise RuntimeError("instruction '%s' has multiple "
                             "inames tagged '%s'" % (insn.id, tag))
 
                 insn_tag_keys.add(key)
@@ -155,7 +155,7 @@ def check_for_write_races(kernel):
             raise RuntimeError(
                     "instruction '%s' contains a write race: "
                     "instruction will be run across parallel iname(s) '%s', which "
-                    "is/are not referenced in the assignee index"
+                    "is/are not referenced in the lhs index"
                     % (insn.id, ",".join(inames_without_write_dep)))
 
 def check_for_orphaned_user_hardware_axes(kernel):
diff --git a/loopy/cse.py b/loopy/cse.py
index 748ef458b443be1e154a9d8751805337376b2cf1..9f184edda971d60723054995e0471642b5354813 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -36,7 +36,7 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 
 
 
-def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
+def get_footprint(kernel, subst_name, old_arg_names, arg_names,
         sweep_inames, invocation_descriptors):
     global_footprint_map = None
 
@@ -45,8 +45,8 @@ def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
     for invdesc in invocation_descriptors:
 
         for iname in sweep_inames:
-            if iname in arg_names:
-                arg_idx = arg_names.index(iname)
+            if iname in old_arg_names:
+                arg_idx = old_arg_names.index(iname)
                 processed_sweep_inames.add(
                         get_dependencies(invdesc.args[arg_idx]))
             else:
@@ -55,11 +55,11 @@ def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
         # {{{ construct, check mapping
 
         map_space = kernel.space
-        ln = len(unique_new_arg_names)
+        ln = len(arg_names)
         rn = kernel.space.dim(dim_type.out)
 
         map_space = map_space.add_dims(dim_type.in_, ln)
-        for i, iname in enumerate(unique_new_arg_names):
+        for i, iname in enumerate(arg_names):
             map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'")
 
         set_space = map_space.move_dims(
@@ -69,7 +69,7 @@ def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
         footprint_map = None
 
         from loopy.symbolic import aff_from_expr
-        for uarg_name, arg_val in zip(unique_new_arg_names, invdesc.args):
+        for uarg_name, arg_val in zip(arg_names, invdesc.args):
             cns = isl.Constraint.equality_from_aff(
                     aff_from_expr(set_space, var(uarg_name+"'") - arg_val))
 
@@ -92,7 +92,8 @@ def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
 
     processed_sweep_inames = list(processed_sweep_inames)
 
-    global_footprint_map = global_footprint_map.intersect_range(kernel.domain)
+    global_footprint_map = (isl.Map.from_basic_map(global_footprint_map)
+            .intersect_range(kernel.domain))
 
     # move non-sweep-dimensions into parameter space
     sweep_footprint_map = global_footprint_map.coalesce()
@@ -114,51 +115,72 @@ def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
                 % subst_name)
 
     from loopy.kernel import find_var_base_indices_and_shape_from_inames
-    base_indices, shape = find_var_base_indices_and_shape_from_inames(
-            sfm_dom, [uarg+"'" for uarg in unique_new_arg_names],
+    arg_base_indices, shape = find_var_base_indices_and_shape_from_inames(
+            sfm_dom, [uarg+"'" for uarg in arg_names],
             kernel.cache_manager)
+    print arg_names, shape
 
     # compute augmented domain
 
+    # {{{ filter out unit-length dimensions
+
+    non1_arg_names = []
+    non1_arg_base_indices = []
+    non1_shape = []
+
+    for arg_name, bi, l in zip(arg_names, arg_base_indices, shape):
+        if l > 1:
+            non1_arg_names.append(arg_name)
+            non1_arg_base_indices.append(bi)
+            non1_shape.append(l)
+
+    # }}}
+
     # {{{ subtract off the base indices
     # add the new, base-0 as new in dimensions
 
     sp = global_footprint_map.get_space()
     tgt_idx = sp.dim(dim_type.out)
 
-    n_args = len(unique_new_arg_names)
+    n_args = len(arg_names)
+    nn1_args = len(non1_arg_names)
 
     aug_domain = global_footprint_map.move_dims(
             dim_type.out, tgt_idx,
             dim_type.in_, 0,
             n_args).range().coalesce()
 
-    aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, n_args)
-    for i, name in enumerate(unique_new_arg_names):
+    aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, nn1_args)
+    for i, name in enumerate(non1_arg_names):
         aug_domain = aug_domain.set_dim_name(dim_type.set, tgt_idx+i, name)
 
     # index layout now:
-    # <....out.....> (tgt_idx) <base-0 args> <args>
+    # <....out.....> (tgt_idx) <base-0 non-1-length args> <args>
 
     from loopy.symbolic import aff_from_expr
-    for uarg_name, bi in zip(unique_new_arg_names, base_indices):
-        cns = isl.Constraint.equality_from_aff(
-                aff_from_expr(aug_domain.get_space(),
-                    var(uarg_name) - (var(uarg_name+"'") - bi)))
+    for arg_name, bi, s in zip(arg_names, arg_base_indices, shape):
+        if s > 1:
+            cns = isl.Constraint.equality_from_aff(
+                    aff_from_expr(aug_domain.get_space(),
+                        var(arg_name) - (var(arg_name+"'") - bi)))
+
+            aug_domain = aug_domain.add_constraint(cns)
+
+    # }}}
 
-        aug_domain = aug_domain.add_constraint(cns)
+    # eliminate inames with non-zero base indices
 
-    aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+n_args, n_args)
-    aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+n_args, n_args)
+    aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+nn1_args, n_args)
+    aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+nn1_args, n_args)
 
     base_indices_2, shape_2 = find_var_base_indices_and_shape_from_inames(
-            aug_domain, unique_new_arg_names,
-            kernel.cache_manager)
+            aug_domain, non1_arg_names, kernel.cache_manager)
 
-    assert base_indices_2 == [0] * n_args
-    assert shape_2 == shape
+    assert base_indices_2 == [0] * nn1_args
+    assert shape_2 == non1_shape
 
-    return aug_domain, base_indices, shape
+    return (non1_arg_names, aug_domain,
+            arg_base_indices, non1_arg_base_indices, non1_shape)
 
 
 
@@ -229,7 +251,7 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
 
     newly_created_var_names = set()
 
-    # {{{ make sure that new
+    # {{{ make sure that new arg names are unique
 
     # (and substitute in subst_expressions if any variable name changes are necessary)
 
@@ -252,12 +274,16 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
 
         if new_name is not None:
             old_to_new[name] = var(new_name)
-            newly_created_var_names.add(new_name)
             unique_new_arg_names.append(new_name)
             new_arg_name_to_tag[new_name] = arg_name_to_tag[name]
+            newly_created_var_names.add(new_name)
         else:
             unique_new_arg_names.append(name)
             new_arg_name_to_tag[name] = arg_name_to_tag[name]
+            newly_created_var_names.add(name)
+
+    old_arg_names = arg_names
+    arg_names = unique_new_arg_names
 
     arg_name_to_tag = new_arg_name_to_tag
     subst_expr = (
@@ -269,10 +295,10 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
     # {{{ align and intersect the footprint and the domain
 
     # (If there are independent inames, this adds extra dimensions to the domain.)
-
-    new_domain, target_var_base_indices, target_var_shape = \
-            get_footprint(kernel, subst_name, arg_names, unique_new_arg_names,
-                    sweep_inames, invocation_descriptors)
+    (non1_arg_names, new_domain,
+                arg_base_indices, non1_arg_base_indices, non1_shape) = \
+                        get_footprint(kernel, subst_name, old_arg_names, arg_names,
+                                sweep_inames, invocation_descriptors)
 
     new_domain = new_domain.coalesce()
     if isinstance(new_domain, isl.Set):
@@ -296,8 +322,8 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
     new_temporary_variables[target_var_name] = TemporaryVariable(
             name=target_var_name,
             dtype=np.dtype(dtype),
-            base_indices=target_var_base_indices,
-            shape=target_var_shape,
+            base_indices=(0,)*len(non1_shape),
+            shape=non1_shape,
             is_local=None)
 
     # }}}
@@ -306,14 +332,27 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
 
     assignee = var(target_var_name)
 
-    if unique_new_arg_names:
-        assignee = assignee[tuple(var(iname) for iname in unique_new_arg_names)]
+    if non1_arg_names:
+        assignee = assignee[tuple(var(iname) for iname in non1_arg_names)]
+
+    def zero_length_1_arg(arg_name):
+        if arg_name in non1_arg_names:
+            return var(arg_name)
+        else:
+            return 0
+
+    compute_expr = (SubstitutionMapper(
+        make_subst_func(dict(
+            (arg_name, zero_length_1_arg(arg_name)+bi)
+            for arg_name, bi in zip(arg_names, arg_base_indices)
+            )))
+        (subst_expr))
 
     from loopy.kernel import Instruction
     compute_insn = Instruction(
             id=kernel.make_unique_instruction_id(based_on=subst_name),
             assignee=assignee,
-            expression=subst_expr)
+            expression=compute_expr)
 
     # }}}
 
@@ -330,7 +369,7 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
             return
 
         args = [simplify_via_aff(new_domain.get_space(), arg-bi)
-                for arg, bi in zip(args, target_var_base_indices)]
+                for arg, bi in zip(args, non1_arg_base_indices)]
 
         new_outer_expr = var(target_var_name)
         if args:
@@ -348,7 +387,8 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
     # }}}
 
     new_iname_to_tag = kernel.iname_to_tag.copy()
-    new_iname_to_tag.update(arg_name_to_tag)
+    if sweep_inames:
+        new_iname_to_tag.update(arg_name_to_tag)
 
     new_substs = dict(
             (s.name, s.copy(expression=sub_map(subst.expression)))
diff --git a/loopy/kernel.py b/loopy/kernel.py
index e8c80d36a2c24d952a97011570cb3d6ee847f4ec..bcd09f820a70cc55c45508d789c0c565550d931e 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -1112,12 +1112,13 @@ def find_var_base_indices_and_shape_from_inames(domain, inames, cache_manager):
         lower_bound_pw_aff = cache_manager.dim_min(domain, iname_to_dim[iname][1])
         upper_bound_pw_aff = cache_manager.dim_max(domain, iname_to_dim[iname][1])
 
-        from loopy.isl_helpers import static_max_of_pw_aff
+        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
         from loopy.symbolic import pw_aff_to_expr
 
         shape.append(pw_aff_to_expr(static_max_of_pw_aff(
                 upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True)))
-        base_indices.append(pw_aff_to_expr(lower_bound_pw_aff))
+        base_indices.append(pw_aff_to_expr(
+            static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False)))
 
     return base_indices, shape
 
diff --git a/loopy/subst.py b/loopy/subst.py
index 0b56a9a3acbce2578f459d84d85e2de530bdd396..1d44ed29d134ae2eb1f9b6c42bfd8794ca38ef81 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -91,7 +91,6 @@ def extract_subst(kernel, subst_name, template, parameters):
             lhs_mapping_candidates=set(parameters) | set(matching_vars))
 
     def gather_exprs(expr, mapper):
-        print expr
         urecs = unif(template, expr)
 
         if urecs:
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index d980de74a254ad8f3f3f05ad2569bbc61603c879..7804c1cdc27a89ca662b08d5b98ed3616c4411a1 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -85,7 +85,7 @@ class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
 
 class WalkMapper(WalkMapperBase):
     def map_reduction(self, expr):
-        self.rec(expr.expression)
+        self.rec(expr.expr)
 
 class CallbackMapper(CallbackMapperBase, IdentityMapper):
     map_reduction = CallbackMapperBase.map_constant
@@ -330,6 +330,9 @@ class LoopyCCodeMapper(CCodeMapper):
         var_subst_map.update(assignments)
         return self.copy(var_subst_map=var_subst_map)
 
+    def map_common_subexpression(self, expr, prec):
+        raise RuntimeError("common subexpressions are not allowed in loopy")
+
     def map_variable(self, expr, prec):
         if expr.name in self.var_subst_map:
             if self.with_annotation:
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 0c09032b803e4765a055f7cc775b08b1e748a3ea..55f21cf2448d1452a42ecdf9f55213ab89d2f7e4 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -219,8 +219,10 @@ def test_plain_matrix_mul(ctx_factory):
     knl = lp.split_dimension(knl, "j", 16,
             outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_dimension(knl, "k", 16)
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"])
-    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])
+
+    print lp.preprocess_kernel(knl)
 
     kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen, {})
@@ -319,8 +321,8 @@ def test_rank_one(ctx_factory):
             name="rank_one", assumptions="n >= 16")
 
     def variant_1(knl):
-        knl = lp.add_prefetch(knl, "a")
-        knl = lp.add_prefetch(knl, "b")
+        knl = lp.add_prefetch(knl, "a", [])
+        knl = lp.add_prefetch(knl, "b", [])
         return knl
 
     def variant_2(knl):
@@ -329,8 +331,8 @@ def test_rank_one(ctx_factory):
         knl = lp.split_dimension(knl, "j", 16,
                 outer_tag="g.1", inner_tag="l.1")
 
-        knl = lp.add_prefetch(knl, "a")
-        knl = lp.add_prefetch(knl, "b")
+        knl = lp.add_prefetch(knl, "a", [])
+        knl = lp.add_prefetch(knl, "b", [])
         return knl
 
     def variant_3(knl):
@@ -360,15 +362,16 @@ def test_rank_one(ctx_factory):
         knl = lp.split_dimension(knl, "j_inner", 16,
                 inner_tag="l.1")
 
-        knl = lp.split_dimension(knl, "j_inner_fetch_b", 16,
+        knl = lp.split_dimension(knl, "a_fetch_0", 16,
                 outer_tag="l.1", inner_tag="l.0")
-        knl = lp.split_dimension(knl, "i_inner_fetch_a", 16,
+        knl = lp.split_dimension(knl, "b_fetch_0", 16,
                 outer_tag="l.1", inner_tag="l.0")
         return knl
 
     seq_knl = knl
 
-    for variant in [variant_1, variant_2, variant_4]:
+    #for variant in [variant_1, variant_2, variant_4]:
+    for variant in [variant_2, variant_4]:
         kernel_gen = lp.generate_loop_schedules(variant(knl))
         kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6bc77e6ad9353207dd3f4cd9da71d3df3936dbae..2eeea576fb0865a2510216422064f264248035cd 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -67,14 +67,13 @@ def test_multi_cse(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i]: 0<=i<100}",
             [
-                "[i] <float32> z[i] = cse(a[i]) + cse(a[i])**2"
+                "[i] <float32> z[i] = a[i] + a[i]**2"
                 ],
             [lp.ArrayArg("a", np.float32, shape=(100,))],
             local_sizes={0: 16})
 
     knl = lp.split_dimension(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
-    #knl = lp.realize_cse(knl, None, np.float32, ["i_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen)
@@ -86,38 +85,6 @@ def test_multi_cse(ctx_factory):
 
 
 
-def test_bad_stencil(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(ctx.devices[0],
-            "{[i,j]: 0<= i,j < 32}",
-            [
-                "[i] <float32> z[i,j] = -2*cse(a[i,j])"
-                    " + cse(a[i,j-1])"
-                    " + cse(a[i,j+1])"
-                    " + cse(a[i-1,j])"
-                    " + cse(a[i+1,i])" # watch out: i!
-                ],
-            [
-                lp.ArrayArg("a", np.float32, shape=(32,32,))
-                ])
-
-    def variant_2(knl):
-        knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
-        knl = lp.realize_cse(knl, None, np.float32, ["i_inner", "j"])
-        return knl
-
-    for variant in [variant_2]:
-        kernel_gen = lp.generate_loop_schedules(variant(knl),
-                loop_priority=["i_outer", "i_inner_0", "j_0"])
-        kernel_gen = lp.check_kernels(kernel_gen)
-
-        for knl in kernel_gen:
-            print lp.generate_code(knl)
-
-
-
-
 
 def test_stencil(ctx_factory):
     ctx = ctx_factory()