diff --git a/MEMO b/MEMO
index a281b98085a5ce9e37102cbf94231596e6563b79..05eaf2de75a810bd9ff726646ac8ae53d56bd382 100644
--- a/MEMO
+++ b/MEMO
@@ -34,6 +34,8 @@ Things to consider
 
 - Measure efficiency of corner cases
 
+- Loopy as a data model for implementing custom rewritings
+
 To-do
 ^^^^^
 
@@ -45,12 +47,18 @@ To-do
 
 - user interface for dim length prescription
 
+- Way too many barriers in SEM test.
+
 - Deal with equality constraints.
   (These arise, e.g., when partitioning a loop of length 16 into 16s.)
 
 Future ideas
 ^^^^^^^^^^^^
 
+- Float4 joining on fetch/store?
+
+- How can one automatically generate something like microblocks?
+
 - Better for loop bound generation
   -> Try a triangular loop
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 3c3f25751d7f0acdb04b147927f09b6e6e2acf37..1cc356f77d4d19a4c9c334a1de2bce49a83b4f63 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -699,7 +699,8 @@ class LoopKernel(Record):
         if all_inames_by_insns != self.all_inames():
             raise RuntimeError("inames collected from instructions (%s) "
                     "do not match domain inames (%s)"
-                    % (", ".join(all_inames_by_insns), ", ".join(self.all_inames())))
+                    % (", ".join(sorted(all_inames_by_insns)), 
+                        ", ".join(sorted(self.all_inames()))))
 
         global_sizes = {}
         local_sizes = {}
@@ -780,11 +781,13 @@ class LoopKernel(Record):
     def __str__(self):
         lines = []
 
-        for insn in self.instructions:
-            lines.append(str(insn))
-        lines.append("")
         for iname in sorted(self.all_inames()):
             lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname)))
+        lines.append("")
+        lines.append(str(self.domain))
+        lines.append("")
+        for insn in self.instructions:
+            lines.append(str(insn))
 
         return "\n".join(lines)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 15573cf251c52557a2baf6158063bb640f2bf8f7..990a4fc7f2cb526b73a7afa514f9e571c85c0c40 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -6,6 +6,86 @@ import pyopencl.characterize as cl_char
 
 
 
+# {{{ make reduction variables unique
+
+def make_reduction_variables_unique(kernel):
+    """Return a copy of *kernel* in which an iname shared by several
+    reductions is replaced by a fresh iname in all but its last-visited use.
+
+    The domain is passed through :func:`loopy.isl_helpers.duplicate_axes`
+    once for every (old iname, new iname) pair created here.
+    """
+    from loopy.symbolic import ReductionCallbackMapper
+
+    # {{{ count number of uses of each reduction iname
+
+    reduction_iname_uses = {}
+
+    def count_reduction_iname_uses(expr, rec):
+        rec(expr.expr)
+        for iname in expr.inames:
+            reduction_iname_uses[iname] = (
+                    reduction_iname_uses.get(iname, 0) + 1)
+
+    cb_mapper = ReductionCallbackMapper(count_reduction_iname_uses)
+    for insn in kernel.instructions:
+        cb_mapper(insn.expression)
+
+    # }}}
+
+    # {{{ make iname uses in reduction unique
+
+    # parallel lists: old_inames[i] was duplicated as new_inames[i]
+    old_inames = []
+    new_inames = []
+
+    def ensure_reduction_iname_uniqueness(expr, rec):
+        from pymbolic import var
+        child = rec(expr.expr)
+        subst_dict = {}
+        new_red_inames = []
+
+        for iname in expr.inames:
+            if reduction_iname_uses[iname] > 1:
+                new_iname = kernel.make_unique_var_name(iname, set(new_inames))
+
+                old_inames.append(iname)
+                new_inames.append(new_iname)
+                new_red_inames.append(new_iname)
+                reduction_iname_uses[iname] -= 1
+
+                # Key by the iname actually renamed: zipping expr.inames with
+                # the created names mispairs them when only some are renamed.
+                subst_dict[iname] = var(new_iname)
+            else:
+                # the last remaining use keeps the original iname
+                new_red_inames.append(iname)
+
+        if subst_dict:
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+            child = SubstitutionMapper(make_subst_func(subst_dict))(child)
+
+        from loopy.symbolic import Reduction
+        return Reduction(operation=expr.operation,
+                inames=tuple(new_red_inames), expr=child)
+
+    cb_mapper = ReductionCallbackMapper(ensure_reduction_iname_uniqueness)
+    new_insns = [
+        insn.copy(expression=cb_mapper(insn.expression))
+        for insn in kernel.instructions]
+
+    domain = kernel.domain
+    from loopy.isl_helpers import duplicate_axes
+    for old, new in zip(old_inames, new_inames):
+        domain = duplicate_axes(domain, [old], [new])
+
+    return kernel.copy(instructions=new_insns, domain=domain)
+
+    # }}}
+
+# }}}
+
 # {{{ rewrite reduction to imperative form
 
 def realize_reduction(kernel):
@@ -466,6 +546,7 @@ def adjust_local_temp_var_storage(kernel):
 
 
 def preprocess_kernel(kernel):
+    kernel = make_reduction_variables_unique(kernel)
     kernel = realize_reduction(kernel)
 
     # {{{ check that all CSEs have been realized
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index bb4fd83fb22bcbe46a72c23db2d266ab1ba472b4..5e08fbd44a12b031361e29dcbc29d3a64c12e790 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -514,7 +514,7 @@ class IndexVariableFinder(CombineMapper):
         result = self.rec(expr.expr)
         if not (set(expr.inames) & result):
             raise RuntimeError("reduction '%s' does not depend on "
-                    "reduction inames" % expr)
+                    "reduction inames (%s)" % (expr, ",".join(expr.inames)))
         if self.include_reduction_inames:
             return result
         else:
diff --git a/test/test_sem.py b/test/test_sem.py
index bdc7d678ed66dae150acdd5e75c4939b5352c983..1d9f530e96638324ec7c81b0cd8b1973071b9ca4 100644
--- a/test/test_sem.py
+++ b/test/test_sem.py
@@ -4,7 +4,6 @@ import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
 import pyopencl.array as cl_array
-import pyopencl.clrandom as cl_random
 import loopy as lp
 
 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -287,16 +286,16 @@ def test_sem_3d(ctx_factory):
     # K - run-time symbolic
     n = 8
     knl = lp.make_kernel(ctx.devices[0],
-            "[K] -> {[i,j,k,e,m,mp]: 0<=i,j,k,m<%d and 0<=e<K}" % n,
+            "[K] -> {[i,j,k,e,m]: 0<=i,j,k,m<%d and 0<=e<K}" % n,
             [
                 "[|i,j,k] <float32> ur[i,j,k] = sum_float32(m, D[i,m]*u[m,j,k,e])",
                 "[|i,j,k] <float32> us[i,j,k] = sum_float32(m, D[j,m]*u[i,m,k,e])",
                 "[|i,j,k] <float32> ut[i,j,k] = sum_float32(m, D[k,m]*u[i,j,m,e])",
 
                 "lap[i,j,k,e]  = "
-                "  sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k,e] + G[1,m,j,k,e]*us[m,j,k,e] + G[2,m,j,k,e]*ut[m,j,k,e]))"
-                "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k,e] + G[3,i,m,k,e]*us[i,m,k,e] + G[4,i,m,k,e]*ut[i,m,k,e]))"
-                "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m,e] + G[4,i,j,m,e]*us[i,j,m,e] + G[5,i,j,m,e]*ut[i,j,m,e]))"
+                "  sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k] + G[1,m,j,k,e]*us[m,j,k] + G[2,m,j,k,e]*ut[m,j,k]))"
+                "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k] + G[3,i,m,k,e]*us[i,m,k] + G[4,i,m,k,e]*ut[i,m,k]))"
+                "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m] + G[4,i,j,m,e]*us[i,j,m] + G[5,i,j,m,e]*ut[i,j,m]))"
                 ],
             [
             lp.ArrayArg("u",   dtype, shape=field_shape, order=order),
@@ -307,17 +306,18 @@ def test_sem_3d(ctx_factory):
             ],
             name="semlap", assumptions="K>=1")
 
-    print knl
+    #print knl
     #for tv in knl.temporary_variables.iteritems():
         #print tv
-    1/0
+    #1/0
 
     knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
     #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp")
     knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1"))
     #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"])
-    knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"])
-    print knl
+    #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"])
+    knl = lp.preprocess_kernel(knl)
+    #print knl
     #1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
@@ -343,10 +343,6 @@ def test_sem_3d(ctx_factory):
 
 
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the
-    # tests.
-    import pyopencl as cl
-
     import sys
     if len(sys.argv) > 1:
         exec(sys.argv[1])