diff --git a/MEMO b/MEMO
index 751f9769df87ed194078f4dc4e6c206c7b2cd198..10448216275060abadc24c1e2ff3a5a7c21d7ae8 100644
--- a/MEMO
+++ b/MEMO
@@ -43,10 +43,21 @@ To-do
 ^^^^^
 
 - variable shuffle detection
+  -> will need unification
+
+- Fix all tests
+
+- Automatically generate testing code vs. sequential.
 
 - Deal with equality constraints.
   (These arise, e.g., when partitioning a loop of length 16 into 16s.)
 
+- duplicate_dimensions can be implemented without having to muck around
+  with individual constraints:
+  - add_dims
+  - move_dims
+  - intersect
+
 Future ideas
 ^^^^^^^^^^^^
 
@@ -84,6 +95,8 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- Dimension joining
+
 - user interface for dim length prescription
 
 - Restrict-to-sequential and tagging have nothing to do with each other.
diff --git a/loopy/__init__.py b/loopy/__init__.py
index aab4f1083774369b9347b227545c217e56eac33e..ae0a6d4ae4a14fde1ba9520d9febba4e28a3fcff 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -161,8 +161,8 @@ def make_kernel(*args, **kwargs):
             insn = insn.copy(
                     assignee=subst_map(insn.assignee),
                     expression=new_expression,
-                    forced_iname_deps=[
-                        old_to_new.get(iname, iname) for iname in insn.forced_iname_deps],
+                    forced_iname_deps=set(
+                        old_to_new.get(iname, iname) for iname in insn.forced_iname_deps),
                     )
 
         # }}}
@@ -224,8 +224,7 @@ def make_kernel(*args, **kwargs):
 
 # {{{ user-facing kernel manipulation functionality
 
-
-def split_dimension(kernel, iname, inner_length, padded_length=None,
+def split_dimension(kernel, iname, inner_length,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
         slabs=(0, 0)):
@@ -233,9 +232,6 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
     if iname not in kernel.all_inames():
         raise ValueError("cannot split loop for unknown variable '%s'" % iname)
 
-    if padded_length is not None:
-        inner_tag = inner_tag.copy(forced_length=padded_length)
-
     if outer_iname is None:
         outer_iname = iname+"_outer"
     if inner_iname is None:
@@ -286,9 +282,9 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
         new_expr = subst_mapper(rls(insn.expression))
 
         if iname in insn.forced_iname_deps:
-            new_forced_iname_deps = insn.forced_iname_deps[:]
+            new_forced_iname_deps = insn.forced_iname_deps.copy()
             new_forced_iname_deps.remove(iname)
-            new_forced_iname_deps.extend([outer_iname, inner_iname])
+            new_forced_iname_deps.update([outer_iname, inner_iname])
         else:
             new_forced_iname_deps = insn.forced_iname_deps
 
@@ -307,7 +303,6 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
     result = (kernel
             .copy(domain=new_domain,
                 iname_slab_increments=iname_slab_increments,
-                iname_to_dim=None,
                 instructions=new_insns))
 
     return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag})
@@ -390,6 +385,10 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
     if new_inames is None:
         new_inames = [None] * len(duplicate_inames)
 
+    if len(new_inames) != len(duplicate_inames):
+        raise ValueError("If given, the new_inames argument must have the "
+                "same length as duplicate_inames")
+
     temp_new_inames = []
     for old_iname, new_iname in zip(duplicate_inames, new_inames):
         if new_iname is None:
@@ -431,7 +430,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
         # {{{ decide what to do with each iname
 
-        forced_iname_deps = []
+        forced_iname_deps = set()
 
         from loopy.symbolic import IndexVariableFinder
         dependencies = IndexVariableFinder(
@@ -507,7 +506,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 force_dependency = kind == "g"
 
             if force_dependency:
-                forced_iname_deps.append(iname)
+                forced_iname_deps.add(iname)
 
         # }}}
 
@@ -604,7 +603,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
 # {{{ convenience
 
-def add_prefetch(kernel, var_name, fetch_dims=[]):
+def add_prefetch(kernel, var_name, fetch_dims=[], new_inames=None):
     used_cse_tags = set()
     def map_cse(expr, rec):
         used_cse_tags.add(expr.tag)
@@ -632,7 +631,8 @@ def add_prefetch(kernel, var_name, fetch_dims=[]):
         dtype = kernel.temporary_variables[var_name].dtype
 
     for cse_tag in new_cse_tags:
-        kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims)
+        kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims,
+                new_inames=new_inames)
 
     return kernel
 
diff --git a/loopy/check.py b/loopy/check.py
index 3717084dd9610024676d36d89c20648bf8fb1801..cb20950bce4d7ef0c099100f3b0f018ca3b46d41 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -210,6 +210,15 @@ def check_implemented_domains(kernel, implemented_domains):
 
 # }}}
 
+def run_automatic_checks(kernel):
+    """Run the four automatic checks of this module, in order, on *kernel*."""
+    # Return values are discarded; presumably each check raises on failure (TODO confirm).
+    check_for_double_use_of_hw_axes(kernel)
+    check_for_unused_hw_axes(kernel)
+    check_for_inactive_iname_access(kernel)
+    check_for_write_races(kernel)
+
+
 # {{{ user-invoked checks
 
 def get_problems(kernel, parameters):
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 4c31a30c05d73407553575182850131912c535c8..25d1ca77dbc703ca9b145bc0bceac17466f19d4c 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -214,7 +214,7 @@ class Instruction(Record):
         a :class:`LoopKernel`.
     :ivar assignee:
     :ivar expression:
-    :ivar forced_iname_deps: a list of inames that are added to the list of iname
+    :ivar forced_iname_deps: a set of inames that are added to the list of iname
         dependencies
     :ivar insn_deps: a list of ids of :class:`Instruction` instances that
         *must* be executed before this one. Note that loop scheduling augments this
@@ -233,9 +233,12 @@ class Instruction(Record):
     """
     def __init__(self,
             id, assignee, expression,
-            forced_iname_deps=[], insn_deps=[], boostable=None,
+            forced_iname_deps=set(), insn_deps=set(), boostable=None,
             temp_var_type=None, duplicate_inames_and_tags=[]):
 
+        assert isinstance(forced_iname_deps, set)
+        assert isinstance(insn_deps, set)
+
         Record.__init__(self,
                 id=id, assignee=assignee, expression=expression,
                 forced_iname_deps=forced_iname_deps,
@@ -452,7 +455,7 @@ class LoopKernel(Record):
             preamble=None, assumptions=None,
             iname_slab_increments={},
             temporary_variables={},
-            local_sizes=None,
+            local_sizes={},
             iname_to_tag={}, iname_to_tag_requests=None):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
@@ -523,17 +526,17 @@ class LoopKernel(Record):
                 else:
                     label = "insn"
                 if groups["insn_deps"] is not None:
-                    insn_deps = [dep.strip() for dep in groups["insn_deps"].split(",")]
+                    insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(","))
                 else:
-                    insn_deps = []
+                    insn_deps = set()
 
                 if groups["iname_deps_and_tags"] is not None:
                     inames_and_tags = parse_iname_and_tag_list(
                             groups["iname_deps_and_tags"])
-                    forced_iname_deps = [iname for iname, tag in inames_and_tags]
+                    forced_iname_deps = set(iname for iname, tag in inames_and_tags)
                     iname_to_tag_requests.update(dict(inames_and_tags))
                 else:
-                    forced_iname_deps = []
+                    forced_iname_deps = set()
 
                 if groups["duplicate_inames_and_tags"] is not None:
                     duplicate_inames_and_tags = parse_iname_and_tag_list(
@@ -566,7 +569,6 @@ class LoopKernel(Record):
         if len(set(insn.id for insn in insns)) != len(insns):
             raise RuntimeError("instruction ids do not appear to be unique")
 
-
         if assumptions is None:
             assumptions_space = domain.get_space().params()
             assumptions = isl.Set.universe(assumptions_space)
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index a06e28b4eff24e6451281e3b70c3a86f7697e59a..d7d02effe36ccb29720bb1388922105f748b39a4 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -57,7 +57,7 @@ def realize_reduction(kernel):
                 id=kernel.make_unique_instruction_id(
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
-                forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
+                forced_iname_deps=insn.all_inames() - set(expr.inames),
                 expression=expr.operation.neutral_element)
 
         new_insns.append(init_insn)
@@ -67,12 +67,12 @@ def realize_reduction(kernel):
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
-                insn_deps=[init_insn.id],
-                forced_iname_deps=list(insn.all_inames() | set(expr.inames)))
+                insn_deps=set([init_insn.id]) | insn.insn_deps,
+                forced_iname_deps=insn.all_inames() | set(expr.inames))
 
         new_insns.append(reduction_insn)
 
-        new_insn_insn_deps.append(reduction_insn.id)
+        new_insn_insn_deps.add(reduction_insn.id)
 
         return target_var
 
@@ -80,15 +80,15 @@ def realize_reduction(kernel):
     cb_mapper = ReductionCallbackMapper(map_reduction)
 
     for insn in kernel.instructions:
-        new_insn_insn_deps = []
+        new_insn_insn_deps = set()
 
         new_expression = cb_mapper(insn.expression)
 
         new_insn = insn.copy(
                     expression=new_expression,
                     insn_deps=insn.insn_deps
-                        + new_insn_insn_deps,
-                    forced_iname_deps=list(insn.all_inames()))
+                        | new_insn_insn_deps,
+                    forced_iname_deps=insn.all_inames())
 
         new_insns.append(new_insn)
 
@@ -149,7 +149,7 @@ def add_boostability_and_automatic_dependencies(kernel):
 
     new_insns = []
     for insn in kernel.instructions:
-        auto_deps = []
+        auto_deps = set()
 
         # {{{ add automatic dependencies
 
@@ -170,7 +170,7 @@ def add_boostability_and_automatic_dependencies(kernel):
                         % (var, insn.id))
 
             if len(var_writers) == 1:
-                auto_deps.extend(var_writers)
+                auto_deps.update(var_writers)
 
         # }}}
 
@@ -195,7 +195,7 @@ def add_boostability_and_automatic_dependencies(kernel):
 
         new_insns.append(
                 insn.copy(
-                    insn_deps=insn.insn_deps + auto_deps,
+                    insn_deps=insn.insn_deps | auto_deps,
                     boostable=boostable))
 
     # {{{ remove boostability from isns that access non-boostable vars
@@ -504,13 +504,6 @@ def preprocess_kernel(kernel):
     kernel = add_boostability_and_automatic_dependencies(kernel)
     kernel = adjust_local_temp_var_storage(kernel)
 
-    import loopy.check as chk
-
-    chk.check_for_double_use_of_hw_axes(kernel)
-    chk.check_for_unused_hw_axes(kernel)
-    chk.check_for_inactive_iname_access(kernel)
-    chk.check_for_write_races(kernel)
-
     return kernel
 
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 9b597e844b8731bf2c8544f0488a7aed8dc67bd0..bae8bb612c79977040cf60692bc7794b8ed574ae 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -431,6 +431,9 @@ def generate_loop_schedules(kernel, loop_priority=[]):
     from loopy.preprocess import preprocess_kernel
     kernel = preprocess_kernel(kernel)
 
+    from loopy.check import run_automatic_checks
+    run_automatic_checks(kernel)
+
     schedule_count = 0
 
     for gen_sched in generate_loop_schedules_internal(kernel, loop_priority):
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 265353abee68cd54a6e2882d6ee67d9bb3e363ff..23370ced982b89f4e02d7b58d9db025c64bbf661 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -586,7 +586,7 @@ def test_image_matrix_mul(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = a[i, k]*b[k, j]"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
@@ -633,12 +633,12 @@ def test_image_matrix_mul_ilp(ctx_factory):
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = get_suitable_size(ctx)
+    n = 2*get_suitable_size(ctx)
 
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = a[i, k]*b[k, j]"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
@@ -655,8 +655,12 @@ def test_image_matrix_mul_ilp(ctx_factory):
     knl = lp.split_dimension(knl, "k", 2)
     # conflict-free
     knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
-    knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
-    #knl = lp.add_prefetch(knl, 'b', [("j_inner_outer", "j_inner_inner"), "k_inner"])
+    knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"],
+            ["b_j_io", "b_j_ii", "b_k_i"])
+    knl = lp.join_dimensions(knl, ["b_j_io", "b_j_ii"])
+
+    #print lp.preprocess_kernel(knl)
+    #1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))