diff --git a/MEMO b/MEMO
index facaab5a845f999aadc7c4e2905ab776707cb6f8..ea2317ab2b8c5f8b4dc46dd1ed39c86d56c4c142 100644
--- a/MEMO
+++ b/MEMO
@@ -56,7 +56,8 @@ Things to consider
 - Parallel dimension splitting/merging via tags
   -> unnecessary?
 
-- All user-supplied commands are assumed to be idempotent.
+- Not using all hw loop dimensions causes an error, as
+  is the case for variant 3 in the rank_one test.
 
 TODO
 ^^^^
@@ -83,6 +84,9 @@ TODO
 - Better for loop bound generation
   -> Try a triangular loop
 
+- Nested slab decomposition (in conjunction with conditional hoisting) could
+  generate nested conditional code.
+
 Dealt with
 ^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 784bc99cf1d900e177e6d612c2e5d1338d6d9c75..86c0986c026616bdef457e7a351876aa319d5507 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -38,14 +38,11 @@ from loopy.compiled import CompiledKernel, drive_timing_run
 def split_dimension(kernel, iname, inner_length, padded_length=None,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
-        outer_slab_increments=(0, -1), no_slabs=None):
+        slabs=(0, 0)):
 
     if iname not in kernel.all_inames():
         raise ValueError("cannot split loop for unknown variable '%s'" % iname)
 
-    if no_slabs:
-        outer_slab_increments = (0, 0)
-
     if padded_length is not None:
         inner_tag = inner_tag.copy(forced_length=padded_length)
 
@@ -115,7 +112,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
     # }}}
 
     iname_slab_increments = kernel.iname_slab_increments.copy()
-    iname_slab_increments[outer_iname] = outer_slab_increments
+    iname_slab_increments[outer_iname] = slabs
     result = (kernel
             .copy(domain=new_domain,
                 iname_slab_increments=iname_slab_increments,
@@ -321,8 +318,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 id=kernel.make_unique_instruction_id(based_on=cse_tag),
                 assignee=assignee,
                 expression=new_inner_expr,
-                forced_iname_deps=forced_iname_deps,
-                idempotent=True)
+                forced_iname_deps=forced_iname_deps)
 
         cse_result_insns.append(new_insn)
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 0d9ea7cf05f35565697f9d10e780709ef7f70d26..d9db977807448348c4656342df79390e2f4e2f8e 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -224,10 +224,8 @@ class Instruction(Record):
         dependencies) without changing the meaning of the program.
     """
     def __init__(self,
-            id, assignee, expression, idempotent,
-            forced_iname_deps=[], insn_deps=[]):
-
-        assert isinstance(idempotent, bool)
+            id, assignee, expression,
+            forced_iname_deps=[], insn_deps=[], idempotent=None):
 
         Record.__init__(self,
                 id=id, assignee=assignee, expression=expression,
@@ -258,6 +256,15 @@ class Instruction(Record):
         result = "%s: %s <- %s\n    [%s]" % (self.id,
                 self.assignee, self.expression, ", ".join(sorted(self.all_inames())))
 
+        if self.idempotent == True:
+            result += " (idempotent)"
+        elif self.idempotent == False:
+            result += " (not idempotent)"
+        elif self.idempotent is None:
+            result += " (idempotence unknown)"
+        else:
+            raise RuntimeError("unexpected value for Instruction.idempotent")
+
         if self.insn_deps:
             result += "\n    : " + ", ".join(self.insn_deps)
 
@@ -450,8 +457,7 @@ class LoopKernel(Record):
                     id=self.make_unique_instruction_id(insns, based_on=label),
                     insn_deps=insn_deps,
                     forced_iname_deps=forced_iname_deps,
-                    assignee=lhs, expression=rhs,
-                    idempotent=True)
+                    assignee=lhs, expression=rhs)
 
         if isinstance(domain, str):
             ctx = isl.Context()
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 3d7a329cede167ff0c2e8c4f9ace40aa8bc2aa38..cf146782bf59945a857673aef566cfee67918f40 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -58,8 +58,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
                 forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
-                expression=expr.operation.neutral_element,
-                idempotent=True)
+                expression=expr.operation.neutral_element)
 
         new_insns.append(init_insn)
 
@@ -69,8 +68,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
                 insn_deps=[init_insn.id],
-                forced_iname_deps=list(insn.all_inames()),
-                idempotent=False)
+                forced_iname_deps=list(insn.all_inames()))
 
         new_insns.append(reduction_insn)
 
@@ -210,10 +208,10 @@ def check_for_unused_hw_axes(kernel):
                 raise RuntimeError("auto local tag encountered")
 
         if group_axes != group_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw group axes"
+            raise RuntimeError("instruction '%s' does not use all group hw axes"
                     % insn.id)
         if local_axes != local_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw local axes"
+            raise RuntimeError("instruction '%s' does not use all local hw axes"
                     % insn.id)
 
 
@@ -305,53 +303,65 @@ def adjust_local_temp_var_storage(kernel):
 
 # }}}
 
-# {{{ automatic dependencies
+# {{{ automatic dependencies, find idempotent instructions
 
-def find_writers(kernel):
+def find_accessors(kernel, readers):
     """
     :return: a dict that maps variable names to ids of insns that
-        write to that variable.
+        read (if *readers* is true) or write to that variable.
     """
-    writer_insn_ids = {}
+    result = {}
 
-    admissible_write_vars = (
+    admissible_vars = (
             set(arg.name for arg in kernel.args)
             | set(kernel.temporary_variables.iterkeys()))
 
     for insn in kernel.instructions:
-        var_name = insn.get_assignee_var_name()
-
-        if var_name not in admissible_write_vars:
-            raise RuntimeError("writing to '%s' is not allowed" % var_name)
-
-        writer_insn_ids.setdefault(var_name, set()).add(insn.id)
+        if readers:
+            from loopy.symbolic import DependencyMapper
+            deps = DependencyMapper(composite_leaves=False)(insn.expression)
+            var_names = set(dep.name for dep in deps) & admissible_vars
+        else:
+            var_name = insn.get_assignee_var_name()
+            if var_name not in admissible_vars:
+                raise RuntimeError("writing to '%s' is not allowed" % var_name)
+            var_names = [var_name]
 
-    return writer_insn_ids
+        for var_name in var_names:
+            result.setdefault(var_name, set()).add(insn.id)
 
+    return result
 
 
 
 
-def add_automatic_dependencies(kernel):
-    writer_map = find_writers(kernel)
+def add_idempotence_and_automatic_dependencies(kernel):
+    writer_map = find_accessors(kernel, readers=False)
 
     arg_names = set(arg.name for arg in kernel.args)
 
     var_names = arg_names | set(kernel.temporary_variables.iterkeys())
 
     from loopy.symbolic import DependencyMapper
-    dep_map = DependencyMapper(composite_leaves=False)
-    new_insns = []
+    dm = DependencyMapper(composite_leaves=False)
+    dep_map = {}
+
     for insn in kernel.instructions:
-        read_vars = (
-                set(var.name for var in dep_map(insn.expression)) 
+        dep_map[insn.id] = (
+                set(var.name for var in dm(insn.expression))
                 & var_names)
 
+    new_insns = []
+    for insn in kernel.instructions:
         auto_deps = []
-        for var in read_vars:
+
+        # {{{ add automatic dependencies
+        all_my_var_writers = set()
+        for var in dep_map[insn.id]:
             var_writers = writer_map.get(var, set())
+            all_my_var_writers |= var_writers
 
-            if not var_writers and var not in var_names:
+            if not var_writers and var not in arg_names:
                 from warnings import warn
                 warn("'%s' is read, but never written." % var)
 
@@ -365,9 +375,26 @@ def add_automatic_dependencies(kernel):
             if len(var_writers) == 1:
                 auto_deps.extend(var_writers)
 
+        # }}}
+
+        # {{{ find dependency loops, flag idempotence
+
+        while True:
+            last_all_my_var_writers = all_my_var_writers
+
+            for writer_insn_id in last_all_my_var_writers:
+                for var in dep_map[writer_insn_id]:
+                    all_my_var_writers = all_my_var_writers | writer_map.get(var, set())
+
+            if last_all_my_var_writers == all_my_var_writers:
+                break
+
+        # }}}
+
         new_insns.append(
                 insn.copy(
-                    insn_deps=insn.insn_deps + auto_deps))
+                    insn_deps=insn.insn_deps + auto_deps,
+                    idempotent=insn.id not in all_my_var_writers))
 
     return kernel.copy(instructions=new_insns)
 
@@ -514,7 +541,7 @@ def assign_automatic_axes(kernel, only_axis_0=True):
                 from loopy import split_dimension
                 return assign_automatic_axes(
                         split_dimension(kernel, iname, inner_length=local_size[axis],
-                            outer_tag=UnrollTag(), inner_tag=new_tag, no_slabs=True),
+                            outer_tag=UnrollTag(), inner_tag=new_tag),
                         only_axis_0=only_axis_0)
 
         new_iname_to_tag = kernel.iname_to_tag.copy()
@@ -613,7 +640,7 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
     for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
-        if insn.idempotent:
+        if insn.idempotent == True:
             # If insn is idempotent, it may be placed inside a more deeply
             # nested loop without harm.
 
@@ -621,7 +648,8 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
                     insn.all_inames() - parallel_inames
                     <=
                     active_inames - parallel_inames)
-        else:
+
+        elif insn.idempotent == False:
             # If insn is not idempotent, we must insist that it is placed inside
             # the exactly correct set of loops.
 
@@ -630,6 +658,10 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
                     ==
                     active_inames - parallel_inames)
 
+        else:
+            raise RuntimeError("instruction '%s' has undetermined idempotence"
+                    % insn.id)
+
         if (iname_deps_satisfied
                 and set(insn.insn_deps) <= scheduled_insn_ids):
             scheduled_insn_ids.add(insn.id)
@@ -782,7 +814,7 @@ def insert_barriers(kernel, schedule, level=0):
 
             # {{{ issue dependency-based barriers for this instruction
 
-            if insn.id in owed_barriers:
+            if set(insn.insn_deps) & owed_barriers:
                 issue_barrier(is_pre_barrier=False)
 
             # }}}
@@ -827,7 +859,7 @@ def generate_loop_schedules(kernel):
     # }}}
 
     kernel = assign_automatic_axes(kernel)
-    kernel = add_automatic_dependencies(kernel)
+    kernel = add_idempotence_and_automatic_dependencies(kernel)
     kernel = adjust_local_temp_var_storage(kernel)
 
     check_for_double_use_of_hw_axes(kernel)
diff --git a/test/test_matmul.py b/test/test_matmul.py
index c6797c6bc3b654ce13e8bfa2067d586240901871..7cb8ac4e709ce425fd0c3e50e6ec31d3a60a728a 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -1,3 +1,5 @@
+from __future__ import division
+
 import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
@@ -214,16 +216,16 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
             name="matmul", assumptions="n >= 16")
 
     knl = lp.split_dimension(knl, "i", 16,
-            outer_tag="g.0", inner_tag="l.1", no_slabs=True)
+            outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 8,
-            outer_tag="g.1", inner_tag="l.0", no_slabs=True)
-    knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
+            outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_dimension(knl, "k", 32)
 
     knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"])
     knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -251,8 +253,7 @@ def test_rank_one(ctx_factory):
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = int(get_suitable_size(ctx)**(3/2))
-    print n
+    n = int(get_suitable_size(ctx)**(2.7/2))
 
     knl = lp.LoopKernel(ctx.devices[0],
             "[n] -> {[i,j]: 0<=i,j<n}",
@@ -267,33 +268,71 @@ def test_rank_one(ctx_factory):
                 ],
             name="rank_one", assumptions="n >= 16")
 
-    #knl = lp.split_dimension(knl, "i", 16,
-            #outer_tag="g.0", inner_tag="l.1", no_slabs=True)
-    #knl = lp.split_dimension(knl, "j", 8,
-            #outer_tag="g.1", inner_tag="l.0", no_slabs=True)
-    #knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
-
-    knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"])
-    knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"])
-
-    kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
-
-    a = cl_random.rand(queue, n, dtype=dtype)
-    b = cl_random.rand(queue, n, dtype=dtype)
-    refsol = a.get()[:, np.newaxis] * b.get()
-    c = cl_array.empty(queue, refsol.shape, refsol.dtype)
-
-    def launcher(kernel, gsize, lsize, check):
-        evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
-                g_times_l=True)
-
-        if check:
-            check_error(refsol, c.get())
-
-        return evt
-
-    lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
+    def variant_1(knl):
+        knl = lp.realize_cse(knl, "a", dtype)
+        knl = lp.realize_cse(knl, "b", dtype)
+        return knl
+
+    def variant_2(knl):
+        knl = lp.split_dimension(knl, "i", 16,
+                outer_tag="g.0", inner_tag="l.0")
+        knl = lp.split_dimension(knl, "j", 16,
+                outer_tag="g.1", inner_tag="l.1")
+
+        knl = lp.realize_cse(knl, "a", dtype)
+        knl = lp.realize_cse(knl, "b", dtype)
+        return knl
+
+    def variant_3(knl):
+        knl = lp.split_dimension(knl, "i", 16,
+                outer_tag="g.0", inner_tag="l.0")
+        knl = lp.split_dimension(knl, "j", 16,
+                outer_tag="g.1", inner_tag="l.1")
+
+        knl = lp.realize_cse(knl, "a", dtype, ["i_inner"])
+        knl = lp.realize_cse(knl, "b", dtype, ["j_inner"])
+        return knl
+
+    def variant_4(knl):
+        knl = lp.split_dimension(knl, "i", 256,
+                outer_tag="g.0", slabs=(0, -1))
+        knl = lp.split_dimension(knl, "j", 256,
+                outer_tag="g.1", slabs=(0, -1))
+
+        knl = lp.realize_cse(knl, "a", dtype, ["i_inner"])
+        knl = lp.realize_cse(knl, "b", dtype, ["j_inner"])
+
+        knl = lp.split_dimension(knl, "i_inner", 16,
+                inner_tag="l.0")
+        knl = lp.split_dimension(knl, "j_inner", 16,
+                inner_tag="l.1")
+
+        knl = lp.split_dimension(knl, "j_inner_0", 16,
+                outer_tag="l.1", inner_tag="l.0")
+        knl = lp.split_dimension(knl, "i_inner_0", 16,
+                outer_tag="l.1", inner_tag="l.0")
+        return knl
+
+    #for variant in [variant_1, variant_2, variant_3]:
+    for variant in [variant_4]:
+        kernel_gen = lp.generate_loop_schedules(variant(knl))
+        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+
+        a = cl_random.rand(queue, n, dtype=dtype)
+        b = cl_random.rand(queue, n, dtype=dtype)
+        refsol = a.get()[:, np.newaxis] * b.get()
+        c = cl_array.empty(queue, refsol.shape, refsol.dtype)
+
+        def launcher(kernel, gsize, lsize, check):
+            evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
+                    g_times_l=True)
+
+            if check:
+                check_error(refsol, c.get())
+
+            return evt
+
+        lp.drive_timing_run(kernel_gen, queue, launcher, n**2)