diff --git a/loopy/__init__.py b/loopy/__init__.py
index 64f426dc9a54736811b4f3ed18eb2a565fa36efe..390f9996e36e4b4195eb167e30e430512c6fc623 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -10,9 +10,14 @@ register_mpz_with_pymbolic()
 
 
 
+# Immediately:
+# ------------
+# TODO: Imitate codegen bulk slab handling in bulk slab trials
 
+# For writeup:
+# ------------
 # TODO: Try, fix reg. prefetch (DG example) / CSEs
-#   ILP and reg. prefetch (might) interact!
+#   ILP and reg. prefetch interact!
 # TODO: Custom reductions per red. axis
 # TODO: Functions
 # TODO: Common subexpressions
@@ -21,16 +26,20 @@ register_mpz_with_pymbolic()
 # FIXME: write names should be assigned during scheduling
 
 # TODO: Divisibility
-# TODO: Try different kernels
-# TODO:   - Tricky: Convolution, FD
 # TODO: Try, fix indirect addressing
 # TODO: More user control for slab opt
-# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
 
+# TODO: Implement GT200 matmul, Fermi matmul, DG
+# TODO: DMA engine threads?
+
+# Later:
+# ------
+# TODO: Try different kernels
+# TODO:   - Tricky: Convolution, FD
+# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
 # TODO: implement efficient ceil_div? (as opposed to floor_div)
 # TODO: why are corner cases inefficient?
 # TODO: Use gists (why do disjoint sets arise?)
-# TODO: Imitate codegen bulk slab handling in bulk slab trials
 
 
 
diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py
index 872aeca776c64953788099a61137c81953c1a5b0..7c799a84c3b5026ac12a4f5d6e4be5353aa1cf65 100644
--- a/loopy/codegen/loop_dim.py
+++ b/loopy/codegen/loop_dim.py
@@ -141,7 +141,14 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain):
     assert lower_kind == ">="
     assert upper_kind == "<"
 
-    success, length = kernel.domain.project_out_except([iname], [dim_type.set]).count()
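+    # Project everything but this iname out of the domain, parameters
+    # included, and strip divs so isl can count the resulting bounded set.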
+    proj_domain = (kernel.domain
+            .project_out_except([iname], [dim_type.set])
+            .project_out_except([], [dim_type.param])
+            .remove_divs())
+    assert proj_domain.is_bounded()
+    success, length = proj_domain.count()
     assert success == 0
 
     def generate_idx_eq_slabs():
diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py
index e7d880cf287da5b8da5820d73cb578e61140f648..22af66d33b52e22d4d877440a60eb3af2619da59 100644
--- a/loopy/codegen/prefetch.py
+++ b/loopy/codegen/prefetch.py
@@ -413,11 +413,13 @@ def generate_prefetch_code(cgs, kernel, sched_index, exec_domain):
             iname_lwr, iname_upr = pf.dim_bounds_by_iname[iname]
             new_block.append(Comment("      %s [%d..%d)" % (iname, iname_lwr, iname_upr)))
         new_block.append(Comment("    using:"))
-        for realiz_iname in realiz_inames:
 
-            if realiz_iname is None:
-                new_block.append(Comment("      loop"))
-            else:
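+        # realiz_inames is None when the whole fetch footprint is realized
+        # by a sequential loop rather than by individual inames.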
+        if realiz_inames is None:
+            new_block.append(Comment("      loop"))
+        else:
+            for realiz_iname in realiz_inames:
                 rd_iname_descr = "loop"
                 iname_lwr, iname_upr, iname_eq = flnd.kernel.get_bounds(realiz_iname, (realiz_iname,),
                         allow_parameters=False)
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 63d82d7ff462e3c28cb7a314f2b79a4ea311ddeb..1c6210f0d9c5c5e10636ae718ae04707c474693b 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -79,6 +79,71 @@
 
 
 
+def test_axpy(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    n = get_suitable_size(ctx)
+    from pymbolic import var
+    x, y, z, n_sym, i = [var(s) for s in "xyzni"]
+
+    n_approx = 10**6
+
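+    # Vector-add kernel over the parametric domain 0 <= i < n.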
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[n] -> {[i]: 0<=i<n}",
+            [
+                (z[i], x[i]+y[i]) # FIXME: Add scalar coefficients for a true axpy
+                ],
+            [
+                lp.ArrayArg("x", dtype, shape=(n,)),
+                lp.ArrayArg("y", dtype, shape=(n,)),
+                lp.ArrayArg("z", dtype, shape=(n,)),
+                lp.ScalarArg("n", np.int32, approximately=n_approx),
+                ],
+            name="axpy")
+
+    unroll = 16
+    block_size = 256
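+    # First split: each workgroup (g.0) covers unroll*block_size elements.
+    # Second split: within a group, i_inner_outer is unrolled ("unr") and
+    # i_inner_inner becomes the local work-item axis ("l.0").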
+    knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0")
+    knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")
+    assert knl.get_problems({"n": n_approx})[0] <= 2
+
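+    # Enumerate loop schedules for the kernel and add register prefetching
+    # to each candidate before timing them.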
+    kernel_gen = (lp.insert_register_prefetches(knl)
+            for knl in lp.generate_loop_schedules(knl))
+
+    x_dev = cl_array.to_device(queue, np.random.randn(n).astype(dtype))
+    y_dev = cl_array.to_device(queue, np.random.randn(n).astype(dtype))
+    z_dev = cl_array.empty_like(x_dev)
+    refsol = x_dev.get() + y_dev.get()
+
+    def launcher(kernel, gsize, lsize, check):
+        # Launch signature mirrors the matmul tests below; passing the
+        # scalar n last is an assumption for the parametric domain.
+        evt = kernel(queue, gsize(), lsize(), x_dev.data, y_dev.data,
+                z_dev.data, n, g_times_l=True)
+
+        if check:
+            check_error(refsol, z_dev.get())
+
+        return evt
+
+    # z = x + y performs one add per element: n flops per run.
+    lp.drive_timing_run(kernel_gen, queue, launcher, n,
+            edit_code=True)
+
+
+
+
+
 def test_plain_matrix_mul(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
@@ -104,7 +161,7 @@ def test_plain_matrix_mul(ctx_factory):
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 4)
+    knl = lp.split_dimension(knl, "k", 16)
     knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
     assert knl.get_problems({})[0] <= 2