diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index e76acc2e8c0a6ad9c90ff13460726c50ead5c0dc..e497928e65c2e86bf40b2c85aefc35d29506903e 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -10,12 +10,17 @@ import loopy as lp
 FAST_OPTIONS = ["-cl-mad-enable", "-cl-fast-relaxed-math", 
         "-cl-no-signed-zeros", "-cl-strict-aliasing"]
 
-def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, order="C", ran_factor=1, od=0):
+def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, 
+        order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0):
     if isinstance(shape, int):
         shape = (shape, shape)
+    l = max(shape)
+    eye_ish = id_factor*np.eye(l, k=od)
+    if inc_factor:
+        eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l)
     ary = np.asarray(
         ran_factor*np.random.randn(*shape)
-        + 5*np.eye(max(shape), k=od)[:shape[0], :shape[1]],
+        + eye_ish[:shape[0], :shape[1]],
         dtype=dtype, order=order)
 
     return cl_array.to_device(queue, ary)
@@ -27,9 +32,12 @@ DO_CHECK = True
 
 DEBUG_PREAMBLE = r"""
     #pragma OPENCL EXTENSION cl_amd_printf: enable
-    #define IFDIAG if (i_outer*16+i_inner == j_outer*16+j_inner)
-    #define TST(S) IFDIAG if (i_outer*16+i_inner == 0) \
-            printf("ko=%d ki=%d" #S "\n", k_outer, k_inner);
+    #define MY_J (j_outer*64+j_inner_outer*16+j_inner_inner)
+    #define MY_I (i_outer*16+i_inner)
+    #define IFDIAG if (MY_I == MY_J)
+    #define TST(S) if (MY_J == 144 && MY_I == 16-48) \
+            for (int aa = 0; aa < 16: ++ab) \
+                for (int bb = 0; bb < 16: ++bb) 
     """
 
 
@@ -38,20 +46,21 @@ DEBUG_PREAMBLE = r"""
 def check_error(refsol, sol):
     rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro")
     if DO_CHECK and rel_err > 1e-5:
-        if 1:
+        if 0:
             import matplotlib.pyplot as pt
             pt.imshow(refsol-sol)
             pt.colorbar()
             pt.show()
-        elif 0:
+        elif 1:
             print "---------------------------"
             print "ACTUAL"
             print "---------------------------"
-            print sol
+            np.set_printoptions(threshold=1000000, linewidth=200)
+            print sol[:,-16:]
             print "---------------------------"
             print "CORRECT"
             print "---------------------------"
-            print refsol
+            print refsol[-16:,-16:]
         raise RuntimeError("check failed, rel err=%g" % rel_err)
 
 
@@ -168,101 +177,6 @@ def image_matrix_mul(ctx_factory=cl.create_some_context):
             force_rebuild=True)
 
 
-def dg_matrix_mul(ctx_factory=cl.create_some_context):
-    dtype = np.float32
-    ctx = ctx_factory()
-    order = "C"
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-    Np = 84
-    Np_padded = 96
-    K = 20000
-    dim = 3
-    num_flds = 2
-
-    from pymbolic import var
-    fld = var("fld")
-    matrix_names = ["d%d" % i for i in range(dim)]
-    i, j, k = [var(s) for s in "i j k".split()]
-
-    fld_strides = (1, Np_padded)
-
-    knl = lp.LoopKernel(ctx.devices[0],
-            "{[i,j,k]: 0<=i,j< %d and 0<=k<%d}" % (Np, K),
-            [
-                (var(mn+"fld%d" % ifld)[i, k], 
-                    var(mn)[i, j]*var("fld%d" % ifld)[j, k])
-                for mn in matrix_names
-                for ifld in range(num_flds)
-                ],
-            [lp.ImageArg(mn, dtype, 2) for mn in matrix_names]
-            + [lp.ArrayArg("fld%d" % ifld, dtype,
-                strides=fld_strides)
-                for ifld in range(num_flds)
-                ]
-            + [lp.ArrayArg(mn+"fld%d" % ifld, dtype,
-                strides=fld_strides)
-                for ifld in range(num_flds)
-                for mn in matrix_names
-                ],
-            name="dg_matmul")
-
-    ilp = 4
-    knl = lp.split_dimension(knl, "i", 30, 32, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 16*ilp, outer_tag="g.1")
-    knl = lp.split_dimension(knl, "k_inner", 16, outer_tag="ilp", inner_tag="l.1")
-
-    assert Np % 2 == 0
-    #knl = lp.split_dimension(knl, "j", Np//2)
-    #knl = lp.split_dimension(knl, "k", 32)
-
-    #for mn in matrix_names:
-        #knl = lp.add_prefetch(knl, mn, ["j", "i_inner"])
-    for ifld in range(num_flds):
-        knl = lp.add_prefetch(knl, 'fld%d' % ifld,
-                ["k_inner_outer", "k_inner_inner", "j"])
-    assert knl.get_invalid_reason() is None
-
-    kernel_gen = list(lp.insert_register_prefetches(knl)
-            for knl in lp.generate_loop_schedules(knl))[:1]
-
-    matrices = [
-            make_well_conditioned_dev_matrix(queue, Np, dtype=dtype, order="C")
-            for mn in matrix_names]
-    flds = [
-            make_well_conditioned_dev_matrix(queue, (Np_padded, K), dtype=dtype, order="F")
-            for ifld in range(num_flds)]
-    outputs = [cl_array.empty_like(flds[0])
-            for ifld in range(num_flds)
-            for mn in matrix_names]
-
-    ref_soln = [np.dot(mat.get(), fld.get()[:Np]) 
-            for fld in flds
-            for mat in matrices]
-
-    mat_images = [
-            cl.image_from_array(ctx, mat.get(), 1) for mat in matrices]
-
-    def launcher(kernel, gsize, lsize, check):
-        args = mat_images + [fld.data for fld in flds] + [out.data for out in outputs]
-        kwargs = dict(g_times_l=True)
-        evt = kernel(queue, gsize(), lsize(), *args, g_times_l=True)
-
-        if check:
-            for out, ref in zip(outputs, ref_soln):
-                check_error(ref, out.get()[:Np])
-
-        return evt
-
-    lp.drive_timing_run(kernel_gen, queue, launcher, num_flds*dim*2*(Np**2)*K,
-            options=FAST_OPTIONS + ["-cl-nv-verbose"],
-            force_rebuild=True, #, edit=True
-            print_code=False
-            )
-
-
-
 
 
 def image_matrix_mul_ilp(ctx_factory=cl.create_some_context):
@@ -272,7 +186,7 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context):
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = 16*100
+    n = 16*10
     from pymbolic import var
     a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
@@ -288,24 +202,28 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context):
                 #lp.ArrayArg("b", dtype, shape=(n, n), order=order),
                 lp.ArrayArg("c", dtype, shape=(n, n), order=order),
                 ],
-            name="matmul")
+            name="matmul", preamble=DEBUG_PREAMBLE)
 
-    ilp = 8
+    ilp = 4
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", ilp*16, outer_tag="g.1")
     knl = lp.split_dimension(knl, "j_inner", 16, outer_tag="ilp", inner_tag="l.0")
     knl = lp.split_dimension(knl, "k", 32)
     # conflict-free
     knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
-    knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
-    assert knl.get_invalid_reason() is None
+    #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
+    inv_reason = knl.get_invalid_reason()
+    assert inv_reason is None, inv_reason
 
     kernel_gen = (lp.insert_register_prefetches(knl)
             for knl in lp.generate_loop_schedules(knl))
 
-    a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
+    a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+            ran_factor=0, id_factor=1)
+    b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+            ran_factor=0, id_factor=0, inc_factor=1000)
     c = cl_array.empty_like(a)
+    c.fill(-17)
     refsol = np.dot(a.get(), b.get())
     a_img = cl.image_from_array(ctx, a.get(), 1)
     b_img = cl.image_from_array(ctx, b.get(), 1)
@@ -321,7 +239,7 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context):
 
     lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
             options=FAST_OPTIONS,# + ["-cl-nv-verbose"],
-            force_rebuild=True, edit_code=False)
+            force_rebuild=True, edit_code=True)
 
 
 
@@ -386,6 +304,103 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context):
 
 
 
+def dg_matrix_mul(ctx_factory=cl.create_some_context):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    Np = 84
+    Np_padded = 96
+    K = 20000
+    dim = 3
+    num_flds = 2
+
+    from pymbolic import var
+    fld = var("fld")
+    matrix_names = ["d%d" % i for i in range(dim)]
+    i, j, k = [var(s) for s in "i j k".split()]
+
+    fld_strides = (1, Np_padded)
+
+    knl = lp.LoopKernel(ctx.devices[0],
+            "{[i,j,k]: 0<=i,j< %d and 0<=k<%d}" % (Np, K),
+            [
+                (var(mn+"fld%d" % ifld)[i, k], 
+                    var(mn)[i, j]*var("fld%d" % ifld)[j, k])
+                for mn in matrix_names
+                for ifld in range(num_flds)
+                ],
+            [lp.ImageArg(mn, dtype, 2) for mn in matrix_names]
+            + [lp.ArrayArg("fld%d" % ifld, dtype,
+                strides=fld_strides)
+                for ifld in range(num_flds)
+                ]
+            + [lp.ArrayArg(mn+"fld%d" % ifld, dtype,
+                strides=fld_strides)
+                for ifld in range(num_flds)
+                for mn in matrix_names
+                ],
+            name="dg_matmul")
+
+    ilp = 4
+    knl = lp.split_dimension(knl, "i", 30, 32, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.split_dimension(knl, "k", 16*ilp, outer_tag="g.1")
+    knl = lp.split_dimension(knl, "k_inner", 16, outer_tag="ilp", inner_tag="l.1")
+
+    assert Np % 2 == 0
+    #knl = lp.split_dimension(knl, "j", Np//2)
+    #knl = lp.split_dimension(knl, "k", 32)
+
+    #for mn in matrix_names:
+        #knl = lp.add_prefetch(knl, mn, ["j", "i_inner"])
+    for ifld in range(num_flds):
+        knl = lp.add_prefetch(knl, 'fld%d' % ifld,
+                ["k_inner_outer", "k_inner_inner", "j"])
+    assert knl.get_invalid_reason() is None
+
+    kernel_gen = list(lp.insert_register_prefetches(knl)
+            for knl in lp.generate_loop_schedules(knl))[:1]
+
+    matrices = [
+            make_well_conditioned_dev_matrix(queue, Np, dtype=dtype, order="C")
+            for mn in matrix_names]
+    flds = [
+            make_well_conditioned_dev_matrix(queue, (Np_padded, K), dtype=dtype, order="F")
+            for ifld in range(num_flds)]
+    outputs = [cl_array.empty_like(flds[0])
+            for ifld in range(num_flds)
+            for mn in matrix_names]
+
+    ref_soln = [np.dot(mat.get(), fld.get()[:Np]) 
+            for fld in flds
+            for mat in matrices]
+
+    mat_images = [
+            cl.image_from_array(ctx, mat.get(), 1) for mat in matrices]
+
+    def launcher(kernel, gsize, lsize, check):
+        args = mat_images + [fld.data for fld in flds] + [out.data for out in outputs]
+        kwargs = dict(g_times_l=True)
+        evt = kernel(queue, gsize(), lsize(), *args, g_times_l=True)
+
+        if check:
+            for out, ref in zip(outputs, ref_soln):
+                check_error(ref, out.get()[:Np])
+
+        return evt
+
+    lp.drive_timing_run(kernel_gen, queue, launcher, num_flds*dim*2*(Np**2)*K,
+            options=FAST_OPTIONS + ["-cl-nv-verbose"],
+            force_rebuild=True, #, edit=True
+            print_code=False
+            )
+
+
+
+
+
 def main_elwise_scaled_matrix_mul():
     Np = 64
     K = 2000
@@ -469,4 +484,4 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        plain_matrix_mul()
+        image_matrix_mul_ilp()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 570bdd089b2e29963d9ab2e49dad9f981b707050..f4d47e2c7b997ea7870651e9f808659f4be692c4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,6 +22,7 @@ register_mpz_with_pymbolic()
 
 
 
+# TODO: Constant memory
 # TODO: Reuse of previously split dimensions for prefetch
 #   (Or general merging)
 # TODO: ILP Unroll
@@ -31,6 +32,8 @@ register_mpz_with_pymbolic()
 #     ILP must be outside of reduction loops
 #     Therfore, there are prefetches inside ILPs
 # TODO: Debug 1 ILP
+# FIXME: Random equality constraints
+# TODO: Use increment for ILP?
 
 # TODO: Try, fix reg. prefetch (DG example) / CSEs
 # TODO: Custom reductions per red. axis