diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index ac383546bd737fbb139f87e2dcf7cea78b4f37f1..e8618098c299a16dee4cc8b063527f2e3d80ea02 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -40,7 +40,7 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context):
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 16)
+    knl = lp.split_dimension(knl, "k", 4)
     knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
     assert knl.get_invalid_reason() is None
@@ -92,15 +92,15 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context):
                 (c[i, j], a[i, k]*b[k, j])
                 ],
             [
-                lp.ArrayArg("a", dtype, shape=(n_sym, n_sym)),
-                lp.ArrayArg("b", dtype, shape=(n_sym, n_sym)),
-                lp.ArrayArg("c", dtype, shape=(n_sym, n_sym)),
+                lp.ArrayArg("a", dtype, shape=(n_sym, n_sym), order="F"),
+                lp.ArrayArg("b", dtype, shape=(n_sym, n_sym), order="F"),
+                lp.ArrayArg("c", dtype, shape=(n_sym, n_sym), order="F"),
                 lp.ScalarArg("n", np.int32, approximately=1000),
                 ], name="fancy_matmul")
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 16, inner_tag="unr1")
+    knl = lp.split_dimension(knl, "k", 4, inner_tag="unr1")
     knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
     knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])
     assert knl.get_invalid_reason() is None
@@ -217,4 +217,4 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        plain_matrix_mul()
+        fancy_matrix_mul()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 563bace7c06e62c6de9d801a561d9cb98a895de5..d27a3225bfe9bcc30b5b8342d452b881692262f1 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,6 +22,7 @@ register_mpz_with_pymbolic()
 
 
 
+# TODO: Try, fix reg. prefetch
 # TODO: Divisibility
 # TODO: nD Texture access
 # TODO: Functions
@@ -29,10 +30,12 @@ register_mpz_with_pymbolic()
 # TODO: Try different kernels
 # TODO:   - Tricky: Convolution, FD
 # TODO: Try, fix indirect addressing
+# TODO: ILP Unroll
+# TODO: User controllable switch for slab opt
+# TODO: User control over schedule
+# TODO: Condition hoisting
 
 # TODO: Custom reductions per red. axis
-# TODO: ILP Unroll
-# TODO: User controllable switch for 
 
 
 
@@ -282,9 +285,9 @@ def copy_constraint(cns, as_equality=None):
     return cast_constraint_to_space(cns, cns.get_dim(),
             as_equality=as_equality)
 
-def get_dim_bounds(set):
+def get_dim_bounds(set, inames):
     vars = set.get_dim().get_var_dict(dim_type.set).keys()
-    return [get_projected_bounds(set, v) for v in vars]
+    return [get_projected_bounds(set, v) for v in inames]
 
 def count_box_from_bounds(bounds):
     from pytools import product
@@ -818,7 +821,7 @@ class PrefetchDescriptor(Record):
     @property
     @memoize_method
     def dim_bounds(self):
-        return get_dim_bounds(self.domain)
+        return get_dim_bounds(self.domain, self.inames)
 
     @property
     def itemsize(self):
@@ -1819,13 +1822,13 @@ def generate_code(kernel):
         # below to avoid bank conflicts
         from pytools import product
         other_dim_sizes = (pf.itemsize
-                * product(dim_storage_lengths[1:]))
+                * product(dim_storage_lengths[:-1]))
 
         from pyopencl.characterize import usable_local_mem_size
-        if (dim_storage_lengths[0] % 2 == 0
-                and other_pf_sizes+other_dim_sizes*(dim_storage_lengths[0]+1)
+        if (dim_storage_lengths[-1] % 2 == 0
+                and other_pf_sizes+other_dim_sizes*(dim_storage_lengths[-1]+1)
                 < usable_local_mem_size(kernel.device)):
-            dim_storage_lengths[0] += 1
+            dim_storage_lengths[-1] += 1
 
         new_prefetch[pf.input_vector, pf.index_expr] = \
                 pf.copy(dim_storage_lengths=dim_storage_lengths,
@@ -1908,7 +1911,7 @@ def generate_code(kernel):
 
     for pf in kernel.prefetch.itervalues():
         smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name)
-        for l in pf.dim_storage_lengths[::-1]:
+        for l in pf.dim_storage_lengths:
             smem_pf_array = ArrayOf(smem_pf_array, l)
         body.append(CLLocal(smem_pf_array))