diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index eb1ffb7bf87766a92e5eac03f659f347c787eda0..e1f81b878f6789b16e6a284656e04d0e67235114 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -136,12 +136,9 @@ def image_matrix_mul(ctx_factory=cl.create_some_context):
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_dimension(knl, "k", 32)
-    # slow, but conflict-free
-    #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
-    #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"])
-    # fast
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"])
-    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner", ])
+    # conflict-free
+    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
+    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"])
     assert knl.get_invalid_reason() is None
 
     kernel_gen = (lp.insert_register_prefetches(knl)
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 78ee835a2923bba1d7aeda6d3acb7b9b5ed89959..07c2450a68ef2cb3adcd0cf29f9aaf11f2d9b3ee 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -38,6 +38,9 @@ register_mpz_with_pymbolic()
 
 # TODO: Custom reductions per red. axis
 
+# TODO Tim: implement efficient div_ceil?
+# TODO Tim: why are corner cases inefficient?
+
 
 
 
@@ -1873,6 +1876,7 @@ def preprocess_prefetch(kernel):
         min_mult = cl_char.local_memory_bank_count(kernel.device)
         good_incr = None
         new_dsl = dim_storage_lengths
+        min_why_not = None
 
         for increment in range(dim_storage_lengths[-1]//2):
 
@@ -1884,21 +1888,22 @@ def preprocess_prefetch(kernel):
 
             # will choose smallest increment 'automatically'
             if new_mult < min_mult:
-                new_lmem_use = other_pf_sizes+pf.itemsize*product(new_dsl)
+                new_lmem_use = other_pf_sizes + pf.itemsize*product(test_dsl)  # size the candidate
                 if new_lmem_use < lmem_size:
                     new_dsl = test_dsl
                     min_mult = new_mult
+                    min_why_not = why_not
                     good_incr = increment
 
         if min_mult != 1:
             from warnings import warn
             warn("could not find a conflict-free mem layout "
                     "for prefetch of '%s' "
-                    "(currently: %dx conflict, increment: %d)" 
-                    % (pf.input_vector, min_mult, good_incr),
+                    "(currently: %dx conflict, increment: %s, reason: %s)"
+                    % (pf.input_vector, min_mult, good_incr, min_why_not),  # good_incr may be None
                     LoopyAdvisory)
 
-        new_pf = pf.copy(dim_storage_lengths=dim_storage_lengths,
+        new_pf = pf.copy(dim_storage_lengths=new_dsl,
                 name="prefetch_%s_%d" % (pf.input_vector, i_pf))
         new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf
         all_pf_list[i_pf] = new_pf