diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index eb1ffb7bf87766a92e5eac03f659f347c787eda0..e1f81b878f6789b16e6a284656e04d0e67235114 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -136,12 +136,9 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) - # slow, but conflict-free - #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) - # fast - knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner", ]) + # conflict-free + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) assert knl.get_invalid_reason() is None kernel_gen = (lp.insert_register_prefetches(knl) diff --git a/loopy/__init__.py b/loopy/__init__.py index 78ee835a2923bba1d7aeda6d3acb7b9b5ed89959..07c2450a68ef2cb3adcd0cf29f9aaf11f2d9b3ee 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -38,6 +38,9 @@ register_mpz_with_pymbolic() # TODO: Custom reductions per red. axis +# TODO Tim: implement efficient div_ceil? +# TODO Tim: why are corner cases inefficient? + @@ -1873,6 +1876,7 @@ def preprocess_prefetch(kernel): min_mult = cl_char.local_memory_bank_count(kernel.device) good_incr = None new_dsl = dim_storage_lengths + min_why_not = None for increment in range(dim_storage_lengths[-1]//2): @@ -1884,21 +1888,22 @@ def preprocess_prefetch(kernel): # will choose smallest increment 'automatically' if new_mult < min_mult: - new_lmem_use = other_pf_sizes+pf.itemsize*product(new_dsl) + new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) if new_lmem_use < lmem_size: new_dsl = test_dsl min_mult = new_mult + min_why_not = why_not good_incr = increment if min_mult != 1: from warnings import warn warn("could not find a conflict-free mem layout " "for prefetch of '%s' " - "(currently: %dx conflict, increment: %d)" - % (pf.input_vector, min_mult, good_incr), + "(currently: %dx conflict, increment: %d, reason: %s)" + % (pf.input_vector, min_mult, good_incr, min_why_not), LoopyAdvisory) - new_pf = pf.copy(dim_storage_lengths=dim_storage_lengths, + new_pf = pf.copy(dim_storage_lengths=new_dsl, name="prefetch_%s_%d" % (pf.input_vector, i_pf)) new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf all_pf_list[i_pf] = new_pf