diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b35522043bfc32b71c0a063c3efc3b4403a26f2..23840f09a46ab97902a8d1ed7e078a7c70d36dec 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,7 +22,11 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") ! RESULT = [dgemm] !$loopy end diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index b8da89c6c75986e3baf5e35ee76b680d08c51632..eda11fc155fc951246381ca697409615fa0be90a 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -33,8 +33,10 @@ evt, (c,) = knl(queue, a=a, b=b) split_knl = knl # PREFETCH1BEGIN -knl = lp.add_prefetch(knl, "a") -knl = lp.add_prefetch(knl, "b") +knl = lp.add_prefetch(knl, "a", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') +knl = lp.add_prefetch(knl, "b", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') # PREFETCH1END knl = lp.set_options(knl, write_code=True) @@ -43,8 +45,14 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) @@ -58,8 +66,10 @@ knl = lp.split_iname(knl, "i", 256, knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") diff --git a/test/test_apps.py b/test/test_apps.py index 71029cc9ce408f8e7fa95eaf3b766864c4beee5b..f7eeb756e735ffb4d5ab6ab747c6bb792c690668 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -101,8 +101,11 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", + fetch_outer_inames='im_x_outer, im_y_outer, ifeat', + default_tag="l.auto") knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y", + fetch_outer_inames='iimg, im_x_outer, im_y_outer, ifeat, icolor', default_tag="l.auto") return knl diff --git a/test/test_dg.py b/test/test_dg.py index 967dea35071bb3d95c06b2e37d73da29ac019763..543701a5fb4f2ce8c40851117573d1f72639436c 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -100,6 +100,7 @@ def test_dg_volume(ctx_factory): knl = lp.tag_inames(knl, dict(n="l.0")) knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1") knl = lp.add_prefetch(knl, "DrDsDt[:,:]", + fetch_outer_inames='k_outer', default_tag="l.auto") return knl diff --git a/test/test_domain.py b/test/test_domain.py index 5daf84eaa5b7ffd1647daf4b35acd7a5de91c5d1..8962514450f8ee352089104b2ffc1241e323725d 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -317,7 +317,7 @@ def test_equality_constraints(ctx_factory): ], [ "a[i,j] = 5 {id=set_all}", - "b[i,k] = 22 {dep=set_all}", + "b[i,k] = 22 {id=set_b, dep=set_all}", ], [ lp.GlobalArg("a,b", dtype, shape="n, n", order=order), @@ -329,6 +329,9 @@ def test_equality_constraints(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + + knl = lp.add_inames_to_insn(knl, 'j_inner, j_outer', 'id:set_b') + #print(knl) #print(knl.domains[0].detect_equalities()) diff --git a/test/test_fortran.py b/test/test_fortran.py index 3601e96b752f18e6e01bcfcffe49780bda4058b4..c7270abd29e4e68a110bc6ddc9efa4bc95a45823 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -316,8 +316,12 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -492,9 +496,11 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", precompute_inames="ktemp,itemp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", precompute_inames="itemp,k2temp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") ref_knl = knl diff --git a/test/test_linalg.py b/test/test_linalg.py index f075d3493195ec3364c4de0d26f92c4a987e7187..390c5654fc0ee5bae631d26e5a0f58e939f8c78b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -186,8 +186,10 @@ def test_plain_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -223,8 +225,12 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -263,8 +269,10 @@ def test_funny_shape_matrix_mul(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -307,8 +315,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a") - knl = lp.add_prefetch(knl, "b") + knl = lp.add_prefetch(knl, "a", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') + knl = lp.add_prefetch(knl, "b", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') return knl def variant_3(knl): @@ -317,8 +327,15 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + return knl def variant_4(knl): @@ -327,8 +344,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") @@ -385,6 +404,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -425,8 +445,10 @@ def test_intel_matrix_mul(ctx_factory): #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") # FIXME: Grouped prefetch @@ -528,8 +550,12 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -608,8 +634,12 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], diff --git a/test/test_nbody.py b/test/test_nbody.py index 5b36ed4163c650317d8656883eeda599a3c21faa..6016c2f1c9955d3bd58d52ad33a3fa95ed63cff8 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -77,7 +77,8 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], - ["x_fetch_j", "x_fetch_k"], default_tag=None) + ["x_fetch_j", "x_fetch_k"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 54b608a183840cc5d33f1e738f36fc605d16d94a..57d75b24b59fc7972fb529fa3e6f220c76d84095 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -90,7 +90,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") + hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e", + default_tag="l.auto") if opt_level == 1: tap_hsv = hsv diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index 54c64e0a4d4a23b429eb83be6c0a19f482a1b922..fff2b5356e75f414356ea1c61c2dd54753186d26 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -82,7 +82,8 @@ def test_tim2d(ctx_factory): def variant_orig(knl): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0")) - knl = lp.add_prefetch(knl, "D[:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e', + default_tag="l.auto") knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto") knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto") diff --git a/test/test_target.py b/test/test_target.py index 038b2e6c06116049441fad36d033c5a6831b4dbe..afad1b676485091ec49a2a1b4870e96d1bf70539 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -281,7 +281,9 @@ def test_numba_cuda_target(): knl = lp.assume(knl, "M>0") knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "X[i,:]", + fetch_outer_inames='i_inner, i_outer, j_inner', + default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") diff --git a/test/test_transform.py b/test/test_transform.py index ffef893b05fbca5a0d244ff17f379e1bb5cf27a1..a6fb9424d44326d11e4b561b971554dd9ffcd7f4 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -374,7 +374,8 @@ def test_precompute_confusing_subst_arguments(ctx_factory): from loopy.symbolic import get_dependencies assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D") + knl = lp.precompute(knl, "D", sweep_inames='j', + precompute_outer_inames='j, i_inner, i_outer') lp.auto_test_vs_ref( ref_knl, ctx, knl,