From d1caf2487b5e5a0974d352eb13826681daff770d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 22 Jun 2018 17:18:09 -0500 Subject: [PATCH 1/3] Modify add_prefetch and precompute to strongly suggest specifying default_tag --- loopy/transform/data.py | 23 +++++++++++---- loopy/transform/precompute.py | 55 +++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 575311b11..4102d91ce 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -136,8 +136,16 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames, # }}} +class _not_provided: # noqa: N801 + pass + + def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, - default_tag="l.auto", rule_name=None, + + # "None" is a valid value here, distinct from the default. + default_tag=_not_provided, + + rule_name=None, temporary_name=None, temporary_scope=None, temporary_is_local=None, footprint_subscripts=None, @@ -168,10 +176,11 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg default_tag: The :ref:`implementation tag ` to assign to the inames driving the prefetch code. Use *None* to - leave them undefined (to assign them later by hand). The - default values of ``"l.auto"`` will automatically determine - what it deems 'reasonable' inames to map to 'local' axes and - map the rest to sequential loops. + leave them undefined (to assign them later by hand). The current + default will make them local axes and automatically split them to + fit the work group size, but this default will disappear in favor + of simply leaving them untagged in 2019.x. For 2018.x, a warning + will be issued if no *default_tag* is specified. :arg rule_name: base name of the generated temporary variable. :arg temporary_name: The name of the temporary to be used. 
@@ -316,6 +325,10 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, kernel, rule_name, sweep_inames, footprint_subscripts, arg) + # Our _not_provided is actually a different object from the one in the + # precompute module, but precompute actually uses that to adjust its + # warning message. + from loopy.transform.precompute import precompute new_kernel = precompute(kernel, subst_use, sweep_inames, precompute_inames=dim_arg_names, diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 1af16f92f..c2bfe321d 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -254,10 +254,19 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # }}} +class _not_provided(object): # noqa: N801 + pass + + def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, - storage_axis_to_tag={}, default_tag="l.auto", dtype=None, + storage_axis_to_tag={}, + + # "None" is a valid value here, distinct from the default. + default_tag=_not_provided, + + dtype=None, fetch_bounding_box=False, temporary_scope=None, temporary_is_local=None, compute_insn_id=None): @@ -305,11 +314,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, :arg sweep_inames: A :class:`list` of inames to be swept. May also equivalently be a comma-separated string. + :arg within: a stack match as understood by + :func:`loopy.match.parse_stack_match`. :arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes. May also equivalently be a comma-separated string. - :arg within: a stack match as understood by - :func:`loopy.match.parse_stack_match`. :arg temporary_name: The temporary variable name to use for storing the precomputed data. If it does not exist, it will be created. 
If it does exist, its properties @@ -328,6 +337,13 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, the compute instruction is nested. If *None*, make an educated guess. May also be specified as a comma-separated string. + :arg default_tag: The :ref:`iname tag ` to be applied to the + inames created to perform the precomputation. The current default will + make them local axes and automatically split them to fit the work + group size, but this default will disappear in favor of simply leaving them + untagged in 2019. For 2018, a warning will be issued if no *default_tag* is + specified. + :arg compute_insn_id: The ID of the instruction generated to perform the precomputation. @@ -426,9 +442,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, from loopy.match import parse_stack_match within = parse_stack_match(within) - from loopy.kernel.data import parse_tag - default_tag = parse_tag(default_tag) - try: subst = kernel.substitutions[subst_name] except KeyError: @@ -437,6 +450,36 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, c_subst_name = subst_name.replace(".", "_") + # {{{ handle default_tag + + from loopy.transform.data import _not_provided \ + as transform_data_not_provided + + if default_tag is _not_provided or default_tag is transform_data_not_provided: + # no need to warn for scalar precomputes + if sweep_inames: + from warnings import warn + warn( + "Not specifying default_tag is deprecated, and default_tag " + "will become mandatory in 2019.x. " + "Pass 'default_tag=\"l.auto\" to match the current default, " + "or Pass 'default_tag=None to leave the loops untagged, which " + "is the recommended behavior.", + DeprecationWarning, stacklevel=( + + # In this case, we came here through add_prefetch. Increase + # the stacklevel. 
+ 3 if default_tag is transform_data_not_provided + + else 2)) + + default_tag = "l.auto" + + from loopy.kernel.data import parse_tag + default_tag = parse_tag(default_tag) + + # }}} + # }}} # {{{ process invocations in footprint generators, start access_descriptors -- GitLab From 55690d274d65a45bd90c6c06e94444cdc1efe769 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 22 Jun 2018 17:18:36 -0500 Subject: [PATCH 2/3] Fix tests and demos to always specify default_tag in precompute and add_prefetch --- doc/tutorial.rst | 2 +- examples/fortran/matmul.floopy | 4 +- examples/python/rank-one.py | 4 +- proto-tests/test_fem_assembly.py | 8 ++-- proto-tests/test_sem.py | 50 +++++++++++++++---------- proto-tests/test_sem_tim.py | 57 ++++++++++++++++------------- proto-tests/test_tim.py | 52 ++++++++++++++------------ test/test_apps.py | 17 +++++---- test/test_dg.py | 6 ++- test/test_fortran.py | 16 +++++--- test/test_linalg.py | 63 ++++++++++++++++++-------------- test/test_loopy.py | 8 ++-- test/test_numa_diff.py | 8 ++-- test/test_reduction.py | 6 ++- test/test_sem_reagan.py | 16 ++++---- test/test_statistics.py | 4 +- test/test_target.py | 8 ++-- test/test_transform.py | 10 ++--- 18 files changed, 195 insertions(+), 144 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index e08c88598..3b1af9406 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1038,7 +1038,7 @@ earlier: .. doctest:: - >>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"]) + >>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 8ee05643b..4b3552204 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,7 +22,7 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! 
dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") ! RESULT = [dgemm] !$loopy end diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index b22edfcfa..b8da89c6c 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -43,8 +43,8 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"]) -knl = lp.add_prefetch(knl, "b", ["j_inner"]) +knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) diff --git a/proto-tests/test_fem_assembly.py b/proto-tests/test_fem_assembly.py index a2cba7c57..18f2a5bfa 100644 --- a/proto-tests/test_fem_assembly.py +++ b/proto-tests/test_fem_assembly.py @@ -112,10 +112,10 @@ def test_laplacian_stiffness(ctx_factory): outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0") knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"}) - knl = lp.add_prefetch(knl, "w", ["q"]) - knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2]) - knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3]) - knl = lp.add_prefetch(knl, "jacDet", [1]) + knl = lp.add_prefetch(knl, "w", ["q"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2], default_tag="l.auto") + knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3], default_tag="l.auto") + knl = lp.add_prefetch(knl, "jacDet", [1], default_tag="l.auto") return knl, ["K", "i", "j", "q", "ax_b_insn"] # Plug in variant name here diff --git a/proto-tests/test_sem.py b/proto-tests/test_sem.py index ddf913275..4613b74ae 100644 --- a/proto-tests/test_sem.py +++ b/proto-tests/test_sem.py @@ -79,17 +79,17 @@ def 
test_laplacian(ctx_factory): if 0: pass - #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"]) - #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", default_tag="l.auto") + #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], default_tag="l.auto") + #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto") else: seq_knl = knl knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - knl = lp.add_prefetch(knl, "D", ["m", "j"]) - #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto") + #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto") #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp") @@ -154,17 +154,23 @@ def test_laplacian_lmem(ctx_factory): knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"]) if 0: - seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"]) - seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", + default_tag="l.auto") + seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], + default_tag="l.auto") + seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]", + default_tag="l.auto") else: seq_knl = knl - knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) + knl = lp.split_iname(knl, "e", 16, outer_tag="g.0") #, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - knl = lp.add_prefetch(knl, "D", ["m", "j"]) - 
knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "D", ["m", "j"], + default_tag="l.auto") + knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]", + default_tag="l.auto") #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp") #print seq_knl @@ -230,14 +236,20 @@ def test_laplacian_lmem_ilp(ctx_factory): knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp") - knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"]) + knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"], + default_tag="l.auto") - knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"]) - knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"]) - knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"]) + knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"], + default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"], + default_tag="l.auto") + knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"], + default_tag="l.auto") - knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"]) - knl = lp.add_prefetch(knl, "D", ["m", "j"]) + knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"], + default_tag="l.auto") + knl = lp.add_prefetch(knl, "D", ["m", "j"], + default_tag="l.auto") #print seq_knl #1/0 diff --git a/proto-tests/test_sem_tim.py b/proto-tests/test_sem_tim.py index 01bee47c8..1bfb437fb 100644 --- a/proto-tests/test_sem_tim.py +++ b/proto-tests/test_sem_tim.py @@ -79,17 +79,19 @@ def test_laplacian(ctx_factory): if 0: pass - #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"]) - #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + 
#seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", default_tag="l.auto") + #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], default_tag="l.auto") + #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto") else: seq_knl = knl knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - knl = lp.add_prefetch(knl, "D", ["m", "j"]) - #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]") + knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "D", ["m", "j"], + default_tag="l.auto") + #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto") #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp") @@ -150,26 +152,31 @@ def test_laplacian_lmem(ctx_factory): if 1: # original - knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"]) - knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"]) - knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"]) - knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"]) + knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], + default_tag="l.auto") + knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"], + default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"], + default_tag="l.auto") + knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"], + default_tag="l.auto") knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"]) + knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], + default_tag="l.auto") else: # experiment -# knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"]) - knl = lp.precompute(knl, "eu", np.float32, ["b", "c"]) - knl = lp.precompute(knl, "ur", np.float32, ["b", "c"]) - knl = lp.precompute(knl, "us", np.float32, ["b", "c"]) - knl = lp.precompute(knl, 
"ut", np.float32, ["b", "c"]) +# knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto") + knl = lp.precompute(knl, "eu", np.float32, ["b", "c"], default_tag="l.auto") + knl = lp.precompute(knl, "ur", np.float32, ["b", "c"], default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, ["b", "c"], default_tag="l.auto") + knl = lp.precompute(knl, "ut", np.float32, ["b", "c"], default_tag="l.auto") knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"]) + knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], default_tag="l.auto") - #knl = lp.add_prefetch(knl, "G", [2,3,4]) # axis/argument indices on G - #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"]) # axis/argument indices on G + #knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G + #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto") # axis/argument indices on G #print(knl) #1/0 @@ -241,14 +248,14 @@ def test_laplacian_lmem_ilp(ctx_factory): knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp") - knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"]) + knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"], default_tag="l.auto") - knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"]) - knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"]) - knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"]) + knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"], default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"], default_tag="l.auto") + knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"]) - knl = lp.add_prefetch(knl, "D", ["m", "j"]) + knl = lp.add_prefetch(knl, "G", 
["m", "i", "j", "k", "e_inner_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto") #print(seq_knl) #1/0 diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index 866224e11..d7061933e 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -29,7 +29,7 @@ def test_tim2d(ctx_factory): [ "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])", "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])", - + "lap[e,i,j] = " " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))" "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))" @@ -47,21 +47,21 @@ def test_tim2d(ctx_factory): name="semlap2D", assumptions="K>=1") unroll = 32 - + seq_knl = knl - knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"]) - knl = lp.add_prefetch(knl, "u", ["i", "j", "o"]) - knl = lp.precompute(knl, "ur", np.float32, ["a", "b"]) - knl = lp.precompute(knl, "us", np.float32, ["a", "b"]) + knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto") + knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto") knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) knl = lp.tag_inames(knl, dict(o="unr")) knl = lp.tag_inames(knl, dict(m="unr")) - + # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G - knl = lp.add_prefetch(knl, "G", [2,3]) # axis/argument indices on G + knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) @@ -109,13 +109,16 @@ def test_red2d(ctx_factory): name="semlap2D", assumptions="K>=1") unroll = 32 - + seq_knl = knl - knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"]) - knl = 
lp.add_prefetch(knl, "u", ["i", "j", "o"]) - knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"]) - knl = lp.precompute(knl, "ur", np.float32, ["a", "b"]) - knl = lp.precompute(knl, "us", np.float32, ["a", "b"]) + knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto") + knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"], + default_tag="l.auto") + knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], + default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, ["a", "b"], + default_tag="l.auto") knl = lp.split_iname(knl, "e", 2, outer_tag="g.0") knl = lp.split_iname(knl, "j", n, inner_tag="l.0")#, slabs=(0, 1)) knl = lp.split_iname(knl, "i", n, inner_tag="l.1")#, slabs=(0, 1)) @@ -123,8 +126,8 @@ def test_red2d(ctx_factory): knl = lp.tag_inames(knl, dict(o="unr")) knl = lp.tag_inames(knl, dict(m="unr")) - - knl = lp.add_prefetch(knl, "G", [2,3]) # axis/argument indices on G + + knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) @@ -175,13 +178,16 @@ def test_tim3d(ctx_factory): lp.ValueArg("K", np.int32, approximately=1000), ], name="semlap3D", assumptions="K>=1") - + seq_knl = knl - knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "k","o"]) - knl = lp.add_prefetch(knl, "u", ["i", "j", "o", "k"]) - knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"]) - knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"]) - knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"]) + knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "k","o"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "u", ["i", "j", "o", "k"], default_tag="l.auto") + knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"], + default_tag="l.auto") + knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"], + default_tag="l.auto") + knl = 
lp.precompute(knl, "ut", np.float32, ["a", "b", "c"], + default_tag="l.auto") knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1)) knl = lp.split_iname(knl, "k", n, inner_tag="l.2")#, slabs=(0, 1)) knl = lp.split_iname(knl, "j", n, inner_tag="l.1")#, slabs=(0, 1)) @@ -193,7 +199,7 @@ def test_tim3d(ctx_factory): knl = lp.tag_inames(knl, dict(m="unr")) # knl = lp.tag_inames(knl, dict(i="unr")) - knl = lp.add_prefetch(knl, "G", [2,3,4]) # axis/argument indices on G + knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) diff --git a/test/test_apps.py b/test/test_apps.py index ee3d4ff44..2a88e0b74 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -99,8 +99,9 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]") - knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y", + default_tag="l.auto") return knl for variant in [ @@ -344,7 +345,7 @@ def test_stencil(ctx_factory): def variant_1(knl): knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], default_tag="l.auto") knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"]) return knl @@ -352,7 +353,7 @@ def test_stencil(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = 
lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) + fetch_bounding_box=True, default_tag="l.auto") knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"]) return knl @@ -399,7 +400,7 @@ def test_stencil_with_overfetch(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0", slabs=(1, 1)) knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) + fetch_bounding_box=True, default_tag="l.auto") knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"]) return knl @@ -501,7 +502,8 @@ def test_lbm(ctx_factory): knl = lp.split_iname(knl, "ii", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "jj", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.expand_subst(knl) - knl = lp.add_prefetch(knl, "f", "ii_inner,jj_inner", fetch_bounding_box=True) + knl = lp.add_prefetch(knl, "f", "ii_inner,jj_inner", fetch_bounding_box=True, + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20}) @@ -519,7 +521,8 @@ def test_fd_demo(): "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"], - fetch_bounding_box=True) + fetch_bounding_box=True, + default_tag="l.auto") #n = 1000 #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) diff --git a/test/test_dg.py b/test/test_dg.py index 8de742f27..967dea350 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -99,14 +99,16 @@ def test_dg_volume(ctx_factory): def variant_prefetch_d(knl): knl = lp.tag_inames(knl, dict(n="l.0")) knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1") - knl = lp.add_prefetch(knl, "DrDsDt[:,:]") + knl = lp.add_prefetch(knl, "DrDsDt[:,:]", + default_tag="l.auto") return knl def variant_prefetch_fields(knl): knl = lp.tag_inames(knl, dict(n="l.0")) knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1") for name in ["u", "v", "w", "p"]: - knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"]) + knl 
= lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"], + default_tag="l.auto") return knl diff --git a/test/test_fortran.py b/test/test_fortran.py index a876e2b49..e08033360 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -310,8 +310,8 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -360,8 +360,10 @@ def test_batched_sparse(): knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) knl = lp.tag_inames(knl, {"i_inner": "l.0"}) - knl = lp.add_prefetch(knl, "values") - knl = lp.add_prefetch(knl, "colindices") + knl = lp.add_prefetch(knl, "values", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "colindices", + default_tag="l.auto") knl = lp.fix_parameters(knl, nvecs=4) @@ -484,9 +486,11 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", - precompute_inames="ktemp,itemp") + precompute_inames="ktemp,itemp", + default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", - precompute_inames="itemp,k2temp") + precompute_inames="itemp,k2temp", + default_tag="l.auto") ref_knl = knl diff --git a/test/test_linalg.py b/test/test_linalg.py index 093fcbf24..6115cc8a5 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -148,7 +148,8 @@ def test_transpose(ctx_factory): outer_tag="g.0", inner_tag="l.1") knl = 
lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"]) + knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"], + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"], @@ -184,8 +185,10 @@ def test_plain_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ]) + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"], @@ -220,8 +223,8 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -255,12 +258,14 @@ def test_funny_shape_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) - #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) - #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) + #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") + #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") + 
knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -290,8 +295,8 @@ def test_rank_one(ctx_factory): assumptions="n >= 16") def variant_1(knl): - knl = lp.add_prefetch(knl, "a") - knl = lp.add_prefetch(knl, "b") + knl = lp.add_prefetch(knl, "a", default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", default_tag="l.auto") knl = lp.prioritize_loops(knl, ["i", "j"]) knl = lp.add_inames_to_insn(knl, "i", "writes:b_fetch") return knl @@ -312,8 +317,8 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"]) - knl = lp.add_prefetch(knl, "b", ["j_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") return knl def variant_4(knl): @@ -379,7 +384,8 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) - knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) + knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -418,13 +424,15 @@ def test_intel_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") - knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"]) + knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], + default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', 
["j_inner_inner", "k_inner", "j_inner_outer"], + default_tag="l.auto") # FIXME: Grouped prefetch - #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) + #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")], default_tag="l.auto") #knl = lp.add_prefetch(knl, 'b', - # ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") #hints=["k_outer", "k_inner_outer", "k_inner_inner"] @@ -475,9 +483,9 @@ def test_magma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") # FIXME - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) + #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], default_tag="l.auto") #knl = lp.add_prefetch(knl, 'b', - # ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -518,8 +526,8 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -564,8 +572,9 @@ def no_test_image_matrix_mul_ilp(ctx_factory): outer_tag="ilp", inner_tag="l.0") knl = lp.split_iname(knl, "k", 2) # conflict-free? 
- knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"], + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -597,8 +606,8 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -629,7 +638,7 @@ def test_small_batched_matvec(ctx_factory): seq_knl = knl align_bytes = 64 - knl = lp.add_prefetch(knl, 'd[:,:]') + knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto") pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes) knl = lp.split_array_dim(knl, ("f", 0), pad_mult) knl = lp.add_padding(knl, "f", 0, align_bytes) diff --git a/test/test_loopy.py b/test/test_loopy.py index 3ceca5a75..c069916e5 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -646,7 +646,8 @@ def test_vector_ilp_with_prefetch(ctx_factory): knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], + default_tag="l.auto") cknl = lp.CompiledKernel(ctx, knl) cknl.kernel_info() @@ -1722,7 +1723,8 @@ def test_finite_difference_expr_subst(ctx_factory): fused0_knl, "inew", 
128, outer_tag="g.0", inner_tag="l.0") precomp_knl = lp.precompute( - gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) + gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True, + default_tag="l.auto") precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) precomp_knl = lp.set_options(precomp_knl, return_dict=True) @@ -2794,7 +2796,7 @@ def test_add_prefetch_works_in_lhs_index(): "..." ]) - knl = lp.add_prefetch(knl, "a1_map", "k") + knl = lp.add_prefetch(knl, "a1_map", "k", default_tag="l.auto") from loopy.symbolic import get_dependencies for insn in knl.instructions: diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index be07b6c31..6b578838d 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -90,7 +90,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]") + hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") if opt_level == 1: tap_hsv = hsv @@ -169,12 +169,14 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if prep_var_name.startswith("Jinv") or "_s" in prep_var_name: continue hsv = lp.precompute(hsv, - lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*")) + lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"), + default_tag="l.auto") if opt_level == 3: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames) + hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames, + default_tag="l.auto") if opt_level == 4: tap_hsv = hsv diff --git a/test/test_reduction.py b/test/test_reduction.py index 81de627d8..d65c0e7de 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -238,7 +238,8 @@ def test_global_parallel_reduction(ctx_factory, size): from loopy.transform.data import reduction_arg_to_subst_rule knl = reduction_arg_to_subst_rule(knl, "i_outer") knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", - 
temporary_scope=lp.temp_var_scope.GLOBAL) + temporary_scope=lp.temp_var_scope.GLOBAL, + default_tag="l.auto") knl = lp.realize_reduction(knl) knl = lp.add_dependency( knl, "writes:acc_i_outer", @@ -278,7 +279,8 @@ def test_global_mc_parallel_reduction(ctx_factory, size): from loopy.transform.data import reduction_arg_to_subst_rule knl = reduction_arg_to_subst_rule(knl, "i_outer") knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", - temporary_scope=lp.temp_var_scope.GLOBAL) + temporary_scope=lp.temp_var_scope.GLOBAL, + default_tag="l.auto") knl = lp.realize_reduction(knl) knl = lp.add_dependency( knl, "writes:acc_i_outer", diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index e724a65df..e022e92f3 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -82,17 +82,17 @@ def test_tim2d(ctx_factory): def variant_orig(knl): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0")) - knl = lp.add_prefetch(knl, "D[:,:]") - knl = lp.add_prefetch(knl, "u[e, :, :]") + knl = lp.add_prefetch(knl, "D[:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto") - knl = lp.precompute(knl, "ur(m,j)", ["m", "j"]) - knl = lp.precompute(knl, "us(i,m)", ["i", "m"]) + knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto") + knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto") - knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"]) - knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"]) + knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto") + knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "G$x[:,e,:,:]") - knl = lp.add_prefetch(knl, "G$y[:,e,:,:]") + knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto") knl = lp.tag_inames(knl, dict(o="unr")) knl = lp.tag_inames(knl, dict(m="unr")) diff --git a/test/test_statistics.py b/test/test_statistics.py index 
d9adee241..79c5ec7da 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -932,8 +932,8 @@ def test_all_counters_parallel_matmul(): knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", bsize) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") n = 512 m = 256 diff --git a/test/test_target.py b/test/test_target.py index eb94bdc81..7c0d003ee 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -69,7 +69,8 @@ def test_ispc_target(occa_mode=False): knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], + default_tag="l.auto") codegen_result = lp.generate_code_v2( lp.get_one_scheduled_kernel( @@ -93,7 +94,8 @@ def test_cuda_target(): knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], + default_tag="l.auto") print( lp.generate_code( @@ -278,7 +280,7 @@ def test_numba_cuda_target(): knl = lp.assume(knl, "M>0") knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]") + knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") diff --git a/test/test_transform.py b/test/test_transform.py index 
210984512..ed184fb50 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -234,9 +234,9 @@ def test_alias_temporaries(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.precompute(knl, "times2", "i_inner") - knl = lp.precompute(knl, "times3", "i_inner") - knl = lp.precompute(knl, "times4", "i_inner") + knl = lp.precompute(knl, "times2", "i_inner", default_tag="l.auto") + knl = lp.precompute(knl, "times3", "i_inner", default_tag="l.auto") + knl = lp.precompute(knl, "times4", "i_inner", default_tag="l.auto") knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) @@ -307,7 +307,7 @@ def test_join_inames(ctx_factory): ref_knl = knl - knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) + knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"], default_tag="l.auto") knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) @@ -401,7 +401,7 @@ def test_precompute_nested_subst(ctx_factory): from loopy.symbolic import get_dependencies assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D", "i_inner") + knl = lp.precompute(knl, "D", "i_inner", default_tag="l.auto") # There's only one surviving 'E' rule. 
assert len([ -- GitLab From 01489848f012b15cb73b5d2df918799c882eb481 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 22 Jun 2018 17:30:21 -0500 Subject: [PATCH 3/3] Placate flake8 --- test/test_linalg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py index 6115cc8a5..fec6cd5e7 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -430,7 +430,8 @@ def test_intel_matrix_mul(ctx_factory): default_tag="l.auto") # FIXME: Grouped prefetch - #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")], default_tag="l.auto") + #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")], + # default_tag="l.auto") #knl = lp.add_prefetch(knl, 'b', # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") @@ -483,7 +484,8 @@ def test_magma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") # FIXME - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], default_tag="l.auto") + #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + # default_tag="l.auto") #knl = lp.add_prefetch(knl, 'b', # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") -- GitLab