diff --git a/MEMO b/MEMO index 87725fac86ce357147af22ed60355ab30797eb9a..66f5f2f4e9ca3ba8657585ba04e6387d80e5abbd 100644 --- a/MEMO +++ b/MEMO @@ -55,10 +55,11 @@ To-do - reg rolling - nbody GPU + -> pending better prefetch spec - Expose iname-duplicate-and-rename as a primitive. -- String instructions? +- add_prefetch gets a flag to separate out each access - Making parameters run-time varying, substituting values that depend on other inames? @@ -68,6 +69,8 @@ To-do Future ideas ^^^^^^^^^^^^ +- String instructions? + - How is intra-instruction ordering of ILP loops going to be determined? (taking into account that it could vary even per-instruction?) diff --git a/doc/reference.rst b/doc/reference.rst index a6c3bc988409f6f037b8a86ea7ec3d05410448b1..0e538771935d373519eb074bb484550241a3ec50 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -14,6 +14,8 @@ Expressions ----------- * `if` +* `reductions` + * duplication of reduction inames * complex-valued arithmetic Assignments and Substitution Rules @@ -28,13 +30,21 @@ Tag Meaning `None` | `"for"` Sequential loop `"l.N"` Local (intra-group) axis N `"l.auto"` Automatically chosen local (intra-group) axis +`"g.N"` Group-number axis N `"unr"` Plain unrolling -`"ilp"` Unroll using instruction-level parallelism -`"inn"` Realize parallel iname as innermost loop +`"ilp"` | `"ilp.unr"` Unroll using instruction-level parallelism +`"ilp.seq"` Realize parallel iname as innermost loop ===================== ==================================================== (Throughout this table, `N` must be replaced by an actual number.) +ILP is really three things combined: + +* Restricts loops to be innermost (excludes them from scheduling) +* Duplicates reduction storage for any reductions nested around ILP usage +* Causes a loop (unrolled or not) to be opened/generated for each + involved instruction + .. _automatic-axes: Automatic Axis Assignment diff --git a/loopy/check.py b/loopy/check.py index 16b4c72845d490ce33c852334bfb0f4ec76f523d..c0bfeca134cd70a77aa78cbe1d0ffd78f75c4d7f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -85,7 +85,7 @@ def check_for_inactive_iname_access(kernel): def check_for_write_races(kernel): from loopy.symbolic import DependencyMapper - from loopy.kernel import ParallelTag, GroupIndexTag, IlpTag + from loopy.kernel import ParallelTag, GroupIndexTag, IlpBaseTag depmap = DependencyMapper() for insn in kernel.instructions: @@ -136,7 +136,7 @@ def check_for_write_races(kernel): ilp_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), IlpTag)) + if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) inames_without_write_dep = ilp_inames - ( assignee_inames & ilp_inames) diff --git a/loopy/cse.py b/loopy/cse.py index 3246a34e6267361f1e3d259eb18bbb05521a3fe8..82b43d25d0311bb9f8dff46e28eb8d054d7d8570 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -278,7 +278,7 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], :arg sweep_axes: A :class:`list` of inames and/or rule argument names to be swept. :arg storage_dims: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes. - If `storage_dims` is not specified, it defaults to the arrangement + If `storage_axes` is not specified, it defaults to the arrangement `<direct sweep axes><arguments>` with the direct sweep axes being the slower-varying indices. @@ -318,6 +318,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], from loopy.symbolic import SubstitutionCallbackMapper scm = SubstitutionCallbackMapper([(subst_name, subst_instance)], gather_substs) + # We need to work on the fully expanded form of an expression. + # To that end, instantiate a substitutor. from loopy.symbolic import ParametrizedSubstitutor rules_except_mine = kernel.substitutions.copy() del rules_except_mine[subst_name] @@ -328,7 +330,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], # arguments. Therefore, fully expand each instruction and look at # the invocations in subst_name occurring there. - scm(subst_expander(insn.expression)) + expanded_expr = subst_expander(insn.expression) + scm(expanded_expr) if not invocation_descriptors: raise RuntimeError("no invocations of '%s' found" % subst_name) diff --git a/loopy/kernel.py b/loopy/kernel.py index 4ca9b5540b05c0abb0808d7933b365d3cb023b9b..8b5010c44964b09d31228a327f3ad61d2d2b4349 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -65,9 +65,16 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase): def __str__(self): return "l.auto" -class IlpTag(ParallelTag): +class IlpBaseTag(ParallelTag): + pass + +class UnrolledIlpTag(IlpBaseTag): + def __str__(self): + return "ilp.unr" + +class LoopedIlpTag(IlpBaseTag): def __str__(self): - return "ilp" + return "ilp.seq" class UnrollTag(IndexTag): def __str__(self): @@ -87,8 +94,10 @@ def parse_tag(tag): return None elif tag in ["unr"]: return UnrollTag() - elif tag == "ilp": - return IlpTag() + elif tag in ["ilp", "ilp.unr"]: + return UnrolledIlpTag() + elif tag == "ilp.seq": + return LoopedIlpTag() elif tag.startswith("g."): return GroupIndexTag(int(tag[2:])) elif tag.startswith("l."): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 43e93ddd0e2e77e8c4cd57b3efa8f4b71c74cd64..40c702fb9dbe90262729dc1696383d33e2b9e172 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -132,17 +132,17 @@ def realize_reduction(kernel, insn_id_filter=None): new_insns = [] new_temporary_variables = kernel.temporary_variables.copy() - from loopy.kernel import IlpTag + from loopy.kernel import IlpBaseTag def map_reduction(expr, rec): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. - # {{{ see if this reduction is nested inside some ILP loops + # {{{ see if this reduction is nested around some ILP loops ilp_inames = [iname for iname in temp_kernel.insn_inames(insn) - if isinstance(temp_kernel.iname_to_tag.get(iname), IlpTag)] + if isinstance(temp_kernel.iname_to_tag.get(iname), IlpBaseTag)] from loopy.isl_helpers import static_max_of_pw_aff diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 1c42082f2b9a77db3fa38f8f9f34822b84b0c9da..c90b556bd6a8e8265451bc4ed7ca41543654091c 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -83,7 +83,7 @@ def test_laplacian_stiffness(ctx_factory): knl = lp.split_dimension(knl, "K", Ncloc, outer_iname="Ko", inner_iname="Kloc") knl = lp.precompute(knl, "dPsi.one", np.float32, ["dx_axis"], default_tag=None) - knl = lp.tag_dimensions(knl, {"j": "ilp"}) + knl = lp.tag_dimensions(knl, {"j": "ilp.seq"}) return knl, ["Ko", "Kloc"] @@ -124,7 +124,7 @@ def test_laplacian_stiffness(ctx_factory): # Plug in variant name here # | # v - for variant in [variant_simple_gpu_prefetch]: + for variant in [variant_fig33]: var_knl, loop_prio = variant(knl) kernel_gen = lp.generate_loop_schedules(var_knl, loop_priority=loop_prio)