diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 23840f09a46ab97902a8d1ed7e078a7c70d36dec..4b35522043bfc32b71c0a063c3efc3b4403a26f2 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,11 +22,7 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", -! precompute_outer_inames="i_outer, j_outer, k_outer", -! default_tag="l.auto") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", -! precompute_outer_inames="i_outer, j_outer, k_outer", -! default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") ! RESULT = [dgemm] !$loopy end diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index eda11fc155fc951246381ca697409615fa0be90a..b8da89c6c75986e3baf5e35ee76b680d08c51632 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -33,10 +33,8 @@ evt, (c,) = knl(queue, a=a, b=b) split_knl = knl # PREFETCH1BEGIN -knl = lp.add_prefetch(knl, "a", - fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') -knl = lp.add_prefetch(knl, "b", - fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') +knl = lp.add_prefetch(knl, "a") +knl = lp.add_prefetch(knl, "b") # PREFETCH1END knl = lp.set_options(knl, write_code=True) @@ -45,14 +43,8 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"], - fetch_outer_inames='i_outer, j_outer, j_inner', - temporary_address_space=lp.AddressSpace.LOCAL, - default_tag="l.0") -knl = lp.add_prefetch(knl, "b", ["j_inner"], - fetch_outer_inames='i_outer, j_outer, j_inner', - temporary_address_space=lp.AddressSpace.LOCAL, - default_tag="l.0") +knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) @@ -66,10 +58,8 @@ knl = lp.split_iname(knl, "i", 256, knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) -knl = lp.add_prefetch(knl, "a", ["i_inner"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) -knl = lp.add_prefetch(knl, "b", ["j_inner"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) +knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) +knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") diff --git a/loopy/check.py b/loopy/check.py index b49a60dff7f5396ff0afbef19339628ec4d4da0d..4588a59b48bab0b9122902878c6aa5d96cf6ed8f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -787,6 +787,9 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): insn = kernel.id_to_insn[sched_item.insn_id] i += 1 + if insn.boostable: + continue + group_axes_used = set() local_axes_used = set() diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index bc5c51eb0453a34ad902d58903997b75d6c54f34..9a90462768684152aec96319199fea0f2fa63435 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1909,7 +1909,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): breaking language changes *will* apply to your kernel without asking, likely breaking your code.) - If not given, this value defaults to version **(2018, 2)** and + If not given, this value defaults to version **(2017, 2, 1)** and a warning will be issued. To set the kernel version for all :mod:`loopy` kernels in a (Python) source @@ -2017,6 +2017,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): if lang_version not in version_to_symbol: raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index a966fa7e9eb211e44de7a5e6698b8d1c5f929d94..61127232a9f494fe2fdc536dd50d8fdf41b8f17c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -151,14 +151,15 @@ class InstructionBase(ImmutableRecord): .. automethod:: copy """ - # within_inames_is_final is deprecated and will be removed in version 2017.x. + # within_inames_is_final, boostable and boostable_into are deprecated and + # will be removed in version 2017.x. fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " "within_inames_is_final within_inames " - "priority".split()) + "priority boostable boostable_into".split()) # Names of fields that are pymbolic expressions. Needed for key building pymbolic_fields = set("") @@ -171,7 +172,30 @@ class InstructionBase(ImmutableRecord): no_sync_with, within_inames_is_final, within_inames, priority, - predicates, tags): + boostable, boostable_into, predicates, tags, + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, forced_iname_deps_is_final=None): + + # {{{ backwards compatibility goop + + if depends_on is not None and insn_deps is not None: + raise LoopyError("may not specify both insn_deps and depends_on") + elif insn_deps is not None: + warn("insn_deps is deprecated, use depends_on", + DeprecationWarning, stacklevel=2) + + depends_on = insn_deps + depends_on_is_final = insn_deps_is_final + + if forced_iname_deps is not None and within_inames is not None: + raise LoopyError("may not specify both forced_iname_deps " + "and within_inames") + elif forced_iname_deps is not None: + warn("forced_iname_deps is deprecated, use within_inames", + DeprecationWarning, stacklevel=2) + + within_inames = forced_iname_deps + within_inames_is_final = forced_iname_deps_is_final if predicates is None: predicates = frozenset() @@ -193,6 +217,8 @@ class InstructionBase(ImmutableRecord): predicates = frozenset(new_predicates) del new_predicates + # }}} + if depends_on is None: depends_on = frozenset() @@ -257,9 +283,42 @@ class InstructionBase(ImmutableRecord): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, + boostable=boostable, + boostable_into=boostable_into, predicates=predicates, tags=tags) + # {{{ backwards compatibility goop + + @property + def insn_deps(self): + warn("insn_deps is deprecated, use depends_on", + DeprecationWarning, stacklevel=2) + + return self.depends_on + + # legacy + @property + def insn_deps_is_final(self): + warn("insn_deps_is_final is deprecated, use depends_on_is_final", + DeprecationWarning, stacklevel=2) + + return self.depends_on_is_final + + @property + def forced_iname_deps(self): + warn("forced_iname_deps is deprecated, use within_inames", + DeprecationWarning, stacklevel=2) + return self.within_inames + + @property + def forced_iname_deps_is_final(self): + warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final", + DeprecationWarning, stacklevel=2) + return self.within_inames_is_final + + # }}} + # {{{ abstract interface def read_dependency_names(self): @@ -336,6 +395,18 @@ class InstructionBase(ImmutableRecord): def get_str_options(self): result = [] + if self.boostable is True: + if self.boostable_into: + result.append("boostable into '%s'" % ",".join(self.boostable_into)) + else: + result.append("boostable") + elif self.boostable is False: + result.append("not boostable") + elif self.boostable is None: + pass + else: + raise RuntimeError("unexpected value for Instruction.boostable") + if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: @@ -397,6 +468,21 @@ class InstructionBase(ImmutableRecord): # }}} + def copy(self, **kwargs): + if "insn_deps" in kwargs: + warn("insn_deps is deprecated, use depends_on", + DeprecationWarning, stacklevel=2) + + kwargs["depends_on"] = kwargs.pop("insn_deps") + + if "insn_deps_is_final" in kwargs: + warn("insn_deps_is_final is deprecated, use depends_on", + DeprecationWarning, stacklevel=2) + + kwargs["depends_on_is_final"] = kwargs.pop("insn_deps_is_final") + + return super(InstructionBase, self).copy(**kwargs) + def __setstate__(self, val): super(InstructionBase, self).__setstate__(val) @@ -826,9 +912,11 @@ class Assignment(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - tags=None, + boostable=None, boostable_into=None, tags=None, temp_var_type=Optional(), atomicity=(), - priority=0, predicates=frozenset()): + priority=0, predicates=frozenset(), + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, forced_iname_deps_is_final=None): super(Assignment, self).__init__( id=id, @@ -839,9 +927,15 @@ class Assignment(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, + boostable=boostable, + boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + insn_deps=insn_deps, + insn_deps_is_final=insn_deps_is_final, + forced_iname_deps=forced_iname_deps, + forced_iname_deps_is_final=forced_iname_deps_is_final) from loopy.symbolic import parse if isinstance(assignee, str): @@ -957,9 +1051,12 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - tags=None, + boostable=None, boostable_into=None, tags=None, temp_var_types=None, - priority=0, predicates=frozenset()): + priority=0, predicates=frozenset(), + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, + forced_iname_deps_is_final=None): super(CallInstruction, self).__init__( id=id, @@ -970,9 +1067,15 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, + boostable=boostable, + boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + insn_deps=insn_deps, + insn_deps_is_final=insn_deps_is_final, + forced_iname_deps=forced_iname_deps, + forced_iname_deps_is_final=forced_iname_deps_is_final) from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -1131,8 +1234,9 @@ class CInstruction(InstructionBase): groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, - priority=0, - predicates=frozenset(), tags=None): + priority=0, boostable=None, boostable_into=None, + predicates=frozenset(), tags=None, + insn_deps=None, insn_deps_is_final=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1151,7 +1255,11 @@ class CInstruction(InstructionBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - priority=priority, predicates=predicates, tags=tags) + boostable=boostable, + boostable_into=boostable_into, + priority=priority, predicates=predicates, tags=tags, + insn_deps=insn_deps, + insn_deps_is_final=insn_deps_is_final) # {{{ normalize iname_exprs @@ -1291,6 +1399,7 @@ class NoOpInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, + boostable=None, boostable_into=None, predicates=None, tags=None): super(NoOpInstruction, self).__init__( id=id, @@ -1302,6 +1411,8 @@ class NoOpInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, + boostable=boostable, + boostable_into=boostable_into, predicates=predicates, tags=tags) @@ -1350,6 +1461,7 @@ class BarrierInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, + boostable=None, boostable_into=None, predicates=None, tags=None, synchronization_kind="global", mem_kind="local"): @@ -1366,6 +1478,8 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, + boostable=boostable, + boostable_into=boostable_into, predicates=predicates, tags=tags ) diff --git a/loopy/maxima.py b/loopy/maxima.py new file mode 100644 index 0000000000000000000000000000000000000000..c74360a731fa06644065e743fb9397ea170fb7f3 --- /dev/null +++ b/loopy/maxima.py @@ -0,0 +1,105 @@ +# pylint: disable=all # This code needs porting to modern loopy +"""Export to maxima.""" + +from __future__ import division + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from pymbolic.interop.maxima import \ + MaximaStringifyMapper as MaximaStringifyMapperBase + + +class MaximaStringifyMapper(MaximaStringifyMapperBase): + def map_subscript(self, expr, enclosing_prec): + res = self.rec(expr.aggregate, enclosing_prec) + idx = expr.index + if not isinstance(idx, tuple): + idx = (idx,) + for i in idx: + if isinstance(i, int): + res += "_%d" % i + + return res + + +def get_loopy_instructions_as_maxima(kernel, prefix): + """Sample use for code comparison:: + + load("knl-optFalse.mac"); + load("knl-optTrue.mac"); + + vname: bessel_j_8; + + un_name : concat(''un_, vname); + opt_name : concat(''opt_, vname); + + print(ratsimp(ev(un_name - opt_name))); + """ + from loopy.preprocess import add_boostability_and_automatic_dependencies + kernel = add_boostability_and_automatic_dependencies(kernel) + + my_variable_names = ( + avn + for insn in kernel.instructions + for avn in insn.assignee_var_names() + ) + + from pymbolic import var + subst_dict = dict( + (vn, var(prefix+vn)) for vn in my_variable_names) + + mstr = MaximaStringifyMapper() + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + substitute = SubstitutionMapper(make_subst_func(subst_dict)) + + result = ["ratprint:false;"] + + written_insn_ids = set() + + from loopy.kernel import InstructionBase, Assignment + + def write_insn(insn): + if not isinstance(insn, InstructionBase): + insn = kernel.id_to_insn[insn] + if not isinstance(insn, Assignment): + raise RuntimeError("non-single-output assignment not supported " + "in maxima export") + + for dep in insn.depends_on: + if dep not in written_insn_ids: + write_insn(dep) + + aname, = insn.assignee_var_names() + result.append("%s%s : %s;" % ( + prefix, aname, + mstr(substitute(insn.expression)))) + + written_insn_ids.add(insn.id) + + for insn in kernel.instructions: + if insn.id not in written_insn_ids: + write_insn(insn) + + return "\n".join(result) diff --git a/loopy/options.py b/loopy/options.py index ede2b3f59748072828f5fc4ed5eaaf412d17c0fb..63089d94d3487e77a1def39a98fe24631c508398 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -89,6 +89,12 @@ class Options(ImmutableRecord): Like :attr:`trace_assignments`, but also trace the assigned values. + .. attribute:: ignore_boostable_into + + Ignore the boostable_into field of the kernel, when + determining whether an iname duplication is necessary + for the kernel to be schedulable. + .. attribute:: check_dep_resolution Whether loopy should issue an error if a dependency @@ -205,6 +211,7 @@ class Options(ImmutableRecord): annotate_inames=kwargs.get("annotate_inames", False), trace_assignments=kwargs.get("trace_assignments", False), trace_assignment_values=kwargs.get("trace_assignment_values", False), + ignore_boostable_into=kwargs.get("ignore_boostable_into", False), skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), @@ -221,7 +228,7 @@ class Options(ImmutableRecord): check_dep_resolution=kwargs.get("check_dep_resolution", True), enforce_variable_access_ordered=kwargs.get( - "enforce_variable_access_ordered", True), + "enforce_variable_access_ordered", False), ) # {{{ legacy compatibility diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a231b31ee5913812afd531acd871ec5d2e3e4ded..de81815a82655136941b57b1f78486aed39237da 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1954,6 +1954,114 @@ def realize_ilp(kernel): # }}} +# {{{ find idempotence ("boostability") of instructions + +def find_idempotence(kernel): + logger.debug("%s: idempotence" % kernel.name) + + writer_map = kernel.writer_map() + + arg_names = set(arg.name for arg in kernel.args) + + var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) + + reads_map = dict( + (insn.id, insn.read_dependency_names() & var_names) + for insn in kernel.instructions) + + from collections import defaultdict + dep_graph = defaultdict(set) + + for insn in kernel.instructions: + dep_graph[insn.id] = set(writer_id + for var in reads_map[insn.id] + for writer_id in writer_map.get(var, set())) + + # Find SCCs of dep_graph. These are used for checking if the instruction is + # in a dependency cycle. + from pytools.graph import compute_sccs + + sccs = dict((item, scc) + for scc in compute_sccs(dep_graph) + for item in scc) + + non_idempotently_updated_vars = set() + + new_insns = [] + for insn in kernel.instructions: + boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id] + + if not boostable: + non_idempotently_updated_vars.update( + insn.assignee_var_names()) + + new_insns.append(insn.copy(boostable=boostable)) + + # {{{ remove boostability from isns that access non-idempotently updated vars + + new2_insns = [] + for insn in new_insns: + if insn.boostable and bool( + non_idempotently_updated_vars & insn.dependency_names()): + new2_insns.append(insn.copy(boostable=False)) + else: + new2_insns.append(insn) + + # }}} + + return kernel.copy(instructions=new2_insns) + +# }}} + + +# {{{ limit boostability + +def limit_boostability(kernel): + """Finds out which other inames an instruction's inames occur with + and then limits boostability to just those inames. + """ + + logger.debug("%s: limit boostability" % kernel.name) + + iname_occurs_with = {} + for insn in kernel.instructions: + insn_inames = kernel.insn_inames(insn) + for iname in insn_inames: + iname_occurs_with.setdefault(iname, set()).update(insn_inames) + + iname_use_counts = {} + for insn in kernel.instructions: + for iname in kernel.insn_inames(insn): + iname_use_counts[iname] = iname_use_counts.get(iname, 0) + 1 + + single_use_inames = set(iname for iname, uc in six.iteritems(iname_use_counts) + if uc == 1) + + new_insns = [] + for insn in kernel.instructions: + if insn.boostable is None: + raise LoopyError("insn '%s' has undetermined boostability" % insn.id) + elif insn.boostable: + boostable_into = set() + for iname in kernel.insn_inames(insn): + boostable_into.update(iname_occurs_with[iname]) + + boostable_into -= kernel.insn_inames(insn) | single_use_inames + + # Even if boostable_into is empty, leave boostable flag on--it is used + # for boosting into unused hw axes. + + insn = insn.copy(boostable_into=boostable_into) + else: + insn = insn.copy(boostable_into=set()) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + + # {{{ check for loads of atomic variables def check_atomic_loads(kernel): @@ -2077,6 +2185,10 @@ def preprocess_kernel(kernel, device=None): kernel = find_temporary_address_space(kernel) + # boostability should be removed in 2017.x. + kernel = find_idempotence(kernel) + kernel = limit_boostability(kernel) + # check for atomic loads, much easier to do here now that the dependencies # have been established kernel = check_atomic_loads(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b2b6553c5d9f7a92ec8f61a9b58ab4b54c7c64dc..032cdc2760597f1fa6f701a8a88252312deac797 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -636,6 +636,11 @@ class SchedulerState(ImmutableRecord): A mapping from instruction group names to the number of instructions in them that are left to schedule. If a group name occurs in this mapping, that group is considered active. + + .. attribute:: uses_of_boostability + + Used to produce warnings about deprecated 'boosting' behavior + Should be removed along with boostability in 2017.x. """ @property @@ -647,13 +652,18 @@ class SchedulerState(ImmutableRecord): def generate_loop_schedules_internal( - sched_state, debug=None): + sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. kernel = sched_state.kernel Fore = kernel.options._fore # noqa Style = kernel.options._style # noqa + if allow_boost is None: + rec_allow_boost = None + else: + rec_allow_boost = False + active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( @@ -683,6 +693,7 @@ def generate_loop_schedules_internal( print(75*"=") print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") print(dump_schedule(sched_state.kernel, sched_state.preschedule)) + #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") for iname, val in six.iteritems(sched_state.loop_nest_around_map): @@ -708,6 +719,7 @@ def generate_loop_schedules_internal( within_subkernel=True, may_schedule_global_barriers=False, enclosing_subkernel_inames=sched_state.active_inames), + allow_boost=rec_allow_boost, debug=debug): yield result @@ -721,6 +733,7 @@ def generate_loop_schedules_internal( preschedule=sched_state.preschedule[1:], within_subkernel=False, may_schedule_global_barriers=True), + allow_boost=rec_allow_boost, debug=debug): yield result @@ -739,6 +752,7 @@ def generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:]), + allow_boost=rec_allow_boost, debug=debug): yield result @@ -792,6 +806,15 @@ def generate_loop_schedules_internal( want = kernel.insn_inames(insn) - sched_state.parallel_inames have = active_inames_set - sched_state.parallel_inames + # If insn is boostable, it may be placed inside a more deeply + # nested loop without harm. + + orig_have = have + if allow_boost: + # Note that the inames in 'insn.boostable_into' necessarily won't + # be contained in 'want'. + have = have - insn.boostable_into + if want != have: is_ready = False @@ -897,6 +920,12 @@ def generate_loop_schedules_internal( # }}} + new_uses_of_boostability = [] + if allow_boost: + if orig_have & insn.boostable_into: + new_uses_of_boostability.append( + (insn.id, orig_have & insn.boostable_into)) + new_sched_state = sched_state.copy( scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, @@ -908,6 +937,9 @@ def generate_loop_schedules_internal( if insn_id not in sched_state.prescheduled_insn_ids else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, + uses_of_boostability=( + sched_state.uses_of_boostability + + new_uses_of_boostability) ) # Don't be eager about entering/leaving loops--if progress has been @@ -915,7 +947,7 @@ def generate_loop_schedules_internal( # made. for sub_sched in generate_loop_schedules_internal( new_sched_state, - debug=debug): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -957,10 +989,12 @@ def generate_loop_schedules_internal( # outside of last_entered_loop. for subdep_id in gen_dependencies_except(kernel, insn_id, sched_state.scheduled_insn_ids): + subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - sched_state.parallel_inames) if ( - last_entered_loop not in want): + last_entered_loop not in want and + last_entered_loop not in subdep.boostable_into): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " @@ -1020,7 +1054,7 @@ def generate_loop_schedules_internal( not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - debug=debug): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched return @@ -1135,7 +1169,7 @@ def generate_loop_schedules_internal( for insn_id in reachable_insn_ids: insn = kernel.id_to_insn[insn_id] - want = kernel.insn_inames(insn) + want = kernel.insn_inames(insn) | insn.boostable_into if hypothetically_active_loops <= want: if usefulness is None: @@ -1235,6 +1269,7 @@ def generate_loop_schedules_internal( if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), + allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True yield sub_sched @@ -1259,11 +1294,28 @@ def generate_loop_schedules_internal( # if done, yield result debug.log_success(sched_state.schedule) + for boost_insn_id, boost_inames in sched_state.uses_of_boostability: + warn_with_kernel( + kernel, "used_boostability", + "instruction '%s' was implicitly nested inside " + "inames '%s' based on an idempotence heuristic. " + "This is deprecated and will stop working in loopy 2017.x." + % (boost_insn_id, ", ".join(boost_inames)), + DeprecationWarning) + yield sched_state.schedule else: - if debug is not None: - debug.log_dead_end(sched_state.schedule) + if not allow_boost and allow_boost is not None: + # try again with boosting allowed + for sub_sched in generate_loop_schedules_internal( + sched_state, + allow_boost=True, debug=debug): + yield sub_sched + else: + # dead end + if debug is not None: + debug.log_dead_end(sched_state.schedule) # }}} @@ -1862,9 +1914,13 @@ def generate_loop_schedules_inner(kernel, debug_args={}): parallel_inames=parallel_inames - ilp_inames - vec_inames, group_insn_counts=group_insn_counts(kernel), - active_group_counts={}) + active_group_counts={}, + + uses_of_boostability=[]) schedule_gen_kwargs = {} + if kernel.options.ignore_boostable_into: + schedule_gen_kwargs["allow_boost"] = None def print_longest_dead_end(): if debug.interactive: diff --git a/loopy/statistics.py b/loopy/statistics.py index cbdbdac6e43a75059973d45a8df9de8ec689b10b..32fe7741e1298c99e2baf74f3e08e67fc8b2a63e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1405,6 +1405,10 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1593,6 +1597,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1705,6 +1713,10 @@ def get_synchronization_map(knl, subgroup_size=None): """ + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 33a51f6278f6baa61a3b1078c35c4c886f9be8d3..8432d59ec5b162f6e963abbeae3b2fcabe94cf27 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -986,7 +986,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=None): +def get_iname_duplication_options(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1016,15 +1016,6 @@ def get_iname_duplication_options(knl, use_boostable_into=None): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ - if use_boostable_into: - raise LoopyError("'use_boostable_into=True' is no longer supported.") - - if use_boostable_into is False: - from warnings import warn - warn("passing 'use_boostable_into=False' to 'get_iname_duplication_options'" - " is deprecated. This will be the default in 2021.x.", - DeprecationWarning, stacklevel=2) - from loopy.kernel.data import ConcurrentTag concurrent_inames = set( @@ -1033,12 +1024,23 @@ def get_iname_duplication_options(knl, use_boostable_into=None): if knl.iname_tags_of_type(iname, ConcurrentTag)) # First we extract the minimal necessary information from the kernel - insn_iname_sets = ( - frozenset( - insn.within_inames - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) + if use_boostable_into: + insn_iname_sets = ( + frozenset( + (insn.within_inames + | insn.boostable_into if insn.boostable_into is not None + else frozenset([])) + - concurrent_inames + for insn in knl.instructions) + - + frozenset([frozenset([])])) + else: + insn_iname_sets = ( + frozenset( + insn.within_inames - concurrent_inames + for insn in knl.instructions) + - + frozenset([frozenset([])])) # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_iname_sets): @@ -1047,6 +1049,23 @@ def get_iname_duplication_options(knl, use_boostable_into=None): and knl.iname_tags_of_type(iname, ConcurrentTag)): continue + # If we find a duplication option and to not use boostable_into + # information, we restart this generator with use_boostable_into=True + if not use_boostable_into and not knl.options.ignore_boostable_into: + for option in get_iname_duplication_options(knl, True): + yield option + + # Emit a warning that we needed boostable_into + from warnings import warn + from loopy.diagnostic import LoopyWarning + warn("Kernel '%s' required the deprecated 'boostable_into' " + "instruction attribute in order to be schedulable!" % knl.name, + LoopyWarning) + + # Return to avoid yielding the duplication + # options without boostable_into + return + # Reconstruct an object that may be passed to the within parameter of # loopy.duplicate_inames from loopy.match import Id, Or @@ -1054,7 +1073,9 @@ def get_iname_duplication_options(knl, use_boostable_into=None): Id(insn.id) for insn in knl.instructions if insn.within_inames in insns)) - # Only yield the result if an instruction matched. + # Only yield the result if an instruction matched. With + # use_boostable_into=True this is not always true. + if within.children: yield iname, within diff --git a/loopy/transform/save.py b/loopy/transform/save.py index aef13b237bb1fba52f41d5a910017608b3957161..baa558a72861f31c5ce707329ea84786b96eb6d2 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -562,7 +562,9 @@ class TemporarySaver(object): self.subkernel_to_surrounding_inames[subkernel] | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, - depends_on=depends_on) + depends_on=depends_on, + boostable=False, + boostable_into=frozenset()) if mode == "save": self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) diff --git a/loopy/version.py b/loopy/version.py index 25a82f09f78ebd54fae264b2238f953c7ce16164..4eab69c14d8fb2cded8356a83d208353cd8f19eb 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2020, 2) +VERSION = (2020, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -64,13 +64,17 @@ DATA_MODEL_VERSION = "%s-islpy%s-cgen%s-%s-v1" % ( VERSION_TEXT, _islpy_version, _cgen_version, _git_rev) -FALLBACK_LANGUAGE_VERSION = (2018, 2) +FALLBACK_LANGUAGE_VERSION = (2017, 2, 1) MOST_RECENT_LANGUAGE_VERSION = (2018, 2) LOOPY_USE_LANGUAGE_VERSION_2018_2 = (2018, 2) +LOOPY_USE_LANGUAGE_VERSION_2018_1 = (2018, 1) +LOOPY_USE_LANGUAGE_VERSION_2017_2_1 = (2017, 2, 1) LANGUAGE_VERSION_SYMBOLS = [ "LOOPY_USE_LANGUAGE_VERSION_2018_2", + "LOOPY_USE_LANGUAGE_VERSION_2018_1", + "LOOPY_USE_LANGUAGE_VERSION_2017_2_1", ] __doc__ = """ @@ -98,7 +102,7 @@ language version to let them take advantage of this check. As a result, :mod:`loopy` will now issue a warning when a call to :func:`loopy.make_kernel` does not declare a language version. Such kernels -will (indefinitely) default to language version 2018.2. If passing a +will (indefinitely) default to language version 2017.2.1. If passing a language version to :func:`make_kernel` is impractical, you may also import one of the ``LOOPY_USE_LANGUAGE_VERSION_...`` symbols given below using:: @@ -129,12 +133,10 @@ History of Language Versions .. data:: LOOPY_USE_LANGUAGE_VERSION_2018_1 - :attr:`loopy.Options.enforce_variable_access_ordered` is turned on by - default. Unsupported from :mod:`loopy` version 2020.2 onwards. + :attr:`loopy.Options.enforce_variable_access_ordered` + is turned on by default. .. data:: LOOPY_USE_LANGUAGE_VERSION_2017_2_1 - Initial legacy language version. Unsupported from :mod:`loopy` version - 2020.2 onwards. - + Initial legacy language version. """ diff --git a/test/test_apps.py b/test/test_apps.py index f7eeb756e735ffb4d5ab6ab747c6bb792c690668..71029cc9ce408f8e7fa95eaf3b766864c4beee5b 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -101,11 +101,8 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", - fetch_outer_inames='im_x_outer, im_y_outer, ifeat', - default_tag="l.auto") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y", - fetch_outer_inames='iimg, im_x_outer, im_y_outer, ifeat, icolor', default_tag="l.auto") return knl diff --git a/test/test_dg.py b/test/test_dg.py index 543701a5fb4f2ce8c40851117573d1f72639436c..967dea35071bb3d95c06b2e37d73da29ac019763 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -100,7 +100,6 @@ def test_dg_volume(ctx_factory): knl = lp.tag_inames(knl, dict(n="l.0")) knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1") knl = lp.add_prefetch(knl, "DrDsDt[:,:]", - fetch_outer_inames='k_outer', default_tag="l.auto") return knl diff --git a/test/test_domain.py b/test/test_domain.py index 8962514450f8ee352089104b2ffc1241e323725d..5daf84eaa5b7ffd1647daf4b35acd7a5de91c5d1 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -317,7 +317,7 @@ def test_equality_constraints(ctx_factory): ], [ "a[i,j] = 5 {id=set_all}", - "b[i,k] = 22 {id=set_b, dep=set_all}", + "b[i,k] = 22 {dep=set_all}", ], [ lp.GlobalArg("a,b", dtype, shape="n, n", order=order), @@ -329,9 +329,6 @@ def test_equality_constraints(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - - knl = lp.add_inames_to_insn(knl, 'j_inner, j_outer', 'id:set_b') - #print(knl) #print(knl.domains[0].detect_equalities()) diff --git a/test/test_fortran.py b/test/test_fortran.py index c7270abd29e4e68a110bc6ddc9efa4bc95a45823..3601e96b752f18e6e01bcfcffe49780bda4058b4 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -316,12 +316,8 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", - precompute_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", - precompute_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -496,11 +492,9 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", precompute_inames="ktemp,itemp", - precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", precompute_inames="itemp,k2temp", - precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") ref_knl = knl diff --git a/test/test_linalg.py b/test/test_linalg.py index 390c5654fc0ee5bae631d26e5a0f58e939f8c78b..f075d3493195ec3364c4de0d26f92c4a987e7187 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -186,10 +186,8 @@ def test_plain_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], - fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], - fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -225,12 +223,8 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], - fetch_outer_inames="i_outer, j_outer, k_outer", - default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], - fetch_outer_inames="i_outer, j_outer, k_outer", - default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -269,10 +263,8 @@ def test_funny_shape_matrix_mul(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", - precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", - precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -315,10 +307,8 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", - fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') - knl = lp.add_prefetch(knl, "b", - fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') + knl = lp.add_prefetch(knl, "a") + knl = lp.add_prefetch(knl, "b") return knl def variant_3(knl): @@ -327,15 +317,8 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"], - fetch_outer_inames='i_outer, j_outer, j_inner', - temporary_address_space=lp.AddressSpace.LOCAL, - default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner"], - fetch_outer_inames='i_outer, j_outer, j_inner', - temporary_address_space=lp.AddressSpace.LOCAL, - default_tag="l.auto") - + knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") return knl def variant_4(knl): @@ -344,10 +327,8 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) - knl = lp.add_prefetch(knl, "b", ["j_inner"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) + knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) + knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") @@ -404,7 +385,6 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], - fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -445,10 +425,8 @@ def test_intel_matrix_mul(ctx_factory): #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], - fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"], - fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") # FIXME: Grouped prefetch @@ -550,12 +528,8 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], - fetch_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], - fetch_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -634,12 +608,8 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], - fetch_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], - fetch_outer_inames='i_outer, j_outer, k_outer', - default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], diff --git a/test/test_nbody.py b/test/test_nbody.py index 6016c2f1c9955d3bd58d52ad33a3fa95ed63cff8..5b36ed4163c650317d8656883eeda599a3c21faa 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -77,8 +77,7 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], - ["x_fetch_j", "x_fetch_k"], - fetch_outer_inames='i_outer, j_outer', default_tag=None) + ["x_fetch_j", "x_fetch_k"], default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 57d75b24b59fc7972fb529fa3e6f220c76d84095..54b608a183840cc5d33f1e738f36fc605d16d94a 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -90,8 +90,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e", - default_tag="l.auto") + hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") if opt_level == 1: tap_hsv = hsv diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index fff2b5356e75f414356ea1c61c2dd54753186d26..54c64e0a4d4a23b429eb83be6c0a19f482a1b922 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -82,8 +82,7 @@ def test_tim2d(ctx_factory): def variant_orig(knl): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0")) - knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e', - default_tag="l.auto") + knl = lp.add_prefetch(knl, "D[:,:]", default_tag="l.auto") knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto") knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto") diff --git a/test/test_target.py b/test/test_target.py index afad1b676485091ec49a2a1b4870e96d1bf70539..038b2e6c06116049441fad36d033c5a6831b4dbe 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -281,9 +281,7 @@ def test_numba_cuda_target(): knl = lp.assume(knl, "M>0") knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", - fetch_outer_inames='i_inner, i_outer, j_inner', - default_tag="l.auto") + knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") diff --git a/test/test_transform.py b/test/test_transform.py index a6fb9424d44326d11e4b561b971554dd9ffcd7f4..ffef893b05fbca5a0d244ff17f379e1bb5cf27a1 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -374,8 +374,7 @@ def test_precompute_confusing_subst_arguments(ctx_factory): from loopy.symbolic import get_dependencies assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D", sweep_inames='j', - precompute_outer_inames='j, i_inner, i_outer') + knl = lp.precompute(knl, "D") lp.auto_test_vs_ref( ref_knl, ctx, knl,