From cc775abc7099134381309697ea92a7a29b49547d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 8 Jul 2020 12:44:55 -0500 Subject: [PATCH 1/2] complete deprecation of boostability --- loopy/check.py | 3 - loopy/kernel/creation.py | 2 - loopy/kernel/instruction.py | 122 +++--------------------------------- loopy/maxima.py | 105 ------------------------------- loopy/options.py | 7 --- loopy/preprocess.py | 112 --------------------------------- loopy/schedule/__init__.py | 72 +++------------------ loopy/statistics.py | 12 ---- loopy/transform/iname.py | 55 +++++----------- loopy/transform/save.py | 4 +- 10 files changed, 34 insertions(+), 460 deletions(-) delete mode 100644 loopy/maxima.py diff --git a/loopy/check.py b/loopy/check.py index 4588a59b4..b49a60dff 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -787,9 +787,6 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): insn = kernel.id_to_insn[sched_item.insn_id] i += 1 - if insn.boostable: - continue - group_axes_used = set() local_axes_used = set() diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 9a9046276..6bf8dcb18 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2019,8 +2019,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): raise LoopyError("Language version '%s' is not known." % (lang_version,)) if lang_version >= (2018, 1): options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) # }}} diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 61127232a..befb28a78 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -151,15 +151,14 @@ class InstructionBase(ImmutableRecord): .. automethod:: copy """ - # within_inames_is_final, boostable and boostable_into are deprecated and - # will be removed in version 2017.x. + # within_inames_is_final is deprecated and will be removed in version 2017.x. fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " "within_inames_is_final within_inames " - "priority boostable boostable_into".split()) + "priority".split()) # Names of fields that are pymbolic expressions. Needed for key building pymbolic_fields = set("") @@ -172,31 +171,9 @@ class InstructionBase(ImmutableRecord): no_sync_with, within_inames_is_final, within_inames, priority, - boostable, boostable_into, predicates, tags, - insn_deps=None, insn_deps_is_final=None, + predicates, tags, forced_iname_deps=None, forced_iname_deps_is_final=None): - # {{{ backwards compatibility goop - - if depends_on is not None and insn_deps is not None: - raise LoopyError("may not specify both insn_deps and depends_on") - elif insn_deps is not None: - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - depends_on = insn_deps - depends_on_is_final = insn_deps_is_final - - if forced_iname_deps is not None and within_inames is not None: - raise LoopyError("may not specify both forced_iname_deps " - "and within_inames") - elif forced_iname_deps is not None: - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - - within_inames = forced_iname_deps - within_inames_is_final = forced_iname_deps_is_final - if predicates is None: predicates = frozenset() @@ -217,8 +194,6 @@ class InstructionBase(ImmutableRecord): predicates = frozenset(new_predicates) del new_predicates - # }}} - if depends_on is None: depends_on = frozenset() @@ -283,42 +258,9 @@ class InstructionBase(ImmutableRecord): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) - # {{{ backwards compatibility goop - - @property - def insn_deps(self): - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - return self.depends_on - - # legacy - @property - def insn_deps_is_final(self): - warn("insn_deps_is_final is deprecated, use depends_on_is_final", - DeprecationWarning, stacklevel=2) - - return self.depends_on_is_final - - @property - def forced_iname_deps(self): - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - return self.within_inames - - @property - def forced_iname_deps_is_final(self): - warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final", - DeprecationWarning, stacklevel=2) - return self.within_inames_is_final - - # }}} - # {{{ abstract interface def read_dependency_names(self): @@ -395,18 +337,6 @@ class InstructionBase(ImmutableRecord): def get_str_options(self): result = [] - if self.boostable is True: - if self.boostable_into: - result.append("boostable into '%s'" % ",".join(self.boostable_into)) - else: - result.append("boostable") - elif self.boostable is False: - result.append("not boostable") - elif self.boostable is None: - pass - else: - raise RuntimeError("unexpected value for Instruction.boostable") - if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: @@ -468,21 +398,6 @@ class InstructionBase(ImmutableRecord): # }}} - def copy(self, **kwargs): - if "insn_deps" in kwargs: - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - kwargs["depends_on"] = kwargs.pop("insn_deps") - - if "insn_deps_is_final" in kwargs: - warn("insn_deps_is_final is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - kwargs["depends_on_is_final"] = kwargs.pop("insn_deps_is_final") - - return super(InstructionBase, self).copy(**kwargs) - def __setstate__(self, val): super(InstructionBase, self).__setstate__(val) @@ -912,10 +827,9 @@ class Assignment(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_type=Optional(), atomicity=(), priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, forced_iname_deps=None, forced_iname_deps_is_final=None): super(Assignment, self).__init__( @@ -927,13 +841,9 @@ class Assignment(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) @@ -1051,10 +961,9 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_types=None, priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, forced_iname_deps=None, forced_iname_deps_is_final=None): @@ -1067,13 +976,9 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) @@ -1234,9 +1139,8 @@ class CInstruction(InstructionBase): groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, - priority=0, boostable=None, boostable_into=None, - predicates=frozenset(), tags=None, - insn_deps=None, insn_deps_is_final=None): + priority=0, + predicates=frozenset(), tags=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1255,11 +1159,7 @@ class CInstruction(InstructionBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, - priority=priority, predicates=predicates, tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final) + priority=priority, predicates=predicates, tags=tags) # {{{ normalize iname_exprs @@ -1399,7 +1299,6 @@ class NoOpInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None): super(NoOpInstruction, self).__init__( id=id, @@ -1411,8 +1310,6 @@ class NoOpInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) @@ -1461,7 +1358,6 @@ class BarrierInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None, synchronization_kind="global", mem_kind="local"): @@ -1478,8 +1374,6 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags ) diff --git a/loopy/maxima.py b/loopy/maxima.py deleted file mode 100644 index c74360a73..000000000 --- a/loopy/maxima.py +++ /dev/null @@ -1,105 +0,0 @@ -# pylint: disable=all # This code needs porting to modern loopy -"""Export to maxima.""" - -from __future__ import division - -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - - -from pymbolic.interop.maxima import \ - MaximaStringifyMapper as MaximaStringifyMapperBase - - -class MaximaStringifyMapper(MaximaStringifyMapperBase): - def map_subscript(self, expr, enclosing_prec): - res = self.rec(expr.aggregate, enclosing_prec) - idx = expr.index - if not isinstance(idx, tuple): - idx = (idx,) - for i in idx: - if isinstance(i, int): - res += "_%d" % i - - return res - - -def get_loopy_instructions_as_maxima(kernel, prefix): - """Sample use for code comparison:: - - load("knl-optFalse.mac"); - load("knl-optTrue.mac"); - - vname: bessel_j_8; - - un_name : concat(''un_, vname); - opt_name : concat(''opt_, vname); - - print(ratsimp(ev(un_name - opt_name))); - """ - from loopy.preprocess import add_boostability_and_automatic_dependencies - kernel = add_boostability_and_automatic_dependencies(kernel) - - my_variable_names = ( - avn - for insn in kernel.instructions - for avn in insn.assignee_var_names() - ) - - from pymbolic import var - subst_dict = dict( - (vn, var(prefix+vn)) for vn in my_variable_names) - - mstr = MaximaStringifyMapper() - from loopy.symbolic import SubstitutionMapper - from pymbolic.mapper.substitutor import make_subst_func - substitute = SubstitutionMapper(make_subst_func(subst_dict)) - - result = ["ratprint:false;"] - - written_insn_ids = set() - - from loopy.kernel import InstructionBase, Assignment - - def write_insn(insn): - if not isinstance(insn, InstructionBase): - insn = kernel.id_to_insn[insn] - if not isinstance(insn, Assignment): - raise RuntimeError("non-single-output assignment not supported " - "in maxima export") - - for dep in insn.depends_on: - if dep not in written_insn_ids: - write_insn(dep) - - aname, = insn.assignee_var_names() - result.append("%s%s : %s;" % ( - prefix, aname, - mstr(substitute(insn.expression)))) - - written_insn_ids.add(insn.id) - - for insn in kernel.instructions: - if insn.id not in written_insn_ids: - write_insn(insn) - - return "\n".join(result) diff --git a/loopy/options.py b/loopy/options.py index 63089d94d..c7acd77c6 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -89,12 +89,6 @@ class Options(ImmutableRecord): Like :attr:`trace_assignments`, but also trace the assigned values. - .. attribute:: ignore_boostable_into - - Ignore the boostable_into field of the kernel, when - determining whether an iname duplication is necessary - for the kernel to be schedulable. - .. attribute:: check_dep_resolution Whether loopy should issue an error if a dependency @@ -211,7 +205,6 @@ class Options(ImmutableRecord): annotate_inames=kwargs.get("annotate_inames", False), trace_assignments=kwargs.get("trace_assignments", False), trace_assignment_values=kwargs.get("trace_assignment_values", False), - ignore_boostable_into=kwargs.get("ignore_boostable_into", False), skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), diff --git a/loopy/preprocess.py b/loopy/preprocess.py index de81815a8..a231b31ee 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1954,114 +1954,6 @@ def realize_ilp(kernel): # }}} -# {{{ find idempotence ("boostability") of instructions - -def find_idempotence(kernel): - logger.debug("%s: idempotence" % kernel.name) - - writer_map = kernel.writer_map() - - arg_names = set(arg.name for arg in kernel.args) - - var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) - - reads_map = dict( - (insn.id, insn.read_dependency_names() & var_names) - for insn in kernel.instructions) - - from collections import defaultdict - dep_graph = defaultdict(set) - - for insn in kernel.instructions: - dep_graph[insn.id] = set(writer_id - for var in reads_map[insn.id] - for writer_id in writer_map.get(var, set())) - - # Find SCCs of dep_graph. These are used for checking if the instruction is - # in a dependency cycle. - from pytools.graph import compute_sccs - - sccs = dict((item, scc) - for scc in compute_sccs(dep_graph) - for item in scc) - - non_idempotently_updated_vars = set() - - new_insns = [] - for insn in kernel.instructions: - boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id] - - if not boostable: - non_idempotently_updated_vars.update( - insn.assignee_var_names()) - - new_insns.append(insn.copy(boostable=boostable)) - - # {{{ remove boostability from isns that access non-idempotently updated vars - - new2_insns = [] - for insn in new_insns: - if insn.boostable and bool( - non_idempotently_updated_vars & insn.dependency_names()): - new2_insns.append(insn.copy(boostable=False)) - else: - new2_insns.append(insn) - - # }}} - - return kernel.copy(instructions=new2_insns) - -# }}} - - -# {{{ limit boostability - -def limit_boostability(kernel): - """Finds out which other inames an instruction's inames occur with - and then limits boostability to just those inames. - """ - - logger.debug("%s: limit boostability" % kernel.name) - - iname_occurs_with = {} - for insn in kernel.instructions: - insn_inames = kernel.insn_inames(insn) - for iname in insn_inames: - iname_occurs_with.setdefault(iname, set()).update(insn_inames) - - iname_use_counts = {} - for insn in kernel.instructions: - for iname in kernel.insn_inames(insn): - iname_use_counts[iname] = iname_use_counts.get(iname, 0) + 1 - - single_use_inames = set(iname for iname, uc in six.iteritems(iname_use_counts) - if uc == 1) - - new_insns = [] - for insn in kernel.instructions: - if insn.boostable is None: - raise LoopyError("insn '%s' has undetermined boostability" % insn.id) - elif insn.boostable: - boostable_into = set() - for iname in kernel.insn_inames(insn): - boostable_into.update(iname_occurs_with[iname]) - - boostable_into -= kernel.insn_inames(insn) | single_use_inames - - # Even if boostable_into is empty, leave boostable flag on--it is used - # for boosting into unused hw axes. - - insn = insn.copy(boostable_into=boostable_into) - else: - insn = insn.copy(boostable_into=set()) - - new_insns.append(insn) - - return kernel.copy(instructions=new_insns) - -# }}} - - # {{{ check for loads of atomic variables def check_atomic_loads(kernel): @@ -2185,10 +2077,6 @@ def preprocess_kernel(kernel, device=None): kernel = find_temporary_address_space(kernel) - # boostability should be removed in 2017.x. - kernel = find_idempotence(kernel) - kernel = limit_boostability(kernel) - # check for atomic loads, much easier to do here now that the dependencies # have been established kernel = check_atomic_loads(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 032cdc276..b2b6553c5 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -636,11 +636,6 @@ class SchedulerState(ImmutableRecord): A mapping from instruction group names to the number of instructions in them that are left to schedule. If a group name occurs in this mapping, that group is considered active. - - .. attribute:: uses_of_boostability - - Used to produce warnings about deprecated 'boosting' behavior - Should be removed along with boostability in 2017.x. """ @property @@ -652,18 +647,13 @@ class SchedulerState(ImmutableRecord): def generate_loop_schedules_internal( - sched_state, allow_boost=False, debug=None): + sched_state, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. kernel = sched_state.kernel Fore = kernel.options._fore # noqa Style = kernel.options._style # noqa - if allow_boost is None: - rec_allow_boost = None - else: - rec_allow_boost = False - active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( @@ -693,7 +683,6 @@ def generate_loop_schedules_internal( print(75*"=") print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") print(dump_schedule(sched_state.kernel, sched_state.preschedule)) - #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") for iname, val in six.iteritems(sched_state.loop_nest_around_map): @@ -719,7 +708,6 @@ def generate_loop_schedules_internal( within_subkernel=True, may_schedule_global_barriers=False, enclosing_subkernel_inames=sched_state.active_inames), - allow_boost=rec_allow_boost, debug=debug): yield result @@ -733,7 +721,6 @@ def generate_loop_schedules_internal( preschedule=sched_state.preschedule[1:], within_subkernel=False, may_schedule_global_barriers=True), - allow_boost=rec_allow_boost, debug=debug): yield result @@ -752,7 +739,6 @@ def generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:]), - allow_boost=rec_allow_boost, debug=debug): yield result @@ -806,15 +792,6 @@ def generate_loop_schedules_internal( want = kernel.insn_inames(insn) - sched_state.parallel_inames have = active_inames_set - sched_state.parallel_inames - # If insn is boostable, it may be placed inside a more deeply - # nested loop without harm. - - orig_have = have - if allow_boost: - # Note that the inames in 'insn.boostable_into' necessarily won't - # be contained in 'want'. - have = have - insn.boostable_into - if want != have: is_ready = False @@ -920,12 +897,6 @@ def generate_loop_schedules_internal( # }}} - new_uses_of_boostability = [] - if allow_boost: - if orig_have & insn.boostable_into: - new_uses_of_boostability.append( - (insn.id, orig_have & insn.boostable_into)) - new_sched_state = sched_state.copy( scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, @@ -937,9 +908,6 @@ def generate_loop_schedules_internal( if insn_id not in sched_state.prescheduled_insn_ids else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, - uses_of_boostability=( - sched_state.uses_of_boostability - + new_uses_of_boostability) ) # Don't be eager about entering/leaving loops--if progress has been @@ -947,7 +915,7 @@ def generate_loop_schedules_internal( # made. for sub_sched in generate_loop_schedules_internal( new_sched_state, - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -989,12 +957,10 @@ def generate_loop_schedules_internal( # outside of last_entered_loop. for subdep_id in gen_dependencies_except(kernel, insn_id, sched_state.scheduled_insn_ids): - subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - sched_state.parallel_inames) if ( - last_entered_loop not in want and - last_entered_loop not in subdep.boostable_into): + last_entered_loop not in want): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " @@ -1054,7 +1020,7 @@ def generate_loop_schedules_internal( not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched return @@ -1169,7 +1135,7 @@ def generate_loop_schedules_internal( for insn_id in reachable_insn_ids: insn = kernel.id_to_insn[insn_id] - want = kernel.insn_inames(insn) | insn.boostable_into + want = kernel.insn_inames(insn) if hypothetically_active_loops <= want: if usefulness is None: @@ -1269,7 +1235,6 @@ def generate_loop_schedules_internal( if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True yield sub_sched @@ -1294,28 +1259,11 @@ def generate_loop_schedules_internal( # if done, yield result debug.log_success(sched_state.schedule) - for boost_insn_id, boost_inames in sched_state.uses_of_boostability: - warn_with_kernel( - kernel, "used_boostability", - "instruction '%s' was implicitly nested inside " - "inames '%s' based on an idempotence heuristic. " - "This is deprecated and will stop working in loopy 2017.x." - % (boost_insn_id, ", ".join(boost_inames)), - DeprecationWarning) - yield sched_state.schedule else: - if not allow_boost and allow_boost is not None: - # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, - allow_boost=True, debug=debug): - yield sub_sched - else: - # dead end - if debug is not None: - debug.log_dead_end(sched_state.schedule) + if debug is not None: + debug.log_dead_end(sched_state.schedule) # }}} @@ -1914,13 +1862,9 @@ def generate_loop_schedules_inner(kernel, debug_args={}): parallel_inames=parallel_inames - ilp_inames - vec_inames, group_insn_counts=group_insn_counts(kernel), - active_group_counts={}, - - uses_of_boostability=[]) + active_group_counts={}) schedule_gen_kwargs = {} - if kernel.options.ignore_boostable_into: - schedule_gen_kwargs["allow_boost"] = None def print_longest_dead_end(): if debug.interactive: diff --git a/loopy/statistics.py b/loopy/statistics.py index 32fe7741e..cbdbdac6e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1405,10 +1405,6 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1597,10 +1593,6 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - subgroup_size = _process_subgroup_size(knl, subgroup_size) from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1713,10 +1705,6 @@ def get_synchronization_map(knl, subgroup_size=None): """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 8432d59ec..33a51f627 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -986,7 +986,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options(knl, use_boostable_into=None): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1016,6 +1016,15 @@ def get_iname_duplication_options(knl, use_boostable_into=False): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if use_boostable_into: + raise LoopyError("'use_boostable_into=True' is no longer supported.") + + if use_boostable_into is False: + from warnings import warn + warn("passing 'use_boostable_into=False' to 'get_iname_duplication_options'" + " is deprecated. This will be the default in 2021.x.", + DeprecationWarning, stacklevel=2) + from loopy.kernel.data import ConcurrentTag concurrent_inames = set( @@ -1024,23 +1033,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False): if knl.iname_tags_of_type(iname, ConcurrentTag)) # First we extract the minimal necessary information from the kernel - if use_boostable_into: - insn_iname_sets = ( - frozenset( - (insn.within_inames - | insn.boostable_into if insn.boostable_into is not None - else frozenset([])) - - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) - else: - insn_iname_sets = ( - frozenset( - insn.within_inames - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) + insn_iname_sets = ( + frozenset( + insn.within_inames - concurrent_inames + for insn in knl.instructions) + - + frozenset([frozenset([])])) # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_iname_sets): @@ -1049,23 +1047,6 @@ def get_iname_duplication_options(knl, use_boostable_into=False): and knl.iname_tags_of_type(iname, ConcurrentTag)): continue - # If we find a duplication option and to not use boostable_into - # information, we restart this generator with use_boostable_into=True - if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): - yield option - - # Emit a warning that we needed boostable_into - from warnings import warn - from loopy.diagnostic import LoopyWarning - warn("Kernel '%s' required the deprecated 'boostable_into' " - "instruction attribute in order to be schedulable!" % knl.name, - LoopyWarning) - - # Return to avoid yielding the duplication - # options without boostable_into - return - # Reconstruct an object that may be passed to the within parameter of # loopy.duplicate_inames from loopy.match import Id, Or @@ -1073,9 +1054,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): Id(insn.id) for insn in knl.instructions if insn.within_inames in insns)) - # Only yield the result if an instruction matched. With - # use_boostable_into=True this is not always true. - + # Only yield the result if an instruction matched. if within.children: yield iname, within diff --git a/loopy/transform/save.py b/loopy/transform/save.py index baa558a72..aef13b237 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -562,9 +562,7 @@ class TemporarySaver(object): self.subkernel_to_surrounding_inames[subkernel] | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, - depends_on=depends_on, - boostable=False, - boostable_into=frozenset()) + depends_on=depends_on) if mode == "save": self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) -- GitLab From 4ca8fbdb33a706dfe0036561389650b9bae3ec0f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 10 Jul 2020 11:37:50 -0500 Subject: [PATCH 2/2] updates tests to get rid of boostability assumptions with patterns like: - adds outer inames to prefetches/precomputes to ensure all instructions are within all the hardware axes of the kernel. - adds inames to instructions to ensure all instructions are within all the hardware axes of the kernel. --- examples/fortran/matmul.floopy | 8 +++-- examples/python/rank-one.py | 22 ++++++++++---- test/test_apps.py | 5 +++- test/test_dg.py | 1 + test/test_domain.py | 5 +++- test/test_fortran.py | 10 +++++-- test/test_linalg.py | 54 ++++++++++++++++++++++++++-------- test/test_nbody.py | 3 +- test/test_numa_diff.py | 3 +- test/test_sem_reagan.py | 3 +- test/test_target.py | 4 ++- test/test_transform.py | 3 +- 12 files changed, 92 insertions(+), 29 deletions(-) diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b3552204..23840f09a 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,7 +22,11 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") ! RESULT = [dgemm] !$loopy end diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index b8da89c6c..eda11fc15 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -33,8 +33,10 @@ evt, (c,) = knl(queue, a=a, b=b) split_knl = knl # PREFETCH1BEGIN -knl = lp.add_prefetch(knl, "a") -knl = lp.add_prefetch(knl, "b") +knl = lp.add_prefetch(knl, "a", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') +knl = lp.add_prefetch(knl, "b", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') # PREFETCH1END knl = lp.set_options(knl, write_code=True) @@ -43,8 +45,14 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) @@ -58,8 +66,10 @@ knl = lp.split_iname(knl, "i", 256, knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") diff --git a/test/test_apps.py b/test/test_apps.py index 71029cc9c..f7eeb756e 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -101,8 +101,11 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", + fetch_outer_inames='im_x_outer, im_y_outer, ifeat', + default_tag="l.auto") knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y", + fetch_outer_inames='iimg, im_x_outer, im_y_outer, ifeat, icolor', default_tag="l.auto") return knl diff --git a/test/test_dg.py b/test/test_dg.py index 967dea350..543701a5f 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -100,6 +100,7 @@ def test_dg_volume(ctx_factory): knl = lp.tag_inames(knl, dict(n="l.0")) knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1") knl = lp.add_prefetch(knl, "DrDsDt[:,:]", + fetch_outer_inames='k_outer', default_tag="l.auto") return knl diff --git a/test/test_domain.py b/test/test_domain.py index 5daf84eaa..896251445 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -317,7 +317,7 @@ def test_equality_constraints(ctx_factory): ], [ "a[i,j] = 5 {id=set_all}", - "b[i,k] = 22 {dep=set_all}", + "b[i,k] = 22 {id=set_b, dep=set_all}", ], [ lp.GlobalArg("a,b", dtype, shape="n, n", order=order), @@ -329,6 +329,9 @@ def test_equality_constraints(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + + knl = lp.add_inames_to_insn(knl, 'j_inner, j_outer', 'id:set_b') + #print(knl) #print(knl.domains[0].detect_equalities()) diff --git a/test/test_fortran.py b/test/test_fortran.py index 3601e96b7..c7270abd2 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -316,8 +316,12 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -492,9 +496,11 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", precompute_inames="ktemp,itemp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", precompute_inames="itemp,k2temp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") ref_knl = knl diff --git a/test/test_linalg.py b/test/test_linalg.py index f075d3493..390c5654f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -186,8 +186,10 @@ def test_plain_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -223,8 +225,12 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -263,8 +269,10 @@ def test_funny_shape_matrix_mul(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -307,8 +315,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a") - knl = lp.add_prefetch(knl, "b") + knl = lp.add_prefetch(knl, "a", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') + knl = lp.add_prefetch(knl, "b", + fetch_outer_inames='i_outer, i_inner, j_outer, j_inner') return knl def variant_3(knl): @@ -317,8 +327,15 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer, j_inner', + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + return knl def variant_4(knl): @@ -327,8 +344,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") @@ -385,6 +404,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -425,8 +445,10 @@ def test_intel_matrix_mul(ctx_factory): #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"], + fetch_outer_inames='i_outer, j_outer, k_outer', default_tag="l.auto") # FIXME: Grouped prefetch @@ -528,8 +550,12 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -608,8 +634,12 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") + knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], + fetch_outer_inames='i_outer, j_outer, k_outer', + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], diff --git a/test/test_nbody.py b/test/test_nbody.py index 5b36ed416..6016c2f1c 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -77,7 +77,8 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], - ["x_fetch_j", "x_fetch_k"], default_tag=None) + ["x_fetch_j", "x_fetch_k"], + fetch_outer_inames='i_outer, j_outer', default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 54b608a18..57d75b24b 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -90,7 +90,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") + hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e", + default_tag="l.auto") if opt_level == 1: tap_hsv = hsv diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index 54c64e0a4..fff2b5356 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -82,7 +82,8 @@ def test_tim2d(ctx_factory): def variant_orig(knl): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0")) - knl = lp.add_prefetch(knl, "D[:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e', + default_tag="l.auto") knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto") knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto") diff --git a/test/test_target.py b/test/test_target.py index 038b2e6c0..afad1b676 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -281,7 +281,9 @@ def test_numba_cuda_target(): knl = lp.assume(knl, "M>0") knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "X[i,:]", + fetch_outer_inames='i_inner, i_outer, j_inner', + default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") diff --git a/test/test_transform.py b/test/test_transform.py index ffef893b0..a6fb9424d 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -374,7 +374,8 @@ def test_precompute_confusing_subst_arguments(ctx_factory): from loopy.symbolic import get_dependencies assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D") + knl = lp.precompute(knl, "D", sweep_inames='j', + precompute_outer_inames='j, i_inner, i_outer') lp.auto_test_vs_ref( ref_knl, ctx, knl, -- GitLab