From 22bb8c78378a0477df04b2da4f4a2e8afd284f62 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 1 May 2018 17:41:37 +0100 Subject: [PATCH 1/8] packing arguments for external functions --- loopy/preprocess.py | 144 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1b1d9be38..321f31e45 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2282,6 +2282,147 @@ def infer_arg_descr(kernel): # }}} +# {{{ + +def need_packing(tags_needed, tags): + if len(tags_needed) != len(tags): + return True + + strides_needed = (tag.stride for tag in tags_needed) + strides = (tag.stride for tag in tags) + return any(s1!=s2 for s1, s2 in zip(strides_needed, strides)) + +def add_pack_and_unpack(kernel): + """ + """ + + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + new_calls = {} + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + + callable = kernel.scoped_functions[call.expression.function.name] + from loopy.kernel.function_interface import CallableKernel + if isinstance(callable, CallableKernel): + # Not external functions + continue + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = call.expression.parameters + packing = [] + new_params = [] + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = set(iname for iname in call.within_inames if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + 
if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + from loopy.symbolic import SubArrayRef + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + for i,p in enumerate(parameters): + if isinstance(p, SubArrayRef): + des = callable.arg_id_to_descr[i] + name = p.subscript.aggregate.name + if name in kernel.temporary_variables: + array = kernel.temporary_variables[name] + else: + assert name in kernel.arg_dict + array = kernel.arg_dict[name] + dim_tags, _ = p.get_sub_array_dim_tags_and_shape(array.dim_tags, array.shape) + # Check if memory layout match + if need_packing(des.dim_tags, dim_tags): + new_swept_inames = ilp_inames_map.copy() + for iname in p.swept_inames: + new_swept_inames[iname] = var(vng(iname.name + "_pack")) + new_domain = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, new_swept_inames[var(old_iname)].name) + new_domains.append(new_domain) + + pack_name = vng(name + "_pack") + + from loopy.kernel.data import TemporaryVariable + + pack_tmp = TemporaryVariable( + name=pack_name, + shape=des.shape, + dtype=array.dtype, + scope=array.scope, + dim_tags=des.dim_tags + ) + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + subst_mapper = SubstitutionMapper(make_subst_func(new_swept_inames)) + + packing.append(Assignment( + assignee=var(pack_name).index(tuple(new_swept_inames[i] for i in p.swept_inames)), + expression=subst_mapper.map_subscript(p.subscript), + within_inames=call.within_inames - ilp_inames | set(new_swept_inames[i].name for i in p.swept_inames) | new_ilp_inames, + depends_on=call.depends_on, + id=ing(call.id+"_pack") + )) + new_params.append(SubArrayRef(p.swept_inames, var(pack_name).index(p.swept_inames))) + else: + 
new_params.append(p) + else: + new_params.append(p) + if packing: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + _call = call.with_transformed_expressions(subst_mapper) + new_expr = _call.expression.function() + new_params = list(map(subst_mapper, new_params)) + packing.append( + _call.copy( + depends_on=_call.depends_on | set(pack.id for pack in packing), + within_inames=_call.within_inames - ilp_inames | new_ilp_inames, + expression=_call.expression.function(*new_params) + ) + ) + new_calls[call] = packing + + if new_calls: + new_instructions = [] + for insn in kernel.instructions: + if insn in new_calls: + new_instructions.extend(new_calls[insn]) + else: + new_instructions.append(insn) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + instructions=new_instructions, + temporary_variables=new_tmps + ) + return kernel + +# }}} + + # {{{ class HWAxesInferenceMapper(CombineMapper): @@ -2814,6 +2955,9 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) + # packing args for external functions if necessary + kernel = add_pack_and_unpack(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) -- GitLab From 68ac270e677944468eb20c93ad6088d277c8af74 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 09:14:52 -0500 Subject: [PATCH 2/8] Added some changes to TJs code. 
--- loopy/kernel/function_interface.py | 24 ++- loopy/preprocess.py | 146 +------------- loopy/transform/pack_and_unpack_args.py | 250 ++++++++++++++++++++++++ loopy/transform/register_callable.py | 8 +- 4 files changed, 277 insertions(+), 151 deletions(-) create mode 100644 loopy/transform/pack_and_unpack_args.py diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 79c9cb2e1..91d9b2911 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -439,12 +439,12 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", + fields = set(["name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target", "inline"]) - init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", + init_arg_names = ("name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target", "inline") - def __init__(self, subkernel, arg_id_to_dtype=None, + def __init__(self, name, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None, inline=False): super(CallableKernel, self).__init__( @@ -453,6 +453,7 @@ class CallableKernel(InKernelCallable): if name_in_target is not None: subkernel = subkernel.copy(name=name_in_target) + self.name = name self.name_in_target = name_in_target self.inline = inline self.subkernel = subkernel.copy( @@ -533,6 +534,23 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) + def with_packing_for_args(self): + from loopy.preprocess import preprocess_kernel + subkernel = preprocess_kernel(self.subkernel) + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + mem_scope='Global') + + 
return self.copy(subkernel=subkernel, + arg_id_to_descr=arg_id_to_descr) + def with_hw_axes_sizes(self, gsize, lsize): return self.copy( subkernel=self.subkernel.copy( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 321f31e45..3cf1e1df9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2282,147 +2282,6 @@ def infer_arg_descr(kernel): # }}} -# {{{ - -def need_packing(tags_needed, tags): - if len(tags_needed) != len(tags): - return True - - strides_needed = (tag.stride for tag in tags_needed) - strides = (tag.stride for tag in tags) - return any(s1!=s2 for s1, s2 in zip(strides_needed, strides)) - -def add_pack_and_unpack(kernel): - """ - """ - - new_domains = [] - new_tmps = kernel.temporary_variables.copy() - new_calls = {} - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - - callable = kernel.scoped_functions[call.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(callable, CallableKernel): - # Not external functions - continue - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - - parameters = call.expression.parameters - packing = [] - new_params = [] - - from loopy.kernel.data import IlpBaseTag, VectorizeTag - import islpy as isl - from pymbolic import var - - dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in call.within_inames if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) - new_ilp_inames = set() - ilp_inames_map = {} - for iname in ilp_inames: - new_iname_name = vng(iname + "_ilp") - ilp_inames_map[var(iname)] = var(new_iname_name) - new_ilp_inames.add(new_iname_name) - for iname in ilp_inames: - new_domain = kernel.get_inames_domain(iname).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - if old_iname in ilp_inames: - new_domain = new_domain.set_dim_name( - dim_type, i, ilp_inames_map[var(old_iname)].name) - 
new_domains.append(new_domain) - - from loopy.symbolic import SubArrayRef - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - - for i,p in enumerate(parameters): - if isinstance(p, SubArrayRef): - des = callable.arg_id_to_descr[i] - name = p.subscript.aggregate.name - if name in kernel.temporary_variables: - array = kernel.temporary_variables[name] - else: - assert name in kernel.arg_dict - array = kernel.arg_dict[name] - dim_tags, _ = p.get_sub_array_dim_tags_and_shape(array.dim_tags, array.shape) - # Check if memory layout match - if need_packing(des.dim_tags, dim_tags): - new_swept_inames = ilp_inames_map.copy() - for iname in p.swept_inames: - new_swept_inames[iname] = var(vng(iname.name + "_pack")) - new_domain = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, new_swept_inames[var(old_iname)].name) - new_domains.append(new_domain) - - pack_name = vng(name + "_pack") - - from loopy.kernel.data import TemporaryVariable - - pack_tmp = TemporaryVariable( - name=pack_name, - shape=des.shape, - dtype=array.dtype, - scope=array.scope, - dim_tags=des.dim_tags - ) - new_tmps[pack_name] = pack_tmp - - from loopy import Assignment - subst_mapper = SubstitutionMapper(make_subst_func(new_swept_inames)) - - packing.append(Assignment( - assignee=var(pack_name).index(tuple(new_swept_inames[i] for i in p.swept_inames)), - expression=subst_mapper.map_subscript(p.subscript), - within_inames=call.within_inames - ilp_inames | set(new_swept_inames[i].name for i in p.swept_inames) | new_ilp_inames, - depends_on=call.depends_on, - id=ing(call.id+"_pack") - )) - new_params.append(SubArrayRef(p.swept_inames, var(pack_name).index(p.swept_inames))) - else: - new_params.append(p) - else: - new_params.append(p) - if packing: - subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) - 
_call = call.with_transformed_expressions(subst_mapper) - new_expr = _call.expression.function() - new_params = list(map(subst_mapper, new_params)) - packing.append( - _call.copy( - depends_on=_call.depends_on | set(pack.id for pack in packing), - within_inames=_call.within_inames - ilp_inames | new_ilp_inames, - expression=_call.expression.function(*new_params) - ) - ) - new_calls[call] = packing - - if new_calls: - new_instructions = [] - for insn in kernel.instructions: - if insn in new_calls: - new_instructions.extend(new_calls[insn]) - else: - new_instructions.append(insn) - kernel = kernel.copy( - domains=kernel.domains + new_domains, - instructions=new_instructions, - temporary_variables=new_tmps - ) - return kernel - -# }}} - - # {{{ class HWAxesInferenceMapper(CombineMapper): @@ -2955,11 +2814,8 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) - # packing args for external functions if necessary - kernel = add_pack_and_unpack(kernel) - # tuning the functions in the kernel to align with the grid sizes. - kernel = infer_hw_axes_sizes(kernel) + # kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. 
kernel = find_idempotence(kernel) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 000000000..f6a748eef --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,250 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Tianjiao Sun" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: pack_and_unpack_args_for_call +""" + + +# {{{ main entrypoint + +def pack_and_unpack_args_for_call(kernel, call_name, args=None): + """ + """ + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + new_calls = {} + + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + # pack and unpack call only be done for CallInstructions. 
+ continue + + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args is None: + args = [par.subscript.aggregate.name for par in parameters if + isinstance(par, SubArrayRef)] + [assignee.subscript.aggregate.name for + assignee in insn.assignees if isinstance(assignee, SubArrayRef)] + + # {{{ sanity checks for args + + for arg in args: + found_sub_array_ref = False + for par in parameters + insn.assignees: + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." % (arg, call_name, insn.id)) + + # }}} + + packing = [] + unpacking = [] + new_id_to_parameters = {} + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = set(iname for iname in insn.within_inames if isinstance( + kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + id_to_parameters = 
tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + + for id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: + new_swept_inames = ilp_inames_map.copy() + for iname in p.swept_inames: + new_swept_inames[iname] = var(vng(iname.name + "_pack")) + new_domain = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, new_swept_inames[var(old_iname)].name) + new_domains.append(new_domain) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=kernel.arg_dict[arg].dtype, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + subst_mapper = SubstitutionMapper(make_subst_func( + new_swept_inames)) + + # {{{ getting the lhs assignee + + arg_in_caller = kernel.arg_dict[arg] + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + lhs_assignee = subst_mapper(var(pack_name).index(new_indices)) + + # }}} + + packing.append(Assignment( + assignee=lhs_assignee, + expression=subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_swept_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack") + )) + + unpacking.append(Assignment( + expression=lhs_assignee, 
+ assignee=subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_swept_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + depends_on=frozenset([insn.id]), + id=ing(insn.id+"_unpack") + )) + + # {{{ getting the new swept inames + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[id] = SubArrayRef(tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[id] = p + + if packing: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_insn = insn.with_transformed_expressions(subst_mapper) + new_params = [new_id_to_parameters[i] for i, _ in + enumerate(parameters)] + new_assignees = [new_id_to_parameters[-i-1] for i, _ in + enumerate(insn.assignees)] + new_params = [subst_mapper(p) for p in new_params] + new_assignees = tuple(subst_mapper(a) for a in new_assignees) + packing.append( + new_insn.copy( + depends_on=new_insn.depends_on | set( + pack.id for pack in packing), + within_inames=new_insn.within_inames - ilp_inames | ( + new_ilp_inames), + expression=new_insn.expression.function(*new_params), + assignees=new_assignees + ) + ) + new_calls[insn] = packing + unpacking + + if new_calls: + new_instructions = [] + for insn in kernel.instructions: + if insn in new_calls: + new_instructions.extend(new_calls[insn]) + else: + new_instructions.append(insn) + kernel = kernel.copy( + domains=kernel.domains + new_domains, 
+ instructions=new_instructions, + temporary_variables=new_tmps + ) + + return kernel + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 8300fa374..1204c9c13 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -126,9 +126,11 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, # making the target of the child kernel to be same as the target of parent # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target, - is_master_kernel=False), inline=inline) + callable_kernel = CallableKernel(name=function_name, + subkernel=callee_kernel.copy( + target=caller_kernel.target, + is_master_kernel=False), + inline=inline) # disabling global barriers for callee kernel from loopy import set_options -- GitLab From 4af8ce256a040725ff7c41905f64916dd61cd2f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 16:02:28 -0500 Subject: [PATCH 3/8] Added pack, unpack. Remaining to comment the code. 
--- loopy/kernel/function_interface.py | 6 +-- loopy/preprocess.py | 2 +- loopy/transform/pack_and_unpack_args.py | 58 ++++++++++++++++--------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 91d9b2911..cb05a65b8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -535,20 +535,18 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=arg_id_to_descr) def with_packing_for_args(self): - from loopy.preprocess import preprocess_kernel - subkernel = preprocess_kernel(self.subkernel) kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) arg_id_to_descr = {} for pos, kw in pos_to_kw.items(): - arg = subkernel.arg_dict[kw] + arg = self.subkernel.arg_dict[kw] arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, dim_tags=arg.dim_tags, mem_scope='Global') - return self.copy(subkernel=subkernel, + return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) def with_hw_axes_sizes(self, gsize, lsize): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3cf1e1df9..1b1d9be38 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2815,7 +2815,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_arg_descr(kernel) # tuning the functions in the kernel to align with the grid sizes. - # kernel = infer_hw_axes_sizes(kernel) + kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. 
kernel = find_idempotence(kernel) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index f6a748eef..853719c71 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -113,15 +113,21 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): for id, p in id_to_parameters: if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: - new_swept_inames = ilp_inames_map.copy() + new_pack_inames = ilp_inames_map.copy() + new_unpack_inames = ilp_inames_map.copy() for iname in p.swept_inames: - new_swept_inames[iname] = var(vng(iname.name + "_pack")) - new_domain = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, new_swept_inames[var(old_iname)].name) - new_domains.append(new_domain) + new_pack_inames[iname] = var(vng(iname.name + "_pack")) + new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) arg = p.subscript.aggregate.name pack_name = vng(arg + "_pack") @@ -132,14 +138,18 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): pack_tmp = TemporaryVariable( name=pack_name, dtype=kernel.arg_dict[arg].dtype, + dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[id].shape, scope=temp_var_scope.PRIVATE, ) new_tmps[pack_name] = pack_tmp from loopy import 
Assignment - subst_mapper = SubstitutionMapper(make_subst_func( - new_swept_inames)) + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) # {{{ getting the lhs assignee @@ -159,28 +169,32 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_indices = tuple(simplify_via_aff(i) for i in new_indices) - lhs_assignee = subst_mapper(var(pack_name).index(new_indices)) + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) # }}} packing.append(Assignment( - assignee=lhs_assignee, - expression=subst_mapper.map_subscript(p.subscript), + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), within_inames=insn.within_inames - ilp_inames | set( - new_swept_inames[i].name for i in p.swept_inames) | ( + new_pack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), depends_on=insn.depends_on, - id=ing(insn.id+"_pack") + id=ing(insn.id+"_pack"), + depends_on_is_final=True )) unpacking.append(Assignment( - expression=lhs_assignee, - assignee=subst_mapper.map_subscript(p.subscript), + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), within_inames=insn.within_inames - ilp_inames | set( - new_swept_inames[i].name for i in p.swept_inames) | ( + new_unpack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), - depends_on=frozenset([insn.id]), - id=ing(insn.id+"_unpack") + id=ing(insn.id+"_unpack"), + depends_on_is_final=True )) # {{{ getting the new swept inames @@ -227,7 +241,9 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): assignees=new_assignees ) ) - new_calls[insn] = packing + unpacking + new_unpacking = [unpack.copy(depends_on=frozenset( + pack.id for pack in packing)) for unpack in unpacking] + new_calls[insn] = packing + new_unpacking if new_calls: 
new_instructions = [] -- GitLab From fb63f2d7d0e543145feb5db9a313548f5b21856a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 17:53:37 -0500 Subject: [PATCH 4/8] Added test and a bit of cleanup. --- loopy/__init__.py | 3 ++ loopy/transform/pack_and_unpack_args.py | 61 ++++++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5850ec0a..2da4815d3 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,7 @@ from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, register_function_lookup) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -231,6 +232,8 @@ __all__ = [ "register_callable_kernel", "register_function_lookup", + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 853719c71..cf0003f8a 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -37,10 +37,20 @@ __doc__ = """ def pack_and_unpack_args_for_call(kernel, call_name, args=None): """ + Returns a a copy of *kernel* with instructions appended to copy the + arguments in *args* to match the alignment expected by the *call_name* in + the kernel. The arguments are copied back to *args* with the appropriate + data layout. + + :arg call_name: An instance of :class:`str` denoting the function call in + the *kernel*. + :arg args: A list of the arguments as instances of :class:`str` which must + be packed and unpacked. If set *None*, it is interpreted that all the + array arguments would be packed anf unpacked. 
""" new_domains = [] new_tmps = kernel.temporary_variables.copy() - new_calls = {} + old_insn_to_new_insns = {} for insn in kernel.instructions: if not isinstance(insn, CallInstruction): @@ -66,6 +76,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # {{{ sanity checks for args + assert isinstance(args, list) + for arg in args: found_sub_array_ref = False for par in parameters + insn.assignees: @@ -81,7 +93,6 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): packing = [] unpacking = [] - new_id_to_parameters = {} from loopy.kernel.data import IlpBaseTag, VectorizeTag import islpy as isl @@ -108,24 +119,31 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper + # dict to store the new assignees and parameters, the mapping pattern + # from id to parameters is identical to InKernelCallable.arg_id_to_dtype id_to_parameters = tuple(enumerate(parameters)) + tuple( (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} for id, p in id_to_parameters: if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: - new_pack_inames = ilp_inames_map.copy() - new_unpack_inames = ilp_inames_map.copy() + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + for iname in p.swept_inames: new_pack_inames[iname] = var(vng(iname.name + "_pack")) new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + + # Updating the domains corresponding to the new inames. 
new_domain_pack = kernel.get_inames_domain(iname.name).copy() new_domain_unpack = kernel.get_inames_domain(iname.name).copy() for i in range(new_domain_pack.n_dim()): old_iname = new_domain_pack.get_dim_name(dim_type, i) - new_domain_pack = new_domain_pack.set_dim_name( - dim_type, i, new_pack_inames[var(old_iname)].name) - new_domain_unpack = new_domain_unpack.set_dim_name( - dim_type, i, new_unpack_inames[var(old_iname)].name) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) new_domains.append(new_domain_pack) new_domains.append(new_domain_unpack) @@ -151,7 +169,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): unpack_subst_mapper = SubstitutionMapper(make_subst_func( new_unpack_inames)) - # {{{ getting the lhs assignee + # {{{ getting the lhs for packing and rhs for unpacking arg_in_caller = kernel.arg_dict[arg] @@ -194,10 +212,11 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_unpack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), depends_on_is_final=True )) - # {{{ getting the new swept inames + # {{{ creating the sweep inames for the new sub array refs updated_swept_inames = [] @@ -225,12 +244,10 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): if packing: subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) new_insn = insn.with_transformed_expressions(subst_mapper) - new_params = [new_id_to_parameters[i] for i, _ in - enumerate(parameters)] - new_assignees = [new_id_to_parameters[-i-1] for i, _ in - enumerate(insn.assignees)] - new_params = [subst_mapper(p) for p in new_params] - new_assignees = tuple(subst_mapper(a) for a in new_assignees) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in 
+ enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) packing.append( new_insn.copy( depends_on=new_insn.depends_on | set( @@ -241,15 +258,15 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): assignees=new_assignees ) ) - new_unpacking = [unpack.copy(depends_on=frozenset( - pack.id for pack in packing)) for unpack in unpacking] - new_calls[insn] = packing + new_unpacking + old_insn_to_new_insns[insn] = packing + unpacking - if new_calls: + if old_insn_to_new_insns: new_instructions = [] for insn in kernel.instructions: - if insn in new_calls: - new_instructions.extend(new_calls[insn]) + if insn in old_insn_to_new_insns: + # Replacing the current instruction with the group of + # instructions including the packing and unpacking instructions + new_instructions.extend(old_insn_to_new_insns[insn]) else: new_instructions.append(insn) kernel = kernel.copy( -- GitLab From 55690f031a0f718c42e26f7fd64109c0b0a3c2f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 17:56:24 -0500 Subject: [PATCH 5/8] Committing the tests.
--- test/test_transform.py | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index b08d674a5..8d42b61ff 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -556,6 +556,52 @@ def test_inline_kernel_2d(ctx_factory): assert np.allclose(out, z) +@pytest.mark.parametrize("inline", [False, True]) +def test_packing_unpacking(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + + callee1 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 2*b[i] + """) + + callee2 = lp.make_kernel( + "{[i, j]: 0<=i<2 and 0 <= j < 3}", + """ + a[i, j] = 3*b[i, j] + """) + + knl = lp.make_kernel( + "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}", + """ + [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j]) + [k]: y2[k] = callee_fn2([k]: x2[k]) + """) + + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, inline=inline) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, inline=inline) + + knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') + knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + + assert np.linalg.norm(2*x1.get()-y1)/np.linalg.norm( + 2*x1.get()) < 1e-15 + assert np.linalg.norm(3*x2.get()-y2)/np.linalg.norm( + 3*x2.get()) < 1e-15 + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 488e47a3896fb4266f9ea395a57f76f2104d54ae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 10:32:28 -0500 Subject: [PATCH 6/8] Fixes minor error in getting the iname domains. 
--- loopy/transform/pack_and_unpack_args.py | 47 ++++++++++++++----------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index cf0003f8a..9ed2766e2 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -56,6 +56,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. continue + if insn.expression.function.name not in kernel.scoped_functions: + continue in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] @@ -70,9 +72,9 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): parameters = insn.expression.parameters if args is None: - args = [par.subscript.aggregate.name for par in parameters if - isinstance(par, SubArrayRef)] + [assignee.subscript.aggregate.name for - assignee in insn.assignees if isinstance(assignee, SubArrayRef)] + args = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] # {{{ sanity checks for args @@ -130,22 +132,24 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname - for iname in p.swept_inames: - new_pack_inames[iname] = var(vng(iname.name + "_pack")) - new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + new_pack_inames = dict((iname, var(vng(iname.name + + "_pack"))) for iname in p.swept_inames) + new_unpack_inames = dict((iname, var(vng(iname.name + + "_unpack"))) for iname in p.swept_inames) # Updating the domains corresponding to the new inames. 
- new_domain_pack = kernel.get_inames_domain(iname.name).copy() - new_domain_unpack = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain_pack.n_dim()): - old_iname = new_domain_pack.get_dim_name(dim_type, i) - if var(old_iname) in new_pack_inames: - new_domain_pack = new_domain_pack.set_dim_name( - dim_type, i, new_pack_inames[var(old_iname)].name) - new_domain_unpack = new_domain_unpack.set_dim_name( - dim_type, i, new_unpack_inames[var(old_iname)].name) - new_domains.append(new_domain_pack) - new_domains.append(new_domain_unpack) + for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) arg = p.subscript.aggregate.name pack_name = vng(arg + "_pack") @@ -153,9 +157,14 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): from loopy.kernel.data import (TemporaryVariable, temp_var_scope) + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + pack_tmp = TemporaryVariable( name=pack_name, - dtype=kernel.arg_dict[arg].dtype, + dtype=arg_in_caller.dtype, dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, shape=in_knl_callable.arg_id_to_descr[id].shape, scope=temp_var_scope.PRIVATE, @@ -171,8 +180,6 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # {{{ getting the lhs for packing and rhs for unpacking - arg_in_caller = kernel.arg_dict[arg] - from loopy.isl_helpers import simplify_via_aff, make_slab flatten_index = 
simplify_via_aff( -- GitLab From e0a167ae65df6e3002f0c74e8d8765acb57c17d0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 12:27:50 -0500 Subject: [PATCH 7/8] Now transfers scoped functions from callee to caller. --- loopy/kernel/function_interface.py | 8 ++++ loopy/preprocess.py | 71 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cb05a65b8..ea20ae9da 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -38,6 +38,14 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) +from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, + CombineMapper) + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + # {{{ argument descriptors diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1b1d9be38..a1964fc7d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2546,6 +2546,54 @@ class KernelInliner(SubstitutionMapper): return super(KernelInliner, self).map_subscript(expr) +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the caller kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are never scoped in the pipeline.
+ """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + def inline_callable_kernels(kernel): from loopy import CallInstruction @@ -2718,6 +2766,29 @@ def inline_callable_kernels(kernel): # }}} + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." 
% type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + return kernel # }}} -- GitLab From b534f0b1952f505e826a3106d2568391e07ae9a3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 12:32:55 -0500 Subject: [PATCH 8/8] adding unpacking instructions as dependencies. --- loopy/transform/pack_and_unpack_args.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 9ed2766e2..2c06a6fa9 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -275,7 +275,19 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # instructions including the packing and unpacking instructions new_instructions.extend(old_insn_to_new_insns[insn]) else: - new_instructions.append(insn) + # for the instructions that depend on the call instruction that + # are to be packed and unpacked, we need to add the complete + # instruction block as a dependency for them. + new_depends_on = insn.depends_on + if insn.depends_on & set( + old_insn.id for old_insn in old_insn_to_new_insns): + # need to add the unpack instructions on dependencies. + for old_insn_id in insn.depends_on & set( + old_insn.id for old_insn in old_insn_to_new_insns): + old_insn = kernel.id_to_insn[old_insn_id] + new_depends_on |= frozenset(i.id for i + in old_insn_to_new_insns[old_insn]) + new_instructions.append(insn.copy(depends_on=new_depends_on)) kernel = kernel.copy( domains=kernel.domains + new_domains, instructions=new_instructions, -- GitLab