diff --git a/loopy/__init__.py b/loopy/__init__.py index a5850ec0affffcf719024d9fa1302ffb577e1574..2da4815d32fa8c4d3acccfddca9ce2d801f55d9b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,7 @@ from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, register_function_lookup) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -231,6 +232,8 @@ __all__ = [ "register_callable_kernel", "register_function_lookup", + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 79c9cb2e1e2bba7f2e45ec59f5cb57e00a70260a..ea20ae9da5400e6ff4b3a4c46460a0b8a7893be2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -38,6 +38,14 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) +from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, + CombineMapper) + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + # {{{ argument descriptors @@ -439,12 +447,12 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. 
def with_packing_for_args(self):
    """
    Return a copy of *self* whose ``arg_id_to_descr`` is rebuilt from the
    arguments of :attr:`subkernel`.

    Every positional argument id reported by
    :func:`get_kw_pos_association` is mapped to an
    :class:`ArrayArgDescriptor` carrying the ``shape`` and ``dim_tags`` of
    the corresponding subkernel argument, with ``mem_scope='Global'``.

    .. note::
        Each subkernel argument is read through ``.shape`` and
        ``.dim_tags``; presumably all of them are array arguments --
        TODO confirm for kernels that also take value arguments.
    """
    # only the position -> keyword direction is needed here
    _, pos_to_kw = get_kw_pos_association(self.subkernel)

    def _descr_for(kw):
        callee_arg = self.subkernel.arg_dict[kw]
        return ArrayArgDescriptor(
                shape=callee_arg.shape,
                dim_tags=callee_arg.dim_tags,
                mem_scope='Global')

    packed_arg_id_to_descr = dict(
            (pos, _descr_for(kw)) for pos, kw in pos_to_kw.items())

    return self.copy(
            subkernel=self.subkernel,
            arg_id_to_descr=packed_arg_id_to_descr)
class CalleeScopedCallsCollector(CombineMapper):
    """
    Walks an expression and gathers the calls whose function name is a key of
    *callee_scoped_functions*, so that they can be transferred to the caller
    kernel before inlining.

    :arg callee_scoped_functions: A mapping from function names to the
        objects they are scoped to (looked up with ``expr.function.name``).

    :returns:
        A :class:`frozenset` of tuples ``(call, scoped_function)``, where
        *call* is the call expression that was encountered and
        *scoped_function* is ``callee_scoped_functions[call.function.name]``.
        Calls whose function name is not in the mapping contribute nothing
        themselves, but their arguments are still traversed.

    .. note::
        According to the original author's note,
        :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are
        never scoped in the pipeline.
    """

    def __init__(self, callee_scoped_functions):
        self.callee_scoped_functions = callee_scoped_functions

    def combine(self, values):
        # union of all child results, starting from the empty frozenset
        merged = frozenset()
        for value in values:
            merged = merged | value
        return merged

    def _collect_call(self, expr, children):
        # Shared implementation of map_call/map_call_with_kwargs: record the
        # call itself if it is scoped in the callee, then recurse into the
        # arguments.
        scoped = self.callee_scoped_functions
        if expr.function.name in scoped:
            found_here = frozenset([(expr, scoped[expr.function.name])])
        else:
            found_here = frozenset()

        return found_here | self.combine(
                self.rec(child) for child in children)

    def map_call(self, expr):
        return self._collect_call(expr, expr.parameters)

    def map_call_with_kwargs(self, expr):
        return self._collect_call(
                expr,
                expr.parameters + tuple(expr.kw_parameters.values()))

    def map_constant(self, expr):
        # leaves never contain calls
        return frozenset()

    map_variable = map_constant
    map_function_symbol = map_constant
    map_tagged_variable = map_constant
    map_type_cast = map_constant
% type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + return kernel # }}} diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 0000000000000000000000000000000000000000..2c06a6fa9fe4d8345e3db8a0ed645b5f4305ca6e --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,302 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Tianjiao Sun" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. 
def pack_and_unpack_args_for_call(kernel, call_name, args=None):
    """
    Returns a copy of *kernel* in which every call to *call_name* operates on
    packed temporaries instead of the original arrays.

    For each :class:`loopy.kernel.instruction.CallInstruction` whose scoped
    callable is named *call_name*:

    * a *packing* assignment is emitted per selected argument that copies the
      sub-array into a fresh private temporary, laid out according to the
      argument descriptor obtained from ``with_packing_for_args()`` of the
      callable;
    * the call is rewritten to take sub-array references into those
      temporaries;
    * an *unpacking* assignment is emitted per selected argument that copies
      the temporary back into the original array (this is done for every
      packed argument, parameters and assignees alike).

    Instructions that depended on the original call additionally depend on
    all instructions that replace it.

    :arg call_name: An instance of :class:`str` denoting the function call in
        the *kernel*.
    :arg args: A list of the arguments as instances of :class:`str` which must
        be packed and unpacked. If set to *None*, all sub-array arguments
        (with at least one swept iname) of each matching call are packed and
        unpacked; the list is determined separately for every matching call.

    :returns: the transformed kernel, or *kernel* itself if nothing was
        packed.
    :raises LoopyError: if an entry of *args* is not passed as a sub-array
        reference to a matching call.
    """
    # Function-scope imports: loopy/__init__.py imports this module, so
    # importing from the top-level package at module load time is avoided.
    import islpy as isl
    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy import Assignment
    from loopy.isl_helpers import simplify_via_aff, make_slab
    from loopy.kernel.data import (IlpBaseTag, VectorizeTag,
            TemporaryVariable, temp_var_scope)
    from loopy.symbolic import SubstitutionMapper

    dim_type = isl.dim_type.set

    new_domains = []
    new_tmps = kernel.temporary_variables.copy()
    old_insn_to_new_insns = {}

    # The generators are requested once. *kernel* is not updated until the
    # very end, so asking for them again for every matching call could hand
    # out the same names twice.
    # NOTE(review): presumably get_var_name_generator() returns a fresh
    # generator per call -- if it is shared/memoized this is a no-op.
    vng = kernel.get_var_name_generator()
    ing = kernel.get_instruction_id_generator()

    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            # packing and unpacking can only be done for CallInstructions.
            continue
        if insn.expression.function.name not in kernel.scoped_functions:
            continue

        in_knl_callable = kernel.scoped_functions[
            insn.expression.function.name]

        if in_knl_callable.name != call_name:
            # not the function we're looking for.
            continue
        in_knl_callable = in_knl_callable.with_packing_for_args()

        parameters = insn.expression.parameters
        call_operands = parameters + insn.assignees

        # The default is computed per instruction and kept in a local, so
        # that the arrays of one call do not leak into the next matching call
        # (the *args* parameter itself is never overwritten).
        if args is None:
            insn_args = [par.subscript.aggregate.name for par in
                    call_operands if isinstance(par, SubArrayRef)
                    and (par.swept_inames)]
        else:
            insn_args = args

        # {{{ sanity checks for args

        assert isinstance(insn_args, list)

        for arg in insn_args:
            if not any(isinstance(par, SubArrayRef)
                    and par.subscript.aggregate.name == arg
                    for par in call_operands):
                raise LoopyError("No match found for packing arg '%s' of call '%s' "
                        "at insn '%s'." % (arg, call_name, insn.id))

        # }}}

        packing = []
        unpacking = []

        # {{{ duplicate the ILP/vectorized inames the call is nested in

        ilp_inames = set(iname for iname in insn.within_inames if isinstance(
            kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)))
        new_ilp_inames = set()
        ilp_inames_map = {}
        for iname in ilp_inames:
            new_iname_name = vng(iname + "_ilp")
            ilp_inames_map[var(iname)] = var(new_iname_name)
            new_ilp_inames.add(new_iname_name)
        for iname in ilp_inames:
            new_domain = kernel.get_inames_domain(iname).copy()
            for i in range(new_domain.n_dim()):
                old_iname = new_domain.get_dim_name(dim_type, i)
                if old_iname in ilp_inames:
                    new_domain = new_domain.set_dim_name(
                        dim_type, i, ilp_inames_map[var(old_iname)].name)
            new_domains.append(new_domain)

        # }}}

        # dict to store the new assignees and parameters, the mapping pattern
        # from id to parameters is identical to InKernelCallable.arg_id_to_dtype
        id_to_parameters = tuple(enumerate(parameters)) + tuple(
                (-i-1, assignee) for i, assignee in enumerate(insn.assignees))
        new_id_to_parameters = {}

        for arg_id, p in id_to_parameters:
            if not (isinstance(p, SubArrayRef)
                    and p.subscript.aggregate.name in insn_args):
                # left untouched
                new_id_to_parameters[arg_id] = p
                continue

            callee_descr = in_knl_callable.arg_id_to_descr[arg_id]

            # packing- and unpacking-specific copies of the swept inames
            # NOTE(review): the original first seeded both maps with
            # ilp_inames_map.copy() and then immediately overwrote them; the
            # effective (overwriting) behaviour is kept -- confirm whether
            # the ILP inames were meant to be merged in as well.
            new_pack_inames = dict((iname, var(vng(iname.name +
                "_pack"))) for iname in p.swept_inames)
            new_unpack_inames = dict((iname, var(vng(iname.name +
                "_unpack"))) for iname in p.swept_inames)

            # Updating the domains corresponding to the new inames.
            for iname in p.swept_inames:
                new_domain_pack = kernel.get_inames_domain(iname.name).copy()
                new_domain_unpack = kernel.get_inames_domain(iname.name).copy()
                for i in range(new_domain_pack.n_dim()):
                    old_iname = new_domain_pack.get_dim_name(dim_type, i)
                    if var(old_iname) in new_pack_inames:
                        new_domain_pack = new_domain_pack.set_dim_name(
                            dim_type, i, new_pack_inames[var(old_iname)].name)
                        new_domain_unpack = new_domain_unpack.set_dim_name(
                            dim_type, i, new_unpack_inames[var(old_iname)].name)
                new_domains.append(new_domain_pack)
                new_domains.append(new_domain_unpack)

            arg = p.subscript.aggregate.name
            pack_name = vng(arg + "_pack")

            if arg in kernel.arg_dict:
                arg_in_caller = kernel.arg_dict[arg]
            else:
                arg_in_caller = kernel.temporary_variables[arg]

            # the temporary takes its layout from the callee's descriptor and
            # its dtype from the caller's array
            new_tmps[pack_name] = TemporaryVariable(
                name=pack_name,
                dtype=arg_in_caller.dtype,
                dim_tags=callee_descr.dim_tags,
                shape=callee_descr.shape,
                scope=temp_var_scope.PRIVATE,
            )

            pack_subst_mapper = SubstitutionMapper(make_subst_func(
                new_pack_inames))
            unpack_subst_mapper = SubstitutionMapper(make_subst_func(
                new_unpack_inames))

            # {{{ getting the lhs for packing and rhs for unpacking

            # linear offset of the accessed entry using the caller's strides ...
            flatten_index = simplify_via_aff(
                    sum(dim_tag.stride*idx for dim_tag, idx in
                    zip(arg_in_caller.dim_tags, p.subscript.index_tuple)))

            # ... decomposed again into indices using the callee's strides.
            new_indices = []
            for dim_tag in callee_descr.dim_tags:
                ind = flatten_index // dim_tag.stride
                flatten_index -= (dim_tag.stride * ind)
                new_indices.append(ind)

            new_indices = tuple(simplify_via_aff(i) for i in new_indices)

            pack_lhs_assignee = pack_subst_mapper(
                    var(pack_name).index(new_indices))
            unpack_rhs = unpack_subst_mapper(
                    var(pack_name).index(new_indices))

            # }}}

            outer_inames = (insn.within_inames - ilp_inames) | new_ilp_inames

            packing.append(Assignment(
                assignee=pack_lhs_assignee,
                expression=pack_subst_mapper.map_subscript(p.subscript),
                within_inames=outer_inames | set(
                    new_pack_inames[i].name for i in p.swept_inames),
                depends_on=insn.depends_on,
                id=ing(insn.id+"_pack"),
                depends_on_is_final=True
            ))

            unpacking.append(Assignment(
                expression=unpack_rhs,
                assignee=unpack_subst_mapper.map_subscript(p.subscript),
                within_inames=outer_inames | set(
                    new_unpack_inames[i].name for i in p.swept_inames),
                id=ing(insn.id+"_unpack"),
                depends_on=frozenset([insn.id]),
                depends_on_is_final=True
            ))

            # {{{ creating the sweep inames for the new sub array refs

            # one fresh iname per axis of the packed temporary
            updated_swept_inames = [var(vng("i_packsweep_"+arg))
                    for _ in callee_descr.shape]

            ctx = kernel.isl_context
            space = isl.Space.create_from_names(ctx,
                    set=[iname.name for iname in updated_swept_inames])
            iname_set = isl.BasicSet.universe(space)
            for iname, axis_length in zip(updated_swept_inames,
                    callee_descr.shape):
                iname_set = iname_set & make_slab(space, iname.name, 0,
                        axis_length)
            new_domains.append(iname_set)

            # }}}

            new_id_to_parameters[arg_id] = SubArrayRef(
                    tuple(updated_swept_inames),
                    (var(pack_name).index(tuple(updated_swept_inames))))

        if packing:
            subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map))
            new_insn = insn.with_transformed_expressions(subst_mapper)
            new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in
                    enumerate(parameters))
            new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1])
                    for i, _ in enumerate(insn.assignees))
            # the rewritten call runs after all of its packing instructions
            packing.append(
                new_insn.copy(
                    depends_on=new_insn.depends_on | set(
                        pack.id for pack in packing),
                    within_inames=(new_insn.within_inames - ilp_inames) | (
                        new_ilp_inames),
                    expression=new_insn.expression.function(*new_params),
                    assignees=new_assignees
                )
            )
            old_insn_to_new_insns[insn] = packing + unpacking

    if old_insn_to_new_insns:
        replaced_insn_ids = frozenset(
                old_insn.id for old_insn in old_insn_to_new_insns)

        new_instructions = []
        for insn in kernel.instructions:
            if insn in old_insn_to_new_insns:
                # Replacing the current instruction with the group of
                # instructions including the packing and unpacking instructions
                new_instructions.extend(old_insn_to_new_insns[insn])
            else:
                # for the instructions that depend on the call instruction that
                # are to be packed and unpacked, we need to add the complete
                # instruction block as a dependency for them.
                new_depends_on = insn.depends_on
                for old_insn_id in insn.depends_on & replaced_insn_ids:
                    old_insn = kernel.id_to_insn[old_insn_id]
                    new_depends_on |= frozenset(i.id for i
                            in old_insn_to_new_insns[old_insn])
                new_instructions.append(insn.copy(depends_on=new_depends_on))
        kernel = kernel.copy(
            domains=kernel.domains + new_domains,
            instructions=new_instructions,
            temporary_variables=new_tmps
        )

    return kernel
@pytest.mark.parametrize("inline", [False, True])
def test_packing_unpacking(ctx_factory, inline):
    """
    Check that packing/unpacking lets a caller hand a (3, 2) array to a callee
    written for 6 entries, and a (6,) array to a callee written for (2, 3),
    both with and without inlining.
    """
    queue = cl.CommandQueue(ctx_factory())

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)

    # callee operating on a flat array of 6 entries
    callee1 = lp.make_kernel(
            "{[i]: 0<=i<6}",
            """
            a[i] = 2*b[i]
            """)

    # callee operating on a 2x3 array
    callee2 = lp.make_kernel(
            "{[i, j]: 0<=i<2 and 0 <= j < 3}",
            """
            a[i, j] = 3*b[i, j]
            """)

    knl = lp.make_kernel(
            "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
            """
            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
            [k]: y2[k] = callee_fn2([k]: x2[k])
            """)

    callees = (('callee_fn1', callee1), ('callee_fn2', callee2))

    for fn_name, callee in callees:
        knl = lp.register_callable_kernel(knl, fn_name, callee, inline=inline)

    for fn_name, _ in callees:
        knl = lp.pack_and_unpack_args_for_call(knl, fn_name)

    for option in ("write_cl", "return_dict"):
        knl = lp.set_options(knl, option)

    _, out_dict = knl(queue, x1=x1, x2=x2)

    def relative_error(expected, computed):
        return np.linalg.norm(expected-computed)/np.linalg.norm(expected)

    y1 = out_dict['y1'].get()
    y2 = out_dict['y2'].get()

    assert relative_error(2*x1.get(), y1) < 1e-15
    assert relative_error(3*x2.get(), y2) < 1e-15
queue = cl.CommandQueue(ctx)