From 48b887bd4b674ffc138fd63542e2cd70cc37c1c9 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 11 Apr 2018 18:06:45 +0100 Subject: [PATCH 01/18] kernel inlining prototype --- loopy/transform/register_knl.py | 208 ++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 loopy/transform/register_knl.py diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py new file mode 100644 index 000000000..9997ade35 --- /dev/null +++ b/loopy/transform/register_knl.py @@ -0,0 +1,208 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import six + +from loopy.kernel import LoopKernel +from loopy.kernel.creation import FunctionScoper +from loopy.diagnostic import LoopyError +from loopy.kernel.function_interface import CallableKernel + +from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, + CInstruction, _DataObliviousInstruction) + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable_kernel +""" + + +# {{{ main entrypoint + +def register_callable_kernel(parent, function_name, child): + """ + The purpose of this transformation is so that one can inoke the child + kernel in the parent kernel. + + :arg parent + + This is the "main" kernel which will mostly remain unaltered and one + can interpret it as stitching up the child kernel in the parent kernel. + + :arg function_name + + The name of the function call with which the child kernel must be + associated in the parent kernel + + :arg child + + This is like a function in every other language and this might be + invoked in one of the instructions of the parent kernel. + + ..note:: + + One should note that the kernels would go under stringent compatibilty + tests so that both of them can be confirmed to be made for each other. + """ + + # {{{ sanity checks + + assert isinstance(parent, LoopKernel) + assert isinstance(child, LoopKernel) + assert isinstance(function_name, str) + + # }}} + + # scoping the function + function_scoper = FunctionScoper(set([function_name])) + new_insns = [] + + for insn in parent.instructions: + if isinstance(insn, CallInstruction): + new_insn = insn.copy(expression=function_scoper(insn.expression)) + new_insns.append(new_insn) + elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, + CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("scope_functions not implemented for %s" % + type(insn)) + + # adding the scoped function to the scoped function dict of the parent + # kernel. 
+ + scoped_functions = parent.scoped_functions.copy() + + if function_name in scoped_functions: + raise LoopyError("%s is already being used as a funciton name -- maybe" + "use a different name for registering the subkernel") + + scoped_functions[function_name] = CallableKernel(name=function_name, + subkernel=child) + + # returning the parent kernel with the new scoped function dictionary + return parent.copy(scoped_functions=scoped_functions, + instructions=new_insns) + +# }}} + + + +def inline_kernel(kernel, function, arg_map=None): + + child = kernel.scoped_functions[function].subkernel + vng = kernel.get_var_name_generator() + + # duplicate and rename inames + + import islpy as isl + + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng(iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + n_dim = new_domain.n_dim() + for i in range(n_dim): + iname = new_domain.get_dim_name(dim_type, i) + new_iname = child_iname_map[iname] + new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domains.append(new_domain) + + kernel = kernel.copy(domains= kernel.domains + new_domains) + + # rename temporaries + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng(name) + child_temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # rename arguments + + calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] + assert len(calls) == 1 + call, = calls + + parameters = call.assignees + call.expression.parameters + + child_arg_map = {} # arg -> SubArrayRef + for inside, outside in six.iteritems(arg_map): + child_arg_map[inside], = [p for p in parameters if p.subscript.aggregate.name == outside] + + + # Rewrite instructions + + 
import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + class KernelInliner(SubstitutionMapper): + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + indices = [self.subst_func(i) for i in expr.index_tuple] + sar = child_arg_map[expr.aggregate.name] # SubArrayRef + # insert non-sweeping indices from outter kernel + for i, index in enumerate(sar.subscript.index_tuple): + if index not in sar.swept_inames: + indices.insert(i, index) + return aggregate.index(tuple(indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + inner_insns = [] + for insn in child.instructions: + new_insn = insn.with_transformed_expressions(subst_mapper) + within_inames = [child_iname_map[iname] for iname in insn.within_inames] + within_inames.extend(call.within_inames) + new_insn = new_insn.copy(within_inames=frozenset(within_inames)) + inner_insns.append(new_insn) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + return kernel + + +# vim: foldmethod=marker -- GitLab From 073550effb8c2f2df5608b45220716d6b61cad82 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 11:08:06 +0100 Subject: [PATCH 02/18] add test --- loopy/__init__.py | 3 +++ loopy/transform/register_knl.py | 9 ++++++-- test/test_transform.py | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py 
b/loopy/__init__.py index a5850ec0a..c695f7df5 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,8 @@ from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, register_function_lookup) +from loopy.transform.register_knl import (register_callable_kernel, + inline_kernel) # }}} @@ -230,6 +232,7 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", + "inline_kernel", # }}} diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 9997ade35..faa42b743 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -37,6 +37,8 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_callable_kernel + +.. autofunction:: inline_kernel """ @@ -139,6 +141,7 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(domains= kernel.domains + new_domains) # rename temporaries + child_temp_map = {} new_temps = kernel.temporary_variables.copy() for name, temp in six.iteritems(child.temporary_variables): @@ -149,7 +152,7 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(temporary_variables=new_temps) # rename arguments - + # TODO: put this in a loop calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] assert len(calls) == 1 call, = calls @@ -174,6 +177,7 @@ def inline_kernel(kernel, function, arg_map=None): indices = [self.subst_func(i) for i in expr.index_tuple] sar = child_arg_map[expr.aggregate.name] # SubArrayRef # insert non-sweeping indices from outter kernel + # TODO: sweeping indices might flip: [i,j]: A[j, i] for i, index in enumerate(sar.subscript.index_tuple): if index not in sar.swept_inames: indices.insert(i, index) @@ -191,7 +195,8 @@ def inline_kernel(kernel, function, arg_map=None): new_insn = 
insn.with_transformed_expressions(subst_mapper) within_inames = [child_iname_map[iname] for iname in insn.within_inames] within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames)) + new_insn = new_insn.copy(within_inames=frozenset(within_inames), priority=call.priority) + # TODO: depends on? inner_insns.append(new_insn) new_insns = [] diff --git a/test/test_transform.py b/test/test_transform.py index 76ff4520a..92a6c5cc3 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -424,6 +424,44 @@ def test_multi_arg_array_call(ctx_factory): assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) +def test_inlining_kernel(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 16 + + x = np.random.rand(n) + y = np.random.rand(n) + + knl1 = lp.make_kernel( + "{[i]: 0 <= i < 16}", + """ + for i + c[i] = a[i] + 2*b[i] + end + """ + ) + knl2 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[j, i] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." 
+ ] + ) + + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl3 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + + evt, (out, ) = knl3(queue, x=x, y=y) + z = np.tile(x + y*2, [16, 1]) + + assert np.allclose(out, z) + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 0d223307282c97413e7134fefd1031b0c32a37ed Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 11:26:20 +0100 Subject: [PATCH 03/18] flake8 --- loopy/transform/register_knl.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index faa42b743..2adc2648e 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -112,8 +112,7 @@ def register_callable_kernel(parent, function_name, child): # }}} - -def inline_kernel(kernel, function, arg_map=None): +def inline_kernel(kernel, function, arg_map): child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() @@ -138,7 +137,7 @@ def inline_kernel(kernel, function, arg_map=None): new_domain = new_domain.set_dim_name(dim_type, i, new_iname) new_domains.append(new_domain) - kernel = kernel.copy(domains= kernel.domains + new_domains) + kernel = kernel.copy(domains=kernel.domains + new_domains) # rename temporaries @@ -152,8 +151,11 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(temporary_variables=new_temps) # rename arguments + # TODO: automatically figuring out arg map # TODO: put this in a loop - calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] + calls = [insn for insn in kernel.instructions + if isinstance(insn, CallInstruction) + and insn.expression.function.name == function] assert len(calls) == 1 call, = calls @@ -161,8 +163,8 @@ def inline_kernel(kernel, function, arg_map=None): child_arg_map = {} 
# arg -> SubArrayRef for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters if p.subscript.aggregate.name == outside] - + child_arg_map[inside], = [p for p in parameters + if p.subscript.aggregate.name == outside] # Rewrite instructions @@ -185,17 +187,21 @@ def inline_kernel(kernel, function, arg_map=None): else: return super(KernelInliner, self).map_subscript(expr) - var_map = dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] for insn in child.instructions: new_insn = insn.with_transformed_expressions(subst_mapper) within_inames = [child_iname_map[iname] for iname in insn.within_inames] within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames), priority=call.priority) + new_insn = new_insn.copy(within_inames=frozenset(within_inames), + priority=call.priority) # TODO: depends on? 
inner_insns.append(new_insn) -- GitLab From 762e7b2d8ef2c3967e3d384be755609ebbd53739 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 13:12:33 +0100 Subject: [PATCH 04/18] 2d tests --- loopy/transform/register_knl.py | 205 +++++++++++++++++--------------- test/test_transform.py | 85 ++++++++++++- 2 files changed, 193 insertions(+), 97 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 2adc2648e..8c0305154 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -114,105 +114,124 @@ def register_callable_kernel(parent, function_name, child): def inline_kernel(kernel, function, arg_map): + if function not in kernel.scoped_functions: + raise LoopyError("function: {0} does not exist".format(function)) + child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() - # duplicate and rename inames - - import islpy as isl - - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng(iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - n_dim = new_domain.n_dim() - for i in range(n_dim): - iname = new_domain.get_dim_name(dim_type, i) - new_iname = child_iname_map[iname] - new_domain = new_domain.set_dim_name(dim_type, i, new_iname) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng(name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # rename arguments - # TODO: automatically figuring out arg map - # TODO: put this in a loop - calls = [insn for insn in kernel.instructions - if isinstance(insn, CallInstruction) - and insn.expression.function.name == 
function] - assert len(calls) == 1 - call, = calls - - parameters = call.assignees + call.expression.parameters - - child_arg_map = {} # arg -> SubArrayRef - for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters - if p.subscript.aggregate.name == outside] - - # Rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - - class KernelInliner(SubstitutionMapper): - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - indices = [self.subst_func(i) for i in expr.index_tuple] - sar = child_arg_map[expr.aggregate.name] # SubArrayRef - # insert non-sweeping indices from outter kernel - # TODO: sweeping indices might flip: [i,j]: A[j, i] - for i, index in enumerate(sar.subscript.index_tuple): - if index not in sar.swept_inames: - indices.insert(i, index) - return aggregate.index(tuple(indices)) + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + if call.expression.function.name != function: + continue + + # {{{ duplicate and rename inames + + import islpy as isl + + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng(iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + n_dim = new_domain.n_dim() + for i in range(n_dim): + iname = new_domain.get_dim_name(dim_type, i) + new_iname = child_iname_map[iname] + new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng(name) + child_temp_map[name] = new_name + new_temps[new_name] = 
temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ arguments + # TODO: automatically figuring out arg map + parameters = call.assignees + call.expression.parameters + + child_arg_map = {} # arg -> SubArrayRef + for inside, outside in six.iteritems(arg_map): + child_arg_map[inside], = [p for p in parameters + if p.subscript.aggregate.name == outside] + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + class KernelInliner(SubstitutionMapper): + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + sar = child_arg_map[expr.aggregate.name] # SubArrayRef + indices = [] + for index in sar.subscript.index_tuple: + if index in sar.swept_inames: + # map sweeping index to inner kernel index + pos = sar.swept_inames.index(index) + new_index = self.subst_func(expr.index_tuple[pos]) + else: + # non-sweepting index from outter kernel + new_index = index + indices.append(new_index) + return aggregate.index(tuple(indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + inner_insns = [] + for insn in child.instructions: + new_insn = insn.with_transformed_expressions(subst_mapper) + within_inames = [child_iname_map[iname] for iname in insn.within_inames] + within_inames.extend(call.within_inames) + id = vng(new_insn.id) + new_insn = new_insn.copy( + id=id, + within_inames=frozenset(within_inames), + priority=call.priority, + depends_on=new_insn.depends_on | call.depends_on + ) + 
# TODO: depends on is too conservative? + inner_insns.append(new_insn) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - inner_insns = [] - for insn in child.instructions: - new_insn = insn.with_transformed_expressions(subst_mapper) - within_inames = [child_iname_map[iname] for iname in insn.within_inames] - within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames), - priority=call.priority) - # TODO: depends on? - inner_insns.append(new_insn) + new_insns.append(insn) - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) + kernel = kernel.copy(instructions=new_insns) + + # }}} - kernel = kernel.copy(instructions=new_insns) return kernel diff --git a/test/test_transform.py b/test/test_transform.py index 92a6c5cc3..09b497348 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -424,7 +424,7 @@ def test_multi_arg_array_call(ctx_factory): assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) -def test_inlining_kernel(ctx_factory): +def test_inline_kernel(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) n = 16 @@ -440,6 +440,7 @@ def test_inlining_kernel(ctx_factory): end """ ) + knl2 = lp.make_kernel( "{[i, j]: 0 <= i, j < 16}", """ @@ -453,14 +454,90 @@ def test_inlining_kernel(ctx_factory): ] ) + knl3 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[i, j] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + 
lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." + ] + ) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl3 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1]) + assert np.allclose(out, z) + + knl3 = lp.register_callable_kernel(knl3, 'func', knl1) + knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl3(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1]).transpose() + assert np.allclose(out, z) + + +def test_inline_kernel_2d(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 16 + + x = np.random.rand(n ** 2).reshape((n, n)) + y = np.random.rand(n ** 2).reshape((n, n)) + + knl1 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for i, j + c[i, j] = a[i, j] + 2*b[i, j] + end + """, + kernel_data=[ + lp.GlobalArg("a", np.float64, (16, 16)), + lp.GlobalArg("b", np.float64, (16, 16)), "..." + ] + ) - evt, (out, ) = knl3(queue, x=x, y=y) - z = np.tile(x + y*2, [16, 1]) + knl2 = lp.make_kernel( + "{[i, j, k]: 0 <= i, j, k < 16}", + """ + for k + [i, j]: z[k, i, j] = func([i, j]: x[i, j], [i, j]: y[i, j]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16, 16)), + lp.GlobalArg("y", np.float64, (16, 16)), "..." + ] + ) + knl3 = lp.make_kernel( + "{[i, j, k]: 0 <= i, j, k < 16}", + """ + for k + [i, j]: z[k, j, i] = func([i, j]: x[i, j], [i, j]: y[i, j]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16, 16)), + lp.GlobalArg("y", np.float64, (16, 16)), "..." 
+ ] + ) + + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1, 1]) assert np.allclose(out, z) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1) + knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl3(queue, x=x, y=y) + z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) + assert np.allclose(out, z) def test_rename_argument(ctx_factory): ctx = ctx_factory() -- GitLab From 0e805a1bb4efee6da2b4c8cb97937e9fba01ca79 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 19:18:15 +0100 Subject: [PATCH 05/18] better subscript mapping --- loopy/transform/register_knl.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 8c0305154..a8d52a3e6 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -180,21 +180,21 @@ def inline_kernel(kernel, function, arg_map): from loopy.symbolic import SubstitutionMapper class KernelInliner(SubstitutionMapper): + """ + Mapper to replace variables (indices, temporaries, arguments) in + the inner kernel. 
+ """ def map_subscript(self, expr): if expr.aggregate.name in child_arg_map: aggregate = self.subst_func(expr.aggregate) sar = child_arg_map[expr.aggregate.name] # SubArrayRef - indices = [] - for index in sar.subscript.index_tuple: - if index in sar.swept_inames: - # map sweeping index to inner kernel index - pos = sar.swept_inames.index(index) - new_index = self.subst_func(expr.index_tuple[pos]) - else: - # non-sweepting index from outter kernel - new_index = index - indices.append(new_index) - return aggregate.index(tuple(indices)) + # first, map inner inames to outer inames + outer_indices = [self.subst_func(i) for i in expr.index_tuple] + # then, map index expressions in SubArrayRef to outer inames + index_map = dict(zip(sar.swept_inames, outer_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) -- GitLab From bf70d0a3935ff719bf5e3a75cd9c0c714fb3ad0b Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 14:38:56 +0100 Subject: [PATCH 06/18] add test for affine sweeping index --- test/test_transform.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index 09b497348..7f6eed495 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -467,6 +467,19 @@ def test_inline_kernel(ctx_factory): ] ) + knl4 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[j, 15-i] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." 
+ ] + ) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) evt, (out, ) = knl2(queue, x=x, y=y) @@ -479,6 +492,11 @@ def test_inline_kernel(ctx_factory): z = np.tile(x + y * 2, [16, 1]).transpose() assert np.allclose(out, z) + knl4 = lp.register_callable_kernel(knl4, 'func', knl1) + knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl4(queue, x=x, y=y) + z = np.tile(np.flip(x + y * 2, 0), [16, 1]) + assert np.allclose(out, z) def test_inline_kernel_2d(ctx_factory): ctx = ctx_factory() -- GitLab From a74a880ecd0a9d1ebc8aa1d7483c3e49c8f3b272 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 15:20:11 +0100 Subject: [PATCH 07/18] automatic matching of args --- loopy/transform/register_knl.py | 58 ++++++++++++++++++++++++++------- test/test_transform.py | 9 +++-- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index a8d52a3e6..dd3a477bf 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -112,11 +112,13 @@ def register_callable_kernel(parent, function_name, child): # }}} -def inline_kernel(kernel, function, arg_map): +def inline_kernel(knl, function, arg_map=None): - if function not in kernel.scoped_functions: + if function not in knl.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) + kernel = knl.copy() + child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() @@ -163,14 +165,48 @@ def inline_kernel(kernel, function, arg_map): # }}} - # {{{ arguments - # TODO: automatically figuring out arg map - parameters = call.assignees + call.expression.parameters + # {{{ match kernel arguments + + child_arg_map = {} # child arg name -> SubArrayRef + + # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to + # the written arguments, and in1, in2 to the readonly 
arguments in + # child kernel, according the order they appear in child.args + writes = child.get_written_variables() + reads = [arg.name for arg in child.args if arg.name not in writes] + writes = [arg.name for arg in child.args if arg.name in writes] + + if arg_map: + for inside, outside in six.iteritems(arg_map): + if inside not in child.arg_dict: + raise LoopyError("arg named '{0}' not in the child " + "kernel".format(inside)) + if inside in writes: + sar = [sar for sar in call.assignees + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + sar = [sar for sar in call.expression.parameters + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + if len(call.assignees) != len(writes): + raise LoopyError("expect {0} output variable(s), got {1}".format( + len(writes), len(call.assignees))) + if len(call.expression.parameters) != len(reads): + raise LoopyError("expect {0} input variable(s), got {1}".format( + len(reads), len(call.expression.parameters))) + for arg_name, sar in zip(writes, call.assignees): + child_arg_map[arg_name] = sar + for arg_name, sar in zip(reads, call.expression.parameters): + child_arg_map[arg_name] = sar - child_arg_map = {} # arg -> SubArrayRef - for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters - if p.subscript.aggregate.name == outside] # }}} # {{{ rewrite instructions @@ -202,8 +238,8 @@ def inline_kernel(kernel, function, arg_map): for k, v in six.iteritems(child_iname_map)) var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(arg_map))) + var_map.update(dict((p.Variable(k), 
p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(child_arg_map))) subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] diff --git a/test/test_transform.py b/test/test_transform.py index 7f6eed495..c5180ead1 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -481,9 +481,14 @@ def test_inline_kernel(ctx_factory): ) knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) - evt, (out, ) = knl2(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1]) + + knl2_arg_map = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2_arg_map(queue, x=x, y=y) + assert np.allclose(out, z) + + knl2_no_arg_map = lp.inline_kernel(knl2, "func") + evt, (out, ) = knl2_no_arg_map(queue, x=x, y=y) assert np.allclose(out, z) knl3 = lp.register_callable_kernel(knl3, 'func', knl1) -- GitLab From 8917de2569a2fe0c8756de27540c8da752f1415f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 19:01:17 +0100 Subject: [PATCH 08/18] add inames to non-sweeping indices --- loopy/transform/register_knl.py | 35 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index dd3a477bf..f08269964 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -118,9 +118,8 @@ def inline_kernel(knl, function, arg_map=None): raise LoopyError("function: {0} does not exist".format(function)) kernel = knl.copy() - child = kernel.scoped_functions[function].subkernel - vng = kernel.get_var_name_generator() + for call in kernel.instructions: if not isinstance(call, CallInstruction): @@ -132,6 +131,8 @@ def inline_kernel(knl, function, arg_map=None): import islpy as isl + vng = kernel.get_var_name_generator() + dim_type = isl.dim_type.set child_iname_map = {} @@ -243,24 +244,38 @@ def inline_kernel(knl, function, arg_map=None): 
subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] + + ing = kernel.get_instruction_id_generator() + insn_id = {} for insn in child.instructions: - new_insn = insn.with_transformed_expressions(subst_mapper) - within_inames = [child_iname_map[iname] for iname in insn.within_inames] - within_inames.extend(call.within_inames) - id = vng(new_insn.id) - new_insn = new_insn.copy( - id=id, + insn_id[insn.id] = ing(insn.id) + + for _insn in child.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = insn.dependency_names() & kernel.all_inames() + within_inames = within_inames | call.within_inames + depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) + depends_on = depends_on | call.depends_on + insn = insn.copy( + id=insn_id[insn.id], within_inames=frozenset(within_inames), priority=call.priority, - depends_on=new_insn.depends_on | call.depends_on + depends_on=depends_on ) # TODO: depends on is too conservative? - inner_insns.append(new_insn) + inner_insns.append(insn) + from loopy.kernel.instruction import NoOpInstruction new_insns = [] for insn in kernel.instructions: if insn == call: new_insns.extend(inner_insns) + noop = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=call.depends_on + ) + new_insns.append(noop) else: new_insns.append(insn) -- GitLab From 32a0b13045d823c0fb06549436a1ee8e2f37512b Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 20 Apr 2018 18:19:55 +0100 Subject: [PATCH 09/18] still some issues with mapping subscripts --- loopy/transform/register_knl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index f08269964..a2c753440 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -226,7 +226,7 @@ def inline_kernel(knl, function, arg_map=None): aggregate = self.subst_func(expr.aggregate) sar = child_arg_map[expr.aggregate.name] # 
SubArrayRef # first, map inner inames to outer inames - outer_indices = [self.subst_func(i) for i in expr.index_tuple] + outer_indices = self.map_tuple(expr.index_tuple) # then, map index expressions in SubArrayRef to outer inames index_map = dict(zip(sar.swept_inames, outer_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) @@ -250,19 +250,20 @@ def inline_kernel(knl, function, arg_map=None): for insn in child.instructions: insn_id[insn.id] = ing(insn.id) + new_inames = [] + for _insn in child.instructions: insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = insn.dependency_names() & kernel.all_inames() + within_inames = frozenset(child_iname_map[iname] for iname in insn.within_inames) within_inames = within_inames | call.within_inames depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) depends_on = depends_on | call.depends_on insn = insn.copy( id=insn_id[insn.id], - within_inames=frozenset(within_inames), + within_inames=within_inames, priority=call.priority, depends_on=depends_on ) - # TODO: depends on is too conservative? inner_insns.append(insn) from loopy.kernel.instruction import NoOpInstruction -- GitLab From 1b6becb7150bdfa30d5880322251d22a2b964fa6 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Mon, 23 Apr 2018 18:37:16 +0100 Subject: [PATCH 10/18] seems to work now --- loopy/transform/register_knl.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index a2c753440..bb43dd19d 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -25,6 +25,8 @@ THE SOFTWARE. 
import six +import numpy as np + from loopy.kernel import LoopKernel from loopy.kernel.creation import FunctionScoper from loopy.diagnostic import LoopyError @@ -137,7 +139,7 @@ def inline_kernel(knl, function, arg_map=None): child_iname_map = {} for iname in child.all_inames(): - child_iname_map[iname] = vng(iname) + child_iname_map[iname] = vng("child_"+iname) new_domains = [] for domain in child.domains: @@ -158,7 +160,7 @@ def inline_kernel(knl, function, arg_map=None): child_temp_map = {} new_temps = kernel.temporary_variables.copy() for name, temp in six.iteritems(child.temporary_variables): - new_name = vng(name) + new_name = vng("child_"+name) child_temp_map[name] = new_name new_temps[new_name] = temp.copy(name=new_name) @@ -215,6 +217,8 @@ def inline_kernel(knl, function, arg_map=None): import pymbolic.primitives as p from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper + from loopy.isl_helpers import simplify_via_aff + from functools import reduce class KernelInliner(SubstitutionMapper): """ @@ -224,13 +228,33 @@ def inline_kernel(knl, function, arg_map=None): def map_subscript(self, expr): if expr.aggregate.name in child_arg_map: aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef + sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) + arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) + # first, map inner inames to outer inames outer_indices = self.map_tuple(expr.index_tuple) - # then, map index expressions in SubArrayRef to outer inames - index_map = dict(zip(sar.swept_inames, outer_indices)) + + # next, reshape to match dimension of outer arrays + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) for i in range(len(arg_in.shape))] + make_sum = lambda x, y: p.Sum((x, y)) # TODO: can be more functional? 
+ flatten_index = reduce(make_sum, map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = simplify_via_aff(flatten_index) + + from loopy.symbolic import pw_aff_to_expr + bounds = [kernel.get_iname_bounds(i.name) for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index = flatten_index - s * ind + new_indices.append(ind) + + # lastly, map sweeping indices to indices in Subscripts in SubArrayRef + index_map = dict(zip(sar.swept_inames, new_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) @@ -248,7 +272,7 @@ def inline_kernel(knl, function, arg_map=None): ing = kernel.get_instruction_id_generator() insn_id = {} for insn in child.instructions: - insn_id[insn.id] = ing(insn.id) + insn_id[insn.id] = ing("child_"+insn.id) new_inames = [] @@ -274,7 +298,7 @@ def inline_kernel(knl, function, arg_map=None): noop = NoOpInstruction( id=call.id, within_inames=call.within_inames, - depends_on=call.depends_on + depends_on=call.depends_on | set(insn.id for insn in inner_insns) ) new_insns.append(noop) else: -- GitLab From 3877b398df2581024fe5feac044ba32ff4243095 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 24 Apr 2018 14:05:34 +0100 Subject: [PATCH 11/18] better dependency reasoning and some cleaning up --- loopy/transform/register_knl.py | 94 +++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index bb43dd19d..6d40942c9 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -114,15 +114,13 @@ def 
register_callable_kernel(parent, function_name, child): # }}} -def inline_kernel(knl, function, arg_map=None): +def inline_kernel(kernel, function, arg_map=None): - if function not in knl.scoped_functions: + if function not in kernel.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) - kernel = knl.copy() child = kernel.scoped_functions[function].subkernel - for call in kernel.instructions: if not isinstance(call, CallInstruction): continue @@ -134,7 +132,6 @@ def inline_kernel(knl, function, arg_map=None): import islpy as isl vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set child_iname_map = {} @@ -144,11 +141,10 @@ def inline_kernel(knl, function, arg_map=None): new_domains = [] for domain in child.domains: new_domain = domain.copy() - n_dim = new_domain.n_dim() - for i in range(n_dim): + for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) - new_iname = child_iname_map[iname] - new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domain = new_domain.set_dim_name( + dim_type, i, child_iname_map[iname]) new_domains.append(new_domain) kernel = kernel.copy(domains=kernel.domains + new_domains) @@ -231,26 +227,43 @@ def inline_kernel(knl, function, arg_map=None): sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - # first, map inner inames to outer inames + # Firstly, map inner inames to outer inames. outer_indices = self.map_tuple(expr.index_tuple) - # next, reshape to match dimension of outer arrays - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) for i in range(len(arg_in.shape))] - make_sum = lambda x, y: p.Sum((x, y)) # TODO: can be more functional? - flatten_index = reduce(make_sum, map(p.Product, zip(outer_indices, inner_sizes))) + # Next, reshape to match dimension of outer arrays. + # We can have e.g. 
A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg_in.shape): + raise LoopyError( + "Argument: {0} in child kernel: {1} does not have " + "constant shape.".format(arg_in, child.name)) + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) + for i in range(len(arg_in.shape))] + flatten_index = reduce( + lambda x, y: p.Sum((x, y)), + map(p.Product, zip(outer_indices, inner_sizes))) flatten_index = simplify_via_aff(flatten_index) from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) for i in sar.swept_inames] + bounds = [kernel.get_iname_bounds(i.name) + for i in sar.swept_inames] sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in parent kernel: {1} does not have " + "swept inames with constant size.".format( + sar, kernel.name)) + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + new_indices = [] for s in sizes: ind = flatten_index // s flatten_index = flatten_index - s * ind new_indices.append(ind) - # lastly, map sweeping indices to indices in Subscripts in SubArrayRef + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] index_map = dict(zip(sar.swept_inames, new_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) @@ -267,40 +280,63 @@ def inline_kernel(knl, function, arg_map=None): for k, v in six.iteritems(child_arg_map))) subst_mapper = KernelInliner(make_subst_func(var_map)) - inner_insns = [] - ing = kernel.get_instruction_id_generator() insn_id = {} for insn in child.instructions: insn_id[insn.id] = ing("child_"+insn.id) - new_inames = [] + # {{{ root and leave instructions in child kernel + + dep_map = child.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in 
six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of child kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing("child_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] for _insn in child.instructions: insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(child_iname_map[iname] for iname in insn.within_inames) + within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) within_inames = within_inames | call.within_inames - depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) - depends_on = depends_on | call.depends_on + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) insn = insn.copy( id=insn_id[insn.id], within_inames=within_inames, + # TODO: probaby need to keep priority in child kernel priority=call.priority, depends_on=depends_on ) inner_insns.append(insn) - from loopy.kernel.instruction import NoOpInstruction + inner_insns.append(noop_end) + new_insns = [] for insn in kernel.instructions: if insn == call: new_insns.extend(inner_insns) - noop = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=call.depends_on | set(insn.id for insn in inner_insns) - ) - new_insns.append(noop) else: new_insns.append(insn) -- GitLab From e2a348275eeaa0de80031a08447230ecd6d56461 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 25 Apr 2018 12:24:35 +0100 Subject: [PATCH 12/18] rebase to kernel_callables_v3 --- loopy/__init__.py | 4 +- loopy/transform/register_callable.py | 239 
+++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 3 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index c695f7df5..1c7951dc0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,9 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup) -from loopy.transform.register_knl import (register_callable_kernel, - inline_kernel) + register_function_lookup, inline_kernel) # }}} diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 4df55905c..4ce3c72cc 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -22,6 +22,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six + +import numpy as np + from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord @@ -137,4 +141,239 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} + +def inline_kernel(kernel, function, arg_map=None): + + from loopy import CallInstruction, LoopyError + + if function not in kernel.scoped_functions: + raise LoopyError("function: {0} does not exist".format(function)) + + child = kernel.scoped_functions[function].subkernel + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + if call.expression.function.name != function: + continue + + # {{{ duplicate and rename inames + + import islpy as isl + + vng = kernel.get_var_name_generator() + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng("child_"+iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + for i in 
range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, child_iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng("child_"+name) + child_temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + child_arg_map = {} # child arg name -> SubArrayRef + + # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to + # the written arguments, and in1, in2 to the readonly arguments in + # child kernel, according the order they appear in child.args + writes = child.get_written_variables() + reads = [arg.name for arg in child.args if arg.name not in writes] + writes = [arg.name for arg in child.args if arg.name in writes] + + if arg_map: + for inside, outside in six.iteritems(arg_map): + if inside not in child.arg_dict: + raise LoopyError("arg named '{0}' not in the child " + "kernel".format(inside)) + if inside in writes: + sar = [sar for sar in call.assignees + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + sar = [sar for sar in call.expression.parameters + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + if len(call.assignees) != len(writes): + raise LoopyError("expect {0} output variable(s), got {1}".format( + len(writes), len(call.assignees))) + if len(call.expression.parameters) != len(reads): + raise LoopyError("expect {0} input variable(s), got 
{1}".format( + len(reads), len(call.expression.parameters))) + for arg_name, sar in zip(writes, call.assignees): + child_arg_map[arg_name] = sar + for arg_name, sar in zip(reads, call.expression.parameters): + child_arg_map[arg_name] = sar + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + from loopy.isl_helpers import simplify_via_aff + from functools import reduce + + class KernelInliner(SubstitutionMapper): + """ + Mapper to replace variables (indices, temporaries, arguments) in + the inner kernel. + """ + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) + arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. 
A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg_in.shape): + raise LoopyError( + "Argument: {0} in child kernel: {1} does not have " + "constant shape.".format(arg_in, child.name)) + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) + for i in range(len(arg_in.shape))] + flatten_index = reduce( + lambda x, y: p.Sum((x, y)), + map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = simplify_via_aff(flatten_index) + + from loopy.symbolic import pw_aff_to_expr + bounds = [kernel.get_iname_bounds(i.name) + for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in parent kernel: {1} does not have " + "swept inames with constant size.".format( + sar, kernel.name)) + + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index = flatten_index - s * ind + new_indices.append(ind) + + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] + index_map = dict(zip(sar.swept_inames, new_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(child_arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + ing = kernel.get_instruction_id_generator() + insn_id = {} + for insn in 
child.instructions: + insn_id[insn.id] = ing("child_"+insn.id) + + # {{{ root and leave instructions in child kernel + + dep_map = child.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of child kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing("child_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for _insn in child.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) + within_inames = within_inames | call.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in child kernel + priority=call.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + # vim: foldmethod=marker -- GitLab From 60704094dd8eb36ab1ee20fb09a33f41147c677f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 27 Apr 2018 15:42:18 +0100 Subject: [PATCH 13/18] docstring and minor modifications --- loopy/transform/register_knl.py | 25 +++++++++++++++++++++++++ test/test_transform.py | 6 +++++- 2 files changed, 30 
insertions(+), 1 deletion(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 6d40942c9..6804e2972 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -115,6 +115,31 @@ def register_callable_kernel(parent, function_name, child): def inline_kernel(kernel, function, arg_map=None): + """ + This transformation inlines a callable child kernel into the parent kernel. + + :arg: kernel + + The parent kernel. + + :arg: function + + The name of the function call to which the callable kernel is inlined. + + :arg: arg_map + + Dictionary which maps argument names in the child kernel to variables + in the parnet kernel. If not provided, the arguments will be mapped + according to their access and position, i.e. the first argument in the + child kernel with write access will be mapped to the first assignee in + the function call, and so on. + + """ + + assert isinstance(kernel, LoopKernel) + assert isinstance(function, str) + if not arg_map: + assert isinstance(arg_map, dict) if function not in kernel.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) diff --git a/test/test_transform.py b/test/test_transform.py index c5180ead1..ee4627cfd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -500,9 +500,12 @@ def test_inline_kernel(ctx_factory): knl4 = lp.register_callable_kernel(knl4, 'func', knl1) knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) evt, (out,) = knl4(queue, x=x, y=y) - z = np.tile(np.flip(x + y * 2, 0), [16, 1]) + z = x + y * 2 + z = z[::-1] + z = np.tile(z, [16, 1]) assert np.allclose(out, z) + def test_inline_kernel_2d(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -562,6 +565,7 @@ def test_inline_kernel_2d(ctx_factory): z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) assert np.allclose(out, z) + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 
51cd5945fb12a32f1ef6f8bf72ac41f6a126d6f3 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 1 May 2018 14:09:20 +0100 Subject: [PATCH 14/18] remove register_knl.py --- loopy/transform/register_callable.py | 11 +- loopy/transform/register_knl.py | 375 --------------------------- 2 files changed, 4 insertions(+), 382 deletions(-) delete mode 100644 loopy/transform/register_knl.py diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 4ce3c72cc..3c5d8fbcf 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -244,7 +244,6 @@ def inline_kernel(kernel, function, arg_map=None): from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper from loopy.isl_helpers import simplify_via_aff - from functools import reduce class KernelInliner(SubstitutionMapper): """ @@ -267,11 +266,9 @@ def inline_kernel(kernel, function, arg_map=None): raise LoopyError( "Argument: {0} in child kernel: {1} does not have " "constant shape.".format(arg_in, child.name)) - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) - for i in range(len(arg_in.shape))] - flatten_index = reduce( - lambda x, y: p.Sum((x, y)), - map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = sum( + idx * tag.stride + for idx, tag in zip(outer_indices, arg_in.dim_tags)) flatten_index = simplify_via_aff(flatten_index) from loopy.symbolic import pw_aff_to_expr @@ -289,7 +286,7 @@ def inline_kernel(kernel, function, arg_map=None): new_indices = [] for s in sizes: ind = flatten_index // s - flatten_index = flatten_index - s * ind + flatten_index -= s * ind new_indices.append(ind) # Lastly, map sweeping indices to indices in Subscripts diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py deleted file mode 100644 index 6804e2972..000000000 --- a/loopy/transform/register_knl.py +++ /dev/null @@ -1,375 +0,0 @@ -from __future__ import division, absolute_import - -__copyright__ 
= "Copyright (C) 2018 Kaushik Kulkarni" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - - -import six - -import numpy as np - -from loopy.kernel import LoopKernel -from loopy.kernel.creation import FunctionScoper -from loopy.diagnostic import LoopyError -from loopy.kernel.function_interface import CallableKernel - -from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, - CInstruction, _DataObliviousInstruction) - -__doc__ = """ -.. currentmodule:: loopy - -.. autofunction:: register_callable_kernel - -.. autofunction:: inline_kernel -""" - - -# {{{ main entrypoint - -def register_callable_kernel(parent, function_name, child): - """ - The purpose of this transformation is so that one can inoke the child - kernel in the parent kernel. - - :arg parent - - This is the "main" kernel which will mostly remain unaltered and one - can interpret it as stitching up the child kernel in the parent kernel. 
- - :arg function_name - - The name of the function call with which the child kernel must be - associated in the parent kernel - - :arg child - - This is like a function in every other language and this might be - invoked in one of the instructions of the parent kernel. - - ..note:: - - One should note that the kernels would go under stringent compatibilty - tests so that both of them can be confirmed to be made for each other. - """ - - # {{{ sanity checks - - assert isinstance(parent, LoopKernel) - assert isinstance(child, LoopKernel) - assert isinstance(function_name, str) - - # }}} - - # scoping the function - function_scoper = FunctionScoper(set([function_name])) - new_insns = [] - - for insn in parent.instructions: - if isinstance(insn, CallInstruction): - new_insn = insn.copy(expression=function_scoper(insn.expression)) - new_insns.append(new_insn) - elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, - CInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("scope_functions not implemented for %s" % - type(insn)) - - # adding the scoped function to the scoped function dict of the parent - # kernel. - - scoped_functions = parent.scoped_functions.copy() - - if function_name in scoped_functions: - raise LoopyError("%s is already being used as a funciton name -- maybe" - "use a different name for registering the subkernel") - - scoped_functions[function_name] = CallableKernel(name=function_name, - subkernel=child) - - # returning the parent kernel with the new scoped function dictionary - return parent.copy(scoped_functions=scoped_functions, - instructions=new_insns) - -# }}} - - -def inline_kernel(kernel, function, arg_map=None): - """ - This transformation inlines a callable child kernel into the parent kernel. - - :arg: kernel - - The parent kernel. - - :arg: function - - The name of the function call to which the callable kernel is inlined. 
- - :arg: arg_map - - Dictionary which maps argument names in the child kernel to variables - in the parnet kernel. If not provided, the arguments will be mapped - according to their access and position, i.e. the first argument in the - child kernel with write access will be mapped to the first assignee in - the function call, and so on. - - """ - - assert isinstance(kernel, LoopKernel) - assert isinstance(function, str) - if not arg_map: - assert isinstance(arg_map, dict) - - if function not in kernel.scoped_functions: - raise LoopyError("function: {0} does not exist".format(function)) - - child = kernel.scoped_functions[function].subkernel - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - if call.expression.function.name != function: - continue - - # {{{ duplicate and rename inames - - import islpy as isl - - vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng("child_"+iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, child_iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # }}} - - # {{{ rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng("child_"+name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - child_arg_map = {} # child arg name -> SubArrayRef - - # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to - # the written arguments, and in1, in2 to the readonly arguments in - # child kernel, according 
the order they appear in child.args - writes = child.get_written_variables() - reads = [arg.name for arg in child.args if arg.name not in writes] - writes = [arg.name for arg in child.args if arg.name in writes] - - if arg_map: - for inside, outside in six.iteritems(arg_map): - if inside not in child.arg_dict: - raise LoopyError("arg named '{0}' not in the child " - "kernel".format(inside)) - if inside in writes: - sar = [sar for sar in call.assignees - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - sar = [sar for sar in call.expression.parameters - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - if len(call.assignees) != len(writes): - raise LoopyError("expect {0} output variable(s), got {1}".format( - len(writes), len(call.assignees))) - if len(call.expression.parameters) != len(reads): - raise LoopyError("expect {0} input variable(s), got {1}".format( - len(reads), len(call.expression.parameters))) - for arg_name, sar in zip(writes, call.assignees): - child_arg_map[arg_name] = sar - for arg_name, sar in zip(reads, call.expression.parameters): - child_arg_map[arg_name] = sar - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - from loopy.isl_helpers import simplify_via_aff - from functools import reduce - - class KernelInliner(SubstitutionMapper): - """ - Mapper to replace variables (indices, temporaries, arguments) in - the inner kernel. 
- """ - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) - arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in arg_in.shape): - raise LoopyError( - "Argument: {0} in child kernel: {1} does not have " - "constant shape.".format(arg_in, child.name)) - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) - for i in range(len(arg_in.shape))] - flatten_index = reduce( - lambda x, y: p.Sum((x, y)), - map(p.Product, zip(outer_indices, inner_sizes))) - flatten_index = simplify_via_aff(flatten_index) - - from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in parent kernel: {1} does not have " - "swept inames with constant size.".format( - sar, kernel.name)) - - sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] - - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index = flatten_index - s * ind - new_indices.append(ind) - - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = 
dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(child_arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - ing = kernel.get_instruction_id_generator() - insn_id = {} - for insn in child.instructions: - insn_id[insn.id] = ing("child_"+insn.id) - - # {{{ root and leave instructions in child kernel - - dep_map = child.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of child kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing("child_start"), - within_inames=call.within_inames, - depends_on=call.depends_on - ) - noop_end = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for _insn in child.instructions: - insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) - within_inames = within_inames | call.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in child kernel - priority=call.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - 
new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - - -# vim: foldmethod=marker -- GitLab From 1c5cfa2da7167f191640f1d9029b85080d1319a9 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 2 May 2018 17:40:11 +0100 Subject: [PATCH 15/18] updates based on feedbacks on MR --- loopy/__init__.py | 3 +- loopy/kernel/function_interface.py | 7 +- loopy/preprocess.py | 239 +++++++++++++++++++++++++- loopy/transform/register_callable.py | 242 +-------------------------- test/test_transform.py | 22 +-- 5 files changed, 253 insertions(+), 260 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 1c7951dc0..a5850ec0a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,7 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup, inline_kernel) + register_function_lookup) # }}} @@ -230,7 +230,6 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", - "inline_kernel", # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 368267d76..79c9cb2e1 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -440,12 +440,12 @@ class CallableKernel(InKernelCallable): """ fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + "name_in_target", "inline"]) init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") + "name_in_target", "inline") def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): + arg_id_to_descr=None, name_in_target=None, inline=False): super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, @@ -454,6 +454,7 @@ class CallableKernel(InKernelCallable): 
subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target + self.inline = inline self.subkernel = subkernel.copy( args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) if arg.dtype is not None else arg for arg in subkernel.args]) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bf1467c16..242422d61 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,7 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import CombineMapper +from loopy.symbolic import CombineMapper, SubstitutionMapper, pw_aff_to_expr from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -2477,6 +2477,239 @@ def make_functions_ready_for_codegen(kernel): # }}} +# {{{ inline callable kernel + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. + + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + import numpy as np + from pymbolic.mapper.substitutor import make_subst_func + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + arg = self.arg_dict[expr.aggregate.name] # Arg in callee + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g.
A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel does not have " + "constant shape.".format(arg)) + flatten_index = sum( + idx * tag.stride + for idx, tag in zip(outer_indices, arg.dim_tags)) + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + bounds = [self.caller.get_iname_bounds(i.name) + for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in caller kernel does not have " + "swept inames with constant size.".format(sar)) + + sizes = [int(np.prod(sizes[i + 1:])) for i in range(len(sizes))] + + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index -= s * ind + new_indices.append(ind) + + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] + index_map = dict(zip(sar.swept_inames, new_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + +def inline_callable_kernels(kernel): + + from loopy import CallInstruction + import islpy as isl + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + + callable = kernel.scoped_functions[call.expression.function.name] + if not callable.inline: + continue + + callee = callable.subkernel + callee_label = callee.name[:4] + "_" # label used to generate new names + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in
callee.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + for domain in callee.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = call.assignees # writes + parameters = call.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(call.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee) + kw_parameters = call.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for arg in callee.args: + if arg.direction == "out": + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k),
p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee.arg_dict) + + insn_id = {} + for insn in callee.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leaf instructions in callee kernel + + dep_map = callee.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for _insn in callee.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | call.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probably need to keep priority in callee kernel + priority=call.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder()) @@ -2548,6 +2781,9 @@ def preprocess_kernel(kernel, device=None): # defaults from being applied. kernel = realize_reduction(kernel, unknown_types_ok=False) + # inlining callable kernels that are marked with inline=True. + kernel = inline_callable_kernels(kernel) + # type specialize functions that were missed during the type inference. kernel = make_functions_ready_for_codegen(kernel) @@ -2563,6 +2799,7 @@ def preprocess_kernel(kernel, device=None): # inferring the shape and dim_tags of the arguments involved in a function # call. kernel = infer_arg_descr(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 3c5d8fbcf..8300fa374 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -22,10 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - -import numpy as np - from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord @@ -82,13 +78,15 @@ class RegisterCalleeKernel(ImmutableRecord): return None -def register_callable_kernel(caller_kernel, function_name, callee_kernel): +def register_callable_kernel(caller_kernel, function_name, callee_kernel, + inline=False): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. :arg function_name: An instance of :class:`str`. :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg inline: Boolean flag of inlining callee kernel into caller. """ # {{{ sanity checks @@ -130,7 +128,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # kernel. 
callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, - is_master_kernel=False)) + is_master_kernel=False), inline=inline) # disabling global barriers for callee kernel from loopy import set_options @@ -141,236 +139,4 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} - -def inline_kernel(kernel, function, arg_map=None): - - from loopy import CallInstruction, LoopyError - - if function not in kernel.scoped_functions: - raise LoopyError("function: {0} does not exist".format(function)) - - child = kernel.scoped_functions[function].subkernel - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - if call.expression.function.name != function: - continue - - # {{{ duplicate and rename inames - - import islpy as isl - - vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng("child_"+iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, child_iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # }}} - - # {{{ rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng("child_"+name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - child_arg_map = {} # child arg name -> SubArrayRef - - # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to - # the written arguments, and in1, in2 to the readonly arguments in - # child kernel, according the order they appear in child.args 
- writes = child.get_written_variables() - reads = [arg.name for arg in child.args if arg.name not in writes] - writes = [arg.name for arg in child.args if arg.name in writes] - - if arg_map: - for inside, outside in six.iteritems(arg_map): - if inside not in child.arg_dict: - raise LoopyError("arg named '{0}' not in the child " - "kernel".format(inside)) - if inside in writes: - sar = [sar for sar in call.assignees - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - sar = [sar for sar in call.expression.parameters - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - if len(call.assignees) != len(writes): - raise LoopyError("expect {0} output variable(s), got {1}".format( - len(writes), len(call.assignees))) - if len(call.expression.parameters) != len(reads): - raise LoopyError("expect {0} input variable(s), got {1}".format( - len(reads), len(call.expression.parameters))) - for arg_name, sar in zip(writes, call.assignees): - child_arg_map[arg_name] = sar - for arg_name, sar in zip(reads, call.expression.parameters): - child_arg_map[arg_name] = sar - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - from loopy.isl_helpers import simplify_via_aff - - class KernelInliner(SubstitutionMapper): - """ - Mapper to replace variables (indices, temporaries, arguments) in - the inner kernel. 
- """ - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) - arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in arg_in.shape): - raise LoopyError( - "Argument: {0} in child kernel: {1} does not have " - "constant shape.".format(arg_in, child.name)) - flatten_index = sum( - idx * tag.stride - for idx, tag in zip(outer_indices, arg_in.dim_tags)) - flatten_index = simplify_via_aff(flatten_index) - - from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in parent kernel: {1} does not have " - "swept inames with constant size.".format( - sar, kernel.name)) - - sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] - - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index -= s * ind - new_indices.append(ind) - - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), 
p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(child_arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - ing = kernel.get_instruction_id_generator() - insn_id = {} - for insn in child.instructions: - insn_id[insn.id] = ing("child_"+insn.id) - - # {{{ root and leave instructions in child kernel - - dep_map = child.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of child kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing("child_start"), - within_inames=call.within_inames, - depends_on=call.depends_on - ) - noop_end = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for _insn in child.instructions: - insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) - within_inames = within_inames | call.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in child kernel - priority=call.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - # vim: foldmethod=marker 
diff --git a/test/test_transform.py b/test/test_transform.py index ee4627cfd..b08d674a5 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -480,25 +480,17 @@ def test_inline_kernel(ctx_factory): ] ) - knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) z = np.tile(x + y * 2, [16, 1]) - - knl2_arg_map = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) - evt, (out, ) = knl2_arg_map(queue, x=x, y=y) - assert np.allclose(out, z) - - knl2_no_arg_map = lp.inline_kernel(knl2, "func") - evt, (out, ) = knl2_no_arg_map(queue, x=x, y=y) + evt, (out, ) = knl2(queue, x=x, y=y) assert np.allclose(out, z) - knl3 = lp.register_callable_kernel(knl3, 'func', knl1) - knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) evt, (out,) = knl3(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1]).transpose() assert np.allclose(out, z) - knl4 = lp.register_callable_kernel(knl4, 'func', knl1) - knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) + knl4 = lp.register_callable_kernel(knl4, 'func', knl1, inline=True) evt, (out,) = knl4(queue, x=x, y=y) z = x + y * 2 z = z[::-1] @@ -553,14 +545,12 @@ def test_inline_kernel_2d(ctx_factory): ] ) - knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) evt, (out, ) = knl2(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1, 1]) assert np.allclose(out, z) - knl3 = lp.register_callable_kernel(knl3, 'func', knl1) - knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) evt, (out,) = knl3(queue, x=x, y=y) z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) assert np.allclose(out, z) -- GitLab From bc0ca75f385e96b92e1ea90803a769af3e6e8979 
Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 11:07:58 +0100 Subject: [PATCH 16/18] test for callable type before inlining --- loopy/preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 242422d61..e4494bbda 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2549,6 +2549,7 @@ class KernelInliner(SubstitutionMapper): def inline_callable_kernels(kernel): from loopy import CallInstruction + from loopy.kernel.function_interface import CallableKernel import islpy as isl for call in kernel.instructions: @@ -2556,6 +2557,10 @@ def inline_callable_kernels(kernel): continue callable = kernel.scoped_functions[call.expression.function.name] + + if not isinstance(callable, CallableKernel): + continue + if not callable.inline: continue -- GitLab From 18ee74a8aeeb1a718b30e3c6a036347aed034f34 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 11:48:52 +0100 Subject: [PATCH 17/18] test for function is scoped before inlining --- loopy/preprocess.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e4494bbda..8fe7acb78 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2556,6 +2556,9 @@ def inline_callable_kernels(kernel): if not isinstance(call, CallInstruction): continue + if call.expression.function.name not in kernel.scoped_functions: + continue + callable = kernel.scoped_functions[call.expression.function.name] if not isinstance(callable, CallableKernel): @@ -2773,6 +2776,10 @@ def preprocess_kernel(kernel, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) + # Inlining callable kernels that are marked with inline=True. + # This should happen after type inference but before other transformations. 
+ kernel = inline_callable_kernels(kernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel) @@ -2786,9 +2793,6 @@ def preprocess_kernel(kernel, device=None): # defaults from being applied. kernel = realize_reduction(kernel, unknown_types_ok=False) - # inlining callable kernels that are marked with inline=True. - kernel = inline_callable_kernels(kernel) - # type specialize functions that were missed during the type inference. kernel = make_functions_ready_for_codegen(kernel) -- GitLab From fe3e5166836831486f0946861f262e841008c511 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 12:31:14 +0100 Subject: [PATCH 18/18] test for Call expression before inlining --- loopy/preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8fe7acb78..1b1d9be38 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2550,12 +2550,17 @@ def inline_callable_kernels(kernel): from loopy import CallInstruction from loopy.kernel.function_interface import CallableKernel + from pymbolic.primitives import Call + import islpy as isl for call in kernel.instructions: if not isinstance(call, CallInstruction): continue + if not isinstance(call.expression, Call): + continue + if call.expression.function.name not in kernel.scoped_functions: continue -- GitLab