diff --git a/doc/reference.rst b/doc/reference.rst index 9d75a924b71847d04632b4945f9228c80c2e01f4..af39de655c560f5a7be5e98ba9debfc5649b0fdb 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -336,6 +336,8 @@ function, which is responsible for creating kernels: .. autofunction:: make_copy_kernel +.. autofunction:: fuse_kernels + Transforming Kernels -------------------- @@ -429,6 +431,8 @@ Manipulating Instructions .. autofunction:: remove_instructions +.. autofunction:: tag_instructions + Library interface ^^^^^^^^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index 12329bbf6472867c873958f1a18f20ba1016b5f7..453b4310339e05861fbe8a5db1d7d89e24dbbc65 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,8 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,6 +23,9 @@ THE SOFTWARE. """ +import six +from six.moves import range, zip + import islpy as isl from islpy import dim_type @@ -58,6 +57,7 @@ from loopy.library.reduction import register_reduction_parser from loopy.subst import extract_subst, expand_subst, temporary_to_subst from loopy.precompute import precompute from loopy.buffer import buffer_array +from loopy.fusion import fuse_kernels from loopy.padding import (split_arg_axis, find_padding_multiple, add_padding) from loopy.preprocess import (preprocess_kernel, realize_reduction, @@ -88,6 +88,7 @@ __all__ = [ "extract_subst", "expand_subst", "temporary_to_subst", "precompute", "buffer_array", + "fuse_kernels", "split_arg_axis", "find_padding_multiple", "add_padding", "get_dot_dependency_graph", @@ -539,7 +540,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return var(new_name) def map_instruction(self, insn): - if not self.within(((insn.id, None),)): + if not self.within(((insn.id, insn.tags),)): return insn new_fid = frozenset( @@ -1781,4 +1782,23 @@ def fold_constants(kernel): # }}} + +# {{{ tag_instructions + +def tag_instructions(kernel, new_tag, within=None): + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + new_insns = [] + for insn in kernel.instructions: + if within(((insn.id, insn.tags),)): + new_insns.append( + insn.copy(tags=insn.tags + (new_tag,))) + else: + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/fusion.py b/loopy/fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..434297b1b430ec27f013be9e6838ba321d22f8b7 --- /dev/null +++ b/loopy/fusion.py @@ -0,0 +1,240 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six + +import islpy as isl +from islpy import dim_type + +from loopy.diagnostic import LoopyError + + +def _find_fusable_loop_domain_index(domain, other_domains): + my_inames = set(domain.get_var_dict(dim_type.set)) + + overlap_domains = [] + for i, o_dom in enumerate(other_domains): + o_inames = set(o_dom.get_var_dict(dim_type.set)) + if my_inames & o_inames: + overlap_domains.append(i) + + if len(overlap_domains) >= 2: + raise LoopyError("more than one domain in one kernel has " + "overlapping inames with a " + "domain of the other kernel, cannot fuse: '%s'" + % domain) + + if len(overlap_domains) == 1: + return overlap_domains[0] + else: + return None + + +# {{{ generic merge helpers + +def _ordered_merge_lists(list_a, list_b): + result = list_a[:] + for item in list_b: + if item not in list_b: + result.append(item) + + return result + + +def _merge_dicts(item_name, dict_a, dict_b): + result = dict_a.copy() + + for k, v in six.iteritems(dict_b): + if k in result: + if v != result[k]: + raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s" + % (item_name, k, v, result[k])) + + else: + result[k] = v + + return result + + +def _merge_values(item_name, val_a, val_b): + if val_a != val_b: + raise LoopyError("inconsistent %ss in merge: %s and %s" + % (item_name, val_a, val_b)) + + return val_a + +# }}} + + +# {{{ two-kernel fusion + +def _fuse_two_kernels(knla, knlb): + from loopy.kernel import kernel_state + if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL: + raise LoopyError("can only fuse kernels in INITIAL state") + + # {{{ fuse instructions + + new_instructions = knla.instructions[:] + from pytools import UniqueNameGenerator + insn_id_gen = UniqueNameGenerator( + set([insna.id for insna in new_instructions])) + for insnb in knlb.instructions: + new_instructions.append( + insnb.copy(id=insn_id_gen(insnb.id))) + + # }}} + + # {{{ fuse domains + + new_domains = knla.domains[:] + + for dom_b in knlb.domains: + i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains) + if i_fuse is None: + new_domains.append(dom_b) + else: + dom_a = new_domains[i_fuse] + dom_a, dom_b = isl.align_two(dom_a, dom_b) + + shared_inames = list( + set(dom_a.get_var_dict(dim_type.set)) + & + set(dom_b.get_var_dict(dim_type.set))) + + dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set]) + dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set]) + + if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s): + raise LoopyError("kernels do not agree on domain of " + "inames '%s'" % (",".join(shared_inames))) + + new_domain = dom_a & dom_b + + new_domains[i_fuse] = new_domain + + # }}} + + # {{{ fuse args + + new_args = knla.args[:] + for b_arg in knlb.args: + if b_arg.name not in knla.arg_dict: + new_args.append(b_arg) + else: + if b_arg != knla.arg_dict[b_arg.name]: + raise LoopyError( + "argument '%s' has inconsistent definition between " + "the two kernels being merged" % b_arg.name) + + # }}} + + # {{{ fuse assumptions + + assump_a = knla.assumptions + assump_b = knlb.assumptions + assump_a, assump_b = isl.align_two(assump_a, assump_b) + + shared_param_names = list( + set(dom_a.get_var_dict(dim_type.set)) + & + set(dom_b.get_var_dict(dim_type.set))) + + assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) + assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) + + if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s): + raise LoopyError("assumptions do not agree on kernels to be merged") + + new_assumptions = (assump_a & assump_b).params() + + # }}} + + from loopy.kernel import LoopKernel + return LoopKernel( + domains=new_domains, + instructions=new_instructions, + args=new_args, + name="%s_and_%s" % (knla.name, knlb.name), + preambles=_ordered_merge_lists(knla.preambles, knlb.preambles), + preamble_generators=_ordered_merge_lists( + knla.preamble_generators, knlb.preamble_generators), + assumptions=new_assumptions, + local_sizes=_merge_dicts( + "local size", knla.local_sizes, knlb.local_sizes), + temporary_variables=_merge_dicts( + "temporary variable", + knla.temporary_variables, + knlb.temporary_variables), + iname_to_tag=_merge_dicts( + "iname-to-tag mapping", + knla.iname_to_tag, + knlb.iname_to_tag), + substitutions=_merge_dicts( + "substitution", + knla.substitutions, + knlb.substitutions), + function_manglers=_ordered_merge_lists( + knla.function_manglers, + knlb.function_manglers), + symbol_manglers=_ordered_merge_lists( + knla.symbol_manglers, + knlb.symbol_manglers), + + iname_slab_increments=_merge_dicts( + "iname slab increment", + knla.iname_slab_increments, + knlb.iname_slab_increments), + loop_priority=_ordered_merge_lists( + knla.loop_priority, + knlb.loop_priority), + silenced_warnings=_ordered_merge_lists( + knla.silenced_warnings, + knlb.silenced_warnings), + applied_iname_rewrites=_ordered_merge_lists( + knla.applied_iname_rewrites, + knlb.applied_iname_rewrites), + index_dtype=_merge_values( + "index dtype", + knla.index_dtype, + knlb.index_dtype), + target=_merge_values( + "target", + knla.target, + knlb.target), + options=knla.options) + +# }}} + + +def fuse_kernels(kernels): + kernels = list(kernels) + result = kernels.pop(0) + while kernels: + result = _fuse_two_kernels(result, kernels.pop(0)) + + return result + +# vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 14562b384a89f1455a3a29ac161682889f1d3e01..1f64e59e6c9e8ab6d599ea0cc61937df5a60ffcb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1,10 +1,6 @@ """Kernel object.""" -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -28,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six +from six.moves import range, zip import numpy as np from pytools import RecordWithoutPickling, Record, memoize_method diff --git a/loopy/loop.py b/loopy/loop.py index 54030bd8f0ebaaded29078f28a16b1dcc66a98e4..4592463822a2321745aaf48a316d16c98d4efca3 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -104,8 +104,7 @@ def fuse_loop_domains(kernel): outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom) - from loopy.isl_helpers import convexify - new_domains.insert(min_idx, convexify(inner_dom & outer_dom)) + new_domains.insert(min_idx, inner_dom & outer_dom) break if new_domains: diff --git a/test/test_fortran.py b/test/test_fortran.py index a2167cdafcf152dca282a39fff77d8b829c18f62..ce27424d8cfb7e4256e745a8806c3b6bedb6b4ac 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -321,12 +321,6 @@ def test_matmul(ctx_factory, buffer_inames): ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) - # # FIXME: Make r/w tests possible, reactivate the above - # knl = lp.preprocess_kernel(knl) - # for k in lp.generate_loop_schedules(knl): - # code, _ = lp.generate_code(k) - # print(code) - @pytest.mark.xfail def test_batched_sparse(): @@ -374,6 +368,47 @@ def test_batched_sparse(): knl = lp.fix_parameters(knl, nvecs=4) +def test_fuse_kernels(ctx_factory): + fortran_template = """ + subroutine {name}(nelements, ndofs, result, d, q) + implicit none + integer e, i, j, k + integer nelements, ndofs + real*8 result(nelements, ndofs, ndofs) + real*8 q(nelements, ndofs, ndofs) + real*8 d(ndofs, ndofs) + + do e = 1,nelements + do i = 1,ndofs + do j = 1,ndofs + do k = 1,ndofs + {line} + end do + end do + end do + end do + end subroutine + """ + + xd_line = "result(e,i,j) = result(e,i,j) + d(i,k)*q(e,i,k)" + yd_line = "result(e,i,j) = result(e,i,j) + d(i,k)*q(e,k,j)" + + from loopy.frontend.fortran import f2loopy + xderiv, = f2loopy( + fortran_template.format(line=xd_line, name="xderiv")) + yderiv, = f2loopy( + fortran_template.format(line=yd_line, name="yderiv")) + xyderiv, = f2loopy( + fortran_template.format( + line=(xd_line + "\n" + yd_line), name="xyderiv")) + + knl = lp.fuse_kernels((xderiv, yderiv)) + knl = lp.set_loop_priority(knl, "e,i,j,k") + + ctx = ctx_factory() + lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])