diff --git a/doc/reference.rst b/doc/reference.rst index 595fd4b5fa5bac3ccd69cfb1b7a05ebb1c7b75f9..75592a63c4fec2f9bfb93b2eae8bb8e9f67af8e9 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -402,6 +402,8 @@ Caching, Precomputation and Prefetching .. autofunction:: add_prefetch +.. autofunction:: buffer_write + Influencing data access ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index 3027e10d38a98b4c86737502b7fd20f41c3cc3c3..d9f67a43724a2b28d7192d3b79a04750920509ba 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -55,6 +55,7 @@ from loopy.kernel.creation import make_kernel, UniqueName from loopy.library.reduction import register_reduction_parser from loopy.subst import extract_subst, expand_subst, temporary_to_subst from loopy.precompute import precompute +from loopy.buffer_writes import buffer_write from loopy.padding import (split_arg_axis, find_padding_multiple, add_padding) from loopy.preprocess import (preprocess_kernel, realize_reduction, @@ -82,7 +83,7 @@ __all__ = [ "register_reduction_parser", "extract_subst", "expand_subst", "temporary_to_subst", - "precompute", + "precompute", "buffer_write", "split_arg_axis", "find_padding_multiple", "add_padding", "get_dot_dependency_graph", @@ -1570,11 +1571,11 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): equations = [equations] import re - EQN_RE = re.compile(r"^([^=]+)=([^=]+)$") + eqn_re = re.compile(r"^([^=]+)=([^=]+)$") def parse_equation(eqn): if isinstance(eqn, str): - eqn_match = EQN_RE.match(eqn) + eqn_match = eqn_re.match(eqn) if not eqn_match: raise ValueError("invalid equation: %s" % eqn) diff --git a/loopy/array_buffer.py b/loopy/array_buffer.py index c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c..6d1c56eef27e6e809dfc6b4048f7a570d1c29a10 100644 --- a/loopy/array_buffer.py +++ b/loopy/array_buffer.py @@ -350,7 +350,7 @@ class ArrayToBufferMap(object): set(global_s2s_par_dom.get_var_names(dim_type.param)) & self.kernel.all_inames()) - for arg in accdesc.args: + for arg in accdesc.storage_axis_exprs: arg_inames.update(get_dependencies(arg)) arg_inames = frozenset(arg_inames) diff --git a/loopy/buffer_writes.py b/loopy/buffer_writes.py new file mode 100644 index 0000000000000000000000000000000000000000..7083e3c269aea7d663c1c38605ecd8b581324b6d --- /dev/null +++ b/loopy/buffer_writes.py @@ -0,0 +1,386 @@ +from __future__ import division, absolute_import +import six +from six.moves import range, zip + +__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap, + AccessDescriptor) +from loopy.symbolic import ExpandingIdentityMapper, SubstitutionMapper +from pymbolic.mapper.substitutor import make_subst_func + +from pymbolic import var + + +# {{{ replace array access + +class ArrayAccessReplacer(ExpandingIdentityMapper): + def __init__(self, kernel, var_name, within, array_base_map, buf_var): + super(ArrayAccessReplacer, self).__init__( + kernel.substitutions, kernel.get_var_name_generator()) + + self.kernel = kernel + self.within = within + + self.array_base_map = array_base_map + + self.var_name = var_name + self.modified_insn_ids = set() + + self.buf_var = buf_var + + def map_variable(self, expr, expn_state): + result = None + if expr.name == self.var_name and self.within(expn_state): + result = self.map_array_access((), expn_state) + + if result is None: + return super(ArrayAccessReplacer, self).map_variable(expr, expn_state) + else: + self.modified_insn_ids.add(expn_state.insn_id) + return result + + def map_subscript(self, expr, expn_state): + result = None + if expr.aggregate.name == self.var_name and self.within(expn_state): + result = self.map_array_access(expr.index, expn_state) + + if result is None: + return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state) + else: + self.modified_insn_ids.add(expn_state.insn_id) + return result + + def map_array_access(self, index, expn_state): + accdesc = AccessDescriptor( + identifier=None, + expands_footprint=False, + storage_axis_exprs=index) + + if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): + return None + + abm = self.array_base_map + + assert len(index) == len(abm.non1_storage_axis_flags) + + access_subscript = [] + for i in range(len(index)): + if not abm.non1_storage_axis_flags[i]: + continue + + ax_index = index[i] + from loopy.isl_helpers import simplify_via_aff + ax_index = simplify_via_aff( + ax_index - abm.storage_base_indices[i]) + + access_subscript.append(ax_index) + + result = self.buf_var + if access_subscript: + result = result.index(tuple(access_subscript)) + + # Can't possibly be nested, but recurse anyway to + # make sure substitution rules referenced below here + # do not get thrown away. + self.rec(result, expn_state.copy(arg_context={})) + + return result + +# }}} + + +def buffer_write(kernel, var_name, buffer_inames, init_expression=None, + store_expression=None, within=None, default_tag="l.auto", + temporary_is_local=None, fetch_bounding_box=False, + within_inames=()): + """ + :arg init_expression: Either *None* (indicating the prior value of the buffered + array should be read) or an expression optionally involving the + variable 'base' (which references the associated location in the array + being buffered). + :arg store_expression: Either *None* or an expression involving + variables 'base' and 'buffer' (without array indices). + """ + + # {{{ process arguments + + if isinstance(init_expression, str): + from loopy.symbolic import parse + init_expression = parse(init_expression) + + if isinstance(store_expression, str): + from loopy.symbolic import parse + store_expression = parse(store_expression) + + if isinstance(buffer_inames, str): + buffer_inames = buffer_inames.split(",") + + if isinstance(within_inames, str): + within_inames = within_inames.split(",") + + for iname in buffer_inames: + if iname not in kernel.all_inames(): + raise RuntimeError("sweep iname '%s' is not a known iname" + % iname) + + buffer_inames = list(buffer_inames) + buffer_inames_set = frozenset(buffer_inames) + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + if var_name in kernel.arg_dict: + var_descr = kernel.arg_dict[var_name] + elif var_name in kernel.temporary_variables: + var_descr = kernel.temporary_variables[var_name] + else: + raise ValueError("variable '%s' not found" % var_name) + + from loopy.kernel.data import ArrayBase + if isinstance(var_descr, ArrayBase): + var_shape = var_descr.shape + else: + var_shape = () + + if temporary_is_local is None: + import loopy as lp + temporary_is_local = lp.auto + + # }}} + + var_name_gen = kernel.get_var_name_generator() + + access_descriptors = [] + for insn in kernel.instructions: + if not within((insn.id, insn.tags)): + continue + + for assignee, index in insn.assignees_and_indices(): + if assignee == var_name: + access_descriptors.append( + AccessDescriptor( + identifier=insn.id, + expands_footprint=True, + storage_axis_exprs=index)) + + # {{{ use given / find new storage_axes + + init_inames = [] + store_inames = [] + new_iname_to_tag = {} + + for i in range(len(var_shape)): + init_iname = var_name_gen("%s_init_%d" % (var_name, i)) + store_iname = var_name_gen("%s_store_%d" % (var_name, i)) + + new_iname_to_tag[init_iname] = default_tag + new_iname_to_tag[store_iname] = default_tag + + init_inames.append(init_iname) + store_inames.append(store_iname) + + # }}} + + # {{{ modify loop domains + + non1_init_inames = [] + non1_store_inames = [] + + if var_shape: + # {{{ find domain to be changed + + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel, buffer_inames_set) + + if domch.leaf_domain_index is not None: + # If the sweep inames are at home in parent domains, then we'll add + # fetches with loops over copies of these parent inames that will end + # up being scheduled *within* loops over these parents. + + for iname in buffer_inames_set: + if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: + raise RuntimeError("buffer iname '%s' is not 'at home' in the " + "sweep's leaf domain" % iname) + + # }}} + + abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames, + access_descriptors, len(var_shape)) + + for i in range(len(var_shape)): + if abm.non1_storage_axis_flags[i]: + non1_init_inames.append(init_inames[i]) + non1_store_inames.append(store_inames[i]) + else: + del new_iname_to_tag[init_inames[i]] + del new_iname_to_tag[store_inames[i]] + + new_domain = domch.domain + new_domain = abm.augment_domain_with_sweep( + new_domain, non1_init_inames, + boxify_sweep=fetch_bounding_box) + new_domain = abm.augment_domain_with_sweep( + new_domain, non1_store_inames, + boxify_sweep=fetch_bounding_box) + new_kernel_domains = domch.get_domains_with(new_domain) + del new_domain + + else: + # leave kernel domains unchanged + new_kernel_domains = kernel.domains + + abm = NoOpArrayToBufferMap() + + # }}} + + # {{{ set up temp variable + + import loopy as lp + + buf_var_name = var_name_gen(based_on=var_name+"_buf") + + new_temporary_variables = kernel.temporary_variables.copy() + temp_var = lp.TemporaryVariable( + name=buf_var_name, + dtype=var_descr.dtype, + base_indices=(0,)*len(abm.non1_storage_shape), + shape=tuple(abm.non1_storage_shape), + is_local=temporary_is_local) + + new_temporary_variables[buf_var_name] = temp_var + + # }}} + + new_insns = [] + + buf_var = var(buf_var_name) + + # {{{ generate init instruction + + buf_var_init = buf_var + if non1_init_inames: + buf_var_init = buf_var_init.index( + tuple(var(iname) for iname in non1_init_inames)) + + init_base = var(var_name) + + init_subscript = [] + init_iname_idx = 0 + if var_shape: + for i in range(len(var_shape)): + ax_subscript = abm.storage_base_indices[i] + if abm.non1_storage_axis_flags[i]: + ax_subscript += var(non1_init_inames[init_iname_idx]) + init_iname_idx += 1 + init_subscript.append(ax_subscript) + + if init_subscript: + init_base = init_base.index(tuple(init_subscript)) + + if init_expression is None: + init_expression = init_base + else: + init_expression = init_expression + init_expression = SubstitutionMapper( + make_subst_func({ + "base": init_base, + }))(init_expression) + + init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name) + from loopy.kernel.data import ExpressionInstruction + init_instruction = ExpressionInstruction(id=init_insn_id, + assignee=buf_var_init, + expression=init_expression, + forced_iname_deps=frozenset(within_inames)) + + # }}} + + aar = ArrayAccessReplacer(kernel, var_name, within, abm, buf_var) + kernel = aar.map_kernel(kernel) + + # {{{ add init_insn_id to insn_deps + + new_insns = [] + + for insn in kernel.instructions: + if insn.id in aar.modified_insn_ids: + new_insns.append( + insn.copy( + insn_deps=insn.insn_deps | frozenset([init_insn_id]))) + else: + new_insns.append(insn) + + # }}} + + # {{{ generate store instruction + + buf_var_store = buf_var + if non1_store_inames: + buf_var_store = buf_var_store.index( + tuple(var(iname) for iname in non1_store_inames)) + + store_subscript = [] + store_iname_idx = 0 + if var_shape: + for i in range(len(var_shape)): + ax_subscript = abm.storage_base_indices[i] + if abm.non1_storage_axis_flags[i]: + ax_subscript += var(non1_store_inames[store_iname_idx]) + store_iname_idx += 1 + store_subscript.append(ax_subscript) + + store_target = var(var_name) + if store_subscript: + store_target = store_target.index(tuple(store_subscript)) + + if store_expression is None: + store_expression = buf_var_store + else: + store_expression = SubstitutionMapper( + make_subst_func({ + "base": store_target, + "buffer": buf_var_store, + }))(store_expression) + + from loopy.kernel.data import ExpressionInstruction + store_instruction = ExpressionInstruction( + id=kernel.make_unique_instruction_id(based_on="store_"+var_name), + insn_deps=frozenset(aar.modified_insn_ids), + assignee=store_target, + expression=store_expression, + forced_iname_deps=frozenset(within_inames)) + + # }}} + + kernel = kernel.copy( + domains=new_kernel_domains, + instructions=new_insns + [init_instruction, store_instruction], + temporary_variables=new_temporary_variables) + + from loopy import tag_inames + kernel = tag_inames(kernel, new_iname_to_tag) + + return kernel + +# vim: foldmethod=marker diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 93be120d2cca92e9e887ffb7908816da82b2c925..fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -354,4 +354,13 @@ def boxify(cache_manager, domain, box_inames, context): return convexify(result) + +def simplify_via_aff(expr): + from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + deps = get_dependencies(expr) + return aff_to_expr(aff_from_expr( + isl.Space.create_from_names(isl.Context(), list(deps)), + expr)) + + # vim: foldmethod=marker diff --git a/loopy/precompute.py b/loopy/precompute.py index de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c..3044b8f7dcc50907b01c9710b0e7880aac93ba4c 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -25,7 +25,6 @@ THE SOFTWARE. """ -import islpy as isl from loopy.symbolic import (get_dependencies, SubstitutionMapper, ExpandingIdentityMapper) from pymbolic.mapper.substitutor import make_subst_func @@ -57,13 +56,7 @@ def storage_axis_exprs(storage_axis_sources, args): return result -def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr - deps = get_dependencies(expr) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.Context(), list(deps)), - expr)) - +# {{{ gather rule invocations class RuleInvocationGatherer(ExpandingIdentityMapper): def __init__(self, kernel, subst_name, subst_tag, within): @@ -130,20 +123,20 @@ class RuleInvocationGatherer(ExpandingIdentityMapper): return 0 # exact value irrelevant +# }}} + + +# {{{ replace rule invocation class RuleInvocationReplacer(ExpandingIdentityMapper): def __init__(self, kernel, subst_name, subst_tag, within, access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, - storage_base_indices, non1_storage_axis_names, + non1_storage_axis_names, target_var_name): ExpandingIdentityMapper.__init__(self, kernel.substitutions, kernel.get_var_name_generator()) - from loopy.symbolic import SubstitutionRuleExpander - self.subst_expander = SubstitutionRuleExpander( - kernel.substitutions, kernel.get_var_name_generator()) - self.kernel = kernel self.subst_name = subst_name self.subst_tag = subst_tag @@ -154,7 +147,6 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): self.storage_axis_names = storage_axis_names self.storage_axis_sources = storage_axis_sources - self.storage_base_indices = storage_base_indices self.non1_storage_axis_names = non1_storage_axis_names self.target_var_name = target_var_name @@ -198,11 +190,13 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): assert len(arguments) == len(rule.arguments) + abm = self.array_base_map + stor_subscript = [] for sax_name, sax_source, sax_base_idx in zip( self.storage_axis_names, self.storage_axis_sources, - self.storage_base_indices): + abm.storage_base_indices): if sax_name not in self.non1_storage_axis_names: continue @@ -213,6 +207,7 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): # an iname ax_index = var(sax_source) + from loopy.isl_helpers import simplify_via_aff ax_index = simplify_via_aff(ax_index - sax_base_idx) stor_subscript.append(ax_index) @@ -227,6 +222,8 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): return new_outer_expr +# }}} + def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={}, @@ -301,6 +298,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, raise RuntimeError("sweep iname '%s' is not a known iname" % iname) + sweep_inames = list(sweep_inames) + sweep_inames_set = frozenset(sweep_inames) + if isinstance(storage_axes, str): storage_axes = storage_axes.split(",") @@ -398,9 +398,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - sweep_inames = list(sweep_inames) - sweep_inames_set = frozenset(sweep_inames) - # {{{ find inames used in arguments expanding_usage_arg_deps = set() @@ -573,7 +570,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within, access_descriptors, abm, storage_axis_names, storage_axis_sources, - abm.storage_base_indices, non1_storage_axis_names, + non1_storage_axis_names, target_var_name) kernel = invr.map_kernel(kernel) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 4492f032d0d355e9268dab049b55c0613b22b47b..9309fd0418e453311240cce8e9f6c73a8d428f9c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -352,6 +352,10 @@ class ExpansionState(Record): :ivar arg_context: a dict representing current argument values """ + @property + def insn_id(self): + return self.stack[0][0] + class SubstitutionRuleRenamer(IdentityMapper): def __init__(self, renames): diff --git a/test/test_fortran.py b/test/test_fortran.py index 33826e4063190f7d7d48b1bfcb4122931868bd1e..c68e963a60585634437d8a80b5eb1cbb1e37e2c2 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -282,17 +282,12 @@ def test_matmul(ctx_factory): do j = 1,n do i = 1,m - temp = 0 do k = 1,l - temp = temp + b(k,j)*a(i,k) + c(i,j) = c(i,j) + b(k,j)*a(i,k) end do - c(i,j) = temp end do end do end subroutine - - !$loopy begin transform - !$loopy end transform """ from loopy.frontend.fortran import f2loopy @@ -307,15 +302,28 @@ def test_matmul(ctx_factory): knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) + knl = lp.assume(knl, "n mod 32 = 0") + knl = lp.assume(knl, "m mod 32 = 0") + knl = lp.assume(knl, "l mod 16 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") - ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10)) + # FIXME: also test + # knl = lp.buffer_write(knl, "c", (), init_expression="0", + # store_expression="base+buffer") + knl = lp.buffer_write(knl, "c", "i_inner,j_inner", init_expression="0", + store_expression="base+buffer", within_inames="i_outer,j_outer") + + #ctx = ctx_factory() + #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10)) + knl = lp.preprocess_kernel(knl) + for k in lp.generate_loop_schedules(knl): + code, _ = lp.generate_code(k) + print(code) if __name__ == "__main__": if len(sys.argv) > 1: