diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 9f426f76bc6902fd09bd7685c73f187df935be1e..57955acd31c7e91c510e2d3208ccf247230377c2 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -28,9 +28,10 @@ from six.moves import range, zip import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, - SubstitutionRuleMappingContext) + SubstitutionRuleMappingContext, CoefficientCollector) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func +from pytools import memoize_method import numpy as np from pymbolic import var @@ -62,7 +63,10 @@ def storage_axis_exprs(storage_axis_sources, args): # {{{ gather rule invocations class RuleInvocationGatherer(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within): + def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, + within, sweep_inames): + assert isinstance(sweep_inames, frozenset) + super(RuleInvocationGatherer, self).__init__(rule_mapping_context) from loopy.symbolic import SubstitutionRuleExpander @@ -73,9 +77,24 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): self.subst_name = subst_name self.subst_tag = subst_tag self.within = within + self.sweep_inames = sweep_inames self.access_descriptors = [] + @memoize_method + def vars_constant_during_sweeping(self): + """ + Returns a :class:`frozenset` of variables which are invariant wrt + the sweep. + """ + from loopy.kernel.instruction import MultiAssignmentBase + return self.kernel.all_variable_names() - frozenset().union( + *( + frozenset(insn.assignees) + for insn in self.kernel.instructions + if isinstance(insn, MultiAssignmentBase) + and insn.within_inames & self.sweep_inames)) + def map_substitution(self, name, tag, arguments, expn_state): process_me = name == self.subst_name @@ -100,10 +119,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): arg_deps = (arg_deps | get_dependencies(self.subst_expander(arg_val))) - # FIXME: This is too strict--and the footprint machinery - # needs to be taught how to deal with locally constant - # variables. - if not arg_deps <= self.kernel.all_inames(): + if not arg_deps <= self.vars_constant_during_sweeping(): from warnings import warn warn("Precompute arguments in '%s(%s)' do not consist exclusively " "of inames and constants--specifically, these are " @@ -137,7 +153,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): storage_axis_names, storage_axis_sources, non1_storage_axis_names, temporary_name, compute_insn_id, compute_dep_id, - compute_read_variables): + compute_read_variables, offsets): super(RuleInvocationReplacer, self).__init__(rule_mapping_context) self.subst_name = subst_name @@ -157,6 +173,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.compute_read_variables = compute_read_variables self.compute_insn_depends_on = set() + self.offsets = offsets + self.substituted_atleast_one = False def map_substitution(self, name, tag, arguments, expn_state): if not ( @@ -169,6 +187,10 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): return super(RuleInvocationReplacer, self).map_substitution( name, tag, arguments, expn_state) + from pymbolic.mapper.distributor import distribute + arguments = [distribute(arg - off) for arg, off in zip(arguments, + self.offsets)] + # {{{ check if in footprint rule = self.rule_mapping_context.old_subst_rules[name] @@ -180,12 +202,29 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): storage_axis_exprs=storage_axis_exprs( self.storage_axis_sources, args)) - if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): + from loopy.diagnostic import ExpressionToAffineConversionError + + try: + process_me = self.array_base_map.is_access_descriptor_in_footprint( + accdesc) + except ExpressionToAffineConversionError: + # accdesc has non-affine storage axes exprs=>cannot be processed + process_me = False + + if not process_me: + # subst will remain as a "call" node => re-add the offsets + from pymbolic.mapper.distributor import distribute + arguments = [distribute(arg + off) for arg, off in zip(arguments, + self.offsets)] return super(RuleInvocationReplacer, self).map_substitution( name, tag, arguments, expn_state) # }}} + # at this point an invocation will surely be processed => set + # corresponding flag for later assertion + self.substituted_atleast_one = True + assert len(arguments) == len(rule.arguments) abm = self.array_base_map @@ -254,6 +293,18 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # }}} +class SweepInamesCoefficientCollector(CoefficientCollector): + """ + Coefficient Collector which casts non-linear terms as constants. + """ + def map_constant(self, expr): + return super(SweepInamesCoefficientCollector, self).map_constant(expr) + + map_power = map_constant + map_subscript = map_constant + map_call = map_constant + + class _not_provided(object): # noqa: N801 pass @@ -516,7 +567,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) invg = RuleInvocationGatherer( - rule_mapping_context, kernel, subst_name, subst_tag, within) + rule_mapping_context, kernel, subst_name, subst_tag, within, + sweep_inames_set) del rule_mapping_context import loopy as lp @@ -652,6 +704,93 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} + # {{{ realize non-sweep offset terms + + # 'sweep_dependency[i]' is True for a storage axes #i if axes #i is being swept. + sweep_dependency = [False for _ in access_descriptors[0].storage_axis_exprs] + sweep_inames_as_vars = [Variable(s) for s in sweep_inames_set] + for accdesc in access_descriptors: + for i, sax in enumerate(accdesc.storage_axis_exprs): + if get_dependencies(sax) & sweep_inames_set: + sweep_dependency[i] = True + + offsets = [None for _ in access_descriptors[0].storage_axis_exprs] + sweep_inames_as_vars = [Variable(s) for s in sweep_inames_set] + from numbers import Integral + from pymbolic.primitives import Sum + from pymbolic.mapper.distributor import distribute + + for accdesc in access_descriptors: + for i, sax in enumerate(accdesc.storage_axis_exprs): + coeffs = SweepInamesCoefficientCollector(sweep_inames_set)(sax) + offst = coeffs.get(1, 0) + + # {{{ ISL works fine with integral constants => do not include them + # in offsets + + if isinstance(offst, Sum): + try: + cnst, = [child for child in offst.children if + isinstance(child, Integral)] + except ValueError: + cnst = 0 + elif isinstance(offst, Integral): + cnst = offst + else: + cnst = 0 + + offst = distribute(offst-cnst) + + # }}} + + if (sweep_dependency[i] and (sweep_inames_as_vars & + six.viewkeys(coeffs))) or not sweep_dependency[i]: + if offsets[i] is None: + offsets[i] = offst + else: + if distribute(offsets[i] - offst) != 0: + raise LoopyError("Offsets in invocations of '%s' do not" + " match." % subst_name) + + offsets = [offst if offst is not None else 0 for offst in offsets] + + def subtract_offsets(accdesc): + saxes = [distribute(saxis-offst) for saxis, offst in + zip(accdesc.storage_axis_exprs, offsets)] + args = [distribute(arg-offst) for arg, offst in + zip(accdesc.args, offsets)] + return accdesc.copy(args=args, storage_axis_exprs=saxes) + + access_descriptors = list(map(subtract_offsets, access_descriptors)) + + # }}} + + # {{{ after getting the offsets get rid of accesss descriptors which aren't + # affine + + from loopy.symbolic import guarded_aff_from_expr + from loopy.diagnostic import ExpressionToAffineConversionError + aff_access_descs = [] + for accdesc in access_descriptors: + try: + for saxis in accdesc.storage_axis_exprs: + guarded_aff_from_expr(isl.Space.create_from_names( + kernel.isl_context, set=sweep_inames_set), saxis) + # all storage axes exprs are affine terms of sweep inames + aff_access_descs.append(accdesc) + except ExpressionToAffineConversionError: + pass + + if aff_access_descs == []: + raise ExpressionToAffineConversionError("None of the invocations of" + " '{}' are affine wrt the sweep inames".format(subst_name)) + + access_descriptors = aff_access_descs[:] + + del aff_access_descs + + # }}} + expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps) assert expanding_inames <= kernel.all_inames() @@ -816,14 +955,15 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axis_subst_dict = {} - for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices): + for arg_name, bi, off in zip(storage_axis_names, abm.storage_base_indices, + offsets): if arg_name in non1_storage_axis_names: arg = var(arg_name) else: arg = 0 storage_axis_subst_dict[ - prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi + prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi+off rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) @@ -877,13 +1017,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axis_names, storage_axis_sources, non1_storage_axis_names, temporary_name, compute_insn_id, compute_dep_id, - compute_read_variables=get_dependencies(expander(compute_expression))) + compute_read_variables=get_dependencies(expander(compute_expression)), + offsets=offsets) kernel = invr.map_kernel(kernel) kernel = kernel.copy( instructions=added_compute_insns + kernel.instructions) kernel = rule_mapping_context.finish_kernel(kernel) + if not invr.substituted_atleast_one: + raise ExpressionToAffineConversionError("All invocations of '{}'" + " contained non-affine arguments.".format(subst_name)) + # }}} # {{{ add dependencies to compute insn diff --git a/test/test_apps.py b/test/test_apps.py index e07262dbdda8ad3c24522f7d0eb4dba8422bf0ce..0ce953e93c6c3324aadebf03dd32d24861c042a0 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -43,8 +43,6 @@ else: from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests -from loopy.diagnostic import LoopyError - __all__ = [ "pytest_generate_tests", "cl" # 'cl.create_some_context' @@ -449,23 +447,23 @@ def test_lbm(ctx_factory): # Example by Loic Gouarin knl = lp.make_kernel( "{[ii,jj]:0<=ii m[0] = + f[i-1, j, 0] + f[i, j-1, 1] + f[i+1, j, 2] + f[i, j+1, 3] - m[1] = + 4.*f[i-1, j, 0] - 4.*f[i+1, j, 2] - m[2] = + 4.*f[i, j-1, 1] - 4.*f[i, j+1, 3] - m[3] = + f[i-1, j, 0] - f[i, j-1, 1] + f[i+1, j, 2] - f[i, j+1, 3] - m[4] = + f[i-1, j, 4] + f[i, j-1, 5] + f[i+1, j, 6] + f[i, j+1, 7] - m[5] = + 4.*f[i-1, j, 4] - 4.*f[i+1, j, 6] - m[6] = + 4.*f[i, j-1, 5] - 4.*f[i, j+1, 7] - m[7] = + f[i-1, j, 4] - f[i, j-1, 5] + f[i+1, j, 6] - f[i, j+1, 7] - m[8] = + f[i-1, j, 8] + f[i, j-1, 9] + f[i+1, j, 10] + f[i, j+1, 11] - m[9] = + 4.*f[i-1, j, 8] - 4.*f[i+1, j, 10] - m[10] = + 4.*f[i, j-1, 9] - 4.*f[i, j+1, 11] - m[11] = + f[i-1, j, 8] - f[i, j-1, 9] + f[i+1, j, 10] - f[i, j+1, 11] + m[1] = + 4.*f[i-1, j, 0] - 4.*f[i+1, j, 2] + m[2] = + 4.*f[i, j-1, 1] - 4.*f[i, j+1, 3] + m[3] = + f[i-1, j, 0] - f[i, j-1, 1] + f[i+1, j, 2] - f[i, j+1, 3] + m[4] = + f[i-1, j, 4] + f[i, j-1, 5] + f[i+1, j, 6] + f[i, j+1, 7] + m[5] = + 4.*f[i-1, j, 4] - 4.*f[i+1, j, 6] + m[6] = + 4.*f[i, j-1, 5] - 4.*f[i, j+1, 7] + m[7] = + f[i-1, j, 4] - f[i, j-1, 5] + f[i+1, j, 6] - f[i, j+1, 7] + m[8] = + f[i-1, j, 8] + f[i, j-1, 9] + f[i+1, j, 10] + f[i, j+1, 11] + m[9] = + 4.*f[i-1, j, 8] - 4.*f[i+1, j, 10] + m[10] = + 4.*f[i, j-1, 9] - 4.*f[i, j+1, 11] + m[11] = + f[i-1, j, 8] - f[i, j-1, 9] + f[i+1, j, 10] - f[i, j+1, 11] end with {id_prefix=update_m,dep=init_m*} @@ -674,26 +672,6 @@ def test_domain_tree_nesting(): assert depth(i) < 2 -def test_prefetch_through_indirect_access(): - knl = lp.make_kernel("{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}", - """ - for i, j, k - a[map1[indirect[i], j], k] = 2 - end - """, - [ - lp.GlobalArg("a", strides=(2, 1), dtype=int), - lp.GlobalArg("map1", shape=(10, 10), dtype=int), - "..." - ], - target=lp.CTarget()) - - knl = lp.prioritize_loops(knl, "i,j,k") - - with pytest.raises(LoopyError): - knl = lp.add_prefetch(knl, "map1[:, j]") - - if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index 6eb6697b5c192911864000781381244dfcbef631..f60f6cbe443621b2dbc04f659b21e26f59d6936d 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -570,14 +570,33 @@ def test_nested_substs_in_insns(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl) +def test_prefetch_through_indirect_access(): + knl = lp.make_kernel("{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}", + """ + for i, j, k + a[map1[indirect[i], j], k] = 2 + end + """, + [ + lp.GlobalArg("a", strides=(2, 1), dtype=int), + lp.GlobalArg("map1", shape=(10, 10), dtype=int), + "..." + ], + target=lp.CTarget()) + + knl = lp.prioritize_loops(knl, "i,j,k") + + with pytest.raises(lp.LoopyError): + knl = lp.add_prefetch(knl, "map1[:, j]", default_tag='l.auto') + + def test_extract_subst_with_iname_deps_in_templ(ctx_factory): knl = lp.make_kernel( "{[i, j, k]: 0<=i<100 and 0<=j,k<5}", """ y[i, j, k] = x[i, j, k] """, - [lp.GlobalArg('x,y', shape=lp.auto, dtype=float)], - lang_version=(2018, 2)) + [lp.GlobalArg('x,y', shape=lp.auto, dtype=float)]) knl = lp.extract_subst(knl, 'rule1', 'x[i, arg1, arg2]', parameters=('arg1', 'arg2')) @@ -585,6 +604,90 @@ def test_extract_subst_with_iname_deps_in_templ(ctx_factory): lp.auto_test_vs_ref(knl, ctx_factory(), knl) +@pytest.mark.parametrize("expr", [ + "map[i]", + "map[i]*map[i]" +]) +def test_precompute_with_constant_wrt_sweep_terms(ctx_factory, expr): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i,j,k]: 0<=i<100 and 0<=j<32 and 0<=k<8}", + """ + for i + for j + x[j] = 3.14 * (i + j) + y[j] = 0 + for k + y[j] = y[j] + L[{expr}, k, j] *x[j] + end + out[i, j] = y[j] + end + end + """.format(expr=expr), + [ + lp.GlobalArg('L', dtype=np.float64, shape=(1, 8, 32)), + lp.GlobalArg('map', dtype=np.int32, shape=lp.auto), + lp.TemporaryVariable('x', dtype=np.float64, shape=(32,)), + lp.TemporaryVariable('y', dtype=np.float64, shape=(32,)), + '...'], + seq_dependencies=True, + ) + + ref_knl = knl + + knl = lp.add_prefetch(knl, "L", sweep_inames=["j", "k"], + temporary_scope=lp.AddressSpace.LOCAL, fetch_outer_inames="i", + default_tag=None, dim_arg_names=["iprftch0", "iprftch1", "iprftch2"]) + + knl = lp.tag_inames(knl, "i:g.0, j:l.0, iprftch2:l.0") + + lp.auto_test_vs_ref(ref_knl, ctx, ref_knl, parameters={"map": + np.zeros(100, dtype=np.int32)}) + + +def test_precompute_with_same_offsets(ctx_factory): + knl = lp.make_kernel( + "{[i, j, n]:0<=i<10 and 0<=j<9 and 0<=n<100}", + """ + subst(x) := x + y[n] = sum((i, j), subst(map[n]+i)*subst(map[n]+1+j)) + """, [lp.GlobalArg('map', dtype=np.int32, shape=(100,)), '...']) + + knl = lp.precompute(knl, 'subst', sweep_inames='i', default_tag=None) + assert 'subst' not in knl.substitutions + + print(knl) + + +def test_precompute_with_variable_offsets(ctx_factory): + # subst(map1[n]+1+j) does not lie in the sweep footprint + # and hence should not be substituted. + knl = lp.make_kernel( + "{[i, j, n]:0<=i<10 and 0<=j<9 and 0<=n<100}", + """ + subst(x) := x + y[n] = sum((i, j), subst(map1[n]+i)*subst(map2[n]+1+j)) + """, [lp.GlobalArg('map1, map2', dtype=np.int32, shape=(100,)), '...']) + + knl = lp.precompute(knl, 'subst', sweep_inames='i', default_tag=None) + assert 'subst' in knl.substitutions + + print(knl) + + +def test_precompute_with_variable_offsets_included_in_sweep(ctx_factory): + knl = lp.make_kernel( + "{[i, j, n]:0<=i<10 and 0<=j<9 and 0<=n<100}", + """ + subst(x) := x + y[n] = sum((i, j), subst(n**2+i)*subst(n**3+1+j)) + """, [lp.GlobalArg('map1, map2', dtype=np.int32, shape=(100,)), '...']) + + with pytest.raises(lp.LoopyError): + knl = lp.precompute(knl, 'subst', sweep_inames='i,j', + default_tag=None) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])