diff --git a/loopy/__init__.py b/loopy/__init__.py
index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..c74c56768d8c803d9c6da750905ce3f9a5f81488 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -86,7 +86,8 @@ from loopy.transform.instruction import (
         remove_instructions,
         replace_instruction_ids,
         tag_instructions,
-        add_nosync)
+        add_nosync,
+        impose_only_read_after_write_deps)
 
 from loopy.transform.data import (
         add_prefetch, change_arg_to_image,
@@ -95,7 +96,8 @@ from loopy.transform.data import (
         remove_unused_arguments,
         alias_temporaries, set_argument_order,
         rename_argument,
-        set_temporary_scope)
+        set_temporary_scope, squeeze_axes_in_temporaries,
+        remove_axis)
 
 from loopy.transform.subst import (extract_subst,
         assignment_to_subst, expand_subst, find_rules_matching,
@@ -203,6 +205,8 @@ __all__ = [
         "remove_unused_arguments",
         "alias_temporaries", "set_argument_order",
         "rename_argument", "set_temporary_scope",
+        "squeeze_axes_in_temporaries",
+        "remove_axis",
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
@@ -210,6 +214,7 @@ __all__ = [
         "replace_instruction_ids",
         "tag_instructions",
         "add_nosync",
+        "impose_only_read_after_write_deps",
 
         "extract_subst", "expand_subst", "assignment_to_subst",
         "find_rules_matching", "find_one_rule_matching",
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..02456d64d18a12af7c2200c3d711734d79c681ae 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -30,6 +30,8 @@ from islpy import dim_type
 from loopy.kernel.data import ImageArg
 
 from pytools import MovedFunctionDeprecationWrapper
+from loopy.symbolic import (IdentityMapper, RuleAwareIdentityMapper,
+        SubstitutionRuleMappingContext)
 
 
 # {{{ convenience: add_prefetch
@@ -767,4 +769,135 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No
 # }}}
 
 
+# {{{ remove unused axes in temporaries
+
+class AxesSqueezer(IdentityMapper):
+    """
+    Removes the indices of the removable axes from every subscript of the
+    temporaries named in *tv_to_removable_axes*, a mapping from temporary
+    variable names to tuples of removable axis numbers.
+    """
+    def __init__(self, tv_to_removable_axes):
+        self.tv_to_removable_axes = tv_to_removable_axes
+
+    def map_subscript(self, expr):
+        removable_indices = self.tv_to_removable_axes.get(expr.aggregate.name,
+                None)
+
+        if removable_indices:
+            assert all(expr.index_tuple[idx] == 0 for idx in removable_indices)
+            new_indices = tuple(self.rec(idx) for i, idx
+                in enumerate(expr.index_tuple) if i not in
+                removable_indices)
+
+            if not new_indices:
+                # every axis was removed: refer to the variable without indices
+                return expr.aggregate
+
+            return type(expr)(expr.aggregate, new_indices)
+
+        return super(AxesSqueezer, self).map_subscript(expr)
+
+
+def squeeze_axes_in_temporaries(kernel):
+    """
+    Returns a kernel with all 1-length axes in a temporary variable removed. This
+    is helpful if some temporaries are intended to be run through
+    :func:`loopy.assignment_to_subst`, but all references to the variable are
+    of the form ``var_name[0, i, j]``.
+
+    .. note::
+
+        If the shape of ``A`` is ``(1, 1, 3, 6)`` and all references to ``A``
+        are of the form ``A[0, 0, i0, i1]`` then axes 0 and 1 are unused axes
+        and the references to ``A`` will be updated to ``A[i0, i1]``.
+    """
+    new_temps = {}
+    tv_x_removable_axes = {}
+    for tv in kernel.temporary_variables.values():
+        removable_axes = tuple(i for i, axis_len in enumerate(tv.shape) if
+                axis_len == 1)
+        if removable_axes:
+            tv_x_removable_axes[tv.name] = removable_axes
+            new_temps[tv.name] = tv.copy(shape=tuple(axis_len for axis_len in
+                tv.shape if axis_len != 1),
+                dim_tags=None)
+        else:
+            new_temps[tv.name] = tv
+
+    new_insns = []
+    axes_squeezer = AxesSqueezer(tv_x_removable_axes)
+
+    for insn in kernel.instructions:
+        new_insns.append(insn.with_transformed_expressions(axes_squeezer))
+
+    return kernel.copy(instructions=new_insns, temporary_variables=new_temps)
+
+# }}}
+
+
+# {{{ remove axis
+
+class AxisRemover(RuleAwareIdentityMapper):
+    """
+    Removes the index of axis *axis_num* from every subscript of the
+    variable *var_name*.
+    """
+    def __init__(self, rule_mapping_context, var_name, axis_num):
+        self.var_name = var_name
+        self.axis_num = axis_num
+        super(AxisRemover, self).__init__(rule_mapping_context)
+
+    def map_subscript(self, expr, expn_state):
+        from pymbolic.primitives import Variable, Subscript
+        if expr.aggregate.name == self.var_name:
+            # recurse into the surviving indices so that references nested
+            # inside them are rewritten as well
+            new_indices = tuple(self.rec(idx, expn_state)
+                    for i, idx in enumerate(expr.index_tuple)
+                    if i != self.axis_num)
+
+            if not new_indices:
+                return Variable(self.var_name)
+
+            return Subscript(expr.aggregate, new_indices)
+
+        return super(AxisRemover, self).map_subscript(expr, expn_state)
+
+
+def remove_axis(kernel, var_name, axis_num):
+    """
+    Returns a kernel after removing *axis_num* axis of the temporary variable
+    *var_name*.
+
+    One might interpret this operation as the inverse of privatization.
+
+    *axis_num* must be a non-negative axis number of *var_name*.
+    """
+
+    assert var_name in kernel.temporary_variables
+    assert 0 <= axis_num < len(kernel.temporary_variables[var_name].shape)
+
+    rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions,
+            kernel.get_var_name_generator())
+
+    kernel = AxisRemover(rule_mapping_context, var_name, axis_num).map_kernel(kernel)
+
+    if len(kernel.temporary_variables[var_name].shape) == 1:
+        new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None))
+                if tv.name == var_name else (tv.name, tv) for tv in
+                kernel.temporary_variables.values())
+    else:
+        from loopy import auto
+        new_temps = dict((tv.name,
+            tv.copy(shape=tv.shape[:axis_num]+tv.shape[axis_num+1:],
+                strides=auto, dim_tags=None))
+            if tv.name == var_name else (tv.name, tv) for tv in
+            kernel.temporary_variables.values())
+
+    return kernel.copy(temporary_variables=new_temps)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..cb5c903a611d7a0a2701a7323d1336dee57ea605 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -73,28 +73,23 @@ def set_instruction_priority(kernel, insn_match, priority):
 # }}}
 
 
-# {{{ add_dependency
-
-def add_dependency(kernel, insn_match, depends_on):
-    """Add the instruction dependency *dependency* to the instructions matched
-    by *insn_match*.
-
-    *insn_match* and *depends_on* may be any instruction id match understood by
-    :func:`loopy.match.parse_match`.
-
-    .. versionchanged:: 2016.3
-
-        Third argument renamed to *depends_on* for clarity, allowed to
-        be not just ID but also match expression.
-    """
+# {{{ add/remove_dependency
+
+def _add_or_remove_dependency(kernel, insn_match, depends_on, adds):
+    """Add (if *adds* is true) or remove (otherwise) the dependencies matched
+    by *depends_on* on the instructions matched by *insn_match*.
+
+    :raises LoopyError: if *depends_on* or *insn_match* matches no
+        instruction.
+    """
 
     if isinstance(depends_on, str) and depends_on in kernel.id_to_insn:
-        added_deps = frozenset([depends_on])
+        dep_ids = frozenset([depends_on])
     else:
-        added_deps = frozenset(
+        dep_ids = frozenset(
                 dep.id for dep in find_instructions(kernel, depends_on))
 
-    if not added_deps:
+    if not dep_ids:
         raise LoopyError("no instructions found matching '%s' "
                 "(to add as dependencies)" % depends_on)
 
@@ -104,13 +99,24 @@ def add_dependency(kernel, insn_match, depends_on):
         new_deps = insn.depends_on
         matched[0] = True
         if new_deps is None:
-            new_deps = added_deps
+            new_deps = dep_ids
         else:
-            new_deps = new_deps | added_deps
+            new_deps = new_deps | dep_ids
 
         return insn.copy(depends_on=new_deps)
 
-    result = map_instructions(kernel, insn_match, add_dep)
+    def remove_dep(insn):
+        matched[0] = True
+        new_deps = insn.depends_on
+        if new_deps is not None:
+            new_deps = new_deps - dep_ids
+
+        return insn.copy(depends_on=new_deps)
+
+    if adds:
+        result = map_instructions(kernel, insn_match, add_dep)
+    else:
+        result = map_instructions(kernel, insn_match, remove_dep)
 
     if not matched[0]:
         raise LoopyError("no instructions found matching '%s' "
@@ -118,6 +124,31 @@ def add_dependency(kernel, insn_match, depends_on):
 
     return result
 
+
+def add_dependency(kernel, insn_match, depends_on):
+    """Add the instruction dependency *depends_on* to the instructions matched
+    by *insn_match*.
+
+    *insn_match* and *depends_on* may be any instruction id match understood by
+    :func:`loopy.match.parse_match`.
+
+    .. versionchanged:: 2016.3
+
+        Third argument renamed to *depends_on* for clarity, allowed to
+        be not just ID but also match expression.
+    """
+    return _add_or_remove_dependency(kernel, insn_match, depends_on, adds=True)
+
+
+def remove_dependency(kernel, insn_match, depends_on):
+    """Remove the instruction dependency *depends_on* from the instructions
+    matched by *insn_match*.
+
+    *insn_match* and *depends_on* may be any instruction id match understood by
+    :func:`loopy.match.parse_match`.
+    """
+    return _add_or_remove_dependency(kernel, insn_match, depends_on, adds=False)
+
 # }}}
 
 
@@ -357,4 +388,33 @@ def uniquify_instruction_ids(kernel):
 # }}}
 
 
+# {{{ impose_only_read_after_write_deps
+
+def impose_only_read_after_write_deps(kernel):
+    """
+    Returns a kernel with every instruction depending only on instructions
+    which write to the variables that it reads.
+    """
+    from loopy.kernel.tools import find_recursive_dependencies
+
+    # insn_to_all_deps: stores all direct or indirect dependencies of an insn
+    insn_to_all_deps = dict((insn.id, set()) for insn in kernel.instructions)
+
+    for insn in kernel.instructions:
+        insn_to_all_deps[insn.id] = find_recursive_dependencies(kernel,
+                [insn.id]) - set([insn.id])
+
+    new_insns = []
+
+    for insn in kernel.instructions:
+        depends_on = frozenset([dep_id for dep_id in insn_to_all_deps[insn.id] if
+            insn.read_dependency_names() & (
+                kernel.id_to_insn[dep_id].write_dependency_names()
+                - kernel.all_inames())])
+        new_insns.append(insn.copy(depends_on=depends_on))
+
+    return kernel.copy(instructions=new_insns)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/test/test_transform.py b/test/test_transform.py
index cdc0c14b8bacc4fe5279d000461c0ea2244af021..da0630ca6a3ea03f69d7d49cbaa98976d9ad12fb 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -570,6 +570,72 @@ def test_nested_substs_in_insns(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
+def test_impose_only_raw_deps():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<10}",
+            """
+            a[i] = 2*b[i] {id=insn_0}
+            c[i] = 2*d[i] {id=insn_1}
+            e[i] = 2*a[i] {id=insn_2}
+            """, seq_dependencies=True)
+    knl = lp.impose_only_read_after_write_deps(knl)
+    assert knl.id_to_insn['insn_2'].depends_on == frozenset(['insn_0'])
+    assert knl.id_to_insn['insn_1'].depends_on == frozenset()
+
+
+def test_squeeze_axes_in_temps(ctx_factory):
+    knl = lp.make_kernel(
+            "{[n, i, j]: 0<=i, j<32 and 0<=n<100}",
+            """
+            # unnecessary temps which might exacerbate register pressure
+            <> temp_1[0, i] = 2*x[n, i]**2
+            <> temp_2[0, i] = x[n, i]**2
+            <> temp_3[0, i] = 12*x[n, i]**2
+            <> temp_4[0, i] = 0.2*x[n, i]**2
+            y[n, j] = temp_1[0, j]+2*temp_2[0, j]+11*temp_3[0, j]+2*temp_4[0, j]
+            """, [lp.GlobalArg('x, y', shape=(100, 32), dtype=float), '...'],
+            seq_dependencies=True)
+
+    ref_knl = knl.copy()
+    knl = lp.squeeze_axes_in_temporaries(knl)
+    knl = lp.assignment_to_subst(knl, 'temp_1')
+    knl = lp.assignment_to_subst(knl, 'temp_2')
+    knl = lp.assignment_to_subst(knl, 'temp_3')
+    knl = lp.assignment_to_subst(knl, 'temp_4')
+    lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl)
+
+
+def test_remove_axis(ctx_factory):
+    knl = lp.make_kernel(
+            "{[n, i1, i2, j, k]: 0<=n<100 and 0<=i1, i2<10 and 0<=j, k<6}",
+            """
+            # gather
+            tmp[i1, j] = x[n, i1, j]
+            # scatter
+            y[n, i2, k] = tmp[i2, k]
+            """,
+            [
+                lp.GlobalArg('x, y', shape=(100, 10, 6), dtype=float),
+                lp.TemporaryVariable('tmp', shape=(10, 6,), dtype=float,
+                    address_space=lp.AddressSpace.PRIVATE)
+            ],
+            seq_dependencies=True
+            )
+
+    knl = lp.tag_inames(knl, "n:g.0, j:l.0, k:l.0")
+
+    ref_knl = knl.copy()
+    ref_knl = lp.set_options(ref_knl, 'write_cl')
+
+    # get rid of unnecessary usage of private memory
+    knl = lp.remove_axis(knl, 'tmp', 1)
+
+    assert ref_knl.temporary_variables['tmp'].shape == (10, 6)
+    assert knl.temporary_variables['tmp'].shape == (10,)
+
+    lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])