diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 4a07b63330747aa69d7ed498e004d60b7c312a7b..d293e3ebe998a632bd547f94a67e675ff0592bfb 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -72,6 +72,8 @@ Manipulating Instructions .. autofunction:: tag_instructions +.. autofunction:: add_nosync + Registering Library Routines ---------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 942c7d56e01f9d037b0e2b601f88bc8b96dda151..5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1479,6 +1479,8 @@ Barriers :mod:`loopy` may infer the need for a barrier when it is not necessary. The ``no_sync_with`` instruction attribute can be used to resolve this. +See also :func:`loopy.add_nosync`. + TODO .. }}} diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a..53dd9c8eed796f3f191c21589f1095d468bac772 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -75,7 +75,8 @@ from loopy.transform.instruction import ( set_instruction_priority, add_dependency, remove_instructions, replace_instruction_ids, - tag_instructions) + tag_instructions, + add_nosync) from loopy.transform.data import ( add_prefetch, change_arg_to_image, @@ -189,6 +190,7 @@ __all__ = [ "remove_instructions", "replace_instruction_ids", "tag_instructions", + "add_nosync", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", "find_one_rule_matching", diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 7c9c9688604179dce2aa7dcd6954d76a0df32cc7..2be78f8e5c25a3b48c195f52715f9d6453100e3b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match): match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] - # }}} @@ -130,6 +129,8 @@ def remove_instructions(kernel, insn_ids): Dependencies across (one, for now) deleted isntructions are propagated. Behavior is undefined for now for chains of dependencies within the set of deleted instructions. + + This also updates *no_sync_with* for all instructions. """ if not insn_ids: @@ -155,7 +156,14 @@ def remove_instructions(kernel, insn_ids): for dep_id in depends_on & insn_ids: new_deps = new_deps | id_to_insn[dep_id].depends_on - new_insns.append(insn.copy(depends_on=frozenset(new_deps))) + # update no_sync_with + + new_no_sync_with = frozenset((insn_id, scope) + for insn_id, scope in insn.no_sync_with + if insn_id not in insn_ids) + + new_insns.append( + insn.copy(depends_on=new_deps, no_sync_with=new_no_sync_with)) return kernel.copy( instructions=new_insns) @@ -171,6 +179,7 @@ def replace_instruction_ids(kernel, replacements): for insn in kernel.instructions: changed = False new_depends_on = [] + new_no_sync_with = [] for dep in insn.depends_on: if dep in replacements: @@ -179,8 +188,18 @@ def replace_instruction_ids(kernel, replacements): else: new_depends_on.append(dep) + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) + new_insns.append( - insn.copy(depends_on=frozenset(new_depends_on)) + insn.copy( + depends_on=frozenset(new_depends_on), + no_sync_with=frozenset(new_no_sync_with)) if changed else insn) return kernel.copy(instructions=new_insns) @@ -207,4 +226,79 @@ def tag_instructions(kernel, new_tag, within=None): # }}} +# {{{ add nosync + +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): + """Add a *no_sync_with* directive between *source* and *sink*. + *no_sync_with* is only added if *sink* depends on *source* or + if the instruction pair is in a conflicting group. + + This function does not check for the presence of a memory dependency. + + :arg kernel: The kernel + :arg source: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg sink: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg scope: A valid *no_sync_with* scope. See + :attr:`loopy.InstructionBase.no_sync_with` for allowable scopes. + :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* + to both the source and sink instructions, otherwise the directive + is only added to the sink instructions. + :arg force: A :class:`bool`. If *True*, add a *no_sync_with* directive + even without the presence of a dependency edge or conflicting + instruction group. + + :return: The updated kernel + """ + + if isinstance(source, str) and source in kernel.id_to_insn: + sources = frozenset([source]) + else: + sources = frozenset( + source.id for source in find_instructions(kernel, source)) + + if isinstance(sink, str) and sink in kernel.id_to_insn: + sinks = frozenset([sink]) + else: + sinks = frozenset( + sink.id for sink in find_instructions(kernel, sink)) + + def insns_in_conflicting_groups(insn1_id, insn2_id): + insn1 = kernel.id_to_insn[insn1_id] + insn2 = kernel.id_to_insn[insn2_id] + return ( + bool(insn1.groups & insn2.conflicts_with_groups) + or + bool(insn2.groups & insn1.conflicts_with_groups)) + + from collections import defaultdict + nosync_to_add = defaultdict(set) + + for sink in sinks: + for source in sources: + + needs_nosync = force or ( + source in kernel.recursive_insn_dep_map()[sink] + or insns_in_conflicting_groups(source, sink)) + + if not needs_nosync: + continue + + nosync_to_add[sink].add((source, scope)) + if bidirectional: + nosync_to_add[source].add((sink, scope)) + + new_instructions = list(kernel.instructions) + + for i, insn in enumerate(new_instructions): + if insn.id in nosync_to_add: + new_instructions[i] = insn.copy(no_sync_with=insn.no_sync_with + | frozenset(nosync_to_add[insn.id])) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py index ac5a26f6a5683bf9e86055a2729ea5ee995dee1e..b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,6 +402,42 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_add_nosync(): + orig_knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>tmp[i] = 10 {id=insn1} + <>tmp2[i] = 10 {id=insn2} + + <>tmp3[2*i] = 0 {id=insn3} + <>tmp4 = 1 + tmp3[2*i] {id=insn4} + + <>tmp5[i] = 0 {id=insn5,groups=g1} + tmp5[i] = 1 {id=insn6,conflicts=g1} + """) + + orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") + orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + + # No dependency present - don't add nosync + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + + # Dependency present + knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Bidirectional + knl = lp.add_nosync( + orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Groups + knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])