diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 4a07b63330747aa69d7ed498e004d60b7c312a7b..d293e3ebe998a632bd547f94a67e675ff0592bfb 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -72,6 +72,8 @@ Manipulating Instructions .. autofunction:: tag_instructions +.. autofunction:: add_nosync + Registering Library Routines ---------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 942c7d56e01f9d037b0e2b601f88bc8b96dda151..5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1479,6 +1479,8 @@ Barriers :mod:`loopy` may infer the need for a barrier when it is not necessary. The ``no_sync_with`` instruction attribute can be used to resolve this. +See also :func:`loopy.add_nosync`. + TODO .. }}} diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a..aa1d43172a4bd6472f5974c292c4256946fcf542 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -54,7 +54,11 @@ from loopy.kernel.tools import ( get_dot_dependency_graph, show_dependency_graph, add_dtypes, - add_and_infer_dtypes) + add_and_infer_dtypes, + get_global_barrier_order, + find_most_recent_global_barrier, + get_subkernels, + get_subkernel_to_insn_id_map) from loopy.kernel.creation import make_kernel, UniqueName from loopy.library.reduction import register_reduction_parser @@ -75,7 +79,8 @@ from loopy.transform.instruction import ( set_instruction_priority, add_dependency, remove_instructions, replace_instruction_ids, - tag_instructions) + tag_instructions, + add_nosync) from loopy.transform.data import ( add_prefetch, change_arg_to_image, @@ -189,6 +194,7 @@ __all__ = [ "remove_instructions", "replace_instruction_ids", "tag_instructions", + "add_nosync", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", "find_one_rule_matching", @@ -215,6 +221,10 @@ __all__ = [ "show_dependency_graph", "add_dtypes", "add_and_infer_dtypes", + 
"get_global_barrier_order", + "find_most_recent_global_barrier", + "get_subkernels", + "get_subkernel_to_insn_id_map", "infer_unknown_types", diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 793d31791a3295ef1d7c03132f43489ab828f089..324f7da1a21de0115ea060ff7ef55e52ab0913d4 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -786,6 +786,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): for var_name in insn.read_dependency_names() & admissible_vars: result.setdefault(var_name, set()).add(insn.id) + return result + @memoize_method def writer_map(self): """ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 2033425236836ecf000d6c341c46dcb8b087a29a..8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -34,6 +34,8 @@ import numpy as np import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel +from pytools import memoize_on_first_arg + import logging logger = logging.getLogger(__name__) @@ -316,10 +318,16 @@ class SetOperationCacheManager: return result def dim_min(self, set, *args): + if set.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set) + from loopy.isl_helpers import dim_min_with_elimination return self.op(set, "dim_min", dim_min_with_elimination, args) def dim_max(self, set, *args): + if set.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set) + from loopy.isl_helpers import dim_max_with_elimination return self.op(set, "dim_max", dim_max_with_elimination, args) @@ -1367,4 +1375,195 @@ def draw_dependencies_as_unicode_arrows( # }}} + +# {{{ global barrier order finding + +@memoize_on_first_arg +def get_global_barrier_order(kernel): + """Return a :class:`tuple` of the listing the ids of global barrier instructions + as they appear in order in the kernel. + + See also :class:`loopy.instruction.BarrierInstruction`. 
+ """ + barriers = [] + visiting = set() + visited = set() + + unvisited = set(insn.id for insn in kernel.instructions) + + def is_barrier(my_insn_id): + insn = kernel.id_to_insn[my_insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and insn.kind == "global" + + while unvisited: + stack = [unvisited.pop()] + + while stack: + top = stack[-1] + + if top in visiting: + visiting.remove(top) + if is_barrier(top): + barriers.append(top) + + if top in visited: + stack.pop() + continue + + visited.add(top) + visiting.add(top) + + for child in kernel.id_to_insn[top].depends_on: + # Check for no cycles. + assert child not in visiting + stack.append(child) + + # Ensure this is the only possible order. + # + # We do this by looking at the barriers in order. + # We check for each adjacent pair (a,b) in the order if a < b, + # i.e. if a is reachable by a chain of dependencies from b. + + visiting.clear() + visited.clear() + + for prev_barrier, barrier in zip(barriers, barriers[1:]): + # Check if prev_barrier is reachable from barrier. + stack = [barrier] + visited.discard(prev_barrier) + + while stack: + top = stack[-1] + + if top in visiting: + visiting.remove(top) + + if top in visited: + stack.pop() + continue + + visited.add(top) + visiting.add(top) + + if top == prev_barrier: + visiting.clear() + break + + for child in kernel.id_to_insn[top].depends_on: + stack.append(child) + else: + # Search exhausted and we did not find prev_barrier. + raise LoopyError("barriers '%s' and '%s' are not ordered" + % (prev_barrier, barrier)) + + return tuple(barriers) + +# }}} + + +# {{{ find most recent global barrier + +@memoize_on_first_arg +def find_most_recent_global_barrier(kernel, insn_id): + """Return the id of the latest-occurring global barrier which the + given instruction (indirectly or directly) depends on, or *None* if this + instruction does not depend on a global barrier. 
+ + The return value is guaranteed to be unique because global barriers are + totally ordered within the kernel. + """ + + global_barrier_order = get_global_barrier_order(kernel) + + if len(global_barrier_order) == 0: + return None + + insn = kernel.id_to_insn[insn_id] + + if len(insn.depends_on) == 0: + return None + + def is_barrier(my_insn_id): + insn = kernel.id_to_insn[my_insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and insn.kind == "global" + + global_barrier_to_ordinal = dict( + (b, i) for i, b in enumerate(global_barrier_order)) + + def get_barrier_ordinal(barrier_id): + return (global_barrier_to_ordinal[barrier_id] + if barrier_id is not None + else -1) + + direct_barrier_dependencies = set( + dep for dep in insn.depends_on if is_barrier(dep)) + + if len(direct_barrier_dependencies) > 0: + return max(direct_barrier_dependencies, key=get_barrier_ordinal) + else: + return max((find_most_recent_global_barrier(kernel, dep) + for dep in insn.depends_on), + key=get_barrier_ordinal) + +# }}} + + +# {{{ subkernel tools + +@memoize_on_first_arg +def get_subkernels(kernel): + """Return a :class:`tuple` of the names of the subkernels in the kernel. The + kernel must be scheduled. + + See also :class:`loopy.schedule.CallKernel`. + """ + from loopy.kernel import kernel_state + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import CallKernel + + return tuple(sched_item.kernel_name + for sched_item in kernel.schedule + if isinstance(sched_item, CallKernel)) + + +@memoize_on_first_arg +def get_subkernel_to_insn_id_map(kernel): + """Return a :class:`dict` mapping subkernel names to a :class:`frozenset` + consisting of the instruction ids scheduled within the subkernel. The + kernel must be scheduled. 
+ """ + from loopy.kernel import kernel_state + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import ( + sched_item_to_insn_id, CallKernel, ReturnFromKernel) + + subkernel = None + result = {} + + for sched_item in kernel.schedule: + if isinstance(sched_item, CallKernel): + subkernel = sched_item.kernel_name + result[subkernel] = set() + + if isinstance(sched_item, ReturnFromKernel): + subkernel = None + + if subkernel is not None: + for insn_id in sched_item_to_insn_id(sched_item): + result[subkernel].add(insn_id) + + for subkernel in result: + result[subkernel] = frozenset(result[subkernel]) + + return result + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 7c9c9688604179dce2aa7dcd6954d76a0df32cc7..2be78f8e5c25a3b48c195f52715f9d6453100e3b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match): match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] - # }}} @@ -130,6 +129,8 @@ def remove_instructions(kernel, insn_ids): Dependencies across (one, for now) deleted isntructions are propagated. Behavior is undefined for now for chains of dependencies within the set of deleted instructions. + + This also updates *no_sync_with* for all instructions. 
""" if not insn_ids: @@ -155,7 +156,14 @@ def remove_instructions(kernel, insn_ids): for dep_id in depends_on & insn_ids: new_deps = new_deps | id_to_insn[dep_id].depends_on - new_insns.append(insn.copy(depends_on=frozenset(new_deps))) + # update no_sync_with + + new_no_sync_with = frozenset((insn_id, scope) + for insn_id, scope in insn.no_sync_with + if insn_id not in insn_ids) + + new_insns.append( + insn.copy(depends_on=new_deps, no_sync_with=new_no_sync_with)) return kernel.copy( instructions=new_insns) @@ -171,6 +179,7 @@ def replace_instruction_ids(kernel, replacements): for insn in kernel.instructions: changed = False new_depends_on = [] + new_no_sync_with = [] for dep in insn.depends_on: if dep in replacements: @@ -179,8 +188,18 @@ def replace_instruction_ids(kernel, replacements): else: new_depends_on.append(dep) + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) + new_insns.append( - insn.copy(depends_on=frozenset(new_depends_on)) + insn.copy( + depends_on=frozenset(new_depends_on), + no_sync_with=frozenset(new_no_sync_with)) if changed else insn) return kernel.copy(instructions=new_insns) @@ -207,4 +226,79 @@ def tag_instructions(kernel, new_tag, within=None): # }}} +# {{{ add nosync + +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): + """Add a *no_sync_with* directive between *source* and *sink*. + *no_sync_with* is only added if *sink* depends on *source* or + if the instruction pair is in a conflicting group. + + This function does not check for the presence of a memory dependency. + + :arg kernel: The kernel + :arg source: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg sink: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. 
+ :arg scope: A valid *no_sync_with* scope. See + :attr:`loopy.InstructionBase.no_sync_with` for allowable scopes. + :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* + to both the source and sink instructions, otherwise the directive + is only added to the sink instructions. + :arg force: A :class:`bool`. If *True*, add a *no_sync_with* directive + even without the presence of a dependency edge or conflicting + instruction group. + + :return: The updated kernel + """ + + if isinstance(source, str) and source in kernel.id_to_insn: + sources = frozenset([source]) + else: + sources = frozenset( + source.id for source in find_instructions(kernel, source)) + + if isinstance(sink, str) and sink in kernel.id_to_insn: + sinks = frozenset([sink]) + else: + sinks = frozenset( + sink.id for sink in find_instructions(kernel, sink)) + + def insns_in_conflicting_groups(insn1_id, insn2_id): + insn1 = kernel.id_to_insn[insn1_id] + insn2 = kernel.id_to_insn[insn2_id] + return ( + bool(insn1.groups & insn2.conflicts_with_groups) + or + bool(insn2.groups & insn1.conflicts_with_groups)) + + from collections import defaultdict + nosync_to_add = defaultdict(set) + + for sink in sinks: + for source in sources: + + needs_nosync = force or ( + source in kernel.recursive_insn_dep_map()[sink] + or insns_in_conflicting_groups(source, sink)) + + if not needs_nosync: + continue + + nosync_to_add[sink].add((source, scope)) + if bidirectional: + nosync_to_add[source].add((sink, scope)) + + new_instructions = list(kernel.instructions) + + for i, insn in enumerate(new_instructions): + if insn.id in nosync_to_add: + new_instructions[i] = insn.copy(no_sync_with=insn.no_sync_with + | frozenset(nosync_to_add[insn.id])) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index 94cdb499c096337db0657267d5afd472eef80a9c..1218847a7c42bd420a993d86a7534f066c2ab20e 100644 --- a/test/test_loopy.py 
+++ b/test/test_loopy.py @@ -2108,6 +2108,50 @@ def test_barrier_insertion_near_bottom_of_loop(): assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) +def test_global_barrier_order_finding(): + knl = lp.make_kernel( + "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}", + """ + for i + for itrip + ... gbarrier {id=top} + <> z[i] = z[i+1] + z[i] {id=wr_z,dep=top} + <> v[i] = 11 {id=wr_v,dep=top} + ... gbarrier {dep=wr_z:wr_v,id=yoink} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} + end + ... nop {id=nop} + ... gbarrier {dep=iupd,id=postloop} + z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop} + end + """) + + assert lp.get_global_barrier_order(knl) == ("top", "yoink", "postloop") + + for insn, barrier in ( + ("nop", None), + ("top", None), + ("wr_z", "top"), + ("wr_v", "top"), + ("yoink", "top"), + ("postloop", "yoink"), + ("zzzv", "postloop")): + assert lp.find_most_recent_global_barrier(knl, insn) == barrier + + +def test_global_barrier_error_if_unordered(): + # FIXME: Should be illegal to declare this + knl = lp.make_kernel("{[i]: 0 <= i < 10}", + """ + ... gbarrier + ... 
gbarrier + """) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.get_global_barrier_order(knl) + + def test_struct_assignment(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/test_transform.py b/test/test_transform.py index ac5a26f6a5683bf9e86055a2729ea5ee995dee1e..b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,6 +402,42 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_add_nosync(): + orig_knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>tmp[i] = 10 {id=insn1} + <>tmp2[i] = 10 {id=insn2} + + <>tmp3[2*i] = 0 {id=insn3} + <>tmp4 = 1 + tmp3[2*i] {id=insn4} + + <>tmp5[i] = 0 {id=insn5,groups=g1} + tmp5[i] = 1 {id=insn6,conflicts=g1} + """) + + orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") + orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + + # No dependency present - don't add nosync + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + + # Dependency present + knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Bidirectional + knl = lp.add_nosync( + orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Groups + knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])