From 86ae80994616355413bcf784c7460aee11260eab Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 03:16:20 -0600 Subject: [PATCH 1/7] Barrier insertion: Rework barrier insertion to keep finer track of dependencies, and also to treat loops more carefully. * Dependencies are now tracked based on a list of active potential sources, and the active set is narrowed by filtering through readers/writers * Stricter handling of dependencies within loops by not assuming that loops are nonempty * Changes the barrier insertion algorithm to act in two stages: first, barriers are recursively inserted into loops, then they are inserted at the outermost level. This is mostly meant to simplify the code. * See also #18 on gitlab --- loopy/schedule/__init__.py | 613 +++++++++++++++++++------------------ test/test_loopy.py | 40 ++- 2 files changed, 356 insertions(+), 297 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 545c53344..e7cc08a73 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1384,172 +1384,243 @@ class DependencyRecord(ImmutableRecord): var_kind=var_kind) -def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): - """If there exists a dependency between target and source and the two access - a common variable of *var_kind* in a way that requires a barrier (essentially, - at least one write), then the function will return a tuple - ``(target, source, var_name)``. Otherwise, it will return *None*. - - This function finds direct or indirect instruction dependencies, but does - not attempt to guess dependencies that exist based on common access to - variables. - - :arg reverse: a :class:`bool` indicating whether - forward or reverse dependencies are sought. (see above) - :arg var_kind: "global" or "local", the kind of variable based on which - barrier-needing dependencies should be found. - """ - - # If target or source are insn IDs, look up the actual instructions. - from loopy.kernel.data import InstructionBase - if not isinstance(source, InstructionBase): - source = kernel.id_to_insn[source] - if not isinstance(target, InstructionBase): - target = kernel.id_to_insn[target] - - if reverse: - source, target = target, source - - if source.id in kernel.get_nosync_set(target.id, var_kind): - return - - # {{{ check that a dependency exists +class DependencyTracker(object): - dep_descr = None + def __init__(self, kernel, var_kind, reverse): + self.kernel = kernel + self.reverse = reverse + self.var_kind = var_kind - target_deps = kernel.recursive_insn_dep_map()[target.id] - if source.id in target_deps: - if reverse: - dep_descr = "{src} rev-depends on {tgt}" + if var_kind == "local": + self.relevant_vars = kernel.local_var_names() + elif var_kind == "global": + self.relevant_vars = kernel.global_var_names() else: - dep_descr = "{tgt} depends on {src}" - - grps = source.groups & target.conflicts_with_groups - if grps: - dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + raise ValueError("unknown 'var_kind': %s" % var_kind) - grps = target.groups & source.conflicts_with_groups - if grps: - dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + from collections import defaultdict + self.writer_map = defaultdict(lambda: set()) + self.reader_map = defaultdict(lambda: set()) + self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map() - if not dep_descr: - return None - - # }}} - - if var_kind == "local": - relevant_vars = kernel.local_var_names() - elif var_kind == "global": - relevant_vars = kernel.global_var_names() - else: - raise ValueError("unknown 'var_kind': %s" % var_kind) - - temp_to_base_storage = kernel.get_temporary_to_base_storage_map() - - def map_to_base_storage(var_names): + def map_to_base_storage(self, var_names): result = set(var_names) for name in var_names: - bs = temp_to_base_storage.get(name) + bs = self.temp_to_base_storage.get(name) if bs is not None: result.add(bs) return result - tgt_write = map_to_base_storage( - set(target.assignee_var_names()) & relevant_vars) - tgt_read = map_to_base_storage( - target.read_dependency_names() & relevant_vars) + def discard_all_sources(self): + self.writer_map.clear() + self.reader_map.clear() - src_write = map_to_base_storage( - set(source.assignee_var_names()) & relevant_vars) - src_read = map_to_base_storage( - source.read_dependency_names() & relevant_vars) + def add_source(self, source): + """ + Specify that an instruction may be used as the source of a dependency edge. + """ + # If source is an insn ID, look up the actual instruction. + source = self.kernel.id_to_insn.get(source, source) - waw = tgt_write & src_write - raw = tgt_read & src_write - war = tgt_write & src_read + for written in self.map_to_base_storage( + set(source.assignee_var_names()) & self.relevant_vars): + self.writer_map[written].add(source.id) - for var_name in sorted(raw | war): - return DependencyRecord( - source=source, - target=target, - dep_descr=dep_descr, - variable=var_name, - var_kind=var_kind) + for read in self.map_to_base_storage( + source.read_dependency_names() & self.relevant_vars): + self.reader_map[read].add(source.id) + + def get_dependencies_with_target_at(self, target): + """ + Generate all dependencies edges whose target is the given instruction. + """ + # If target is an insn ID, look up the actual instruction. + target = self.kernel.id_to_insn.get(target, target) + + tgt_write = self.map_to_base_storage( + set(target.assignee_var_names()) & self.relevant_vars) + tgt_read = self.map_to_base_storage( + target.read_dependency_names() & self.relevant_vars) + + for (accessed_vars, accessor_map, ignore_self) in [ + (tgt_read, self.writer_map, False), + (tgt_write, self.reader_map, False), + (tgt_write, self.writer_map, True)]: + + for dep in self.get_conflicting_accesses( + accessed_vars, accessor_map, ignore_self, target.id): + yield dep + + def get_conflicting_accesses( + self, accessed_vars, var_to_accessor_map, ignore_self, target): + + def determine_conflict_nature(source, target): + if ignore_self and source == target: + return None + if (not self.reverse and source in + self.kernel.get_nosync_set(target, scope=self.var_kind)): + return None + if (self.reverse and target in + self.kernel.get_nosync_set(source, scope=self.var_kind)): + return None + return self.describe_dependency(source, target) + + for var in sorted(accessed_vars): + for source in sorted(var_to_accessor_map[var]): + dep_descr = determine_conflict_nature(source, target) + + if dep_descr is None: + continue - if source is target: - return None + yield DependencyRecord( + source=self.kernel.id_to_insn[source], + target=self.kernel.id_to_insn[target], + dep_descr=dep_descr, + variable=var, + var_kind=self.var_kind) - for var_name in sorted(waw): - return DependencyRecord( - source=source, - target=target, - dep_descr=dep_descr, - variable=var_name, - var_kind=var_kind) + def describe_dependency(self, source, target): + dep_descr = None + + source = self.kernel.id_to_insn[source] + target = self.kernel.id_to_insn[target] - return None + if self.reverse: + source, target = target, source + + target_deps = self.kernel.recursive_insn_dep_map()[target.id] + if source.id in target_deps: + if self.reverse: + dep_descr = "{tgt} rev-depends on {src}" + else: + dep_descr = "{tgt} depends on {src}" + + grps = source.groups & target.conflicts_with_groups + if not grps: + grps = target.groups & source.conflicts_with_groups + + if grps: + dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + + return dep_descr def barrier_kind_more_or_equally_global(kind1, kind2): return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") -def get_tail_starting_at_last_barrier(schedule, kind): - result = [] +def first_and_last_barrier_indices_not_within_loops(schedule, kind): + first_barrier_index = None + last_barrier_index = None - for sched_item in reversed(schedule): - if isinstance(sched_item, Barrier): - if barrier_kind_more_or_equally_global(sched_item.kind, kind): - break + loop_level = 0 - elif isinstance(sched_item, RunInstruction): - result.append(sched_item.insn_id) + for j, sub_sched_item in enumerate(schedule): + if ( + loop_level == 0 + and (isinstance(sub_sched_item, Barrier) + and barrier_kind_more_or_equally_global( + sub_sched_item.kind, kind))): + last_barrier_index = j + if first_barrier_index is None: + first_barrier_index = j + elif isinstance(sub_sched_item, EnterLoop): + loop_level += 1 + elif isinstance(sub_sched_item, LeaveLoop): + loop_level -= 1 - elif isinstance(sched_item, (EnterLoop, LeaveLoop)): - pass + return first_barrier_index, last_barrier_index - elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): - pass - else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) +def insn_ids_with_path_to_end_without_intervening_barrier(schedule, kind): + insn_ids_killable_by_level = [set()] - return reversed(result) + for sched_item in schedule: + if isinstance(sched_item, EnterLoop): + insn_ids_killable_by_level.append(set()) + elif isinstance(sched_item, LeaveLoop): + end = insn_ids_killable_by_level.pop() + # Deeper instructions can be killed by barriers at a shallower + # level, e.g.: + # + # for i + # insn0 + # end + # barrier() <= kills insn0 + # + # On the other hand, we can't assume that shallower instructions can + # be killed by deeper barriers, since loops might be empty, e.g.: + # + # insn0 <= isn't killed by barrier + # for i + # barrier() + # end + insn_ids_killable_by_level[-1].update(end) + elif isinstance(sched_item, Barrier): + if barrier_kind_more_or_equally_global(sched_item.kind, kind): + insn_ids_killable_by_level[-1].clear() + else: + insn_ids_killable_by_level[-1] |= set( + insn_id for insn_id in sched_item_to_insn_id(sched_item)) + return insn_ids_killable_by_level[0] -def insn_ids_from_schedule(schedule): - result = [] - for sched_item in reversed(schedule): - if isinstance(sched_item, RunInstruction): - result.append(sched_item.insn_id) - elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel, - ReturnFromKernel)): - pass +def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): + seen_barrier_at_level = [False] + insn_ids_alive_at_level = [set()] + result = set() + for sched_item in schedule: + if isinstance(sched_item, EnterLoop): + insn_ids_alive_at_level.append(set()) + seen_barrier_at_level.append(False) + elif isinstance(sched_item, LeaveLoop): + result |= insn_ids_alive_at_level.pop() + seen_barrier_at_level.pop() + elif isinstance(sched_item, Barrier): + if barrier_kind_more_or_equally_global(sched_item.kind, kind): + seen_barrier_at_level[-1] = True else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) + if not seen_barrier_at_level[-1]: + insn_ids_alive_at_level[-1] |= set( + insn_id for insn_id in sched_item_to_insn_id(sched_item)) + + return result | insn_ids_alive_at_level[-1] - return result +def insn_ids_from_schedule(schedule): + from itertools import chain + return chain.from_iterable( + sched_item_to_insn_id(sched_item) + for sched_item in schedule) + + +def append_barrier_or_raise_error(schedule, dep, verify_only): + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no" + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + dep.var_kind)) + else: + comment = "for %s (%s)" % ( + dep.variable, dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id)) + schedule.append(Barrier(comment=comment, kind=dep.var_kind)) -def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): + +def insert_barriers(kernel, schedule, kind, verify_only, level=0): """ - :arg reverse: a :class:`bool`. For ``level > 0``, this function should be - called twice, first with ``reverse=False`` to insert barriers for - forward dependencies, and then again with ``reverse=True`` to insert - reverse depedencies. This order is preferable because the forward pass - will limit the number of instructions that need to be considered as - depedency source candidates by already inserting some number of - barriers into *schedule*. - - Calling it with ``reverse==True and level==0` is not necessary, - since the root of the schedule is in no loop, therefore not repeated, - and therefore reverse dependencies don't need to be added. :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted. Generally, this function will be called once for each kind of barrier at the top level, where more global barriers should be inserted first. @@ -1557,184 +1628,120 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): missing. :arg level: the current level of loop nesting, 0 for outermost. """ - result = [] - # In straight-line code, we have only 'b depends on a'-type 'forward' - # dependencies. But a loop of the type - # - # for i in range(10): - # A - # B - # - # effectively glues multiple copies of 'A;B' one after the other: - # - # A - # B - # A - # B - # ... - # - # Now, if B depends on (i.e. is required to be textually before) A in a way - # requiring a barrier, then we will assume that the reverse dependency exists - # as well, i.e. a barrier between the tail end fo execution of B and the next - # beginning of A is also needed. - - if level == 0 and reverse: - # The global schedule is in no loop, therefore not repeated, and - # therefore reverse dependencies don't need to be added. - return schedule - - # a list of instruction IDs that could lead to barrier-needing dependencies. - if reverse: - candidates = set(get_tail_starting_at_last_barrier(schedule, kind)) - else: - candidates = set() + # {{{ insert barriers at outermost scheduling level - past_first_barrier = [False] + def insert_barriers_at_outer_level(schedule, reverse=False): + dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse) - def seen_barrier(): - past_first_barrier[0] = True + if reverse: + # Populate the dependency tracker with sources from the tail end of + # the schedule block. + for insn_id in ( + insn_ids_with_path_to_end_without_intervening_barrier( + schedule, kind)): + dep_tracker.add_source(insn_id) - # We've just gone across a barrier, so anything that needed - # one from above just got one. + result = [] - candidates.clear() + i = 0 + while i < len(schedule): + sched_item = schedule[i] - def issue_barrier(dep): - seen_barrier() + if isinstance(sched_item, EnterLoop): + subloop, new_i = gather_schedule_block(schedule, i) - comment = None - if dep is not None: - comment = "for %s (%s)" % ( - dep.variable, dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id)) + loop_head = ( + insn_ids_reachable_from_start_without_intervening_barrier( + subloop, kind)) - result.append(Barrier(comment=comment, kind=dep.var_kind)) + loop_tail = ( + insn_ids_with_path_to_end_without_intervening_barrier( + subloop, kind)) - i = 0 - while i < len(schedule): - sched_item = schedule[i] - - if isinstance(sched_item, EnterLoop): - # {{{ recurse for nested loop - - subloop, new_i = gather_schedule_block(schedule, i) - i = new_i + # Checks if a barrier is needed before the loop. This handles + # dependencies with targets that can be reached without an + # intervening barrier from the start of the loop: + # + # a[x] = ... <= source + # for i + # ... = a[y] <= target + # barrier() + # ... + from itertools import chain + for dep in chain.from_iterable( + dep_tracker.get_dependencies_with_target_at(insn) + for insn in loop_head): + append_barrier_or_raise_error(result, dep, verify_only) + # This barrier gets inserted outside the loop, hence it is + # executed unconditionally and so kills all sources before + # the loop. + dep_tracker.discard_all_sources() + break - # Run barrier insertion for inner loop - subresult = subloop[1:-1] - for sub_reverse in [False, True]: - subresult = insert_barriers( - kernel, subresult, - reverse=sub_reverse, kind=kind, - verify_only=verify_only, - level=level+1) + result.extend(subloop) - # {{{ find barriers in loop body + # Handle dependencies with sources not killed inside the loop: + # + # for i + # ... + # barrier() + # b[i] = ... <= source + # end for + # ... = f(b) <= target + for item in loop_tail: + dep_tracker.add_source(item) + + i = new_i + + elif isinstance(sched_item, Barrier): + result.append(sched_item) + if barrier_kind_more_or_equally_global(sched_item.kind, kind): + dep_tracker.discard_all_sources() + i += 1 + + elif isinstance(sched_item, RunInstruction): + for dep in dep_tracker.get_dependencies_with_target_at( + sched_item.insn_id): + append_barrier_or_raise_error(result, dep, verify_only) + dep_tracker.discard_all_sources() + break + result.append(sched_item) + dep_tracker.add_source(sched_item.insn_id) + i += 1 - first_barrier_index = None - last_barrier_index = None + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + result.append(sched_item) + i += 1 - for j, sub_sched_item in enumerate(subresult): - if (isinstance(sub_sched_item, Barrier) and - barrier_kind_more_or_equally_global( - sub_sched_item.kind, kind)): + else: + raise ValueError("unexpected schedule item type '%s'" + % type(sched_item).__name__) - last_barrier_index = j - if first_barrier_index is None: - first_barrier_index = j + return result - # }}} + # }}} - # {{{ check if a barrier is needed before the loop - - # (for leading (before-first-barrier) bit of loop body) - for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]): - search_set = sorted(candidates) - - for dep_src_insn_id in search_set: - dep = get_barrier_needing_dependency( - kernel, - target=insn_id, - source=dep_src_insn_id, - reverse=reverse, var_kind=kind) - if dep: - if verify_only: - from loopy.diagnostic import MissingBarrierError - raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " - "requires synchronization " - "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no" - "synchronization is needed)" - % ( - dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id), - dep.variable, - kind)) - else: - issue_barrier(dep=dep) - break + ################################################# + # Part 1: Recursively insert barriers in loops. # + ################################################# - # }}} - - # add trailing end (past-last-barrier) of loop body to candidates - if last_barrier_index is None: - candidates.update(insn_ids_from_schedule(subresult)) - else: - seen_barrier() - candidates.update( - insn_ids_from_schedule( - subresult[last_barrier_index+1:])) + result = [] + i = 0 + while i < len(schedule): + sched_item = schedule[i] + if isinstance(sched_item, EnterLoop): + subloop, new_i = gather_schedule_block(schedule, i) + new_subloop = insert_barriers( + kernel, subloop[1:-1], kind, verify_only, level + 1) result.append(subloop[0]) - result.extend(subresult) + result.extend(new_subloop) result.append(subloop[-1]) + i = new_i - # }}} - - elif isinstance(sched_item, Barrier): - i += 1 - - if barrier_kind_more_or_equally_global(sched_item.kind, kind): - seen_barrier() - - result.append(sched_item) - - elif isinstance(sched_item, RunInstruction): - i += 1 - - search_set = sorted(candidates) - - for dep_src_insn_id in search_set: - dep = get_barrier_needing_dependency( - kernel, - target=sched_item.insn_id, - source=dep_src_insn_id, - reverse=reverse, var_kind=kind) - if dep: - if verify_only: - from loopy.diagnostic import MissingBarrierError - raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " - "requires synchronization " - "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no " - "synchronization is needed)" - % ( - dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id), - dep.variable, - kind)) - - else: - issue_barrier(dep=dep) - break - - result.append(sched_item) - candidates.add(sched_item.insn_id) - - elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + elif isinstance(sched_item, + (Barrier, RunInstruction, CallKernel, ReturnFromKernel)): result.append(sched_item) i += 1 @@ -1742,13 +1749,33 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) - if past_first_barrier[0] and reverse: - # We can quit here, because we're only trying add - # reverse-dep barriers to the beginning of the loop, up to - # the first barrier. - - result.extend(schedule[i:]) - break + ################################################### + # Part 2: Insert barriers at the outermost level. # + ################################################### + + result = insert_barriers_at_outer_level(result) + # When level = 0 there is no loop. + if level != 0: + # In straight-line code, we have only 'b depends on a'-type 'forward' + # dependencies. But a loop of the type + # + # for i in range(10): + # A + # B + # + # effectively glues multiple copies of 'A;B' one after the other: + # + # A + # B + # A + # B + # ... + # + # Now, if B depends on (i.e. is required to be textually before) A in a + # way requiring a barrier, then we will assume that the reverse + # dependency exists as well, i.e. a barrier between the tail end of + # execution of B and the next beginning of A is also needed. + result = insert_barriers_at_outer_level(result, reverse=True) return result @@ -1897,11 +1924,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if not kernel.options.disable_global_barriers: logger.info("%s: barrier insertion: global" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, - reverse=False, kind="global", verify_only=True) + kind="global", verify_only=True) logger.info("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, - reverse=False, kind="local", verify_only=False) + gen_sched = insert_barriers(kernel, gen_sched, kind="local", + verify_only=False) logger.info("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( diff --git a/test/test_loopy.py b/test/test_loopy.py index 48ccd8ee0..db4a38204 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1995,10 +1995,11 @@ def test_integer_reduction(ctx_factory): assert function(out) -def assert_barrier_between(knl, id1, id2): - from loopy.schedule import RunInstruction, Barrier +def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): + from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop) watch_for_barrier = False seen_barrier = False + loop_level = 0 for sched_item in knl.schedule: if isinstance(sched_item, RunInstruction): @@ -2008,9 +2009,13 @@ def assert_barrier_between(knl, id1, id2): assert watch_for_barrier assert seen_barrier return - if isinstance(sched_item, Barrier): - if watch_for_barrier: + elif isinstance(sched_item, Barrier): + if watch_for_barrier and loop_level not in ignore_barriers_in_levels: seen_barrier = True + elif isinstance(sched_item, EnterLoop): + loop_level += 1 + elif isinstance(sched_item, LeaveLoop): + loop_level -= 1 raise RuntimeError("id2 was not seen") @@ -2029,6 +2034,7 @@ def test_barrier_insertion_near_top_of_loop(): end """, seq_dependencies=True) + knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.set_temporary_scope(knl, "a", "local") knl = lp.set_temporary_scope(knl, "b", "local") @@ -2041,6 +2047,32 @@ def test_barrier_insertion_near_top_of_loop(): assert_barrier_between(knl, "bcomp1", "bcomp2") +def test_barrier_insertion_near_bottom_of_loop(): + knl = lp.make_kernel( + ["{[i]: 0 <= i < 10 }", + "[jmax] -> {[j]: 0 <= j < jmax}"], + """ + for i + <>a[i] = i {id=ainit} + for j + <>b[i,j] = a[i] + t {id=bcomp1} + b[i,j] = b[i,j] + 1 {id=bcomp2} + end + a[i] = i + 1 {id=aupdate} + end + """, + seq_dependencies=True) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.set_temporary_scope(knl, "a", "local") + knl = lp.set_temporary_scope(knl, "b", "local") + knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + + print(knl) + + assert_barrier_between(knl, "bcomp1", "bcomp2") + assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 2b2accd3e1dd8a1fcafcb9fd3e80a4b9ac53073f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 03:31:24 -0600 Subject: [PATCH 2/7] Remove unused function. --- loopy/schedule/__init__.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index e7cc08a73..8b17e3c83 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1511,29 +1511,6 @@ def barrier_kind_more_or_equally_global(kind1, kind2): return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") -def first_and_last_barrier_indices_not_within_loops(schedule, kind): - first_barrier_index = None - last_barrier_index = None - - loop_level = 0 - - for j, sub_sched_item in enumerate(schedule): - if ( - loop_level == 0 - and (isinstance(sub_sched_item, Barrier) - and barrier_kind_more_or_equally_global( - sub_sched_item.kind, kind))): - last_barrier_index = j - if first_barrier_index is None: - first_barrier_index = j - elif isinstance(sub_sched_item, EnterLoop): - loop_level += 1 - elif isinstance(sub_sched_item, LeaveLoop): - loop_level -= 1 - - return first_barrier_index, last_barrier_index - - def insn_ids_with_path_to_end_without_intervening_barrier(schedule, kind): insn_ids_killable_by_level = [set()] -- GitLab From 3b704079033e39415ae7f3d036259d7b1072a7b2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 03:37:52 -0600 Subject: [PATCH 3/7] Fix error message. --- loopy/schedule/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 8b17e3c83..887cd3d22 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1582,7 +1582,7 @@ def append_barrier_or_raise_error(schedule, dep, verify_only): "Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no" + "instruction option to state that no " "synchronization is needed)" % ( dep.dep_descr.format( -- GitLab From c6f42c21bca2c73f3828df63a3bd73ab7634343c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 04:11:08 -0600 Subject: [PATCH 4/7] insn_ids_reachable_from_start_without_intervening_barrier(): Can further narrow set of instructions by considering barriers at the shallower level. --- loopy/schedule/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 887cd3d22..fe6269d86 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1553,7 +1553,9 @@ def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): for sched_item in schedule: if isinstance(sched_item, EnterLoop): insn_ids_alive_at_level.append(set()) - seen_barrier_at_level.append(False) + # Barriers seen at the shallower level will also prevent + # instructions from being reached at the deeper level. + seen_barrier_at_level.append(seen_barrier_at_level[-1]) elif isinstance(sched_item, LeaveLoop): result |= insn_ids_alive_at_level.pop() seen_barrier_at_level.pop() -- GitLab From 2340148b2658a188d4c72f7cc6443819c2c548f7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 22:01:24 -0600 Subject: [PATCH 5/7] Use a common implementation for insn_ids_reaching_*_without_interleaving_barrier(). --- loopy/schedule/__init__.py | 92 +++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index fe6269d86..8d9a5bb95 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1511,70 +1511,62 @@ def barrier_kind_more_or_equally_global(kind1, kind2): return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") -def insn_ids_with_path_to_end_without_intervening_barrier(schedule, kind): - insn_ids_killable_by_level = [set()] +def insn_ids_reaching_end_without_intervening_barrier(schedule, kind): + return _insn_ids_reaching_end(schedule, kind, reverse=False) + + +def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): + return _insn_ids_reaching_end(schedule, kind, reverse=True) + + +def _insn_ids_reaching_end(schedule, kind, reverse): + if reverse: + schedule = reversed(schedule) + enter_scope_item_kind = LeaveLoop + leave_scope_item_kind = EnterLoop + else: + enter_scope_item_kind = EnterLoop + leave_scope_item_kind = LeaveLoop + + insn_ids_alive_at_scope = [set()] for sched_item in schedule: - if isinstance(sched_item, EnterLoop): - insn_ids_killable_by_level.append(set()) - elif isinstance(sched_item, LeaveLoop): - end = insn_ids_killable_by_level.pop() - # Deeper instructions can be killed by barriers at a shallower - # level, e.g.: + if isinstance(sched_item, enter_scope_item_kind): + insn_ids_alive_at_scope.append(set()) + elif isinstance(sched_item, leave_scope_item_kind): + innermost_scope = insn_ids_alive_at_scope.pop() + # Instructions in deeper scopes are alive but could be killed by + # barriers at a shallower level, e.g.: # # for i # insn0 # end # barrier() <= kills insn0 # - # On the other hand, we can't assume that shallower instructions can - # be killed by deeper barriers, since loops might be empty, e.g.: + # Hence we merge this scope into the parent scope. + insn_ids_alive_at_scope[-1].update(innermost_scope) + elif isinstance(sched_item, Barrier): + # This barrier kills only the instruction ids that are alive at + # the current scope (or deeper). Without further analysis, we + # can't assume that instructions at shallower scope can be + # killed by deeper barriers, since loops might be empty, e.g.: # - # insn0 <= isn't killed by barrier + # insn0 <= isn't killed by barrier (i loop could be empty) # for i + # insn1 <= is killed by barrier + # for j + # insn2 <= is killed by barrier + # end # barrier() # end - insn_ids_killable_by_level[-1].update(end) - elif isinstance(sched_item, Barrier): - if barrier_kind_more_or_equally_global(sched_item.kind, kind): - insn_ids_killable_by_level[-1].clear() - else: - insn_ids_killable_by_level[-1] |= set( - insn_id for insn_id in sched_item_to_insn_id(sched_item)) - - return insn_ids_killable_by_level[0] - - -def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): - seen_barrier_at_level = [False] - insn_ids_alive_at_level = [set()] - result = set() - - for sched_item in schedule: - if isinstance(sched_item, EnterLoop): - insn_ids_alive_at_level.append(set()) - # Barriers seen at the shallower level will also prevent - # instructions from being reached at the deeper level. - seen_barrier_at_level.append(seen_barrier_at_level[-1]) - elif isinstance(sched_item, LeaveLoop): - result |= insn_ids_alive_at_level.pop() - seen_barrier_at_level.pop() - elif isinstance(sched_item, Barrier): if barrier_kind_more_or_equally_global(sched_item.kind, kind): - seen_barrier_at_level[-1] = True + insn_ids_alive_at_scope[-1].clear() else: - if not seen_barrier_at_level[-1]: - insn_ids_alive_at_level[-1] |= set( + insn_ids_alive_at_scope[-1] |= set( insn_id for insn_id in sched_item_to_insn_id(sched_item)) - return result | insn_ids_alive_at_level[-1] - - -def insn_ids_from_schedule(schedule): - from itertools import chain - return chain.from_iterable( - sched_item_to_insn_id(sched_item) - for sched_item in schedule) + assert len(insn_ids_alive_at_scope) == 1 + return insn_ids_alive_at_scope[-1] def append_barrier_or_raise_error(schedule, dep, verify_only): @@ -1617,7 +1609,7 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # Populate the dependency tracker with sources from the tail end of # the schedule block. for insn_id in ( - insn_ids_with_path_to_end_without_intervening_barrier( + insn_ids_reaching_end_without_intervening_barrier( schedule, kind)): dep_tracker.add_source(insn_id) @@ -1635,7 +1627,7 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): subloop, kind)) loop_tail = ( - insn_ids_with_path_to_end_without_intervening_barrier( + insn_ids_reaching_end_without_intervening_barrier( subloop, kind)) # Checks if a barrier is needed before the loop. This handles -- GitLab From 9e877119f83600409a529cbbe8c4df1052f7fc4b Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 22 Dec 2016 23:32:17 -0600 Subject: [PATCH 6/7] Bump version for new barrier insertion algorithm. --- loopy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/version.py b/loopy/version.py index 503b6c54e..6a02f4d99 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v52-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v53-islpy%s" % _islpy_version -- GitLab From 8fb957c602f96da7ebbace961cf701f2fafccbf2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 25 Dec 2016 18:35:07 +0100 Subject: [PATCH 7/7] Improve documentation for barrier insertion rework --- loopy/schedule/__init__.py | 75 +++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c5e2f6c88..4148d7d75 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1385,8 +1385,43 @@ class DependencyRecord(ImmutableRecord): class DependencyTracker(object): + """ + A utility to help track dependencies between originating from a set + of sources (as defined by :meth:`add_source`. For each target, + dependencies can then be obtained using :meth:`gen_dependencies_with_target_at`. + + .. automethod:: add_source + .. automethod:: gen_dependencies_with_target_at + """ def __init__(self, kernel, var_kind, reverse): + """ + :arg var_kind: "global" or "local", the kind of variable based on which + barrier-needing dependencies should be found. + :arg reverse: + In straight-line code, this only tracks 'b depends on + a'-type 'forward' dependencies. But a loop of the type:: + + for i in range(10): + A + B + + effectively glues multiple copies of 'A;B' one after the other:: + + A + B + A + B + ... + + Now, if B depends on (i.e. is required to be textually before) A in a + way requiring a barrier, then we will assume that the reverse + dependency exists as well, i.e. a barrier between the tail end of + execution of B and the next beginning of A is also needed. + + Setting *reverse* to *True* tracks these reverse (instead of forward) + dependencies. + """ self.kernel = kernel self.reverse = reverse self.var_kind = var_kind @@ -1432,9 +1467,13 @@ class DependencyTracker(object): source.read_dependency_names() & self.relevant_vars): self.reader_map[read].add(source.id) - def get_dependencies_with_target_at(self, target): + def gen_dependencies_with_target_at(self, target): """ - Generate all dependencies edges whose target is the given instruction. + Generate :class:`DependencyRecord` instances for dependencies edges + whose target is the given instruction. + + :arg target: The ID of the instruction for which dependencies + with conflicting var access should be found. """ # If target is an insn ID, look up the actual instruction. target = self.kernel.id_to_insn.get(target, target) @@ -1641,7 +1680,7 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # ... from itertools import chain for dep in chain.from_iterable( - dep_tracker.get_dependencies_with_target_at(insn) + dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): append_barrier_or_raise_error(result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is @@ -1672,7 +1711,7 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): i += 1 elif isinstance(sched_item, RunInstruction): - for dep in dep_tracker.get_dependencies_with_target_at( + for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): append_barrier_or_raise_error(result, dep, verify_only) dep_tracker.discard_all_sources() @@ -1693,9 +1732,7 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # }}} - ################################################# - # Part 1: Recursively insert barriers in loops. # - ################################################# + # {{{ recursively insert barriers in loops result = [] i = 0 @@ -1720,32 +1757,12 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) - ################################################### - # Part 2: Insert barriers at the outermost level. # - ################################################### + # }}} result = insert_barriers_at_outer_level(result) + # When level = 0 there is no loop. if level != 0: - # In straight-line code, we have only 'b depends on a'-type 'forward' - # dependencies. But a loop of the type - # - # for i in range(10): - # A - # B - # - # effectively glues multiple copies of 'A;B' one after the other: - # - # A - # B - # A - # B - # ... - # - # Now, if B depends on (i.e. is required to be textually before) A in a - # way requiring a barrier, then we will assume that the reverse - # dependency exists as well, i.e. a barrier between the tail end of - # execution of B and the next beginning of A is also needed. result = insert_barriers_at_outer_level(result, reverse=True) return result -- GitLab