From 93e0a9db830817c05146ff4b45ca3924ed69fc81 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 May 2020 18:59:20 -0500 Subject: [PATCH 01/45] cache the iname tag querying mechanism at kernel-level --- loopy/check.py | 2 +- loopy/kernel/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index da49c1d11..7c63bc44a 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -628,7 +628,7 @@ def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a (at least indirect) depdendency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.Instruction.no_sync_with`) """ diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 2d926aad4..ddd245261 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -744,9 +744,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ iname wrangling + @memoize_method def iname_tags(self, iname): return self.iname_to_tags.get(iname, frozenset()) + @memoize_method def iname_tags_of_type(self, iname, tag_type_or_types, max_num=None, min_num=None): """Return a subset of *tags* that matches type *tag_type*. Raises exception -- GitLab From 785b8167b83b3520cff8ad586ea43ffe79edf4f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 May 2020 19:00:29 -0500 Subject: [PATCH 02/45] [scheduler]: schedule available straight line instructions eagerly without recursing --- loopy/schedule/__init__.py | 91 ++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 9 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 032cdc276..798a34e78 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools import MinRecursionLimit, ProcessLogger +from pytools import natsorted, MinRecursionLimit, ProcessLogger from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder @@ -571,6 +571,7 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_priority + #FIXME: incorrect docs. See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -586,6 +587,10 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + #FIXME: docs? + .. attribute:: active_inames A tuple of active inames. @@ -651,6 +656,64 @@ class SchedulerState(ImmutableRecord): return None +def schedule_as_many_insns_as_possible(sched_state): + """ + Returns an instance of :class:`loopy.schedule.SchedulerState`, by appending + all available instructions in the current loop nesting to the schedule. 
+ """ + + # {{{ topological sort + + visited_insn_ids = set() + toposorted_insns = [] + + def insert_insn_into_order(insn): + if insn.id in visited_insn_ids: + return + visited_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on & sched_state.unscheduled_insn_ids): + dep_insn = sched_state.kernel.id_to_insn[dep_id] + if frozenset(sched_state.active_inames) <= dep_insn.within_inames: + insert_insn_into_order(dep_insn) + + toposorted_insns.append(insn) + + for unscheduled_insn_id in sched_state.unscheduled_insn_ids: + unscheduled_insn = sched_state.kernel.id_to_insn[unscheduled_insn_id] + if frozenset(sched_state.active_inames) <= unscheduled_insn.within_inames: + insert_insn_into_order(unscheduled_insn) + + # }}} + + # select the top instructions in toposorted_insns only which have active + # inames corresponding to those of sched_state + from loopy.kernel.instruction import MultiAssignmentBase + + updated_sched_state = sched_state.copy() + + for insn in toposorted_insns: + if isinstance(insn, MultiAssignmentBase): + if insn.within_inames == frozenset(sched_state.active_inames): + #FIXME: should we do any changes to preschedule? + sched_item = RunInstruction(insn_id=insn.id) + updated_schedule = updated_sched_state.schedule + (sched_item, ) + updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids + | frozenset([insn.id])) + updated_unscheduled_insn_ids = ( + updated_sched_state.unscheduled_insn_ids + - frozenset([insn.id])) + updated_sched_state = updated_sched_state.copy( + insn_ids_to_try=None, + schedule=updated_schedule, + scheduled_insn_ids=updated_scheduled_insn_ids, + unscheduled_insn_ids=updated_unscheduled_insn_ids) + continue + break + + return updated_sched_state + + def generate_loop_schedules_internal( sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop @@ -1041,9 +1104,7 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: - - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), @@ -1052,8 +1113,14 @@ def generate_loop_schedules_internal( sched_state.preschedule if last_entered_loop not in sched_state.prescheduled_inames - else sched_state.preschedule[1:]), - ), + else sched_state.preschedule[1:])) + + if not rec_allow_boost: + new_sched_state = ( + schedule_as_many_insns_as_possible(new_sched_state)) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -1254,8 +1321,7 @@ def generate_loop_schedules_internal( iname), reverse=True): - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (EnterLoop(iname=iname),)), @@ -1268,7 +1334,14 @@ def generate_loop_schedules_internal( sched_state.preschedule if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), - ), + ) + + if not rec_allow_boost: + new_sched_state = ( + schedule_as_many_insns_as_possible(new_sched_state)) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True -- GitLab From fd3316851e87fd5d4c1e46b2dd6086f685d68a9d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 May 2020 22:45:55 -0500 Subject: [PATCH 03/45] [scheduler]: bail conglomerating insns for kernels with barriers --- 
loopy/schedule/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 798a34e78..7e933efc8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -656,11 +656,15 @@ class SchedulerState(ImmutableRecord): return None -def schedule_as_many_insns_as_possible(sched_state): +def schedule_as_many_run_insns_as_possible(sched_state): """ Returns an instance of :class:`loopy.schedule.SchedulerState`, by appending all available instructions in the current loop nesting to the schedule. """ + if sched_state.preschedule: + #FIXME: unsure how to perform this optimization for a scheduler with a + # preschedule, bail for now. + return sched_state # {{{ topological sort @@ -695,7 +699,6 @@ def schedule_as_many_insns_as_possible(sched_state): for insn in toposorted_insns: if isinstance(insn, MultiAssignmentBase): if insn.within_inames == frozenset(sched_state.active_inames): - #FIXME: should we do any changes to preschedule? sched_item = RunInstruction(insn_id=insn.id) updated_schedule = updated_sched_state.schedule + (sched_item, ) updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids @@ -1117,7 +1120,7 @@ def generate_loop_schedules_internal( if not rec_allow_boost: new_sched_state = ( - schedule_as_many_insns_as_possible(new_sched_state)) + schedule_as_many_run_insns_as_possible(new_sched_state)) for sub_sched in generate_loop_schedules_internal( new_sched_state, @@ -1338,7 +1341,8 @@ def generate_loop_schedules_internal( if not rec_allow_boost: new_sched_state = ( - schedule_as_many_insns_as_possible(new_sched_state)) + schedule_as_many_run_insns_as_possible( + new_sched_state)) for sub_sched in generate_loop_schedules_internal( new_sched_state, -- GitLab From 1b678302c55f087be969ec5fbe7bd12e4677fc5c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 18 May 2020 00:56:28 -0500 Subject: [PATCH 04/45] graph traversal without memoization; saves memory but increases time complexity by a lot --- loopy/check.py | 283 +++++++++++++++++++++++++++---------------------- 1 file changed, 159 insertions(+), 124 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 7c63bc44a..f9fc1024a 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -449,45 +449,9 @@ def check_has_schedulable_iname_nesting(kernel): # {{{ check_variable_access_ordered -class IndirectDependencyEdgeFinder(object): - def __init__(self, kernel): - self.kernel = kernel - self.dep_edge_cache = {} - - def __call__(self, depender_id, dependee_id): - cache_key = (depender_id, dependee_id) - - try: - result = self.dep_edge_cache[cache_key] - except KeyError: - pass - else: - if result is None: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("when " - "checking for dependency edge between " - "depender '%s' and dependee '%s'" - % (depender_id, dependee_id)) - else: - return result - - depender = self.kernel.id_to_insn[depender_id] - - if dependee_id in depender.depends_on: - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = None - for dep in depender.depends_on: - if self(dep, dependee_id): - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = False - return False - - def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): + dep_a = kernel.id_to_insn[dep_a] + dep_b = kernel.id_to_insn[dep_b] from loopy.kernel.data import AddressSpace if var_address_space == AddressSpace.GLOBAL: search_scopes 
= ["global", "any"] @@ -510,116 +474,187 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync +class DependencyTraversingMapper(object): + def __init__(self, insn_id_to_dep_reqs, depends_on): + self.insn_id_to_dep_reqs = insn_id_to_dep_reqs + self.depends_on = depends_on + + def move(self, insn_id, rev_deps): + if insn_id in rev_deps: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(rev_deps)) + + self.insn_id_to_dep_reqs[insn_id] -= rev_deps + new_rev_deps = rev_deps | set([insn_id]) + for depender in self.depends_on[insn_id]: + self.move(depender, new_rev_deps) + + return + + +class ReverseDependencyTraversingMapper(object): + def __init__(self, insn_id_to_dep_reqs, rev_depends): + self.insn_id_to_dep_reqs = insn_id_to_dep_reqs + self.rev_depends = rev_depends + + def move(self, insn_id, deps): + if insn_id in deps: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(deps)) + + self.insn_id_to_dep_reqs[insn_id] -= deps + new_deps = deps | set([insn_id]) + for depender in self.rev_depends[insn_id]: + self.move(depender, new_deps) + + return + + +def _get_address_space(kernel, var): + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg + if var in kernel.temporary_variables: + address_space = kernel.temporary_variables[var].address_space + else: + arg = kernel.arg_dict[var] + if isinstance(arg, ArrayArg): + address_space = arg.address_space + elif isinstance(arg, ValueArg): + address_space = AddressSpace.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine address_space of '%s'" % var) + return address_space + + def _check_variable_access_ordered_inner(kernel): + from loopy.kernel.tools import find_aliasing_equivalence_classes + from loopy.symbolic import AccessRangeOverlapChecker + overlap_checker = AccessRangeOverlapChecker(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + logger.debug("%s: check_variable_access_ordered: start" % kernel.name) - checked_variables = kernel.get_written_variables() & ( - set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + # insn_id_to_dep_reqs: A mapping from insn_id to set of insn_ids between + # whom dependency requirements should be proved in order to assert + # variable access ordering. + insn_id_to_dep_reqs = dict((insn.id, set()) for insn in + kernel.instructions) wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg - from loopy.kernel.tools import find_aliasing_equivalence_classes - - depfind = IndirectDependencyEdgeFinder(kernel) - aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + # {{{ populate 'insn_id_to_dep_reqs' - for name in checked_variables: - # This is a tad redundant in that this could probably be restructured - # to iterate only over equivalence classes and not individual variables. - # But then the access-range overlap check below would have to be smarter. 
- eq_class = aliasing_equiv_classes[name] + for var in kernel.get_written_variables(): + address_space = _get_address_space(kernel, var) + eq_class = aliasing_equiv_classes[var] readers = set.union( *[rmap.get(eq_name, set()) for eq_name in eq_class]) writers = set.union( *[wmap.get(eq_name, set()) for eq_name in eq_class]) - unaliased_readers = rmap.get(name, set()) - unaliased_writers = wmap.get(name, set()) - if not writers: - continue + for writer in writers: + required_deps = (readers | writers) - set([writer]) + required_deps = set([req_dep for req_dep in required_deps if not + declares_nosync_with(kernel, address_space, writer, + req_dep)]) - if name in kernel.temporary_variables: - address_space = kernel.temporary_variables[name].address_space - else: - arg = kernel.arg_dict[name] - if isinstance(arg, ArrayArg): - address_space = arg.address_space - elif isinstance(arg, ValueArg): - address_space = AddressSpace.PRIVATE - else: - # No need to consider ConstantArg and ImageArg (for now) - # because those won't be written. - raise ValueError("could not determine address_space of '%s'" % name) + insn_id_to_dep_reqs[writer] |= required_deps - # Check even for PRIVATE address space, to ensure intentional program order. + # }}} - from loopy.symbolic import AccessRangeOverlapChecker - overlap_checker = AccessRangeOverlapChecker(kernel) + # depends_on: mapping from insn_ids to their dependencies + depends_on = dict((insn.id, set()) for insn in + kernel.instructions) + # rev_depends: mapping from insn_ids to their reverse deps. + rev_depends = dict((insn.id, set()) for insn in + kernel.instructions) - for writer_id in writers: - for other_id in readers | writers: - if writer_id == other_id: - continue + # {{{ populate rev_depends, depends_on - writer = kernel.id_to_insn[writer_id] - other = kernel.id_to_insn[other_id] + for insn in kernel.instructions: + depends_on[insn.id].update(insn.depends_on) + for dep in insn.depends_on: + rev_depends[dep].add(insn.id) + # }}} - has_dependency_relationship = ( - declares_nosync_with(kernel, address_space, other, writer) - or - depfind(writer_id, other_id) - or - depfind(other_id, writer_id) - ) + forward_dep_traverser = DependencyTraversingMapper(insn_id_to_dep_reqs, + depends_on) + reverse_dep_traverser = ReverseDependencyTraversingMapper(insn_id_to_dep_reqs, + rev_depends) - if has_dependency_relationship: - continue + for insn_id, rev_deps in six.iteritems(rev_depends): + if not rev_deps: + forward_dep_traverser.move(insn_id, set()) - is_relationship_by_aliasing = not ( - writer_id in unaliased_writers - and (other_id in unaliased_writers - or other_id in unaliased_readers)) + for insn_id, deps in six.iteritems(depends_on): + if not deps: + reverse_dep_traverser.move(insn_id, set()) - # Do not enforce ordering for disjoint access ranges - if (not is_relationship_by_aliasing and not - overlap_checker.do_access_ranges_overlap_conservative( - writer_id, "w", other_id, "any", name)): - continue + for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): + insn = kernel.id_to_insn[insn_id] + vars_written_by_insn = insn.write_dependency_names() & ( + kernel.get_written_variables()) + for dep_id in dep_ids: + dep = kernel.id_to_insn[dep_id] + vars_accessed_by_dep = dep.dependency_names() & ( + kernel.get_written_variables() | ( + kernel.get_read_variables())) + eq_classes_accessed_by_dep = set().union( + *(aliasing_equiv_classes[_] for _ in vars_accessed_by_dep)) + + for var_written_by_insn in vars_written_by_insn: + eq_class = 
aliasing_equiv_classes[var_written_by_insn] + if eq_class & eq_classes_accessed_by_dep: + unaliased_readers = rmap.get(var_written_by_insn, set()) + unaliased_writers = wmap.get(var_written_by_insn, set()) + is_relationship_by_aliasing = not ( + insn_id in unaliased_writers + and (dep_id in unaliased_writers + or dep_id in unaliased_readers)) + + # Do not enforce ordering for disjoint access ranges + if (not is_relationship_by_aliasing and not + overlap_checker.do_access_ranges_overlap_conservative( + insn_id, "w", dep_id, "any", + var_written_by_insn)): + continue - # Do not enforce ordering for aliasing-based relationships - # in different groups. - if (is_relationship_by_aliasing and ( - bool(writer.groups & other.conflicts_with_groups) - or - bool(other.groups & writer.conflicts_with_groups))): - continue + # Do not enforce ordering for aliasing-based relationships + # in different groups. + if (is_relationship_by_aliasing and ( + bool(writer.groups & dep_id.conflicts_with_groups) + or + bool(dep_id.groups & writer.conflicts_with_groups))): + continue - msg = ("No dependency relationship found between " - "'{writer_id}' which writes {var} and " - "'{other_id}' which also accesses {var}. " - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % name - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. 
" + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=insn_id, + other_id=dep_id, + var=( + "the variable '%s'" % var_written_by_insn + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) logger.debug("%s: check_variable_access_ordered: done" % kernel.name) @@ -628,7 +663,7 @@ def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * a (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.Instruction.no_sync_with`) """ -- GitLab From 76c829e090439586d0ed3f640251eb3ce591cf05 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 18 May 2020 04:33:25 -0500 Subject: [PATCH 05/45] bug fixes in the graph traversal --- loopy/check.py | 62 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index f9fc1024a..0a010a229 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -475,9 +475,15 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): class DependencyTraversingMapper(object): - def __init__(self, insn_id_to_dep_reqs, depends_on): + """ + Helper class to traverse the dependency graph. + """ + def __init__(self, insn_id_to_dep_reqs, depends_on, rev_depends): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs self.depends_on = depends_on + self.rev_depends = rev_depends + self.visited = set() + self.memoized_rev_deps = {} def move(self, insn_id, rev_deps): if insn_id in rev_deps: @@ -485,18 +491,34 @@ class DependencyTraversingMapper(object): raise DependencyCycleFound("Dependency cycle found:" " '{}'.".format(rev_deps)) - self.insn_id_to_dep_reqs[insn_id] -= rev_deps - new_rev_deps = rev_deps | set([insn_id]) - for depender in self.depends_on[insn_id]: - self.move(depender, new_rev_deps) + if insn_id in self.visited: + return + + if self.rev_depends[insn_id] <= self.visited: + new_rev_deps = rev_deps | self.memoized_rev_deps.pop(insn_id, + set()) | set([insn_id]) + + self.insn_id_to_dep_reqs[insn_id] -= new_rev_deps + self.visited.add(insn_id) + for dep in self.depends_on[insn_id]: + self.move(dep, new_rev_deps) + else: + memoized_rev_deps = self.memoized_rev_deps.get(insn_id, set()) + memoized_rev_deps.update(rev_deps) return class ReverseDependencyTraversingMapper(object): - def __init__(self, insn_id_to_dep_reqs, rev_depends): + """ + Helper class to traverse the reverse dependency graph. 
+ """ + def __init__(self, insn_id_to_dep_reqs, rev_depends, depends_on): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs self.rev_depends = rev_depends + self.depends_on = depends_on + self.visited = set() + self.memoized_deps = {} def move(self, insn_id, deps): if insn_id in deps: @@ -504,10 +526,20 @@ class ReverseDependencyTraversingMapper(object): raise DependencyCycleFound("Dependency cycle found:" " '{}'.".format(deps)) - self.insn_id_to_dep_reqs[insn_id] -= deps - new_deps = deps | set([insn_id]) - for depender in self.rev_depends[insn_id]: - self.move(depender, new_deps) + if insn_id in self.visited: + return + + if self.depends_on[insn_id] <= self.visited: + new_deps = deps | self.memoized_deps.pop(insn_id, + set()) | set([insn_id]) + + self.insn_id_to_dep_reqs[insn_id] -= new_deps + self.visited.add(insn_id) + for rev_dep in self.rev_depends[insn_id]: + self.move(rev_dep, new_deps) + else: + memoized_deps = self.memoized_deps.get(insn_id, set()) + memoized_deps.update(deps) return @@ -583,18 +615,24 @@ def _check_variable_access_ordered_inner(kernel): # }}} forward_dep_traverser = DependencyTraversingMapper(insn_id_to_dep_reqs, - depends_on) + depends_on, rev_depends) reverse_dep_traverser = ReverseDependencyTraversingMapper(insn_id_to_dep_reqs, - rev_depends) + rev_depends, depends_on) for insn_id, rev_deps in six.iteritems(rev_depends): if not rev_deps: forward_dep_traverser.move(insn_id, set()) + assert set([insn.id for insn in kernel.instructions]) == ( + forward_dep_traverser.visited) + for insn_id, deps in six.iteritems(depends_on): if not deps: reverse_dep_traverser.move(insn_id, set()) + assert set([insn.id for insn in kernel.instructions]) == ( + reverse_dep_traverser.visited) + for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] vars_written_by_insn = insn.write_dependency_names() & ( -- GitLab From aea8d5125246ff232fc055fb05150f0ab49ba59c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 19 May 2020 02:22:35 -0500 Subject: [PATCH 06/45] fix bugs in graph traversal --- loopy/check.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 0a010a229..854b81ab1 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -476,7 +476,7 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): class DependencyTraversingMapper(object): """ - Helper class to traverse the dependency graph. + Helper class to traverse the dependency graph in a postorder fashion. """ def __init__(self, insn_id_to_dep_reqs, depends_on, rev_depends): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs @@ -503,7 +503,7 @@ class DependencyTraversingMapper(object): for dep in self.depends_on[insn_id]: self.move(dep, new_rev_deps) else: - memoized_rev_deps = self.memoized_rev_deps.get(insn_id, set()) + memoized_rev_deps = self.memoized_rev_deps.setdefault(insn_id, set()) memoized_rev_deps.update(rev_deps) return @@ -511,7 +511,8 @@ class DependencyTraversingMapper(object): class ReverseDependencyTraversingMapper(object): """ - Helper class to traverse the reverse dependency graph. + Helper class to traverse the reverse dependency graph in a postorder + fashion. 
""" def __init__(self, insn_id_to_dep_reqs, rev_depends, depends_on): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs @@ -538,7 +539,7 @@ class ReverseDependencyTraversingMapper(object): for rev_dep in self.rev_depends[insn_id]: self.move(rev_dep, new_deps) else: - memoized_deps = self.memoized_deps.get(insn_id, set()) + memoized_deps = self.memoized_deps.setdefault(insn_id, set()) memoized_deps.update(deps) return @@ -665,9 +666,9 @@ def _check_variable_access_ordered_inner(kernel): # Do not enforce ordering for aliasing-based relationships # in different groups. if (is_relationship_by_aliasing and ( - bool(writer.groups & dep_id.conflicts_with_groups) + bool(insn.groups & dep.conflicts_with_groups) or - bool(dep_id.groups & writer.conflicts_with_groups))): + bool(dep.groups & insn.conflicts_with_groups))): continue msg = ("No dependency relationship found between " -- GitLab From d739eb529d5bbd22457298c4a9998a8c1f4fd9e7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 19 May 2020 02:45:48 -0500 Subject: [PATCH 07/45] convert assertion to dep cycle error --- loopy/check.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 854b81ab1..a4ba9ff2c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -624,15 +624,25 @@ def _check_variable_access_ordered_inner(kernel): if not rev_deps: forward_dep_traverser.move(insn_id, set()) - assert set([insn.id for insn in kernel.instructions]) == ( + if set([insn.id for insn in kernel.instructions]) != ( + forward_dep_traverser.visited): + not_visited_insns = set([insn.id for insn in kernel.instructions]) - ( forward_dep_traverser.visited) + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found in:" + " '{}'.".format(not_visited_insns)) for insn_id, deps in six.iteritems(depends_on): if not deps: reverse_dep_traverser.move(insn_id, set()) - assert set([insn.id for insn in kernel.instructions]) == ( + if set([insn.id for insn in kernel.instructions]) != ( + reverse_dep_traverser.visited): + not_visited_insns = set([insn.id for insn in kernel.instructions]) - ( reverse_dep_traverser.visited) + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found in:" + " '{}'.".format(not_visited_insns)) for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] -- GitLab From 2c080762b136f4767aac66bfa8f9bcf82ced867b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 May 2020 14:11:25 -0500 Subject: [PATCH 08/45] adds a FIXME --- loopy/check.py | 69 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index a4ba9ff2c..999a412fa 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -478,14 +478,16 @@ class DependencyTraversingMapper(object): """ Helper class to traverse the dependency graph in a postorder fashion. 
""" - def __init__(self, insn_id_to_dep_reqs, depends_on, rev_depends): + def __init__(self, insn_id_to_dep_reqs, depends_on, rev_depends, + topological_order): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs self.depends_on = depends_on self.rev_depends = rev_depends + self.topological_order = topological_order self.visited = set() self.memoized_rev_deps = {} - def move(self, insn_id, rev_deps): + def rec(self, insn_id, rev_deps): if insn_id in rev_deps: from loopy.diagnostic import DependencyCycleFound raise DependencyCycleFound("Dependency cycle found:" @@ -500,28 +502,37 @@ class DependencyTraversingMapper(object): self.insn_id_to_dep_reqs[insn_id] -= new_rev_deps self.visited.add(insn_id) - for dep in self.depends_on[insn_id]: - self.move(dep, new_rev_deps) + + deps = self.depends_on[insn_id] + # FIXME: Adding the line below emits segfault for huge kernels + # deps = sorted(list(deps), key=lambda elem: + # (-self.topological_order[elem])) + for dep in deps: + self.rec(dep, new_rev_deps) else: memoized_rev_deps = self.memoized_rev_deps.setdefault(insn_id, set()) memoized_rev_deps.update(rev_deps) return + __call__ = rec + class ReverseDependencyTraversingMapper(object): """ Helper class to traverse the reverse dependency graph in a postorder fashion. """ - def __init__(self, insn_id_to_dep_reqs, rev_depends, depends_on): + def __init__(self, insn_id_to_dep_reqs, rev_depends, depends_on, + topological_order): self.insn_id_to_dep_reqs = insn_id_to_dep_reqs self.rev_depends = rev_depends self.depends_on = depends_on + self.topological_order = topological_order self.visited = set() self.memoized_deps = {} - def move(self, insn_id, deps): + def rec(self, insn_id, deps): if insn_id in deps: from loopy.diagnostic import DependencyCycleFound raise DependencyCycleFound("Dependency cycle found:" @@ -536,14 +547,21 @@ class ReverseDependencyTraversingMapper(object): self.insn_id_to_dep_reqs[insn_id] -= new_deps self.visited.add(insn_id) - for rev_dep in self.rev_depends[insn_id]: - self.move(rev_dep, new_deps) + rev_deps = self.rev_depends[insn_id] + # FIXME: Adding the line below emits segfault for huge kernels + # rev_deps = sorted(list(rev_deps), key=lambda elem: + # (-self.topological_order[elem])) + + for rev_dep in rev_deps: + self.rec(rev_dep, new_deps) else: memoized_deps = self.memoized_deps.setdefault(insn_id, set()) memoized_deps.update(deps) return + __call__ = rec + def _get_address_space(kernel, var): from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg @@ -562,6 +580,32 @@ def _get_address_space(kernel, var): return address_space +def _get_topological_order(kernel): + from pytools import natsorted + + # {{{ topological sort + + visited_insn_ids = set() + insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in visited_insn_ids: + return + visited_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + # }}} + + return dict((insn.id, i) for i, insn in enumerate(insn_order)) + + def _check_variable_access_ordered_inner(kernel): from loopy.kernel.tools import find_aliasing_equivalence_classes from loopy.symbolic import AccessRangeOverlapChecker @@ -615,14 +659,15 @@ def _check_variable_access_ordered_inner(kernel): rev_depends[dep].add(insn.id) # }}} + topological_order = _get_topological_order(kernel) forward_dep_traverser = DependencyTraversingMapper(insn_id_to_dep_reqs, - depends_on, rev_depends) + 
depends_on, rev_depends, topological_order) reverse_dep_traverser = ReverseDependencyTraversingMapper(insn_id_to_dep_reqs, - rev_depends, depends_on) + rev_depends, depends_on, topological_order) for insn_id, rev_deps in six.iteritems(rev_depends): if not rev_deps: - forward_dep_traverser.move(insn_id, set()) + forward_dep_traverser(insn_id, set()) if set([insn.id for insn in kernel.instructions]) != ( forward_dep_traverser.visited): @@ -634,7 +679,7 @@ def _check_variable_access_ordered_inner(kernel): for insn_id, deps in six.iteritems(depends_on): if not deps: - reverse_dep_traverser.move(insn_id, set()) + reverse_dep_traverser(insn_id, set()) if set([insn.id for insn in kernel.instructions]) != ( reverse_dep_traverser.visited): -- GitLab From a7161062b001aa9beb5b6ed8cc8b84ee5d6f3c02 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 May 2020 01:56:47 -0500 Subject: [PATCH 09/45] do a simple topological sort order traversal instead of recursion --- loopy/check.py | 163 ++++++++++++++----------------------------------- 1 file changed, 45 insertions(+), 118 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 999a412fa..f1f889a6e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -474,95 +474,6 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync -class DependencyTraversingMapper(object): - """ - Helper class to traverse the dependency graph in a postorder fashion. - """ - def __init__(self, insn_id_to_dep_reqs, depends_on, rev_depends, - topological_order): - self.insn_id_to_dep_reqs = insn_id_to_dep_reqs - self.depends_on = depends_on - self.rev_depends = rev_depends - self.topological_order = topological_order - self.visited = set() - self.memoized_rev_deps = {} - - def rec(self, insn_id, rev_deps): - if insn_id in rev_deps: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(rev_deps)) - - if insn_id in self.visited: - return - - if self.rev_depends[insn_id] <= self.visited: - new_rev_deps = rev_deps | self.memoized_rev_deps.pop(insn_id, - set()) | set([insn_id]) - - self.insn_id_to_dep_reqs[insn_id] -= new_rev_deps - self.visited.add(insn_id) - - deps = self.depends_on[insn_id] - # FIXME: Adding the line below emits segfault for huge kernels - # deps = sorted(list(deps), key=lambda elem: - # (-self.topological_order[elem])) - for dep in deps: - self.rec(dep, new_rev_deps) - else: - memoized_rev_deps = self.memoized_rev_deps.setdefault(insn_id, set()) - memoized_rev_deps.update(rev_deps) - - return - - __call__ = rec - - -class ReverseDependencyTraversingMapper(object): - """ - Helper class to traverse the reverse dependency graph in a postorder - fashion. 
- """ - def __init__(self, insn_id_to_dep_reqs, rev_depends, depends_on, - topological_order): - self.insn_id_to_dep_reqs = insn_id_to_dep_reqs - self.rev_depends = rev_depends - self.depends_on = depends_on - self.topological_order = topological_order - self.visited = set() - self.memoized_deps = {} - - def rec(self, insn_id, deps): - if insn_id in deps: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(deps)) - - if insn_id in self.visited: - return - - if self.depends_on[insn_id] <= self.visited: - new_deps = deps | self.memoized_deps.pop(insn_id, - set()) | set([insn_id]) - - self.insn_id_to_dep_reqs[insn_id] -= new_deps - self.visited.add(insn_id) - rev_deps = self.rev_depends[insn_id] - # FIXME: Adding the line below emits segfault for huge kernels - # rev_deps = sorted(list(rev_deps), key=lambda elem: - # (-self.topological_order[elem])) - - for rev_dep in rev_deps: - self.rec(rev_dep, new_deps) - else: - memoized_deps = self.memoized_deps.setdefault(insn_id, set()) - memoized_deps.update(deps) - - return - - __call__ = rec - - def _get_address_space(kernel, var): from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg if var in kernel.temporary_variables: @@ -603,7 +514,7 @@ def _get_topological_order(kernel): # }}} - return dict((insn.id, i) for i, insn in enumerate(insn_order)) + return [insn.id for insn in insn_order] def _check_variable_access_ordered_inner(kernel): @@ -660,34 +571,50 @@ def _check_variable_access_ordered_inner(kernel): # }}} topological_order = _get_topological_order(kernel) - forward_dep_traverser = DependencyTraversingMapper(insn_id_to_dep_reqs, - depends_on, rev_depends, topological_order) - reverse_dep_traverser = ReverseDependencyTraversingMapper(insn_id_to_dep_reqs, - rev_depends, depends_on, topological_order) - - for insn_id, rev_deps in six.iteritems(rev_depends): - if not rev_deps: - forward_dep_traverser(insn_id, set()) - - if set([insn.id for insn in kernel.instructions]) != ( - forward_dep_traverser.visited): - not_visited_insns = set([insn.id for insn in kernel.instructions]) - ( - forward_dep_traverser.visited) - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found in:" - " '{}'.".format(not_visited_insns)) - - for insn_id, deps in six.iteritems(depends_on): - if not deps: - reverse_dep_traverser(insn_id, set()) - - if set([insn.id for insn in kernel.instructions]) != ( - reverse_dep_traverser.visited): - not_visited_insns = set([insn.id for insn in kernel.instructions]) - ( - reverse_dep_traverser.visited) - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found in:" - " '{}'.".format(not_visited_insns)) + + # {{{ forward dep. traversal + + # memoized_predecessors: mapping from insn_id to its direct/indirect predecessors + memoized_predecessors = {} + + for insn_id in topological_order[::-1]: + # accumulated_predecessors:insn_id's direct+indirect predecessors + accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) + + if insn_id in accumulated_predecessors: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(accumulated_predecessors)) + + insn_id_to_dep_reqs[insn_id] -= accumulated_predecessors + + for dep in depends_on[insn_id]: + memoized_predecessors.setdefault(dep, set()).update( + accumulated_predecessors | set([insn_id])) + + # }}} + + # {{{ reverse dep. 
traversal + + # memoized_successors: mapping from insn_id to its direct/indirect successors + memoized_successors = {} + + for insn_id in topological_order: + # accumulated_predecessors:insn_id's direct+indirect predecessors + accumulated_successors = memoized_successors.pop(insn_id, set()) + + if insn_id in accumulated_successors: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(accumulated_predecessors)) + + insn_id_to_dep_reqs[insn_id] -= accumulated_successors + + for rev_dep in rev_depends[insn_id]: + memoized_successors.setdefault(rev_dep, set()).update( + accumulated_successors | set([insn_id])) + + # }}} for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] -- GitLab From 2b0f6d1e103efa01b25adca74ec3c298912e519e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 May 2020 11:29:14 -0500 Subject: [PATCH 10/45] fills some voids in checking dep cycles --- loopy/check.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index f1f889a6e..82d9e44a6 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -577,6 +577,7 @@ def _check_variable_access_ordered_inner(kernel): # memoized_predecessors: mapping from insn_id to its direct/indirect predecessors memoized_predecessors = {} + # reverse postorder traversal of dependency graph for insn_id in topological_order[::-1]: # accumulated_predecessors:insn_id's direct+indirect predecessors accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) @@ -594,11 +595,17 @@ def _check_variable_access_ordered_inner(kernel): # }}} + if memoized_predecessors: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(next(iter(six.viewvalues(memoized_predecessors))))) + # {{{ reverse dep. traversal # memoized_successors: mapping from insn_id to its direct/indirect successors memoized_successors = {} + # postorder traversal of dependency graph for insn_id in topological_order: # accumulated_predecessors:insn_id's direct+indirect predecessors accumulated_successors = memoized_successors.pop(insn_id, set()) @@ -616,6 +623,11 @@ def _check_variable_access_ordered_inner(kernel): # }}} + if memoized_successors: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound("Dependency cycle found:" + " '{}'.".format(next(iter(six.viewvalues(memoized_successors))))) + for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] vars_written_by_insn = insn.write_dependency_names() & ( -- GitLab From 008467471ebe71ef60340167e19671b16b240ae6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 25 May 2020 20:05:24 -0500 Subject: [PATCH 11/45] convert the FIXME to a TODO --- loopy/schedule/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 7e933efc8..6aeccf216 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -662,8 +662,7 @@ def schedule_as_many_run_insns_as_possible(sched_state): all available instructions in the current loop nesting to the schedule. """ if sched_state.preschedule: - #FIXME: unsure how to perform this optimization for a scheduler with a - # preschedule, bail for now. + #TODO: handle cases with a preschedule. 
return sched_state # {{{ topological sort -- GitLab From 356fff2169d9829db4702ee6198f6fc8c7fa1988 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Jun 2020 18:49:27 -0500 Subject: [PATCH 12/45] uses pytools.compute_topological_order for performing topological sorting --- loopy/check.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 82d9e44a6..aae119af9 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -492,29 +492,14 @@ def _get_address_space(kernel, var): def _get_topological_order(kernel): - from pytools import natsorted - - # {{{ topological sort - - visited_insn_ids = set() - insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in visited_insn_ids: - return - visited_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - insn_order.append(insn) + from pytools.graph import compute_topological_order + rev_dep_map = {insn.id: set() for insn in kernel.instructions} for insn in kernel.instructions: - insert_insn_into_order(insn) - - # }}} + for dep in insn.depends_on: + rev_dep_map[dep].add(insn.id) - return [insn.id for insn in insn_order] + return compute_topological_order(rev_dep_map) def _check_variable_access_ordered_inner(kernel): -- GitLab From 61738c3cb72255b275789e9c998705f3256360eb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Jun 2020 19:22:01 -0500 Subject: [PATCH 13/45] [scheduler]: decrease time complexity of schedule_as_many_insns_as_possible --- loopy/schedule/__init__.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 6aeccf216..b3c697431 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -695,24 +695,32 @@ def schedule_as_many_run_insns_as_possible(sched_state): updated_sched_state = sched_state.copy() + num_insns_to_be_scheduled = 0 + for insn in toposorted_insns: + num_insns_to_be_scheduled += 1 if isinstance(insn, MultiAssignmentBase): if insn.within_inames == frozenset(sched_state.active_inames): - sched_item = RunInstruction(insn_id=insn.id) - updated_schedule = updated_sched_state.schedule + (sched_item, ) - updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids - | frozenset([insn.id])) - updated_unscheduled_insn_ids = ( - updated_sched_state.unscheduled_insn_ids - - frozenset([insn.id])) - updated_sched_state = updated_sched_state.copy( - insn_ids_to_try=None, - schedule=updated_schedule, - scheduled_insn_ids=updated_scheduled_insn_ids, - unscheduled_insn_ids=updated_unscheduled_insn_ids) continue break + schedulable_insn_ids = tuple(insn.id for insn in + toposorted_insns[:num_insns_to_be_scheduled]) + sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in + schedulable_insn_ids) + + updated_schedule = updated_sched_state.schedule + sched_items + updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids + | frozenset(schedulable_insn_ids)) + updated_unscheduled_insn_ids = ( + updated_sched_state.unscheduled_insn_ids + - frozenset(schedulable_insn_ids)) + updated_sched_state = updated_sched_state.copy( + insn_ids_to_try=None, + schedule=updated_schedule, + scheduled_insn_ids=updated_scheduled_insn_ids, + unscheduled_insn_ids=updated_unscheduled_insn_ids) + return updated_sched_state -- GitLab From b6b019d31ca8f5885f5030fdbbe4764b93e9d662 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 
Jun 2020 20:27:04 -0500 Subject: [PATCH 14/45] [scheduler]: use pytools.compute_topological_order for topological sort --- loopy/schedule/__init__.py | 51 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b3c697431..4bf524b47 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools import natsorted, MinRecursionLimit, ProcessLogger +from pytools import memoize_method, MinRecursionLimit, ProcessLogger from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder @@ -655,6 +655,17 @@ class SchedulerState(ImmutableRecord): else: return None + @memoize_method + def get_insn_ids_in_a_topologically_sorted_order(self): + from pytools.graph import compute_topological_order + + rev_dep_map = {insn.id: set() for insn in self.kernel.instructions} + for insn in self.kernel.instructions: + for dep in insn.depends_on: + rev_dep_map[dep].add(insn.id) + + return compute_topological_order(rev_dep_map) + def schedule_as_many_run_insns_as_possible(sched_state): """ @@ -667,25 +678,11 @@ def schedule_as_many_run_insns_as_possible(sched_state): # {{{ topological sort - visited_insn_ids = set() - toposorted_insns = [] - - def insert_insn_into_order(insn): - if insn.id in visited_insn_ids: - return - visited_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on & sched_state.unscheduled_insn_ids): - dep_insn = sched_state.kernel.id_to_insn[dep_id] - if frozenset(sched_state.active_inames) <= dep_insn.within_inames: - insert_insn_into_order(dep_insn) - - toposorted_insns.append(insn) - - for unscheduled_insn_id in sched_state.unscheduled_insn_ids: - unscheduled_insn = sched_state.kernel.id_to_insn[unscheduled_insn_id] - if frozenset(sched_state.active_inames) <= unscheduled_insn.within_inames: - insert_insn_into_order(unscheduled_insn) + toposorted_insn_ids = tuple(insn_id for insn_id in + sched_state.get_insn_ids_in_a_topologically_sorted_order() if + insn_id in sched_state.unscheduled_insn_ids and ( + sched_state.kernel.id_to_insn[insn_id].within_inames >= + frozenset(sched_state.active_inames))) # }}} @@ -697,24 +694,24 @@ def schedule_as_many_run_insns_as_possible(sched_state): num_insns_to_be_scheduled = 0 - for insn in toposorted_insns: - num_insns_to_be_scheduled += 1 + for insn_id in toposorted_insn_ids: + insn = sched_state.kernel.id_to_insn[insn_id] if isinstance(insn, MultiAssignmentBase): if insn.within_inames == frozenset(sched_state.active_inames): + num_insns_to_be_scheduled += 1 continue break - schedulable_insn_ids = tuple(insn.id for insn in - toposorted_insns[:num_insns_to_be_scheduled]) + newly_scheduled_insn_ids = toposorted_insn_ids[:num_insns_to_be_scheduled] sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in - schedulable_insn_ids) + newly_scheduled_insn_ids) updated_schedule = updated_sched_state.schedule + sched_items updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids - | frozenset(schedulable_insn_ids)) + | frozenset(newly_scheduled_insn_ids)) updated_unscheduled_insn_ids = ( updated_sched_state.unscheduled_insn_ids - - frozenset(schedulable_insn_ids)) + - frozenset(newly_scheduled_insn_ids)) updated_sched_state = updated_sched_state.copy( insn_ids_to_try=None, schedule=updated_schedule, -- GitLab From f0891c2f003536890d7abd4dc0845a49d1357e05 Mon Sep 
17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 2 Jun 2020 01:09:03 -0500 Subject: [PATCH 15/45] catch CycleError and emit DependencyCycleFound --- loopy/check.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index aae119af9..81e0fb9c9 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -492,14 +492,20 @@ def _get_address_space(kernel, var): def _get_topological_order(kernel): - from pytools.graph import compute_topological_order + from pytools.graph import compute_topological_order, CycleError rev_dep_map = {insn.id: set() for insn in kernel.instructions} for insn in kernel.instructions: for dep in insn.depends_on: rev_dep_map[dep].add(insn.id) - return compute_topological_order(rev_dep_map) + try: + order = compute_topological_order(rev_dep_map) + except CycleError as e: + from loopy.diagnostic import DependencyCycleFound + raise DependencyCycleFound(str(e)) + + return order def _check_variable_access_ordered_inner(kernel): @@ -567,11 +573,6 @@ def _check_variable_access_ordered_inner(kernel): # accumulated_predecessors:insn_id's direct+indirect predecessors accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) - if insn_id in accumulated_predecessors: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(accumulated_predecessors)) - insn_id_to_dep_reqs[insn_id] -= accumulated_predecessors for dep in depends_on[insn_id]: @@ -580,11 +581,6 @@ def _check_variable_access_ordered_inner(kernel): # }}} - if memoized_predecessors: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(next(iter(six.viewvalues(memoized_predecessors))))) - # {{{ reverse dep. 
traversal # memoized_successors: mapping from insn_id to its direct/indirect successors @@ -595,11 +591,6 @@ def _check_variable_access_ordered_inner(kernel): # accumulated_predecessors:insn_id's direct+indirect predecessors accumulated_successors = memoized_successors.pop(insn_id, set()) - if insn_id in accumulated_successors: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(accumulated_predecessors)) - insn_id_to_dep_reqs[insn_id] -= accumulated_successors for rev_dep in rev_depends[insn_id]: @@ -608,11 +599,6 @@ def _check_variable_access_ordered_inner(kernel): # }}} - if memoized_successors: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("Dependency cycle found:" - " '{}'.".format(next(iter(six.viewvalues(memoized_successors))))) - for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] vars_written_by_insn = insn.write_dependency_names() & ( -- GitLab From 62fcfea7fe321c24ce16dc2f8d4e1ee208b275a9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 2 Jun 2020 16:46:03 -0500 Subject: [PATCH 16/45] better dep cycle error msg --- loopy/check.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 81e0fb9c9..2262aed1b 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -493,6 +493,7 @@ def _get_address_space(kernel, var): def _get_topological_order(kernel): from pytools.graph import compute_topological_order, CycleError + from loopy.diagnostic import DependencyCycleFound rev_dep_map = {insn.id: set() for insn in kernel.instructions} for insn in kernel.instructions: @@ -502,8 +503,40 @@ def _get_topological_order(kernel): try: order = compute_topological_order(rev_dep_map) except CycleError as e: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound(str(e)) + root = e.node + + visited = set() + visiting = [root] + stack = [(root, iter(rev_dep_map[root]))] + + # Depth first traversal from root + while stack: + node, children = stack.pop() + + for child in children: + # note: each iteration removes child from children + if child in visiting: + cycle = visiting[visiting.index(child):]+[child, ] + raise DependencyCycleFound('->'.join(cycle)) + + if child in visited: + continue + + visiting.append(child) + + # put (node, children) back on stack + stack.append((node, children)) + + # put (child, grandchildren) on stack + stack.append((child, iter(rev_dep_map.get(child, ())))) + break + else: + # loop did not break, + # so either this is a leaf or all children have been visited + assert node == visiting.pop() + visited.add(node) + + assert False return order -- GitLab From 9a4b7742b52cf2d1b410a507937842831f9c23e3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 2 Jun 2020 20:49:37 -0500 Subject: [PATCH 17/45] use functions for traversal --- loopy/check.py | 64 +++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 2262aed1b..f6f35776c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -596,41 +596,35 @@ def _check_variable_access_ordered_inner(kernel): topological_order = _get_topological_order(kernel) - # {{{ forward dep. 
traversal - - # memoized_predecessors: mapping from insn_id to its direct/indirect predecessors - memoized_predecessors = {} - - # reverse postorder traversal of dependency graph - for insn_id in topological_order[::-1]: - # accumulated_predecessors:insn_id's direct+indirect predecessors - accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) - - insn_id_to_dep_reqs[insn_id] -= accumulated_predecessors - - for dep in depends_on[insn_id]: - memoized_predecessors.setdefault(dep, set()).update( - accumulated_predecessors | set([insn_id])) - - # }}} - - # {{{ reverse dep. traversal - - # memoized_successors: mapping from insn_id to its direct/indirect successors - memoized_successors = {} - - # postorder traversal of dependency graph - for insn_id in topological_order: - # accumulated_predecessors:insn_id's direct+indirect predecessors - accumulated_successors = memoized_successors.pop(insn_id, set()) - - insn_id_to_dep_reqs[insn_id] -= accumulated_successors - - for rev_dep in rev_depends[insn_id]: - memoized_successors.setdefault(rev_dep, set()).update( - accumulated_successors | set([insn_id])) - - # }}} + def discard_dep_reqs_in_order(insn_id_to_dep_reqs, edges, order): + """ + Subtracts dependency requirements of insn_ids by all direct/indirect + predecessors of a directed graph of insn_ids as nodes and *edges* as + the connectivity. + + :arg order: An instance of :class:`list` of instruction ids in which the + *edges* graph is to be traversed. + """ + # memoized_predecessors: mapping from insn_id to its direct/indirect + # predecessors + memoized_predecessors = {} + + # reverse postorder traversal of dependency graph + for insn_id in order: + # accumulated_predecessors:insn_id's direct+indirect predecessors + accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) + + insn_id_to_dep_reqs[insn_id] -= accumulated_predecessors + + for successor in edges[insn_id]: + memoized_predecessors.setdefault(successor, set()).update( + accumulated_predecessors | set([insn_id])) + + # forward dep. graph traversal in reverse topological sort order + discard_dep_reqs_in_order(insn_id_to_dep_reqs, depends_on, + topological_order[::-1]) + # reverse dep. graph traversal in topological sort order + discard_dep_reqs_in_order(insn_id_to_dep_reqs, rev_depends, topological_order) for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] -- GitLab From 32fd07d8c825652fe6c1ee60e28b902f66850adb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 3 Jun 2020 12:20:08 -0500 Subject: [PATCH 18/45] better naming --- loopy/check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index f6f35776c..07029713f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -626,6 +626,8 @@ def _check_variable_access_ordered_inner(kernel): # reverse dep. 
graph traversal in topological sort order discard_dep_reqs_in_order(insn_id_to_dep_reqs, rev_depends, topological_order) + # {{{ handle dependency requirements that weren't satisfied + for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs): insn = kernel.id_to_insn[insn_id] vars_written_by_insn = insn.write_dependency_names() & ( @@ -636,7 +638,7 @@ def _check_variable_access_ordered_inner(kernel): kernel.get_written_variables() | ( kernel.get_read_variables())) eq_classes_accessed_by_dep = set().union( - *(aliasing_equiv_classes[_] for _ in vars_accessed_by_dep)) + *(aliasing_equiv_classes[var] for var in vars_accessed_by_dep)) for var_written_by_insn in vars_written_by_insn: eq_class = aliasing_equiv_classes[var_written_by_insn] @@ -687,6 +689,8 @@ def _check_variable_access_ordered_inner(kernel): from loopy.diagnostic import VariableAccessNotOrdered raise VariableAccessNotOrdered(msg) + # }}} + logger.debug("%s: check_variable_access_ordered: done" % kernel.name) -- GitLab From 4e74a48e7c15dd699554c1fae7ab4863d6ff2b76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 3 Jun 2020 14:56:28 -0500 Subject: [PATCH 19/45] adds an assert and some comments to clarify check_variable_access_ordered --- loopy/check.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 07029713f..0b42772d3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -632,7 +632,7 @@ def _check_variable_access_ordered_inner(kernel): insn = kernel.id_to_insn[insn_id] vars_written_by_insn = insn.write_dependency_names() & ( kernel.get_written_variables()) - for dep_id in dep_ids: + for dep_id in dep_ids: # iterate through deps reqs. which weren't satisfied dep = kernel.id_to_insn[dep_id] vars_accessed_by_dep = dep.dependency_names() & ( kernel.get_written_variables() | ( @@ -640,9 +640,14 @@ def _check_variable_access_ordered_inner(kernel): eq_classes_accessed_by_dep = set().union( *(aliasing_equiv_classes[var] for var in vars_accessed_by_dep)) + found_var_responsible_for_dep_req = False + + # iterate through all the variables written by 'insn' to find + # which was responsible for the dependency requirement for var_written_by_insn in vars_written_by_insn: eq_class = aliasing_equiv_classes[var_written_by_insn] if eq_class & eq_classes_accessed_by_dep: + found_var_responsible_for_dep_req = True unaliased_readers = rmap.get(var_written_by_insn, set()) unaliased_writers = wmap.get(var_written_by_insn, set()) is_relationship_by_aliasing = not ( @@ -689,6 +694,8 @@ def _check_variable_access_ordered_inner(kernel): from loopy.diagnostic import VariableAccessNotOrdered raise VariableAccessNotOrdered(msg) + assert found_var_responsible_for_dep_req + # }}} logger.debug("%s: check_variable_access_ordered: done" % kernel.name) -- GitLab From 37b47bd2a898b6271cc329016d61f6485f938402 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 3 Jun 2020 16:17:31 -0500 Subject: [PATCH 20/45] use compute_sccs to compute topological order and detect cycles at once --- loopy/check.py | 49 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 0b42772d3..493e41e1f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -492,51 +492,18 @@ def _get_address_space(kernel, var): def _get_topological_order(kernel): - from pytools.graph import compute_topological_order, CycleError + from pytools.graph import compute_sccs from loopy.diagnostic import DependencyCycleFound - rev_dep_map = 
{insn.id: set() for insn in kernel.instructions} - for insn in kernel.instructions: - for dep in insn.depends_on: - rev_dep_map[dep].add(insn.id) - - try: - order = compute_topological_order(rev_dep_map) - except CycleError as e: - root = e.node - - visited = set() - visiting = [root] - stack = [(root, iter(rev_dep_map[root]))] - - # Depth first traversal from root - while stack: - node, children = stack.pop() - - for child in children: - # note: each iteration removes child from children - if child in visiting: - cycle = visiting[visiting.index(child):]+[child, ] - raise DependencyCycleFound('->'.join(cycle)) + dep_map = {insn.id: insn.depends_on for insn in kernel.instructions} - if child in visited: - continue - - visiting.append(child) - - # put (node, children) back on stack - stack.append((node, children)) - - # put (child, grandchildren) on stack - stack.append((child, iter(rev_dep_map.get(child, ())))) - break - else: - # loop did not break, - # so either this is a leaf or all children have been visited - assert node == visiting.pop() - visited.add(node) + sccs = compute_sccs(dep_map) + order = [] - assert False + for scc in sccs: + if len(scc) != 1: + raise DependencyCycleFound(', '.join(scc)) + order.append(scc[0]) return order -- GitLab From 5ba502ab46b20be393015cf84e16e3eaed4d9033 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 3 Jun 2020 23:33:01 -0500 Subject: [PATCH 21/45] [scheduler]: schedule_as_many_insns_as_possible: handle cases with a preschedule --- loopy/schedule/__init__.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 4bf524b47..a77871929 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -672,8 +672,13 @@ def schedule_as_many_run_insns_as_possible(sched_state): Returns an instance of :class:`loopy.schedule.SchedulerState`, by appending all available instructions in the current loop nesting to the schedule. """ - if sched_state.preschedule: - #TODO: handle cases with a preschedule. 
+ + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + + if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel, Barrier)): return sched_state # {{{ topological sort @@ -703,6 +708,12 @@ def schedule_as_many_run_insns_as_possible(sched_state): break newly_scheduled_insn_ids = toposorted_insn_ids[:num_insns_to_be_scheduled] + num_presched_insns_newly_scheduled = len(set(newly_scheduled_insn_ids) & + sched_state.prescheduled_insn_ids) + + assert all(isinstance(sched_item, RunInstruction) and sched_item.insn_id in + newly_scheduled_insn_ids for sched_item in + sched_state.preschedule[:num_presched_insns_newly_scheduled]) sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in newly_scheduled_insn_ids) @@ -716,7 +727,9 @@ def schedule_as_many_run_insns_as_possible(sched_state): insn_ids_to_try=None, schedule=updated_schedule, scheduled_insn_ids=updated_scheduled_insn_ids, - unscheduled_insn_ids=updated_unscheduled_insn_ids) + unscheduled_insn_ids=updated_unscheduled_insn_ids, + preschedule=sched_state.preschedule[num_presched_insns_newly_scheduled:] + ) return updated_sched_state -- GitLab From b492ba921555cf42a3c51166ea2142932874b4d9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Jun 2020 12:24:59 -0500 Subject: [PATCH 22/45] [scheduler]: always try to schedule multiple instructions --- loopy/schedule/__init__.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index a77871929..5965196b8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -747,6 +747,10 @@ def generate_loop_schedules_internal( else: rec_allow_boost = False + if not rec_allow_boost: + sched_state = ( + schedule_as_many_run_insns_as_possible(sched_state)) + active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( @@ -1135,10 +1139,6 @@ def generate_loop_schedules_internal( not in sched_state.prescheduled_inames else sched_state.preschedule[1:])) - if not rec_allow_boost: - new_sched_state = ( - schedule_as_many_run_insns_as_possible(new_sched_state)) - for sub_sched in generate_loop_schedules_internal( new_sched_state, allow_boost=rec_allow_boost, debug=debug): @@ -1356,11 +1356,6 @@ def generate_loop_schedules_internal( else sched_state.preschedule[1:]), ) - if not rec_allow_boost: - new_sched_state = ( - schedule_as_many_run_insns_as_possible( - new_sched_state)) - for sub_sched in generate_loop_schedules_internal( new_sched_state, allow_boost=rec_allow_boost, -- GitLab From d813a48ef04b71a078328e00db15d3b384f0117a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Jun 2020 16:53:00 -0500 Subject: [PATCH 23/45] [scheduler]: do not schedule many insns at once if not in subkernel --- loopy/schedule/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 5965196b8..ac3270ac3 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -681,6 +681,9 @@ def schedule_as_many_run_insns_as_possible(sched_state): if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel, Barrier)): return sched_state + if not sched_state.within_subkernel: + return sched_state + # {{{ topological sort toposorted_insn_ids = tuple(insn_id for insn_id in -- GitLab From 9842ab447fbbbd8c0fc0c10534f0a00a7f3830e4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Jun 2020 16:58:16 -0500 Subject: [PATCH 24/45] 
[scheduler]: comment

---
 loopy/schedule/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index ac3270ac3..665ef0e09 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -682,6 +682,7 @@ def schedule_as_many_run_insns_as_possible(sched_state):
         return sched_state
 
     if not sched_state.within_subkernel:
+        # cannot schedule RunInstructions when not in subkernel
         return sched_state
 
     # {{{ topological sort
-- 
GitLab


From ffb14a43ffa5c6d486566c92e5a1d86f3b70f52c Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Fri, 19 Jun 2020 16:53:58 -0500
Subject: [PATCH 25/45] corrects the vars written by insn

---
 loopy/check.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/loopy/check.py b/loopy/check.py
index 493e41e1f..cb4fab44a 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -597,8 +597,7 @@ def _check_variable_access_ordered_inner(kernel):
 
     for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs):
         insn = kernel.id_to_insn[insn_id]
-        vars_written_by_insn = insn.write_dependency_names() & (
-                kernel.get_written_variables())
+        vars_written_by_insn = set(insn.assignee_var_names())
         for dep_id in dep_ids:  # iterate through deps reqs. which weren't satisfied
             dep = kernel.id_to_insn[dep_id]
             vars_accessed_by_dep = dep.dependency_names() & (
-- 
GitLab


From d074f97122a5097882e6f69a76afc10e5962d91b Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 20 Jun 2020 18:49:12 -0500
Subject: [PATCH 26/45] change how dep reqs are stored

---
 loopy/check.py | 136 ++++++++++++++++++++++---------------------
 1 file changed, 61 insertions(+), 75 deletions(-)

diff --git a/loopy/check.py b/loopy/check.py
index cb4fab44a..fdc8874a6 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -516,16 +516,15 @@ def _check_variable_access_ordered_inner(kernel):
 
     logger.debug("%s: check_variable_access_ordered: start" % kernel.name)
 
-    # insn_id_to_dep_reqs: A mapping from insn_id to set of insn_ids between
-    # whom dependency requirements should be proved in order to assert
-    # variable access ordering.
-    insn_id_to_dep_reqs = dict((insn.id, set()) for insn in
-            kernel.instructions)
+    # dep_reqs_to_vars: A mapping from instruction-id pairs
+    # (writer_id, dep_req_id), between which a dependency must be established,
+    # to the set of variables that prompted the dependency requirement.
+    dep_reqs_to_vars = {}
 
     wmap = kernel.writer_map()
     rmap = kernel.reader_map()
 
-    # {{{ populate 'insn_id_to_dep_reqs'
+    # {{{ populate 'dep_reqs_to_vars'
 
     for var in kernel.get_written_variables():
         address_space = _get_address_space(kernel, var)
@@ -542,7 +541,8 @@ def _check_variable_access_ordered_inner(kernel):
                     declares_nosync_with(kernel, address_space, writer,
                         req_dep)])
 
-            insn_id_to_dep_reqs[writer] |= required_deps
+            for req_dep in required_deps:
+                dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var)
 
     # }}}
 
@@ -563,7 +563,7 @@ def _check_variable_access_ordered_inner(kernel):
 
     topological_order = _get_topological_order(kernel)
 
-    def discard_dep_reqs_in_order(insn_id_to_dep_reqs, edges, order):
+    def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
         """
         Subtracts dependency requirements of insn_ids by all direct/indirect
         predecessors of a directed graph of insn_ids as nodes and *edges* as
@@ -581,86 +581,72 @@ def _check_variable_access_ordered_inner(kernel):
             # accumulated_predecessors:insn_id's direct+indirect predecessors
             accumulated_predecessors = memoized_predecessors.pop(insn_id, set())
 
-            insn_id_to_dep_reqs[insn_id] -= accumulated_predecessors
+            for pred in accumulated_predecessors:
+                dep_reqs_to_vars.pop((insn_id, pred), None)
 
             for successor in edges[insn_id]:
                 memoized_predecessors.setdefault(successor, set()).update(
                         accumulated_predecessors | set([insn_id]))
 
     # forward dep. graph traversal in reverse topological sort order
     discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on,
             topological_order[::-1])
     # reverse dep. graph traversal in topological sort order
     discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order)
 
     # {{{ handle dependency requirements that weren't satisfied
 
-    for insn_id, dep_ids in six.iteritems(insn_id_to_dep_reqs):
-        insn = kernel.id_to_insn[insn_id]
-        vars_written_by_insn = set(insn.assignee_var_names())
-        for dep_id in dep_ids:  # iterate through deps reqs. which weren't satisfied
-            dep = kernel.id_to_insn[dep_id]
-            vars_accessed_by_dep = dep.dependency_names() & (
-                    kernel.get_written_variables() | (
-                        kernel.get_read_variables()))
-            eq_classes_accessed_by_dep = set().union(
-                    *(aliasing_equiv_classes[var] for var in vars_accessed_by_dep))
-
-            found_var_responsible_for_dep_req = False
-
-            # iterate through all the variables written by 'insn' to find
-            # which was responsible for the dependency requirement
-            for var_written_by_insn in vars_written_by_insn:
-                eq_class = aliasing_equiv_classes[var_written_by_insn]
-                if eq_class & eq_classes_accessed_by_dep:
-                    found_var_responsible_for_dep_req = True
-                    unaliased_readers = rmap.get(var_written_by_insn, set())
-                    unaliased_writers = wmap.get(var_written_by_insn, set())
-                    is_relationship_by_aliasing = not (
-                        insn_id in unaliased_writers
-                        and (dep_id in unaliased_writers
-                            or dep_id in unaliased_readers))
-
-                    # Do not enforce ordering for disjoint access ranges
-                    if (not is_relationship_by_aliasing and not
-                            overlap_checker.do_access_ranges_overlap_conservative(
-                                insn_id, "w", dep_id, "any",
-                                var_written_by_insn)):
-                        continue
+    for (writer_id, other_id), variables in six.iteritems(dep_reqs_to_vars):
+        writer = kernel.id_to_insn[writer_id]
+        other = kernel.id_to_insn[other_id]
+
+        for var in variables:
+            eq_class = aliasing_equiv_classes[var]
+            unaliased_readers = rmap.get(var, set())
+            unaliased_writers = wmap.get(var, set())
+
+            is_relationship_by_aliasing = not (
+                    writer_id in unaliased_writers
+                    and (other_id in unaliased_writers
+                        or other_id in unaliased_readers))
+
+            # Do not enforce ordering for disjoint access ranges
+            if (not is_relationship_by_aliasing and not
+                    overlap_checker.do_access_ranges_overlap_conservative(
+                        writer_id, "w", other_id, "any", var)):
+                continue
+
+            # Do not enforce ordering for aliasing-based relationships
+            # in different groups.
- if (is_relationship_by_aliasing and ( - bool(insn.groups & dep.conflicts_with_groups) - or - bool(dep.groups & insn.conflicts_with_groups))): - continue + for var in variables: + eq_class = aliasing_equiv_classes[var] + unaliased_readers = rmap.get(var, set()) + unaliased_writers = wmap.get(var, set()) + + is_relationship_by_aliasing = not ( + writer_id in unaliased_writers + and (writer_id in unaliased_writers + or other_id in unaliased_readers)) + + # Do not enforce ordering for disjoint access ranges + if (not is_relationship_by_aliasing and not + overlap_checker.do_access_ranges_overlap_conservative( + writer_id, "w", other_id, "any", var)): + continue + + # Do not enforce ordering for aliasing-based relationships + # in different groups. + if (is_relationship_by_aliasing and ( + bool(insn.groups & other.conflicts_with_groups) + or + bool(other.groups & writer.conflicts_with_groups))): + continue - msg = ("No dependency relationship found between " - "'{writer_id}' which writes {var} and " - "'{other_id}' which also accesses {var}. " - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=insn_id, - other_id=dep_id, - var=( - "the variable '%s'" % var_written_by_insn - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) - - assert found_var_responsible_for_dep_req + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. " + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) # }}} -- GitLab From 33bad321051822dc5edc2cb7cca3f9d5bd766c20 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 20 Jun 2020 19:41:14 -0500 Subject: [PATCH 27/45] human error: indentation --- loopy/check.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index fdc8874a6..d1923144e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -624,29 +624,29 @@ def _check_variable_access_ordered_inner(kernel): bool(other.groups & writer.conflicts_with_groups))): continue - msg = ("No dependency relationship found between " - "'{writer_id}' which writes {var} and " - "'{other_id}' which also accesses {var}. 
" - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % var - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. " + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) # }}} -- GitLab From a9c6c2e46f620d9d229a94d8cf1c454aed8e9659 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 01:19:34 -0500 Subject: [PATCH 28/45] batched implementation of get_usable_inames_for_conditional --- loopy/codegen/bounds.py | 89 ++++++++++++----------- loopy/codegen/control.py | 14 ++-- loopy/codegen/loop.py | 2 +- loopy/schedule/__init__.py | 144 +++++++++++++++++++++++++++++++++---- 4 files changed, 182 insertions(+), 67 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec..c81e8b110 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -55,57 +55,56 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai # {{{ on which inames may a conditional depend? -def get_usable_inames_for_conditional(kernel, sched_index): +def get_usable_inames_for_conditional(kernel, sched_indices): + from loopy.schedule import ( - find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within, + get_subkernel_indices) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag) - result = find_active_inames_at(kernel, sched_index) - crosses_barrier = has_barrier_within(kernel, sched_index) - + active_inames_list = find_active_inames_at(kernel, sched_indices) + crosses_barrier_list = has_barrier_within(kernel, sched_indices) # Find our containing subkernel. Grab inames for all insns from there. - within_subkernel = False - - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): - from loopy.schedule import CallKernel, ReturnFromKernel - if isinstance(sched_item, CallKernel): - within_subkernel = True - subkernel_index = sched_item_index - elif isinstance(sched_item, ReturnFromKernel): - within_subkernel = False - - if not within_subkernel: - # Outside all subkernels - use only inames available to host. 
- return frozenset(result) - - insn_ids_for_subkernel = get_insn_ids_for_block_at( - kernel.schedule, subkernel_index) - - inames_for_subkernel = ( - iname - for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)) - - for iname in inames_for_subkernel: - # Parallel inames are defined within a subkernel, BUT: - # - # - local indices may not be used in conditionals that cross barriers. - # - # - ILP indices and vector lane indices are not available in loop - # bounds, they only get defined at the innermost level of nesting. - - if ( - kernel.iname_tags_of_type(iname, ConcurrentTag) - and not kernel.iname_tags_of_type(iname, VectorizeTag) - and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) - and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) - ): - result.add(iname) - - return frozenset(result) + subkernel_index_list = get_subkernel_indices(kernel, sched_indices) + + inames_for_subkernel = {} + + for subknl_idx in set(idx for idx in subkernel_index_list if idx is not None): + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subknl_idx) + + inames_for_subkernel[subknl_idx] = ( + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)) + + result = [] + + for active_inames, crosses_barrier, subknl_idx in zip(active_inames_list, + crosses_barrier_list, subkernel_index_list): + if subknl_idx is not None: + for iname in inames_for_subkernel[subknl_idx]: + # Parallel inames are defined within a subkernel, BUT: + # + # - local indices may not be used in conditionals that cross barriers + # + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting + + if ( + kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) + and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) + and crosses_barrier) + and not kernel.iname_tags_of_type(iname, IlpBaseTag) + ): + active_inames.add(iname) + + result.append(frozenset(active_inames)) + + return result # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 7319b16ac..c90f4c6b1 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -279,15 +279,17 @@ def build_loop_nest(codegen_state, schedule_index): from loopy.schedule import find_used_inames_within from loopy.codegen.bounds import get_usable_inames_for_conditional + admissible_cond_inames = get_usable_inames_for_conditional(kernel, + my_sched_indices) + sched_index_info_entries = [ ScheduleIndexInfo( - schedule_indices=[i], - admissible_cond_inames=( - get_usable_inames_for_conditional(kernel, i)), - required_predicates=get_required_predicates(kernel, i), - used_inames_within=find_used_inames_within(kernel, i) + schedule_indices=[my_sched_idx], + admissible_cond_inames=admissible_cond_inames[i], + required_predicates=get_required_predicates(kernel, my_sched_idx), + used_inames_within=find_used_inames_within(kernel, my_sched_idx) ) - for i in my_sched_indices + for i, my_sched_idx in enumerate(my_sched_indices) ] sched_index_info_entries = group_by( diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index b3a877988..3824f1bc1 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -353,7 +353,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.codegen.bounds import get_usable_inames_for_conditional # Note: this does not include loop_iname itself! 
- usable_inames = get_usable_inames_for_conditional(kernel, sched_index) + usable_inames = get_usable_inames_for_conditional(kernel, (sched_index,)) domain = kernel.get_inames_domain(loop_iname) result = [] diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 665ef0e09..be0ca73c0 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -106,6 +106,16 @@ class Barrier(ScheduleItem): # {{{ schedule utilities def gather_schedule_block(schedule, start_idx): + """ + Returns a :class:`tuple` of (list of schedule items, index just after the + block) for a block. + + :arg schedule: An instance of :class:`list` of + :class:`loopy.schedule.ScheduleItem`s. + + :arg start_idx: The index of a :class:`loopy.schedule.BeginBlockItem` of + the block whose schedule items are to be returned. + """ assert isinstance(schedule[start_idx], BeginBlockItem) level = 0 @@ -157,32 +167,102 @@ def get_insn_ids_for_block_at(schedule, start_idx): if isinstance(sub_sched_item, RunInstruction)) -def find_active_inames_at(kernel, sched_index): +def find_active_inames_at(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`set`s of inames occuring at + each schedule index in *sched_indices*. + + :arg sched_indices: A list of schedule indices of *kernel*. + """ active_inames = [] + sched_idx_to_active_inames = {0: set()} + + sorted_sched_indices = sorted(sched_indices) from loopy.schedule import EnterLoop, LeaveLoop - for sched_item in kernel.schedule[:sched_index]: + + for sched_idx, sched_item in enumerate( + kernel.schedule[:sorted_sched_indices[-1]]): if isinstance(sched_item, EnterLoop): active_inames.append(sched_item.iname) if isinstance(sched_item, LeaveLoop): active_inames.pop() - return set(active_inames) + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_active_inames[sched_idx+1] = set(active_inames) + sorted_sched_indices.pop(0) + # eventually everythin should be popped + assert sorted_sched_indices in ([], [0]) -def has_barrier_within(kernel, sched_index): - sched_item = kernel.schedule[sched_index] + return [sched_idx_to_active_inames[idx] for idx in sched_indices] - if isinstance(sched_item, BeginBlockItem): - loop_contents, _ = gather_schedule_block( - kernel.schedule, sched_index) - from pytools import any - return any(isinstance(subsched_item, Barrier) - for subsched_item in loop_contents) - elif isinstance(sched_item, Barrier): - return True - else: - return False + +def has_barrier_within(kernel, sched_indices): + """ + Returns a :class:`list` of :class:`bool`s, with an entry for each schedule + index in *sched_indices* denoting if either there is barrier at the + schedule index or the schedule index is a + :class:`loopy.schedule.BeginBlockItem` containing a barrier in the block. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ + sched_idx_to_has_barrier_within = {} + begin_block_sched_indices = [] + + for sched_idx in sched_indices: + sched_item = kernel.schedule[sched_idx] + + if isinstance(sched_item, Barrier): + sched_idx_to_has_barrier_within[sched_idx] = True + elif isinstance(sched_item, BeginBlockItem): + begin_block_sched_indices.append(sched_idx) + else: + sched_idx_to_has_barrier_within[sched_idx] = False + + begin_block_sched_indices.sort() + + for sched_idx in begin_block_sched_indices: + if sched_idx in sched_idx_to_has_barrier_within: + # this block has already been dealt in a previous block + continue + + block_contents, _ = gather_schedule_block( + kernel.schedule, sched_idx) + + level = 1 + # block_stack: list of [sched_idx, has_barrier_within]'s for every + # level of block + block_stack = [[sched_idx, False]] + + for i, sched_item in enumerate(block_contents[1:], start=sched_idx+1): + if level == 0: + break + + if isinstance(sched_item, BeginBlockItem): + level += 1 + + block_stack.append([i, False]) + elif isinstance(sched_item, EndBlockItem): + level -= 1 + + exit_block_sched_idx, exit_block_contains_barrier = block_stack.pop() + + if block_stack: + # inner block contains barrier => outer block contains barrier + block_stack[-1][1] |= exit_block_contains_barrier + + sched_idx_to_has_barrier_within[exit_block_sched_idx] = ( + exit_block_contains_barrier) + elif isinstance(sched_item, Barrier): + block_stack[-1][1] = True + else: + pass + + assert level == 0 + + return [sched_idx_to_has_barrier_within[sched_idx] for sched_idx in + sched_indices] def find_used_inames_within(kernel, sched_index): @@ -415,6 +495,40 @@ def sched_item_to_insn_id(sched_item): and sched_item.originating_insn_id is not None): yield sched_item.originating_insn_id + +def get_subkernel_indices(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`int`s, with an entry for + each schedule index in *sched_indices* denoting the index of + :class:`loopy.schedule.CallKernel` for the subkernel it is in, if no + subkernel contains a schedule index its entry is set to *None*. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ + from loopy.schedule import CallKernel, ReturnFromKernel + + subkernel_index = None + sorted_sched_indices = sorted(sched_indices) + sched_idx_to_subkernel_idx = {0: None} + + for sched_idx, sched_item in enumerate( + kernel.schedule[:sorted_sched_indices[-1]]): + if isinstance(sched_item, CallKernel): + subkernel_index = sched_idx + elif isinstance(sched_item, ReturnFromKernel): + subkernel_index = None + + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_subkernel_idx[sched_idx+1] = subkernel_index + sorted_sched_indices.pop() + + # eventually everythin should be popped + assert sorted_sched_indices in ([], [0]) + + return [sched_idx_to_subkernel_idx[sched_idx] for sched_idx in + sched_indices] + + # }}} -- GitLab From 972f23dd4de1a9713917c7a752c8be945beab369 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 01:25:47 -0500 Subject: [PATCH 29/45] minor typo (bug) fix --- loopy/schedule/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index be0ca73c0..ecb5b103a 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -520,7 +520,7 @@ def get_subkernel_indices(kernel, sched_indices): if sched_idx == (sorted_sched_indices[0]-1): sched_idx_to_subkernel_idx[sched_idx+1] = subkernel_index - sorted_sched_indices.pop() + sorted_sched_indices.pop(0) # eventually everythin should be popped assert sorted_sched_indices in ([], [0]) -- GitLab From 9f9887a16a9e0a957efd8f0d1b79b9a4fb7c0f3d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 02:01:08 -0500 Subject: [PATCH 30/45] bug fixes --- loopy/schedule/__init__.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index ecb5b103a..e6ae063b1 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -181,8 +181,13 @@ def find_active_inames_at(kernel, sched_indices): from loopy.schedule import EnterLoop, LeaveLoop + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + for sched_idx, sched_item in enumerate( - kernel.schedule[:sorted_sched_indices[-1]]): + kernel.schedule[:max_sched_idx]): if isinstance(sched_item, EnterLoop): active_inames.append(sched_item.iname) if isinstance(sched_item, LeaveLoop): @@ -193,7 +198,7 @@ def find_active_inames_at(kernel, sched_indices): sorted_sched_indices.pop(0) # eventually everythin should be popped - assert sorted_sched_indices in ([], [0]) + assert len(sorted_sched_indices) == 0 return [sched_idx_to_active_inames[idx] for idx in sched_indices] @@ -511,8 +516,13 @@ def get_subkernel_indices(kernel, sched_indices): sorted_sched_indices = sorted(sched_indices) sched_idx_to_subkernel_idx = {0: None} + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + for sched_idx, sched_item in enumerate( - kernel.schedule[:sorted_sched_indices[-1]]): + kernel.schedule[:max_sched_idx]): if isinstance(sched_item, CallKernel): subkernel_index = sched_idx elif isinstance(sched_item, ReturnFromKernel): @@ -523,7 +533,7 @@ def get_subkernel_indices(kernel, sched_indices): sorted_sched_indices.pop(0) # eventually everythin should be popped - assert sorted_sched_indices in ([], [0]) + assert len(sorted_sched_indices) == 0 return [sched_idx_to_subkernel_idx[sched_idx] for sched_idx in sched_indices] -- GitLab 
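The helpers reworked in the last few patches, notably find_active_inames_at and
get_subkernel_indices, share one batching pattern: sort the query indices once
and answer all of them during a single left-to-right sweep over the schedule,
instead of re-walking the schedule prefix for every call made from
build_loop_nest. Below is a minimal, self-contained sketch of that pattern; the
(kind, name) schedule representation and the function name are illustrative
assumptions, not loopy's actual ScheduleItem classes:

    def active_loops_at(schedule, query_indices):
        """Return {index: loop names active just before schedule[index]}."""
        wanted = sorted(set(query_indices))
        answers = {}
        active = []   # stack of currently open loops
        pos = 0       # next entry of `wanted` to be answered
        for idx, (kind, name) in enumerate(schedule):
            # answer every query that points at this schedule position
            while pos < len(wanted) and wanted[pos] == idx:
                answers[wanted[pos]] = set(active)
                pos += 1
            if kind == "enter":
                active.append(name)
            elif kind == "leave":
                active.pop()
        while pos < len(wanted):   # queries at or past the end of the schedule
            answers[wanted[pos]] = set(active)
            pos += 1
        return answers

    sched = [("enter", "i"), ("run", "A"), ("enter", "j"),
             ("run", "B"), ("leave", "j"), ("leave", "i")]
    print(active_loops_at(sched, [1, 3, 5]))
    # {1: {'i'}, 3: {'i', 'j'}, 5: {'i'}}  (up to set ordering)

Each query adds only constant work on top of the single sweep, which is the
point of handing a whole list of schedule indices to the batched
get_usable_inames_for_conditional rather than calling it once per index.
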
From c7b0cb89ff0ce1f20a57b99e76e502524e73228c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 02:53:31 -0500 Subject: [PATCH 31/45] bug fix --- loopy/codegen/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 3824f1bc1..c7489e759 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -353,7 +353,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.codegen.bounds import get_usable_inames_for_conditional # Note: this does not include loop_iname itself! - usable_inames = get_usable_inames_for_conditional(kernel, (sched_index,)) + usable_inames, = get_usable_inames_for_conditional(kernel, (sched_index,)) domain = kernel.get_inames_domain(loop_iname) result = [] -- GitLab From e9a103528357debfb8dd8f281766177907e1ab3e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 15:22:40 -0500 Subject: [PATCH 32/45] bug fix: was stepping through the same generator object multiple times --- loopy/codegen/bounds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c81e8b110..051b31607 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -75,10 +75,10 @@ def get_usable_inames_for_conditional(kernel, sched_indices): insn_ids_for_subkernel = get_insn_ids_for_block_at( kernel.schedule, subknl_idx) - inames_for_subkernel[subknl_idx] = ( + inames_for_subkernel[subknl_idx] = [ iname for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)) + for iname in kernel.insn_inames(insn)] result = [] -- GitLab From bd2e63ef56acc5d08c9ff7c02a8e96eb9f43752a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 22 Jun 2020 16:03:55 -0500 Subject: [PATCH 33/45] optimization: decrease complexity of checking which inames are available for conditionals --- loopy/codegen/bounds.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 051b31607..7169700fe 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -63,7 +63,6 @@ def get_usable_inames_for_conditional(kernel, sched_indices): from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag) - active_inames_list = find_active_inames_at(kernel, sched_indices) crosses_barrier_list = has_barrier_within(kernel, sched_indices) # Find our containing subkernel. Grab inames for all insns from there. @@ -75,31 +74,33 @@ def get_usable_inames_for_conditional(kernel, sched_indices): insn_ids_for_subkernel = get_insn_ids_for_block_at( kernel.schedule, subknl_idx) - inames_for_subkernel[subknl_idx] = [ + all_inames_in_the_subknl = [ iname for insn in insn_ids_for_subkernel for iname in kernel.insn_inames(insn)] + def is_eligible_in_conditional(iname): + # Parallel inames are defined within a subkernel, BUT: + # + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting. 
+ return ( + kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) + and not kernel.iname_tags_of_type(iname, IlpBaseTag)) + + inames_for_subkernel[subknl_idx] = [iname for iname in + all_inames_in_the_subknl if is_eligible_in_conditional(iname)] + result = [] for active_inames, crosses_barrier, subknl_idx in zip(active_inames_list, crosses_barrier_list, subkernel_index_list): if subknl_idx is not None: for iname in inames_for_subkernel[subknl_idx]: - # Parallel inames are defined within a subkernel, BUT: - # - # - local indices may not be used in conditionals that cross barriers - # - # - ILP indices and vector lane indices are not available in loop - # bounds, they only get defined at the innermost level of nesting - - if ( - kernel.iname_tags_of_type(iname, ConcurrentTag) - and not kernel.iname_tags_of_type(iname, VectorizeTag) - and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) - and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) - ): + # local indices may not be used in conditionals that cross barriers + if (not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) + and crosses_barrier)): active_inames.add(iname) result.append(frozenset(active_inames)) -- GitLab From 4a5a506d5dfe8935ddefe4783a6fe78301a68f66 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 23 Jun 2020 00:19:24 -0500 Subject: [PATCH 34/45] Call topo sort only once --- loopy/schedule/__init__.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 665ef0e09..49f39dd2f 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -646,6 +646,10 @@ class SchedulerState(ImmutableRecord): Used to produce warnings about deprecated 'boosting' behavior Should be removed along with boostability in 2017.x. + + .. attribute:: lazy_topological_sort_getter + + Used to get a topologically sort of the instructions lazily. 
""" @property @@ -655,8 +659,16 @@ class SchedulerState(ImmutableRecord): else: return None + +class LazyTopologicalSortGetter(object): + """Returns instruction ids of the kernel in a topologically sorted order + """ + + def __init__(self, kernel): + self.kernel = kernel + @memoize_method - def get_insn_ids_in_a_topologically_sorted_order(self): + def __call__(self): from pytools.graph import compute_topological_order rev_dep_map = {insn.id: set() for insn in self.kernel.instructions} @@ -688,7 +700,7 @@ def schedule_as_many_run_insns_as_possible(sched_state): # {{{ topological sort toposorted_insn_ids = tuple(insn_id for insn_id in - sched_state.get_insn_ids_in_a_topologically_sorted_order() if + sched_state.lazy_topological_sort_getter() if insn_id in sched_state.unscheduled_insn_ids and ( sched_state.kernel.id_to_insn[insn_id].within_inames >= frozenset(sched_state.active_inames))) @@ -2009,7 +2021,9 @@ def generate_loop_schedules_inner(kernel, debug_args={}): group_insn_counts=group_insn_counts(kernel), active_group_counts={}, - uses_of_boostability=[]) + uses_of_boostability=[], + lazy_topological_sort_getter=LazyTopologicalSortGetter(kernel), + ) schedule_gen_kwargs = {} if kernel.options.ignore_boostable_into: -- GitLab From 60413eab7bf60c2402be57a282efe32bdb174da4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 23 Jun 2020 01:23:01 -0500 Subject: [PATCH 35/45] Don't recompute insn_ids_to_try if nothing changed --- loopy/schedule/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 49f39dd2f..8d896e7f3 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -739,8 +739,12 @@ def schedule_as_many_run_insns_as_possible(sched_state): updated_unscheduled_insn_ids = ( updated_sched_state.unscheduled_insn_ids - frozenset(newly_scheduled_insn_ids)) + if newly_scheduled_insn_ids: + new_insn_ids_to_try = None + else: + new_insn_ids_to_try = sched_state.insn_ids_to_try updated_sched_state = updated_sched_state.copy( - insn_ids_to_try=None, + insn_ids_to_try=new_insn_ids_to_try, schedule=updated_schedule, scheduled_insn_ids=updated_scheduled_insn_ids, unscheduled_insn_ids=updated_unscheduled_insn_ids, @@ -1149,6 +1153,7 @@ def generate_loop_schedules_internal( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), active_inames=sched_state.active_inames[:-1], + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if last_entered_loop @@ -1366,6 +1371,7 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if iname not in sched_state.prescheduled_inames -- GitLab From e63f1591af1651419ccef6fb7128c4ba817b1437 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 23 Jun 2020 09:49:06 -0500 Subject: [PATCH 36/45] Clarify why lazy --- loopy/schedule/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 8d896e7f3..470172f75 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -649,7 +649,9 @@ class SchedulerState(ImmutableRecord): .. attribute:: lazy_topological_sort_getter - Used to get a topologically sort of the instructions lazily. + Used to get a topological sort of the instructions lazily. + Topological sorting is expensive, so this is done only once and only + when requested. 
""" @property -- GitLab From 549059f8d343dfccbe9413203e943270589c8b79 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 23 Jun 2020 16:01:21 -0500 Subject: [PATCH 37/45] Save some id_to_insn calls Scheduling goes from 177s to 82s --- loopy/schedule/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 470172f75..072432d5a 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -663,7 +663,7 @@ class SchedulerState(ImmutableRecord): class LazyTopologicalSortGetter(object): - """Returns instruction ids of the kernel in a topologically sorted order + """Returns instructions of the kernel in a topologically sorted order """ def __init__(self, kernel): @@ -678,7 +678,8 @@ class LazyTopologicalSortGetter(object): for dep in insn.depends_on: rev_dep_map[dep].add(insn.id) - return compute_topological_order(rev_dep_map) + ids = compute_topological_order(rev_dep_map) + return [self.kernel.id_to_insn[insn_id] for insn_id in ids] def schedule_as_many_run_insns_as_possible(sched_state): @@ -701,11 +702,12 @@ def schedule_as_many_run_insns_as_possible(sched_state): # {{{ topological sort - toposorted_insn_ids = tuple(insn_id for insn_id in + def filter_insn(insn): + return insn.within_inames >= frozenset(sched_state.active_inames) + + toposorted_insn_ids = tuple(insn.id for insn in sched_state.lazy_topological_sort_getter() if - insn_id in sched_state.unscheduled_insn_ids and ( - sched_state.kernel.id_to_insn[insn_id].within_inames >= - frozenset(sched_state.active_inames))) + insn.id in sched_state.unscheduled_insn_ids and filter_insn(insn)) # }}} -- GitLab From 325444ec55c6165b73c94c5d6abf4aa39ae188d3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 23 Jun 2020 17:45:51 -0500 Subject: [PATCH 38/45] bug fix: wrong variable name --- loopy/check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index d1923144e..600f5670d 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -619,7 +619,7 @@ def _check_variable_access_ordered_inner(kernel): # Do not enforce ordering for aliasing-based relationships # in different groups. 
if (is_relationship_by_aliasing and ( - bool(insn.groups & other.conflicts_with_groups) + bool(writer.groups & other.conflicts_with_groups) or bool(other.groups & writer.conflicts_with_groups))): continue -- GitLab From 4bc450a79617f7e3945d93ee3d1ed2e7e8fd2aa4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 24 Jun 2020 13:36:42 -0500 Subject: [PATCH 39/45] [schedule]: schedule_as_many_run_insns_as_possible not performing correctly in the case of parallel_inames --- loopy/schedule/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 072432d5a..2f33fb0c9 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -702,8 +702,10 @@ def schedule_as_many_run_insns_as_possible(sched_state): # {{{ topological sort + have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames + def filter_insn(insn): - return insn.within_inames >= frozenset(sched_state.active_inames) + return insn.within_inames >= have_inames toposorted_insn_ids = tuple(insn.id for insn in sched_state.lazy_topological_sort_getter() if @@ -722,7 +724,8 @@ def schedule_as_many_run_insns_as_possible(sched_state): for insn_id in toposorted_insn_ids: insn = sched_state.kernel.id_to_insn[insn_id] if isinstance(insn, MultiAssignmentBase): - if insn.within_inames == frozenset(sched_state.active_inames): + if (insn.within_inames - sched_state.parallel_inames) == frozenset( + sched_state.active_inames): num_insns_to_be_scheduled += 1 continue break -- GitLab From c18ce2d7bbc8f37f5074ce4666bd1c57164ef119 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 24 Jun 2020 15:59:45 -0500 Subject: [PATCH 40/45] Reduce duplicate inames by using a set --- loopy/codegen/bounds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 7169700fe..90b1b91f7 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -74,10 +74,10 @@ def get_usable_inames_for_conditional(kernel, sched_indices): insn_ids_for_subkernel = get_insn_ids_for_block_at( kernel.schedule, subknl_idx) - all_inames_in_the_subknl = [ + all_inames_in_the_subknl = set([ iname for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)] + for iname in kernel.insn_inames(insn)]) def is_eligible_in_conditional(iname): # Parallel inames are defined within a subkernel, BUT: -- GitLab From b88b3f5a08975717ecc7895691e1803f4e83dcf4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 24 Jun 2020 17:01:14 -0500 Subject: [PATCH 41/45] Don't run filtering eagerly --- loopy/schedule/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2f33fb0c9..e92774df0 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -704,12 +704,7 @@ def schedule_as_many_run_insns_as_possible(sched_state): have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames - def filter_insn(insn): - return insn.within_inames >= have_inames - - toposorted_insn_ids = tuple(insn.id for insn in - sched_state.lazy_topological_sort_getter() if - insn.id in sched_state.unscheduled_insn_ids and filter_insn(insn)) + toposorted_insns = sched_state.lazy_topological_sort_getter() # }}} @@ -721,16 +716,21 @@ def schedule_as_many_run_insns_as_possible(sched_state): num_insns_to_be_scheduled = 0 - for insn_id in toposorted_insn_ids: - insn = 
sched_state.kernel.id_to_insn[insn_id] + newly_scheduled_insn_ids = [] + + for insn in toposorted_insns: + if insn.id in sched_state.unscheduled_insn_ids: + continue + if not insn.within_inames >= have_inames: + continue if isinstance(insn, MultiAssignmentBase): if (insn.within_inames - sched_state.parallel_inames) == frozenset( sched_state.active_inames): + newly_scheduled_insn_ids.append(insn.id) num_insns_to_be_scheduled += 1 continue break - newly_scheduled_insn_ids = toposorted_insn_ids[:num_insns_to_be_scheduled] num_presched_insns_newly_scheduled = len(set(newly_scheduled_insn_ids) & sched_state.prescheduled_insn_ids) -- GitLab From acfd2510e523b7fb06a1a7eaff18e662a1a9ff69 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 24 Jun 2020 17:16:09 -0500 Subject: [PATCH 42/45] Remove lazy computing of instruction sort order --- loopy/schedule/__init__.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index e92774df0..aeda3751b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools import memoize_method, MinRecursionLimit, ProcessLogger +from pytools import MinRecursionLimit, ProcessLogger from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder @@ -647,11 +647,9 @@ class SchedulerState(ImmutableRecord): Used to produce warnings about deprecated 'boosting' behavior Should be removed along with boostability in 2017.x. - .. attribute:: lazy_topological_sort_getter + .. attribute:: insns_in_topologically_sorted_order - Used to get a topological sort of the instructions lazily. - Topological sorting is expensive, so this is done only once and only - when requested. 
+ A list of loopy :class:`Instruction` objects in topologically sorted order """ @property @@ -662,24 +660,16 @@ class SchedulerState(ImmutableRecord): return None -class LazyTopologicalSortGetter(object): - """Returns instructions of the kernel in a topologically sorted order - """ - - def __init__(self, kernel): - self.kernel = kernel - - @memoize_method - def __call__(self): - from pytools.graph import compute_topological_order +def get_insns_in_topologically_sorted_order(kernel): + from pytools.graph import compute_topological_order - rev_dep_map = {insn.id: set() for insn in self.kernel.instructions} - for insn in self.kernel.instructions: - for dep in insn.depends_on: - rev_dep_map[dep].add(insn.id) + rev_dep_map = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + rev_dep_map[dep].add(insn.id) - ids = compute_topological_order(rev_dep_map) - return [self.kernel.id_to_insn[insn_id] for insn_id in ids] + ids = compute_topological_order(rev_dep_map) + return [kernel.id_to_insn[insn_id] for insn_id in ids] def schedule_as_many_run_insns_as_possible(sched_state): @@ -704,7 +694,7 @@ def schedule_as_many_run_insns_as_possible(sched_state): have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames - toposorted_insns = sched_state.lazy_topological_sort_getter() + toposorted_insns = sched_state.insns_in_topologically_sorted_order # }}} @@ -2035,7 +2025,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): active_group_counts={}, uses_of_boostability=[], - lazy_topological_sort_getter=LazyTopologicalSortGetter(kernel), + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), ) schedule_gen_kwargs = {} -- GitLab From 0be1e6c6a30c7b2284b9a64027f652f439b1fca0 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 24 Jun 2020 17:26:18 -0500 Subject: [PATCH 43/45] Fix condition typo --- loopy/schedule/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index aeda3751b..57e5c6949 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -709,7 +709,7 @@ def schedule_as_many_run_insns_as_possible(sched_state): newly_scheduled_insn_ids = [] for insn in toposorted_insns: - if insn.id in sched_state.unscheduled_insn_ids: + if not insn.id in sched_state.unscheduled_insn_ids: continue if not insn.within_inames >= have_inames: continue -- GitLab From 8915362221bdf6e22039f4e2ab59e5250ea1b90e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 24 Jun 2020 21:26:25 -0500 Subject: [PATCH 44/45] [scheduler]: ensure deps are not violated as insns are scheduled in batches --- loopy/schedule/__init__.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 57e5c6949..d0efb98e8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -690,34 +690,30 @@ def schedule_as_many_run_insns_as_possible(sched_state): # cannot schedule RunInstructions when not in subkernel return sched_state - # {{{ topological sort - have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames toposorted_insns = sched_state.insns_in_topologically_sorted_order - # }}} - # select the top instructions in toposorted_insns only which have active # inames corresponding to those of sched_state from loopy.kernel.instruction import MultiAssignmentBase updated_sched_state = 
sched_state.copy() - num_insns_to_be_scheduled = 0 - newly_scheduled_insn_ids = [] + ignored_unscheduled_insn_ids = set() for insn in toposorted_insns: - if not insn.id in sched_state.unscheduled_insn_ids: + if insn.id in sched_state.scheduled_insn_ids: continue if not insn.within_inames >= have_inames: + ignored_unscheduled_insn_ids.add(insn.id) continue if isinstance(insn, MultiAssignmentBase): if (insn.within_inames - sched_state.parallel_inames) == frozenset( - sched_state.active_inames): + sched_state.active_inames) and not (insn.depends_on & + ignored_unscheduled_insn_ids): newly_scheduled_insn_ids.append(insn.id) - num_insns_to_be_scheduled += 1 continue break -- GitLab From 01e169ae42f07625e383a356df1343b9fb41f634 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 24 Jun 2020 21:50:34 -0500 Subject: [PATCH 45/45] changes in docs after changing the scheduler --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1b017f701..23057cb13 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -957,8 +957,8 @@ Consider the following example: if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { - a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; + a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; } barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) -- GitLab
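
The instruction reordering visible in the tutorial output above (acc_k = 0.0f
now precedes the a_temp fetch) reflects the eager batching added in this
series: once a loop nest is open, schedule_as_many_run_insns_as_possible emits
the ready assignments as one topologically ordered batch, and PATCH 44 only
lets an instruction join the batch if none of its dependencies were skipped
earlier in the sweep. A simplified, self-contained sketch of that rule follows;
the tuple-based instruction representation and the function name are
assumptions for illustration, not loopy's scheduler state:

    def take_ready_batch(toposorted_insns, scheduled, active_inames):
        """Return ids of instructions that may be emitted right now, in order.

        toposorted_insns: list of (insn_id, within_inames, depends_on) tuples,
            already in a topological order of the dependency graph
        scheduled: set of instruction ids that were already emitted
        active_inames: frozenset of currently open loop inames
        """
        batch = []
        skipped = set()
        for insn_id, within_inames, depends_on in toposorted_insns:
            if insn_id in scheduled:
                continue
            # emit only instructions that belong exactly to the open loops and
            # whose dependencies were not skipped earlier in this sweep
            if within_inames == active_inames and not (depends_on & skipped):
                batch.append(insn_id)
            else:
                skipped.add(insn_id)
        return batch

    # loosely mirrors the tutorial kernel above: two assignments in the outer
    # loop and an update that also needs the k loop
    insns = [
        ("init_acc", frozenset({"i"}), frozenset()),
        ("fetch_a", frozenset({"i"}), frozenset()),
        ("update", frozenset({"i", "k"}), frozenset({"init_acc", "fetch_a"})),
    ]
    print(take_ready_batch(insns, set(), frozenset({"i"})))
    # ['init_acc', 'fetch_a']  (the k-loop instruction is left for a later batch)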