diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1b017f701f8161e93c4fdc1c14644dfe4b4fa74c..23057cb13048c029fbc3db5ebacf58696b039286 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -957,8 +957,8 @@ Consider the following example: if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { - a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; + a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; } barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) diff --git a/loopy/check.py b/loopy/check.py index da49c1d116df1a9fbf92e8ef41822b6741405604..600f5670d7fea3bc201c231061f4442201aee5cb 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -449,45 +449,9 @@ def check_has_schedulable_iname_nesting(kernel): # {{{ check_variable_access_ordered -class IndirectDependencyEdgeFinder(object): - def __init__(self, kernel): - self.kernel = kernel - self.dep_edge_cache = {} - - def __call__(self, depender_id, dependee_id): - cache_key = (depender_id, dependee_id) - - try: - result = self.dep_edge_cache[cache_key] - except KeyError: - pass - else: - if result is None: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("when " - "checking for dependency edge between " - "depender '%s' and dependee '%s'" - % (depender_id, dependee_id)) - else: - return result - - depender = self.kernel.id_to_insn[depender_id] - - if dependee_id in depender.depends_on: - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = None - for dep in depender.depends_on: - if self(dep, dependee_id): - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = False - return False - - def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): + dep_a = kernel.id_to_insn[dep_a] + dep_b = kernel.id_to_insn[dep_b] from loopy.kernel.data import AddressSpace if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] @@ -510,116 
+474,181 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync +def _get_address_space(kernel, var): + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg + if var in kernel.temporary_variables: + address_space = kernel.temporary_variables[var].address_space + else: + arg = kernel.arg_dict[var] + if isinstance(arg, ArrayArg): + address_space = arg.address_space + elif isinstance(arg, ValueArg): + address_space = AddressSpace.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine address_space of '%s'" % var) + return address_space + + +def _get_topological_order(kernel): + from pytools.graph import compute_sccs + from loopy.diagnostic import DependencyCycleFound + + dep_map = {insn.id: insn.depends_on for insn in kernel.instructions} + + sccs = compute_sccs(dep_map) + order = [] + + for scc in sccs: + if len(scc) != 1: + raise DependencyCycleFound(', '.join(scc)) + order.append(scc[0]) + + return order + + def _check_variable_access_ordered_inner(kernel): + from loopy.kernel.tools import find_aliasing_equivalence_classes + from loopy.symbolic import AccessRangeOverlapChecker + overlap_checker = AccessRangeOverlapChecker(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + logger.debug("%s: check_variable_access_ordered: start" % kernel.name) - checked_variables = kernel.get_written_variables() & ( - set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + # dep_reqs_to_vars: A mapping from (writer_id, dep_req_id) between whom + # dependency must be established to the variables which prompted the + # dependency requirement. 
+ dep_reqs_to_vars = {} wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg - from loopy.kernel.tools import find_aliasing_equivalence_classes + # {{{ populate 'dep_reqs_to_vars' - depfind = IndirectDependencyEdgeFinder(kernel) - aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) - - for name in checked_variables: - # This is a tad redundant in that this could probably be restructured - # to iterate only over equivalence classes and not individual variables. - # But then the access-range overlap check below would have to be smarter. - eq_class = aliasing_equiv_classes[name] + for var in kernel.get_written_variables(): + address_space = _get_address_space(kernel, var) + eq_class = aliasing_equiv_classes[var] readers = set.union( *[rmap.get(eq_name, set()) for eq_name in eq_class]) writers = set.union( *[wmap.get(eq_name, set()) for eq_name in eq_class]) - unaliased_readers = rmap.get(name, set()) - unaliased_writers = wmap.get(name, set()) - - if not writers: - continue - - if name in kernel.temporary_variables: - address_space = kernel.temporary_variables[name].address_space - else: - arg = kernel.arg_dict[name] - if isinstance(arg, ArrayArg): - address_space = arg.address_space - elif isinstance(arg, ValueArg): - address_space = AddressSpace.PRIVATE - else: - # No need to consider ConstantArg and ImageArg (for now) - # because those won't be written. - raise ValueError("could not determine address_space of '%s'" % name) - - # Check even for PRIVATE address space, to ensure intentional program order. 
- from loopy.symbolic import AccessRangeOverlapChecker - overlap_checker = AccessRangeOverlapChecker(kernel) + for writer in writers: + required_deps = (readers | writers) - set([writer]) + required_deps = set([req_dep for req_dep in required_deps if not + declares_nosync_with(kernel, address_space, writer, + req_dep)]) - for writer_id in writers: - for other_id in readers | writers: - if writer_id == other_id: - continue - - writer = kernel.id_to_insn[writer_id] - other = kernel.id_to_insn[other_id] + for req_dep in required_deps: + dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var) - has_dependency_relationship = ( - declares_nosync_with(kernel, address_space, other, writer) - or - depfind(writer_id, other_id) - or - depfind(other_id, writer_id) - ) + # }}} - if has_dependency_relationship: - continue + # depends_on: mapping from insn_ids to their dependencies + depends_on = dict((insn.id, set()) for insn in + kernel.instructions) + # rev_depends: mapping from insn_ids to their reverse deps. + rev_depends = dict((insn.id, set()) for insn in + kernel.instructions) - is_relationship_by_aliasing = not ( - writer_id in unaliased_writers - and (other_id in unaliased_writers - or other_id in unaliased_readers)) + # {{{ populate rev_depends, depends_on - # Do not enforce ordering for disjoint access ranges - if (not is_relationship_by_aliasing and not - overlap_checker.do_access_ranges_overlap_conservative( - writer_id, "w", other_id, "any", name)): - continue + for insn in kernel.instructions: + depends_on[insn.id].update(insn.depends_on) + for dep in insn.depends_on: + rev_depends[dep].add(insn.id) + # }}} + + topological_order = _get_topological_order(kernel) + + def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order): + """ + Subtracts dependency requirements of insn_ids by all direct/indirect + predecessors of a directed graph of insn_ids as nodes and *edges* as + the connectivity. 
+ + :arg order: An instance of :class:`list` of instruction ids in which the + *edges* graph is to be traversed. + """ + # memoized_predecessors: mapping from insn_id to its direct/indirect + # predecessors + memoized_predecessors = {} + + # reverse postorder traversal of dependency graph + for insn_id in order: + # accumulated_predecessors:insn_id's direct+indirect predecessors + accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) + + for pred in accumulated_predecessors: + dep_reqs_to_vars.pop((insn_id, pred), None) + + for successor in edges[insn_id]: + memoized_predecessors.setdefault(successor, set()).update( + accumulated_predecessors | set([insn_id])) + + # forward dep. graph traversal in reverse topological sort order + discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on, + topological_order[::-1]) + # reverse dep. graph traversal in topological sort order + discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order) + + # {{{ handle dependency requirements that weren't satisfied + + for (writer_id, other_id), variables in six.iteritems(dep_reqs_to_vars): + writer = kernel.id_to_insn[writer_id] + other = kernel.id_to_insn[other_id] + + for var in variables: + eq_class = aliasing_equiv_classes[var] + unaliased_readers = rmap.get(var, set()) + unaliased_writers = wmap.get(var, set()) + + is_relationship_by_aliasing = not ( + writer_id in unaliased_writers + and (other_id in unaliased_writers + or other_id in unaliased_readers)) + + # Do not enforce ordering for disjoint access ranges + if (not is_relationship_by_aliasing and not + overlap_checker.do_access_ranges_overlap_conservative( + writer_id, "w", other_id, "any", var)): + continue - # Do not enforce ordering for aliasing-based relationships - # in different groups. 
- if (is_relationship_by_aliasing and ( - bool(writer.groups & other.conflicts_with_groups) - or - bool(other.groups & writer.conflicts_with_groups))): - continue + # Do not enforce ordering for aliasing-based relationships + # in different groups. + if (is_relationship_by_aliasing and ( + bool(writer.groups & other.conflicts_with_groups) + or + bool(other.groups & writer.conflicts_with_groups))): + continue - msg = ("No dependency relationship found between " - "'{writer_id}' which writes {var} and " - "'{other_id}' which also accesses {var}. " - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % name - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. 
" + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + + # }}} logger.debug("%s: check_variable_access_ordered: done" % kernel.name) @@ -628,7 +657,7 @@ def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.Instruction.no_sync_with`) """ diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec1dadb842e12453fbec3b68e831338f6..90b1b91f70b56f7da21705e573f35dfff79dbe65 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -55,57 +55,57 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai # {{{ on which inames may a conditional depend? 
-def get_usable_inames_for_conditional(kernel, sched_index): +def get_usable_inames_for_conditional(kernel, sched_indices): + from loopy.schedule import ( - find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within, + get_subkernel_indices) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag) - - result = find_active_inames_at(kernel, sched_index) - crosses_barrier = has_barrier_within(kernel, sched_index) - + active_inames_list = find_active_inames_at(kernel, sched_indices) + crosses_barrier_list = has_barrier_within(kernel, sched_indices) # Find our containing subkernel. Grab inames for all insns from there. - within_subkernel = False - - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): - from loopy.schedule import CallKernel, ReturnFromKernel - if isinstance(sched_item, CallKernel): - within_subkernel = True - subkernel_index = sched_item_index - elif isinstance(sched_item, ReturnFromKernel): - within_subkernel = False - - if not within_subkernel: - # Outside all subkernels - use only inames available to host. - return frozenset(result) - - insn_ids_for_subkernel = get_insn_ids_for_block_at( - kernel.schedule, subkernel_index) - - inames_for_subkernel = ( - iname - for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)) - - for iname in inames_for_subkernel: - # Parallel inames are defined within a subkernel, BUT: - # - # - local indices may not be used in conditionals that cross barriers. - # - # - ILP indices and vector lane indices are not available in loop - # bounds, they only get defined at the innermost level of nesting. 
- - if ( - kernel.iname_tags_of_type(iname, ConcurrentTag) - and not kernel.iname_tags_of_type(iname, VectorizeTag) - and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) - and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) - ): - result.add(iname) - - return frozenset(result) + subkernel_index_list = get_subkernel_indices(kernel, sched_indices) + + inames_for_subkernel = {} + + for subknl_idx in set(idx for idx in subkernel_index_list if idx is not None): + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subknl_idx) + + all_inames_in_the_subknl = set([ + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)]) + + def is_eligible_in_conditional(iname): + # Parallel inames are defined within a subkernel, BUT: + # + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting. + return ( + kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) + and not kernel.iname_tags_of_type(iname, IlpBaseTag)) + + inames_for_subkernel[subknl_idx] = [iname for iname in + all_inames_in_the_subknl if is_eligible_in_conditional(iname)] + + result = [] + + for active_inames, crosses_barrier, subknl_idx in zip(active_inames_list, + crosses_barrier_list, subkernel_index_list): + if subknl_idx is not None: + for iname in inames_for_subkernel[subknl_idx]: + # local indices may not be used in conditionals that cross barriers + if (not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) + and crosses_barrier)): + active_inames.add(iname) + + result.append(frozenset(active_inames)) + + return result # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 7319b16ac2fe9f39872558a3878161b89cab15d9..c90f4c6b1b92ffa4648804555b1716d5c190759e 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -279,15 +279,17 @@ def build_loop_nest(codegen_state, 
schedule_index): from loopy.schedule import find_used_inames_within from loopy.codegen.bounds import get_usable_inames_for_conditional + admissible_cond_inames = get_usable_inames_for_conditional(kernel, + my_sched_indices) + sched_index_info_entries = [ ScheduleIndexInfo( - schedule_indices=[i], - admissible_cond_inames=( - get_usable_inames_for_conditional(kernel, i)), - required_predicates=get_required_predicates(kernel, i), - used_inames_within=find_used_inames_within(kernel, i) + schedule_indices=[my_sched_idx], + admissible_cond_inames=admissible_cond_inames[i], + required_predicates=get_required_predicates(kernel, my_sched_idx), + used_inames_within=find_used_inames_within(kernel, my_sched_idx) ) - for i in my_sched_indices + for i, my_sched_idx in enumerate(my_sched_indices) ] sched_index_info_entries = group_by( diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index b3a87798840bb1624d350c79830f29142e54ab6c..c7489e759b5e4217837db48dd462d0bd23d2bcea 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -353,7 +353,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.codegen.bounds import get_usable_inames_for_conditional # Note: this does not include loop_iname itself! 
- usable_inames = get_usable_inames_for_conditional(kernel, sched_index) + usable_inames, = get_usable_inames_for_conditional(kernel, (sched_index,)) domain = kernel.get_inames_domain(loop_iname) result = [] diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 2d926aad4faa511aa2919630c9b0e96b7f253ad9..ddd245261ab0f064e25060122d4d6af65e889c58 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -744,9 +744,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ iname wrangling + @memoize_method def iname_tags(self, iname): return self.iname_to_tags.get(iname, frozenset()) + @memoize_method def iname_tags_of_type(self, iname, tag_type_or_types, max_num=None, min_num=None): """Return a subset of *tags* that matches type *tag_type*. Raises exception diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 032cdc2760597f1fa6f701a8a88252312deac797..43a2b2657ba9f192828c72b0dd700271e594b0d7 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -106,6 +106,16 @@ class Barrier(ScheduleItem): # {{{ schedule utilities def gather_schedule_block(schedule, start_idx): + """ + Returns a :class:`tuple` of (list of schedule items, index just after the + block) for a block. + + :arg schedule: An instance of :class:`list` of + :class:`loopy.schedule.ScheduleItem`s. + + :arg start_idx: The index of a :class:`loopy.schedule.BeginBlockItem` of + the block whose schedule items are to be returned. + """ assert isinstance(schedule[start_idx], BeginBlockItem) level = 0 @@ -157,32 +167,107 @@ def get_insn_ids_for_block_at(schedule, start_idx): if isinstance(sub_sched_item, RunInstruction)) -def find_active_inames_at(kernel, sched_index): +def find_active_inames_at(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`set`s of inames occurring at + each schedule index in *sched_indices*. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ active_inames = [] + sched_idx_to_active_inames = {0: set()} + + sorted_sched_indices = sorted(sched_indices) from loopy.schedule import EnterLoop, LeaveLoop - for sched_item in kernel.schedule[:sched_index]: + + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + + for sched_idx, sched_item in enumerate( + kernel.schedule[:max_sched_idx]): if isinstance(sched_item, EnterLoop): active_inames.append(sched_item.iname) if isinstance(sched_item, LeaveLoop): active_inames.pop() - return set(active_inames) + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_active_inames[sched_idx+1] = set(active_inames) + sorted_sched_indices.pop(0) + # eventually everythin should be popped + assert len(sorted_sched_indices) == 0 -def has_barrier_within(kernel, sched_index): - sched_item = kernel.schedule[sched_index] + return [sched_idx_to_active_inames[idx] for idx in sched_indices] - if isinstance(sched_item, BeginBlockItem): - loop_contents, _ = gather_schedule_block( - kernel.schedule, sched_index) - from pytools import any - return any(isinstance(subsched_item, Barrier) - for subsched_item in loop_contents) - elif isinstance(sched_item, Barrier): - return True - else: - return False + +def has_barrier_within(kernel, sched_indices): + """ + Returns a :class:`list` of :class:`bool`s, with an entry for each schedule + index in *sched_indices* denoting if either there is barrier at the + schedule index or the schedule index is a + :class:`loopy.schedule.BeginBlockItem` containing a barrier in the block. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ + sched_idx_to_has_barrier_within = {} + begin_block_sched_indices = [] + + for sched_idx in sched_indices: + sched_item = kernel.schedule[sched_idx] + + if isinstance(sched_item, Barrier): + sched_idx_to_has_barrier_within[sched_idx] = True + elif isinstance(sched_item, BeginBlockItem): + begin_block_sched_indices.append(sched_idx) + else: + sched_idx_to_has_barrier_within[sched_idx] = False + + begin_block_sched_indices.sort() + + for sched_idx in begin_block_sched_indices: + if sched_idx in sched_idx_to_has_barrier_within: + # this block has already been dealt in a previous block + continue + + block_contents, _ = gather_schedule_block( + kernel.schedule, sched_idx) + + level = 1 + # block_stack: list of [sched_idx, has_barrier_within]'s for every + # level of block + block_stack = [[sched_idx, False]] + + for i, sched_item in enumerate(block_contents[1:], start=sched_idx+1): + if level == 0: + break + + if isinstance(sched_item, BeginBlockItem): + level += 1 + + block_stack.append([i, False]) + elif isinstance(sched_item, EndBlockItem): + level -= 1 + + exit_block_sched_idx, exit_block_contains_barrier = block_stack.pop() + + if block_stack: + # inner block contains barrier => outer block contains barrier + block_stack[-1][1] |= exit_block_contains_barrier + + sched_idx_to_has_barrier_within[exit_block_sched_idx] = ( + exit_block_contains_barrier) + elif isinstance(sched_item, Barrier): + block_stack[-1][1] = True + else: + pass + + assert level == 0 + + return [sched_idx_to_has_barrier_within[sched_idx] for sched_idx in + sched_indices] def find_used_inames_within(kernel, sched_index): @@ -415,6 +500,45 @@ def sched_item_to_insn_id(sched_item): and sched_item.originating_insn_id is not None): yield sched_item.originating_insn_id + +def get_subkernel_indices(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`int`s, with an entry for + each schedule index in *sched_indices* denoting the index of + 
:class:`loopy.schedule.CallKernel` for the subkernel it is in, if no + subkernel contains a schedule index its entry is set to *None*. + + :arg sched_indices: A list of schedule indices of *kernel*. + """ + from loopy.schedule import CallKernel, ReturnFromKernel + + subkernel_index = None + sorted_sched_indices = sorted(sched_indices) + sched_idx_to_subkernel_idx = {0: None} + + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + + for sched_idx, sched_item in enumerate( + kernel.schedule[:max_sched_idx]): + if isinstance(sched_item, CallKernel): + subkernel_index = sched_idx + elif isinstance(sched_item, ReturnFromKernel): + subkernel_index = None + + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_subkernel_idx[sched_idx+1] = subkernel_index + sorted_sched_indices.pop(0) + + # eventually everything should be popped + assert len(sorted_sched_indices) == 0 + + return [sched_idx_to_subkernel_idx[sched_idx] for sched_idx in + sched_indices] + + # }}} @@ -571,6 +695,45 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_priority + #FIXME: incorrect docs. See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -586,6 +711,10 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + #FIXME: docs? + .. attribute:: active_inames A tuple of active inames. @@ -641,6 +770,10 @@ class SchedulerState(ImmutableRecord): Used to produce warnings about deprecated 'boosting' behavior Should be removed along with boostability in 2017.x. + + .. 
attribute:: insns_in_topologically_sorted_order + + A list of loopy :class:`Instruction` objects in topologically sorted order """ @property @@ -651,6 +784,93 @@ class SchedulerState(ImmutableRecord): return None +def get_insns_in_topologically_sorted_order(kernel): + from pytools.graph import compute_topological_order + + rev_dep_map = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + rev_dep_map[dep].add(insn.id) + + ids = compute_topological_order(rev_dep_map) + return [kernel.id_to_insn[insn_id] for insn_id in ids] + + +def schedule_as_many_run_insns_as_possible(sched_state): + """ + Returns an instance of :class:`loopy.schedule.SchedulerState`, by appending + all available instructions in the current loop nesting to the schedule. + """ + + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + + if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel, Barrier)): + return sched_state + + if not sched_state.within_subkernel: + # cannot schedule RunInstructions when not in subkernel + return sched_state + + have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames + + toposorted_insns = sched_state.insns_in_topologically_sorted_order + + # select the top instructions in toposorted_insns only which have active + # inames corresponding to those of sched_state + from loopy.kernel.instruction import MultiAssignmentBase + + updated_sched_state = sched_state.copy() + + newly_scheduled_insn_ids = [] + ignored_unscheduled_insn_ids = set() + + for insn in toposorted_insns: + if insn.id in sched_state.scheduled_insn_ids: + continue + if not insn.within_inames >= have_inames: + ignored_unscheduled_insn_ids.add(insn.id) + continue + if isinstance(insn, MultiAssignmentBase): + if (insn.within_inames - sched_state.parallel_inames) == frozenset( + sched_state.active_inames) and not (insn.depends_on & + 
ignored_unscheduled_insn_ids): + newly_scheduled_insn_ids.append(insn.id) + continue + break + + num_presched_insns_newly_scheduled = len(set(newly_scheduled_insn_ids) & + sched_state.prescheduled_insn_ids) + + assert all(isinstance(sched_item, RunInstruction) and sched_item.insn_id in + newly_scheduled_insn_ids for sched_item in + sched_state.preschedule[:num_presched_insns_newly_scheduled]) + sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in + newly_scheduled_insn_ids) + + updated_schedule = updated_sched_state.schedule + sched_items + updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids + | frozenset(newly_scheduled_insn_ids)) + updated_unscheduled_insn_ids = ( + updated_sched_state.unscheduled_insn_ids + - frozenset(newly_scheduled_insn_ids)) + if newly_scheduled_insn_ids: + new_insn_ids_to_try = None + else: + new_insn_ids_to_try = sched_state.insn_ids_to_try + updated_sched_state = updated_sched_state.copy( + insn_ids_to_try=new_insn_ids_to_try, + schedule=updated_schedule, + scheduled_insn_ids=updated_scheduled_insn_ids, + unscheduled_insn_ids=updated_unscheduled_insn_ids, + preschedule=sched_state.preschedule[num_presched_insns_newly_scheduled:] + ) + + return updated_sched_state + + def generate_loop_schedules_internal( sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop @@ -664,6 +884,10 @@ def generate_loop_schedules_internal( else: rec_allow_boost = False + if not rec_allow_boost: + sched_state = ( + schedule_as_many_run_insns_as_possible(sched_state)) + active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( @@ -1041,19 +1265,20 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: - - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), 
active_inames=sched_state.active_inames[:-1], + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if last_entered_loop not in sched_state.prescheduled_inames - else sched_state.preschedule[1:]), - ), + else sched_state.preschedule[1:])) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -1254,8 +1479,7 @@ def generate_loop_schedules_internal( iname), reverse=True): - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (EnterLoop(iname=iname),)), @@ -1264,11 +1488,15 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), - ), + ) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True @@ -1916,7 +2144,10 @@ def generate_loop_schedules_inner(kernel, debug_args={}): group_insn_counts=group_insn_counts(kernel), active_group_counts={}, - uses_of_boostability=[]) + uses_of_boostability=[], + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), + ) schedule_gen_kwargs = {} if kernel.options.ignore_boostable_into: