diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 1b017f701f8161e93c4fdc1c14644dfe4b4fa74c..23057cb13048c029fbc3db5ebacf58696b039286 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -957,8 +957,8 @@ Consider the following example:
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
-        a_temp[lid(0)] = a[16 * gid(0) + lid(0)];
         acc_k = 0.0f;
+        a_temp[lid(0)] = a[16 * gid(0) + lid(0)];
       }
       barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
diff --git a/loopy/check.py b/loopy/check.py
index da49c1d116df1a9fbf92e8ef41822b6741405604..c264123973fc579ce2d676c2cf99b73b62c3bddb 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -449,45 +449,9 @@ def check_has_schedulable_iname_nesting(kernel):
 
 # {{{ check_variable_access_ordered
 
-class IndirectDependencyEdgeFinder(object):
-    def __init__(self, kernel):
-        self.kernel = kernel
-        self.dep_edge_cache = {}
-
-    def __call__(self, depender_id, dependee_id):
-        cache_key = (depender_id, dependee_id)
-
-        try:
-            result = self.dep_edge_cache[cache_key]
-        except KeyError:
-            pass
-        else:
-            if result is None:
-                from loopy.diagnostic import DependencyCycleFound
-                raise DependencyCycleFound("when "
-                        "checking for dependency edge between "
-                        "depender '%s' and dependee '%s'"
-                        % (depender_id, dependee_id))
-            else:
-                return result
-
-        depender = self.kernel.id_to_insn[depender_id]
-
-        if dependee_id in depender.depends_on:
-            self.dep_edge_cache[cache_key] = True
-            return True
-
-        self.dep_edge_cache[cache_key] = None
-        for dep in depender.depends_on:
-            if self(dep, dependee_id):
-                self.dep_edge_cache[cache_key] = True
-                return True
-
-        self.dep_edge_cache[cache_key] = False
-        return False
-
-
 def declares_nosync_with(kernel, var_address_space, dep_a, dep_b):
+    dep_a = kernel.id_to_insn[dep_a]
+    dep_b = kernel.id_to_insn[dep_b]
     from loopy.kernel.data import AddressSpace
     if var_address_space == AddressSpace.GLOBAL:
         search_scopes = ["global", "any"]
@@ -510,125 +474,203 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b):
     return ab_nosync and ba_nosync
 
 
+def _get_address_space(kernel, var):
+    from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg
+    if var in kernel.temporary_variables:
+        address_space = kernel.temporary_variables[var].address_space
+    else:
+        arg = kernel.arg_dict[var]
+        if isinstance(arg, ArrayArg):
+            address_space = arg.address_space
+        elif isinstance(arg, ValueArg):
+            address_space = AddressSpace.PRIVATE
+        else:
+            # No need to consider ConstantArg and ImageArg (for now)
+            # because those won't be written.
+            raise ValueError("could not determine address_space of '%s'" % var)
+    return address_space
+
+
+def _get_topological_order(kernel):
+    from pytools.graph import compute_sccs
+    from loopy.diagnostic import DependencyCycleFound
+
+    dep_map = {insn.id: insn.depends_on for insn in kernel.instructions}
+
+    sccs = compute_sccs(dep_map)
+    order = []
+
+    for scc in sccs:
+        if len(scc) != 1:
+            raise DependencyCycleFound(', '.join(scc))
+        order.append(scc[0])
+
+    return order
+
+
 def _check_variable_access_ordered_inner(kernel):
-    logger.debug("%s: check_variable_access_ordered: start" % kernel.name)
+    from loopy.kernel.tools import find_aliasing_equivalence_classes
+    from loopy.symbolic import AccessRangeOverlapChecker
+    overlap_checker = AccessRangeOverlapChecker(kernel)
+    aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel)
 
-    checked_variables = kernel.get_written_variables() & (
-            set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict))
+    # dep_reqs_to_vars: A mapping (writer_id, dep_req_id) -> set of variable names,
+    # where the tuple denotes a pair of instruction IDs, and the variable
+    # names are the ones that necessitate a dependency.
+    #
+    # Note: This can be worst-case O(n^2) in the number of instructions.
+    dep_reqs_to_vars = {}
 
     wmap = kernel.writer_map()
     rmap = kernel.reader_map()
 
-    from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg
-    from loopy.kernel.tools import find_aliasing_equivalence_classes
-
-    depfind = IndirectDependencyEdgeFinder(kernel)
-    aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel)
+    # {{{ populate 'dep_reqs_to_vars'
 
-    for name in checked_variables:
-        # This is a tad redundant in that this could probably be restructured
-        # to iterate only over equivalence classes and not individual variables.
-        # But then the access-range overlap check below would have to be smarter.
-        eq_class = aliasing_equiv_classes[name]
+    for var in kernel.get_written_variables():
+        address_space = _get_address_space(kernel, var)
+        eq_class = aliasing_equiv_classes[var]
 
         readers = set.union(
                 *[rmap.get(eq_name, set()) for eq_name in eq_class])
         writers = set.union(
                 *[wmap.get(eq_name, set()) for eq_name in eq_class])
-        unaliased_readers = rmap.get(name, set())
-        unaliased_writers = wmap.get(name, set())
-
-        if not writers:
-            continue
-
-        if name in kernel.temporary_variables:
-            address_space = kernel.temporary_variables[name].address_space
-        else:
-            arg = kernel.arg_dict[name]
-            if isinstance(arg, ArrayArg):
-                address_space = arg.address_space
-            elif isinstance(arg, ValueArg):
-                address_space = AddressSpace.PRIVATE
-            else:
-                # No need to consider ConstantArg and ImageArg (for now)
-                # because those won't be written.
-                raise ValueError("could not determine address_space of '%s'" % name)
-
-        # Check even for PRIVATE address space, to ensure intentional program order.
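+        # Writes in any address space (including PRIVATE) are checked here,
+        # to ensure that program order is intentional.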
+        for writer in writers:
+            required_deps = (readers | writers) - set([writer])
+            required_deps = set([req_dep
+                for req_dep in required_deps
+                if not declares_nosync_with(kernel, address_space, writer,
+                    req_dep)])
 
-        from loopy.symbolic import AccessRangeOverlapChecker
-        overlap_checker = AccessRangeOverlapChecker(kernel)
+            for req_dep in required_deps:
+                dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var)
 
-        for writer_id in writers:
-            for other_id in readers | writers:
-                if writer_id == other_id:
-                    continue
+    # }}}
 
-                writer = kernel.id_to_insn[writer_id]
-                other = kernel.id_to_insn[other_id]
+    # depends_on: mapping from insn_ids to their dependencies
+    depends_on = dict((insn.id, set()) for insn in
+            kernel.instructions)
+    # rev_depends: mapping from insn_ids to their reverse deps.
+    rev_depends = dict((insn.id, set()) for insn in
+            kernel.instructions)
 
-                has_dependency_relationship = (
-                        declares_nosync_with(kernel, address_space, other, writer)
-                        or
-                        depfind(writer_id, other_id)
-                        or
-                        depfind(other_id, writer_id)
-                        )
+    # {{{ populate rev_depends, depends_on
 
-                if has_dependency_relationship:
-                    continue
-
-                is_relationship_by_aliasing = not (
-                        writer_id in unaliased_writers
-                        and (other_id in unaliased_writers
-                            or other_id in unaliased_readers))
-
-                # Do not enforce ordering for disjoint access ranges
-                if (not is_relationship_by_aliasing and not
-                        overlap_checker.do_access_ranges_overlap_conservative(
-                            writer_id, "w", other_id, "any", name)):
-                    continue
+    for insn in kernel.instructions:
+        depends_on[insn.id].update(insn.depends_on)
+        for dep in insn.depends_on:
+            rev_depends[dep].add(insn.id)
+
+    # }}}
+
+    # {{{ remove pairs from dep_reqs_to_vars for which dependencies exist
+
+    topological_order = _get_topological_order(kernel)
+
+    def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
+        """
+        Remove entries ``(insn_id, pred_id)`` from *dep_reqs_to_vars* whenever
+        *pred_id* is a direct or indirect predecessor of *insn_id* in the
+        directed graph whose nodes are instruction ids and whose connectivity
+        is given by *edges*.
+
+        :arg order: An instance of :class:`list` of instruction ids in which the
+            *edges* graph is to be traversed.
+        """
+        # predecessors: mapping from insn_id to its direct/indirect
+        # predecessors
+        predecessors = {}
+
+        for insn_id in order:
+            # insn_predecessors: insn_id's direct and indirect predecessors
+
+            # This set of predecessors is complete because we're
+            # traversing in topological order: No predecessor
+            # can occur after the instruction itself.
+            insn_predecessors = predecessors.pop(insn_id, set())
+
+            for pred in insn_predecessors:
+                dep_reqs_to_vars.pop(
+                        (insn_id, pred),
+                        # don't fail if pair doesn't exist
+                        None)
+
+            for successor in edges[insn_id]:
+                predecessors.setdefault(successor, set()).update(
+                        insn_predecessors | set([insn_id]))
+
+    # forward dep. graph traversal in reverse topological sort order
+    # (proceeds "end of program" -> "beginning of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on,
+            topological_order[::-1])
+
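+    # A dependency in either direction is sufficient to order a pair of
+    # accesses, hence the graph is traversed in both directions.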
+    # reverse dep. graph traversal in topological sort order
+    # (proceeds "beginning of program" -> "end of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order)
+
+    # }}}
+
+    # {{{ handle dependency requirements that weren't satisfied
+
+    for (writer_id, other_id), variables in six.iteritems(dep_reqs_to_vars):
+        writer = kernel.id_to_insn[writer_id]
+        other = kernel.id_to_insn[other_id]
+
+        for var in variables:
+            eq_class = aliasing_equiv_classes[var]
+            unaliased_readers = rmap.get(var, set())
+            unaliased_writers = wmap.get(var, set())
+
+            is_relationship_by_aliasing = not (
+                writer_id in unaliased_writers
+                and (other_id in unaliased_writers
+                    or other_id in unaliased_readers))
+
+            # Do not enforce ordering for disjoint access ranges
+            if (not is_relationship_by_aliasing and not
+                    overlap_checker.do_access_ranges_overlap_conservative(
+                        writer_id, "w", other_id, "any", var)):
+                continue
 
-                # Do not enforce ordering for aliasing-based relationships
-                # in different groups.
-                if (is_relationship_by_aliasing and (
-                        bool(writer.groups & other.conflicts_with_groups)
-                        or
-                        bool(other.groups & writer.conflicts_with_groups))):
-                    continue
+            # Do not enforce ordering for aliasing-based relationships
+            # in different groups.
+            if (is_relationship_by_aliasing and (
+                    bool(writer.groups & other.conflicts_with_groups)
+                    or
+                    bool(other.groups & writer.conflicts_with_groups))):
+                continue
 
-                msg = ("No dependency relationship found between "
-                        "'{writer_id}' which writes {var} and "
-                        "'{other_id}' which also accesses {var}. "
-                        "Either add a (possibly indirect) dependency "
-                        "between the two, or add them to each others' nosync "
-                        "set to indicate that no ordering is intended, or "
-                        "turn off this check by setting the "
-                        "'enforce_variable_access_ordered' option "
-                        "(more issues of this type may exist--only reporting "
-                        "the first one)"
-                        .format(
-                            writer_id=writer_id,
-                            other_id=other_id,
-                            var=(
-                                "the variable '%s'" % name
-                                if len(eq_class) == 1
-                                else (
-                                    "the aliasing equivalence class '%s'"
-                                    % ", ".join(eq_class))
-                                )))
-
-                from loopy.diagnostic import VariableAccessNotOrdered
-                raise VariableAccessNotOrdered(msg)
-
-    logger.debug("%s: check_variable_access_ordered: done" % kernel.name)
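+            # Neither exemption above applies, so this access pair is
+            # reported as unordered.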
" + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + + # }}} def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.Instruction.no_sync_with`) """ @@ -644,15 +686,17 @@ def check_variable_access_ordered(kernel): if kernel.options.enforce_variable_access_ordered == "no_check": return - if kernel.options.enforce_variable_access_ordered: - _check_variable_access_ordered_inner(kernel) - else: - from loopy.diagnostic import VariableAccessNotOrdered - try: + from pytools import ProcessLogger + with ProcessLogger(logger, "%s: check variable access ordered" % kernel.name): + if kernel.options.enforce_variable_access_ordered: _check_variable_access_ordered_inner(kernel) - except VariableAccessNotOrdered as e: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "variable_access_ordered", str(e)) + else: + from loopy.diagnostic import VariableAccessNotOrdered + try: + _check_variable_access_ordered_inner(kernel) + except VariableAccessNotOrdered as e: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "variable_access_ordered", str(e)) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 2d926aad4faa511aa2919630c9b0e96b7f253ad9..ddd245261ab0f064e25060122d4d6af65e889c58 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -744,9 +744,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ iname wrangling + @memoize_method def iname_tags(self, iname): return self.iname_to_tags.get(iname, frozenset()) + @memoize_method def iname_tags_of_type(self, iname, tag_type_or_types, max_num=None, min_num=None): """Return a subset of *tags* that matches type *tag_type*. Raises exception diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 032cdc2760597f1fa6f701a8a88252312deac797..8a83ae84c5623544301b1656499606884b67d37f 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -561,7 +561,7 @@ class ScheduleDebugInput(Exception): # }}} -# {{{ scheduling algorithm +# {{{ scheduler state class SchedulerState(ImmutableRecord): """ @@ -571,6 +571,7 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_priority + #FIXME: incorrect docs. See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -586,6 +587,10 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + #FIXME: docs? + .. attribute:: active_inames A tuple of active inames. @@ -641,6 +646,10 @@ class SchedulerState(ImmutableRecord): Used to produce warnings about deprecated 'boosting' behavior Should be removed along with boostability in 2017.x. + + .. 
+    .. attribute:: insns_in_topologically_sorted_order
+
+        A list of loopy :class:`Instruction` objects in topologically sorted order.
     """
 
     @property
@@ -650,6 +659,101 @@ class SchedulerState(ImmutableRecord):
         else:
             return None
 
+# }}}
+
+
+def get_insns_in_topologically_sorted_order(kernel):
+    from pytools.graph import compute_topological_order
+
+    rev_dep_map = {insn.id: set() for insn in kernel.instructions}
+    for insn in kernel.instructions:
+        for dep in insn.depends_on:
+            rev_dep_map[dep].add(insn.id)
+
+    ids = compute_topological_order(rev_dep_map)
+    return [kernel.id_to_insn[insn_id] for insn_id in ids]
+
+
+# {{{ schedule_as_many_run_insns_as_possible
+
+def schedule_as_many_run_insns_as_possible(sched_state):
+    """
+    Returns a copy of *sched_state* (an instance of
+    :class:`loopy.schedule.SchedulerState`) with as many instructions as can
+    run in the current loop nesting appended to the schedule.
+    """
+
+    next_preschedule_item = (
+            sched_state.preschedule[0]
+            if sched_state.preschedule
+            else None)
+
+    if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel, Barrier)):
+        return sched_state
+
+    if not sched_state.within_subkernel:
+        # cannot schedule RunInstructions when not in subkernel
+        return sched_state
+
+    have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames
+
+    toposorted_insns = sched_state.insns_in_topologically_sorted_order
+
+    # Pick up only the front-most instructions in toposorted_insns whose
+    # inames match the currently active (and parallel) inames of sched_state.
+    from loopy.kernel.instruction import MultiAssignmentBase
+
+    updated_sched_state = sched_state.copy()
+
+    newly_scheduled_insn_ids = []
+    ignored_unscheduled_insn_ids = set()
+
+    for insn in toposorted_insns:
+        if insn.id in sched_state.scheduled_insn_ids:
+            continue
+        if insn.within_inames < have_inames:
+            ignored_unscheduled_insn_ids.add(insn.id)
+            continue
+        if isinstance(insn, MultiAssignmentBase):
+            if (insn.within_inames - sched_state.parallel_inames) == frozenset(
+                    sched_state.active_inames) and not (insn.depends_on &
+                    ignored_unscheduled_insn_ids):
+                newly_scheduled_insn_ids.append(insn.id)
+                continue
+        break
+
+    num_presched_insns_newly_scheduled = len(set(newly_scheduled_insn_ids) &
+            sched_state.prescheduled_insn_ids)
+
+    assert all(isinstance(sched_item, RunInstruction) and sched_item.insn_id in
+            newly_scheduled_insn_ids for sched_item in
+            sched_state.preschedule[:num_presched_insns_newly_scheduled])
+    sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in
+            newly_scheduled_insn_ids)
+
+    updated_schedule = updated_sched_state.schedule + sched_items
+    updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids
+            | frozenset(newly_scheduled_insn_ids))
+    updated_unscheduled_insn_ids = (
+            updated_sched_state.unscheduled_insn_ids
+            - frozenset(newly_scheduled_insn_ids))
+    if newly_scheduled_insn_ids:
+        new_insn_ids_to_try = None
+    else:
+        new_insn_ids_to_try = sched_state.insn_ids_to_try
+    updated_sched_state = updated_sched_state.copy(
+            insn_ids_to_try=new_insn_ids_to_try,
+            schedule=updated_schedule,
+            scheduled_insn_ids=updated_scheduled_insn_ids,
+            unscheduled_insn_ids=updated_unscheduled_insn_ids,
+            preschedule=sched_state.preschedule[num_presched_insns_newly_scheduled:]
+            )
+
+    return updated_sched_state
+
+# }}}
+
+
+# {{{ scheduling algorithm
 
 def generate_loop_schedules_internal(
         sched_state, allow_boost=False, debug=None):
@@ -664,11 +768,15 @@ def generate_loop_schedules_internal(
     else:
         rec_allow_boost = False
 
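+    # Before considering loop entry/exit choices, greedily emit any
+    # instructions that are already runnable in the current loop nest.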
+    if not rec_allow_boost:
+        sched_state = (
+                schedule_as_many_run_insns_as_possible(sched_state))
+
     active_inames_set = frozenset(sched_state.active_inames)
 
     next_preschedule_item = (
             sched_state.preschedule[0]
-            if len(sched_state.preschedule) > 0
+            if sched_state.preschedule
             else None)
 
     # {{{ decide about debug mode
@@ -1041,19 +1149,20 @@ def generate_loop_schedules_internal(
                 break
 
         if can_leave and not debug_mode:
-
-            for sub_sched in generate_loop_schedules_internal(
-                    sched_state.copy(
+            new_sched_state = sched_state.copy(
                         schedule=(
                             sched_state.schedule
                             + (LeaveLoop(iname=last_entered_loop),)),
                         active_inames=sched_state.active_inames[:-1],
+                        insn_ids_to_try=insn_ids_to_try,
                         preschedule=(
                             sched_state.preschedule
                             if last_entered_loop
                             not in sched_state.prescheduled_inames
-                            else sched_state.preschedule[1:]),
-                    ),
+                            else sched_state.preschedule[1:]))
+
+            for sub_sched in generate_loop_schedules_internal(
+                    new_sched_state,
                     allow_boost=rec_allow_boost, debug=debug):
                 yield sub_sched
 
@@ -1254,8 +1363,7 @@ def generate_loop_schedules_internal(
                             iname),
                         reverse=True):
 
-                for sub_sched in generate_loop_schedules_internal(
-                        sched_state.copy(
+                new_sched_state = sched_state.copy(
                             schedule=(
                                 sched_state.schedule
                                 + (EnterLoop(iname=iname),)),
@@ -1264,11 +1372,15 @@ def generate_loop_schedules_internal(
                             entered_inames=(
                                 sched_state.entered_inames
                                 | frozenset((iname,))),
+                            insn_ids_to_try=insn_ids_to_try,
                             preschedule=(
                                 sched_state.preschedule
                                 if iname not in sched_state.prescheduled_inames
                                 else sched_state.preschedule[1:]),
-                        ),
+                            )
+
+                for sub_sched in generate_loop_schedules_internal(
+                        new_sched_state,
                         allow_boost=rec_allow_boost, debug=debug):
                     found_viable_schedule = True
@@ -1916,7 +2028,10 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
             group_insn_counts=group_insn_counts(kernel),
             active_group_counts={},
 
-            uses_of_boostability=[])
+            uses_of_boostability=[],
+            insns_in_topologically_sorted_order=(
+                get_insns_in_topologically_sorted_order(kernel)),
+            )
 
     schedule_gen_kwargs = {}
     if kernel.options.ignore_boostable_into: