diff --git a/loopy/__init__.py b/loopy/__init__.py index ce6cdca03dc4dd9b56338f65662926011a94ad4c..8975674442f049f51f724bfcb818fc6e8a441c88 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -66,6 +66,7 @@ from loopy.transform.iname import ( split_reduction_inward, split_reduction_outward, affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, + has_schedulable_iname_nesting, get_iname_duplication_options, add_inames_to_insn) from loopy.transform.instruction import ( @@ -170,6 +171,7 @@ __all__ = [ "split_reduction_inward", "split_reduction_outward", "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", + "has_schedulable_iname_nesting", "get_iname_duplication_options", "add_inames_to_insn", "add_prefetch", "change_arg_to_image", diff --git a/loopy/check.py b/loopy/check.py index 8587ad04336131a902638db24a72a97ced66f0b2..2f48211da430b42e8000467f070b1633ab3a4f38 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -332,6 +332,22 @@ def check_write_destinations(kernel): or wvar in kernel.arg_dict) and wvar not in kernel.all_params(): raise LoopyError + +def check_has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import (has_schedulable_iname_nesting, + get_iname_duplication_options) + if not has_schedulable_iname_nesting(kernel): + import itertools as it + opt = get_iname_duplication_options(kernel) + opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) + for i, w in it.islice(opt, 3)) + raise LoopyError("Kernel does not have a schedulable iname nesting. " + "In order for there to exist a feasible loop nesting, you " + "may need to duplicate an iname. To do so, call " + "loopy.duplicate_iname. Use loopy.get_iname_duplication_options " + "to get hints about which iname to duplicate. Here are some " + "options:\n%s" % opt_str) + # }}} @@ -348,6 +364,7 @@ def pre_schedule_checks(kernel): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) + check_has_schedulable_iname_nesting(kernel) logger.info("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 105219c8cf8300563a364f0d5511c7ceab70b4da..1914b8d677af4cd0a5683d90ff5ac1168a660063 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -48,6 +48,10 @@ __doc__ = """ .. autofunction:: duplicate_inames +.. autofunction:: get_iname_duplication_options + +.. autofunction:: has_schedulable_iname_nesting + .. autofunction:: rename_iname .. autofunction:: remove_unused_inames @@ -825,6 +829,170 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # }}} +# {{{ iname duplication for schedulability + +def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])): + # Remove common inames of the current insn_deps, as they are not relevant + # for splitting. + common = frozenset([]).union(*insn_deps).intersection(*insn_deps) + + # If common inames were found, we reduce the problem and go into recursion + if common: + # Remove the common inames from the instruction dependencies + insn_deps = ( + frozenset(dep - common for dep in insn_deps) + - + frozenset([frozenset([])])) + # Join the common inames with those previously found + common = common.union(old_common_inames) + + # Go into recursion + for option in _get_iname_duplication_options(insn_deps, common): + yield option + # Do not yield anything beyond here! + return + + # Try finding a partitioning of the remaining inames, such that all instructions + # use only inames from one of the disjoint sets from the partitioning. + def join_sets_if_not_disjoint(sets): + for s1 in sets: + for s2 in sets: + if s1 != s2 and s1.intersection(s2): + return ( + (sets - frozenset([s1, s2])) + | frozenset([s1 | s2]) + ), False + + return sets, True + + partitioning = insn_deps + stop = False + while not stop: + partitioning, stop = join_sets_if_not_disjoint(partitioning) + + # If a partitioning was found we recursively apply this algorithm to the + # subproblems + if len(partitioning) > 1: + for part in partitioning: + working_set = frozenset(s for s in insn_deps if s.issubset(part)) + for option in _get_iname_duplication_options(working_set, + old_common_inames): + yield option + # If exactly one set was found, an iname duplication is necessary + elif len(partitioning) == 1: + inames, = partitioning + + # There are splitting options for all inames + for iname in inames: + iname_insns = frozenset( + insn for insn in insn_deps if frozenset([iname]).issubset(insn)) + + import itertools as it + # For a given iname, the set of instructions containing this iname + # is inspected. For each element of the power set without the + # empty and the full set, one duplication option is generated. + for insns_to_dup in it.chain.from_iterable( + it.combinations(iname_insns, l) + for l in range(1, len(iname_insns))): + yield ( + iname, + tuple(insn.union(old_common_inames) for insn in insns_to_dup)) + + # If partitioning was empty, we have recursed successfully and yield nothing + + +def get_iname_duplication_options(knl, use_boostable_into=False): + """List options for duplication of inames, if necessary for schedulability + + :returns: a generator listing all options to duplicate inames, if duplication + of an iname is necessary to ensure the schedulability of the kernel. + Duplication options are returned as tuples (iname, within) as + understood by :func:`duplicate_inames`. There is no guarantee, that the + transformed kernel will be schedulable, because multiple duplications + of iname may be necessary. + + Some kernels require the duplication of inames in order to be schedulable, as the + forced iname dependencies define an over-determined problem to the scheduler. + Consider the following minimal example: + + knl = lp.make_kernel(["{[i,j]:0<=i,j {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e