diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index cd8ec409cce1a3f210554a05daf4bd358781fb20..38d955f0395d16357d8920069e13aab9398a4ff9 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -31,8 +31,9 @@ from islpy import dim_type
 
 from loopy.symbolic import (
         RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
-        SubstitutionRuleMappingContext)
+        SubstitutionRuleMappingContext, WalkMapper)
 from loopy.diagnostic import LoopyError
+from pymbolic.primitives import (Variable, Subscript)
 
 
 __doc__ = """
@@ -50,6 +51,8 @@ __doc__ = """
 
 .. autofunction:: get_iname_duplication_options
 
+.. autofunction:: needs_duplication_with_deps
+
 .. autofunction:: has_schedulable_iname_nesting
 
 .. autofunction:: prioritize_loops
@@ -851,6 +854,291 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
 
 # }}}
 
+# {{{ Helping mappers for duplication_with_deps
+
+
+class InameChecker(WalkMapper):
+    """Expression walker deciding whether duplicating an iname in an
+    instruction has to take the dependencies of that instruction into
+    account.
+
+    Call the instance on every expression of interest and read
+    :attr:`result` afterwards.
+
+    .. attribute:: iname
+
+        Name of the iname that is to be duplicated.
+
+    .. attribute:: result
+
+        *False* initially. Set to *True* (and never reset) once
+        :meth:`post_visit` is handed a bare
+        :class:`pymbolic.primitives.Variable` or a
+        :class:`pymbolic.primitives.Subscript` that does not have
+        :attr:`iname` among its indices.
+    """
+
+    def __init__(self, _iname):
+        self.result = False
+        self.iname = _iname
+
+    def map_subscript(self, expr, *args, **kwargs):
+        if not self.visit(expr, *args, **kwargs):
+            return
+
+        # Neither the aggregate nor indices that are plain variables are
+        # traversed, so they never reach post_visit() as bare variables.
+        # Only compound index expressions are searched.
+        for index in expr.index_tuple:
+            if not isinstance(index, Variable):
+                self.rec(index, *args, **kwargs)
+
+        self.post_visit(expr, *args, **kwargs)
+
+    def post_visit(self, expr, *args, **kwargs):
+        # Extra arguments are accepted because map_subscript() forwards
+        # them; they are not used.
+        if isinstance(expr, Variable):
+            self.result = True
+        elif isinstance(expr, Subscript):
+            if Variable(self.iname) not in expr.index_tuple:
+                # iname not in indices needs dupl_with_deps
+                self.result = True
+
+# }}}
+
+
+# {{{ iname duplication with dependencies
+
+
+def needs_duplication_with_deps(knl, iname, within):
+    """Return *True* if duplicating *iname* in the instruction selected by
+    *within* calls for :func:`duplicate_inames_with_deps` instead of plain
+    :func:`duplicate_inames`.
+
+    That is the case if
+
+    * :class:`InameChecker` flags the assignee or the expression of the
+      instruction, or
+    * *iname* is in the ``within_inames`` of an instruction this one
+      depends on *and* of an instruction that depends on this one.
+
+    .. note::
+
+        Currently only a single instruction and a single iname are
+        examined. If *within* is a :class:`loopy.match.Or` that matches
+        several instructions, only the first match is examined. Call this
+        function multiple times to check more instructions or inames.
+
+    :arg within: a stack match as understood by
+        :func:`loopy.match.parse_stack_match`.
+    :raises LoopyError: if *within* matches no instruction, or if it
+        matches more than one instruction without being a
+        :class:`loopy.match.Or`.
+    """
+    from loopy.match import parse_stack_match, Or
+    from loopy.kernel.tools import find_reverse_dependencies
+
+    # This has to be checked before *within* is replaced by its parsed form.
+    is_within_or = isinstance(within, Or)
+    within = parse_stack_match(within)
+
+    # Getting the instruction of interest
+    insn_list = [insn for insn in knl.instructions if within(knl, insn, ())]
+
+    if not insn_list:
+        raise LoopyError("'within' does not match any instruction")
+
+    if len(insn_list) > 1 and not is_within_or:
+        raise LoopyError("The function can only handle "
+                "one instruction currently, please give input accordingly")
+
+    insn = insn_list[0]
+
+    # Checking the inames for LHS and RHS; the checker accumulates its
+    # verdict across calls.
+    iname_checker = InameChecker(iname)
+    iname_checker(insn.assignee)
+    iname_checker(insn.expression)
+
+    if iname_checker.result:
+        return True
+
+    # One last check: the plain duplication is only sufficient if the
+    # forward dependencies or the reverse dependencies of this insn do not
+    # have `iname` in their `within_inames`.
+    forward_within = set()
+    for forw in insn.depends_on:
+        forward_within.update(knl.id_to_insn[forw].within_inames)
+
+    rev_within = set()
+    for rev in find_reverse_dependencies(knl, frozenset([insn.id])):
+        rev_within.update(knl.id_to_insn[rev].within_inames)
+
+    return iname in (rev_within & forward_within)
+
+
+def get_forward_and_rev_deps(knl, iname, within):
+    """Return the ids of the instructions in which *iname* has to be
+    duplicated along with the instruction selected by *within*.
+
+    Starting from the selected instruction, dependencies are followed in
+    both directions (instructions it depends on and instructions depending
+    on it). Only instructions that have *iname* in their ``within_inames``
+    are collected, and the search only continues through such instructions.
+
+    :arg within: an instance of
+        :class:`loopy.kernel.instruction.Assignment` or a stack match as
+        understood by :func:`loopy.match.parse_stack_match`. If a match
+        selects several instructions, the first one is used.
+    :returns: a :class:`frozenset` of instruction ids. It contains the id
+        of the selected instruction itself if that instruction has *iname*
+        in its ``within_inames``.
+    :raises LoopyError: if *within* matches no instruction.
+    """
+    from loopy.kernel.instruction import Assignment
+
+    if isinstance(within, Assignment):
+        insn = within
+    else:
+        from loopy.match import parse_stack_match
+        within = parse_stack_match(within)
+        for insn in knl.instructions:
+            if within(knl, insn, ()):
+                break
+        else:
+            # Without this, the last instruction of the kernel would
+            # silently be used as the starting point.
+            raise LoopyError("'within' does not match any instruction")
+
+    from loopy.kernel.tools import find_reverse_dependencies
+
+    # Initializing for the traversal
+    forw_rev_depends = set()
+    has_or_will_visit = set([insn.id])
+    to_visit = [insn.id]
+
+    while to_visit:
+        elem_id = to_visit.pop()
+        elem_insn = knl.id_to_insn[elem_id]
+
+        if iname not in elem_insn.within_inames:
+            # The duplication does not concern this instruction and is
+            # therefore not propagated through it either.
+            continue
+
+        forw_rev_depends.add(elem_id)
+
+        # forward dependencies first, then reverse dependencies
+        for neighbor_ids in (
+                elem_insn.depends_on,
+                find_reverse_dependencies(knl, frozenset([elem_id]))):
+            for e in neighbor_ids:
+                # checking that it has already not been visited/scheduled
+                if e not in has_or_will_visit:
+                    to_visit.append(e)
+                    has_or_will_visit.add(e)
+
+    return frozenset(forw_rev_depends)
+
+
+def is_isomorphic_duplication(knl, iname, within):
+    """Return *True* if :func:`duplicate_inames_with_deps` would duplicate
+    *iname* in every instruction that has it in its ``within_inames``.
+
+    This is a check for the user to perform before calling
+    :func:`duplicate_inames_with_deps`. If the function returns *True*, that
+    is a signal to **NOT** perform the duplication, and instead explore
+    other available options.
+
+    :arg within: a stack match as understood by
+        :func:`loopy.match.parse_stack_match`.
+    """
+    # all the insn ids related to the given iname
+    insns_related_to_iname = frozenset(
+            insn.id for insn in knl.instructions
+            if iname in insn.within_inames)
+
+    forward_and_rev_deps = get_forward_and_rev_deps(knl, iname, within)
+
+    return forward_and_rev_deps == insns_related_to_iname
+
+
+def duplicate_inames_with_deps(knl, iname, within, new_iname=None, suffix=None,
+        tags=None):
+    r"""Duplicate *iname* in all instructions that
+    :func:`get_forward_and_rev_deps` returns for the given *within*.
+
+    One should use this variant of the duplication over
+    :func:`duplicate_inames` if :func:`needs_duplication_with_deps` returns
+    *True*.
+
+    As an example, consider 7 instructions ``a`` to ``g`` with the following
+    dependency graph (parents at the top, children at the bottom)::
+
+            a
+           / \
+          b   c
+         / \   \
+        d   e   f
+             \
+              g
+
+    and the following ``within_inames``::
+
+        a -> i, j, k
+        b -> i, j
+        c -> i
+        d -> i, m
+        e -> p, q
+        f -> x, y
+        g -> i, j
+
+    Suppose that ``i`` is to be duplicated in ``b``.
+
+    - Consider the children of ``b``: ``d`` and ``e``. ``i`` must be
+      duplicated in ``d``, but not in ``e``, as ``i`` is not among the
+      ``within_inames`` of ``e``.
+
+    - Consider the parent of ``b``: ``a``. It has ``i`` among its
+      ``within_inames``, hence the duplication is necessary.
+
+    - ``i`` therefore has to be duplicated in ``d`` and ``a`` as well, which
+      triggers the same reasoning for these two instructions. ``g`` is not
+      reached, as its only connection to ``b`` is through ``e``. In the
+      end, ``i`` is duplicated in ``b``, ``d``, ``a`` and ``c``.
+
+    .. note::
+
+        Currently this only takes a single iname. To duplicate multiple
+        inames, call this function multiple times.
+
+    :arg within: a stack match as understood by
+        :func:`loopy.match.parse_stack_match`, preferably the one returned
+        by :func:`get_iname_duplication_options`.
+    :arg new_iname: passed to :func:`duplicate_inames` as *new_inames*.
+    :arg suffix: passed on to :func:`duplicate_inames`.
+    :arg tags: passed on to :func:`duplicate_inames`. Defaults to an empty
+        :class:`dict`.
+    """
+    if tags is None:
+        # a `{}` default argument would be one dict shared by all calls
+        tags = {}
+
+    # Getting the relevant forward and backward dependencies.
+    # On these instructions we need to perform the duplication
+    q_forward_and_rev_deps = get_forward_and_rev_deps(knl, iname, within)
+
+    from loopy.match import Id, Or
+    # Making the Or object so that it could be fed to `duplicate_inames`
+    insns_match_or = Or(tuple(Id(k) for k in q_forward_and_rev_deps))
+    knl = duplicate_inames(knl, [iname], insns_match_or,
+            new_iname, suffix, tags)
+
+    return knl
+
+# }}}
+
 
 # {{{ iname duplication for schedulability
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e624ed346cd696bf18a116e9373f8e765dafdc9a..05455237f7bf3d74de626982e673fb7c4669fa57 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1792,6 +1792,77 @@ def test_unschedulable_kernel_detection():
     assert len(list(lp.get_iname_duplication_options(knl))) == 10
 
 
+def test_duplication_options_with_deps(ctx_factory):
+    from loopy.transform.iname import (needs_duplication_with_deps,
+            is_isomorphic_duplication, duplicate_inames_with_deps)
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    n = 2**8
+
+    import pyopencl.clrandom as cl_random
+    a_mat_dev = cl_random.rand(queue, (n, n), dtype=np.float32)
+    b_mat_dev = cl_random.rand(queue, (n, n), dtype=np.float32)
+    c_mat_dev = cl.array.zeros(queue, (n, n), dtype=np.float32)
+
+    knl = lp.make_kernel(
+            "{ [i, j, k, k1]: 0<=i, j, k, k1<256 }",
+            """
+            temp_cnst[k] = 2.0 {id=insn_1}
+            temp_cnst_2[k1] = 2*temp_cnst[k1] {id=insn_2}
+            c[i, j] = reduce(sum, k1, temp_cnst_2[k1]*a[i,k1]*b[k1,j]) {id=insn_3}
+            """,
+            [lp.TemporaryVariable("temp_cnst",
+                dtype=np.float32,
+                shape=lp.auto,
+                base_indices=lp.auto,
+                scope=lp.temp_var_scope.PRIVATE),
+             lp.TemporaryVariable("temp_cnst_2",
+                dtype=np.float32,
+                shape=lp.auto,
+                base_indices=lp.auto,
+                scope=lp.temp_var_scope.PRIVATE),
+             '...']
+            )
+    knl = lp.set_options(knl, ignore_boostable_into=True)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32,
+                                            b=np.float32,
+                                            c=np.float32))
+    processed_knl = lp.preprocess.preprocess_kernel(knl)
+
+    # Assert that earlier it was not schedulable
+    assert not lp.has_schedulable_iname_nesting(processed_knl)
+
+    fixed_knl = processed_knl.copy()
+    while not lp.has_schedulable_iname_nesting(fixed_knl):
+        for iname, insn in lp.get_iname_duplication_options(fixed_knl):
+            if needs_duplication_with_deps(fixed_knl, iname, insn):
+                if not is_isomorphic_duplication(fixed_knl, iname, insn):
+                    fixed_knl = duplicate_inames_with_deps(fixed_knl,
+                            iname, insn)
+                    break
+            else:
+                fixed_knl = lp.duplicate_inames(fixed_knl, iname, insn)
+                break
+        else:
+            # None of the options was applied: the kernel did not change,
+            # and the enclosing loop would never terminate.
+            raise AssertionError("no applicable iname duplication option")
+
+    # Assert that after the duplication, everything works
+    assert lp.has_schedulable_iname_nesting(fixed_knl)
+
+    evt, (out,) = fixed_knl(queue, a=a_mat_dev, b=b_mat_dev, c=c_mat_dev)
+
+    a = a_mat_dev.get()
+    b = b_mat_dev.get()
+    c = out.get()
+
+    # temp_cnst_2 is 2*2.0 everywhere, so c must be 4 * (a @ b)
+    ref = 4*a.dot(b)
+    assert np.linalg.norm(ref-c)/np.linalg.norm(ref) < 1e-7
+
+
 def test_regression_no_ret_call_removal(ctx_factory):
     # https://github.com/inducer/loopy/issues/32
     knl = lp.make_kernel(