diff --git a/loopy/schedule.py b/loopy/schedule.py index 493108c68ff7e5322487dfd2796e8ffff6069e8d..768dedcac68a804ca978f1d59b39227f82532665 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -171,7 +171,33 @@ def find_used_inames_within(kernel, sched_index): return result -def find_loop_nest_map(kernel): +def find_loop_nest_with_map(kernel): + """Returns a dictionary mapping inames to other inames that are + always nested with them. + """ + result = {} + + from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + + all_nonpar_inames = set([ + iname + for iname in kernel.all_inames() + if not isinstance(kernel.iname_to_tag.get(iname), + (ParallelTag, IlpBaseTag, VectorizeTag))]) + + iname_to_insns = kernel.iname_to_insns() + + for iname in all_nonpar_inames: + result[iname] = set([ + other_iname + for insn in iname_to_insns[iname] + for other_iname in kernel.insn_inames(insn) & all_nonpar_inames + ]) + + return result + + +def find_loop_nest_around_map(kernel): """Returns a dictionary mapping inames to other inames that are always nested around them. """ @@ -212,20 +238,26 @@ def find_loop_nest_map(kernel): return result -def find_loop_insn_dep_map(kernel, loop_nest_map): +def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): """Returns a dictionary mapping inames to other instruction ids that need to be scheduled before the iname should be eligible for scheduling. """ result = {} - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): continue + iname_dep = result.setdefault(iname, set()) + for dep_insn_id in insn.insn_deps: + if dep_insn_id in iname_dep: + # already depending, nothing to check + continue + dep_insn = kernel.id_to_insn[dep_insn_id] dep_insn_inames = kernel.insn_inames(dep_insn) @@ -235,22 +267,43 @@ def find_loop_insn_dep_map(kernel, loop_nest_map): continue # To make sure dep_insn belongs outside of iname, we must prove - # (via loop_nest_map) that all inames that dep_insn will be - # executed inside are nested *around* iname. - if not dep_insn_inames <= loop_nest_map[iname]: + # that all inames that dep_insn will be executed in nest + # outside of the loop over *iname*. (i.e. nested around, or + # before). + + may_add_to_loop_dep_map = True + for dep_insn_iname in dep_insn_inames: + if dep_insn_iname in loop_nest_around_map[iname]: + # dep_insn_iname is guaranteed to nest outside of iname + # -> safe. + continue + + tag = kernel.iname_to_tag.get(dep_insn_iname) + if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)): + # Parallel tags don't really nest, so we'll disregard + # them here. + continue + + if dep_insn_iname not in loop_nest_with_map.get(iname, []): + # dep_insn_iname does not nest with iname, so its nest + # must occur outside. + continue + + may_add_to_loop_dep_map = False + break + + if not may_add_to_loop_dep_map: continue - iname_dep = result.setdefault(iname, set()) - if dep_insn_id not in iname_dep: - logger.debug("{knl}: loop dependency map: iname '{iname}' " - "depends on '{dep_insn}' via '{insn}'" - .format( - knl=kernel.name, - iname=iname, - dep_insn=dep_insn_id, - insn=insn.id)) + logger.debug("{knl}: loop dependency map: iname '{iname}' " + "depends on '{dep_insn}' via '{insn}'" + .format( + knl=kernel.name, + iname=iname, + dep_insn=dep_insn_id, + insn=insn.id)) - iname_dep.add(dep_insn_id) + iname_dep.add(dep_insn_id) return result @@ -357,11 +410,11 @@ class SchedulerState(Record): """ .. attribute:: kernel - .. attribute:: loop_nest_map + .. attribute:: loop_nest_around_map .. attribute:: loop_priority - See :func:`loop_nest_map`. + See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -414,7 +467,7 @@ class SchedulerState(Record): def generate_loop_schedules_internal( - sched_state, allow_boost=False, allow_insn=False, debug=None): + sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. @@ -452,7 +505,7 @@ def generate_loop_schedules_internal( #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP:") - for iname, val in six.iteritems(sched_state.loop_nest_map): + for iname, val in six.iteritems(sched_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") @@ -535,7 +588,7 @@ def generate_loop_schedules_internal( if is_ready and debug_mode: print("ready to schedule '%s'" % insn.id) - if is_ready and allow_insn and not debug_mode: + if is_ready and not debug_mode: iid_set = frozenset([insn.id]) # {{{ update active group counts for added instruction @@ -572,8 +625,7 @@ def generate_loop_schedules_internal( for sub_sched in generate_loop_schedules_internal( new_sched_state, - allow_boost=rec_allow_boost, debug=debug, - allow_insn=True): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -632,8 +684,7 @@ def generate_loop_schedules_internal( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), active_inames=sched_state.active_inames[:-1]), - allow_boost=rec_allow_boost, debug=debug, - allow_insn=allow_insn): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched return @@ -674,9 +725,11 @@ def generate_loop_schedules_internal( currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) - if not sched_state.loop_nest_map[iname] <= currently_accessible_inames: + if ( + not sched_state.loop_nest_around_map[iname] + <= currently_accessible_inames): if debug_mode: - print("scheduling %s prohibited by loop nest map" % iname) + print("scheduling %s prohibited by loop nest-around map" % iname) continue if ( @@ -701,7 +754,7 @@ def generate_loop_schedules_internal( iname_home_domain.get_var_names(dim_type.param)) # The previous check should have ensured this is true, because - # the loop_nest_map takes the domain dependency graph into + # the loop_nest_around_map takes the domain dependency graph into # consideration. assert (iname_home_domain_params & kernel.all_inames() <= currently_accessible_inames) @@ -841,20 +894,11 @@ def generate_loop_schedules_internal( yield sched_state.schedule else: - if not allow_insn: - # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, - allow_boost=allow_boost, debug=debug, - allow_insn=True): - yield sub_sched - if not allow_boost and allow_boost is not None: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( sched_state, - allow_boost=True, debug=debug, - allow_insn=allow_insn): + allow_boost=True, debug=debug): yield sub_sched else: # dead end @@ -1255,11 +1299,15 @@ def generate_loop_schedules(kernel, debug_args={}): iname for iname in kernel.all_inames() if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) - loop_nest_map = find_loop_nest_map(kernel) + loop_nest_with_map = find_loop_nest_with_map(kernel) + loop_nest_around_map = find_loop_nest_around_map(kernel) sched_state = SchedulerState( kernel=kernel, - loop_nest_map=loop_nest_map, - loop_insn_dep_map=find_loop_insn_dep_map(kernel, loop_nest_map), + loop_nest_around_map=loop_nest_around_map, + loop_insn_dep_map=find_loop_insn_dep_map( + kernel, + loop_nest_with_map=loop_nest_with_map, + loop_nest_around_map=loop_nest_around_map), breakable_inames=ilp_inames, ilp_inames=ilp_inames, vec_inames=vec_inames,