diff --git a/loopy/check.py b/loopy/check.py index d8c63a302da8df9ec68879a81feea684a67a15b0..f490f85512817ed13710732f20973eae562bcf49 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -145,7 +145,6 @@ def check_for_write_races(kernel): raise RuntimeError("temp var '%s' hasn't decided on " "whether it is local" % temp_var.name) - else: raise RuntimeError("invalid assignee name in instruction '%s'" % insn.id) diff --git a/loopy/kernel.py b/loopy/kernel.py index b3acea7ffe733d12a7bd71fd64fa9026e3e29c08..09b5c9b9cb7d7334d4eda8109143f0484d866c8d 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -232,7 +232,9 @@ class Instruction(Record): by adding dependencies on any writes to temporaries read by this instruction. :ivar boostable: Whether the instruction may safely be executed inside more loops than advertised without changing the meaning - of the program. Allowed values are *None* (for unknwon), *True*, and *False*. + of the program. Allowed values are *None* (for unknown), *True*, and *False*. + :ivar boostable_into: a set of inames into which the instruction + may need to be boosted, as a heuristic help for the scheduler. The following two instance variables are only used until :func:`loopy.kernel.make_kernel` is finished: @@ -245,6 +247,7 @@ class Instruction(Record): def __init__(self, id, assignee, expression, forced_iname_deps=set(), insn_deps=set(), boostable=None, + boostable_into=None, temp_var_type=None, duplicate_inames_and_tags=[]): assert isinstance(forced_iname_deps, set) @@ -254,6 +257,7 @@ class Instruction(Record): id=id, assignee=assignee, expression=expression, forced_iname_deps=forced_iname_deps, insn_deps=insn_deps, boostable=boostable, + boostable_into=boostable_into, temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags) @memoize_method @@ -276,7 +280,10 @@ class Instruction(Record): self.assignee, self.expression) if self.boostable == True: - result += " (boostable)" + if self.boostable_into: + result += " (boostable into '%s')" % ",".join(self.boostable_into) + else: + result += " (boostable)" elif self.boostable == False: result += " (not boostable)" elif self.boostable is None: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ca6b0d0c621e56a942b66c3dd8d804bf164d16f1..30baf5e07da5cad9d97bddc35c64c6a2066b7d8e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -287,6 +287,51 @@ def add_boostability_and_automatic_dependencies(kernel): # }}} +# {{{ limit boostability + +def limit_boostability(kernel): + """Finds out which other inames an instruction's inames occur with + and then limits boostability to just those inames. + """ + + iname_occurs_with = {} + for insn in kernel.instructions: + insn_inames = kernel.insn_inames(insn) + for iname in insn_inames: + iname_occurs_with.setdefault(iname, set()).update(insn_inames) + + iname_use_counts = {} + for insn in kernel.instructions: + for iname in kernel.insn_inames(insn): + iname_use_counts[iname] = iname_use_counts.get(iname, 0) + 1 + + single_use_inames = set(iname for iname, uc in iname_use_counts.iteritems() + if uc == 1) + + new_insns = [] + for insn in kernel.instructions: + if insn.boostable is None: + raise RuntimeError("insn '%s' has undetermined boostability" % insn.id) + elif insn.boostable: + boostable_into = set() + for iname in kernel.insn_inames(insn): + boostable_into.update(iname_occurs_with[iname]) + + boostable_into -= kernel.insn_inames(insn) | single_use_inames + + # Even if boostable_into is empty, leave boostable flag on--it is used + # for boosting into unused hw axes. + + insn = insn.copy(boostable_into=boostable_into) + else: + insn = insn.copy(boostable_into=set()) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + # {{{ guess good iname for local axis 0 def get_axis_0_ranking(kernel, insn): @@ -635,6 +680,7 @@ def preprocess_kernel(kernel): kernel = assign_automatic_axes(kernel) kernel = add_boostability_and_automatic_dependencies(kernel) + kernel = limit_boostability(kernel) kernel = adjust_local_temp_var_storage(kernel) return kernel diff --git a/loopy/schedule.py b/loopy/schedule.py index cef3f31bcad53c83c72ee1546114421d5b8e9c89..5523583dfacf48f4cae835ed4c54fb5f26c80138 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -171,7 +171,7 @@ def dump_schedule(schedule): else: assert False - return " ".join(entries), len(entries) + return " ".join(entries) class SchedulerDebugger: def __init__(self, debug_length): @@ -180,16 +180,22 @@ class SchedulerDebugger: self.dead_end_counter = 0 self.debug_length = debug_length + self.elapsed_store = 0 + self.start() + self.wrote_status = False + self.update() def update(self): if (self.success_counter + self.dead_end_counter) % 50 == 0: - sys.stdout.write("\rscheduling... %d successes, " - "%d dead ends (longest %d)" % ( - self.success_counter, - self.dead_end_counter, - len(self.longest_rejected_schedule))) - sys.stdout.flush() + if self.debug_length or self.elapsed_time() > 1: + sys.stdout.write("\rscheduling... %d successes, " + "%d dead ends (longest %d)" % ( + self.success_counter, + self.dead_end_counter, + len(self.longest_rejected_schedule))) + sys.stdout.flush() + self.wrote_status = True def log_success(self, schedule): self.success_counter += 1 @@ -202,14 +208,26 @@ class SchedulerDebugger: self.update() def done_scheduling(self): - sys.stdout.write("\rscheduler finished \n") - sys.stdout.flush() + if self.wrote_status: + sys.stdout.write("\rscheduler finished"+40*" "+"\n") + sys.stdout.flush() + def elapsed_time(self): + from time import time + return self.elapsed_store + time() - self.start_time + + def stop(self): + from time import time + self.elapsed_store += time()-self.start_time + + def start(self): + from time import time + self.start_time = time() # }}} # {{{ scheduling algorithm -def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=None): +def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_boost=False, debug=None): all_insn_ids = set(insn.id for insn in kernel.instructions) scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule @@ -248,33 +266,32 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N and len(schedule) >= debug.debug_length): debug_mode = True + #print dump_schedule(schedule), len(schedule) if debug_mode: print kernel print "--------------------------------------------" + print "CURRENT SCHEDULE:" print dump_schedule(schedule) + print "--------------------------------------------" - #if len(schedule) == 3: + #if len(schedule) == 2: #from pudb import set_trace; set_trace() - if debug_mode: - print "active:", ",".join(active_inames) - print "entered:", ",".join(entered_inames) - # }}} made_progress = False # {{{ see if any insn can be scheduled now - unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids) - insns_with_satisfied_deps = set() + # Also take note of insns that have a chance of being schedulable inside + # the current loop nest, in this set: - for insn_id in unscheduled_insn_ids: + reachable_insn_ids = set() + + for insn_id in all_insn_ids - scheduled_insn_ids: insn = kernel.id_to_insn[insn_id] schedule_now = set(insn.insn_deps) <= scheduled_insn_ids - if schedule_now: - insns_with_satisfied_deps.add(insn_id) if not schedule_now: if debug_mode: @@ -282,53 +299,60 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N insn.id, ",".join(set(insn.insn_deps) - scheduled_insn_ids)) continue - if insn.boostable == True: - # If insn is boostable, it may be placed inside a more deeply - # nested loop without harm. + # If insn is boostable, it may be placed inside a more deeply + # nested loop without harm. - # But if it can be scheduled on the way *out* of the currently - # active loops, now is not the right moment. + # But if it can be scheduled on the way *out* of the currently + # active loops, now is not the right moment. - schedulable_at_loop_levels = [] + schedulable_at_loop_levels = [] - for active_loop_count in xrange(len(active_inames), -1, -1): - outer_active_inames = set(active_inames[:active_loop_count]) - if ( - kernel.insn_inames(insn) - parallel_inames - <= - outer_active_inames - parallel_inames): + if allow_boost: + try_loop_counts = xrange(len(active_inames), -1, -1) + else: + try_loop_counts = [len(active_inames)] - schedulable_at_loop_levels.append(active_loop_count) + for active_loop_count in try_loop_counts: + outer_active_inames = set(active_inames[:active_loop_count]) - if schedulable_at_loop_levels != [len(active_inames)]: - schedule_now = False - if debug_mode: - if schedulable_at_loop_levels: - print ("instruction '%s' will be scheduled when more " - "loops have been exited" % insn.id) - else: - print ("instruction '%s' is missing inames '%s'" - % (insn.id, ",".join( - (kernel.insn_inames(insn) - parallel_inames) - - - (outer_active_inames - parallel_inames)))) + want = kernel.insn_inames(insn) - parallel_inames + have = outer_active_inames - parallel_inames - insn.boostable_into - elif insn.boostable == False: - # If insn is not boostable, we must insist that it is placed inside - # the exactly correct set of loops. + if allow_boost: + have -= insn.boostable_into - schedule_now = schedule_now and ( - kernel.insn_inames(insn) - parallel_inames - == - active_inames_set - parallel_inames) + if want == have: + schedulable_at_loop_levels.append(active_loop_count) + if schedulable_at_loop_levels != [len(active_inames)]: + schedule_now = False if debug_mode: - print ("instruction '%s' is not boostable and doesn't " - "match the active inames" % insn.id) + if schedulable_at_loop_levels: + print ("instruction '%s' will be scheduled when more " + "loops have been exited" % insn.id) + else: + want = (kernel.insn_inames(insn) - parallel_inames) + have = (active_inames_set - parallel_inames) + if want-have: + print ("instruction '%s' is missing inames '%s'" + % (insn.id, ",".join(want-have))) + if have-want: + print ("instruction '%s' won't work under inames '%s'" + % (insn.id, ",".join(have-want))) + + # {{{ determine reachability + + want = kernel.insn_inames(insn) - parallel_inames + have = active_inames_set - parallel_inames + if (not schedule_now and have <= want): + reachable_insn_ids.add(insn_id) else: - raise RuntimeError("instruction '%s' has undetermined boostability" - % insn.id) + if debug_mode: + print (" '%s' also not reachable because it won't work under '%s'" + % (insn.id, ",".join(have-want))) + + # }}} if schedule_now: scheduled_insn_ids.add(insn.id) @@ -346,6 +370,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] if last_entered_loop in kernel.insn_inames(insn): + if debug_mode: + print("cannot leave '%s' because '%s' still depends on it" + % (last_entered_loop, insn.id)) can_leave = False break @@ -364,6 +391,15 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N - parallel_inames ) + if debug_mode: + print "--------------------------------------------" + print "available :", ",".join(available_loops) + print "active:", ",".join(active_inames) + print "entered:", ",".join(entered_inames) + print "--------------------------------------------" + print "reachable insns:", ",".join(reachable_insn_ids) + print "--------------------------------------------" + # Don't be eager about scheduling new loops--if progress has been made, # revert to top of scheduler and see if more progress can be made another # way. (hence 'and not made_progress') @@ -372,27 +408,17 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N useful_loops = [] for iname in available_loops: - # {{{ determine if that gets us closer to being able to scheduling an insn + # {{{ determine if that gets us closer to being able to schedule an insn useful = False - hypothetical_active_loops = active_inames_set | set([iname]) - for insn_id in unscheduled_insn_ids: - if insn_id not in insns_with_satisfied_deps: - continue - + hypothetically_active_loops = active_inames_set | set([iname]) + for insn_id in reachable_insn_ids: insn = kernel.id_to_insn[insn_id] - if insn.boostable: - # if insn is boostable, just increasing the number of used - # inames is enough--not necessarily all must be truly 'useful'. - - if iname in kernel.insn_inames(insn): - useful = True - break - else: - if hypothetical_active_loops <= kernel.insn_inames(insn): - useful = True - break + if (hypothetically_active_loops + <= (kernel.insn_inames(insn) | insn.boostable_into)): + useful = True + break if not useful: if debug_mode: @@ -425,7 +451,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N if debug_mode: print "useful inames: %s" % ",".join(useful_loops) - raw_input("Enter:") for tier in priority_tiers: found_viable_schedule = False @@ -443,23 +468,32 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N # }}} - if debug_mode: raw_input("Enter:") if not active_inames and not available_loops and not unscheduled_insn_ids: # if done, yield result + debug.log_success(schedule) + yield schedule + elif made_progress: # if not done, but made some progress--try from the top + for sub_sched in generate_loop_schedules_internal( + kernel, loop_priority, schedule, + debug=debug): + yield sub_sched + else: + if not allow_boost: + # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule, - debug=debug): + kernel, loop_priority, schedule=schedule, + allow_boost=True, debug=debug): yield sub_sched - else: - # dead end - if debug is not None: - debug.log_dead_end(schedule) + else: + # dead end + if debug is not None: + debug.log_dead_end(schedule) # }}} @@ -577,7 +611,7 @@ def insert_barriers(kernel, schedule, level=0): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, loop_priority=[], debug=False): +def generate_loop_schedules(kernel, loop_priority=[], debug=None): from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) @@ -586,13 +620,7 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=False): schedule_count = 0 - if debug: - if debug == True: - debug = SchedulerDebugger(None) - else: - debug = SchedulerDebugger(debug) - else: - debug = None + debug = SchedulerDebugger(debug) for gen_sched in generate_loop_schedules_internal(kernel, loop_priority, debug=debug): @@ -605,10 +633,14 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=False): "This often means that local memory was " "written, but never read." % ",".join(owed_barriers), LoopyAdvisory) + debug.stop() yield kernel.copy(schedule=gen_sched) + debug.start() schedule_count += 1 + debug.done_scheduling() + if not schedule_count: raise RuntimeError("no valid schedules found")