From de71343d20e9b26386f6e3e7266007f5603466d2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Tue, 21 Feb 2012 01:27:38 +0100 Subject: [PATCH] Make ILP go through regular loop emission, implement ilp.seq. Introduce 'breakable' loops. --- MEMO | 10 ++- doc/reference.rst | 12 +++- loopy/check.py | 14 ++-- loopy/codegen/instruction.py | 113 ++++--------------------------- loopy/kernel.py | 13 ++-- loopy/preprocess.py | 44 +++++++++++- loopy/schedule.py | 126 ++++++++++++++++++++++++----------- 7 files changed, 177 insertions(+), 155 deletions(-) diff --git a/MEMO b/MEMO index 66f5f2f4e..b2dd2d070 100644 --- a/MEMO +++ b/MEMO @@ -46,8 +46,6 @@ To-do - When duplicating inames, use iname aliases to relieve burden on isl -- Differentiate ilp.unr from ilp.seq - - Add dependencies after the fact - bug? with fetching only g[j,*] inside j loop @@ -56,6 +54,8 @@ To-do - nbody GPU -> pending better prefetch spec + - Prefetch by sample access + - Exclude by precompute name - Expose iname-duplicate-and-rename as a primitive. @@ -66,9 +66,13 @@ To-do - Fix all tests +- Scalar insn priority + Future ideas ^^^^^^^^^^^^ +- Check for unordered (no-dependency) writes to the same location + - String instructions? - How is intra-instruction ordering of ILP loops going to be determined? @@ -111,6 +115,8 @@ Future ideas Dealt with ^^^^^^^^^^ +- Differentiate ilp.unr from ilp.seq + - Allow complex-valued arithmetic, despite CL's best efforts. - "No schedule found" debug help: diff --git a/doc/reference.rst b/doc/reference.rst index 0e5387719..0c5128490 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -21,6 +21,14 @@ Expressions Assignments and Substitution Rules ---------------------------------- +Inames +------ + +Loops are (by default) entered exactly once. This is necessary to preserve +dependency semantics--otherwise e.g. a fetch could happen inside one loop nest, +and then the instruction using that fetch could be inside a wholly different +loop nest. 
+ Tags ---- @@ -38,9 +46,9 @@ Tag Meaning (Throughout this table, `N` must be replaced by an actual number.) -ILP is really three things combined: +"ILP" does three things: -* Restricts loops to be innermost (excludes them from scheduling) +* Restricts loops to be innermost * Duplicates reduction storage for any reductions nested around ILP usage * Causes a loop (unrolled or not) to be opened/generated for each involved instruction diff --git a/loopy/check.py b/loopy/check.py index c0bfeca13..454f3450f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -133,13 +133,15 @@ def check_for_write_races(kernel): assignee_inames & local_parallel_insn_inames) elif temp_var.is_local == False: - ilp_inames = set( - iname - for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + #from loopy.kernel import IlpBaseTag + #ilp_inames = set( + #iname + #for iname in kernel.insn_inames(insn) + #if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + + #inames_without_write_dep = ilp_inames - assignee_inames - inames_without_write_dep = ilp_inames - ( - assignee_inames & ilp_inames) + inames_without_write_dep = set() else: raise RuntimeError("temp var '%s' hasn't decided on " diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 688ae2c7f..0970fa5c4 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -1,115 +1,26 @@ """Code generation for Instruction objects.""" from __future__ import division -from pytools import Record -import islpy as isl - - - - -# {{{ ILP instance - -class ILPInstance(Record): - """ - :ivar ilp_key: a frozenset of tuples (iname, assignment) - """ - __slots__ = ["implemented_domain", "assignments", "ilp_key"] - - def __init__(self, implemented_domain, assignments, ilp_key): - Record.__init__(self, - implemented_domain=implemented_domain, - assignments=assignments, - ilp_key=ilp_key) - - def fix(self, iname, aff): - from loopy.isl_helpers import iname_rel_aff - 
iname_plus_lb_aff = iname_rel_aff( - self.implemented_domain.get_space(), iname, "==", aff) - - from loopy.symbolic import pw_aff_to_expr - expr = pw_aff_to_expr(aff) - - cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff) - - new_assignments = self.assignments.copy() - new_assignments[iname] = expr - return ILPInstance( - implemented_domain=self.implemented_domain.add_constraint(cns), - assignments=new_assignments, - ilp_key=self.ilp_key | set([(iname, expr)])) - -# }}} - - - - -def generate_ilp_instances(kernel, insn, codegen_state): - impl_domain = codegen_state.implemented_domain - - from loopy.kernel import IlpTag - - result = [ILPInstance(impl_domain, {}, frozenset())] - - # {{{ pass 2: treat all ILP dimensions - - for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) - - if not isinstance(tag, IlpTag): - continue - - - bounds = kernel.get_iname_bounds(iname) - - from loopy.isl_helpers import ( - static_max_of_pw_aff, static_value_of_pw_aff) - from loopy.symbolic import pw_aff_to_expr - - length = int(pw_aff_to_expr( - static_max_of_pw_aff(bounds.size, constants_only=True))) - lower_bound_aff = static_value_of_pw_aff( - bounds.lower_bound_pw_aff.coalesce(), - constants_only=False) - - new_result = [] - for ilpi in result: - for i in range(length): - idx_aff = lower_bound_aff + i - new_result.append(ilpi.fix(iname, idx_aff)) - - result = new_result - - # }}} - - return result - def generate_instruction_code(kernel, insn, codegen_state): - result = [] from loopy.codegen import GeneratedInstruction - for ilpi in generate_ilp_instances(kernel, insn, codegen_state): - ccm = codegen_state.c_code_mapper.copy_and_assign_many(ilpi.assignments) - - # FIXME we should probably share some checks across ILP instances - - from cgen import Assign - insn_code = Assign(ccm(insn.assignee), ccm(insn.expression)) - from loopy.codegen.bounds import wrap_in_bounds_checks - insn_code, impl_domain = wrap_in_bounds_checks( - ccm, kernel.domain, 
kernel.insn_inames(insn), ilpi.implemented_domain, - insn_code) - - result.append(GeneratedInstruction( - insn_id=insn.id, - implemented_domain=impl_domain, - ast=insn_code)) + ccm = codegen_state.c_code_mapper - from loopy.codegen import gen_code_block - return gen_code_block(result) + from cgen import Assign + insn_code = Assign(ccm(insn.assignee), ccm(insn.expression)) + from loopy.codegen.bounds import wrap_in_bounds_checks + insn_code, impl_domain = wrap_in_bounds_checks( + ccm, kernel.domain, kernel.insn_inames(insn), + codegen_state.implemented_domain, + insn_code) + return GeneratedInstruction( + insn_id=insn.id, + implemented_domain=impl_domain, + ast=insn_code) diff --git a/loopy/kernel.py b/loopy/kernel.py index 8b5010c44..f5985bf56 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -492,6 +492,8 @@ class LoopKernel(Record): length 16. :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule` objects + :ivar lowest_priority_inames: + :ivar breakable_inames: these inames' loops may be broken up by the scheduler :ivar cache_manager: @@ -508,7 +510,7 @@ class LoopKernel(Record): temporary_variables={}, local_sizes={}, iname_to_tag={}, iname_to_tag_requests=None, substitutions={}, - cache_manager=None): + cache_manager=None, lowest_priority_inames=[], breakable_inames=set()): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. 
Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" @@ -703,7 +705,9 @@ class LoopKernel(Record): iname_to_tag=iname_to_tag, iname_to_tag_requests=iname_to_tag_requests, substitutions=substitutions, - cache_manager=cache_manager) + cache_manager=cache_manager, + lowest_priority_inames=lowest_priority_inames, + breakable_inames=breakable_inames) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: @@ -763,8 +767,6 @@ class LoopKernel(Record): implicit_inames = None for writer_id in writers[tv_name]: - #writer_insn = self.id_to_insn[writer_id] - writer_implicit_inames = ( insn_id_to_inames[writer_id] - insn_assignee_inames[writer_id]) @@ -1077,6 +1079,9 @@ class LoopKernel(Record): for inner_iname in self.all_inames(): result[inner_iname] = set() for outer_iname in self.all_inames(): + if outer_iname in self.breakable_inames: + continue + if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: result[inner_iname].add(outer_iname) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 40c702fb9..5c86369a3 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -6,6 +6,40 @@ import pyopencl.characterize as cl_char +# {{{ transform ilp into lower-level constructs + +def realize_ilp(kernel): + from loopy.kernel import ( + + UnrolledIlpTag, UnrollTag, LoopedIlpTag) + ILP_TO_BASE_TAG = { + UnrolledIlpTag: UnrollTag, + LoopedIlpTag: None, + } + + lpi = kernel.lowest_priority_inames[:] + breakable_inames = kernel.breakable_inames.copy() + + new_iname_to_tag = kernel.iname_to_tag.copy() + for iname in kernel.all_inames(): + tag = kernel.iname_to_tag.get(iname) + if type(tag) in ILP_TO_BASE_TAG: + new_tag_cls = ILP_TO_BASE_TAG[type(tag)] + if new_tag_cls is None: + new_iname_to_tag[iname] = None + else: + new_iname_to_tag[iname] = new_tag_cls() + + lpi.append(iname) + breakable_inames.add(iname) + + return kernel.copy( + iname_to_tag=new_iname_to_tag, + lowest_priority_inames=lpi, + 
breakable_inames=breakable_inames) + +# }}} + # {{{ local temporary finding def mark_local_temporaries(kernel): @@ -138,7 +172,7 @@ def realize_reduction(kernel, insn_id_filter=None): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. - # {{{ see if this reduction is nested around some ILP loops + # {{{ see if this reduction is nested inside some ILP loops ilp_inames = [iname for iname in temp_kernel.insn_inames(insn) @@ -706,6 +740,14 @@ def preprocess_kernel(kernel): kernel = apply_subst(kernel) kernel = realize_reduction(kernel) + + # Ordering restriction: + # Must realize reductions before realizing ILP, because realize_ilp() + # gets rid of ILP tags, but realize_reduction() needs them to do + # reduction variable duplication. + + kernel = realize_ilp(kernel) + kernel = mark_local_temporaries(kernel) kernel = assign_automatic_axes(kernel) kernel = add_boostability_and_automatic_dependencies(kernel) diff --git a/loopy/schedule.py b/loopy/schedule.py index 4814f9729..f38928817 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -274,7 +274,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b and len(schedule) >= debug.debug_length): debug_mode = True - #print dump_schedule(schedule), len(schedule) if debug_mode: print 75*"=" print "KERNEL:" @@ -292,8 +291,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # }}} - made_progress = False - # {{{ see if any insn can be scheduled now # Also take note of insns that have a chance of being schedulable inside @@ -301,7 +298,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b reachable_insn_ids = set() - for insn_id in all_insn_ids - scheduled_insn_ids: + unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids + + for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] schedule_now = set(insn.insn_deps) <= 
scheduled_insn_ids @@ -346,56 +345,100 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b print "scheduling '%s'" % insn.id scheduled_insn_ids.add(insn.id) schedule = schedule + [RunInstruction(insn_id=insn.id)] - made_progress = True + + # Don't be eager about entering/leaving loops--if progress has been + # made, revert to top of scheduler and see if more progress can be + # made. + + for sub_sched in generate_loop_schedules_internal( + kernel, loop_priority, schedule, + allow_boost=rec_allow_boost, debug=debug): + yield sub_sched + + return unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids) # }}} - # {{{ see if we're ready to leave a loop + # {{{ see if we're ready to leave the innermost loop if last_entered_loop is not None: can_leave = True - for insn_id in unscheduled_insn_ids: - insn = kernel.id_to_insn[insn_id] - if last_entered_loop in kernel.insn_inames(insn): - if debug_mode: - print("cannot leave '%s' because '%s' still depends on it" - % (last_entered_loop, insn.id)) - can_leave = False - break + + if last_entered_loop not in kernel.breakable_inames: + # If the iname is not breakable, then check that we've + # scheduled all the instructions that require it. + + for insn_id in unscheduled_insn_ids: + insn = kernel.id_to_insn[insn_id] + if last_entered_loop in kernel.insn_inames(insn): + if debug_mode: + print("cannot leave '%s' because '%s' still depends on it" + % (last_entered_loop, insn.id)) + can_leave = False + break if can_leave: - schedule = schedule + [LeaveLoop(iname=last_entered_loop)] - made_progress = True + can_leave = False + + # We may only leave this loop if we've scheduled an instruction + # since entering it. 
+ + seen_an_insn = False + ignore_count = 0 + for sched_item in schedule[::-1]: + if isinstance(sched_item, RunInstruction): + seen_an_insn = True + elif isinstance(sched_item, LeaveLoop): + ignore_count +=1 + elif isinstance(sched_item, EnterLoop): + if ignore_count: + ignore_count -= 1 + else: + assert sched_item.iname == last_entered_loop + if seen_an_insn: + can_leave = True + break + + if can_leave: + schedule = schedule + [LeaveLoop(iname=last_entered_loop)] + + for sub_sched in generate_loop_schedules_internal( + kernel, loop_priority, schedule, + allow_boost=rec_allow_boost, debug=debug): + yield sub_sched + + return # }}} # {{{ see if any loop can be entered now - available_loops = (kernel.all_referenced_inames() - # loops can only be entered once - - entered_inames - # there's no notion of 'entering' a parallel loop + # Find inames that are being referenced by as yet unscheduled instructions. + needed_inames = set() + for insn_id in unscheduled_insn_ids: + needed_inames.update(kernel.insn_inames(insn_id)) + + needed_inames = (needed_inames + # There's no notion of 'entering' a parallel loop - parallel_inames - ) + + # Don't reenter a loop we're already in. + - active_inames_set) if debug_mode: print 75*"-" - print "available inames :", ",".join(available_loops) + print "inames still needed :", ",".join(needed_inames) print "active inames :", ",".join(active_inames) print "inames entered so far :", ",".join(entered_inames) print "reachable insns:", ",".join(reachable_insn_ids) print 75*"-" - # Don't be eager about scheduling new loops--if progress has been made, - # revert to top of scheduler and see if more progress can be made another - # way. 
(hence 'and not made_progress') - - if available_loops and not made_progress: + if needed_inames: useful_loops = [] - for iname in available_loops: + for iname in needed_inames: if not kernel.loop_nest_map()[iname] <= active_inames_set | parallel_inames: continue @@ -428,17 +471,28 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # loops in the second are not even tried (and so on). loop_priority_set = set(loop_priority) - useful_and_desired = set(useful_loops) & loop_priority_set + lowest_priority_set = set(kernel.lowest_priority_inames) + useful_loops_set = set(useful_loops) + useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: priority_tiers = [[iname] for iname in loop_priority - if iname in useful_and_desired] + if iname in useful_and_desired + and iname not in kernel.lowest_priority_inames] priority_tiers.append( - set(useful_loops) - loop_priority_set) + set(useful_loops) + - loop_priority_set + - lowest_priority_set) else: - priority_tiers = [useful_loops] + priority_tiers = [set(useful_loops) - lowest_priority_set] + + priority_tiers.extend([ + [iname] + for iname in kernel.lowest_priority_inames + if iname in useful_loops + ]) # }}} @@ -467,18 +521,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b print 75*"=" raw_input("Hit Enter for next schedule:") - if not active_inames and not available_loops and not unscheduled_insn_ids: + if not active_inames and not unscheduled_insn_ids: # if done, yield result debug.log_success(schedule) yield schedule - elif made_progress: - # if not done, but made some progress--try from the top - for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule, - allow_boost=rec_allow_boost, debug=debug): - yield sub_sched else: if not allow_boost and allow_boost is not None: # try again with boosting allowed -- GitLab