From 89f3ee561a8022123247af3bc892737ec91dfd44 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Fri, 24 Aug 2012 23:58:34 -0400 Subject: [PATCH] Add, use instruction priority. Add {options} instruction syntax. --- MEMO | 10 ++- doc/reference.rst | 7 +++ loopy/__init__.py | 76 +++++++++++++++++++++++ loopy/kernel.py | 77 +++++++++++++++++------- loopy/schedule.py | 72 ++++++++++++++-------- test/test_linalg.py | 48 ++++++--------- test/test_loopy.py | 2 +- {proto-tests => test}/test_sem_reagan.py | 0 8 files changed, 206 insertions(+), 86 deletions(-) rename {proto-tests => test}/test_sem_reagan.py (100%) diff --git a/MEMO b/MEMO index aeeb733a0..d9d823b65 100644 --- a/MEMO +++ b/MEMO @@ -41,8 +41,6 @@ Things to consider To-do ^^^^^ -- Clean up loopy.kernel. - - Group instructions by dependency/inames for scheduling, to increase sched. scalability @@ -51,10 +49,6 @@ To-do - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) -- Add dependencies after the fact - -- Scalar insn priority - - If finding a maximum proves troublesome, move parameters into the domain - : (as in, Matlab full-slice) in prefetches @@ -111,6 +105,10 @@ Future ideas Dealt with ^^^^^^^^^^ +- Add dependencies after the fact + +- Scalar insn priority + - ScalarArg is a bad name -> renamed to ValueArg diff --git a/doc/reference.rst b/doc/reference.rst index 69b1b0fe2..9b9784bef 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -182,6 +182,13 @@ Manipulating Reductions .. autofunction:: realize_reduction +Manipulating Instructions +------------------------- + +.. autofunction:: set_instruction_priority + +.. autofunction:: add_dependency + Finishing up ------------ diff --git a/loopy/__init__.py b/loopy/__init__.py index 43c13f8f6..dc6c10c85 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -263,6 +263,9 @@ def tag_dimensions(kernel, iname_to_tag, force=False): new_iname_to_tag = kernel.iname_to_tag.copy() for iname, new_tag in iname_to_tag.iteritems(): + if iname not in kernel.all_inames(): + raise RuntimeError("iname '%s' does not exist" % iname) + old_tag = kernel.iname_to_tag.get(iname) retag_ok = False @@ -422,6 +425,79 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel +# }}} + +# {{{ instruction processing + +class _IdMatch(object): + def __init__(self, value): + self.value = value + +class _ExactIdMatch(_IdMatch): + def __call__(self, insn): + return insn.id == self.value + +class _ReIdMatch: + def __call__(self, insn): + return self.value.match(insn.id) is not None + +def _parse_insn_match(insn_match): + import re + colon_idx = insn_match.find(":") + if colon_idx == -1: + return _ExactIdMatch(insn_match) + + match_tp = insn_match[:colon_idx] + match_val = insn_match[colon_idx+1:] + + if match_tp == "glob": + from fnmatch import translate + return _ReIdMatch(re.compile(translate(match_val))) + elif match_tp == "re": + return _ReIdMatch(re.compile(match_val)) + else: + raise ValueError("match type '%s' not understood" % match_tp) + + + + +def find_instructions(kernel, insn_match): + match = _parse_insn_match(insn_match) + return [insn for insn in kernel.instructions if match(insn)] + +def map_instructions(kernel, insn_match, f): + match = _parse_insn_match(insn_match) + + new_insns = [] + + for insn in kernel.instructions: + if match(insn): + new_insns.append(f(insn)) + else: + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +def set_instruction_priority(kernel, insn_match, priority): + """Set the priority of instructions matching *insn_match* to *priority*. + + *insn_match* may be an instruction id, a regular expression prefixed by `re:`, + or a file-name-style glob prefixed by `glob:`. + """ + + def set_prio(insn): return insn.copy(priority=priority) + return map_instructions(kernel, insn_match, set_prio) + +def add_dependency(kernel, insn_match, dependency): + """Add the instruction dependency *dependency* to the instructions matched + by *insn_match*. + + *insn_match* may be an instruction id, a regular expression prefixed by `re:`, + or a file-name-style glob prefixed by `glob:`. + """ + + def add_dep(insn): return insn.copy(insn_deps=insn.insn_deps + [dependency]) + return map_instructions(kernel, insn_match, add_dep) # }}} diff --git a/loopy/kernel.py b/loopy/kernel.py index a6d8095bb..c14459298 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -296,6 +296,7 @@ class Instruction(Record): of the program. Allowed values are *None* (for unknown), *True*, and *False*. :ivar boostable_into: a set of inames into which the instruction may need to be boosted, as a heuristic help for the scheduler. + :ivar priority: scheduling priority The following two instance variables are only used until :func:`loopy.make_kernel` is finished: @@ -309,7 +310,8 @@ class Instruction(Record): id, assignee, expression, forced_iname_deps=frozenset(), insn_deps=set(), boostable=None, boostable_into=None, - temp_var_type=None, duplicate_inames_and_tags=[]): + temp_var_type=None, duplicate_inames_and_tags=[], + priority=0): from loopy.symbolic import parse if isinstance(assignee, str): @@ -325,7 +327,9 @@ class Instruction(Record): forced_iname_deps=forced_iname_deps, insn_deps=insn_deps, boostable=boostable, boostable_into=boostable_into, - temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags) + temp_var_type=temp_var_type, + duplicate_inames_and_tags=duplicate_inames_and_tags, + priority=priority) @memoize_method def reduction_inames(self): @@ -358,8 +362,12 @@ class Instruction(Record): else: raise RuntimeError("unexpected value for Instruction.boostable") + options = [] + if self.insn_deps: - result += "\n : " + ", ".join(self.insn_deps) + options.append("deps="+":".join(self.insn_deps)) + if self.priority: + options.append("priority=%d" % self.priority) return result @@ -644,7 +652,7 @@ class LoopKernel(Record): were applied to the kernel. These are stored so that they may be repeated on expressions the user specifies later. :ivar cache_manager: - :ivar lowest_priority_inames: + :ivar lowest_priority_inames: (used internally to realize ILP) :ivar breakable_inames: these inames' loops may be broken up by the scheduler The following instance variables are only used until :func:`loopy.make_kernel` is @@ -695,14 +703,13 @@ class LoopKernel(Record): INAME_ENTRY_RE = re.compile( r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$") INSN_RE = re.compile( - r"^\s*(?:(?P<label>\w+):)?" "\s*(?:\[" "(?P<iname_deps_and_tags>[\s\w,:.]*)" "(?:\|(?P<duplicate_inames_and_tags>[\s\w,:.]*))?" "\])?" "\s*(?:\<(?P<temp_var_type>.*?)\>)?" "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)" - "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$" + "\s*?(?:\{(?P<options>[\s\w=,:]+)\}\s*)?$" ) SUBST_RE = re.compile( r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$" @@ -738,7 +745,7 @@ class LoopKernel(Record): insn_match = INSN_RE.match(insn) subst_match = SUBST_RE.match(insn) if insn_match is not None and subst_match is not None: - raise RuntimeError("insn parse error") + raise RuntimeError("instruction parse error: %s" % insn) if insn_match is not None: groups = insn_match.groupdict() @@ -752,15 +759,33 @@ class LoopKernel(Record): rhs = parse(groups["rhs"]) if insn_match is not None: - if groups["label"] is not None: - label = groups["label"] - else: - label = "insn" - - if groups["insn_deps"] is not None: - insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(",")) - else: - insn_deps = set() + insn_deps = set() + insn_id = "insn" + priority = 0 + + if groups["options"] is not None: + for option in groups["options"].split(","): + option = option.strip() + if not option: + raise RuntimeError("empty option supplied") + + equal_idx = option.find("=") + if equal_idx == -1: + opt_key = option + opt_value = None + else: + opt_key = option[:equal_idx].strip() + opt_value = option[equal_idx+1:].strip() + + if opt_key == "id": + insn_id = opt_value + elif opt_key == "priority": + priority = int(opt_value) + elif opt_key == "dep": + insn_deps = opt_value.split(":") + else: + raise ValueError("unrecognized instruction option '%s'" + % opt_key) if groups["iname_deps_and_tags"] is not None: inames_and_tags = parse_iname_and_tag_list( @@ -792,12 +817,14 @@ class LoopKernel(Record): parsed_instructions.append( Instruction( - id=self.make_unique_instruction_id(parsed_instructions, based_on=label), + id=self.make_unique_instruction_id( + parsed_instructions, based_on=insn_id), insn_deps=insn_deps, forced_iname_deps=forced_iname_deps, assignee=lhs, expression=rhs, temp_var_type=temp_var_type, - duplicate_inames_and_tags=duplicate_inames_and_tags)) + duplicate_inames_and_tags=duplicate_inames_and_tags, + priority=priority)) elif subst_match is not None: from pymbolic.primitives import Variable, Call @@ -1378,8 +1405,8 @@ class LoopKernel(Record): all_inames_by_insns |= self.insn_inames(insn) if not all_inames_by_insns <= self.all_inames(): - raise RuntimeError("inames collected from instructions (%s) " - "that are not present in domain (%s)" + raise RuntimeError("some inames collected from instructions (%s) " + "are not present in domain (%s)" % (", ".join(sorted(all_inames_by_insns)), ", ".join(sorted(self.all_inames())))) @@ -1552,14 +1579,20 @@ class LoopKernel(Record): loop_list_width = 35 for insn in self.instructions: loop_list = ",".join(sorted(self.insn_inames(insn))) + + options = [insn.id] + if insn.priority: + options.append("priority=%d" % insn.priority) + if len(loop_list) > loop_list_width: lines.append("[%s]" % loop_list) lines.append("%s%s <- %s # %s" % ( - (loop_list_width+2)*" ", insn.assignee, insn.expression, insn.id)) + (loop_list_width+2)*" ", insn.assignee, + insn.expression, ", ".join(options))) else: lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), - insn.assignee, insn.expression, insn.id)) + insn.assignee, insn.expression, ", ".join(options))) lines.append(sep) lines.append("DEPENDENCIES:") diff --git a/loopy/schedule.py b/loopy/schedule.py index a6bc240c4..62b0214ce 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -236,12 +236,18 @@ class ScheduleDebugger: # {{{ scheduling algorithm -def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_boost=False, debug=None): +def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], + allow_boost=False, allow_insn=False, debug=None): + # allow_insn is set to False initially and after entering each loop + # to give loops containing high-priority instructions a chance. + all_insn_ids = set(insn.id for insn in kernel.instructions) scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule if isinstance(sched_item, RunInstruction)) + unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids + if allow_boost is None: rec_allow_boost = None else: @@ -298,21 +304,22 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # }}} - # {{{ see if any insn can be scheduled now + # {{{ see if any insns are ready to be scheduled now # Also take note of insns that have a chance of being schedulable inside # the current loop nest, in this set: reachable_insn_ids = set() - unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids + for insn_id in sorted(unscheduled_insn_ids, + key=lambda insn_id: kernel.id_to_insn[insn_id].priority, + reverse=True): - for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] - schedule_now = set(insn.insn_deps) <= scheduled_insn_ids + is_ready = set(insn.insn_deps) <= scheduled_insn_ids - if not schedule_now: + if not is_ready: if debug_mode: print "instruction '%s' is missing insn depedencies '%s'" % ( insn.id, ",".join(set(insn.insn_deps) - scheduled_insn_ids)) @@ -330,7 +337,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b have = have - insn.boostable_into if want != have: - schedule_now = False + is_ready = False if debug_mode: if want-have: @@ -342,12 +349,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # {{{ determine reachability - if (not schedule_now and have <= want): + if (not is_ready and have <= want): reachable_insn_ids.add(insn_id) # }}} - if schedule_now: + if is_ready and allow_insn: if debug_mode: print "scheduling '%s'" % insn.id scheduled_insn_ids.add(insn.id) @@ -359,13 +366,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, schedule, - allow_boost=rec_allow_boost, debug=debug): + allow_boost=rec_allow_boost, debug=debug, + allow_insn=True): yield sub_sched return - unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids) - # }}} # {{{ see if we're ready to leave the innermost loop @@ -413,7 +419,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, schedule, - allow_boost=rec_allow_boost, debug=debug): + allow_boost=rec_allow_boost, debug=debug, + allow_insn=allow_insn): yield sub_sched return @@ -443,7 +450,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b print 75*"-" if needed_inames: - useful_loops = [] + iname_to_usefulness = {} for iname in needed_inames: @@ -483,7 +490,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # {{{ determine if that gets us closer to being able to schedule an insn - useful = False + usefulness = None # highest insn priority enabled by iname hypothetically_active_loops = active_inames_set | set([iname]) for insn_id in reachable_insn_ids: @@ -492,15 +499,17 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b want = kernel.insn_inames(insn) | insn.boostable_into if hypothetically_active_loops <= want: - useful = True - break + if usefulness is None: + usefulness = insn.priority + else: + usefulness = max(usefulness, insn.priority) - if not useful: + if usefulness is None: if debug_mode: print "iname '%s' deemed not useful" % iname continue - useful_loops.append(iname) + iname_to_usefulness[iname] = usefulness # }}} @@ -511,7 +520,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b loop_priority_set = set(loop_priority) lowest_priority_set = set(kernel.lowest_priority_inames) - useful_loops_set = set(useful_loops) + useful_loops_set = set(iname_to_usefulness.iterkeys()) useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: @@ -521,27 +530,29 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b and iname not in kernel.lowest_priority_inames] priority_tiers.append( - set(useful_loops) + useful_loops_set - loop_priority_set - lowest_priority_set) else: - priority_tiers = [set(useful_loops) - lowest_priority_set] + priority_tiers = [useful_loops_set - lowest_priority_set] priority_tiers.extend([ [iname] for iname in kernel.lowest_priority_inames - if iname in useful_loops + if iname in useful_loops_set ]) # }}} if debug_mode: - print "useful inames: %s" % ",".join(useful_loops) + print "useful inames: %s" % ",".join(useful_loops_set) for tier in priority_tiers: found_viable_schedule = False - for iname in tier: + for iname in sorted(tier, + key=lambda iname: iname_to_usefulness.get(iname, 0), + reverse=True): new_schedule = schedule + [EnterLoop(iname=iname)] for sub_sched in generate_loop_schedules_internal( @@ -567,11 +578,20 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b yield schedule else: + if not allow_insn: + # try again with boosting allowed + for sub_sched in generate_loop_schedules_internal( + kernel, loop_priority, schedule=schedule, + allow_boost=allow_boost, debug=debug, + allow_insn=True): + yield sub_sched + if not allow_boost and allow_boost is not None: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, schedule=schedule, - allow_boost=True, debug=debug): + allow_boost=True, debug=debug, + allow_insn=allow_insn): yield sub_sched else: # dead end diff --git a/test/test_linalg.py b/test/test_linalg.py index 74bd1c501..6cad0b080 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -240,7 +240,7 @@ def test_variable_size_matrix_mul(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ - "label: c[i, j] = sum_float32(k, a[i, k]*b[k, j])" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j]) {id=labl}" ], [ lp.GlobalArg("a", dtype, shape=(n, n), order=order), @@ -291,7 +291,7 @@ def test_rank_one(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j]: 0<=i,j<n}", [ - "label: c[i, j] = a[i]*b[j]" + "c[i, j] = a[i]*b[j] {id=mylabel, priority =5}" ], [ lp.GlobalArg("a", dtype, shape=(n,), order=order), @@ -478,62 +478,48 @@ def test_intel_matrix_mul(ctx_factory): def test_magma_fermi_matrix_mul(ctx_factory): - 1/0 # not updated to new conventions - dtype = np.float32 ctx = ctx_factory() order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - n = 6*16*16 + n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" + "c[i, j] = sum(k, a[i, k]*b[k, j])" ], [ - lp.ImageArg("a", dtype, 2), - lp.ImageArg("b", dtype, 2), + lp.ImageArg("a", dtype, shape=(n, n)), + lp.ImageArg("b", dtype, shape=(n, n)), lp.GlobalArg("c", dtype, shape=(n, n), order=order), ], name="matmul") + seq_knl = knl + i_reg = 4 j_reg = 4 i_chunks = 16 j_chunks = 16 + + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_dimension(knl, "k", 16) - #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") - knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) - knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") + # FIXME + #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) + #knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) kernel_gen = lp.generate_loop_schedules(knl) - #hints=["k_outer", "k_inner_outer", "k_inner_inner"] kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - a_img = cl.image_from_array(ctx, a.get(), 1) - b_img = cl.image_from_array(ctx, b.get(), 1) - c = cl_array.empty_like(a) - refsol = np.dot(a.get(), b.get()) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + op_count=[2*n**3/1e9], op_label=["GFlops"], + parameters={}) diff --git a/test/test_loopy.py b/test/test_loopy.py index 0639bbfec..ffad62f6a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -41,7 +41,7 @@ def test_wg_too_small(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i]: 0<=i<100}", [ - "[i:l.0] <float32> z[i] = a[i]" + "[i:l.0] <float32> z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], local_sizes={0: 16}) diff --git a/proto-tests/test_sem_reagan.py b/test/test_sem_reagan.py similarity index 100% rename from proto-tests/test_sem_reagan.py rename to test/test_sem_reagan.py -- GitLab