diff --git a/MEMO b/MEMO index 1a76b61e040098083629219715aade111f56ad0a..675b8f05793c5b57f6200f753042430b08072434 100644 --- a/MEMO +++ b/MEMO @@ -62,8 +62,6 @@ Things to consider TODO ^^^^ -- Make axpy better. - - implemented_domain may end up being smaller than requested in cse evaluations--check that! @@ -90,6 +88,10 @@ TODO Dealt with ^^^^^^^^^^ +- Allow prioritization of loops in scheduling. + +- Make axpy better. + - Screwy lower bounds in slab decomposition - reimplement add_prefetch diff --git a/loopy/schedule.py b/loopy/schedule.py index cf146782bf59945a857673aef566cfee67918f40..c7e3d57a2d349cbfde2cab0557ac0e078f3b7606 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -600,7 +600,7 @@ def assign_automatic_axes(kernel, only_axis_0=True): # {{{ scheduling algorithm -def generate_loop_schedules_internal(kernel, schedule=[]): +def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): all_insn_ids = set(insn.id for insn in kernel.instructions) scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule @@ -682,7 +682,7 @@ def generate_loop_schedules_internal(kernel, schedule=[]): ) if available_loops: - found_something_useful = False + useful_loops = [] for iname in available_loops: # {{{ determine if that gets us closer to being able to scheduling an insn @@ -699,16 +699,26 @@ def generate_loop_schedules_internal(kernel, schedule=[]): if not useful: continue - found_something_useful = True + useful_loops.append(iname) # }}} + useful_and_desired = set(useful_loops) & set(loop_priority) + if useful_and_desired: + # restrict to the first ('highest-priority') loop that's useful + + for iname in loop_priority: + if iname in useful_and_desired: + useful_loops = [iname] + break + + for iname in useful_loops: new_schedule = schedule + [EnterLoop(iname=iname)] for sub_sched in generate_loop_schedules_internal( - kernel, new_schedule): + kernel, loop_priority, new_schedule): yield sub_sched - if found_something_useful: + if useful_loops: return # }}} @@ -735,7 +745,8 @@ def generate_loop_schedules_internal(kernel, schedule=[]): else: # if not done, but made some progress--try from the top if made_progress: - for sub_sched in generate_loop_schedules_internal(kernel, schedule): + for sub_sched in generate_loop_schedules_internal( + kernel, loop_priority, schedule): yield sub_sched # }}} @@ -843,7 +854,7 @@ def insert_barriers(kernel, schedule, level=0): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel): +def generate_loop_schedules(kernel, loop_priority=[]): kernel = realize_reduction(kernel) # {{{ check that all CSEs have been realized @@ -867,7 +878,7 @@ def generate_loop_schedules(kernel): schedule_count = 0 - for gen_sched in generate_loop_schedules_internal(kernel): + for gen_sched in generate_loop_schedules_internal(kernel, loop_priority): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) assert not owed_barriers diff --git a/test/test_matmul.py b/test/test_matmul.py index b0b3b123a33920d549c477cabfc96ae5c4f2ef4b..0ed5f12878efbe0daa642eef14d3990da06b9a1b 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -131,7 +131,8 @@ def test_axpy(ctx_factory): refsol = (2*a+3*b).get() for variant in [variant_cpu, variant_gpu]: - kernel_gen = lp.generate_loop_schedules(variant(knl)) + kernel_gen = lp.generate_loop_schedules(variant(knl), + loop_priority=["i_inner_outer"]) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) def launcher(kernel, gsize, lsize, check):