diff --git a/MEMO b/MEMO index f09d763150d2b0f7750e37b908e7f5ff66ab3075..3111326d31a33b35639a1dfbb9b564d610da858c 100644 --- a/MEMO +++ b/MEMO @@ -39,6 +39,8 @@ Things to consider To-do ^^^^^ +- CSE should be more like variable assignment + - dim_max caching - Fix all tests @@ -54,6 +56,7 @@ Future ideas - Float4 joining on fetch/store? - How can one automatically generate something like microblocks? + -> Some sort of axis-adding transform? - Better for loop bound generation -> Try a triangular loop @@ -85,6 +88,9 @@ Future ideas Dealt with ^^^^^^^^^^ +- Exhaust the search for a no-boost solution first, before looking + for a schedule with boosts. + - Pick not just axis 0, but all axes by lowest available stride - Scheduler tries too many boostability-related options diff --git a/loopy/schedule.py b/loopy/schedule.py index 67ffbb645c257f4f0ec28aea4e3c7a80dfb1ab9b..0b1fb08fbe8d82a11f3f03d703e8a76eb03206b3 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -232,6 +232,11 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule if isinstance(sched_item, RunInstruction)) + if allow_boost is None: + rec_allow_boost = None + else: + rec_allow_boost = False + # {{{ find active and entered loops active_inames = [] @@ -441,8 +446,10 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b for iname in tier: new_schedule = schedule + [EnterLoop(iname=iname)] + for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, new_schedule, + allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True yield sub_sched @@ -465,10 +472,10 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # if not done, but made some progress--try from the top for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, schedule, - debug=debug): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched else: - if not allow_boost: + if not allow_boost and allow_boost is not None: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( kernel, loop_priority, schedule=schedule, @@ -619,22 +626,31 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=None): debug = SchedulerDebugger(debug) - for gen_sched in generate_loop_schedules_internal(kernel, loop_priority, - debug=debug): - gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) - if owed_barriers: - from warnings import warn - from loopy import LoopyAdvisory - warn("Barrier insertion finished without inserting barriers for " - "local memory writes in these instructions: '%s'. " - "This often means that local memory was " - "written, but never read." % ",".join(owed_barriers), LoopyAdvisory) - - debug.stop() - yield kernel.copy(schedule=gen_sched) - debug.start() - - schedule_count += 1 + generators = [ + generate_loop_schedules_internal(kernel, loop_priority, + debug=debug, allow_boost=None), + generate_loop_schedules_internal(kernel, loop_priority, + debug=debug)] + for gen in generators: + for gen_sched in gen: + gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) + if owed_barriers: + from warnings import warn + from loopy import LoopyAdvisory + warn("Barrier insertion finished without inserting barriers for " + "local memory writes in these instructions: '%s'. " + "This often means that local memory was " + "written, but never read." % ",".join(owed_barriers), LoopyAdvisory) + + debug.stop() + yield kernel.copy(schedule=gen_sched) + debug.start() + + schedule_count += 1 + + # if no-boost mode yielded a viable schedule, stop now + if schedule_count: + break debug.done_scheduling() diff --git a/test/test_linalg.py b/test/test_linalg.py index 07fcde54bc5771f2396e8b6c01ed5c95201977a6..f5d9c2f00e42a78faee6396bde0101d7bb1c1c35 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -368,8 +368,7 @@ def test_rank_one(ctx_factory): seq_knl = knl - #for variant in [variant_1, variant_2, variant_4]: - for variant in [variant_4]: + for variant in [variant_1, variant_2, variant_4]: kernel_gen = lp.generate_loop_schedules(variant(knl)) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))