From 73b991f1282dc2987efb5c61741ce7827e7254ae Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 9 Nov 2011 09:54:20 -0500
Subject: [PATCH] Try to schedule entirely without boosting before
 allowing it.

---
 MEMO                |  6 ++++++
 loopy/schedule.py   | 52 +++++++++++++++++++++++++++++----------------
 test/test_linalg.py |  3 +--
 3 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/MEMO b/MEMO
index f09d76315..3111326d3 100644
--- a/MEMO
+++ b/MEMO
@@ -39,6 +39,8 @@ Things to consider
 To-do
 ^^^^^
 
+- CSE should be more like variable assignment
+
 - dim_max caching
 
 - Fix all tests
@@ -54,6 +56,7 @@ Future ideas
 - Float4 joining on fetch/store?
 
 - How can one automatically generate something like microblocks?
+  -> Some sort of axis-adding transform?
 
 - Better for loop bound generation
   -> Try a triangular loop
@@ -85,6 +88,9 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- Exhaust the search for a no-boost solution first, before looking
+  for a schedule with boosts.
+
 - Pick not just axis 0, but all axes by lowest available stride
 
 - Scheduler tries too many boostability-related options
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 67ffbb645..0b1fb08fb 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -232,6 +232,11 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
     scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule
             if isinstance(sched_item, RunInstruction))
 
+    if allow_boost is None:
+        rec_allow_boost = None
+    else:
+        rec_allow_boost = False
+
     # {{{ find active and entered loops
 
     active_inames = []
@@ -441,8 +446,10 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
             for iname in tier:
                 new_schedule = schedule + [EnterLoop(iname=iname)]
+
                 for sub_sched in generate_loop_schedules_internal(
                         kernel, loop_priority, new_schedule,
+                        allow_boost=rec_allow_boost,
                         debug=debug):
                     found_viable_schedule = True
                     yield sub_sched
@@ -465,10 +472,10 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         # if not done, but made some progress--try from the top
         for sub_sched in generate_loop_schedules_internal(
                 kernel, loop_priority, schedule,
-                debug=debug):
+                allow_boost=rec_allow_boost, debug=debug):
             yield sub_sched
     else:
-        if not allow_boost:
+        if not allow_boost and allow_boost is not None:
             # try again with boosting allowed
             for sub_sched in generate_loop_schedules_internal(
                     kernel, loop_priority, schedule=schedule,
@@ -619,22 +626,31 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=None):
 
     debug = SchedulerDebugger(debug)
 
-    for gen_sched in generate_loop_schedules_internal(kernel, loop_priority,
-            debug=debug):
-        gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
-        if owed_barriers:
-            from warnings import warn
-            from loopy import LoopyAdvisory
-            warn("Barrier insertion finished without inserting barriers for "
-                    "local memory writes in these instructions: '%s'. "
-                    "This often means that local memory was "
-                    "written, but never read." % ",".join(owed_barriers), LoopyAdvisory)
-
-        debug.stop()
-        yield kernel.copy(schedule=gen_sched)
-        debug.start()
-
-        schedule_count += 1
+    generators = [
+            generate_loop_schedules_internal(kernel, loop_priority,
+                debug=debug, allow_boost=None),
+            generate_loop_schedules_internal(kernel, loop_priority,
+                debug=debug)]
+    for gen in generators:
+        for gen_sched in gen:
+            gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
+            if owed_barriers:
+                from warnings import warn
+                from loopy import LoopyAdvisory
+                warn("Barrier insertion finished without inserting barriers for "
+                        "local memory writes in these instructions: '%s'. "
+                        "This often means that local memory was "
+                        "written, but never read." % ",".join(owed_barriers), LoopyAdvisory)
+
+            debug.stop()
+            yield kernel.copy(schedule=gen_sched)
+            debug.start()
+
+            schedule_count += 1
+
+        # if no-boost mode yielded a viable schedule, stop now
+        if schedule_count:
+            break
 
     debug.done_scheduling()
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 07fcde54b..f5d9c2f00 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -368,8 +368,7 @@ def test_rank_one(ctx_factory):
 
     seq_knl = knl
 
-    #for variant in [variant_1, variant_2, variant_4]:
-    for variant in [variant_4]:
+    for variant in [variant_1, variant_2, variant_4]:
         kernel_gen = lp.generate_loop_schedules(variant(knl))
         kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
-- 
GitLab