From 8ba555dc04022bbc7d10227f87e0c0af719182fc Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 18 Jan 2012 01:35:34 -0500
Subject: [PATCH] Expose reduction realization to the user. Add more notes.

---
 MEMO                  | 14 ++++++++++++++
 doc/reference.rst     |  5 +++++
 loopy/__init__.py     |  2 +-
 loopy/codegen/loop.py |  2 +-
 loopy/preprocess.py   | 31 +++++++++++++++++++++++++++----
 loopy/schedule.py     |  9 +++++----
 6 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/MEMO b/MEMO
index d6300e1dc..ebd410f4f 100644
--- a/MEMO
+++ b/MEMO
@@ -42,11 +42,25 @@ To-do
 - What if no universally valid precompute base index expression is found?
   (test_intel_matrix_mul with n = 6*16, e.g.?)
 
+- "No schedule found" debug help:
+
+  - Find longest dead-end
+  - Automatically report on what hinders progress there
+
+- When duplicating, use iname aliases to relieve burden on isl
+
+- Differentiate ilp.unr from ilp.seq
+
+- Expose iname-duplicate-and-rename as a primitive.
+
 - Fix all tests
 
 Future ideas
 ^^^^^^^^^^^^
 
+- How is intra-instruction ordering of ILP loops going to be determined?
+  (taking into account that it could vary even per-instruction?)
+
 - Barriers for data exchanged via global vars?
 
 - Float4 joining on fetch/store?
diff --git a/doc/reference.rst b/doc/reference.rst
index c8844f4b8..71062604e 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -114,6 +114,11 @@ Precomputation and Prefetching
 
     Uses :func:`extract_subst` and :func:`precompute`.
 
+Manipulating Reductions
+-----------------------
+
+.. autofunction:: realize_reduction
+
 Finishing up
 ------------
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 912532ae1..94616f25d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -25,7 +25,7 @@ from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg
 from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel
 from loopy.subst import extract_subst, apply_subst
 from loopy.cse import precompute
-from loopy.preprocess import preprocess_kernel
+from loopy.preprocess import preprocess_kernel, realize_reduction
 from loopy.schedule import generate_loop_schedules
 from loopy.codegen import generate_code
 from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_ref
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index fb94fbeca..d1ffea7a3 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -149,7 +149,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
 
 # }}}
 
-# {{{ parallel loop
+# {{{ hw-parallel loop
 
 def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
     from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index a9e39c9fb..43e93ddd0 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -110,7 +110,25 @@ def duplicate_reduction_inames(kernel):
 
 # {{{ rewrite reduction to imperative form
 
-def realize_reduction(kernel):
+def realize_reduction(kernel, insn_id_filter=None):
+    """Rewrites reductions into their imperative form. With *insn_id_filter* specified,
+    operate only on the instruction whose id matches *insn_id_filter*.
+
+    If *insn_id_filter* is given, only the outermost level of reductions will be
+    expanded; inner reductions will be left alone (because they end up in a new
+    instruction with a different ID, which doesn't match the filter).
+
+    If *insn_id_filter* is not given, all reductions in all instructions will
+    be realized.
+
+    This routine also implicitly performs (global) reduction iname duplication,
+    if requested by '@' prefixes on any reduction iname.
+    """
+
+    # Reduction iname duplication needs to happen beforehand, and it is
+    # idempotent. So just call it now.
+    kernel = duplicate_reduction_inames(kernel)
+
     new_insns = []
     new_temporary_variables = kernel.temporary_variables.copy()
 
@@ -130,7 +148,9 @@ def realize_reduction(kernel):
 
         ilp_iname_lengths = []
         for iname in ilp_inames:
-            # original kernel ok here--we're not messing with inames
+            # Using the original kernel is ok here. Nothing in realize_reduction
+            # messes with inames. This is useful because it takes advantage
+            # of bounds caching.
             bounds = kernel.get_iname_bounds(iname)
 
             from loopy.symbolic import pw_aff_to_expr
@@ -206,6 +226,10 @@ def realize_reduction(kernel):
 
         insn = insn_queue.pop(0)
 
+        if insn_id_filter is not None and insn.id != insn_id_filter:
+            new_insns.append(insn)
+            continue
+
         # Run reduction expansion.
         new_expression = cb_mapper(insn.expression)
 
@@ -681,9 +705,8 @@ def preprocess_kernel(kernel):
     from loopy.subst import apply_subst
     kernel = apply_subst(kernel)
 
-    kernel = mark_local_temporaries(kernel)
-    kernel = duplicate_reduction_inames(kernel)
     kernel = realize_reduction(kernel)
+    kernel = mark_local_temporaries(kernel)
     kernel = assign_automatic_axes(kernel)
     kernel = add_boostability_and_automatic_dependencies(kernel)
     kernel = limit_boostability(kernel)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 64204f20d..9bdf9ca5c 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -267,6 +267,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
     # {{{ decide about debug mode
 
     debug_mode = False
+    #if len(schedule) == 15:
+        #debug_mode = True
+
     if debug is not None:
         if (debug.debug_length is not None
                 and len(schedule) >= debug.debug_length):
@@ -313,6 +316,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         # nested loop without harm.
 
         if allow_boost:
+            # Note that the inames in 'insn.boostable_into' necessarily won't
+            # be contained in 'want'.
             have = have - insn.boostable_into
 
         if want != have:
@@ -330,10 +335,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
         if (not schedule_now and have <= want):
             reachable_insn_ids.add(insn_id)
-        else:
-            if debug_mode:
-                print ("    '%s' also not reachable because it won't work under '%s'"
-                        % (insn.id, ",".join(have-want)))
 
         # }}}
 
-- 
GitLab