Expose reduction realization to the user. Add more notes.

8ba555dc · Andreas Klöckner · c8064886 · 8ba555dc · 8ba555dc · 8ba555dc
Commit 8ba555dc authored 13 years ago by Andreas Klöckner
--- a/MEMO
+++ b/MEMO
@@ -42,11 +42,25 @@ To-do
 - What if no universally valid precompute base index expression is found?
  (test_intel_matrix_mul with n = 6*16, e.g.?)

+- "No schedule found" debug help:
+
+  - Find longest dead-end
+  - Automatically report on what hinders progress there
+
+- When duplicating, use iname aliases to relieve burden on isl
+
+- Differentiate ilp.unr from ilp.seq
+
+- Expose iname-duplicate-and-rename as a primitive.
+
 - Fix all tests

 Future ideas
 ^^^^^^^^^^^^

+- How is intra-instruction ordering of ILP loops going to be determined?
+  (taking into account that it could vary even per-instruction?)
+
 - Barriers for data exchanged via global vars?

 - Float4 joining on fetch/store?

--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -114,6 +114,11 @@ Precomputation and Prefetching

    Uses :func:`extract_subst` and :func:`precompute`.

+Manipulating Reductions
+-----------------------
+
+.. autofunction:: realize_reduction
+
 Finishing up
 ------------


--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -25,7 +25,7 @@ from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg
 from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel
 from loopy.subst import extract_subst, apply_subst
 from loopy.cse import precompute
-from loopy.preprocess import preprocess_kernel
+from loopy.preprocess import preprocess_kernel, realize_reduction
 from loopy.schedule import generate_loop_schedules
 from loopy.codegen import generate_code
 from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_ref

--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -149,7 +149,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):

 # }}}

-# {{{ parallel loop
+# {{{ hw-parallel loop

 def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
    from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag

--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -110,7 +110,25 @@ def duplicate_reduction_inames(kernel):

 # {{{ rewrite reduction to imperative form

-def realize_reduction(kernel):
+def realize_reduction(kernel, insn_id_filter=None):
+    """Rewrites reductions into their imperative form. With *insn_id_filter* specified,
+    operate only on the instruction whose id matches *insn_id_filter*.
+
+    If *insn_id_filter* is given, only the outermost level of reductions will be
+    expanded; inner reductions will be left alone (because they end up in a new
+    instruction with a different ID, which doesn't match the filter).
+
+    If *insn_id_filter* is not given, all reductions in all instructions will
+    be realized.
+
+    This routine also implicitly performs (global) reduction iname duplication,
+    if requested by '@' prefixes on any reduction iname.
+    """
+
+    # Reduction iname duplication needs to happen beforehand, and it is
+    # idempotent. So just call it now.
+    kernel = duplicate_reduction_inames(kernel)
+
    new_insns = []
    new_temporary_variables = kernel.temporary_variables.copy()

@@ -130,7 +148,9 @@ def realize_reduction(kernel):

        ilp_iname_lengths = []
        for iname in ilp_inames:
-            # original kernel ok here--we're not messing with inames
+            # Using the original kernel is ok here. Nothing in realize_reduction
+            # messes with inames. This is useful because it takes advantage
+            # of bounds caching.
            bounds = kernel.get_iname_bounds(iname)

            from loopy.symbolic import pw_aff_to_expr
@@ -206,6 +226,10 @@ def realize_reduction(kernel):

        insn = insn_queue.pop(0)

+        if insn_id_filter is not None and insn.id != insn_id_filter:
+            new_insns.append(insn)
+            continue
+
        # Run reduction expansion.
        new_expression = cb_mapper(insn.expression)

@@ -681,9 +705,8 @@ def preprocess_kernel(kernel):
    from loopy.subst import apply_subst
    kernel = apply_subst(kernel)

-    kernel = mark_local_temporaries(kernel)
-    kernel = duplicate_reduction_inames(kernel)
    kernel = realize_reduction(kernel)
+    kernel = mark_local_temporaries(kernel)
    kernel = assign_automatic_axes(kernel)
    kernel = add_boostability_and_automatic_dependencies(kernel)
    kernel = limit_boostability(kernel)

--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -267,6 +267,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
    # {{{ decide about debug mode

    debug_mode = False
+    #if len(schedule) == 15:
+        #debug_mode = True
+
    if debug is not None:
        if (debug.debug_length is not None
                and len(schedule) >= debug.debug_length):
@@ -313,6 +316,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
        # nested loop without harm.

        if allow_boost:
+            # Note that the inames in 'insn.boostable_into' necessarily won't
+            # be contained in 'want'.
            have = have - insn.boostable_into

        if want != have:
@@ -330,10 +335,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b

        if (not schedule_now and have <= want):
            reachable_insn_ids.add(insn_id)
-        else:
-            if debug_mode:
-                print ("    '%s' also not reachable because it won't work under '%s'"
-                        % (insn.id, ",".join(have-want)))

        # }}}