From de71343d20e9b26386f6e3e7266007f5603466d2 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 21 Feb 2012 01:27:38 +0100
Subject: [PATCH] Make ILP go through regular loop emission, implement ilp.seq.

Introduce 'breakable' loops.
---
 MEMO                         |  10 ++-
 doc/reference.rst            |  12 +++-
 loopy/check.py               |  14 ++--
 loopy/codegen/instruction.py | 113 ++++---------------------------
 loopy/kernel.py              |  13 ++--
 loopy/preprocess.py          |  44 +++++++++++-
 loopy/schedule.py            | 126 ++++++++++++++++++++++++-----------
 7 files changed, 177 insertions(+), 155 deletions(-)

diff --git a/MEMO b/MEMO
index 66f5f2f4e..b2dd2d070 100644
--- a/MEMO
+++ b/MEMO
@@ -46,8 +46,6 @@ To-do
 
 - When duplicating inames, use iname aliases to relieve burden on isl
 
-- Differentiate ilp.unr from ilp.seq
-
 - Add dependencies after the fact
 
 - bug? with fetching only g[j,*] inside j loop
@@ -56,6 +54,8 @@ To-do
 
 - nbody GPU
   -> pending better prefetch spec
+  - Prefetch by sample access
+  - Exclude by precompute name
 
 - Expose iname-duplicate-and-rename as a primitive.
 
@@ -66,9 +66,13 @@ To-do
 
 - Fix all tests
 
+- Scalar insn priority
+
 Future ideas
 ^^^^^^^^^^^^
 
+- Check for unordered (no-dependency) writes to the same location
+
 - String instructions?
 
 - How is intra-instruction ordering of ILP loops going to be determined?
@@ -111,6 +115,8 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- Differentiate ilp.unr from ilp.seq
+
 - Allow complex-valued arithmetic, despite CL's best efforts.
 
 - "No schedule found" debug help:
diff --git a/doc/reference.rst b/doc/reference.rst
index 0e5387719..0c5128490 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -21,6 +21,14 @@ Expressions
 Assignments and Substitution Rules
 ----------------------------------
 
+Inames
+------
+
+Loops are (by default) entered exactly once. This is necessary to preserve
+dependency semantics--otherwise e.g. a fetch could happen inside one loop nest,
+and then the instruction using that fetch could be inside a wholly different
+loop nest.
+
 Tags
 ----
 
@@ -38,9 +46,9 @@ Tag                   Meaning
 
 (Throughout this table, `N` must be replaced by an actual number.)
 
-ILP is really three things combined:
+"ILP" does three things:
 
-* Restricts loops to be innermost (excludes them from scheduling)
+* Restricts loops to be innermost
 * Duplicates reduction storage for any reductions nested around ILP usage
 * Causes a loop (unrolled or not) to be opened/generated for each
   involved instruction
diff --git a/loopy/check.py b/loopy/check.py
index c0bfeca13..454f3450f 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -133,13 +133,15 @@ def check_for_write_races(kernel):
                         assignee_inames & local_parallel_insn_inames)
 
             elif temp_var.is_local == False:
-                ilp_inames = set(
-                        iname
-                        for iname in kernel.insn_inames(insn)
-                        if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag))
+                #from loopy.kernel import IlpBaseTag
+                #ilp_inames = set(
+                        #iname
+                        #for iname in kernel.insn_inames(insn)
+                        #if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag))
+
+                #inames_without_write_dep = ilp_inames - assignee_inames
 
-                inames_without_write_dep = ilp_inames - (
-                        assignee_inames & ilp_inames)
+                inames_without_write_dep = set()
 
             else:
                 raise RuntimeError("temp var '%s' hasn't decided on "
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 688ae2c7f..0970fa5c4 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -1,115 +1,26 @@
 """Code generation for Instruction objects."""
 from __future__ import division
 
-from pytools import Record
-import islpy as isl
-
-
-
-
-# {{{ ILP instance
-
-class ILPInstance(Record):
-    """
-    :ivar ilp_key: a frozenset of tuples (iname, assignment)
-    """
-    __slots__ = ["implemented_domain", "assignments", "ilp_key"]
-
-    def __init__(self, implemented_domain, assignments, ilp_key):
-        Record.__init__(self,
-                implemented_domain=implemented_domain,
-                assignments=assignments,
-                ilp_key=ilp_key)
-
-    def fix(self, iname, aff):
-        from loopy.isl_helpers import iname_rel_aff
-        iname_plus_lb_aff = iname_rel_aff(
-                self.implemented_domain.get_space(), iname, "==", aff)
-
-        from loopy.symbolic import pw_aff_to_expr
-        expr = pw_aff_to_expr(aff)
-
-        cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff)
-
-        new_assignments = self.assignments.copy()
-        new_assignments[iname] = expr
-        return ILPInstance(
-                implemented_domain=self.implemented_domain.add_constraint(cns),
-                assignments=new_assignments,
-                ilp_key=self.ilp_key | set([(iname, expr)]))
-
-# }}}
-
-
-
-
-def generate_ilp_instances(kernel, insn, codegen_state):
-    impl_domain = codegen_state.implemented_domain
-
-    from loopy.kernel import IlpTag
-
-    result = [ILPInstance(impl_domain, {}, frozenset())]
-
-    # {{{ pass 2: treat all ILP dimensions
-
-    for iname in kernel.insn_inames(insn):
-        tag = kernel.iname_to_tag.get(iname)
-
-        if not isinstance(tag, IlpTag):
-            continue
-
-
-        bounds = kernel.get_iname_bounds(iname)
-
-        from loopy.isl_helpers import (
-                static_max_of_pw_aff, static_value_of_pw_aff)
-        from loopy.symbolic import pw_aff_to_expr
-
-        length = int(pw_aff_to_expr(
-            static_max_of_pw_aff(bounds.size, constants_only=True)))
-        lower_bound_aff = static_value_of_pw_aff(
-                bounds.lower_bound_pw_aff.coalesce(),
-                constants_only=False)
-
-        new_result = []
-        for ilpi in result:
-            for i in range(length):
-                idx_aff = lower_bound_aff + i
-                new_result.append(ilpi.fix(iname, idx_aff))
-
-        result = new_result
-
-    # }}}
-
-    return result
-
 
 
 
 def generate_instruction_code(kernel, insn, codegen_state):
-    result = []
     from loopy.codegen import GeneratedInstruction
 
-    for ilpi in generate_ilp_instances(kernel, insn, codegen_state):
-        ccm = codegen_state.c_code_mapper.copy_and_assign_many(ilpi.assignments)
-
-        # FIXME we should probably share some checks across ILP instances
-
-        from cgen import Assign
-        insn_code = Assign(ccm(insn.assignee), ccm(insn.expression))
-        from loopy.codegen.bounds import wrap_in_bounds_checks
-        insn_code, impl_domain = wrap_in_bounds_checks(
-                ccm, kernel.domain, kernel.insn_inames(insn), ilpi.implemented_domain,
-                insn_code)
-
-        result.append(GeneratedInstruction(
-            insn_id=insn.id,
-            implemented_domain=impl_domain,
-            ast=insn_code))
+    ccm = codegen_state.c_code_mapper
 
-    from loopy.codegen import gen_code_block
-    return gen_code_block(result)
+    from cgen import Assign
+    insn_code = Assign(ccm(insn.assignee), ccm(insn.expression))
+    from loopy.codegen.bounds import wrap_in_bounds_checks
+    insn_code, impl_domain = wrap_in_bounds_checks(
+            ccm, kernel.domain, kernel.insn_inames(insn),
+            codegen_state.implemented_domain,
+            insn_code)
 
+    return GeneratedInstruction(
+        insn_id=insn.id,
+        implemented_domain=impl_domain,
+        ast=insn_code)
 
 
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 8b5010c44..f5985bf56 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -492,6 +492,8 @@ class LoopKernel(Record):
         length 16.
     :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
         objects
+    :ivar lowest_priority_inames:
+    :ivar breakable_inames: these inames' loops may be broken up by the scheduler
 
     :ivar cache_manager:
 
@@ -508,7 +510,7 @@ class LoopKernel(Record):
             temporary_variables={},
             local_sizes={},
             iname_to_tag={}, iname_to_tag_requests=None, substitutions={},
-            cache_manager=None):
+            cache_manager=None, lowest_priority_inames=[], breakable_inames=set()):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
@@ -703,7 +705,9 @@ class LoopKernel(Record):
                 iname_to_tag=iname_to_tag,
                 iname_to_tag_requests=iname_to_tag_requests,
                 substitutions=substitutions,
-                cache_manager=cache_manager)
+                cache_manager=cache_manager,
+                lowest_priority_inames=lowest_priority_inames,
+                breakable_inames=breakable_inames)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
         if insns is None:
@@ -763,8 +767,6 @@ class LoopKernel(Record):
                     implicit_inames = None
 
                     for writer_id in writers[tv_name]:
-                        #writer_insn = self.id_to_insn[writer_id]
-
                         writer_implicit_inames = (
                                 insn_id_to_inames[writer_id]
                                 - insn_assignee_inames[writer_id])
@@ -1077,6 +1079,9 @@ class LoopKernel(Record):
         for inner_iname in self.all_inames():
             result[inner_iname] = set()
             for outer_iname in self.all_inames():
+                if outer_iname in self.breakable_inames:
+                    continue
+
                 if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]:
                     result[inner_iname].add(outer_iname)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 40c702fb9..5c86369a3 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -6,6 +6,40 @@ import pyopencl.characterize as cl_char
 
 
 
+# {{{ transform ilp into lower-level constructs
+
+def realize_ilp(kernel):
+    from loopy.kernel import (
+
+            UnrolledIlpTag, UnrollTag, LoopedIlpTag)
+    ILP_TO_BASE_TAG = {
+            UnrolledIlpTag: UnrollTag,
+            LoopedIlpTag: None,
+            }
+
+    lpi = kernel.lowest_priority_inames[:]
+    breakable_inames = kernel.breakable_inames.copy()
+
+    new_iname_to_tag = kernel.iname_to_tag.copy()
+    for iname in kernel.all_inames():
+        tag = kernel.iname_to_tag.get(iname)
+        if type(tag) in ILP_TO_BASE_TAG:
+            new_tag_cls = ILP_TO_BASE_TAG[type(tag)]
+            if new_tag_cls is None:
+                new_iname_to_tag[iname] = None
+            else:
+                new_iname_to_tag[iname] = new_tag_cls()
+
+            lpi.append(iname)
+            breakable_inames.add(iname)
+
+    return kernel.copy(
+            iname_to_tag=new_iname_to_tag,
+            lowest_priority_inames=lpi,
+            breakable_inames=breakable_inames)
+
+# }}}
+
 # {{{ local temporary finding
 
 def mark_local_temporaries(kernel):
@@ -138,7 +172,7 @@ def realize_reduction(kernel, insn_id_filter=None):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
-        # {{{ see if this reduction is nested around some ILP loops
+        # {{{ see if this reduction is nested inside some ILP loops
 
         ilp_inames = [iname
                 for iname in temp_kernel.insn_inames(insn)
@@ -706,6 +740,14 @@ def preprocess_kernel(kernel):
     kernel = apply_subst(kernel)
 
     kernel = realize_reduction(kernel)
+
+    # Ordering restriction:
+    # Must realize reductions before realizing ILP, because realize_ilp()
+    # gets rid of ILP tags, but realize_reduction() needs them to do
+    # reduction variable duplication.
+
+    kernel = realize_ilp(kernel)
+
     kernel = mark_local_temporaries(kernel)
     kernel = assign_automatic_axes(kernel)
     kernel = add_boostability_and_automatic_dependencies(kernel)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 4814f9729..f38928817 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -274,7 +274,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                 and len(schedule) >= debug.debug_length):
             debug_mode = True
 
-    #print dump_schedule(schedule), len(schedule)
     if debug_mode:
         print 75*"="
         print "KERNEL:"
@@ -292,8 +291,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
     # }}}
 
-    made_progress = False
-
     # {{{ see if any insn can be scheduled now
 
     # Also take note of insns that have a chance of being schedulable inside
@@ -301,7 +298,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
     reachable_insn_ids = set()
 
-    for insn_id in all_insn_ids - scheduled_insn_ids:
+    unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids
+
+    for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
         schedule_now = set(insn.insn_deps) <= scheduled_insn_ids
@@ -346,56 +345,100 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                 print "scheduling '%s'" % insn.id
             scheduled_insn_ids.add(insn.id)
             schedule = schedule + [RunInstruction(insn_id=insn.id)]
-            made_progress = True
+
+            # Don't be eager about entering/leaving loops--if progress has been
+            # made, revert to top of scheduler and see if more progress can be
+            # made.
+
+            for sub_sched in generate_loop_schedules_internal(
+                    kernel, loop_priority, schedule,
+                    allow_boost=rec_allow_boost, debug=debug):
+                yield sub_sched
+
+            return
 
     unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids)
 
     # }}}
 
-    # {{{ see if we're ready to leave a loop
+    # {{{ see if we're ready to leave the innermost loop
 
     if  last_entered_loop is not None:
         can_leave = True
-        for insn_id in unscheduled_insn_ids:
-            insn = kernel.id_to_insn[insn_id]
-            if last_entered_loop in kernel.insn_inames(insn):
-                if debug_mode:
-                    print("cannot leave '%s' because '%s' still depends on it"
-                            % (last_entered_loop, insn.id))
-                can_leave = False
-                break
+
+        if last_entered_loop not in kernel.breakable_inames:
+            # If the iname is not breakable, then check that we've
+            # scheduled all the instructions that require it.
+
+            for insn_id in unscheduled_insn_ids:
+                insn = kernel.id_to_insn[insn_id]
+                if last_entered_loop in kernel.insn_inames(insn):
+                    if debug_mode:
+                        print("cannot leave '%s' because '%s' still depends on it"
+                                % (last_entered_loop, insn.id))
+                    can_leave = False
+                    break
 
         if can_leave:
-            schedule = schedule + [LeaveLoop(iname=last_entered_loop)]
-            made_progress = True
+            can_leave = False
+
+            # We may only leave this loop if we've scheduled an instruction
+            # since entering it.
+
+            seen_an_insn = False
+            ignore_count = 0
+            for sched_item in schedule[::-1]:
+                if isinstance(sched_item, RunInstruction):
+                    seen_an_insn = True
+                elif isinstance(sched_item, LeaveLoop):
+                    ignore_count +=1
+                elif isinstance(sched_item, EnterLoop):
+                    if ignore_count:
+                        ignore_count -= 1
+                    else:
+                        assert sched_item.iname == last_entered_loop
+                        if seen_an_insn:
+                            can_leave = True
+                        break
+
+            if can_leave:
+                schedule = schedule + [LeaveLoop(iname=last_entered_loop)]
+
+                for sub_sched in generate_loop_schedules_internal(
+                        kernel, loop_priority, schedule,
+                        allow_boost=rec_allow_boost, debug=debug):
+                    yield sub_sched
+
+                return
 
     # }}}
 
     # {{{ see if any loop can be entered now
 
-    available_loops = (kernel.all_referenced_inames()
-            # loops can only be entered once
-            - entered_inames
-            # there's no notion of 'entering' a parallel loop
+    # Find inames that are being referenced by as yet unscheduled instructions.
+    needed_inames = set()
+    for insn_id in unscheduled_insn_ids:
+        needed_inames.update(kernel.insn_inames(insn_id))
+
+    needed_inames = (needed_inames
+            # There's no notion of 'entering' a parallel loop
             - parallel_inames
-            )
+
+            # Don't reenter a loop we're already in.
+            - active_inames_set)
 
     if debug_mode:
         print 75*"-"
-        print "available inames :", ",".join(available_loops)
+        print "inames still needed :", ",".join(needed_inames)
         print "active inames :", ",".join(active_inames)
         print "inames entered so far :", ",".join(entered_inames)
         print "reachable insns:", ",".join(reachable_insn_ids)
         print 75*"-"
 
-    # Don't be eager about scheduling new loops--if progress has been made,
-    # revert to top of scheduler and see if more progress can be made another
-    # way. (hence 'and not made_progress')
-
-    if available_loops and not made_progress:
+    if needed_inames:
         useful_loops = []
 
-        for iname in available_loops:
+        for iname in needed_inames:
             if not kernel.loop_nest_map()[iname] <= active_inames_set | parallel_inames:
                 continue
 
@@ -428,17 +471,28 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         # loops in the second are not even tried (and so on).
 
         loop_priority_set = set(loop_priority)
-        useful_and_desired = set(useful_loops) & loop_priority_set
+        lowest_priority_set = set(kernel.lowest_priority_inames)
+        useful_loops_set = set(useful_loops)
+        useful_and_desired = useful_loops_set & loop_priority_set
 
         if useful_and_desired:
             priority_tiers = [[iname]
                     for iname in loop_priority
-                    if iname in useful_and_desired]
+                    if iname in useful_and_desired
+                    and iname not in kernel.lowest_priority_inames]
 
             priority_tiers.append(
-                    set(useful_loops) - loop_priority_set)
+                    set(useful_loops)
+                    - loop_priority_set
+                    - lowest_priority_set)
         else:
-            priority_tiers = [useful_loops]
+            priority_tiers = [set(useful_loops) - lowest_priority_set]
+
+        priority_tiers.extend([
+            [iname]
+            for iname in kernel.lowest_priority_inames
+            if iname in useful_loops
+            ])
 
         # }}}
 
@@ -467,18 +521,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         print 75*"="
         raw_input("Hit Enter for next schedule:")
 
-    if not active_inames and not available_loops and not unscheduled_insn_ids:
+    if not active_inames and not unscheduled_insn_ids:
         # if done, yield result
         debug.log_success(schedule)
 
         yield schedule
 
-    elif made_progress:
-        # if not done, but made some progress--try from the top
-        for sub_sched in generate_loop_schedules_internal(
-                kernel, loop_priority, schedule,
-                allow_boost=rec_allow_boost, debug=debug):
-            yield sub_sched
     else:
         if not allow_boost and allow_boost is not None:
             # try again with boosting allowed
-- 
GitLab