From 3cecf117943dbdba68e0a2e684ad0620a954fb83 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 24 Oct 2011 10:45:20 -0400
Subject: [PATCH] Rename loopy.codegen.{dispatch->control}.

---
 loopy/codegen/__init__.py |   2 +-
 loopy/codegen/control.py  | 242 ++++++++++++++++++++++++++++++++++++++
 loopy/codegen/loop.py     |  10 +-
 3 files changed, 245 insertions(+), 9 deletions(-)
 create mode 100644 loopy/codegen/control.py

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 46a17999c..093df9921 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -271,7 +271,7 @@ def generate_code(kernel):
         """),
         Line()])
 
-    # {{{ build lmem array declarators for prefetches
+    # {{{ build lmem array declarators for temporary variables
 
     for tv in kernel.temporary_variables.itervalues():
         temp_var_decl = POD(tv.dtype, tv.name)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
new file mode 100644
index 000000000..2b6767db9
--- /dev/null
+++ b/loopy/codegen/control.py
@@ -0,0 +1,242 @@
+"""Loop nest build top-level control/hoisting."""
+from __future__ import division
+
+from loopy.codegen import CodeGenerationState, gen_code_block
+import islpy as isl
+
+
+
+
+def get_admissible_conditional_inames_for(kernel, sched_index):
+    """This function disallows conditionals on local-idx tagged
+    inames if there is a barrier nested somewhere within.
+    """
+
+    from loopy.kernel import LocalIndexTag, HardwareParallelTag
+    # Start from whatever find_active_inames_at reports for this schedule index.
+    from loopy.schedule import find_active_inames_at, has_barrier_within
+    result = find_active_inames_at(kernel, sched_index)
+    # has_barrier decides below whether LocalIndexTag inames may be added.
+    has_barrier = has_barrier_within(kernel, sched_index)
+    # Add hardware-parallel inames, except LocalIndexTag ones when has_barrier is set.
+    for iname, tag in kernel.iname_to_tag.iteritems():
+        if isinstance(tag, HardwareParallelTag):
+            if not has_barrier or not isinstance(tag, LocalIndexTag):
+                result.add(iname)
+
+    return result
+
+
+
+
+def generate_code_for_sched_index(kernel, sched_index, codegen_state):
+    from loopy.schedule import (EnterLoop, RunInstruction, Barrier)
+    # Dispatch on the type of the schedule item found at sched_index.
+    sched_item = kernel.schedule[sched_index]
+
+    if isinstance(sched_item, EnterLoop):
+        tag = kernel.iname_to_tag.get(sched_item.iname)
+        # The loop's tag selects the generator: unroll vs. sequential.
+        from loopy.codegen.loop import (
+                generate_unroll_loop,
+                generate_sequential_loop_dim_code)
+
+        from loopy.kernel import UnrollTag, SequentialTag
+        if isinstance(tag, UnrollTag):
+            func = generate_unroll_loop
+        elif tag is None or isinstance(tag, SequentialTag):
+            func = generate_sequential_loop_dim_code
+        else:
+            raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'"
+                    % (sched_item.iname, tag))
+        # Only untagged, sequential or unroll-tagged loops reach this call.
+        return func(kernel, sched_index, codegen_state)
+    # A Barrier becomes one barrier(CLK_LOCAL_MEM_FENCE) statement, domain None.
+    elif isinstance(sched_item, Barrier):
+        from loopy.codegen import GeneratedInstruction
+        from cgen import Statement as S
+        return GeneratedInstruction(
+                ast=S("barrier(CLK_LOCAL_MEM_FENCE)"),
+                implemented_domain=None)
+    # A RunInstruction is resolved to its instruction via kernel.id_to_insn.
+    elif isinstance(sched_item, RunInstruction):
+        insn = kernel.id_to_insn[sched_item.insn_id]
+
+        from loopy.codegen.instruction import generate_instruction_code
+        return generate_instruction_code(kernel, insn, codegen_state)
+    # Anything else is not a schedule item this function knows how to emit.
+    else:
+        raise RuntimeError("unexpected schedule item type: %s"
+                % type(sched_item))
+
+
+
+
+def remove_inames_for_shared_hw_axes(kernel, cond_inames):
+    """
+    See if cond_inames contains references to two (or more) inames that
+    boil down to the same tag. If so, exclude them. (We shouldn't be writing
+    conditionals for such inames because we would be implicitly restricting
+    the other inames as well.)
+    """
+
+    tag_key_use_count = {}
+    # Maps tag.key -> number of inames in cond_inames whose tag has that key.
+    from loopy.kernel import HardwareParallelTag
+
+    for iname in cond_inames:
+        tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(tag, HardwareParallelTag):
+            tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1
+    # Keys carried by more than one iname of cond_inames.
+    multi_use_keys = set(
+            key for key, count in tag_key_use_count.iteritems()
+            if count > 1)
+    # Second pass: collect every iname whose hardware-parallel tag key is shared.
+    multi_use_inames = set()
+    for iname in cond_inames:
+        tag = kernel.iname_to_tag.get(iname)
+        if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
+            multi_use_inames.add(iname)
+    # Uses '-' against a set, so cond_inames must support set difference.
+    return cond_inames - multi_use_inames
+
+
+
+
+def build_loop_nest(kernel, sched_index, codegen_state):
+    # Most of the complexity of this function goes towards finding groups of
+    # instructions that can be nested inside a shared conditional.
+    # The result is gen_code_block() applied to the list of generated groups.
+    assert isinstance(codegen_state, CodeGenerationState)
+
+    from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
+            gather_schedule_subloop)
+
+    # {{{ pass 1: pre-scan schedule for my schedule items' indices
+
+    my_sched_indices = []
+
+    while sched_index < len(kernel.schedule):
+        sched_item = kernel.schedule[sched_index]
+        # Stop scanning at the first LeaveLoop encountered.
+        if isinstance(sched_item, LeaveLoop):
+            break
+
+        my_sched_indices.append(sched_index)
+        # Advance: loops jump to the index from gather_schedule_subloop, others by 1.
+        if isinstance(sched_item, EnterLoop):
+            _, sched_index = gather_schedule_subloop(
+                    kernel.schedule, sched_index)
+        elif isinstance(sched_item, Barrier):
+            sched_index += 1
+
+        elif isinstance(sched_item, RunInstruction):
+            sched_index += 1
+        else:
+            raise RuntimeError("unexpected schedule item type: %s"
+                    % type(sched_item))
+
+    # }}}
+
+    # {{{ pass 2: find admissible conditional inames for each schedule item
+
+    admissible_cond_inames = [
+            get_admissible_conditional_inames_for(kernel, sched_index)
+            for sched_index in my_sched_indices]
+
+    # }}}
+
+    # {{{ pass 3: greedily group schedule items that share admissible inames
+
+    def build_insn_group(sched_indices_and_cond_inames, codegen_state,
+            min_iname_count=1):
+        # min_iname_count serves to prevent infinite recursion by imposing a
+        # bigger and bigger minimum size on the group of shared inames found.
+
+        if not sched_indices_and_cond_inames:
+            return []
+        # The first item seeds the group; following items may join it.
+        sched_index, cond_inames = sched_indices_and_cond_inames[0]
+
+        # {{{ grow schedule item group
+
+        # Keep growing schedule item group as long as group fulfills minimum
+        # size requirement.
+
+        current_iname_set = cond_inames
+        # idx is the exclusive end of the group within the list (always >= 1).
+        idx = 1
+        while (len(current_iname_set) >= min_iname_count
+                and idx < len(sched_indices_and_cond_inames)):
+            other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx]
+            new_iname_set = current_iname_set & other_cond_inames
+
+            if len(new_iname_set) >= min_iname_count:
+                idx += 1
+                current_iname_set = new_iname_set
+            else:
+                break
+
+        # }}}
+
+        if len(current_iname_set) >= min_iname_count:
+            # Success: found a big enough group of inames for a conditional.
+            # See if there are bounds checks available for that set.
+
+            # {{{ see which inames were actually used in group
+
+            # And only generate conditionals for those.
+            from loopy.schedule import find_used_inames_within
+            used_inames = set()
+            for subsched_index, _ in sched_indices_and_cond_inames[0:idx]:
+                used_inames |= find_used_inames_within(kernel, subsched_index)
+
+            # }}}
+
+            from loopy.codegen.bounds import generate_bounds_checks
+            bounds_checks = generate_bounds_checks(kernel.domain,
+                    remove_inames_for_shared_hw_axes(kernel,
+                        current_iname_set & used_inames),
+                    codegen_state.implemented_domain)
+        else:
+            bounds_checks = []
+        # Narrow the codegen state by the constraints that are about to be checked.
+        if bounds_checks:
+            check_set = isl.BasicSet.universe(kernel.space)
+            for cns in bounds_checks:
+                check_set = check_set.add_constraint(cns)
+
+            new_codegen_state = codegen_state.intersect(check_set)
+        else:
+            new_codegen_state = codegen_state
+        # Generate the group body: directly for one item, recursively otherwise.
+        if idx == 1:
+            # group only contains starting schedule item
+            result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)]
+        else:
+            # recurse with a bigger minimum iname count
+            result = build_insn_group(sched_indices_and_cond_inames[0:idx],
+                    new_codegen_state, len(current_iname_set)+1)
+        # Wrap the body in an 'if' on the bounds checks, if any were generated.
+        if bounds_checks:
+            from loopy.codegen import wrap_in_if
+            from loopy.codegen.bounds import constraint_to_code
+            result = [wrap_in_if(
+                    [constraint_to_code(codegen_state.c_code_mapper, cns) for cns in bounds_checks],
+                    gen_code_block(result))]
+        # Remaining items restart with the incoming state and default min_iname_count.
+        return result + build_insn_group(
+                sched_indices_and_cond_inames[idx:], codegen_state)
+
+    # }}}
+
+    return gen_code_block(
+            build_insn_group(zip(
+                my_sched_indices, admissible_cond_inames), codegen_state))
+
+
+
+
+# vim: foldmethod=marker
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index f050b4b20..ade8d9fe5 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -1,11 +1,8 @@
 from __future__ import division
 
-import numpy as np
-from loopy.codegen import CodeGenerationState, gen_code_block
-from pytools import Record
+from loopy.codegen import gen_code_block
 import islpy as isl
-from islpy import dim_type
-from loopy.codegen.dispatch import build_loop_nest
+from loopy.codegen.control import build_loop_nest
 
 
 
@@ -40,8 +37,6 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 
     lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0))
 
-    # {{{ build slabs
-
     iname_tp, iname_idx = kernel.iname_to_dim[iname]
 
     constraints = [lb_cns_orig]
@@ -167,7 +162,6 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=
                 for iname in kernel.all_inames()
                 if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
 
-    from loopy.codegen.dispatch import build_loop_nest
     if not hw_inames_left:
         return build_loop_nest(kernel, sched_index, codegen_state)
 
-- 
GitLab