From 85194627bf520b3feaa4829c9773a8da3cc6842a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 25 Oct 2011 01:46:41 -0400
Subject: [PATCH] Revamp conditional hoist algorithm.

The previous algorithm could miss hoisting opportunities: when a larger
group of candidate inames existed but did not, in turn, lead to a viable
hoisted condition, smaller viable groups were never tried.
---
 MEMO                      |   3 -
 loopy/codegen/control.py  |  66 ++++++-----
 loopy/codegen/dispatch.py | 242 --------------------------------------
 test/test_matmul.py       |  50 ++++----
 4 files changed, 62 insertions(+), 299 deletions(-)
 delete mode 100644 loopy/codegen/dispatch.py

diff --git a/MEMO b/MEMO
index b16d1c3ac..56bc50c2a 100644
--- a/MEMO
+++ b/MEMO
@@ -6,10 +6,7 @@ For writeup:
 TODO: Reimplement forced lengths
 TODO: Try, fix reg. prefetch (DG example) / CSEs
   ILP and reg. prefetch interact!
-TODO: Functions
-TODO: ILP arrays
 FIXME: support non-reductive dimensions (what did I mean here?)
-FIXME: write names should be assigned during scheduling
 FIXME: screwy lower bounds in ILP
 FIXME: Leading syncthreads elimination
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 2b6767db9..788625e0e 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -80,7 +80,7 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames):
     the other inames as well.)
     """
 
-    tag_key_use_count = {}
+    tag_key_uses = {}
 
     from loopy.kernel import HardwareParallelTag
 
@@ -88,11 +88,11 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames):
         tag = kernel.iname_to_tag.get(iname)
 
         if isinstance(tag, HardwareParallelTag):
-            tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1
+            tag_key_uses.setdefault(tag.key, []).append(iname)
 
     multi_use_keys = set(
-            key for key, count in tag_key_use_count.iteritems()
-            if count > 1)
+            key for key, user_inames in tag_key_uses.iteritems()
+            if len(user_inames) > 1)
 
     multi_use_inames = set()
     for iname in cond_inames:
@@ -150,8 +150,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     # {{{ pass 3: greedily group schedule items that share admissible inames
 
-    def build_insn_group(sched_indices_and_cond_inames, codegen_state,
-            min_iname_count=1):
+    def build_insn_group(sched_indices_and_cond_inames, codegen_state, done_group_lengths=set()):
         # min_iname_count serves to prevent infinite recursion by imposing a
         # bigger and bigger minimum size on the group of shared inames found.
 
@@ -167,41 +166,46 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
         current_iname_set = cond_inames
 
-        idx = 1
-        while (len(current_iname_set) >= min_iname_count
-                and idx < len(sched_indices_and_cond_inames)):
-            other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx]
-            new_iname_set = current_iname_set & other_cond_inames
+        found_hoists = []
 
-            if len(new_iname_set) >= min_iname_count:
-                idx += 1
-                current_iname_set = new_iname_set
-            else:
-                break
+        candidate_group_length = 1
+        while candidate_group_length <= len(sched_indices_and_cond_inames):
+            if candidate_group_length in done_group_lengths:
+                candidate_group_length += 1
+                continue
 
-        # }}}
-
-        if len(current_iname_set) >= min_iname_count:
-            # Success: found a big enough group of inames for a conditional.
-            # See if there are bounds checks available for that set.
+            other_sched_index, other_cond_inames = sched_indices_and_cond_inames[candidate_group_length-1]
+            current_iname_set = current_iname_set & other_cond_inames
 
-            # {{{ see which inames were actually used in group
+            # {{{ see which inames are actually used in group
 
             # And only generate conditionals for those.
             from loopy.schedule import find_used_inames_within
             used_inames = set()
-            for subsched_index, _ in sched_indices_and_cond_inames[0:idx]:
+            for subsched_index, _ in sched_indices_and_cond_inames[0:candidate_group_length]:
                 used_inames |= find_used_inames_within(kernel, subsched_index)
 
             # }}}
 
             from loopy.codegen.bounds import generate_bounds_checks
+            only_unshared_inames = remove_inames_for_shared_hw_axes(kernel,
+                    current_iname_set & used_inames)
+
             bounds_checks = generate_bounds_checks(kernel.domain,
                     remove_inames_for_shared_hw_axes(kernel,
-                        current_iname_set & used_inames),
+                        only_unshared_inames),
                     codegen_state.implemented_domain)
-        else:
-            bounds_checks = []
+
+            if bounds_checks or candidate_group_length == 1:
+                # length-1 must always be an option to reach the recursion base case below
+                found_hoists.append((candidate_group_length, bounds_checks))
+
+            candidate_group_length += 1
+
+        # }}}
+
+        # pick largest such group
+        group_length, bounds_checks = max(found_hoists)
 
         if bounds_checks:
             check_set = isl.BasicSet.universe(kernel.space)
@@ -212,13 +216,15 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         else:
             new_codegen_state = codegen_state
 
-        if idx == 1:
+        if group_length == 1:
             # group only contains starting schedule item
             result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)]
         else:
             # recurse with a bigger minimum iname count
-            result = build_insn_group(sched_indices_and_cond_inames[0:idx],
-                    new_codegen_state, len(current_iname_set)+1)
+            result = build_insn_group(
+                    sched_indices_and_cond_inames[0:group_length],
+                    new_codegen_state,
+                    done_group_lengths=done_group_lengths | set([group_length]))
 
         if bounds_checks:
             from loopy.codegen import wrap_in_if
@@ -228,7 +234,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                     gen_code_block(result))]
 
         return result + build_insn_group(
-                sched_indices_and_cond_inames[idx:], codegen_state)
+                sched_indices_and_cond_inames[group_length:], codegen_state)
 
     # }}}
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
deleted file mode 100644
index d5ee62839..000000000
--- a/loopy/codegen/dispatch.py
+++ /dev/null
@@ -1,242 +0,0 @@
-"""Loop nest build top-level dispatch."""
-from __future__ import division
-
-from loopy.codegen import CodeGenerationState, gen_code_block
-import islpy as isl
-
-
-
-
-def get_admissible_conditional_inames_for(kernel, sched_index):
-    """This function disallows conditionals on local-idx tagged
-    inames if there is a barrier nested somewhere within.
-    """
-
-    from loopy.kernel import LocalIndexTag, HardwareParallelTag
-
-    from loopy.schedule import find_active_inames_at, has_barrier_within
-    result = find_active_inames_at(kernel, sched_index)
-
-    has_barrier = has_barrier_within(kernel, sched_index)
-
-    for iname, tag in kernel.iname_to_tag.iteritems():
-        if isinstance(tag, HardwareParallelTag):
-            if not has_barrier or not isinstance(tag, LocalIndexTag):
-                result.add(iname)
-
-    return result
-
-
-
-
-def generate_code_for_sched_index(kernel, sched_index, codegen_state):
-    from loopy.schedule import (EnterLoop, RunInstruction, Barrier)
-
-    sched_item = kernel.schedule[sched_index]
-
-    if isinstance(sched_item, EnterLoop):
-        tag = kernel.iname_to_tag.get(sched_item.iname)
-
-        from loopy.codegen.loop import (
-                generate_unroll_loop,
-                generate_sequential_loop_dim_code)
-
-        from loopy.kernel import UnrollTag, SequentialTag
-        if isinstance(tag, UnrollTag):
-            func = generate_unroll_loop
-        elif tag is None or isinstance(tag, SequentialTag):
-            func = generate_sequential_loop_dim_code
-        else:
-            raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'"
-                    % (sched_item.iname, tag))
-
-        return func(kernel, sched_index, codegen_state)
-
-    elif isinstance(sched_item, Barrier):
-        from loopy.codegen import GeneratedInstruction
-        from cgen import Statement as S
-        return GeneratedInstruction(
-                ast=S("barrier(CLK_LOCAL_MEM_FENCE)"),
-                implemented_domain=None)
-
-    elif isinstance(sched_item, RunInstruction):
-        insn = kernel.id_to_insn[sched_item.insn_id]
-
-        from loopy.codegen.instruction import generate_instruction_code
-        return generate_instruction_code(kernel, insn, codegen_state)
-
-    else:
-        raise RuntimeError("unexpected schedule item type: %s"
-                % type(sched_item))
-
-
-
-
-def remove_inames_for_shared_hw_axes(kernel, cond_inames):
-    """
-    See if cond_inames contains references to two (or more) inames that
-    boil down to the same tag. If so, exclude them. (We shouldn't be writing
-    conditionals for such inames because we would be implicitly restricting
-    the other inames as well.)
-    """
-
-    tag_key_use_count = {}
-
-    from loopy.kernel import HardwareParallelTag
-
-    for iname in cond_inames:
-        tag = kernel.iname_to_tag.get(iname)
-
-        if isinstance(tag, HardwareParallelTag):
-            tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1
-
-    multi_use_keys = set(
-            key for key, count in tag_key_use_count.iteritems()
-            if count > 1)
-
-    multi_use_inames = set()
-    for iname in cond_inames:
-        tag = kernel.iname_to_tag.get(iname)
-        if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
-            multi_use_inames.add(iname)
-
-    return cond_inames - multi_use_inames
-
-
-
-
-def build_loop_nest(kernel, sched_index, codegen_state):
-    # Most of the complexity of this function goes towards finding groups of
-    # instructions that can be nested inside a shared conditional.
-
-    assert isinstance(codegen_state, CodeGenerationState)
-
-    from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
-            gather_schedule_subloop)
-
-    # {{{ pass 1: pre-scan schedule for my schedule items' indices
-
-    my_sched_indices = []
-
-    while sched_index < len(kernel.schedule):
-        sched_item = kernel.schedule[sched_index]
-
-        if isinstance(sched_item, LeaveLoop):
-            break
-
-        my_sched_indices.append(sched_index)
-
-        if isinstance(sched_item, EnterLoop):
-            _, sched_index = gather_schedule_subloop(
-                    kernel.schedule, sched_index)
-        elif isinstance(sched_item, Barrier):
-            sched_index += 1
-
-        elif isinstance(sched_item, RunInstruction):
-            sched_index += 1
-        else:
-            raise RuntimeError("unexpected schedule item type: %s"
-                    % type(sched_item))
-
-    # }}}
-
-    # {{{ pass 2: find admissible conditional inames for each schedule item
-
-    admissible_cond_inames = [
-            get_admissible_conditional_inames_for(kernel, sched_index)
-            for sched_index in my_sched_indices]
-
-    # }}}
-
-    # {{{ pass 3: greedily group schedule items that share admissible inames
-
-    def build_insn_group(sched_indices_and_cond_inames, codegen_state,
-            min_iname_count=1):
-        # min_iname_count serves to prevent infinite recursion by imposing a
-        # bigger and bigger minimum size on the group of shared inames found.
-
-        if not sched_indices_and_cond_inames:
-            return []
-
-        sched_index, cond_inames = sched_indices_and_cond_inames[0]
-
-        # {{{ grow schedule item group
-
-        # Keep growing schedule item group as long as group fulfills minimum
-        # size requirement.
-
-        current_iname_set = cond_inames
-
-        idx = 1
-        while (len(current_iname_set) >= min_iname_count
-                and idx < len(sched_indices_and_cond_inames)):
-            other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx]
-            new_iname_set = current_iname_set & other_cond_inames
-
-            if len(new_iname_set) >= min_iname_count:
-                idx += 1
-                current_iname_set = new_iname_set
-            else:
-                break
-
-        # }}}
-
-        if len(current_iname_set) >= min_iname_count:
-            # Success: found a big enough group of inames for a conditional.
-            # See if there are bounds checks available for that set.
-
-            # {{{ see which inames were actually used in group
-
-            # And only generate conditionals for those.
-            from loopy.schedule import find_used_inames_within
-            used_inames = set()
-            for subsched_index, _ in sched_indices_and_cond_inames[0:idx]:
-                used_inames |= find_used_inames_within(kernel, subsched_index)
-
-            # }}}
-
-            from loopy.codegen.bounds import generate_bounds_checks
-            bounds_checks = generate_bounds_checks(kernel.domain,
-                    remove_inames_for_shared_hw_axes(kernel,
-                        current_iname_set & used_inames),
-                    codegen_state.implemented_domain)
-        else:
-            bounds_checks = []
-
-        if bounds_checks:
-            check_set = isl.BasicSet.universe(kernel.space)
-            for cns in bounds_checks:
-                check_set = check_set.add_constraint(cns)
-
-            new_codegen_state = codegen_state.intersect(check_set)
-        else:
-            new_codegen_state = codegen_state
-
-        if idx == 1:
-            # group only contains starting schedule item
-            result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)]
-        else:
-            # recurse with a bigger minimum iname count
-            result = build_insn_group(sched_indices_and_cond_inames[0:idx],
-                    new_codegen_state, len(current_iname_set)+1)
-
-        if bounds_checks:
-            from loopy.codegen import wrap_in_if
-            from loopy.codegen.bounds import constraint_to_code
-            result = [wrap_in_if(
-                    [constraint_to_code(codegen_state.c_code_mapper, cns) for cns in bounds_checks],
-                    gen_code_block(result))]
-
-        return result + build_insn_group(
-                sched_indices_and_cond_inames[idx:], codegen_state)
-
-    # }}}
-
-    return gen_code_block(
-            build_insn_group(zip(
-                my_sched_indices, admissible_cond_inames), codegen_state))
-
-
-
-
-# vim: foldmethod=marker
diff --git a/test/test_matmul.py b/test/test_matmul.py
index e54893cb6..bc2b0c65b 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -361,7 +361,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     knl = lp.LoopKernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = a[i, k]*b[k, j]"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
                 ],
             [
                 lp.ArrayArg("a", dtype, shape=(n, n), order=order),
@@ -374,16 +374,15 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     j_reg = 2
     i_chunks = 16
     j_chunks = 16
-    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True)
-    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True)
-    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "k", 16, no_slabs=True)
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner"])
-    assert knl.get_problems({})[0] <= 2
+    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0")
+    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1")
+    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "k", 16)
+    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])
 
-    kernel_gen = (lp.insert_register_prefetches(knl)
-            for knl in lp.generate_loop_schedules(knl))
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -411,12 +410,12 @@ def test_intel_matrix_mul(ctx_factory):
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = 6*16*16
+    n = 6*16
 
     knl = lp.LoopKernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = a[i, k]*b[k, j]"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
                 ],
             [
                 lp.ArrayArg("a", dtype, shape=(n, n), order=order),
@@ -429,20 +428,23 @@ def test_intel_matrix_mul(ctx_factory):
     j_reg = 4
     i_chunks = 16
     j_chunks = 16
-    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True)
-    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True)
-    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "k", 16, no_slabs=True)
+    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0")
+    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1")
+    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "k", 16)
     #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr")
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")])
-    knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
-    assert knl.get_problems({})[0] <= 2
 
-    kernel_gen = (lp.insert_register_prefetches(knl)
-            for knl in lp.generate_loop_schedules(knl,
-                hints=["k_outer", "k_inner_outer", "k_inner_inner"]
-                ))
+    knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"])
+    knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"])
+
+    # FIXME: Grouped prefetch
+    #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")])
+    #knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
+
+    kernel_gen = lp.generate_loop_schedules(knl)
+    #hints=["k_outer", "k_inner_outer", "k_inner_inner"]
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-- 
GitLab