From 6f140ea6d0567b601918662ef744dbd683fd1672 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 17 Oct 2011 01:27:12 -0400
Subject: [PATCH] Iname dependency cleanups.

- Flag idempotent instructions.

- Exploit idempotent insns in scheduling, allowing them to be executed
  inside "too many" loops.

- Be more exact in what inames to duplicate in CSE pre-computes.
---
 MEMO                      | 21 ++++++++-------
 loopy/__init__.py         | 24 +++++++++++++----
 loopy/codegen/dispatch.py |  2 +-
 loopy/kernel.py           | 12 ++++++---
 loopy/schedule.py         | 40 ++++++++++++++++++++++++-----
 test/test_matmul.py       | 54 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 126 insertions(+), 27 deletions(-)

diff --git a/MEMO b/MEMO
index a5cbf93f0..facaab5a8 100644
--- a/MEMO
+++ b/MEMO
@@ -56,6 +56,8 @@ Things to consider
 - Parallel dimension splitting/merging via tags
   -> unnecessary?
 
+- All user-supplied instructions are assumed to be idempotent.
+
 TODO
 ^^^^
 
@@ -78,23 +80,20 @@ TODO
 
 - Slab decomposition for ILP
 
-- Some things involving CSEs might be impossible to schedule
-  a[i,j] = cse(b[i]) * cse(c[j])
-
-- Flag, exploit idempotence
-
-- How should we implement the dim shuffling for odd-size prefetches?
-
 - Better for loop bound generation
   -> Try a triangular loop
 
-- AUTO_PICK or AUTO_FIT
-
-- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT
-
 Dealt with
 ^^^^^^^^^^
 
+- Flag, exploit idempotence
+
+- Some things involving CSEs might be impossible to schedule
+  a[i,j] = cse(b[i]) * cse(c[j])
+
+- Be smarter about automatic local axis choice
+  -> What if we run out of axes?
+
 - Implement condition hoisting
   (needed, e.g., by slab decomposition)
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index d3adafe88..784bc99cf 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -245,7 +245,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
             if iname in duplicate_inames:
                 tag = dup_iname_to_tag[iname]
             else:
-                tag = kernel.iname_to_tag[iname]
+                tag = kernel.iname_to_tag.get(iname)
 
             if isinstance(tag, LocalIndexTagBase):
                 kind = "l"
@@ -273,9 +273,22 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                         "that the CSE does not depend on "
                         "does not make sense")
 
-            force_dependency = True
-            if kind == "l" and target_var_is_local:
-                force_dependency = False
+            # Which iname dependencies are carried over from CSE host
+            # to the CSE compute instruction?
+
+            if not target_var_is_local:
+                # If we're writing to a private variable, then each
+                # hardware-parallel iname must execute its own copy of
+                # the CSE compute instruction. After all, each work item
+                # has its own set of private variables.
+
+                force_dependency = kind in "gl"
+            else:
+                # If we're writing to a local variable, then all other local
+                # dimensions see our updates, and thus they do *not* need to
+                # execute their own copy of this instruction.
+
+                force_dependency = kind == "g"
 
             if force_dependency:
                 forced_iname_deps.append(iname)
@@ -308,7 +321,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 id=kernel.make_unique_instruction_id(based_on=cse_tag),
                 assignee=assignee,
                 expression=new_inner_expr,
-                forced_iname_deps=forced_iname_deps)
+                forced_iname_deps=forced_iname_deps,
+                idempotent=True)
 
         cse_result_insns.append(new_insn)
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
index b79ce784c..ae6144cee 100644
--- a/loopy/codegen/dispatch.py
+++ b/loopy/codegen/dispatch.py
@@ -35,7 +35,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
     sched_item = kernel.schedule[sched_index]
 
     if isinstance(sched_item, EnterLoop):
-        tag = kernel.iname_to_tag[sched_item.iname]
+        tag = kernel.iname_to_tag.get(sched_item.iname)
 
         from loopy.codegen.loop import (
                 generate_unroll_loop,
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 699b159ff..0d9ea7cf0 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -220,16 +220,19 @@ class Instruction(Record):
     :ivar insn_deps: a list of ids of :class:`Instruction` instances that
         *must* be executed before this one. Note that loop scheduling augments this
         by adding dependencies on any writes to temporaries read by this instruction.
+    :ivar idempotent: Whether the instruction may be executed repeatedly (while obeying
+        dependencies) without changing the meaning of the program.
     """
     def __init__(self,
-            id, assignee, expression,
+            id, assignee, expression, idempotent,
             forced_iname_deps=[], insn_deps=[]):
 
+        assert isinstance(idempotent, bool)
+
         Record.__init__(self,
                 id=id, assignee=assignee, expression=expression,
                 forced_iname_deps=forced_iname_deps,
-                insn_deps=insn_deps,
-                )
+                insn_deps=insn_deps, idempotent=idempotent)
 
     @memoize_method
     def all_inames(self):
@@ -447,7 +450,8 @@ class LoopKernel(Record):
                     id=self.make_unique_instruction_id(insns, based_on=label),
                     insn_deps=insn_deps,
                     forced_iname_deps=forced_iname_deps,
-                    assignee=lhs, expression=rhs)
+                    assignee=lhs, expression=rhs,
+                    idempotent=True)
 
         if isinstance(domain, str):
             ctx = isl.Context()
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 7afd6bb87..3d7a329ce 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -58,7 +58,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
                 forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
-                expression=expr.operation.neutral_element)
+                expression=expr.operation.neutral_element,
+                idempotent=True)
 
         new_insns.append(init_insn)
 
@@ -68,7 +69,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
                 insn_deps=[init_insn.id],
-                forced_iname_deps=list(insn.all_inames()))
+                forced_iname_deps=list(insn.all_inames()),
+                idempotent=False)
 
         new_insns.append(reduction_insn)
 
@@ -208,9 +210,11 @@ def check_for_unused_hw_axes(kernel):
                 raise RuntimeError("auto local tag encountered")
 
         if group_axes != group_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw group axes")
+            raise RuntimeError("instruction '%s' does not use all hw group axes"
+                    % insn.id)
         if local_axes != local_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw local axes")
+            raise RuntimeError("instruction '%s' does not use all hw local axes"
+                    % insn.id)
 
 
 
@@ -608,8 +612,25 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
 
     for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
-        if (active_inames - parallel_inames 
-                == insn.all_inames() - parallel_inames
+
+        if insn.idempotent:
+            # If insn is idempotent, it may be placed inside a more deeply
+            # nested loop without harm.
+
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    <=
+                    active_inames - parallel_inames)
+        else:
+            # If insn is not idempotent, we must insist that it is placed inside
+            # the exactly correct set of loops.
+
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    ==
+                    active_inames - parallel_inames)
+
+        if (iname_deps_satisfied
                 and set(insn.insn_deps) <= scheduled_insn_ids):
             scheduled_insn_ids.add(insn.id)
             schedule = schedule + [RunInstruction(insn_id=insn.id)]
@@ -812,12 +833,19 @@ def generate_loop_schedules(kernel):
     check_for_double_use_of_hw_axes(kernel)
     check_for_unused_hw_axes(kernel)
 
+    schedule_count = 0
+
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
         assert not owed_barriers
 
         yield kernel.copy(schedule=gen_sched)
 
+        schedule_count += 1
+
+    if not schedule_count:
+        raise RuntimeError("no valid schedules found")
+
 # }}}
 
 
diff --git a/test/test_matmul.py b/test/test_matmul.py
index f861153c8..c6797c6bc 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -244,6 +244,60 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
 
 
 
+def test_rank_one(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    n = int(get_suitable_size(ctx)**(3./2))
+    print n
+
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[n] -> {[i,j]: 0<=i,j<n}",
+            [
+                "label: c[i, j] = cse(a[i], a)*cse(b[j], b)"
+                ],
+            [
+                lp.ArrayArg("a", dtype, shape=(n,), order=order),
+                lp.ArrayArg("b", dtype, shape=(n,), order=order),
+                lp.ArrayArg("c", dtype, shape=(n, n), order=order),
+                lp.ScalarArg("n", np.int32, approximately=n),
+                ],
+            name="rank_one", assumptions="n >= 16")
+
+    #knl = lp.split_dimension(knl, "i", 16,
+            #outer_tag="g.0", inner_tag="l.1", no_slabs=True)
+    #knl = lp.split_dimension(knl, "j", 8,
+            #outer_tag="g.1", inner_tag="l.0", no_slabs=True)
+    #knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
+
+    knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"])
+    knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"])
+
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
+
+    a = cl_random.rand(queue, n, dtype=dtype)
+    b = cl_random.rand(queue, n, dtype=dtype)
+    refsol = a.get()[:, np.newaxis] * b.get()
+    c = cl_array.empty(queue, refsol.shape, refsol.dtype)
+
+    def launcher(kernel, gsize, lsize, check):
+        evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
+                g_times_l=True)
+
+        if check:
+            check_error(refsol, c.get())
+
+        return evt
+
+    lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
+
+
+
+
 def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
-- 
GitLab