Commit 6f140ea6 authored by Andreas Klöckner

Iname dependency cleanups.

- Flag idempotent instructions.

- Exploit idempotent insns in scheduling, allowing them to be executed
  inside "too many" loops.

- Be more exact in what inames to duplicate in CSE pre-computes.
parent 1b8354b2
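
A side note on the idempotence flag introduced by this commit: an idempotent instruction computes the same values no matter how often it is executed, so the scheduler may nest it inside loops it does not actually depend on. A minimal standalone sketch (plain NumPy; names are illustrative, not part of loopy):

    import numpy as np

    # An idempotent instruction (a pure fetch/assignment) may be re-executed
    # inside an extra loop without changing the result.
    n = 4
    a = np.arange(n, dtype=np.float32)

    # Scheduled in exactly the right loop: executed once per i.
    fetch_once = np.empty(n, dtype=np.float32)
    for i in range(n):
        fetch_once[i] = a[i]

    # Scheduled inside one loop "too many": executed once per (j, i).
    fetch_redundant = np.empty(n, dtype=np.float32)
    for j in range(n):
        for i in range(n):
            fetch_redundant[i] = a[i]  # repeated writes of the same value are harmless

    assert (fetch_once == fetch_redundant).all()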
@@ -56,6 +56,8 @@ Things to consider
- Parallel dimension splitting/merging via tags
-> unnecessary?
- All user-supplied commands are assumed to be idempotent.
TODO
^^^^
@@ -78,23 +80,20 @@ TODO
- Slab decomposition for ILP
- Some things involving CSEs might be impossible to schedule
a[i,j] = cse(b[i]) * cse(c[j])
- Flag, exploit idempotence
- How should we implement the dim shuffling for odd-size prefetches?
- Better for loop bound generation
-> Try a triangular loop
- AUTO_PICK or AUTO_FIT
- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT
Dealt with
^^^^^^^^^^
- Flag, exploit idempotence
- Some things involving CSEs might be impossible to schedule
a[i,j] = cse(b[i]) * cse(c[j])
- Be smarter about automatic local axis choice
-> What if we run out of axes?
- Implement condition hoisting
(needed, e.g., by slab decomposition)
......
@@ -245,7 +245,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         if iname in duplicate_inames:
             tag = dup_iname_to_tag[iname]
         else:
-            tag = kernel.iname_to_tag[iname]
+            tag = kernel.iname_to_tag.get(iname)
 
         if isinstance(tag, LocalIndexTagBase):
             kind = "l"
@@ -273,9 +273,22 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                         "that the CSE does not depend on "
                         "does not make sense")
 
-            force_dependency = True
-            if kind == "l" and target_var_is_local:
-                force_dependency = False
+            # Which iname dependencies are carried over from CSE host
+            # to the CSE compute instruction?
+            if not target_var_is_local:
+                # If we're writing to a private variable, then each
+                # hardware-parallel iname must execute its own copy of
+                # the CSE compute instruction. After all, each work item
+                # has its own set of private variables.
+                force_dependency = kind in "gl"
+            else:
+                # If we're writing to a local variable, then all other local
+                # dimensions see our updates, and thus they do *not* need to
+                # execute their own copy of this instruction.
+                force_dependency = kind == "g"
 
             if force_dependency:
                 forced_iname_deps.append(iname)
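
The carry-over rule implemented above boils down to a small truth table over the iname kind and the target variable's storage. A standalone sketch (the function name is made up, not the loopy API):

    def carries_dependency(kind, target_var_is_local):
        # kind: "g" for a group (hardware) iname, "l" for a local one.
        if not target_var_is_local:
            # Private target: every hardware-parallel work item needs its
            # own copy of the compute instruction.
            return kind in "gl"
        else:
            # Local target: other local work items already see the update.
            return kind == "g"

    # Group inames always force a dependency; local inames only do so when
    # the CSE result lives in a private (per-work-item) variable.
    assert carries_dependency("g", target_var_is_local=False)
    assert carries_dependency("l", target_var_is_local=False)
    assert carries_dependency("g", target_var_is_local=True)
    assert not carries_dependency("l", target_var_is_local=True)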
@@ -308,7 +321,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 id=kernel.make_unique_instruction_id(based_on=cse_tag),
                 assignee=assignee,
                 expression=new_inner_expr,
-                forced_iname_deps=forced_iname_deps)
+                forced_iname_deps=forced_iname_deps,
+                idempotent=True)
 
         cse_result_insns.append(new_insn)
......
@@ -35,7 +35,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
     sched_item = kernel.schedule[sched_index]
 
     if isinstance(sched_item, EnterLoop):
-        tag = kernel.iname_to_tag[sched_item.iname]
+        tag = kernel.iname_to_tag.get(sched_item.iname)
 
         from loopy.codegen.loop import (
                 generate_unroll_loop,
......
@@ -220,16 +220,19 @@ class Instruction(Record):
     :ivar insn_deps: a list of ids of :class:`Instruction` instances that
         *must* be executed before this one. Note that loop scheduling augments this
         by adding dependencies on any writes to temporaries read by this instruction.
+    :ivar idempotent: Whether the instruction may be executed repeatedly (while obeying
+        dependencies) without changing the meaning of the program.
     """
 
     def __init__(self,
-            id, assignee, expression,
+            id, assignee, expression, idempotent,
             forced_iname_deps=[], insn_deps=[]):
 
+        assert isinstance(idempotent, bool)
+
         Record.__init__(self,
                 id=id, assignee=assignee, expression=expression,
                 forced_iname_deps=forced_iname_deps,
-                insn_deps=insn_deps,
-                )
+                insn_deps=insn_deps, idempotent=idempotent)
 
     @memoize_method
     def all_inames(self):
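
For context, the constructor now demands the flag as an explicit boolean rather than defaulting it. A simplified stand-in (not the real Record-based class) showing the intent:

    from dataclasses import dataclass, field

    @dataclass
    class MiniInstruction:
        # Simplified, illustrative stand-in for loopy's Instruction record.
        id: str
        assignee: str
        expression: str
        idempotent: bool                 # must be stated explicitly at every call site
        insn_deps: list = field(default_factory=list)

    insn = MiniInstruction(id="fetch_a", assignee="a_fetch", expression="a[i]",
            idempotent=True)
    assert isinstance(insn.idempotent, bool)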
@@ -447,7 +450,8 @@ class LoopKernel(Record):
                         id=self.make_unique_instruction_id(insns, based_on=label),
                         insn_deps=insn_deps,
                         forced_iname_deps=forced_iname_deps,
-                        assignee=lhs, expression=rhs)
+                        assignee=lhs, expression=rhs,
+                        idempotent=True)
 
         if isinstance(domain, str):
             ctx = isl.Context()
......
@@ -58,7 +58,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
                 forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
-                expression=expr.operation.neutral_element)
+                expression=expr.operation.neutral_element,
+                idempotent=True)
 
         new_insns.append(init_insn)
@@ -68,7 +69,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
                 insn_deps=[init_insn.id],
-                forced_iname_deps=list(insn.all_inames()))
+                forced_iname_deps=list(insn.all_inames()),
+                idempotent=False)
 
         new_insns.append(reduction_insn)
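
The True/False split in the two reduction instructions above: re-running the neutral-element initializer is harmless, re-running the accumulation update is not. A small standalone illustration:

    x = [1.0, 2.0, 3.0]

    acc = 0.0             # init_insn: assigning the neutral element ...
    acc = 0.0             # ... twice changes nothing (idempotent=True)

    for xi in x:
        acc = acc + xi    # reduction_insn: executing the update once ...
    assert acc == 6.0

    for xi in x:
        acc = acc + xi    # ... and then again doubles the contribution (idempotent=False)
    assert acc == 12.0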
@@ -208,9 +210,11 @@ def check_for_unused_hw_axes(kernel):
             raise RuntimeError("auto local tag encountered")
 
         if group_axes != group_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw group axes")
+            raise RuntimeError("instruction '%s' does not use all hw group axes"
+                    % insn.id)
 
         if local_axes != local_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw local axes")
+            raise RuntimeError("instruction '%s' does not use all hw local axes"
+                    % insn.id)
@@ -608,8 +612,25 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
     for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
-        if (active_inames - parallel_inames
-                == insn.all_inames() - parallel_inames
+        if insn.idempotent:
+            # If insn is idempotent, it may be placed inside a more deeply
+            # nested loop without harm.
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    <=
+                    active_inames - parallel_inames)
+        else:
+            # If insn is not idempotent, we must insist that it is placed inside
+            # the exactly correct set of loops.
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    ==
+                    active_inames - parallel_inames)
+
+        if (iname_deps_satisfied
                 and set(insn.insn_deps) <= scheduled_insn_ids):
             scheduled_insn_ids.add(insn.id)
             schedule = schedule + [RunInstruction(insn_id=insn.id)]
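
The scheduling condition above relaxes equality to a subset test for idempotent instructions. A standalone sketch of just that test (names are illustrative):

    def iname_deps_satisfied(insn_inames, active_inames, parallel_inames, idempotent):
        needed = insn_inames - parallel_inames
        active = active_inames - parallel_inames
        if idempotent:
            # Extra enclosing loops are acceptable.
            return needed <= active
        else:
            # The loop nest must match exactly.
            return needed == active

    # An instruction over {i}, considered while both the i and j loops are open:
    assert iname_deps_satisfied({"i"}, {"i", "j"}, set(), idempotent=True)
    assert not iname_deps_satisfied({"i"}, {"i", "j"}, set(), idempotent=False)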
@@ -812,12 +833,19 @@ def generate_loop_schedules(kernel):
     check_for_double_use_of_hw_axes(kernel)
     check_for_unused_hw_axes(kernel)
 
+    schedule_count = 0
+
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
         assert not owed_barriers
 
         yield kernel.copy(schedule=gen_sched)
 
+        schedule_count += 1
+
+    if not schedule_count:
+        raise RuntimeError("no valid schedules found")
+
     # }}}
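
The schedule_count guard turns a silently empty generator into a hard error. A sketch of the same pattern in isolation (toy generator, not the loopy one):

    def generate_items(candidates):
        count = 0
        for item in candidates:
            yield item
            count += 1
        if not count:
            raise RuntimeError("no valid schedules found")

    assert list(generate_items(["sched_a"])) == ["sched_a"]
    try:
        list(generate_items([]))
    except RuntimeError:
        pass  # expected: nothing was yielded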
......
@@ -244,6 +244,60 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
+def test_rank_one(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    n = int(get_suitable_size(ctx)**(3/2))
+    print n
+
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[n] -> {[i,j]: 0<=i,j<n}",
+            [
+                "label: c[i, j] = cse(a[i], a)*cse(b[j], b)"
+                ],
+            [
+                lp.ArrayArg("a", dtype, shape=(n,), order=order),
+                lp.ArrayArg("b", dtype, shape=(n,), order=order),
+                lp.ArrayArg("c", dtype, shape=(n, n), order=order),
+                lp.ScalarArg("n", np.int32, approximately=n),
+                ],
+            name="rank_one", assumptions="n >= 16")
+
+    #knl = lp.split_dimension(knl, "i", 16,
+            #outer_tag="g.0", inner_tag="l.1", no_slabs=True)
+    #knl = lp.split_dimension(knl, "j", 8,
+            #outer_tag="g.1", inner_tag="l.0", no_slabs=True)
+    #knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
+
+    knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"])
+    knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"])
+
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
+
+    a = cl_random.rand(queue, n, dtype=dtype)
+    b = cl_random.rand(queue, n, dtype=dtype)
+    refsol = a.get()[:, np.newaxis] * b.get()
+    c = cl_array.empty(queue, refsol.shape, refsol.dtype)
+
+    def launcher(kernel, gsize, lsize, check):
+        evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
+                g_times_l=True)
+
+        if check:
+            check_error(refsol, c.get())
+
+        return evt
+
+    lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
+
+
 def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
......
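
A side note on the reference solution in test_rank_one above: the broadcasted product a[:, np.newaxis] * b is exactly the rank-one (outer) product of the two vectors, e.g.:

    import numpy as np

    a = np.random.rand(4).astype(np.float32)
    b = np.random.rand(5).astype(np.float32)
    refsol = a[:, np.newaxis] * b
    assert np.allclose(refsol, np.outer(a, b))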