From db61640ad5694b9b9e5d59813090b9ebdb5c4c6a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 1 Jan 2014 09:53:21 +0100
Subject: [PATCH] Push tutorial forward

---
 doc/tutorial.rst         | 398 +++++++++++++++++++++++++++++++++++++--
 loopy/__init__.py        |   9 +
 loopy/codegen/loop.py    |   2 +-
 loopy/kernel/__init__.py |   2 +-
 loopy/kernel/creation.py |   4 +-
 loopy/schedule.py        |   4 +-
 6 files changed, 402 insertions(+), 17 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index f4cd4af24..b65bd9176 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -21,6 +21,7 @@ and initialize :mod:`pyopencl`
     >>> import pyopencl.array
     >>> import pyopencl.clrandom
     >>> import loopy as lp
+    >>> from pytools import StderrToStdout as IncludeWarningsInDoctest
 
     >>> ctx = cl.create_some_context(interactive=False)
     >>> queue = cl.CommandQueue(ctx)
@@ -30,7 +31,7 @@ examples:
 
 .. doctest::
 
-    >>> n = 1000
+    >>> n = 16*16
     >>> x_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
     >>> y_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
     >>> a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
@@ -99,7 +100,7 @@ always see loopy's view of a kernel by printing it.
     DOMAINS:
     [n] -> { [i] : i >= 0 and i <= -1 + n }
     ---------------------------------------------------------------------------
-    INAME-TO-TAG MAP:
+    INAME IMPLEMENTATION TAGS:
     i: None
     ---------------------------------------------------------------------------
     INSTRUCTIONS:
@@ -266,8 +267,8 @@ loopy's programming model is completely *unordered* by default. This means
 that:
 
 * There is no guarantee about the order in which the loop domain is
-  traversed. ``i==3`` could be reached before ``i==0`` but still after
-  ``i==0``. Your program is only correct if it produces a valid result
+  traversed. ``i==3`` could be reached before ``i==0`` but also before
+  ``i==17``. Your program is only correct if it produces a valid result
   irrespective of this ordering.
 
 * In addition, there is (by default) no ordering between instructions
@@ -335,6 +336,9 @@ dependencies are *in addition* to any manual dependencies added via
 dependencies.  In this case, ``{dep=*...}`` (i.e. a leading asterisk) to
 prevent the heuristic from adding dependencies for this instruction.
 
+Loops and dependencies
+~~~~~~~~~~~~~~~~~~~~~~
+
 Next, it is important to understand how loops and dependencies interact.
 Let us take a look at the generated code for the above kernel:
 
@@ -414,10 +418,294 @@ graph:
 
 .. image:: images/dep-graph-correct.svg
 
+Loop nesting
+~~~~~~~~~~~~
+
+One last aspect of ordering over which we have thus far not exerted any
+control is the nesting of loops. For example, should the *i* loop be nested
+around the *j* loop, or the other way around, in the following simple
+zero-fill kernel?
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...     "{ [i,j]: 0<=i,j<n }",
+    ...     """
+    ...     a[i,j] = 0
+    ...     """)
+
+
+    >>> with IncludeWarningsInDoctest():
+    ...     evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i = 0; i <= (-1 + n); ++i)
+        for (int j = 0; j <= (-1 + n); ++j)
+          a[n * i + j] = 0.0f;
+    ...
+
+Loopy has chosen a loop nesting for us, but it did not like doing so, as it
+also issued the following warning::
+
+    LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
+
+This is easily resolved:
+
+.. doctest::
+
+    >>> knl = lp.set_loop_priority(knl, "j,i")
+
+:func:`loopy.set_loop_priority` indicates the textual order in which loops
+should be entered in the kernel code.  Note that this priority has an
+advisory role only. If the kernel logically requires a different nesting,
+loop priority is ignored.  Priority is only considered if loop nesting is
+ambiguous.
+
+.. doctest::
+
+    >>> with IncludeWarningsInDoctest():
+    ...     evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int j = 0; j <= (-1 + n); ++j)
+        for (int i = 0; i <= (-1 + n); ++i)
+          a[n * i + j] = 0.0f;
+    ...
+
+No more warnings! Loop nesting is also reflected in the dependency graph:
+
+.. image:: images/dep-graph-nesting.svg
+
+.. _intro-transformations:
+
+Introduction to Kernel Transformations
+--------------------------------------
+
+What we have covered thus far puts you in a position to describe many kinds
+of computations to loopy--in the sense that loopy will generate code that
+carries out the correct operation. That's nice, but it's natural to also
+want control over *how* a program is executed. Loopy's way of capturing
+this information is by way of *transformations*. These have the following
+general shape::
+
+    new_kernel = lp.do_something(old_knl, arguments...)
+
+These transformations always return a *copy* of the old kernel with the
+requested change applied. Typically, the variable holding the old kernel
+is overwritten with the new kernel::
+
+    knl = lp.do_something(knl, arguments...)
+
+We've already seen an example of a transformation above:
+For instance, :func:`set_loop_priority` fit the pattern.
+
+:func:`loopy.split_iname` is another fundamental (and useful) transformation. It
+turns one existing iname (recall that this is loopy's word for a 'loop
+variable', roughly) into two new ones, an 'inner' and an 'outer' one,
+where the 'inner' loopy is of a fixed, specified length, and the 'outer'
+loop runs over these fixed-length 'chunks'. The three inames have the
+following relationship to one another::
+
+    OLD = INNER + GROUP_SIZE * OUTER
+
+Consider this example:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...     "{ [i]: 0<=i<n }",
+    ...     "a[i] = 0", assumptions="n>=0")
+    >>> knl = lp.split_iname(knl, "i", 16)
+    >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer)
+        for (int i_inner = 0; i_inner <= 15; ++i_inner)
+          if ((-1 + -1 * i_inner + -16 * i_outer + n) >= 0)
+            a[i_inner + i_outer * 16] = 0.0f;
+    ...
+
+By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
+where *OLD* is the name of the previous iname. Upon exit from
+:func:`loopy.split_iname`, *OLD* is removed from the kernel and replaced by
+*OLD_inner* and *OLD_outer*.
+
+Also take note of the *assumptions* argument. This makes it possible to
+communicate assumptions about loop domain parameters. (but *not* about
+data) In this case, assuming non-negativity helps loopy generate more
+efficient code for division in the loop bound for *i_outer*. See below
+on how to communicate divisibility assumptions.
+
+Note that the words 'inner' and 'outer' here have no implied meaning in
+relation to loop nesting. For example, it's perfectly possible to request
+*i_inner* to be nested outside *i_outer*:
+
+.. doctest::
+
+    >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer")
+    >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i_inner = 0; i_inner <= 15; ++i_inner)
+        if ((-1 + n) >= 0)
+          for (int i_outer = 0; i_outer <= (-1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16)); ++i_outer)
+            a[i_inner + i_outer * 16] = 0.0f;
+    ...
+
+Notice how loopy has automatically generated guard conditionals to make
+sure the bounds on the old iname are obeyed.
+
+The combination of :func:`loopy.split_iname` and
+:func:`loopy.set_loop_priority` is already good enough to implement what is
+commonly called 'loop tiling':
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...     "{ [i,j]: 0<=i,j<n }",
+    ...     "out[i,j] = a[j,i]",
+    ...     assumptions="n mod 16 = 0 and n >= 1")
+    >>> knl = lp.split_iname(knl, "i", 16)
+    >>> knl = lp.split_iname(knl, "j", 16)
+    >>> knl = lp.set_loop_priority(knl, "i_outer,j_outer,i_inner")
+    >>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer)
+        for (int j_outer = 0; j_outer <= (-1 + ((15 + n) / 16)); ++j_outer)
+          for (int i_inner = 0; i_inner <= 15; ++i_inner)
+            for (int j_inner = 0; j_inner <= 15; ++j_inner)
+              out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16];
+    ...
+
+
 .. _implementing-inames:
 
-Implementing Inames
--------------------
+Implementing Loop Axes ("Inames")
+---------------------------------
+
+So far, all the loops we have seen loopy implement were ``for`` loops. Each
+iname in loopy carries a so-called 'implementation tag'. Let's split the
+main loop of a vector fill kernel into chunks of 4 and unroll the
+fixed-length inner loop by setting the inner loop's tag to ``"unr"``:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...     "{ [i]: 0<=i<n }",
+    ...     "a[i] = 0", assumptions="n>=0 and n mod 4 = 0")
+    >>> orig_knl = knl
+    >>> knl = lp.split_iname(knl, "i", 4)
+    >>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
+    >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+      {
+        a[0 + i_outer * 4] = 0.0f;
+        a[1 + i_outer * 4] = 0.0f;
+        a[2 + i_outer * 4] = 0.0f;
+        a[3 + i_outer * 4] = 0.0f;
+      }
+    ...
+
+
+:func:`loopy.tag_inames` is a new transformation that assigns
+implementation tags to kernels.  ``"unr'`` is the first tag we've
+explicitly learned about. Technically, though, it is the second--``"for"``
+(or, equivalently, *None*), which is the default, instructs loopy to
+implement an iname using a for loop.
+
+Unrolling obviously only works for inames with a fixed maximum number of
+values, since only a finite amount of code can be generated. Unrolling the
+entire *i* loop in the kernel above would not work.
+
+Split-and-tag
+~~~~~~~~~~~~~
+
+Since split-and-tag is such a common combination, :func:`loopy.split_iname`
+provides a shortcut:
+
+.. doctest::
+
+    >>> knl = orig_knl
+    >>> knl = lp.split_iname(knl, "i", 4, inner_tag="unr")
+
+The *outer_tag* keyword argument exists, too, and works just like you would
+expect.
+
+Printing
+~~~~~~~~
+
+Iname implementation tags are also printed along with the entire kernel:
+
+.. doctest::
+
+    >>> print knl
+    ---------------------------------------------------------------------------
+    ...
+    INAME IMPLEMENTATION TAGS:
+    i_inner: unr
+    i_outer: None
+    ---------------------------------------------------------------------------
+    ...
+
+
+Avoiding Conditionals
+~~~~~~~~~~~~~~~~~~~~~
+
+You may have observed above that we have used a divisibility assumption on
+*n* in the kernels above. Without this assumption, loopy would generate
+conditional code to make sure no out-of-bounds loop instances are executed.
+This here is the original unrolling example without the divisibility
+assumption:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...     "{ [i]: 0<=i<n }",
+    ...     "a[i] = 0", assumptions="n>=0")
+    >>> orig_knl = knl
+    >>> knl = lp.split_iname(knl, "i", 4)
+    >>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
+    >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+    <BLANKLINE>
+    ...
+      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+      {
+        a[0 + i_outer * 4] = 0.0f;
+        if ((-2 + -4 * i_outer + n) >= 0)
+          a[1 + i_outer * 4] = 0.0f;
+        if ((-3 + -4 * i_outer + n) >= 0)
+          a[2 + i_outer * 4] = 0.0f;
+        if ((-4 + -4 * i_outer + n) >= 0)
+          a[3 + i_outer * 4] = 0.0f;
+      }
+    ...
+
+While these conditionals enable the generated code to deal with arbitrary
+*n*, they come at a performance cost. But there's still no reason to pay
+for them with *every* item processed. Loopy allows generating separate code
+for the last iteration of the loop, by using the *slabs* keyword argument
+to :func:`split_iname`:
+
+.. doctest::
+
+    >>> knl = orig_knl
+    >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr")
+    >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+    <BLANKLINE>
+      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+      {
+        a[0 + i_outer * 4] = 0.0f;
+        if ((-2 + -4 * i_outer + n) >= 0)
+          a[1 + i_outer * 4] = 0.0f;
+        if ((-3 + -4 * i_outer + n) >= 0)
+          a[2 + i_outer * 4] = 0.0f;
+        if ((-4 + -4 * i_outer + n) >= 0)
+          a[3 + i_outer * 4] = 0.0f;
+      }
+    ...
 
 .. _specifying-arguments:
 
@@ -429,6 +717,23 @@ Specifying arguments
 Implementing Array Axes
 -----------------------
 
+.. _more-complicated-programs:
+
+More complicated programs
+-------------------------
+
+SCOP
+
+Data-dependent control flow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Conditionals
+~~~~~~~~~~~~
+
+Snippets of C
+~~~~~~~~~~~~~
+
+
 Common Problems
 ---------------
 
@@ -440,12 +745,12 @@ Attempting to create this kernel results in an error:
 .. doctest::
     :options: +ELLIPSIS
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
-    ...      "{ [i]: 0<=i<n }",
-    ...      """
-    ...      out[i] = 5
-    ...      out[0] = 6
-    ...      """)
+    >>> lp.make_kernel(ctx.devices[0],
+    ...     "{ [i]: 0<=i<n }",
+    ...     """
+    ...     out[i] = 5
+    ...     out[0] = 6
+    ...     """)
     ... # Loopy prints the following before this exception:
     ... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
     Traceback (most recent call last):
@@ -453,7 +758,8 @@ Attempting to create this kernel results in an error:
     ValueError: a static maximum was not found for PwAff '[n] -> { [(1)] : n = 1; [(n)] : n >= 2; [(1)] : n <= 0 }'
 
 The problem is that loopy cannot find a simple, universally valid expression
-for the length of *out* in this case. The set notation at the end of the error
+for the length of *out* in this case. Notice how the kernel accesses both the
+*i*th and the first element of out.  The set notation at the end of the error
 message summarizes its best attempt:
 
 * If n=1, then out has size 1.
@@ -480,4 +786,70 @@ Other situations where this error message can occur include:
 * Finding sizes of argument arrays
 * Finding workgroup sizes
 
+Write races
+~~~~~~~~~~~
+
+This kernel performs a simple transposition of an input matrix:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(ctx.devices[0],
+    ...       "{ [i,j]: 0<=i,j<n }",
+    ...       """
+    ...       out[j,i] = a[i,j]
+    ...       """, assumptions="n>=1", name="transpose")
+
+To get it ready for execution on a GPU, we split the *i* and *j* loops into
+groups of 16.
+
+.. doctest::
+
+    >>> knl = lp.split_iname(knl,  "j", 16, inner_tag="l.1", outer_tag="g.0")
+    >>> knl = lp.split_iname(knl,  "i", 16, inner_tag="l.0", outer_tag="g.1")
+
+We'll also request a prefetch--but suppose we only do so across the
+*i_inner* iname:
+
+.. doctest::
+
+    >>> knl = lp.add_prefetch(knl, "a", "i_inner")
+
+When we try to run our code, we get the following warning from loopy as a first
+sign that something is amiss:
+
+.. doctest::
+
+    >>> with IncludeWarningsInDoctest():
+    ...     evt, (out,) = knl(queue, a=a_mat_dev)
+    /...: WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable)
+      warn(text, type)
+
+When we ask to see the code, the issue becomes apparent:
+
+.. doctest::
+    :options: +ELLIPSIS
+
+    >>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
+    #define gid(N) ((int) get_group_id(N))
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out)
+    {
+      float a_fetch_0[16];
+    <BLANKLINE>
+      ...
+          a_fetch_0[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)];
+      ...
+          out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch_0[lid(0)];
+      ...
+    }
+
+Loopy has a 2D workgroup to use for prefetching of a 1D array. When it
+considers making *a_fetch_0* ``local`` (in the OpenCL memory sense of the word)
+to make use of parallelism in prefetching, it discovers that a write race
+across the remaining axis of the workgroup would emerge.
+
+FIXME
+
 .. vim: tw=75:spell
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 78d630e79..34d417f90 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -792,6 +792,15 @@ def remove_unused_inames(knl, inames=None):
 # {{{ set loop priority
 
 def set_loop_priority(kernel, loop_priority):
+    """Indicates the textual order in which loops should be entered in the
+    kernel code. Note that this priority has an advisory role only. If the
+    kernel logically requires a different nesting, priority is ignored.
+    Priority is only considered if loop nesting is ambiguous.
+
+    :arg: an iterable of inames, or, for brevity, a comma-seaprated string of
+        inames
+    """
+
     if isinstance(loop_priority, str):
         loop_priority = [s.strip() for s in loop_priority.split(",")]
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index e7c30a864..03074ea73 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -29,7 +29,7 @@ from islpy import dim_type
 from loopy.codegen.control import build_loop_nest
 
 
-# {{{ conditional-minimizing slab decomposition
+# {{{ conditional-reducing slab decomposition
 
 def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
     iname_domain = kernel.get_inames_domain(iname)
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 67291e8f6..b4c773418 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -846,7 +846,7 @@ class LoopKernel(Record):
             lines.append(len(parents)*"  " + str(dom))
 
         lines.append(sep)
-        lines.append("INAME-TO-TAG MAP:")
+        lines.append("INAME IMPLEMENTATION TAGS:")
         for iname in sorted(self.all_inames()):
             line = "%s: %s" % (iname, self.iname_to_tag.get(iname))
             lines.append(line)
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 8da34b880..15d200032 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -991,7 +991,9 @@ def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs):
         a tuple (result_dtype, c_name), where c_name is the C-level symbol to
         be evaluated.
     :arg assumptions: the initial implemented_domain, captures assumptions
-        on the parameters. (an isl.Set)
+        on loop domain parameters. (an isl.Set or a string in
+        :ref:`isl-syntax`.  If given as a string, only the CONDITIONS part of
+        the set notation should be given.)
     :arg local_sizes: A dictionary from integers to integers, mapping
         workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
         length 16.
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 87f3ad5ca..70cbe41b8 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -915,8 +915,10 @@ def get_one_scheduled_kernel(kernel):
 
         if kernel_count == 2:
             from warnings import warn
+            from loopy.diagnostic import LoopyWarning
             warn("kernel scheduling was ambiguous--more than one "
-                    "schedule found, ignoring", stacklevel=2)
+                    "schedule found, ignoring", LoopyWarning,
+                    stacklevel=2)
             break
 
     return result
-- 
GitLab