From ac7496f0f4ebde4bfbf358d6e4d5ac4a96458ca6 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 11 Feb 2018 19:46:24 -0600
Subject: [PATCH] Add vectorization example to FAQ for Timo

---
 doc/misc.rst                    | 35 +++++++++++++++++++++++++++++++++
 examples/python/vector-types.cl | 26 ++++++++++++++++++++++++
 examples/python/vector-types.py | 21 ++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 examples/python/vector-types.cl
 create mode 100644 examples/python/vector-types.py

diff --git a/doc/misc.rst b/doc/misc.rst
index cd6fe102c..441efd80f 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -342,6 +342,41 @@ This list is always growing, but here are a few pointers:
 
   Use :func:`loopy.join_inames`.
 
+In what sense does Loopy suport vectorization?
+----------------------------------------------
+
+There are really two ways in which the OpenCL/CUDA model of computation exposes
+vectorization:
+
+* "SIMT": The user writes scalar program instances and either the compiler or
+  the hardware joins the individual program instances into vectors of a
+  hardware-given length for execution.
+
+* "Short vectors": This type of vectorization is based on vector types,
+  e.g. ``float4``, which support arithmetic with implicit vector semantics
+  as well as a number of 'intrinsic' functions.
+
+Loopy suports both. The first one, SIMT, is accessible by tagging inames with,
+e.g., ``l.0```. Accessing the second one requires using both execution- and
+data-reshaping capabilities in loopy. To start with, you need an array that
+has an axis with the length of the desired vector. If that's not yet available,
+you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
+an iname whose bounds match those of the desired vector length. Again, if you
+don't already have one, :func:`loopy.split_iname` will easily produce one.
+Lastly, both the array axis an the iname need the implementation tag ``"vec"``.
+Here is an example of this machinery in action:
+
+.. literalinclude:: ../examples/python/vector-types.py
+    :language: python
+
+Note how the example slices off the last 'slab' of iterations to ensure that
+the bulk of the iteration does not require conditionals which would prevent
+successful vectorization. This generates the following code:
+
+.. literalinclude:: ../examples/python/vector-types.cl
+    :language: c
+
+
 Uh-oh. I got a scheduling error. Any hints?
 -------------------------------------------
 
diff --git a/examples/python/vector-types.cl b/examples/python/vector-types.cl
new file mode 100644
index 000000000..9e05994db
--- /dev/null
+++ b/examples/python/vector-types.cl
@@ -0,0 +1,26 @@
+#define lid(N) ((int) get_local_id(N))
+#define gid(N) ((int) get_group_id(N))
+#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
+
+__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float4 const *__restrict__ a, int const n, __global float4 *__restrict__ out)
+{
+  /* bulk slab for 'i_outer' */
+  for (int i_outer = 0; i_outer <= -2 + int_floor_div_pos_b(3 + n, 4); ++i_outer)
+    out[i_outer] = 2.0f * a[i_outer];
+  /* final slab for 'i_outer' */
+  {
+    int const i_outer = -1 + n + -1 * int_floor_div_pos_b(3 * n, 4);
+
+    if (-1 + n >= 0)
+    {
+      if (-1 + -4 * i_outer + n >= 0)
+        out[i_outer].s0 = 2.0f * a[i_outer].s0;
+      if (-1 + -4 * i_outer + -1 + n >= 0)
+        out[i_outer].s1 = 2.0f * a[i_outer].s1;
+      if (-1 + -4 * i_outer + -1 * 2 + n >= 0)
+        out[i_outer].s2 = 2.0f * a[i_outer].s2;
+      if (-1 + -4 * i_outer + -1 * 3 + n >= 0)
+        out[i_outer].s3 = 2.0f * a[i_outer].s3;
+    }
+  }
+}
diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py
new file mode 100644
index 000000000..328aea154
--- /dev/null
+++ b/examples/python/vector-types.py
@@ -0,0 +1,21 @@
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.array
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+n = 15 * 10**6
+a = cl.array.arange(queue, n, dtype=np.float32)
+
+knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]")
+
+knl = lp.set_options(knl, write_code=True)
+knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
+knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
+knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+
+knl(queue, a=a.reshape(-1, 4), n=n)
-- 
GitLab