From ac7496f0f4ebde4bfbf358d6e4d5ac4a96458ca6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sun, 11 Feb 2018 19:46:24 -0600 Subject: [PATCH] Add vectorization example to FAQ for Timo --- doc/misc.rst | 35 +++++++++++++++++++++++++++++++++ examples/python/vector-types.cl | 26 ++++++++++++++++++++++++ examples/python/vector-types.py | 21 ++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 examples/python/vector-types.cl create mode 100644 examples/python/vector-types.py diff --git a/doc/misc.rst b/doc/misc.rst index cd6fe102c..441efd80f 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -342,6 +342,41 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.join_inames`. +In what sense does Loopy suport vectorization? +---------------------------------------------- + +There are really two ways in which the OpenCL/CUDA model of computation exposes +vectorization: + +* "SIMT": The user writes scalar program instances and either the compiler or + the hardware joins the individual program instances into vectors of a + hardware-given length for execution. + +* "Short vectors": This type of vectorization is based on vector types, + e.g. ``float4``, which support arithmetic with implicit vector semantics + as well as a number of 'intrinsic' functions. + +Loopy suports both. The first one, SIMT, is accessible by tagging inames with, +e.g., ``l.0```. Accessing the second one requires using both execution- and +data-reshaping capabilities in loopy. To start with, you need an array that +has an axis with the length of the desired vector. If that's not yet available, +you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need +an iname whose bounds match those of the desired vector length. Again, if you +don't already have one, :func:`loopy.split_iname` will easily produce one. +Lastly, both the array axis an the iname need the implementation tag ``"vec"``. +Here is an example of this machinery in action: + +.. literalinclude:: ../examples/python/vector-types.py + :language: python + +Note how the example slices off the last 'slab' of iterations to ensure that +the bulk of the iteration does not require conditionals which would prevent +successful vectorization. This generates the following code: + +.. literalinclude:: ../examples/python/vector-types.cl + :language: c + + Uh-oh. I got a scheduling error. Any hints? ------------------------------------------- diff --git a/examples/python/vector-types.cl b/examples/python/vector-types.cl new file mode 100644 index 000000000..9e05994db --- /dev/null +++ b/examples/python/vector-types.cl @@ -0,0 +1,26 @@ +#define lid(N) ((int) get_local_id(N)) +#define gid(N) ((int) get_group_id(N)) +#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) ) + +__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float4 const *__restrict__ a, int const n, __global float4 *__restrict__ out) +{ + /* bulk slab for 'i_outer' */ + for (int i_outer = 0; i_outer <= -2 + int_floor_div_pos_b(3 + n, 4); ++i_outer) + out[i_outer] = 2.0f * a[i_outer]; + /* final slab for 'i_outer' */ + { + int const i_outer = -1 + n + -1 * int_floor_div_pos_b(3 * n, 4); + + if (-1 + n >= 0) + { + if (-1 + -4 * i_outer + n >= 0) + out[i_outer].s0 = 2.0f * a[i_outer].s0; + if (-1 + -4 * i_outer + -1 + n >= 0) + out[i_outer].s1 = 2.0f * a[i_outer].s1; + if (-1 + -4 * i_outer + -1 * 2 + n >= 0) + out[i_outer].s2 = 2.0f * a[i_outer].s2; + if (-1 + -4 * i_outer + -1 * 3 + n >= 0) + out[i_outer].s3 = 2.0f * a[i_outer].s3; + } + } +} diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py new file mode 100644 index 000000000..328aea154 --- /dev/null +++ b/examples/python/vector-types.py @@ -0,0 +1,21 @@ +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.array + +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +n = 15 * 10**6 +a = cl.array.arange(queue, n, dtype=np.float32) + +knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]") + +knl = lp.set_options(knl, write_code=True) +knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec") +knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4) +knl = lp.tag_array_axes(knl, "a,out", "C,vec") + +knl(queue, a=a.reshape(-1, 4), n=n) -- GitLab