diff --git a/doc/conf.py b/doc/conf.py index 52691247f0bc4d64330aa54d526a78611475399e..3be8bf76101f2111e0b0996a050c467b11b11e8b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -25,7 +25,12 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.viewcode', + 'sphinx.ext.doctest', + ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/doc/images/dep-graph-correct.svg b/doc/images/dep-graph-correct.svg new file mode 100644 index 0000000000000000000000000000000000000000..397cb2d101792ac07b6e5ae8897d41cd0a457f54 --- /dev/null +++ b/doc/images/dep-graph-correct.svg @@ -0,0 +1,48 @@ + + + + + + +loopy_kernel + +cluster_i + +i + +cluster_j + +j + +cluster_ii + +ii + +cluster_jj + +jj + + +transpose + + +out[(j, i)] <- a[(i, j)] + + + +insn + + +out[(ii, jj)] <- 2*out[(ii, jj)] + + + +transpose->insn + + + + + diff --git a/doc/images/dep-graph-incorrect.svg b/doc/images/dep-graph-incorrect.svg new file mode 100644 index 0000000000000000000000000000000000000000..363080aef591a2d0ccdb4d0a6ac63520f5bf43df --- /dev/null +++ b/doc/images/dep-graph-incorrect.svg @@ -0,0 +1,40 @@ + + + + + + +loopy_kernel + +cluster_i + +i + +cluster_j + +j + + +transpose + + +out[(j, i)] <- a[(i, j)] + + + +insn + + +out[(i, j)] <- 2*out[(i, j)] + + + +transpose->insn + + + + + diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8c447ef876e67721a9247defc58e9588cdec144c..063e497fbd0f4f25aafc58b8307c678f72979dbc 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1,22 +1,432 @@ -.. _tutorial +.. _tutorial: Tutorial ======== This guide provides a gentle introduction into what loopy is, how it works, and -what it can do. In doing so, some information is omitted or glossed over for -sake of clarity. There's also the :ref:`reference` that clearly defines all +what it can do. There's also the :ref:`reference` that clearly defines all aspects of loopy. -Loopy's View of a Kernel ------------------------- +Preparation +----------- -.. literalinclude:: ../examples/rank-one.py - :start-after: SETUPBEGIN - :end-before: SETUPEND +For now, :mod:`loopy` depends on :mod:`pyopencl`. We import a few modules +and initialize :mod:`pyopencl` -This example is included in the :mod:`loopy` distribution as -:download:`examples/rank-one.py <../examples/rank-one.py>`. +.. doctest:: + :options: +ELLIPSIS + >>> import numpy as np + >>> import pyopencl as cl + >>> import pyopencl.array + >>> import pyopencl.clrandom + >>> import loopy as lp -.. vim: tw=75 + >>> ctx = cl.create_some_context(interactive=False) + >>> queue = cl.CommandQueue(ctx) + +We also create some data on the device that we'll use throughout our +examples: + +.. doctest:: + + >>> n = 1000 + >>> x_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32) + >>> y_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32) + >>> a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32) + >>> b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32) + +And some data on the host: + +.. doctest:: + + >>> x_vec_host = np.random.randn(n).astype(np.float32) + >>> y_vec_host = np.random.randn(n).astype(np.float32) + +Getting started +--------------- + +We'll start by taking a closer look at a very simple kernel that reads in +one vector, doubles it, and writes it to another. + +.. doctest:: + + >>> knl = lp.make_kernel(ctx.devices[0], + ... "{ [i]: 0<=i>> print knl + --------------------------------------------------------------------------- + KERNEL: loopy_kernel + --------------------------------------------------------------------------- + ARGUMENTS: + a: GlobalArg, type: , shape: (n), dim_tags: (stride:1) + n: ValueArg, type: + out: GlobalArg, type: , shape: (n), dim_tags: (stride:1) + --------------------------------------------------------------------------- + DOMAINS: + [n] -> { [i] : i >= 0 and i <= -1 + n } + --------------------------------------------------------------------------- + INAME-TO-TAG MAP: + i: None + --------------------------------------------------------------------------- + INSTRUCTIONS: + [i] out[i] <- 2*a[i] # insn + --------------------------------------------------------------------------- + +You'll likely have noticed that there's quite a bit more information here +than there was in the input. Most of this comes from default values that +loopy assumes to cover common use cases. These defaults can all be +overridden. + +We've seen the domain and the instructions above, and we'll discuss the +'iname-to-tag-map' in :ref:`implementing-inames`. The remaining big chunk +of added information is in the 'arguments' section, where we observe the +following: + +* ``a`` and ``out`` have been classified as pass-by-reference (i.e. + pointer) arguments in global device memory. Any referenced array variable + will default to global unless otherwise specified. + +* Loopy has also examined our access to ``a`` and ``out`` and determined + the bounds of the array from the values we are accessing. This is shown + after **shape:**. Like :mod:`numpy`, loopy works on multi-dimensional + arrays. Loopy's idea of arrays is very similar to that of :mod:`numpy`, + including the *shape* attribute. + + Sometimes, loopy will be unable to make this determination. It will tell + you so--for example when the array indices consist of data read from + memory. Other times, arrays are larger than the accessed footprint. In + either case, you will want to specify the kernel arguments explicitly. + See :ref:`specifying-arguments`. + +* Loopy has not determined the type of ``a`` and ``out``. The data type is + given as ````, which means that these types will be determined + by the data passed in when the kernel is invoked. Loopy generates (and + caches!) a copy of the kernel for each combination of types passed in. + +* In addition, each array axis has a 'dimension tag'. This is shown above + as ``(stride:1)``. We will see more on this in + :ref:`implementing-array-axes`. + +Running a kernel +---------------- + +Running the kernel that we've just created is easy. Let's check the result +for good measure. + +.. doctest:: + + >>> evt, (out,) = knl(queue, a=x_vec_dev) + >>> assert (out.get() == (2*x_vec_dev).get()).all() + +We can have loopy print the OpenCL kernel it generated +by passing :attr:`loopy.Flags.write_cl`. + +.. doctest:: + + >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl") + + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + { + + for (int i = 0; i <= (-1 + n); ++i) + out[i] = 2.0f * a[i]; + } + + +As promised, loopy has used the type of *x_vec_dev* to specialize the +kernel. If a variable is written as part of the kernel code, loopy will +automatically return it in the second element of the result of a kernel +call (the first being the :class:`pyopencl.Event` associated with the +execution of the kernel). (If the ordering of the output tuple is not +clear, it can be specified or turned into a :class:`dict`. See the +*kernel_data* argument of :func:`loopy.make_kernel` and +:attr:`loopy.Flags.return_dict`.) + +For convenience, loopy kernels also directly accept :mod:`numpy` arrays: + +.. doctest:: + + >>> evt, (out,) = knl(queue, a=x_vec_host) + >>> assert (out == (2*x_vec_host)).all() + +Notice how both *out* nor *a* are :mod:`numpy` arrays, but neither needed +to be transferred to or from the device. Checking for numpy arrays and +transferring them if needed comes at a potential performance cost. If you +would like to make sure that you avoid this cost, pass +:attr:`loopy.Flags.no_numpy`. + +Further notice how *n*, while technically being an argument, did not need +to be passed, as loopy is able to find *n* from the shape of the input +argument *a*. + +For efficiency, loopy generates Python code that handles kernel invocation. +If you are suspecting that this code is causing you an issue, you can +inspect that code, too, using :attr:`loopy.Flags.write_wrapper`: + +.. doctest:: + :options: +ELLIPSIS + + >>> evt, (out,) = knl(queue, a=x_vec_host, flags="write_wrapper") + from __future__ import division + ... + def invoke_loopy_kernel_loopy_kernel(cl_kernel, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None): + if allocator is None: + allocator = _lpy_cl_tools.DeferredAllocator(queue.context) + + # {{{ find integer arguments from shapes + + if n is None: + if a is not None: + n = a.shape[0] + elif out is not None: + n = out.shape[0] + + # }}} + ... + +Generating code +~~~~~~~~~~~~~~~ + +Instead of using loopy to run the code it generates, you can also just use +loopy as a code generator and take care of executing the generated kernels +yourself. In this case, make sure loopy knows about all types, and then +call :func:`loopy.generate_code`: + +.. doctest:: + + >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) + >>> code, _ = lp.generate_code(typed_knl) + >>> print code + + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + { + + for (int i = 0; i <= (-1 + n); ++i) + out[i] = 2.0f * a[i]; + } + +Ordering +-------- + +Next, we'll change our kernel a bit. Our goal will be to transpose a matrix +and double its entries, and we will do this in two steps for the sake of +argument: + +.. doctest:: + + >>> # WARNING: Incorrect. + >>> knl = lp.make_kernel(ctx.devices[0], + ... "{ [i,j]: 0<=i,j>> # WARNING: Incorrect. + >>> knl = lp.make_kernel(ctx.devices[0], + ... "{ [i,j]: 0<=i,j>> print knl + --------------------------------------------------------------------------- + KERNEL: loopy_kernel + --------------------------------------------------------------------------- + ... + --------------------------------------------------------------------------- + DEPENDENCIES: (use loopy.show_dependency_graph to visualize) + insn : transpose + --------------------------------------------------------------------------- + +These dependencies are in a ``dependent : prerequisite`` format that should +be familiar if you have previously dealt with Makefiles. For larger +kernels, these dependency lists can become quite verbose, and there is an +increasing risk that required dependencies are missed. To help catch these, +loopy can also show an instruction dependency graph, using +:func:`loopy.show_dependency_graph`: + +.. image:: images/dep-graph-incorrect.svg + +Dependencies are shown as arrows from prerequisite to dependent in the +graph. This functionality requires the open-source `graphviz +`_ graph drawing tools to be installed. The generated +graph will open in a browser window. + +Since manually notating lots of dependencies is cumbersome, loopy has +a heuristic: + + If a variable is written by exactly one instruction, then all + instructions reading that variable will automatically depend on the + writing instruction. + +The intention of this heuristic is to cover the common case of a +precomputed result being stored and used many times. Generally, these +dependencies are *in addition* to any manual dependencies added via +``{dep=...}``. It is possible (but rare) that the heuristic adds undesired +dependencies. In this case, ``{dep=*...}`` (i.e. a leading asterisk) to +prevent the heuristic from adding dependencies for this instruction. + +Next, it is important to understand how loops and dependencies interact. +Let us take a look at the generated code for the above kernel: + +.. doctest:: + + >>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl") + + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + { + + for (int i = 0; i <= (-1 + n); ++i) + for (int j = 0; j <= (-1 + n); ++j) + { + out[n * j + i] = a[n * i + j]; + out[n * i + j] = 2.0f * out[n * i + j]; + } + } + +While our requested instruction ordering has been obeyed, something is +still not right: + +.. doctest:: + :options: +ELLIPSIS + + >>> assert (out.get() == a_mat_dev.get().T*2).all() + Traceback (most recent call last): + ... + AssertionError + +For the kernel to perform the desired computation, *all +instances* (loop iterations) of the first instruction need to be completed, +not just the one for the current values of *(i, j)*. + + Dependencies in loopy act *within* the largest common set of shared + inames. + +As a result, our example above realizes the dependency *within* the *i* and *j* +loops. To fix our example, we simply create a new pair of loops *ii* and *jj* +with identical bounds, for the use of the transpose: + +.. doctest:: + + >>> knl = lp.make_kernel(ctx.devices[0], + ... "{ [i,j,ii,jj]: 0<=i,j,ii,jj>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl") + + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + { + + for (int i = 0; i <= (-1 + n); ++i) + for (int j = 0; j <= (-1 + n); ++j) + out[n * j + i] = a[n * i + j]; + for (int ii = 0; ii <= (-1 + n); ++ii) + for (int jj = 0; jj <= (-1 + n); ++jj) + out[n * ii + jj] = 2.0f * out[n * ii + jj]; + } + >>> assert (out.get() == a_mat_dev.get().T*2).all() + +Also notice how the changed loop structure is reflected in the dependency +graph: + +.. image:: images/dep-graph-correct.svg + +.. _implementing-inames: + +Implementing Inames +------------------- + +.. _specifying-arguments: + +Specifying arguments +-------------------- + +.. _implementing-array-axes: + +Implementing Array Axes +----------------------- + +.. vim: tw=75:spell diff --git a/examples/hello-loopy.py b/examples/hello-loopy.py index 2c8ff4c3bbdfe331596eec1c0dcb49f6bd8b6e46..c7c7ade30c5123938482412e9a4abeca6732ef55 100644 --- a/examples/hello-loopy.py +++ b/examples/hello-loopy.py @@ -23,7 +23,7 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") # execute # ------- -evt, (out,) = knl(queue, a=a, n=n) +evt, (out,) = knl(queue, a=a) # ENDEXAMPLE cknl = lp.CompiledKernel(ctx, knl)