diff --git a/doc/conf.py b/doc/conf.py
index 52691247f0bc4d64330aa54d526a78611475399e..3be8bf76101f2111e0b0996a050c467b11b11e8b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -25,7 +25,12 @@
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode']
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.doctest',
+ ]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
diff --git a/doc/images/dep-graph-correct.svg b/doc/images/dep-graph-correct.svg
new file mode 100644
index 0000000000000000000000000000000000000000..397cb2d101792ac07b6e5ae8897d41cd0a457f54
--- /dev/null
+++ b/doc/images/dep-graph-correct.svg
@@ -0,0 +1,48 @@
+
+
+
+
+
diff --git a/doc/images/dep-graph-incorrect.svg b/doc/images/dep-graph-incorrect.svg
new file mode 100644
index 0000000000000000000000000000000000000000..363080aef591a2d0ccdb4d0a6ac63520f5bf43df
--- /dev/null
+++ b/doc/images/dep-graph-incorrect.svg
@@ -0,0 +1,40 @@
+
+
+
+
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 8c447ef876e67721a9247defc58e9588cdec144c..063e497fbd0f4f25aafc58b8307c678f72979dbc 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1,22 +1,432 @@
-.. _tutorial
+.. _tutorial:
Tutorial
========
This guide provides a gentle introduction into what loopy is, how it works, and
-what it can do. In doing so, some information is omitted or glossed over for
-sake of clarity. There's also the :ref:`reference` that clearly defines all
+what it can do. There's also the :ref:`reference` that clearly defines all
aspects of loopy.
-Loopy's View of a Kernel
-------------------------
+Preparation
+-----------
-.. literalinclude:: ../examples/rank-one.py
- :start-after: SETUPBEGIN
- :end-before: SETUPEND
+For now, :mod:`loopy` depends on :mod:`pyopencl`. We import a few modules
+and initialize :mod:`pyopencl`:
-This example is included in the :mod:`loopy` distribution as
-:download:`examples/rank-one.py <../examples/rank-one.py>`.
+.. doctest::
+ :options: +ELLIPSIS
+
+ >>> import numpy as np
+ >>> import pyopencl as cl
+ >>> import pyopencl.array
+ >>> import pyopencl.clrandom
+ >>> import loopy as lp
-.. vim: tw=75
+ >>> ctx = cl.create_some_context(interactive=False)
+ >>> queue = cl.CommandQueue(ctx)
+
+We also create some data on the device that we'll use throughout our
+examples:
+
+.. doctest::
+
+ >>> n = 1000
+ >>> x_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
+ >>> y_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
+ >>> a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
+ >>> b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
+
+And some data on the host:
+
+.. doctest::
+
+ >>> x_vec_host = np.random.randn(n).astype(np.float32)
+ >>> y_vec_host = np.random.randn(n).astype(np.float32)
+
+Getting started
+---------------
+
+We'll start by taking a closer look at a very simple kernel that reads in
+one vector, doubles it, and writes it to another.
+
+.. doctest::
+
+ >>> knl = lp.make_kernel(ctx.devices[0],
+ ... "{ [i]: 0<=i>> print knl
+ ---------------------------------------------------------------------------
+ KERNEL: loopy_kernel
+ ---------------------------------------------------------------------------
+ ARGUMENTS:
+ a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1)
+ n: ValueArg, type: <runtime>
+ out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1)
+ ---------------------------------------------------------------------------
+ DOMAINS:
+ [n] -> { [i] : i >= 0 and i <= -1 + n }
+ ---------------------------------------------------------------------------
+ INAME-TO-TAG MAP:
+ i: None
+ ---------------------------------------------------------------------------
+ INSTRUCTIONS:
+ [i] out[i] <- 2*a[i] # insn
+ ---------------------------------------------------------------------------
+
+You'll likely have noticed that there's quite a bit more information here
+than there was in the input. Most of this comes from default values that
+loopy assumes to cover common use cases. These defaults can all be
+overridden.
+
+We've seen the domain and the instructions above, and we'll discuss the
+'iname-to-tag-map' in :ref:`implementing-inames`. The remaining big chunk
+of added information is in the 'arguments' section, where we observe the
+following:
+
+* ``a`` and ``out`` have been classified as pass-by-reference (i.e.
+ pointer) arguments in global device memory. Any referenced array variable
+ will default to global unless otherwise specified.
+
+* Loopy has also examined our access to ``a`` and ``out`` and determined
+ the bounds of the array from the values we are accessing. This is shown
+ after **shape:**. Like :mod:`numpy`, loopy works on multi-dimensional
+ arrays. Loopy's idea of arrays is very similar to that of :mod:`numpy`,
+ including the *shape* attribute.
+
+ Sometimes, loopy will be unable to make this determination. It will tell
+ you so--for example when the array indices consist of data read from
+ memory. Other times, arrays are larger than the accessed footprint. In
+ either case, you will want to specify the kernel arguments explicitly,
+ as sketched just after this list. See :ref:`specifying-arguments`.
+
+* Loopy has not determined the type of ``a`` and ``out``. The data type is
+ given as ``<runtime>``, which means that these types will be determined
+ by the data passed in when the kernel is invoked. Loopy generates (and
+ caches!) a copy of the kernel for each combination of types passed in.
+
+* In addition, each array axis has a 'dimension tag'. This is shown above
+ as ``(stride:1)``. We will see more on this in
+ :ref:`implementing-array-axes`.
+
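+As mentioned above, you can also state this information yourself instead
+of letting loopy guess. The following is only a sketch (the exact
+signatures of ``GlobalArg``/``ValueArg`` and of the *kernel_data*
+argument may differ slightly; see :ref:`specifying-arguments` for the
+full story)::
+
+    # Sketch: declare dtypes and shapes explicitly rather than having
+    # loopy infer them from the accesses in the instructions.
+    knl_explicit = lp.make_kernel(ctx.devices[0],
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]",
+        kernel_data=[
+            lp.GlobalArg("a", np.float32, shape="n"),
+            lp.GlobalArg("out", np.float32, shape="n"),
+            lp.ValueArg("n", np.int32),
+            ])
+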
+Running a kernel
+----------------
+
+Running the kernel that we've just created is easy. Let's check the result
+for good measure.
+
+.. doctest::
+
+ >>> evt, (out,) = knl(queue, a=x_vec_dev)
+ >>> assert (out.get() == (2*x_vec_dev).get()).all()
+
+We can have loopy print the OpenCL kernel it generated
+by passing :attr:`loopy.Flags.write_cl`.
+
+.. doctest::
+
+ >>> evt, (out,) = knl(queue, a=x_vec_dev, flags="write_cl")
+
+ #define lid(N) ((int) get_local_id(N))
+ #define gid(N) ((int) get_group_id(N))
+
+ __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+ {
+
+ for (int i = 0; i <= (-1 + n); ++i)
+ out[i] = 2.0f * a[i];
+ }
+
+
+As promised, loopy has used the type of *x_vec_dev* to specialize the
+kernel. If a variable is written as part of the kernel code, loopy will
+automatically return it in the second element of the result of a kernel
+call (the first being the :class:`pyopencl.Event` associated with the
+execution of the kernel). (If the ordering of the output tuple is not
+clear, it can be specified or turned into a :class:`dict`. See the
+*kernel_data* argument of :func:`loopy.make_kernel` and
+:attr:`loopy.Flags.return_dict`.)
+
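+As a sketch only (the flag string below mirrors the ``write_cl`` example
+above, and the exact structure of the return value is an assumption, not
+verified output), requesting dictionary-style output might look like
+this::
+
+    # Sketch: receive outputs keyed by argument name instead of relying
+    # on the ordering of the returned tuple.
+    evt, out_dict = knl(queue, a=x_vec_dev, flags="return_dict")
+    out = out_dict["out"]
+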
+For convenience, loopy kernels also directly accept :mod:`numpy` arrays:
+
+.. doctest::
+
+ >>> evt, (out,) = knl(queue, a=x_vec_host)
+ >>> assert (out == (2*x_vec_host)).all()
+
+Notice how both *out* and *a* are :mod:`numpy` arrays, but neither needed
+to be transferred to or from the device. Checking for numpy arrays and
+transferring them if needed comes at a potential performance cost. If you
+would like to make sure that you avoid this cost, pass
+:attr:`loopy.Flags.no_numpy`.
+
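+If you do pass :attr:`loopy.Flags.no_numpy`, you are responsible for the
+transfers yourself. A sketch (the string spelling of the flag mirrors the
+``write_cl`` example above and is an assumption)::
+
+    # Sketch: with no_numpy, hand loopy device arrays and fetch the
+    # result explicitly.
+    evt, (out,) = knl(queue, a=x_vec_dev, flags="no_numpy")
+    result = out.get()  # explicit device-to-host transfer
+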
+Further notice how *n*, while technically being an argument, did not need
+to be passed, as loopy is able to find *n* from the shape of the input
+argument *a*.
+
+For efficiency, loopy generates Python code that handles kernel invocation.
+If you suspect that this code is causing you an issue, you can
+inspect that code, too, using :attr:`loopy.Flags.write_wrapper`:
+
+.. doctest::
+ :options: +ELLIPSIS
+
+ >>> evt, (out,) = knl(queue, a=x_vec_host, flags="write_wrapper")
+ from __future__ import division
+ ...
+ def invoke_loopy_kernel_loopy_kernel(cl_kernel, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
+ if allocator is None:
+ allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
+
+ # {{{ find integer arguments from shapes
+
+ if n is None:
+ if a is not None:
+ n = a.shape[0]
+ elif out is not None:
+ n = out.shape[0]
+
+ # }}}
+ ...
+
+Generating code
+~~~~~~~~~~~~~~~
+
+Instead of using loopy to run the code it generates, you can also just use
+loopy as a code generator and take care of executing the generated kernels
+yourself. In this case, make sure loopy knows about all types, and then
+call :func:`loopy.generate_code`:
+
+.. doctest::
+
+ >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
+ >>> code, _ = lp.generate_code(typed_knl)
+ >>> print code
+
+ #define lid(N) ((int) get_local_id(N))
+ #define gid(N) ((int) get_group_id(N))
+
+ __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+ {
+
+ for (int i = 0; i <= (-1 + n); ++i)
+ out[i] = 2.0f * a[i];
+ }
+
+Ordering
+--------
+
+Next, we'll change our kernel a bit. Our goal will be to transpose a matrix
+and double its entries, and we will do this in two steps for the sake of
+argument:
+
+.. doctest::
+
+ >>> # WARNING: Incorrect.
+ >>> knl = lp.make_kernel(ctx.devices[0],
+ ... "{ [i,j]: 0<=i,j<n }",
+ ... """
+ ... out[j,i] = a[i,j]
+ ... out[i,j] = 2*out[i,j]
+ ... """)
+
+Loopy's programming model is (by default) completely *unordered*: nothing
+forces the transpose instruction to execute before the doubling
+instruction, which is why the kernel above deserves its warning. To
+request that ordering, we give the first instruction the identifier
+*transpose* using ``{id=transpose}`` and make the second instruction
+depend on it using ``{dep=transpose}`` (the kernel is still marked
+incorrect; we will see why shortly):
+
+.. doctest::
+
+ >>> # WARNING: Incorrect.
+ >>> knl = lp.make_kernel(ctx.devices[0],
+ ... "{ [i,j]: 0<=i,j<n }",
+ ... """
+ ... out[j,i] = a[i,j] {id=transpose}
+ ... out[i,j] = 2*out[i,j] {dep=transpose}
+ ... """)
+ >>> print knl
+ ---------------------------------------------------------------------------
+ KERNEL: loopy_kernel
+ ---------------------------------------------------------------------------
+ ...
+ ---------------------------------------------------------------------------
+ DEPENDENCIES: (use loopy.show_dependency_graph to visualize)
+ insn : transpose
+ ---------------------------------------------------------------------------
+
+These dependencies are in a ``dependent : prerequisite`` format that should
+be familiar if you have previously dealt with Makefiles. For larger
+kernels, these dependency lists can become quite verbose, and there is an
+increasing risk that required dependencies are missed. To help catch these,
+loopy can also show an instruction dependency graph, using
+:func:`loopy.show_dependency_graph`:
+
+.. image:: images/dep-graph-incorrect.svg
+
+Dependencies are shown as arrows from prerequisite to dependent in the
+graph. This functionality requires the open-source `graphviz
+<http://www.graphviz.org>`_ graph drawing tools to be installed. The generated
+graph will open in a browser window.
+
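+In code, producing this picture for the kernel above is a single call. It
+is not run as part of the doctests here because it opens a browser
+window::
+
+    # Open the instruction dependency graph of the kernel in a browser.
+    lp.show_dependency_graph(knl)
+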
+Since manually notating lots of dependencies is cumbersome, loopy has
+a heuristic:
+
+ If a variable is written by exactly one instruction, then all
+ instructions reading that variable will automatically depend on the
+ writing instruction.
+
+The intention of this heuristic is to cover the common case of a
+precomputed result being stored and used many times. Generally, these
+dependencies are *in addition* to any manual dependencies added via
+``{dep=...}``. It is possible (but rare) that the heuristic adds undesired
+dependencies. In this case, use ``{dep=*...}`` (i.e. a leading asterisk) to
+prevent the heuristic from adding dependencies for this instruction.
+
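+Schematically, and purely to illustrate the syntax described above (the
+semantics of combining the asterisk with a named dependency are not
+spelled out here), such an instruction might read::
+
+    out[i,j] = 2*out[i,j] {dep=*transpose}
+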
+Next, it is important to understand how loops and dependencies interact.
+Let us take a look at the generated code for the above kernel:
+
+.. doctest::
+
+ >>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+
+ #define lid(N) ((int) get_local_id(N))
+ #define gid(N) ((int) get_group_id(N))
+
+ __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+ {
+
+ for (int i = 0; i <= (-1 + n); ++i)
+ for (int j = 0; j <= (-1 + n); ++j)
+ {
+ out[n * j + i] = a[n * i + j];
+ out[n * i + j] = 2.0f * out[n * i + j];
+ }
+ }
+
+While our requested instruction ordering has been obeyed, something is
+still not right:
+
+.. doctest::
+ :options: +ELLIPSIS
+
+ >>> assert (out.get() == a_mat_dev.get().T*2).all()
+ Traceback (most recent call last):
+ ...
+ AssertionError
+
+For the kernel to perform the desired computation, *all
+instances* (loop iterations) of the first instruction need to be completed,
+not just the one for the current values of *(i, j)*.
+
+ Dependencies in loopy act *within* the largest common set of shared
+ inames.
+
+As a result, our example above realizes the dependency *within* the *i* and *j*
+loops. To fix our example, we simply create a new pair of loops *ii* and *jj*
+with identical bounds, for the use of the transpose:
+
+.. doctest::
+
+ >>> knl = lp.make_kernel(ctx.devices[0],
+ ... "{ [i,j,ii,jj]: 0<=i,j,ii,jj>> evt, (out,) = knl(queue, a=a_mat_dev, flags="write_cl")
+
+ #define lid(N) ((int) get_local_id(N))
+ #define gid(N) ((int) get_group_id(N))
+
+ __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+ {
+
+ for (int i = 0; i <= (-1 + n); ++i)
+ for (int j = 0; j <= (-1 + n); ++j)
+ out[n * j + i] = a[n * i + j];
+ for (int ii = 0; ii <= (-1 + n); ++ii)
+ for (int jj = 0; jj <= (-1 + n); ++jj)
+ out[n * ii + jj] = 2.0f * out[n * ii + jj];
+ }
+ >>> assert (out.get() == a_mat_dev.get().T*2).all()
+
+Also notice how the changed loop structure is reflected in the dependency
+graph:
+
+.. image:: images/dep-graph-correct.svg
+
+.. _implementing-inames:
+
+Implementing Inames
+-------------------
+
+.. _specifying-arguments:
+
+Specifying arguments
+--------------------
+
+.. _implementing-array-axes:
+
+Implementing Array Axes
+-----------------------
+
+.. vim: tw=75:spell
diff --git a/examples/hello-loopy.py b/examples/hello-loopy.py
index 2c8ff4c3bbdfe331596eec1c0dcb49f6bd8b6e46..c7c7ade30c5123938482412e9a4abeca6732ef55 100644
--- a/examples/hello-loopy.py
+++ b/examples/hello-loopy.py
@@ -23,7 +23,7 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
# execute
# -------
-evt, (out,) = knl(queue, a=a, n=n)
+evt, (out,) = knl(queue, a=a)
# ENDEXAMPLE
cknl = lp.CompiledKernel(ctx, knl)