From 844ff2923726450991df300d13b6e7aa2481bef6 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 24 Feb 2016 01:17:02 -0600
Subject: [PATCH] Start to clean out the Augean stables of loopy docs

---
 doc/conf.py                           |   9 +-
 doc/index.rst                         |   5 +-
 doc/ref_creation.rst                  |  33 ++
 doc/{reference.rst => ref_kernel.rst} | 570 ++++++++++----------------
 doc/ref_other.rst                     |  46 +++
 doc/ref_transform.rst                 | 135 ++++++
 doc/tutorial.rst                      |   8 +-
 loopy/__init__.py                     |  34 +-
 loopy/compiled.py                     |  18 +-
 loopy/frontend/fortran/__init__.py    |   5 +-
 loopy/frontend/fortran/translator.py  |   4 +-
 loopy/kernel/__init__.py              |  11 +-
 loopy/kernel/creation.py              |  22 +-
 loopy/kernel/data.py                  |  20 +-
 loopy/statistics.py                   |  14 +
 loopy/target/__init__.py              |  19 +-
 loopy/target/c/__init__.py            |   3 +
 loopy/target/cuda.py                  |   2 +
 loopy/target/ispc.py                  |   5 +
 loopy/target/opencl.py                |   3 +
 loopy/target/pyopencl.py              |   9 +
 loopy/transform/arithmetic.py         |  99 -----
 loopy/transform/batch.py              |   4 +
 loopy/transform/iname.py              | 138 ++++++-
 loopy/transform/parameter.py          |  43 ++
 loopy/transform/subst.py              |   2 +-
 26 files changed, 761 insertions(+), 500 deletions(-)
 create mode 100644 doc/ref_creation.rst
 rename doc/{reference.rst => ref_kernel.rst} (72%)
 create mode 100644 doc/ref_other.rst
 create mode 100644 doc/ref_transform.rst

diff --git a/doc/conf.py b/doc/conf.py
index 301604607..74e1aec30 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -46,7 +46,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'loopy'
-copyright = u'2011, Andreas Klöckner'
+copyright = u'2016, Andreas Klöckner'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -54,7 +54,10 @@ copyright = u'2011, Andreas Klöckner'
 #
 # The short X.Y version.
 ver_dic = {}
-exec(compile(open("../loopy/version.py").read(), "../loopy/version.py", 'exec'), ver_dic)
+with open("../loopy/version.py") as vpy_file:
+    version_py = vpy_file.read()
+
+exec(compile(version_py, "../loopy/version.py", 'exec'), ver_dic)
 version = ".".join(str(x) for x in ver_dic["VERSION"])
 # The full version, including alpha/beta/rc tags.
 release = ver_dic["VERSION_TEXT"]
@@ -249,4 +252,4 @@ intersphinx_mapping = {
     'http://docs.scipy.org/doc/numpy/': None,
     }
 
-autoclass_content = "both"
+autoclass_content = "class"
diff --git a/doc/index.rst b/doc/index.rst
index 19bbe8772..a0bad2898 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -63,7 +63,10 @@ Please check :ref:`installation` to get started.
     :maxdepth: 2
 
     tutorial
-    reference
+    ref_creation
+    ref_kernel
+    ref_transform
+    ref_other
     misc
 
 Indices and tables
diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst
new file mode 100644
index 000000000..92eff09c9
--- /dev/null
+++ b/doc/ref_creation.rst
@@ -0,0 +1,33 @@
+.. module:: loopy
+.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
+
+.. _creating-kernels:
+
+Reference: Creating Kernels
+===========================
+
+From Loop Domains and Instructions
+----------------------------------
+
+.. autofunction:: make_kernel
+
+From Fortran
+------------
+
+.. autofunction:: parse_fortran
+
+.. autofunction:: parse_transformed_fortran
+
+.. autofunction:: c_preprocess
+
+From Other Kernels
+------------------
+
+.. autofunction:: fuse_kernels
+
+To Copy between Data Formats
+----------------------------
+
+.. autofunction:: make_copy_kernel
+
+.. vim: tw=75:spell:fdm=marker
diff --git a/doc/reference.rst b/doc/ref_kernel.rst
similarity index 72%
rename from doc/reference.rst
rename to doc/ref_kernel.rst
index 351a2374a..ff27d84f5 100644
--- a/doc/reference.rst
+++ b/doc/ref_kernel.rst
@@ -1,21 +1,39 @@
-.. _reference:
+.. currentmodule:: loopy
 
-Reference Guide
-===============
+Reference: Loopy's Model of a Kernel
+====================================
 
-.. module:: loopy
-.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
+.. _domain-tree:
 
-This guide defines all functionality exposed by loopy. If you would like
-a more gentle introduction, you may consider reading the example-based
-:ref:`tutorial` instead.
+Loop Domain Tree
+----------------
 
-.. _inames:
+.. {{{
+
+Example::
+
+    { [i]: 0<=i<n }
+
+A kernel's iteration domain is given by a list of :class:`islpy.BasicSet`
+instances (which parametrically represent multi-dimensional sets of
+tuples of integers).  They define the integer values of the loop variables
+for which instructions (see below) will be executed.
+The example is written in :ref:`isl-syntax`.  :mod:`loopy` calls the loop variables
+*inames*. In this case, *i* is the sole iname. The loop
+domain is given as a conjunction of affine equality
+and inequality constraints. Integer divisibility constraints (resulting
+in strides) are also allowed. In the absence of divisibility
+constraints, the loop domain is convex.
 
-Domain Tree
------------
+Note that *n* in the example is not an iname. It is a
+:ref:`domain parameter <domain-parameters>` that is passed to the kernel by the user.
 
+To accommodate some data-dependent control flow, there is not actually
+a single loop domain, but rather a *tree of loop domains*,
+allowing more deeply nested domains to depend on inames
+introduced by domains closer to the root.
 
+.. _inames:
 
 Inames
 ^^^^^^
@@ -25,161 +43,10 @@ dependency semantics--otherwise e.g. a fetch could happen inside one loop nest,
 and then the instruction using that fetch could be inside a wholly different
 loop nest.
 
-Instructions
-------------
-
-Expressions
-^^^^^^^^^^^
-
-Loopy's expressions are a slight superset of the expressions supported by
-:mod:`pymbolic`.
-
-* `if`
-* `reductions`
-    * duplication of reduction inames
-* complex-valued arithmetic
-* tagging of array access and substitution rule use ("$")
-* ``indexof``, ``indexof_vec``
-
-.. _types:
-
-Specifying Types
-----------------
-
-:mod:`loopy` uses the same type system as :mod:`numpy`. (See
-:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of
-user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype`
-and related functions.
-
-For a string representation of types, all numpy types (e.g. ``float32`` etc.)
-are accepted, in addition to what is registered in :mod:`pyopencl`.
-
-.. _iname-tags:
-
-Iname Implementation Tags
--------------------------
-
-=============================== ====================================================
-Tag                             Meaning
-=============================== ====================================================
-``None`` | ``"for"``            Sequential loop
-``"l.N"``                       Local (intra-group) axis N ("local")
-``"g.N"``                       Group-number axis N ("group")
-``"unr"``                       Unroll
-``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
-``"ilp.seq"``                   Realize parallel iname as innermost loop
-``"like.INAME"``                Can be used when tagging inames to tag like another
-``"unused.g"`` | ``"unused.l"`` Can be to tag as the next unused group/local axis
-=============================== ====================================================
-
-(Throughout this table, `N` must be replaced by an actual, zero-based number.)
-
-"ILP" does three things:
-
-* Restricts loops to be innermost
-* Duplicates reduction storage for any reductions nested around ILP usage
-* Causes a loop (unrolled or not) to be opened/generated for each
-  involved instruction
-
-.. _data-dim-tags:
-
-Data Axis Tags
---------------
-
-Data axis tags specify how a multi-dimensional array (which is loopy's
-main way of storing data) is represented in (linear, 1D) computer
-memory. This storage format is given as a number of "tags", as listed
-in the table below. Each axis of an array has a tag corresponding to it.
-In the user interface, array dim tags are specified as a tuple of these
-tags or a comma-separated string containing them, such as the following::
-
-    c,vec,sep,c
-
-The interpretation of these tags is order-dependent, they are read
-from left to right.
-
-===================== ====================================================
-Tag                   Meaning
-===================== ====================================================
-``c``                 Nest current axis around the ones that follow
-``f``                 Nest current axis inside the ones that follow
-``N0`` ... ``N9``     Specify an explicit nesting level for this axis
-``stride:EXPR``       A fixed stride
-``sep``               Implement this axis by mapping to separate arrays
-``vec``               Implement this axis as entries in a vector
-===================== ====================================================
-
-``sep`` and ``vec`` obviously require the number of entries
-in the array along their respective axis to be known at code
-generation time.
-
-When the above speaks about 'nesting levels', this means that axes
-"nested inside" others are "faster-moving" when viewed from linear
-memory.
-
-In addition, each tag may be followed by a question mark (``?``),
-which indicates that if there are more dimension tags specified
-than array axes present, that this axis should be omitted. Axes
-with question marks are omitted in a left-first manner until the correct
-number of dimension tags is achieved.
-
-Some examples follow, all of which use a three-dimensional array of shape
-*(3, M, 4)*. For simplicity, we assume that array entries have size one.
-
-*   ``c,c,c``: The axes will have strides *(M*4, 4, 1)*,
-    leading to a C-like / row-major layout.
-
-*   ``f,f,f``: The axes will have strides *(1, 3, 3*M)*,
-    leading to a Fortran-like / row-major layout.
-
-*   ``sep,c,c``: The array will be mapped to three arrays of
-    shape *(M, 4)*, each with strides *(4, 1)*.
-
-*   ``c,c,vec``: The array will be mapped to an array of
-    ``float4`` vectors, with (``float4``-based) strides of
-    *(M, 1)*.
-
-*   ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*.
-
-.. _creating-kernels:
-
-Creating Kernels
-----------------
-
-.. autoclass:: auto
-
-.. _arguments:
-
-Arguments
-^^^^^^^^^
-
-.. autoclass:: ValueArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: GlobalArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: ConstantArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: ImageArg
-    :members:
-    :undoc-members:
-
-.. _temporaries:
-
-Loop domains
-^^^^^^^^^^^^
-
-TODO: Explain the domain tree
-
 .. _isl-syntax:
 
 ISL syntax
-~~~~~~~~~~
+^^^^^^^^^^
 
 The general syntax of an ISL set is the following::
 
@@ -217,27 +84,73 @@ Examples of constructs that are **not** allowed:
   (**Note:** This may be added in a future version of loopy.
   For now, loop domains have to be convex.)
 
-Temporary Variables
-^^^^^^^^^^^^^^^^^^^
+.. _domain-parameters:
 
-Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
-have the lifetime of a kernel invocation.
+Domain parameters
+^^^^^^^^^^^^^^^^^
 
-.. autoclass:: TemporaryVariable
-    :members:
-    :undoc-members:
+Domain parameters are identifiers being used in loop domains that are not
+*inames*, i.e. they do not define loop variables. In the following domain
+specification, *n* is a domain parameter::
+
+    {[i,j]: 0 <= i,j < n}
+
+Values of domain parameters arise from
+
+* being passed to the kernel as :ref:`arguments`
+
+* being assigned to :ref:`temporaries` to feed into domains
+  lower in the :ref:`domain-tree`.
+
+.. _iname-tags:
+
+Iname Implementation Tags
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============================== ====================================================
+Tag                             Meaning
+=============================== ====================================================
+``None`` | ``"for"``            Sequential loop
+``"l.N"``                       Local (intra-group) axis N ("local")
+``"g.N"``                       Group-number axis N ("group")
+``"unr"``                       Unroll
+``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
+``"ilp.seq"``                   Realize parallel iname as innermost loop
+``"like.INAME"``                Can be used when tagging inames to tag like another
+``"unused.g"`` | ``"unused.l"`` Used to tag as the next unused group/local axis
+=============================== ====================================================
+
+(Throughout this table, `N` must be replaced by an actual, zero-based number.)
+
+"ILP" does three things:
+
+* Restricts loops to be innermost
+* Duplicates reduction storage for any reductions nested around ILP usage
+* Causes a loop (unrolled or not) to be opened/generated for each
+  involved instruction
+
+.. }}}
+
+.. _instructions:
 
 Instructions
-^^^^^^^^^^^^
+------------
 
-.. autoclass:: UniqueName
+.. {{{
 
 .. autoclass:: InstructionBase
 
 .. _assignments:
 
-Assignments
-~~~~~~~~~~~
+Assignment objects
+^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: Assignment
+
+.. _assignment-syntax:
+
+Textual Assignment Syntax
+^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The general syntax of an instruction is a simple assignment::
 
@@ -349,254 +262,205 @@ These are usually key-value pairs. The following attributes are recognized:
   given instruction groups. See
   :class:`InstructionBase.conflicts_with_groups`.
 
-Assignment instructions are expressed as instances of the following class:
+.. _expression-syntax:
 
-.. autoclass:: ExpressionInstruction
+Expressions
+^^^^^^^^^^^
 
-.. _expression-syntax:
+Loopy's expressions are a slight superset of the expressions supported by
+:mod:`pymbolic`.
 
-Expression Syntax
-~~~~~~~~~~~~~~~~~
+* `if`
+* `reductions`
+    * duplication of reduction inames
+* complex-valued arithmetic
+* tagging of array access and substitution rule use ("$")
+* ``indexof``, ``indexof_vec``
 
 TODO: Functions
 TODO: Reductions
 
 C Block Instructions
-~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^
 
 .. autoclass:: CInstruction
 
-.. _substitution-rule:
-
-Substitution Rules
-^^^^^^^^^^^^^^^^^^
-
-Syntax of a substitution rule::
-
-    rule_name(arg1, arg2) := EXPRESSION
+.. }}}
 
-Kernels
-^^^^^^^
+Data: Arguments and Temporaries
+-------------------------------
 
-.. class:: LoopKernel
+.. {{{
 
-Do not create :class:`LoopKernel` objects directly. Instead, use the following
-function, which is responsible for creating kernels:
+Kernels operate on two types of data: 'arguments' carrying data into and out of a kernel,
+and temporaries with lifetimes tied to the runtime of the kernel.
 
-.. autofunction:: make_kernel
-
-.. autofunction:: parse_fortran
-
-.. autofunction:: parse_transformed_fortran
+.. _arguments:
 
-.. autofunction:: make_copy_kernel
+Arguments
+^^^^^^^^^
 
-.. autofunction:: fuse_kernels
+.. autoclass:: KernelArgument
+    :members:
+    :undoc-members:
 
-.. autofunction:: c_preprocess
+.. autoclass:: ValueArg
+    :members:
+    :undoc-members:
 
-Transforming Kernels
---------------------
+.. autoclass:: GlobalArg
+    :members:
+    :undoc-members:
 
-.. _context-matching:
+.. autoclass:: ConstantArg
+    :members:
+    :undoc-members:
 
-Matching contexts
-^^^^^^^^^^^^^^^^^
+.. autoclass:: ImageArg
+    :members:
+    :undoc-members:
 
-TODO: Matching instruction tags
+.. _temporaries:
 
-.. automodule:: loopy.context_matching
+Temporary Variables
+^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: parse_match
+Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
+have the lifetime of a kernel invocation.
 
-.. autofunction:: parse_stack_match
+.. autoclass:: TemporaryVariable
+    :members:
+    :undoc-members:
 
-.. currentmodule:: loopy
+.. _types:
 
-Wrangling inames
+Specifying Types
 ^^^^^^^^^^^^^^^^
 
-.. autofunction:: split_iname
-
-.. autofunction:: chunk_iname
-
-.. autofunction:: join_inames
-
-.. autofunction:: tag_inames
-
-.. autofunction:: duplicate_inames
-
-.. undocumented .. autofunction:: link_inames
-
-.. autofunction:: rename_iname
-
-.. autofunction:: remove_unused_inames
-
-.. autofunction:: set_loop_priority
-
-.. autofunction:: split_reduction_inward
-
-.. autofunction:: split_reduction_outward
-
-.. autofunction:: affine_map_inames
-
-.. autofunction:: realize_ilp
-
-.. autofunction:: find_unused_axis_tag
-
-Dealing with Parameters
-^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autofunction:: fix_parameters
-
-.. autofunction:: assume
+:mod:`loopy` uses the same type system as :mod:`numpy`. (See
+:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of
+user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype`
+and related functions.
 
-Dealing with Substitution Rules
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+For a string representation of types, all numpy types (e.g. ``float32`` etc.)
+are accepted, in addition to what is registered in :mod:`pyopencl`.
 
-.. autofunction:: extract_subst
+.. _data-dim-tags:
 
-.. autofunction:: assignment_to_subst
+Data Axis Tags
+^^^^^^^^^^^^^^
 
-.. autofunction:: expand_subst
+Data axis tags specify how a multi-dimensional array (which is loopy's
+main way of storing data) is represented in (linear, 1D) computer
+memory. This storage format is given as a number of "tags", as listed
+in the table below. Each axis of an array has a tag corresponding to it.
+In the user interface, array dim tags are specified as a tuple of these
+tags or a comma-separated string containing them, such as the following::
 
-.. autofunction:: find_rules_matching
+    c,vec,sep,c
 
-.. autofunction:: find_one_rule_matching
+The interpretation of these tags is order-dependent, they are read
+from left to right.
 
-Caching, Precomputation and Prefetching
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+===================== ====================================================
+Tag                   Meaning
+===================== ====================================================
+``c``                 Nest current axis around the ones that follow
+``f``                 Nest current axis inside the ones that follow
+``N0`` ... ``N9``     Specify an explicit nesting level for this axis
+``stride:EXPR``       A fixed stride
+``sep``               Implement this axis by mapping to separate arrays
+``vec``               Implement this axis as entries in a vector
+===================== ====================================================
 
-.. autofunction:: precompute
+``sep`` and ``vec`` obviously require the number of entries
+in the array along their respective axis to be known at code
+generation time.
 
-.. autofunction:: add_prefetch
+When the above speaks about 'nesting levels', this means that axes
+"nested inside" others are "faster-moving" when viewed from linear
+memory.
 
-.. autofunction:: buffer_array
+In addition, each tag may be followed by a question mark (``?``),
+which indicates that if there are more dimension tags specified
+than array axes present, that this axis should be omitted. Axes
+with question marks are omitted in a left-first manner until the correct
+number of dimension tags is achieved.
 
-.. autofunction:: alias_temporaries
+Some examples follow, all of which use a three-dimensional array of shape
+*(3, M, 4)*. For simplicity, we assume that array entries have size one.
 
-Influencing data access
-^^^^^^^^^^^^^^^^^^^^^^^
+*   ``c,c,c``: The axes will have strides *(M*4, 4, 1)*,
+    leading to a C-like / row-major layout.
 
-.. autofunction:: change_arg_to_image
+*   ``f,f,f``: The axes will have strides *(1, 3, 3*M)*,
+    leading to a Fortran-like / column-major layout.
 
-.. autofunction:: tag_data_axes
+*   ``sep,c,c``: The array will be mapped to three arrays of
+    shape *(M, 4)*, each with strides *(4, 1)*.
 
-.. autofunction:: remove_unused_arguments
+*   ``c,c,vec``: The array will be mapped to an array of
+    ``float4`` vectors, with (``float4``-based) strides of
+    *(M, 1)*.
 
-.. autofunction:: set_array_dim_names
+*   ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*.
 
-Padding
-^^^^^^^
+.. }}}
 
-.. autofunction:: split_array_dim
+.. _substitution-rule:
 
-.. autofunction:: find_padding_multiple
+Substitution Rules
+------------------
 
-.. autofunction:: add_padding
+.. {{{
 
-Manipulating Instructions
+Substitution Rule Objects
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: set_instruction_priority
-
-.. autofunction:: add_dependency
-
-.. autofunction:: remove_instructions
-
-.. autofunction:: tag_instructions
-
-Library interface
-^^^^^^^^^^^^^^^^^
-
-.. autofunction:: register_reduction_parser
-
-.. autofunction:: register_preamble_generators
-
-.. autofunction:: register_symbol_manglers
-
-.. autofunction:: register_function_manglers
-
-Arguments
-^^^^^^^^^
-
-.. autofunction:: set_argument_order
-
-.. autofunction:: add_dtypes
-
-.. autofunction:: infer_unknown_types
-
-.. autofunction:: add_and_infer_dtypes
-
-.. autofunction:: rename_argument
-
-Batching
-^^^^^^^^
+.. autoclass:: SubstitutionRule
 
-.. autofunction:: to_batched
+.. _subst-rule-syntax:
 
-Finishing up
-^^^^^^^^^^^^
+Textual Syntax for Substitution Rules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: generate_loop_schedules
-
-.. autofunction:: get_one_scheduled_kernel
-
-.. autofunction:: generate_code
-
-Running
--------
-
-.. autoclass:: CompiledKernel
-
-Automatic Testing
------------------
-
-.. autofunction:: auto_test_vs_ref
-
-Troubleshooting
----------------
-
-Printing :class:`LoopKernel` objects
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you're confused about things loopy is referring to in an error message or
-about the current state of the :class:`LoopKernel` you are transforming, the
-following always works::
-
-    print kernel
+Syntax of a substitution rule::
 
-(And it yields a human-readable--albeit terse--representation of *kernel*.)
+    rule_name(arg1, arg2) := EXPRESSION
 
-.. autofunction:: preprocess_kernel
+.. }}}
 
-.. autofunction:: get_dot_dependency_graph
+Kernel Options
+--------------
 
-.. autofunction:: show_dependency_graph
+.. autoclass:: Options
 
-Options
+Targets
 -------
 
-.. autoclass:: Options
+.. automodule:: loopy.target
 
-.. autofunction:: set_options
+Helper values
+-------------
 
-Controlling caching
--------------------
+.. {{{
 
-.. autofunction:: set_caching_enabled
+.. autoclass:: auto
 
-.. autoclass:: CacheMode
+.. autoclass:: UniqueName
 
-Obtaining Kernel Statistics
----------------------------
+.. }}}
 
-.. autofunction:: get_op_poly
+The Kernel Object
+-----------------
 
-.. autofunction:: get_gmem_access_poly
+Do not create :class:`LoopKernel` objects directly. Instead, refer to
+:ref:`creating-kernels`.
 
-.. autofunction:: sum_mem_access_to_bytes
+.. autoclass:: LoopKernel
 
-.. autofunction:: get_barrier_poly
+.. autoclass:: kernel_state
+    :members:
+    :undoc-members:
 
-.. vim: tw=75:spell
+.. vim: tw=75:spell:fdm=marker
diff --git a/doc/ref_other.rst b/doc/ref_other.rst
new file mode 100644
index 000000000..71d6c54b1
--- /dev/null
+++ b/doc/ref_other.rst
@@ -0,0 +1,46 @@
+Reference: Other Functionality
+==============================
+
+Obtaining Kernel Performance Statistics
+---------------------------------------
+
+.. automodule:: loopy.statistics
+
+Controlling caching
+-------------------
+
+.. autofunction:: set_caching_enabled
+
+.. autoclass:: CacheMode
+
+Running Kernels
+---------------
+
+In addition to simply calling kernels using :meth:`LoopKernel.__call__`,
+the following underlying functionality may be used:
+
+.. autoclass:: CompiledKernel
+
+Automatic Testing
+-----------------
+
+.. autofunction:: auto_test_vs_ref
+
+Troubleshooting
+---------------
+
+Printing :class:`LoopKernel` objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you're confused about things loopy is referring to in an error message or
+about the current state of the :class:`LoopKernel` you are transforming, the
+following always works::
+
+    print(kernel)
+
+(And it yields a human-readable--albeit terse--representation of *kernel*.)
+
+.. autofunction:: get_dot_dependency_graph
+
+.. autofunction:: show_dependency_graph
+
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
new file mode 100644
index 000000000..4c3c24873
--- /dev/null
+++ b/doc/ref_transform.rst
@@ -0,0 +1,135 @@
+.. _reference-transform:
+
+Reference: Transforming Kernels
+===============================
+
+Dealing with Parameters
+-----------------------
+
+.. automodule:: loopy.transform.parameter
+
+Wrangling inames
+----------------
+
+.. automodule:: loopy.transform.iname
+
+Dealing with Substitution Rules
+-------------------------------
+
+.. currentmodule:: loopy
+
+.. autofunction:: extract_subst
+
+.. autofunction:: assignment_to_subst
+
+.. autofunction:: expand_subst
+
+.. autofunction:: find_rules_matching
+
+.. autofunction:: find_one_rule_matching
+
+Caching, Precomputation and Prefetching
+---------------------------------------
+
+.. autofunction:: precompute
+
+.. autofunction:: add_prefetch
+
+.. autofunction:: buffer_array
+
+.. autofunction:: alias_temporaries
+
+Influencing data access
+-----------------------
+
+.. autofunction:: change_arg_to_image
+
+.. autofunction:: tag_data_axes
+
+.. autofunction:: remove_unused_arguments
+
+.. autofunction:: set_array_dim_names
+
+Padding Data
+------------
+
+.. autofunction:: split_array_dim
+
+.. autofunction:: find_padding_multiple
+
+.. autofunction:: add_padding
+
+Manipulating Instructions
+-------------------------
+
+.. autofunction:: set_instruction_priority
+
+.. autofunction:: add_dependency
+
+.. autofunction:: remove_instructions
+
+.. autofunction:: tag_instructions
+
+Registering Library Routines
+----------------------------
+
+.. autofunction:: register_reduction_parser
+
+.. autofunction:: register_preamble_generators
+
+.. autofunction:: register_symbol_manglers
+
+.. autofunction:: register_function_manglers
+
+Modifying Arguments
+-------------------
+
+.. autofunction:: set_argument_order
+
+.. autofunction:: add_dtypes
+
+.. autofunction:: infer_unknown_types
+
+.. autofunction:: add_and_infer_dtypes
+
+.. autofunction:: rename_argument
+
+Creating Batches of Operations
+------------------------------
+
+.. automodule:: loopy.transform.batch
+
+Finishing up
+------------
+
+.. currentmodule:: loopy
+
+.. autofunction:: preprocess_kernel
+
+.. autofunction:: generate_loop_schedules
+
+.. autofunction:: get_one_scheduled_kernel
+
+.. autofunction:: generate_code
+
+Setting options
+---------------
+
+.. autofunction:: set_options
+
+.. _context-matching:
+
+Matching contexts
+-----------------
+
+TODO: Matching instruction tags
+
+.. automodule:: loopy.context_matching
+
+.. autofunction:: parse_match
+
+.. autofunction:: parse_stack_match
+
+
+.. vim: tw=75:spell
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 24ec0ce4f..4421fd80f 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1186,8 +1186,12 @@ across the remaining axis of the workgroup would emerge.
 
 TODO
 
-Obtaining Kernel Statistics
----------------------------
+.. }}}
+
+Obtaining Performance Statistics
+--------------------------------
+
+.. {{{
 
 Operations, array access, and barriers can all be counted, which may facilitate
 performance prediction and optimization of a :mod:`loopy` kernel.
diff --git a/loopy/__init__.py b/loopy/__init__.py
index bf0a2be1b..c71a03fec 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -38,11 +38,13 @@ from loopy.library.function import (
 
 from loopy.kernel.data import (
         auto,
+        KernelArgument,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
         InstructionBase, Assignment, ExpressionInstruction, CInstruction,
-        TemporaryVariable)
+        TemporaryVariable,
+        SubstitutionRule)
 
-from loopy.kernel import LoopKernel
+from loopy.kernel import LoopKernel, kernel_state
 from loopy.kernel.tools import (
         get_dot_dependency_graph,
         show_dependency_graph,
@@ -54,9 +56,10 @@ from loopy.library.reduction import register_reduction_parser
 # {{{ import transforms
 
 from loopy.transform.iname import (
-        assume, set_loop_priority,
+        set_loop_priority,
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
         rename_iname, link_inames, remove_unused_inames,
+        split_reduction_inward, split_reduction_outward,
         affine_map_inames, find_unused_axis_tag)
 
 from loopy.transform.instruction import (
@@ -79,8 +82,7 @@ from loopy.transform.buffer import buffer_array
 from loopy.transform.fusion import fuse_kernels
 
 from loopy.transform.arithmetic import (
-        split_reduction_inward,
-        split_reduction_outward, fold_constants,
+        fold_constants,
         collect_common_factors_on_increment)
 
 from loopy.transform.padding import (
@@ -89,7 +91,7 @@ from loopy.transform.padding import (
 
 from loopy.transform.ilp import realize_ilp
 from loopy.transform.batch import to_batched
-from loopy.transform.parameter import fix_parameters
+from loopy.transform.parameter import assume, fix_parameters
 
 # }}}
 
@@ -107,15 +109,24 @@ from loopy.auto_test import auto_test_vs_ref
 from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
         parse_fortran)
 
+from loopy.target import TargetBase
+from loopy.target.c import CTarget
+from loopy.target.cuda import CudaTarget
+from loopy.target.opencl import OpenCLTarget
+from loopy.target.pyopencl import PyOpenCLTarget
+from loopy.target.ispc import ISPCTarget
+
 __all__ = [
         "TaggedVariable", "Reduction", "LinearSubscript",
 
         "auto",
 
-        "LoopKernel",
+        "LoopKernel", "kernel_state",
 
+        "KernelArgument",
         "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg",
         "TemporaryVariable",
+        "SubstitutionRule",
 
         "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction",
 
@@ -127,10 +138,11 @@ __all__ = [
 
         # {{{ transforms
 
-        "assume", "set_loop_priority",
+        "set_loop_priority",
         "split_iname", "chunk_iname", "join_inames", "tag_inames",
         "duplicate_inames",
         "rename_iname", "link_inames", "remove_unused_inames",
+        "split_reduction_inward", "split_reduction_outward",
         "affine_map_inames", "find_unused_axis_tag",
 
         "add_prefetch", "change_arg_to_image", "tag_data_axes",
@@ -148,7 +160,6 @@ __all__ = [
         "precompute", "buffer_array",
         "fuse_kernels",
 
-        "split_reduction_inward", "split_reduction_outward",
         "fold_constants", "collect_common_factors_on_increment",
 
         "split_array_dim", "split_arg_axis", "find_padding_multiple",
@@ -158,7 +169,7 @@ __all__ = [
 
         "to_batched",
 
-        "fix_parameters",
+        "assume", "fix_parameters",
 
         # }}}
 
@@ -186,6 +197,9 @@ __all__ = [
 
         "LoopyError", "LoopyWarning",
 
+        "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget",
+        "PyOpenCLTarget", "ISPCTarget",
+
         # {{{ from this file
 
         "register_preamble_generators",
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 6d4396b5a..32536021d 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -791,8 +791,16 @@ class _CLKernelInfo(Record):
 
 
 class CompiledKernel:
+    """An object connecting a kernel to a :class:`pyopencl.Context`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
     def __init__(self, context, kernel):
         """
+        :arg context: a :class:`pyopencl.Context`
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
             kernel has not yet been loop-scheduled, that is done, too, with no
@@ -908,10 +916,12 @@ class CompiledKernel:
 
     def __call__(self, queue, **kwargs):
         """
-        :arg allocator:
-        :arg wait_for:
-        :arg out_host:
-
+        :arg allocator: a callable passed a byte count and returning
+            a :class:`pyopencl.Buffer`, e.g. one of the allocators
+            provided by :mod:`pyopencl`.
+        :arg wait_for: A list of :class:`pyopencl.Event` instances
+            for which to wait.
+        :arg out_host: :class:`bool`
             Decides whether output arguments (i.e. arguments
             written by the kernel) are to be returned as
             :mod:`numpy` arrays. *True* for yes, *False* for no.
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index f2bbb2882..b3a7c2133 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -132,7 +132,7 @@ def _extract_loopy_lines(source):
 
 def parse_transformed_fortran(source, free_form=True, strict=True,
         pre_transform_code=None, transform_code_context=None,
-        filename="<floopy code>"):
+        filename="<floopy code>", target=None):
     """
     :arg source: a string of Fortran source code which must include
         a snippet of transform code as described below.
@@ -257,7 +257,8 @@ def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True,
                 "and returned invalid data (Sorry!)")
 
     from loopy.frontend.fortran.translator import F2LoopyTranslator
-    f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies)
+    f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies,
+            target=None)
     f2loopy(tree)
 
     return f2loopy.make_kernels()
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 53ea602f1..68dd5fa95 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -198,10 +198,11 @@ class Scope(object):
 # {{{ translator
 
 class F2LoopyTranslator(FTreeWalkerBase):
-    def __init__(self, filename, auto_dependencies):
+    def __init__(self, filename, auto_dependencies, target=None):
         FTreeWalkerBase.__init__(self)
 
         self.auto_dependencies = auto_dependencies
+        self.target = target
 
         self.scope_stack = []
 
@@ -679,6 +680,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
                     name=sub.subprogram_name,
                     default_order="F",
                     index_dtype=self.index_dtype,
+                    target=self.target,
                     )
 
             from loopy.loop import fuse_loop_domains
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 42f83bfdf..91c22dbeb 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -90,11 +90,16 @@ class LoopKernel(RecordWithoutPickling):
     .. attribute:: domains
 
         a list of :class:`islpy.BasicSet` instances
+        representing the :ref:`domain-tree`.
 
     .. attribute:: instructions
+
+        A list of :class:`InstructionBase` instances, e.g.
+        :class:`Assignment`. See :ref:`instructions`.
+
     .. attribute:: args
 
-        A list of :class:`loopy.kernel.data.KernelArgument`
+        A list of :class:`loopy.KernelArgument`
 
     .. attribute:: schedule
 
@@ -108,7 +113,7 @@ class LoopKernel(RecordWithoutPickling):
     .. attribute:: temporary_variables
 
         A :class:`dict` of mapping variable names to
-        :class:`loopy.kernel.data.TemporaryVariable`
+        :class:`loopy.TemporaryVariable`
         instances.
 
     .. attribute:: iname_to_tag
@@ -155,7 +160,7 @@ class LoopKernel(RecordWithoutPickling):
 
     .. attribute:: target
 
-        A subclass of :class:`loopy.target.TargetBase`.
+        A subclass of :class:`loopy.TargetBase`.
     """
 
     # {{{ constructor
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f7c478dff..6a5d523e6 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1011,8 +1011,22 @@ def resolve_wildcard_deps(knl):
 def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     """User-facing kernel creation entrypoint.
 
-    :arg domains: :class:`islpy.BasicSet`
+    :arg domains:
+
+        A list of :class:`islpy.BasicSet` (i.e. convex set) instances
+        representing the :ref:`domain-tree`. May also be a list of strings
+        which will be parsed into such instances according to :ref:`isl-syntax`.
+
     :arg instructions:
+
+        A list of :class:`Assignment` (or other :class:`InstructionBase`
+        subclasses), possibly intermixed with instances of
+        :class:`SubstitutionRule`. This same list may also contain strings
+        which will be parsed into such objects using the
+        :ref:`assignment-syntax` and the :ref:`subst-rule-syntax`.  May also be
+        a single multi-line string which will be split into lines and then
+        parsed.
+
     :arg kernel_data:
 
         A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances.
@@ -1054,7 +1068,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         strides. They are expanded only upon kernel creation.
     :arg default_order: "C" (default) or "F"
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
-        *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments.
+        *offset* in :class:`GlobalArg` for guessed arguments.
         Defaults to 0.
     :arg function_manglers: list of functions of signature (name, arg_dtypes)
         returning a tuple (result_dtype, c_name)
@@ -1074,8 +1088,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         to silence
     :arg options: an instance of :class:`loopy.Options` or an equivalent
         string representation
-    :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*,
-        to use the default target. (see :func:`loopy.set_default_target`)
+    :arg target: an instance of :class:`loopy.TargetBase`, or *None*,
+        to use the default target.
     """
 
     defines = kwargs.pop("defines", {})
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 9d0848788..5b0cf57e5 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -185,6 +185,8 @@ def parse_tag(tag):
 # {{{ arguments
 
 class KernelArgument(Record):
+    """Base class for all argument types"""
+
     def __init__(self, **kwargs):
         kwargs["name"] = intern(kwargs.pop("name"))
 
@@ -418,7 +420,9 @@ class SubstitutionRule(Record):
 # {{{ base class
 
 class InstructionBase(Record):
-    """
+    """A base class for all types of instruction that can occur in
+    a kernel.
+
     .. attribute:: id
 
         An (otherwise meaningless) identifier that is unique within
@@ -427,7 +431,7 @@ class InstructionBase(Record):
     .. attribute:: depends_on
 
         a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
-         that *must* be executed before this one. Note that
+        that *must* be executed before this one. Note that
         :func:`loopy.preprocess_kernel` (usually invoked automatically)
         augments this by adding dependencies on any writes to temporaries read
         by this instruction.
@@ -493,6 +497,14 @@ class InstructionBase(Record):
 
         A tuple of string identifiers that can be used to identify groups
         of instructions.
+
+    .. automethod:: __init__
+    .. automethod:: assignees_and_indices
+    .. automethod:: with_transformed_expressions
+    .. automethod:: write_dependency_names
+    .. automethod:: dependency_names
+    .. automethod:: assignee_var_names
+    .. automethod:: copy
     """
 
     fields = set("id depends_on depends_on_is_final "
@@ -568,10 +580,12 @@ class InstructionBase(Record):
                 predicates=predicates,
                 tags=tags)
 
+    # legacy alias for depends_on
     @property
     def insn_deps(self):
         return self.depends_on
 
+    # legacy alias for depends_on_is_final
     @property
     def insn_deps_is_final(self):
         return self.depends_on_is_final
@@ -740,6 +754,8 @@ class Assignment(InstructionBase):
 
         if not *None*, a type that will be assigned to the new temporary variable
         created from the assignee
+
+    .. automethod:: __init__
     """
 
     fields = InstructionBase.fields | \
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 40bf6da2b..f0a01463a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -35,6 +35,20 @@ from loopy.kernel.data import Assignment
 from loopy.diagnostic import warn
 
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autofunction:: get_op_poly
+
+.. autofunction:: get_gmem_access_poly
+
+.. autofunction:: sum_mem_access_to_bytes
+
+.. autofunction:: get_barrier_poly
+
+"""
+
 # {{{ ToCountMap
 
 class ToCountMap:
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index ee28594a5..85e58a809 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -24,9 +24,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autoclass:: TargetBase
+.. autoclass:: CTarget
+.. autoclass:: CudaTarget
+.. autoclass:: OpenCLTarget
+.. autoclass:: PyOpenCLTarget
+.. autoclass:: ISPCTarget
+
+"""
+
 
 class TargetBase(object):
-    """Objects of this type must be picklable."""
+    """Base class for all targets, i.e. different types of code that
+    loopy can generate.
+
+    Objects of this type must be picklable.
+    """
 
     # {{{ persistent hashing
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 73310c4db..5d6a856d1 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -56,6 +56,9 @@ def _preamble_generator(kernel, seen_dtypes, seen_functions):
 
 
 class CTarget(TargetBase):
+    """A target for plain "C", without any parallel extensions.
+    """
+
     hash_fields = TargetBase.hash_fields + ("fortran_abi",)
     comparison_fields = TargetBase.comparison_fields + ("fortran_abi",)
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 94a144cb9..55f8da4d6 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -162,6 +162,8 @@ class LoopyCudaCCodeMapper(LoopyCCodeMapper):
 # {{{ target
 
 class CudaTarget(CTarget):
+    """A target for Nvidia's CUDA GPU programming language."""
+
     def __init__(self, extern_c=True):
         """
         :arg extern_c: If *True*, declare kernels using "extern C" to
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 737ee0d99..2c751e103 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -55,6 +55,11 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
 
 
 class ISPCTarget(CTarget):
+    """A code generation target for Intel's `ISPC <https://ispc.github.io/>`_
+    SPMD programming language, aimed at Intel's Knights-family hardware and
+    modern Intel CPUs with wide vector units.
+    """
+
     def __init__(self, occa_mode=False):
         """
         :arg occa_mode: Whether to modify the generated call signature to
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index a009e9336..cfdc8620b 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -189,6 +189,9 @@ class LoopyOpenCLCCodeMapper(LoopyCCodeMapper):
 # {{{ target
 
 class OpenCLTarget(CTarget):
+    """A target for the OpenCL C heterogeneous compute programming language.
+    """
+
     # {{{ library
 
     def function_manglers(self):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index d13384534..3c9e8aac7 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -244,8 +244,17 @@ class _LegacyTypeRegistryStub(object):
         from pyopencl.compyte.dtypes import dtype_to_ctype
         return dtype_to_ctype(dtype)
 
+# }}}
+
+
+# {{{ target
 
 class PyOpenCLTarget(OpenCLTarget):
+    """A code generation target that takes special advantage of :mod:`pyopencl`
+    features such as run-time knowledge of the target device (to generate
+    warnings) and support for complex numbers.
+    """
+
     def __init__(self, device=None):
         super(PyOpenCLTarget, self).__init__()
 
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index a68300929..d41222c26 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -25,108 +25,9 @@ THE SOFTWARE.
 
 import six
 
-from loopy.symbolic import (RuleAwareIdentityMapper,
-        SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
 
 
-# {{{ split_reduction
-
-class _ReductionSplitter(RuleAwareIdentityMapper):
-    def __init__(self, rule_mapping_context, within, inames, direction):
-        super(_ReductionSplitter, self).__init__(
-                rule_mapping_context)
-
-        self.within = within
-        self.inames = inames
-        self.direction = direction
-
-    def map_reduction(self, expr, expn_state):
-        if set(expr.inames) & set(expn_state.arg_context):
-            # FIXME
-            raise NotImplementedError()
-
-        if (self.inames <= set(expr.inames)
-                and self.within(
-                    expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
-            leftover_inames = set(expr.inames) - self.inames
-
-            from loopy.symbolic import Reduction
-            if self.direction == "in":
-                return Reduction(expr.operation, tuple(leftover_inames),
-                        Reduction(expr.operation, tuple(self.inames),
-                            self.rec(expr.expr, expn_state)))
-            elif self.direction == "out":
-                return Reduction(expr.operation, tuple(self.inames),
-                        Reduction(expr.operation, tuple(leftover_inames),
-                            self.rec(expr.expr, expn_state)))
-            else:
-                assert False
-        else:
-            return super(_ReductionSplitter, self).map_reduction(expr, expn_state)
-
-
-def _split_reduction(kernel, inames, direction, within=None):
-    if direction not in ["in", "out"]:
-        raise ValueError("invalid value for 'direction': %s" % direction)
-
-    if isinstance(inames, str):
-        inames = inames.split(",")
-    inames = set(inames)
-
-    from loopy.context_matching import parse_stack_match
-    within = parse_stack_match(within)
-
-    rule_mapping_context = SubstitutionRuleMappingContext(
-            kernel.substitutions, kernel.get_var_name_generator())
-    rsplit = _ReductionSplitter(rule_mapping_context,
-            within, inames, direction)
-    return rule_mapping_context.finish_kernel(
-            rsplit.map_kernel(kernel))
-
-
-def split_reduction_inward(kernel, inames, within=None):
-    """Takes a reduction of the form::
-
-        sum([i,j,k], ...)
-
-    and splits it into two nested reductions::
-
-        sum([j,k], sum([i], ...))
-
-    In this case, *inames* would have been ``"i"`` indicating that
-    the iname ``i`` should be made the iname governing the inner reduction.
-
-    :arg inames: A list of inames, or a comma-separated string that can
-        be parsed into those
-    """
-
-    return _split_reduction(kernel, inames, "in", within)
-
-
-def split_reduction_outward(kernel, inames, within=None):
-    """Takes a reduction of the form::
-
-        sum([i,j,k], ...)
-
-    and splits it into two nested reductions::
-
-        sum([i], sum([j,k], ...))
-
-    In this case, *inames* would have been ``"i"`` indicating that
-    the iname ``i`` should be made the iname governing the outer reduction.
-
-    :arg inames: A list of inames, or a comma-separated string that can
-        be parsed into those
-    """
-
-    return _split_reduction(kernel, inames, "out", within)
-
-# }}}
-
-
 # {{{ fold constants
 
 def fold_constants(kernel):
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 59239104e..1dc54f94b 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -29,6 +29,10 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont
 from loopy.kernel.data import ValueArg, GlobalArg
 import islpy as isl
 
+__doc__ = """
+.. autofunction:: to_batched
+"""
+
 
 # {{{ to_batched
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 8cf16bfd3..728934915 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -35,25 +35,38 @@ from loopy.symbolic import (
 from loopy.diagnostic import LoopyError
 
 
-# {{{ assume
+__doc__ = """
+.. currentmodule:: loopy
 
-def assume(kernel, assumptions):
-    if isinstance(assumptions, str):
-        assumptions_set_str = "[%s] -> { : %s}" \
-                % (",".join(s for s in kernel.outer_params()),
-                    assumptions)
-        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
-                assumptions_set_str)
+.. autofunction:: split_iname
 
-    if not isinstance(assumptions, isl.BasicSet):
-        raise TypeError("'assumptions' must be a BasicSet or a string")
+.. autofunction:: chunk_iname
 
-    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)
+.. autofunction:: join_inames
 
-    return kernel.copy(
-            assumptions=old_assumptions.params() & new_assumptions.params())
+.. autofunction:: tag_inames
 
-# }}}
+.. autofunction:: duplicate_inames
+
+.. undocumented .. autofunction:: link_inames
+
+.. autofunction:: rename_iname
+
+.. autofunction:: remove_unused_inames
+
+.. autofunction:: set_loop_priority
+
+.. autofunction:: split_reduction_inward
+
+.. autofunction:: split_reduction_outward
+
+.. autofunction:: affine_map_inames
+
+.. autofunction:: realize_ilp
+
+.. autofunction:: find_unused_axis_tag
+
+"""
 
 
 # {{{ set loop priority
@@ -1021,6 +1034,103 @@ def remove_unused_inames(knl, inames=None):
 # }}}
 
 
+# {{{ split_reduction
+
+class _ReductionSplitter(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, within, inames, direction):
+        super(_ReductionSplitter, self).__init__(
+                rule_mapping_context)
+
+        self.within = within
+        self.inames = inames
+        self.direction = direction
+
+    def map_reduction(self, expr, expn_state):
+        if set(expr.inames) & set(expn_state.arg_context):
+            # FIXME
+            raise NotImplementedError()
+
+        if (self.inames <= set(expr.inames)
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
+            leftover_inames = set(expr.inames) - self.inames
+
+            from loopy.symbolic import Reduction
+            if self.direction == "in":
+                return Reduction(expr.operation, tuple(leftover_inames),
+                        Reduction(expr.operation, tuple(self.inames),
+                            self.rec(expr.expr, expn_state)))
+            elif self.direction == "out":
+                return Reduction(expr.operation, tuple(self.inames),
+                        Reduction(expr.operation, tuple(leftover_inames),
+                            self.rec(expr.expr, expn_state)))
+            else:
+                assert False
+        else:
+            return super(_ReductionSplitter, self).map_reduction(expr, expn_state)
+
+
+def _split_reduction(kernel, inames, direction, within=None):
+    if direction not in ["in", "out"]:
+        raise ValueError("invalid value for 'direction': %s" % direction)
+
+    if isinstance(inames, str):
+        inames = inames.split(",")
+    inames = set(inames)
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    rsplit = _ReductionSplitter(rule_mapping_context,
+            within, inames, direction)
+    return rule_mapping_context.finish_kernel(
+            rsplit.map_kernel(kernel))
+
+
+def split_reduction_inward(kernel, inames, within=None):
+    """Takes a reduction of the form::
+
+        sum([i,j,k], ...)
+
+    and splits it into two nested reductions::
+
+        sum([j,k], sum([i], ...))
+
+    In this case, *inames* would have been ``"i"`` indicating that
+    the iname ``i`` should be made the iname governing the inner reduction.
+
+    :arg inames: A list of inames, or a comma-separated string that can
+        be parsed into those
+    """
+
+    return _split_reduction(kernel, inames, "in", within)
+
+
+def split_reduction_outward(kernel, inames, within=None):
+    """Takes a reduction of the form::
+
+        sum([i,j,k], ...)
+
+    and splits it into two nested reductions::
+
+        sum([i], sum([j,k], ...))
+
+    In this case, *inames* would have been ``"i"`` indicating that
+    the iname ``i`` should be made the iname governing the outer reduction.
+
+    :arg inames: A list of inames, or a comma-separated string that can
+        be parsed into those
+    """
+
+    return _split_reduction(kernel, inames, "out", within)
+
+# }}}
+
+
 # {{{ affine map inames
 
 def affine_map_inames(kernel, old_inames, new_inames, equations):
diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py
index 1567263cd..f7600b212 100644
--- a/loopy/transform/parameter.py
+++ b/loopy/transform/parameter.py
@@ -28,6 +28,42 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext)
 import islpy as isl
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autofunction:: fix_parameters
+
+.. autofunction:: assume
+"""
+
+
+# {{{ assume
+
+def assume(kernel, assumptions):
+    """Include an assumption about :ref:`domain-parameters` in the kernel, e.g.
+    ``n mod 4 = 0``.
+
+    :arg assumptions: a :class:`islpy.BasicSet` or a string representation of
+        the assumptions in :ref:`isl-syntax`.
+    """
+    if isinstance(assumptions, str):
+        assumptions_set_str = "[%s] -> { : %s}" \
+                % (",".join(s for s in kernel.outer_params()),
+                    assumptions)
+        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
+                assumptions_set_str)
+
+    if not isinstance(assumptions, isl.BasicSet):
+        raise TypeError("'assumptions' must be a BasicSet or a string")
+
+    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)
+
+    return kernel.copy(
+            assumptions=old_assumptions.params() & new_assumptions.params())
+
+# }}}
+
 
 # {{{ fix_parameter
 
@@ -99,6 +135,13 @@ def _fix_parameter(kernel, name, value):
 
 
 def fix_parameters(kernel, **value_dict):
+    """Fix the values of the arguments to specific constants.
+
+    *value_dict* consists of *name*/*value* pairs, where *name* will be fixed
+    to be *value*. *name* may refer to :ref:`domain-parameters` or
+    :ref:`arguments`.
+    """
+
     for name, value in six.iteritems(value_dict):
         kernel = _fix_parameter(kernel, name, value)
 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 7623fb891..9ce1f9c54 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -289,7 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper):
 def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
         force_retain_argument=False):
     """Extract an assignment (to a temporary variable or an argument)
-    as a :ref:`substituiton-rule`. The temporary may be an array, in
+    as a :ref:`substitution-rule`. The temporary may be an array, in
     which case the array indices will become arguments to the substitution
     rule.
 
-- 
GitLab