diff --git a/doc/conf.py b/doc/conf.py
index 301604607ef9543b559aa7cfdaf875e9140f1299..74e1aec303972378a28dea0b4186720d6a61019e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -46,7 +46,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'loopy'
-copyright = u'2011, Andreas Klöckner'
+copyright = u'2016, Andreas Klöckner'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -54,7 +54,10 @@ copyright = u'2011, Andreas Klöckner'
 #
 # The short X.Y version.
 ver_dic = {}
-exec(compile(open("../loopy/version.py").read(), "../loopy/version.py", 'exec'), ver_dic)
+with open("../loopy/version.py") as vpy_file:
+    version_py = vpy_file.read()
+
+exec(compile(version_py, "../loopy/version.py", 'exec'), ver_dic)
 version = ".".join(str(x) for x in ver_dic["VERSION"])
 # The full version, including alpha/beta/rc tags.
 release = ver_dic["VERSION_TEXT"]
@@ -249,4 +252,4 @@ intersphinx_mapping = {
     'http://docs.scipy.org/doc/numpy/': None,
     }
 
-autoclass_content = "both"
+autoclass_content = "class"
diff --git a/doc/index.rst b/doc/index.rst
index 19bbe87727e8d3ffa5aa0cb1c31e4e206bd97b3e..a0bad2898be4aab74dead90aae825e4e0a460c87 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -63,7 +63,10 @@ Please check :ref:`installation` to get started.
     :maxdepth: 2
 
     tutorial
-    reference
+    ref_creation
+    ref_kernel
+    ref_transform
+    ref_other
     misc
 
 Indices and tables
diff --git a/doc/misc.rst b/doc/misc.rst
index 2f8fac090455eea1be87c3b4eae7bfd72eba24ee..89279d9744d2f1cf4f080618d0d2b2e0f078a723 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -5,7 +5,9 @@ Installation
 
 This command should install :mod:`loopy`::
 
-    pip install loopy
+    pip install loo.py
+
+(Note the extra "."!)
 
 You may need to run this with :command:`sudo`.
 If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..92eff09c9e3ecacfd8bb9030a9e4b9f002fefc71
--- /dev/null
+++ b/doc/ref_creation.rst
@@ -0,0 +1,33 @@
+.. module:: loopy
+.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
+
+.. _creating-kernels:
+
+Reference: Creating Kernels
+===========================
+
+From Loop Domains and Instructions
+----------------------------------
+
+.. autofunction:: make_kernel
+
+From Fortran
+------------
+
+.. autofunction:: parse_fortran
+
+.. autofunction:: parse_transformed_fortran
+
+.. autofunction:: c_preprocess
+
+From Other Kernels
+------------------
+
+.. autofunction:: fuse_kernels
+
+To Copy between Data Formats
+----------------------------
+
+.. autofunction:: make_copy_kernel
+
+.. vim: tw=75:spell:fdm=marker
diff --git a/doc/reference.rst b/doc/ref_kernel.rst
similarity index 71%
rename from doc/reference.rst
rename to doc/ref_kernel.rst
index 351a2374a1245bf58296f9d2ee50551050c15f76..33d40385b529f72e54da65238304e87bdb2cddab 100644
--- a/doc/reference.rst
+++ b/doc/ref_kernel.rst
@@ -1,21 +1,39 @@
-.. _reference:
+.. currentmodule:: loopy
 
-Reference Guide
-===============
+Reference: Loopy's Model of a Kernel
+====================================
 
-.. module:: loopy
-.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
+.. _domain-tree:
 
-This guide defines all functionality exposed by loopy. If you would like
-a more gentle introduction, you may consider reading the example-based
-:ref:`tutorial` instead.
+Loop Domain Tree
+----------------
 
-.. _inames:
+.. {{{
+
+Example::
+
+    { [i]: 0<=i<n }
+
+A kernel's iteration domain is given by a list of :class:`islpy.BasicSet`
+instances (which parametrically represent multi-dimensional sets of
+tuples of integers).  They define the integer values of the loop variables
+for which instructions (see below) will be executed.
+It is written in :ref:`isl-syntax`.  :mod:`loopy` calls the loop variables
+*inames*. In this case, *i* is the sole iname. The loop
+domain is given as a conjunction of affine equality
+and inequality constraints. Integer divisibility constraints (resulting
+in strides) are also allowed. In the absence of divisibility
+constraints, the loop domain is convex.
 
-Domain Tree
------------
+Note that *n* in the example is not an iname. It is a
+:ref:`domain parameter <domain-parameters>` that is passed to the kernel by the user.
 
+To accommodate some data-dependent control flow, there is not actually
+a single loop domain, but rather a *tree of loop domains*,
+allowing more deeply nested domains to depend on inames
+introduced by domains closer to the root.
 
+.. _inames:
 
 Inames
 ^^^^^^
@@ -25,161 +43,10 @@ dependency semantics--otherwise e.g. a fetch could happen inside one loop nest,
 and then the instruction using that fetch could be inside a wholly different
 loop nest.
 
-Instructions
-------------
-
-Expressions
-^^^^^^^^^^^
-
-Loopy's expressions are a slight superset of the expressions supported by
-:mod:`pymbolic`.
-
-* `if`
-* `reductions`
-    * duplication of reduction inames
-* complex-valued arithmetic
-* tagging of array access and substitution rule use ("$")
-* ``indexof``, ``indexof_vec``
-
-.. _types:
-
-Specifying Types
-----------------
-
-:mod:`loopy` uses the same type system as :mod:`numpy`. (See
-:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of
-user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype`
-and related functions.
-
-For a string representation of types, all numpy types (e.g. ``float32`` etc.)
-are accepted, in addition to what is registered in :mod:`pyopencl`.
-
-.. _iname-tags:
-
-Iname Implementation Tags
--------------------------
-
-=============================== ====================================================
-Tag                             Meaning
-=============================== ====================================================
-``None`` | ``"for"``            Sequential loop
-``"l.N"``                       Local (intra-group) axis N ("local")
-``"g.N"``                       Group-number axis N ("group")
-``"unr"``                       Unroll
-``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
-``"ilp.seq"``                   Realize parallel iname as innermost loop
-``"like.INAME"``                Can be used when tagging inames to tag like another
-``"unused.g"`` | ``"unused.l"`` Can be to tag as the next unused group/local axis
-=============================== ====================================================
-
-(Throughout this table, `N` must be replaced by an actual, zero-based number.)
-
-"ILP" does three things:
-
-* Restricts loops to be innermost
-* Duplicates reduction storage for any reductions nested around ILP usage
-* Causes a loop (unrolled or not) to be opened/generated for each
-  involved instruction
-
-.. _data-dim-tags:
-
-Data Axis Tags
---------------
-
-Data axis tags specify how a multi-dimensional array (which is loopy's
-main way of storing data) is represented in (linear, 1D) computer
-memory. This storage format is given as a number of "tags", as listed
-in the table below. Each axis of an array has a tag corresponding to it.
-In the user interface, array dim tags are specified as a tuple of these
-tags or a comma-separated string containing them, such as the following::
-
-    c,vec,sep,c
-
-The interpretation of these tags is order-dependent, they are read
-from left to right.
-
-===================== ====================================================
-Tag                   Meaning
-===================== ====================================================
-``c``                 Nest current axis around the ones that follow
-``f``                 Nest current axis inside the ones that follow
-``N0`` ... ``N9``     Specify an explicit nesting level for this axis
-``stride:EXPR``       A fixed stride
-``sep``               Implement this axis by mapping to separate arrays
-``vec``               Implement this axis as entries in a vector
-===================== ====================================================
-
-``sep`` and ``vec`` obviously require the number of entries
-in the array along their respective axis to be known at code
-generation time.
-
-When the above speaks about 'nesting levels', this means that axes
-"nested inside" others are "faster-moving" when viewed from linear
-memory.
-
-In addition, each tag may be followed by a question mark (``?``),
-which indicates that if there are more dimension tags specified
-than array axes present, that this axis should be omitted. Axes
-with question marks are omitted in a left-first manner until the correct
-number of dimension tags is achieved.
-
-Some examples follow, all of which use a three-dimensional array of shape
-*(3, M, 4)*. For simplicity, we assume that array entries have size one.
-
-*   ``c,c,c``: The axes will have strides *(M*4, 4, 1)*,
-    leading to a C-like / row-major layout.
-
-*   ``f,f,f``: The axes will have strides *(1, 3, 3*M)*,
-    leading to a Fortran-like / row-major layout.
-
-*   ``sep,c,c``: The array will be mapped to three arrays of
-    shape *(M, 4)*, each with strides *(4, 1)*.
-
-*   ``c,c,vec``: The array will be mapped to an array of
-    ``float4`` vectors, with (``float4``-based) strides of
-    *(M, 1)*.
-
-*   ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*.
-
-.. _creating-kernels:
-
-Creating Kernels
-----------------
-
-.. autoclass:: auto
-
-.. _arguments:
-
-Arguments
-^^^^^^^^^
-
-.. autoclass:: ValueArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: GlobalArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: ConstantArg
-    :members:
-    :undoc-members:
-
-.. autoclass:: ImageArg
-    :members:
-    :undoc-members:
-
-.. _temporaries:
-
-Loop domains
-^^^^^^^^^^^^
-
-TODO: Explain the domain tree
-
 .. _isl-syntax:
 
 ISL syntax
-~~~~~~~~~~
+^^^^^^^^^^
 
 The general syntax of an ISL set is the following::
 
@@ -217,27 +84,73 @@ Examples of constructs that are **not** allowed:
   (**Note:** This may be added in a future version of loopy.
   For now, loop domains have to be convex.)
 
-Temporary Variables
-^^^^^^^^^^^^^^^^^^^
+.. _domain-parameters:
 
-Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
-have the lifetime of a kernel invocation.
+Domain parameters
+^^^^^^^^^^^^^^^^^
 
-.. autoclass:: TemporaryVariable
-    :members:
-    :undoc-members:
+Domain parameters are identifiers being used in loop domains that are not
+*inames*, i.e. they do not define loop variables. In the following domain
+specification, *n* is a domain parameter::
+
+    {[i,j]: 0 <= i,j < n}
+
+Values of domain parameters arise from
+
+* being passed to the kernel as :ref:`arguments`
+
+* being assigned to :ref:`temporaries` to feed into domains
+  lower in the :ref:`domain-tree`.
+
+.. _iname-tags:
+
+Iname Implementation Tags
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============================== ====================================================
+Tag                             Meaning
+=============================== ====================================================
+``None`` | ``"for"``            Sequential loop
+``"l.N"``                       Local (intra-group) axis N ("local")
+``"g.N"``                       Group-number axis N ("group")
+``"unr"``                       Unroll
+``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
+``"ilp.seq"``                   Realize parallel iname as innermost loop
+``"like.INAME"``                Can be used when tagging inames to tag like another
+``"unused.g"`` | ``"unused.l"`` Can be used to tag as the next unused group/local axis
+=============================== ====================================================
+
+(Throughout this table, `N` must be replaced by an actual, zero-based number.)
+
+"ILP" does three things:
+
+* Restricts loops to be innermost
+* Duplicates reduction storage for any reductions nested around ILP usage
+* Causes a loop (unrolled or not) to be opened/generated for each
+  involved instruction
+
+.. }}}
+
+.. _instructions:
 
 Instructions
-^^^^^^^^^^^^
+------------
 
-.. autoclass:: UniqueName
+.. {{{
 
 .. autoclass:: InstructionBase
 
 .. _assignments:
 
-Assignments
-~~~~~~~~~~~
+Assignment objects
+^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: Assignment
+
+.. _assignment-syntax:
+
+Textual Assignment Syntax
+^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The general syntax of an instruction is a simple assignment::
 
@@ -349,254 +262,206 @@ These are usually key-value pairs. The following attributes are recognized:
   given instruction groups. See
   :class:`InstructionBase.conflicts_with_groups`.
 
-Assignment instructions are expressed as instances of the following class:
+.. _expression-syntax:
 
-.. autoclass:: ExpressionInstruction
+Expressions
+^^^^^^^^^^^
 
-.. _expression-syntax:
+Loopy's expressions are a slight superset of the expressions supported by
+:mod:`pymbolic`.
 
-Expression Syntax
-~~~~~~~~~~~~~~~~~
+* ``if``
+* ``reductions``
+    * duplication of reduction inames
+    * ``reduce`` vs ``simul_reduce``
+* complex-valued arithmetic
+* tagging of array access and substitution rule use ("$")
+* ``indexof``, ``indexof_vec``
 
 TODO: Functions
 TODO: Reductions
 
 C Block Instructions
-~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^
 
 .. autoclass:: CInstruction
 
-.. _substitution-rule:
-
-Substitution Rules
-^^^^^^^^^^^^^^^^^^
-
-Syntax of a substitution rule::
-
-    rule_name(arg1, arg2) := EXPRESSION
-
-Kernels
-^^^^^^^
-
-.. class:: LoopKernel
+.. }}}
 
-Do not create :class:`LoopKernel` objects directly. Instead, use the following
-function, which is responsible for creating kernels:
+Data: Arguments and Temporaries
+-------------------------------
 
-.. autofunction:: make_kernel
+.. {{{
 
-.. autofunction:: parse_fortran
+Kernels operate on two types of data: 'arguments' carrying data into and out of a kernel,
+and temporaries with lifetimes tied to the runtime of the kernel.
 
-.. autofunction:: parse_transformed_fortran
+.. _arguments:
 
-.. autofunction:: make_copy_kernel
+Arguments
+^^^^^^^^^
 
-.. autofunction:: fuse_kernels
+.. autoclass:: KernelArgument
+    :members:
+    :undoc-members:
 
-.. autofunction:: c_preprocess
+.. autoclass:: ValueArg
+    :members:
+    :undoc-members:
 
-Transforming Kernels
---------------------
+.. autoclass:: GlobalArg
+    :members:
+    :undoc-members:
 
-.. _context-matching:
+.. autoclass:: ConstantArg
+    :members:
+    :undoc-members:
 
-Matching contexts
-^^^^^^^^^^^^^^^^^
+.. autoclass:: ImageArg
+    :members:
+    :undoc-members:
 
-TODO: Matching instruction tags
+.. _temporaries:
 
-.. automodule:: loopy.context_matching
+Temporary Variables
+^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: parse_match
+Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both
+have the lifetime of a kernel invocation.
 
-.. autofunction:: parse_stack_match
+.. autoclass:: TemporaryVariable
+    :members:
+    :undoc-members:
 
-.. currentmodule:: loopy
+.. _types:
 
-Wrangling inames
+Specifying Types
 ^^^^^^^^^^^^^^^^
 
-.. autofunction:: split_iname
-
-.. autofunction:: chunk_iname
-
-.. autofunction:: join_inames
-
-.. autofunction:: tag_inames
-
-.. autofunction:: duplicate_inames
-
-.. undocumented .. autofunction:: link_inames
-
-.. autofunction:: rename_iname
-
-.. autofunction:: remove_unused_inames
-
-.. autofunction:: set_loop_priority
-
-.. autofunction:: split_reduction_inward
-
-.. autofunction:: split_reduction_outward
-
-.. autofunction:: affine_map_inames
-
-.. autofunction:: realize_ilp
-
-.. autofunction:: find_unused_axis_tag
-
-Dealing with Parameters
-^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autofunction:: fix_parameters
-
-.. autofunction:: assume
+:mod:`loopy` uses the same type system as :mod:`numpy`. (See
+:class:`numpy.dtype`) It also uses :mod:`pyopencl` for a registry of
+user-defined types and their C equivalents. See :func:`pyopencl.tools.get_or_register_dtype`
+and related functions.
 
-Dealing with Substitution Rules
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+For a string representation of types, all numpy types (e.g. ``float32`` etc.)
+are accepted, in addition to what is registered in :mod:`pyopencl`.
 
-.. autofunction:: extract_subst
+.. _data-dim-tags:
 
-.. autofunction:: assignment_to_subst
+Data Axis Tags
+^^^^^^^^^^^^^^
 
-.. autofunction:: expand_subst
+Data axis tags specify how a multi-dimensional array (which is loopy's
+main way of storing data) is represented in (linear, 1D) computer
+memory. This storage format is given as a number of "tags", as listed
+in the table below. Each axis of an array has a tag corresponding to it.
+In the user interface, array dim tags are specified as a tuple of these
+tags or a comma-separated string containing them, such as the following::
 
-.. autofunction:: find_rules_matching
+    c,vec,sep,c
 
-.. autofunction:: find_one_rule_matching
+The interpretation of these tags is order-dependent, they are read
+from left to right.
 
-Caching, Precomputation and Prefetching
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+===================== ====================================================
+Tag                   Meaning
+===================== ====================================================
+``c``                 Nest current axis around the ones that follow
+``f``                 Nest current axis inside the ones that follow
+``N0`` ... ``N9``     Specify an explicit nesting level for this axis
+``stride:EXPR``       A fixed stride
+``sep``               Implement this axis by mapping to separate arrays
+``vec``               Implement this axis as entries in a vector
+===================== ====================================================
 
-.. autofunction:: precompute
+``sep`` and ``vec`` obviously require the number of entries
+in the array along their respective axis to be known at code
+generation time.
 
-.. autofunction:: add_prefetch
+When the above speaks about 'nesting levels', this means that axes
+"nested inside" others are "faster-moving" when viewed from linear
+memory.
 
-.. autofunction:: buffer_array
+In addition, each tag may be followed by a question mark (``?``),
+which indicates that if there are more dimension tags specified
+than array axes present, that this axis should be omitted. Axes
+with question marks are omitted in a left-first manner until the correct
+number of dimension tags is achieved.
 
-.. autofunction:: alias_temporaries
+Some examples follow, all of which use a three-dimensional array of shape
+*(3, M, 4)*. For simplicity, we assume that array entries have size one.
 
-Influencing data access
-^^^^^^^^^^^^^^^^^^^^^^^
+*   ``c,c,c``: The axes will have strides *(M*4, 4, 1)*,
+    leading to a C-like / row-major layout.
 
-.. autofunction:: change_arg_to_image
+*   ``f,f,f``: The axes will have strides *(1, 3, 3*M)*,
+    leading to a Fortran-like / column-major layout.
 
-.. autofunction:: tag_data_axes
+*   ``sep,c,c``: The array will be mapped to three arrays of
+    shape *(M, 4)*, each with strides *(4, 1)*.
 
-.. autofunction:: remove_unused_arguments
+*   ``c,c,vec``: The array will be mapped to an array of
+    ``float4`` vectors, with (``float4``-based) strides of
+    *(M, 1)*.
 
-.. autofunction:: set_array_dim_names
+*   ``N1,N0,N2``: The axes will have strides *(M, 1, 3*M)*.
 
-Padding
-^^^^^^^
+.. }}}
 
-.. autofunction:: split_array_dim
+.. _substitution-rule:
 
-.. autofunction:: find_padding_multiple
+Substitution Rules
+------------------
 
-.. autofunction:: add_padding
+.. {{{
 
-Manipulating Instructions
+Substitution Rule Objects
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: set_instruction_priority
-
-.. autofunction:: add_dependency
-
-.. autofunction:: remove_instructions
-
-.. autofunction:: tag_instructions
-
-Library interface
-^^^^^^^^^^^^^^^^^
-
-.. autofunction:: register_reduction_parser
-
-.. autofunction:: register_preamble_generators
-
-.. autofunction:: register_symbol_manglers
-
-.. autofunction:: register_function_manglers
-
-Arguments
-^^^^^^^^^
-
-.. autofunction:: set_argument_order
-
-.. autofunction:: add_dtypes
-
-.. autofunction:: infer_unknown_types
-
-.. autofunction:: add_and_infer_dtypes
-
-.. autofunction:: rename_argument
-
-Batching
-^^^^^^^^
+.. autoclass:: SubstitutionRule
 
-.. autofunction:: to_batched
+.. _subst-rule-syntax:
 
-Finishing up
-^^^^^^^^^^^^
+Textual Syntax for Substitution Rules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autofunction:: generate_loop_schedules
-
-.. autofunction:: get_one_scheduled_kernel
-
-.. autofunction:: generate_code
-
-Running
--------
-
-.. autoclass:: CompiledKernel
-
-Automatic Testing
------------------
-
-.. autofunction:: auto_test_vs_ref
-
-Troubleshooting
----------------
-
-Printing :class:`LoopKernel` objects
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you're confused about things loopy is referring to in an error message or
-about the current state of the :class:`LoopKernel` you are transforming, the
-following always works::
-
-    print kernel
+Syntax of a substitution rule::
 
-(And it yields a human-readable--albeit terse--representation of *kernel*.)
+    rule_name(arg1, arg2) := EXPRESSION
 
-.. autofunction:: preprocess_kernel
+.. }}}
 
-.. autofunction:: get_dot_dependency_graph
+Kernel Options
+--------------
 
-.. autofunction:: show_dependency_graph
+.. autoclass:: Options
 
-Options
+Targets
 -------
 
-.. autoclass:: Options
+.. automodule:: loopy.target
 
-.. autofunction:: set_options
+Helper values
+-------------
 
-Controlling caching
--------------------
+.. {{{
 
-.. autofunction:: set_caching_enabled
+.. autoclass:: auto
 
-.. autoclass:: CacheMode
+.. autoclass:: UniqueName
 
-Obtaining Kernel Statistics
----------------------------
+.. }}}
 
-.. autofunction:: get_op_poly
+The Kernel Object
+-----------------
 
-.. autofunction:: get_gmem_access_poly
+Do not create :class:`LoopKernel` objects directly. Instead, refer to
+:ref:`creating-kernels`.
 
-.. autofunction:: sum_mem_access_to_bytes
+.. autoclass:: LoopKernel
 
-.. autofunction:: get_barrier_poly
+.. autoclass:: kernel_state
+    :members:
+    :undoc-members:
 
-.. vim: tw=75:spell
+.. vim: tw=75:spell:fdm=marker
diff --git a/doc/ref_other.rst b/doc/ref_other.rst
new file mode 100644
index 0000000000000000000000000000000000000000..71d6c54b11dcd15977bdb375cea2207d881b5696
--- /dev/null
+++ b/doc/ref_other.rst
@@ -0,0 +1,46 @@
+Reference: Other Functionality
+==============================
+
+Obtaining Kernel Performance Statistics
+---------------------------------------
+
+.. automodule:: loopy.statistics
+
+Controlling caching
+-------------------
+
+.. autofunction:: set_caching_enabled
+
+.. autoclass:: CacheMode
+
+Running Kernels
+---------------
+
+In addition to simply calling kernels using :meth:`LoopKernel.__call__`,
+the following underlying functionality may be used:
+
+.. autoclass:: CompiledKernel
+
+Automatic Testing
+-----------------
+
+.. autofunction:: auto_test_vs_ref
+
+Troubleshooting
+---------------
+
+Printing :class:`LoopKernel` objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you're confused about things loopy is referring to in an error message or
+about the current state of the :class:`LoopKernel` you are transforming, the
+following always works::
+
+    print(kernel)
+
+(And it yields a human-readable--albeit terse--representation of *kernel*.)
+
+.. autofunction:: get_dot_dependency_graph
+
+.. autofunction:: show_dependency_graph
+
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4c3c24873945dc0d87ec414ff7870fafd0a5bda1
--- /dev/null
+++ b/doc/ref_transform.rst
@@ -0,0 +1,135 @@
+.. _reference-transform:
+
+Reference: Transforming Kernels
+===============================
+
+Dealing with Parameters
+-----------------------
+
+.. automodule:: loopy.transform.parameter
+
+Wrangling inames
+----------------
+
+.. automodule:: loopy.transform.iname
+
+Dealing with Substitution Rules
+-------------------------------
+
+.. currentmodule:: loopy
+
+.. autofunction:: extract_subst
+
+.. autofunction:: assignment_to_subst
+
+.. autofunction:: expand_subst
+
+.. autofunction:: find_rules_matching
+
+.. autofunction:: find_one_rule_matching
+
+Caching, Precomputation and Prefetching
+---------------------------------------
+
+.. autofunction:: precompute
+
+.. autofunction:: add_prefetch
+
+.. autofunction:: buffer_array
+
+.. autofunction:: alias_temporaries
+
+Influencing data access
+-----------------------
+
+.. autofunction:: change_arg_to_image
+
+.. autofunction:: tag_data_axes
+
+.. autofunction:: remove_unused_arguments
+
+.. autofunction:: set_array_dim_names
+
+Padding Data
+------------
+
+.. autofunction:: split_array_dim
+
+.. autofunction:: find_padding_multiple
+
+.. autofunction:: add_padding
+
+Manipulating Instructions
+-------------------------
+
+.. autofunction:: set_instruction_priority
+
+.. autofunction:: add_dependency
+
+.. autofunction:: remove_instructions
+
+.. autofunction:: tag_instructions
+
+Registering Library Routines
+----------------------------
+
+.. autofunction:: register_reduction_parser
+
+.. autofunction:: register_preamble_generators
+
+.. autofunction:: register_symbol_manglers
+
+.. autofunction:: register_function_manglers
+
+Modifying Arguments
+-------------------
+
+.. autofunction:: set_argument_order
+
+.. autofunction:: add_dtypes
+
+.. autofunction:: infer_unknown_types
+
+.. autofunction:: add_and_infer_dtypes
+
+.. autofunction:: rename_argument
+
+Creating Batches of Operations
+------------------------------
+
+.. automodule:: loopy.transform.batch
+
+Finishing up
+------------
+
+.. currentmodule:: loopy
+
+.. autofunction:: preprocess_kernel
+
+.. autofunction:: generate_loop_schedules
+
+.. autofunction:: get_one_scheduled_kernel
+
+.. autofunction:: generate_code
+
+Setting options
+---------------
+
+.. autofunction:: set_options
+
+.. _context-matching:
+
+Matching contexts
+-----------------
+
+TODO: Matching instruction tags
+
+.. automodule:: loopy.context_matching
+
+.. autofunction:: parse_match
+
+.. autofunction:: parse_stack_match
+
+
+.. vim: tw=75:spell
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 24ec0ce4fb7a140fb70e65a8793edcf7e03b2042..4275474d9ce74e04aa3c7ae69356f0672db2128c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -96,6 +96,7 @@ always see loopy's view of a kernel by printing it.
 
 .. doctest::
 
+    >>> knl = lp.set_options(knl, allow_terminal_colors=False)
     >>> print(knl)
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
@@ -245,8 +246,6 @@ call :func:`loopy.generate_code`:
 .. doctest::
 
     >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
-    >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0])
-    >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl)
     >>> code, _ = lp.generate_code(typed_knl)
     >>> print(code)
     #define lid(N) ((int) get_local_id(N))
@@ -1186,8 +1185,12 @@ across the remaining axis of the workgroup would emerge.
 
 TODO
 
-Obtaining Kernel Statistics
----------------------------
+.. }}}
+
+Obtaining Performance Statistics
+--------------------------------
+
+.. {{{
 
 Operations, array access, and barriers can all be counted, which may facilitate
 performance prediction and optimization of a :mod:`loopy` kernel.
diff --git a/examples/python/ispc-harness.py b/examples/python/ispc-harness.py
deleted file mode 100644
index 637fd41fcdcdbcc7a464f8209680931adfd014f5..0000000000000000000000000000000000000000
--- a/examples/python/ispc-harness.py
+++ /dev/null
@@ -1,183 +0,0 @@
-import loopy as lp
-import numpy as np
-import numpy.linalg as la
-import ctypes
-import os
-from time import time
-
-
-# {{{ temporary directory
-
-class TemporaryDirectory(object):
-    """Create and return a temporary directory.  This has the same
-    behavior as mkdtemp but can be used as a context manager.  For
-    example:
-
-        with TemporaryDirectory() as tmpdir:
-            ...
-
-    Upon exiting the context, the directory and everything contained
-    in it are removed.
-    """
-
-    # Yanked from
-    # https://hg.python.org/cpython/file/3.3/Lib/tempfile.py
-
-    # Handle mkdtemp raising an exception
-    name = None
-    _closed = False
-
-    def __init__(self, suffix="", prefix="tmp", dir=None):
-        from tempfile import mkdtemp
-        self.name = mkdtemp(suffix, prefix, dir)
-
-    def __repr__(self):
-        return "<{} {!r}>".format(self.__class__.__name__, self.name)
-
-    def __enter__(self):
-        return self.name
-
-    def cleanup(self, _warn=False):
-        import warnings
-        if self.name and not self._closed:
-            from shutil import rmtree
-            try:
-                rmtree(self.name)
-            except (TypeError, AttributeError) as ex:
-                if "None" not in '%s' % (ex,):
-                    raise
-                self._rmtree(self.name)
-            self._closed = True
-            if _warn and warnings.warn:
-                warnings.warn("Implicitly cleaning up {!r}".format(self))
-
-    def __exit__(self, exc, value, tb):
-        self.cleanup()
-
-    def __del__(self):
-        # Issue a ResourceWarning if implicit cleanup needed
-        self.cleanup(_warn=True)
-
-# }}}
-
-
-# {{{ build_ispc_shared_lib
-
-def build_ispc_shared_lib(
-        cwd, ispc_sources, cxx_sources,
-        ispc_options=[], cxx_options=[]):
-    from os.path import join
-
-    ispc_source_names = []
-    for name, contents in ispc_sources:
-        ispc_source_names.append(name)
-
-        with open(join(cwd, name), "w") as srcf:
-            srcf.write(contents)
-
-    cxx_source_names = []
-    for name, contents in cxx_sources:
-        cxx_source_names.append(name)
-
-        with open(join(cwd, name), "w") as srcf:
-            srcf.write(contents)
-
-    from subprocess import check_call
-
-    check_call(
-            ["ispc",
-                "--pic",
-                "--opt=force-aligned-memory",
-                "--target=avx2-i32x8",
-                "-o", "ispc.o"]
-            + ispc_options
-            + list(ispc_source_names),
-            cwd=cwd)
-
-    check_call(
-            [
-                "g++",
-                "-shared", "-fopenmp", "-Wl,--export-dynamic",
-                "-fPIC",
-                "-oshared.so",
-                "ispc.o",
-                ]
-            + cxx_options
-            + list(cxx_source_names),
-            cwd=cwd)
-
-# }}}
-
-
-def cptr_from_numpy(obj):
-    ary_intf = getattr(obj, "__array_interface__", None)
-    if ary_intf is None:
-        raise RuntimeError("no array interface")
-
-    buf_base, is_read_only = ary_intf["data"]
-    return ctypes.c_void_p(buf_base + ary_intf.get("offset", 0))
-
-
-def main():
-    with open("tasksys.cpp", "r") as ts_file:
-        tasksys_source = ts_file.read()
-
-    from loopy.target.ispc import ISPCTarget
-    stream_knl = lp.make_kernel(
-            "{[i]: 0<=i<n}",
-            "z[i] = a*x[i] + y[i]",
-            target=ISPCTarget())
-
-    stream_dtype = np.float64
-    stream_ctype = ctypes.c_double
-
-    stream_knl = lp.add_and_infer_dtypes(stream_knl, {
-        "a": stream_dtype,
-        "x": stream_dtype,
-        "y": stream_dtype
-        })
-
-    stream_knl = lp.assume(stream_knl, "n>0")
-    stream_knl = lp.split_iname(stream_knl, "i", 8, inner_tag="l.0")
-    stream_knl = lp.split_iname(stream_knl,
-            "i_outer", 2**22, outer_tag="g.0")
-    stream_knl = lp.preprocess_kernel(stream_knl)
-    stream_knl = lp.get_one_scheduled_kernel(stream_knl)
-    stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z")
-    ispc_code, arg_info = lp.generate_code(stream_knl)
-
-    with TemporaryDirectory() as tmpdir:
-        build_ispc_shared_lib(
-                tmpdir,
-                [("stream.ispc", ispc_code)],
-                [("tasksys.cpp", tasksys_source)])
-
-        print(ispc_code)
-        knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so"))
-
-        n = 2**28
-        a = 5
-        x = np.empty(n, dtype=stream_dtype)
-        y = np.empty(n, dtype=stream_dtype)
-        z = np.empty(n, dtype=stream_dtype)
-
-        nruns = 30
-        start_time = time()
-        for irun in range(nruns):
-            knl_lib.loopy_kernel(
-                    ctypes.c_int(n), stream_ctype(a),
-                    cptr_from_numpy(x),
-                    cptr_from_numpy(y),
-                    cptr_from_numpy(z))
-        elapsed = time() - start_time
-
-        print(1e-9*3*x.nbytes*nruns/elapsed, "GB/s")
-
-        print(la.norm(z-a*x+y))
-
-
-
-if __name__ == "__main__":
-    main()
-
-# vim: foldmethod=marker
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
new file mode 100644
index 0000000000000000000000000000000000000000..a402896c85baa49bc23bb5d770607cf73641c273
--- /dev/null
+++ b/examples/python/ispc-stream-harness.py
@@ -0,0 +1,156 @@
+import loopy as lp
+import numpy as np
+import numpy.linalg as la
+import ctypes
+import ctypes.util
+import os
+from time import time
+from tempfile import TemporaryDirectory
+
+from loopy.tools import (empty_aligned, address_from_numpy,
+        build_ispc_shared_lib, cptr_from_numpy)
+
+
+def transform(knl, vars, stream_dtype):
+    vars = [v.strip() for v in vars.split(",")]
+    knl = lp.assume(knl, "n>0")
+    knl = lp.split_iname(
+        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
+    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        var: stream_dtype
+        for var in vars
+        })
+
+    knl = lp.set_argument_order(knl, vars + ["n"])
+
+    return knl
+
+
+def gen_code(knl):
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+    ispc_code, arg_info = lp.generate_code(knl)
+
+    return ispc_code
+
+
+NRUNS = 10
+ALIGN_TO = 4096
+ARRAY_SIZE = 2**28
+
+if 0:
+    STREAM_DTYPE = np.float64
+    STREAM_CTYPE = ctypes.c_double
+else:
+    STREAM_DTYPE = np.float32
+    STREAM_CTYPE = ctypes.c_float
+
+if 1:
+    INDEX_DTYPE = np.int32
+    INDEX_CTYPE = ctypes.c_int
+else:
+    INDEX_DTYPE = np.int64
+    INDEX_CTYPE = ctypes.c_longlong
+
+
+def main():
+    with open("tasksys.cpp", "r") as ts_file:
+        tasksys_source = ts_file.read()
+
+    def make_knl(name, insn, vars):
+        knl = lp.make_kernel(
+                "{[i]: 0<=i<n}",
+                insn,
+                target=lp.ISPCTarget(), index_dtype=INDEX_DTYPE,
+                name="stream_"+name+"_tasks")
+
+        knl = transform(knl, vars, STREAM_DTYPE)
+        return knl
+
+    init_knl = make_knl("init", """
+                a[i] = 1
+                b[i] = 2
+                c[i] = 0
+                """, "a,b,c")
+    triad_knl = make_knl("triad", """
+            a[i] = b[i] + scalar * c[i]
+            """, "a,b,c,scalar")
+
+    with TemporaryDirectory() as tmpdir:
+        ispc_code = gen_code(init_knl) + gen_code(triad_knl)
+        print(ispc_code)
+
+        build_ispc_shared_lib(
+                tmpdir,
+                [("stream.ispc", ispc_code)],
+                [("tasksys.cpp", tasksys_source)],
+                cxx_options=["-g", "-fopenmp", "-DISPC_USE_OMP"],
+                ispc_options=([
+                    #"-g", "--no-omit-frame-pointer",
+                    "--target=avx2-i32x8",
+                    "--opt=force-aligned-memory",
+                    "--opt=disable-loop-unroll",
+                    #"--opt=fast-math",
+                    #"--opt=disable-fma",
+                    ]
+                    + (["--addressing=64"] if INDEX_DTYPE == np.int64 else [])
+                    ),
+                #ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
+                quiet=False,
+                )
+
+        knl_lib = ctypes.cdll.LoadLibrary(os.path.join(tmpdir, "shared.so"))
+
+        scalar = 5
+
+        a = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+        b = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+        c = empty_aligned(ARRAY_SIZE, dtype=STREAM_DTYPE, n=ALIGN_TO)
+
+        print(
+                hex(address_from_numpy(a)),
+                hex(address_from_numpy(b)),
+                hex(address_from_numpy(c)))
+        assert address_from_numpy(a) % ALIGN_TO == 0
+        assert address_from_numpy(b) % ALIGN_TO == 0
+        assert address_from_numpy(c) % ALIGN_TO == 0
+
+        knl_lib.stream_init_tasks(
+                cptr_from_numpy(a),
+                cptr_from_numpy(b),
+                cptr_from_numpy(c),
+                INDEX_CTYPE(ARRAY_SIZE),
+                )
+
+        def call_kernel():
+            knl_lib.stream_triad_tasks(
+                    cptr_from_numpy(a),
+                    cptr_from_numpy(b),
+                    cptr_from_numpy(c),
+                    STREAM_CTYPE(scalar),
+                    INDEX_CTYPE(ARRAY_SIZE),
+                    )
+
+        call_kernel()
+        call_kernel()
+
+        start_time = time()
+
+        for irun in range(NRUNS):
+            call_kernel()
+
+        elapsed = time() - start_time
+
+        print(elapsed/NRUNS)
+
+        print(1e-9*3*a.nbytes*NRUNS/elapsed, "GB/s")
+
+        assert la.norm(a-b+scalar*c, np.inf) < np.finfo(STREAM_DTYPE).eps * 10
+
+
+if __name__ == "__main__":
+    main()
+
+# vim: foldmethod=marker
diff --git a/examples/python/run-ispc-harness.sh b/examples/python/run-ispc-harness.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f39a51fefee478b8023ded65a2ce3c8de018c852
--- /dev/null
+++ b/examples/python/run-ispc-harness.sh
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+OMP_PLACES=cores OMP_DISPLAY_ENV=true OMP_SCHEDULE=static python "$@"
diff --git a/loopy/__init__.py b/loopy/__init__.py
index bf0a2be1bae22f93bad784ea59874fb305f898f8..5e3ad508596a20cce8a084e4d70604cdafa47c79 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -38,11 +38,13 @@ from loopy.library.function import (
 
 from loopy.kernel.data import (
         auto,
+        KernelArgument,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
         InstructionBase, Assignment, ExpressionInstruction, CInstruction,
-        TemporaryVariable)
+        TemporaryVariable,
+        SubstitutionRule)
 
-from loopy.kernel import LoopKernel
+from loopy.kernel import LoopKernel, kernel_state
 from loopy.kernel.tools import (
         get_dot_dependency_graph,
         show_dependency_graph,
@@ -54,10 +56,12 @@ from loopy.library.reduction import register_reduction_parser
 # {{{ import transforms
 
 from loopy.transform.iname import (
-        assume, set_loop_priority,
+        set_loop_priority,
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
         rename_iname, link_inames, remove_unused_inames,
-        affine_map_inames, find_unused_axis_tag)
+        split_reduction_inward, split_reduction_outward,
+        affine_map_inames, find_unused_axis_tag,
+        make_reduction_inames_unique)
 
 from loopy.transform.instruction import (
         find_instructions, map_instructions,
@@ -79,8 +83,7 @@ from loopy.transform.buffer import buffer_array
 from loopy.transform.fusion import fuse_kernels
 
 from loopy.transform.arithmetic import (
-        split_reduction_inward,
-        split_reduction_outward, fold_constants,
+        fold_constants,
         collect_common_factors_on_increment)
 
 from loopy.transform.padding import (
@@ -89,7 +92,7 @@ from loopy.transform.padding import (
 
 from loopy.transform.ilp import realize_ilp
 from loopy.transform.batch import to_batched
-from loopy.transform.parameter import fix_parameters
+from loopy.transform.parameter import assume, fix_parameters
 
 # }}}
 
@@ -107,15 +110,24 @@ from loopy.auto_test import auto_test_vs_ref
 from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
         parse_fortran)
 
+from loopy.target import TargetBase
+from loopy.target.c import CTarget
+from loopy.target.cuda import CudaTarget
+from loopy.target.opencl import OpenCLTarget
+from loopy.target.pyopencl import PyOpenCLTarget
+from loopy.target.ispc import ISPCTarget
+
 __all__ = [
         "TaggedVariable", "Reduction", "LinearSubscript",
 
         "auto",
 
-        "LoopKernel",
+        "LoopKernel", "kernel_state",
 
+        "KernelArgument",
         "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg",
         "TemporaryVariable",
+        "SubstitutionRule",
 
         "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction",
 
@@ -127,11 +139,13 @@ __all__ = [
 
         # {{{ transforms
 
-        "assume", "set_loop_priority",
+        "set_loop_priority",
         "split_iname", "chunk_iname", "join_inames", "tag_inames",
         "duplicate_inames",
         "rename_iname", "link_inames", "remove_unused_inames",
+        "split_reduction_inward", "split_reduction_outward",
         "affine_map_inames", "find_unused_axis_tag",
+        "make_reduction_inames_unique",
 
         "add_prefetch", "change_arg_to_image", "tag_data_axes",
         "set_array_dim_names", "remove_unused_arguments",
@@ -148,7 +162,6 @@ __all__ = [
         "precompute", "buffer_array",
         "fuse_kernels",
 
-        "split_reduction_inward", "split_reduction_outward",
         "fold_constants", "collect_common_factors_on_increment",
 
         "split_array_dim", "split_arg_axis", "find_padding_multiple",
@@ -158,7 +171,7 @@ __all__ = [
 
         "to_batched",
 
-        "fix_parameters",
+        "assume", "fix_parameters",
 
         # }}}
 
@@ -186,6 +199,9 @@ __all__ = [
 
         "LoopyError", "LoopyWarning",
 
+        "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget",
+        "PyOpenCLTarget", "ISPCTarget",
+
         # {{{ from this file
 
         "register_preamble_generators",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index d51f669ca1e9d0afb53d8c39a7a784e772075767..607d091b98b7a1919dacbe10f0280ea35c666a43 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -1,7 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -25,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from six.moves import range, zip
 from pytools import Record
 from warnings import warn
 
@@ -313,15 +311,28 @@ def _default_check_result(result, ref_result):
 
 # {{{ find device for reference test
 
-def _enumerate_cl_devices_for_ref_test():
+def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
     import pyopencl as cl
 
     noncpu_devs = []
     cpu_devs = []
 
+    if isinstance(blacklist_ref_vendors, str):
+        blacklist_ref_vendors = blacklist_ref_vendors.split(",")
+
     for pf in cl.get_platforms():
         for dev in pf.get_devices():
+            if any(bl in dev.platform.vendor
+                    for bl in blacklist_ref_vendors):
+                print("blacklist", dev, blacklist_ref_vendors)
+                continue
+
             if dev.type & cl.device_type.CPU:
+                if "Intel" in dev.platform.vendor:
+                    # Sorry, Intel, your CPU CL has gotten too crashy of late.
+                    # (Feb 2016)
+                    continue
+
                 cpu_devs.append(dev)
             else:
                 noncpu_devs.append(dev)
@@ -351,7 +362,7 @@ def auto_test_vs_ref(
         dump_binary=False,
         fills_entire_output=None, do_check=True, check_result=None,
         max_test_kernel_count=1,
-        quiet=False):
+        quiet=False, blacklist_ref_vendors=[]):
     """Compare results of `ref_knl` to the kernels generated by
     scheduling *test_knl*.
 
@@ -406,7 +417,7 @@ def auto_test_vs_ref(
 
     ref_errors = []
 
-    for dev in _enumerate_cl_devices_for_ref_test():
+    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
         ref_ctx = cl.Context([dev])
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
@@ -417,6 +428,9 @@ def auto_test_vs_ref(
             ref_sched_kernel = knl
             break
 
+        logger.info("%s (ref): trying %s for the reference calculation" % (
+            ref_knl.name, dev))
+
         ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
         if not quiet and print_ref_code:
             print(75*"-")
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 2e136d7bb5264238f024775b4b9b9d31479bb914..91af52cde0000ce811172be53bf21e13ce5fde97 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -464,10 +464,15 @@ def generate_code(kernel, device=None):
         warn("passing 'device' to generate_code() is deprecated",
                 DeprecationWarning, stacklevel=2)
 
+    from loopy.kernel import kernel_state
+    if kernel.state == kernel_state.INITIAL:
+        from loopy.preprocess import preprocess_kernel
+        kernel = preprocess_kernel(kernel)
+
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
         kernel = get_one_scheduled_kernel(kernel)
-    from loopy.kernel import kernel_state
+
     if kernel.state != kernel_state.SCHEDULED:
         raise LoopyError("cannot generate code for a kernel that has not been "
                 "scheduled")
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 6d0b2ca60504060e677d1e3bb941b6c3ccbb3fc2..70530d3a90cbdd5a48395034dd05000f6781b473 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -424,13 +424,15 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
             from cgen import Comment
             result.append(Comment(cmt))
 
-        from cgen import Initializer, POD, Const, Line
+        from loopy.codegen import POD
+        from cgen import Initializer, Const, Line
         from loopy.symbolic import aff_to_expr
 
         if (static_ubound - static_lbound).plain_is_zero():
             # single-trip, generate just a variable assignment, not a loop
             result.append(gen_code_block([
-                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
+                Initializer(
+                    Const(POD(kernel.target, kernel.index_dtype, loop_iname)),
                     ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                 Line(),
                 inner,
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 6d4396b5a11ef99bcf7e8e0b03b5d9d7d8fb6d88..32536021dcfb3a1460f23af7e066aff9bf772f46 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -791,8 +791,16 @@ class _CLKernelInfo(Record):
 
 
 class CompiledKernel:
+    """An object connecting a kernel to a :class:`pyopencl.Context`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
     def __init__(self, context, kernel):
         """
+        :arg context: a :class:`pyopencl.Context`
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
             kernel has not yet been loop-scheduled, that is done, too, with no
@@ -908,10 +916,12 @@ class CompiledKernel:
 
     def __call__(self, queue, **kwargs):
         """
-        :arg allocator:
-        :arg wait_for:
-        :arg out_host:
-
+        :arg allocator: a callable passed a byte count and returning
+            a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator
+            maybe.
+        :arg wait_for: A list of :class:`pyopencl.Event` instances
+            for which to wait.
+        :arg out_host: :class:`bool`
             Decides whether output arguments (i.e. arguments
             written by the kernel) are to be returned as
             :mod:`numpy` arrays. *True* for yes, *False* for no.
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index f2bbb288249332dc6a333f95fa2600fa48967e60..561a29cc816ba25372fb957311420b8c18291ac7 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -237,7 +237,7 @@ def parse_transformed_fortran(source, free_form=True, strict=True,
 
 
 def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True,
-        auto_dependencies=True):
+        auto_dependencies=True, target=None):
     """
     :returns: a list of :class:`loopy.LoopKernel` objects
     """
@@ -257,7 +257,8 @@ def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True,
                 "and returned invalid data (Sorry!)")
 
     from loopy.frontend.fortran.translator import F2LoopyTranslator
-    f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies)
+    f2loopy = F2LoopyTranslator(filename, auto_dependencies=auto_dependencies,
+            target=target)
     f2loopy(tree)
 
     return f2loopy.make_kernels()
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 53ea602f1d9b0a5b4fd9b04bc2486d7daae0ca5f..68dd5fa952c2dca519397980288ef56c1b5fdd5f 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -198,10 +198,11 @@ class Scope(object):
 # {{{ translator
 
 class F2LoopyTranslator(FTreeWalkerBase):
-    def __init__(self, filename, auto_dependencies):
+    def __init__(self, filename, auto_dependencies, target=None):
         FTreeWalkerBase.__init__(self)
 
         self.auto_dependencies = auto_dependencies
+        self.target = target
 
         self.scope_stack = []
 
@@ -679,6 +680,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
                     name=sub.subprogram_name,
                     default_order="F",
                     index_dtype=self.index_dtype,
+                    target=self.target,
                     )
 
             from loopy.loop import fuse_loop_domains
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 42f83bfdf45d030a77e3fb045460f064ff58cbc9..fb081a959523114ee5fc823eec59420f4adf3457 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -90,11 +90,16 @@ class LoopKernel(RecordWithoutPickling):
     .. attribute:: domains
 
         a list of :class:`islpy.BasicSet` instances
+        representing the :ref:`domain-tree`.
 
     .. attribute:: instructions
+
+        A list of :class:`InstructionBase` instances, e.g.
+        :class:`Assignment`. See :ref:`instructions`.
+
     .. attribute:: args
 
-        A list of :class:`loopy.kernel.data.KernelArgument`
+        A list of :class:`loopy.KernelArgument`
 
     .. attribute:: schedule
 
@@ -108,7 +113,7 @@ class LoopKernel(RecordWithoutPickling):
     .. attribute:: temporary_variables
 
         A :class:`dict` of mapping variable names to
-        :class:`loopy.kernel.data.TemporaryVariable`
+        :class:`loopy.TemporaryVariable`
         instances.
 
     .. attribute:: iname_to_tag
@@ -155,7 +160,7 @@ class LoopKernel(RecordWithoutPickling):
 
     .. attribute:: target
 
-        A subclass of :class:`loopy.target.TargetBase`.
+        A subclass of :class:`loopy.TargetBase`.
     """
 
     # {{{ constructor
@@ -1007,6 +1012,9 @@ class LoopKernel(RecordWithoutPickling):
 
         printed_insn_ids = set()
 
+        Fore = self.options._fore
+        Style = self.options._style
+
         def print_insn(insn):
             if insn.id in printed_insn_ids:
                 return
@@ -1030,7 +1038,7 @@ class LoopKernel(RecordWithoutPickling):
 
             loop_list = ",".join(sorted(kernel.insn_inames(insn)))
 
-            options = [insn.id]
+            options = [Fore.GREEN+insn.id+Style.RESET_ALL]
             if insn.priority:
                 options.append("priority=%d" % insn.priority)
             if insn.tags:
@@ -1043,12 +1051,15 @@ class LoopKernel(RecordWithoutPickling):
             if len(loop_list) > loop_list_width:
                 lines.append("[%s]" % loop_list)
                 lines.append("%s%s <- %s   # %s" % (
-                    (loop_list_width+2)*" ", lhs,
-                    rhs, ", ".join(options)))
+                    (loop_list_width+2)*" ", Fore.BLUE+lhs+Style.RESET_ALL,
+                    Fore.MAGENTA+rhs+Style.RESET_ALL,
+                    ", ".join(options)))
             else:
                 lines.append("[%s]%s%s <- %s   # %s" % (
                     loop_list, " "*(loop_list_width-len(loop_list)),
-                    lhs, rhs, ",".join(options)))
+                    Fore.BLUE + lhs + Style.RESET_ALL,
+                    Fore.MAGENTA+rhs+Style.RESET_ALL,
+                    ",".join(options)))
 
             lines.extend(trailing)
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f7c478dff296f26bdd342670b319104e7c3ef694..6a5d523e61c27c068a380b7de6c206147d7bb271 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1011,8 +1011,22 @@ def resolve_wildcard_deps(knl):
 def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     """User-facing kernel creation entrypoint.
 
-    :arg domains: :class:`islpy.BasicSet`
+    :arg domains:
+
+        A list of :class:`islpy.BasicSet` (i.e. convex set) instances
+        representing the :ref:`domain-tree`. May also be a list of strings
+        which will be parsed into such instances according to :ref:`isl-syntax`.
+
     :arg instructions:
+
+        A list of :class:`Assignment` (or other :class:`InstructionBase`
+        subclasses), possibly intermixed with instances of
+        :class:`SubstitutionRule`. This same list may also contain strings
+        which will be parsed into such objects using the
+        :ref:`assignment-syntax` and the :ref:`subst-rule-syntax`.  May also be
+        a single multi-line string which will be split into lines and then
+        parsed.
+
     :arg kernel_data:
 
         A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances.
@@ -1054,7 +1068,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         strides. They are expanded only upon kernel creation.
     :arg default_order: "C" (default) or "F"
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
-        *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments.
+        *offset* in :attr:`GlobalArg` for guessed arguments.
         Defaults to 0.
     :arg function_manglers: list of functions of signature (name, arg_dtypes)
         returning a tuple (result_dtype, c_name)
@@ -1074,8 +1088,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         to silence
     :arg options: an instance of :class:`loopy.Options` or an equivalent
         string representation
-    :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*,
-        to use the default target. (see :func:`loopy.set_default_target`)
+    :arg target: an instance of :class:`loopy.TargetBase`, or *None*,
+        to use the default target.
     """
 
     defines = kwargs.pop("defines", {})
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 9d0848788a09b7a6b914a1ce5569faf3102f18b2..5b0cf57e50298febe0580da5fb0efe5756557527 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -185,6 +185,8 @@ def parse_tag(tag):
 # {{{ arguments
 
 class KernelArgument(Record):
+    """Base class for all argument types"""
+
     def __init__(self, **kwargs):
         kwargs["name"] = intern(kwargs.pop("name"))
 
@@ -418,7 +420,9 @@ class SubstitutionRule(Record):
 # {{{ base class
 
 class InstructionBase(Record):
-    """
+    """A base class for all types of instruction that can occur in
+    a kernel.
+
     .. attribute:: id
 
         An (otherwise meaningless) identifier that is unique within
@@ -427,7 +431,7 @@ class InstructionBase(Record):
     .. attribute:: depends_on
 
         a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
-         that *must* be executed before this one. Note that
+        that *must* be executed before this one. Note that
         :func:`loopy.preprocess_kernel` (usually invoked automatically)
         augments this by adding dependencies on any writes to temporaries read
         by this instruction.
@@ -493,6 +497,14 @@ class InstructionBase(Record):
 
         A tuple of string identifiers that can be used to identify groups
         of instructions.
+
+    .. automethod:: __init__
+    .. automethod:: assignees_and_indices
+    .. automethod:: with_transformed_expressions
+    .. automethod:: write_dependency_names
+    .. automethod:: dependency_names
+    .. automethod:: assignee_var_names
+    .. automethod:: copy
     """
 
     fields = set("id depends_on depends_on_is_final "
@@ -568,10 +580,12 @@ class InstructionBase(Record):
                 predicates=predicates,
                 tags=tags)
 
+    # legacy
     @property
     def insn_deps(self):
         return self.depends_on
 
+    # legacy
     @property
     def insn_deps_is_final(self):
         return self.depends_on_is_final
@@ -740,6 +754,8 @@ class Assignment(InstructionBase):
 
         if not *None*, a type that will be assigned to the new temporary variable
         created from the assignee
+
+    .. automethod:: __init__
     """
 
     fields = InstructionBase.fields | \
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 46b301b47c45430078f554a178448687eb490e94..cb72f283e2151fd8d097f19eb4e4876cc78ccd43 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -770,7 +770,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         If *axis* is None, find a suitable axis automatically.
         """
         try:
-            desired_length = kernel.get_constant_iname_length(iname)
+            with isl.SuppressedWarnings(kernel.isl_context):
+                desired_length = kernel.get_constant_iname_length(iname)
         except isl.Error:
             # Likely unbounded, automatic assignment is not
             # going to happen for this iname.
@@ -882,7 +883,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
             def get_iname_length(iname):
                 try:
-                    return kernel.get_constant_iname_length(iname)
+                    with isl.SuppressedWarnings(kernel.isl_context):
+                        return kernel.get_constant_iname_length(iname)
                 except isl.Error:
                     return -1
             # assign longest auto axis inames first
diff --git a/loopy/options.py b/loopy/options.py
index 049d21affd284d94f27416d018d38bdbd5b4bddc..9b27f111e2fb93b09090b35278a3753f9f1a3f82 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -27,6 +27,11 @@ from pytools import Record
 import re
 
 
+class _ColoramaStub(object):
+    def __getattribute__(self, name):
+        return ""
+
+
 class Options(Record):
     """
     Unless otherwise specified, these options are Boolean-valued
@@ -105,6 +110,11 @@ class Options(Record):
 
         Options to pass to the OpenCL compiler when building the kernel.
         A list of strings.
+
+    .. attribute:: allow_terminal_colors
+
+        A :class:`bool`. Whether to allow colors in terminal output
+
     """
 
     def __init__(
@@ -124,6 +134,7 @@ class Options(Record):
             write_wrapper=False, highlight_wrapper=False,
             write_cl=False, highlight_cl=False,
             edit_cl=False, cl_build_options=[],
+            allow_terminal_colors=True
             ):
         Record.__init__(
                 self,
@@ -137,6 +148,7 @@ class Options(Record):
                 write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper,
                 write_cl=write_cl, highlight_cl=highlight_cl,
                 edit_cl=edit_cl, cl_build_options=cl_build_options,
+                allow_terminal_colors=allow_terminal_colors,
                 )
 
     def update(self, other):
@@ -150,6 +162,30 @@ class Options(Record):
         for field_name in sorted(self.__class__.fields):
             key_builder.rec(key_hash, getattr(self, field_name))
 
+    @property
+    def _fore(self):
+        if self.allow_terminal_colors:
+            import colorama
+            return colorama.Fore
+        else:
+            return _ColoramaStub()
+
+    @property
+    def _back(self):
+        if self.allow_terminal_colors:
+            import colorama
+            return colorama.Back
+        else:
+            return _ColoramaStub()
+
+    @property
+    def _style(self):
+        if self.allow_terminal_colors:
+            import colorama
+            return colorama.Style
+        else:
+            return _ColoramaStub()
+
 
 KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$")
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fe88118e018d5829e9ff2104b70940a39cb95ade..4c75cfd250807c6959c1e5167465d34b029d762e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -66,6 +66,45 @@ def prepare_for_caching(kernel):
 # }}}
 
 
+# {{{ check reduction iname uniqueness
+
+def check_reduction_iname_uniqueness(kernel):
+    iname_to_reduction_count = {}
+    iname_to_nonsimultaneous_reduction_count = {}
+
+    def map_reduction(expr, rec):
+        rec(expr.expr)
+        for iname in expr.inames:
+            iname_to_reduction_count[iname] = (
+                    iname_to_reduction_count.get(iname, 0) + 1)
+            if not expr.allow_simultaneous:
+                iname_to_nonsimultaneous_reduction_count[iname] = (
+                        iname_to_nonsimultaneous_reduction_count.get(iname, 0) + 1)
+
+        return expr
+
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(map_reduction)
+
+    for insn in kernel.instructions:
+        insn.with_transformed_expressions(cb_mapper)
+
+    for iname, count in six.iteritems(iname_to_reduction_count):
+        nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0)
+
+        if nonsimul_count and count > 1:
+            raise LoopyError("iname '%s' used in more than one reduction. "
+                    "(%d of them, to be precise.) "
+                    "Since this usage can easily cause loop scheduling "
+                    "problems, this is prohibited by default. "
+                    "Use loopy.make_reduction_inames_unique() to fix this. "
+                    "If you are sure that this is OK, write the reduction "
+                    "as 'simul_reduce(...)' instead of 'reduce(...)'"
+                    % (iname, count))
+
+# }}}
+
+
 # {{{ infer types
 
 def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
@@ -677,11 +716,13 @@ def preprocess_kernel(kernel, device=None):
     kernel = expand_subst(kernel)
 
     # Ordering restriction:
-    # Type inference doesn't handle substitutions. Get them out of the
-    # way.
+    # Type inference and reduction iname uniqueness don't handle substitutions.
+    # Get them out of the way.
 
     kernel = infer_unknown_types(kernel, expect_completion=False)
 
+    check_reduction_iname_uniqueness(kernel)
+
     kernel = add_default_dependencies(kernel)
 
     # Ordering restrictions:
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 17c1ab3ce0bad0170f8f7b03e243862de33af21d..de71ffaed0d457f96ee6ddbdeeafac58c0959aa9 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -317,11 +317,40 @@ def group_insn_counts(kernel):
 
     return result
 
+
+def gen_dependencies_except(kernel, insn_id, except_insn_ids):
+    insn = kernel.id_to_insn[insn_id]
+    for dep_id in insn.depends_on:
+
+        if dep_id in except_insn_ids:
+            continue
+
+        yield dep_id
+
+        for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids):
+            yield sub_dep_id
+
 # }}}
 
 
 # {{{ debug help
 
+def format_insn_id(kernel, insn_id):
+    Fore = kernel.options._fore
+    Style = kernel.options._style
+    return Fore.GREEN + insn_id + Style.RESET_ALL
+
+
+def format_insn(kernel, insn_id):
+    insn = kernel.id_to_insn[insn_id]
+    Fore = kernel.options._fore
+    Style = kernel.options._style
+    return "[%s] %s%s%s <- %s%s%s" % (
+            format_insn_id(kernel, insn_id),
+            Fore.BLUE, str(insn.assignee), Style.RESET_ALL,
+            Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
+
+
 def dump_schedule(kernel, schedule):
     lines = []
     indent = ""
@@ -337,8 +366,7 @@ def dump_schedule(kernel, schedule):
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
             if isinstance(insn, Assignment):
-                insn_str = "[%s] %s <- %s" % (
-                        insn.id, str(insn.assignee), str(insn.expression))
+                insn_str = format_insn(kernel, sched_item.insn_id)
             else:
                 insn_str = sched_item.insn_id
             lines.append(indent + insn_str)
@@ -482,6 +510,8 @@ def generate_loop_schedules_internal(
     # to give loops containing high-priority instructions a chance.
 
     kernel = sched_state.kernel
+    Fore = kernel.options._fore
+    Style = kernel.options._style
 
     if allow_boost is None:
         rec_allow_boost = None
@@ -510,7 +540,7 @@ def generate_loop_schedules_internal(
         print(dump_schedule(sched_state.kernel, sched_state.schedule))
         #print("boost allowed:", allow_boost)
         print(75*"=")
-        print("LOOP NEST MAP:")
+        print("LOOP NEST MAP (inner: outer):")
         for iname, val in six.iteritems(sched_state.loop_nest_around_map):
             print("%s : %s" % (iname, ", ".join(val)))
         print(75*"=")
@@ -549,7 +579,7 @@ def generate_loop_schedules_internal(
         if not is_ready:
             if debug_mode:
                 print("instruction '%s' is missing insn depedencies '%s'" % (
-                        insn.id, ",".join(
+                        format_insn(kernel, insn.id), ",".join(
                             insn.depends_on - sched_state.scheduled_insn_ids)))
             continue
 
@@ -570,10 +600,10 @@ def generate_loop_schedules_internal(
             if debug_mode:
                 if want-have:
                     print("instruction '%s' is missing inames '%s'"
-                            % (insn.id, ",".join(want-have)))
+                            % (format_insn(kernel, insn.id), ",".join(want-have)))
                 if have-want:
                     print("instruction '%s' won't work under inames '%s'"
-                            % (insn.id, ",".join(have-want)))
+                            % (format_insn(kernel, insn.id), ",".join(have-want)))
 
         # {{{ determine group-based readiness
 
@@ -595,7 +625,7 @@ def generate_loop_schedules_internal(
         # }}}
 
         if is_ready and debug_mode:
-            print("ready to schedule '%s'" % insn.id)
+            print("ready to schedule '%s'" % format_insn(kernel, insn.id))
 
         if is_ready and not debug_mode:
             iid_set = frozenset([insn.id])
@@ -660,7 +690,38 @@ def generate_loop_schedules_internal(
                 if last_entered_loop in kernel.insn_inames(insn):
                     if debug_mode:
                         print("cannot leave '%s' because '%s' still depends on it"
-                                % (last_entered_loop, insn.id))
+                                % (last_entered_loop, format_insn(kernel, insn.id)))
+
+                        # check if there's a dependency of insn that needs to be
+                        # outside of last_entered_loop.
+                        for subdep_id in gen_dependencies_except(kernel, insn_id,
+                                sched_state.unscheduled_insn_ids):
+                            subdep = kernel.id_to_insn[subdep_id]
+                            want = (kernel.insn_inames(subdep_id)
+                                    - sched_state.parallel_inames)
+                            if (
+                                    last_entered_loop not in want and
+                                    last_entered_loop not in subdep.boostable_into):
+                                print(
+                                    "%(warn)swarning:%(reset_all)s '%(iname)s', "
+                                    "which the schedule is "
+                                    "currently stuck inside of, seems mis-nested. "
+                                    "'%(subdep)s' must occur " "before '%(dep)s', "
+                                    "but '%(subdep)s' must be outside "
+                                    "'%(iname)s', whereas '%(dep)s' must be back "
+                                    "in it.%(reset_all)s\n"
+                                    "  %(subdep_i)s\n"
+                                    "  %(dep_i)s"
+                                    % {
+                                        "warn": Fore.RED + Style.BRIGHT,
+                                        "reset_all": Style.RESET_ALL,
+                                        "iname": last_entered_loop,
+                                        "subdep": format_insn_id(kernel, subdep_id),
+                                        "dep": format_insn_id(kernel, insn_id),
+                                        "subdep_i": format_insn(kernel, subdep_id),
+                                        "dep_i": format_insn(kernel, insn_id),
+                                        })
+
                     can_leave = False
                     break
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index d39f2f8556ad62642794ab2ef9b4c9504fdb27b1..5aa251dc9318bae6fe49b69642722ecc88d79887 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -35,6 +35,21 @@ from loopy.kernel.data import Assignment
 from loopy.diagnostic import warn
 
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autofunction:: get_op_poly
+
+.. autofunction:: get_gmem_access_poly
+
+.. autofunction:: sum_mem_access_to_bytes
+
+.. autofunction:: get_barrier_poly
+
+"""
+
+
 # {{{ ToCountMap
 
 class ToCountMap:
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 7adab80c68c38f900976eb1adcd90226f40a7d9b..b887c703420d092d7f3c0fc9c729dd1d1f942a76 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -79,7 +79,20 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_reduction(self, expr, *args):
-        return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args))
+        mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames]
+
+        new_inames = []
+        for iname, new_sym_iname in zip(expr.inames, mapped_inames):
+            if not isinstance(new_sym_iname, Variable):
+                from loopy.diagnostic import LoopyError
+                raise LoopyError("%s did not map iname '%s' to a variable"
+                        % (type(self).__name__, iname))
+
+            new_inames.append(new_sym_iname.name)
+
+        return Reduction(
+                expr.operation, tuple(new_inames), self.rec(expr.expr, *args),
+                allow_simultaneous=expr.allow_simultaneous)
 
     def map_tagged_variable(self, expr, *args):
         # leaf, doesn't change
@@ -146,7 +159,8 @@ class StringifyMapper(StringifyMapperBase):
         return "loc.%d" % expr.index
 
     def map_reduction(self, expr, prec):
-        return "reduce(%s, [%s], %s)" % (
+        return "%sreduce(%s, [%s], %s)" % (
+                "simul_" if expr.allow_simultaneous else "",
                 expr.operation, ", ".join(expr.inames), expr.expr)
 
     def map_tagged_variable(self, expr, prec):
@@ -346,11 +360,16 @@ class Reduction(AlgebraicLeaf):
 
         The expression (as a :class:`pymbolic.primitives.Expression`)
         on which reduction is performed.
+
+    .. attribute:: allow_simultaneous
+
+        A :class:`bool`. If not *True*, an iname is allowed to be used
+        in precisely one reduction, to avoid mis-nesting errors.
     """
 
-    init_arg_names = ("operation", "inames", "expr")
+    init_arg_names = ("operation", "inames", "expr", "allow_simultaneous")
 
-    def __init__(self, operation, inames, expr):
+    def __init__(self, operation, inames, expr, allow_simultaneous=False):
         if isinstance(inames, str):
             inames = tuple(iname.strip() for iname in inames.split(","))
 
@@ -378,9 +397,10 @@ class Reduction(AlgebraicLeaf):
         self.operation = operation
         self.inames = inames
         self.expr = expr
+        self.allow_simultaneous = allow_simultaneous
 
     def __getinitargs__(self):
-        return (self.operation, self.inames, self.expr)
+        return (self.operation, self.inames, self.expr, self.allow_simultaneous)
 
     def get_hash(self):
         return hash((self.__class__, self.operation, self.inames,
@@ -779,7 +799,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
     turns those into the actual pymbolic primitives used for that.
     """
 
-    def _parse_reduction(self, operation, inames, red_expr):
+    def _parse_reduction(self, operation, inames, red_expr,
+            allow_simultaneous=False):
         if isinstance(inames, Variable):
             inames = (inames,)
 
@@ -795,7 +816,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
             processed_inames.append(iname.name)
 
-        return Reduction(operation, tuple(processed_inames), red_expr)
+        return Reduction(operation, tuple(processed_inames), red_expr,
+                allow_simultaneous=allow_simultaneous)
 
     def map_call(self, expr):
         from loopy.library.reduction import parse_reduction_op
@@ -820,7 +842,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
             else:
                 raise TypeError("cse takes two arguments")
 
-        elif name == "reduce":
+        elif name in ["reduce", "simul_reduce"]:
             if len(expr.parameters) == 3:
                 operation, inames, red_expr = expr.parameters
 
@@ -829,7 +851,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
                             "must be a symbol")
 
                 operation = parse_reduction_op(operation.name)
-                return self._parse_reduction(operation, inames, self.rec(red_expr))
+                return self._parse_reduction(operation, inames, self.rec(red_expr),
+                        allow_simultaneous=(name == "simul_reduce"))
             else:
                 raise TypeError("invalid 'reduce' calling sequence")
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index ee28594a50ce8486702e13f0ee9bf01debb4f859..85e58a809e51ff8067c9effb59240fb5125b46db 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -24,9 +24,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autoclass:: TargetBase
+.. autoclass:: CTarget
+.. autoclass:: CudaTarget
+.. autoclass:: OpenCLTarget
+.. autoclass:: PyOpenCLTarget
+.. autoclass:: ISPCTarget
+
+"""
+
 
 class TargetBase(object):
-    """Objects of this type must be picklable."""
+    """Base class for all targets, i.e. different types of code that
+    loopy can generate.
+
+    Objects of this type must be picklable.
+    """
 
     # {{{ persistent hashing
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 73310c4db7ad590396377d1d757ebf8eb068a8e6..5d6a856d14f57b277e6c083717346bbeb11af7b6 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -56,6 +56,9 @@ def _preamble_generator(kernel, seen_dtypes, seen_functions):
 
 
 class CTarget(TargetBase):
+    """A target for plain "C", without any parallel extensions.
+    """
+
     hash_fields = TargetBase.hash_fields + ("fortran_abi",)
     comparison_fields = TargetBase.comparison_fields + ("fortran_abi",)
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 94a144cb972b10cd7a1d797adbd8d259e7eb64dc..55f8da4d608d7a1b3a1ca8ba960c498586f4d76d 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -162,6 +162,8 @@ class LoopyCudaCCodeMapper(LoopyCCodeMapper):
 # {{{ target
 
 class CudaTarget(CTarget):
+    """A target for Nvidia's CUDA GPU programming language."""
+
     def __init__(self, extern_c=True):
         """
         :arg extern_c: If *True*, declare kernels using "extern C" to
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 737ee0d998ce5b330697d56228183e0abe5e0137..b0b1c247ec78a29b75890b0e9f04448116b97c4e 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -30,6 +30,8 @@ from loopy.target.c import CTarget
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from loopy.diagnostic import LoopyError
 
+from pytools import memoize_method
+
 
 # {{{ expression mapper
 
@@ -54,7 +56,35 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
 # }}}
 
 
+# {{{ type registry
+
+def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True):
+    reg.get_or_register_dtype("bool", np.bool)
+
+    reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8)
+    reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8)
+    reg.get_or_register_dtype(["int16", "short", "signed short",
+        "signed short int", "short signed int"], np.int16)
+    reg.get_or_register_dtype(["uint16", "unsigned short",
+        "unsigned short int", "short unsigned int"], np.uint16)
+    reg.get_or_register_dtype(["int32", "int", "signed int"], np.int32)
+    reg.get_or_register_dtype(["uint32", "unsigned", "unsigned int"], np.uint32)
+
+    reg.get_or_register_dtype(["int64"], np.int64)
+    reg.get_or_register_dtype(["uint64"], np.uint64)
+
+    reg.get_or_register_dtype("float", np.float32)
+    reg.get_or_register_dtype("double", np.float64)
+
+# }}}
+
+
 class ISPCTarget(CTarget):
+    """A code generation target for Intel's `ISPC <https://ispc.github.io/>`_
+    SPMD programming language, to target Intel's Knights-family hardware and modern
+    Intel CPUs with wide vector units.
+    """
+
     def __init__(self, occa_mode=False):
         """
         :arg occa_mode: Whether to modify the generated call signature to
@@ -64,6 +94,18 @@ class ISPCTarget(CTarget):
 
         super(ISPCTarget, self).__init__()
 
+    # {{{ types
+
+    @memoize_method
+    def get_dtype_registry(self):
+        from loopy.target.c.compyte.dtypes import DTypeRegistry
+        result = DTypeRegistry()
+        fill_registry_with_ispc_types(result, respect_windows=False,
+                include_bool=True)
+        return result
+
+    # }}}
+
     # {{{ top-level codegen
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
@@ -116,18 +158,26 @@ class ISPCTarget(CTarget):
         from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
         ccm = self.get_expression_to_code_mapper(codegen_state)
 
-        wrapper_body.extend([
-                S("assert(programCount == %s)"
-                    % ccm(lsize[0], PREC_COMPARISON)),
-                S("launch[%s] %s(%s)"
+        if lsize:
+            wrapper_body.append(
+                    S("assert(programCount == %s)"
+                        % ccm(lsize[0], PREC_COMPARISON)))
+
+        if gsize:
+            launch_spec = "[%s]" % ", ".join(
+                                ccm(gs_i, PREC_NONE)
+                                for gs_i in gsize)
+        else:
+            launch_spec = ""
+
+        wrapper_body.append(
+                S("launch%s %s(%s)"
                     % (
-                        ", ".join(
-                            ccm(gs_i, PREC_NONE)
-                            for gs_i in gsize),
+                        launch_spec,
                         inner_name,
                         ", ".join(arg_names)
                         ))
-                ])
+                )
 
         wrapper_fbody = FunctionBody(
                 ISPCExport(
@@ -176,11 +226,8 @@ class ISPCTarget(CTarget):
             raise LoopyError("unknown barrier kind")
 
     def wrap_temporary_decl(self, decl, is_local):
-        from cgen.ispc import ISPCUniform, ISPCVarying
-        if is_local:
-            return ISPCUniform(decl)
-        else:
-            return ISPCVarying(decl)
+        from cgen.ispc import ISPCUniform
+        return ISPCUniform(decl)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index a009e93360016128f69c85fee2a555cd016a5ab1..cfdc8620bb4a383e4d48f004a2c682ba8e495a05 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -189,6 +189,9 @@ class LoopyOpenCLCCodeMapper(LoopyCCodeMapper):
 # {{{ target
 
 class OpenCLTarget(CTarget):
+    """A target for the OpenCL C heterogeneous compute programming language.
+    """
+
     # {{{ library
 
     def function_manglers(self):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index d13384534c70df602785d4189739a7bc86ed37db..2947fdc6c5d17e919bdcc329f6243376b00a9969 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -31,12 +31,6 @@ import numpy as np
 
 from loopy.target.opencl import OpenCLTarget
 
-import pyopencl as cl
-import pyopencl.characterize as cl_char
-
-# This ensures the dtype registry is populated.
-import pyopencl.tools  # noqa
-
 import logging
 logger = logging.getLogger(__name__)
 
@@ -44,6 +38,9 @@ logger = logging.getLogger(__name__)
 # {{{ temp storage adjust for bank conflict
 
 def adjust_local_temp_var_storage(kernel, device):
+    import pyopencl as cl
+    import pyopencl.characterize as cl_char
+
     logger.debug("%s: adjust temp var storage" % kernel.name)
 
     new_temp_vars = {}
@@ -244,9 +241,21 @@ class _LegacyTypeRegistryStub(object):
         from pyopencl.compyte.dtypes import dtype_to_ctype
         return dtype_to_ctype(dtype)
 
+# }}}
+
+
+# {{{ target
 
 class PyOpenCLTarget(OpenCLTarget):
+    """A code generation target that takes special advantage of :mod:`pyopencl`
+    features such as run-time knowledge of the target device (to generate
+    warnings) and support for complex numbers.
+    """
+
     def __init__(self, device=None):
+        # This ensures the dtype registry is populated.
+        import pyopencl.tools  # noqa
+
         super(PyOpenCLTarget, self).__init__()
 
         self.device = device
diff --git a/loopy/tools.py b/loopy/tools.py
index 55b177bda4e6be03a985286fd4faf6322e257824..777532e7af92bc62e6878f564c8c5545f4cb2c4a 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -233,6 +233,119 @@ def remove_common_indentation(code, require_leading_newline=True,
 # }}}
 
 
+# {{{ build_ispc_shared_lib
+
+# DO NOT RELY ON THESE: THEY WILL GO AWAY
+
+def build_ispc_shared_lib(
+        cwd, ispc_sources, cxx_sources,
+        ispc_options=[], cxx_options=[],
+        ispc_bin="ispc",
+        cxx_bin="g++",
+        quiet=True):
+    from os.path import join
+
+    ispc_source_names = []
+    for name, contents in ispc_sources:
+        ispc_source_names.append(name)
+
+        with open(join(cwd, name), "w") as srcf:
+            srcf.write(contents)
+
+    cxx_source_names = []
+    for name, contents in cxx_sources:
+        cxx_source_names.append(name)
+
+        with open(join(cwd, name), "w") as srcf:
+            srcf.write(contents)
+
+    from subprocess import check_call
+
+    ispc_cmd = ([ispc_bin,
+                "--pic",
+                "-o", "ispc.o"]
+            + ispc_options
+            + list(ispc_source_names))
+    if not quiet:
+        print(" ".join(ispc_cmd))
+
+    check_call(ispc_cmd, cwd=cwd)
+
+    cxx_cmd = ([
+                cxx_bin,
+                "-shared", "-Wl,--export-dynamic",
+                "-fPIC",
+                "-oshared.so",
+                "ispc.o",
+                ]
+            + cxx_options
+            + list(cxx_source_names))
+
+    check_call(cxx_cmd, cwd=cwd)
+
+    if not quiet:
+        print(" ".join(cxx_cmd))
+
+# }}}
+
+
+# {{{ numpy address munging
+
+# DO NOT RELY ON THESE: THEY WILL GO AWAY
+
+def address_from_numpy(obj):
+    ary_intf = getattr(obj, "__array_interface__", None)
+    if ary_intf is None:
+        raise RuntimeError("no array interface")
+
+    buf_base, is_read_only = ary_intf["data"]
+    return buf_base + ary_intf.get("offset", 0)
+
+
+def cptr_from_numpy(obj):
+    import ctypes
+    return ctypes.c_void_p(address_from_numpy(obj))
+
+
+# https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172
+def empty_aligned(shape, dtype, order='C', n=64):
+    '''empty_aligned(shape, dtype='float64', order='C', n=64)
+    Function that returns an empty numpy array that is n-byte aligned,
+    where ``n`` is determined by inspecting the CPU if it is not
+    provided.
+    The alignment is given by the final optional argument, ``n``. If
+    ``n`` is not provided then this function will inspect the CPU to
+    determine alignment. The rest of the arguments are as per
+    :func:`numpy.empty`.
+    '''
+    itemsize = np.dtype(dtype).itemsize
+
+    # Apparently there is an issue with numpy.prod wrapping around on 32-bits
+    # on Windows 64-bit. This shouldn't happen, but the following code
+    # alleviates the problem.
+    if not isinstance(shape, (int, np.integer)):
+        array_length = 1
+        for each_dimension in shape:
+            array_length *= each_dimension
+
+    else:
+        array_length = shape
+
+    base_ary = np.empty(array_length*itemsize+n, dtype=np.int8)
+
+    # We now need to know how to offset base_ary
+    # so it is correctly aligned
+    _array_aligned_offset = (n-address_from_numpy(base_ary)) % n
+
+    array = np.frombuffer(
+            base_ary[_array_aligned_offset:_array_aligned_offset-n].data,
+            dtype=dtype).reshape(shape, order=order)
+
+    return array
+
+# }}}
+
+
 def is_interned(s):
     return s is None or intern(s) is s
 
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index a6830092910dcc89eefcc337961c3b215831fb41..d41222c26056300729b0c4005200ba6ea904010d 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -25,108 +25,9 @@ THE SOFTWARE.
 
 import six
 
-from loopy.symbolic import (RuleAwareIdentityMapper,
-        SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
 
 
-# {{{ split_reduction
-
-class _ReductionSplitter(RuleAwareIdentityMapper):
-    def __init__(self, rule_mapping_context, within, inames, direction):
-        super(_ReductionSplitter, self).__init__(
-                rule_mapping_context)
-
-        self.within = within
-        self.inames = inames
-        self.direction = direction
-
-    def map_reduction(self, expr, expn_state):
-        if set(expr.inames) & set(expn_state.arg_context):
-            # FIXME
-            raise NotImplementedError()
-
-        if (self.inames <= set(expr.inames)
-                and self.within(
-                    expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
-            leftover_inames = set(expr.inames) - self.inames
-
-            from loopy.symbolic import Reduction
-            if self.direction == "in":
-                return Reduction(expr.operation, tuple(leftover_inames),
-                        Reduction(expr.operation, tuple(self.inames),
-                            self.rec(expr.expr, expn_state)))
-            elif self.direction == "out":
-                return Reduction(expr.operation, tuple(self.inames),
-                        Reduction(expr.operation, tuple(leftover_inames),
-                            self.rec(expr.expr, expn_state)))
-            else:
-                assert False
-        else:
-            return super(_ReductionSplitter, self).map_reduction(expr, expn_state)
-
-
-def _split_reduction(kernel, inames, direction, within=None):
-    if direction not in ["in", "out"]:
-        raise ValueError("invalid value for 'direction': %s" % direction)
-
-    if isinstance(inames, str):
-        inames = inames.split(",")
-    inames = set(inames)
-
-    from loopy.context_matching import parse_stack_match
-    within = parse_stack_match(within)
-
-    rule_mapping_context = SubstitutionRuleMappingContext(
-            kernel.substitutions, kernel.get_var_name_generator())
-    rsplit = _ReductionSplitter(rule_mapping_context,
-            within, inames, direction)
-    return rule_mapping_context.finish_kernel(
-            rsplit.map_kernel(kernel))
-
-
-def split_reduction_inward(kernel, inames, within=None):
-    """Takes a reduction of the form::
-
-        sum([i,j,k], ...)
-
-    and splits it into two nested reductions::
-
-        sum([j,k], sum([i], ...))
-
-    In this case, *inames* would have been ``"i"`` indicating that
-    the iname ``i`` should be made the iname governing the inner reduction.
-
-    :arg inames: A list of inames, or a comma-separated string that can
-        be parsed into those
-    """
-
-    return _split_reduction(kernel, inames, "in", within)
-
-
-def split_reduction_outward(kernel, inames, within=None):
-    """Takes a reduction of the form::
-
-        sum([i,j,k], ...)
-
-    and splits it into two nested reductions::
-
-        sum([i], sum([j,k], ...))
-
-    In this case, *inames* would have been ``"i"`` indicating that
-    the iname ``i`` should be made the iname governing the outer reduction.
-
-    :arg inames: A list of inames, or a comma-separated string that can
-        be parsed into those
-    """
-
-    return _split_reduction(kernel, inames, "out", within)
-
-# }}}
-
-
 # {{{ fold constants
 
 def fold_constants(kernel):
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 59239104e43725c6ea5bf572eda7e79d080427a7..967e14de692ee96b0c01d2ea5bcf8b411890038b 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -29,21 +29,27 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont
 from loopy.kernel.data import ValueArg, GlobalArg
 import islpy as isl
 
+__doc__ = """
+.. autofunction:: to_batched
+"""
+
 
 # {{{ to_batched
 
 class _BatchVariableChanger(RuleAwareIdentityMapper):
     def __init__(self, rule_mapping_context, kernel, batch_varying_args,
-            batch_iname_expr):
+            batch_iname_expr, sequential):
         super(_BatchVariableChanger, self).__init__(rule_mapping_context)
 
         self.kernel = kernel
         self.batch_varying_args = batch_varying_args
         self.batch_iname_expr = batch_iname_expr
+        self.sequential = sequential
 
     def needs_batch_subscript(self, name):
         return (
-                name in self.kernel.temporary_variables
+                (not self.sequential
+                    and name in self.kernel.temporary_variables)
                 or
                 name in self.batch_varying_args)
 
@@ -64,14 +70,18 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
         return expr.aggregate[self.batch_iname_expr]
 
 
-def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"):
+def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
+        sequential=False):
     """Takes in a kernel that carries out an operation and returns a kernel
     that carries out a batch of these operations.
 
     :arg nbatches: the number of batches. May be a constant non-negative
         integer or a string, which will be added as an integer argument.
-    :arg batch_varying_args: a list of argument names that depend vary per-batch.
+    :arg batch_varying_args: a list of argument names that vary per-batch.
         Each such variable will have a batch index added.
+    :arg sequential: A :class:`bool`. If *True*, do not duplicate
+        temporary variables for each batch. This automatically tags the batch
+        iname for sequential execution.
     """
 
     from pymbolic import var
@@ -110,26 +120,32 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"):
 
         new_args.append(arg)
 
-    new_temps = {}
-
-    for temp in six.itervalues(knl.temporary_variables):
-        new_temps[temp.name] = temp.copy(
-                shape=(nbatches_expr,) + temp.shape,
-                dim_tags=("c",) * (len(arg.shape) + 1))
-
     knl = knl.copy(
             domains=new_domains,
-            args=new_args,
-            temporary_variables=new_temps)
+            args=new_args)
+
+    if not sequential:
+        new_temps = {}
+
+        for temp in six.itervalues(knl.temporary_variables):
+            new_temps[temp.name] = temp.copy(
+                    shape=(nbatches_expr,) + temp.shape,
+                    dim_tags=("c",) * (len(arg.shape) + 1))
+
+        knl = knl.copy(temporary_variables=new_temps)
+    else:
+        import loopy as lp
+        from loopy.kernel.data import ForceSequentialTag
+        knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())])
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             knl.substitutions, vng)
     bvc = _BatchVariableChanger(rule_mapping_context,
-            knl, batch_varying_args, batch_iname_expr)
+            knl, batch_varying_args, batch_iname_expr,
+            sequential=sequential)
     return rule_mapping_context.finish_kernel(
             bvc.map_kernel(knl))
 
-
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 64332637910340d68cb035d64ad6f4f643c0b5c9..7b1deb7951392e2e0c46360f8fd979ebf5aedb37 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -353,6 +353,26 @@ def remove_unused_arguments(knl):
     for insn in exp_knl.instructions:
         refd_vars.update(insn.dependency_names())
 
+    from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
+    from loopy.symbolic import get_dependencies
+    from itertools import chain
+
+    def tolerant_get_deps(expr):
+        if expr is None or expr is lp.auto:
+            return set()
+        return get_dependencies(expr)
+
+    for ary in chain(knl.args, six.itervalues(knl.temporary_variables)):
+        if isinstance(ary, ArrayBase):
+            refd_vars.update(
+                    tolerant_get_deps(ary.shape)
+                    | tolerant_get_deps(ary.offset))
+
+            for dim_tag in ary.dim_tags:
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    refd_vars.update(
+                            tolerant_get_deps(dim_tag.stride))
+
     for arg in knl.args:
         if arg.name in refd_vars:
             new_args.append(arg)
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index bf435d3fe08d022790bf31a4d583d4923f0bfeff..e44f8abe227d451e8e940708530f6c20566685e8 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb):
 
     from pymbolic.imperative.transform import \
             fuse_instruction_streams_with_unique_ids
-    new_instructions, _ = fuse_instruction_streams_with_unique_ids(
-            knla.instructions, knlb.instructions)
+    new_instructions, old_b_id_to_new_b_id = \
+            fuse_instruction_streams_with_unique_ids(
+                    knla.instructions, knlb.instructions)
 
     # {{{ fuse assumptions
 
@@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb):
                 "target",
                 knla.target,
                 knlb.target),
-            options=knla.options)
+            options=knla.options), old_b_id_to_new_b_id
 
 # }}}
 
 
-def fuse_kernels(kernels, suffixes=None):
+def fuse_kernels(kernels, suffixes=None, data_flow=None):
     """Return a kernel that performs all the operations in all entries
     of *kernels*.
 
@@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None):
     :arg suffixes: If given, must be a list of strings of a length matching
         that of *kernels*. This will be used to disambiguate the names
         of temporaries, as described below.
+    :arg data_flow: A list of data dependencies
+        ``[(var_name, from_kernel, to_kernel), ...]``.
+        Based on this, the fuser will create dependencies between all
+        writers of *var_name* in ``kernels[from_kernel]`` to
+        readers of *var_name* in ``kernels[to_kernel]``.
 
     The components of the kernels are fused as follows:
 
@@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None):
     *   The resulting kernel will contain all instructions from each entry
         of *kernels*. Clashing instruction IDs will be renamed to ensure
         uniqueness.
+
+    .. versionchanged:: 2016.2
+
+        *data_flow* was added in version 2016.2
     """
     kernels = list(kernels)
 
+    if data_flow is None:
+        data_flow = []
+
     if suffixes:
         suffixes = list(suffixes)
         if len(suffixes) != len(kernels):
@@ -356,9 +369,46 @@ def fuse_kernels(kernels, suffixes=None):
 
         # }}}
 
-    result = kernels.pop(0)
-    while kernels:
-        result = _fuse_two_kernels(result, kernels.pop(0))
+    kernel_insn_ids = []
+    result = None
+
+    for knlb in kernels:
+        if result is None:
+            result = knlb
+            kernel_insn_ids.append([
+                insn.id for insn in knlb.instructions])
+        else:
+            result, old_b_id_to_new_b_id = _fuse_two_kernels(
+                    knla=result,
+                    knlb=knlb)
+
+            kernel_insn_ids.append([
+                old_b_id_to_new_b_id[insn.id]
+                for insn in knlb.instructions])
+
+    # {{{ realize data_flow dependencies
+
+    id_to_insn = result.id_to_insn.copy()
+
+    for var_name, from_kernel, to_kernel in data_flow:
+        from_writer_ids = frozenset(
+                insn_id
+                for insn_id in kernel_insn_ids[from_kernel]
+                if var_name in id_to_insn[insn_id].assignee_var_names())
+
+        for insn_id in kernel_insn_ids[to_kernel]:
+            insn = id_to_insn[insn_id]
+            if var_name in insn.read_dependency_names():
+                insn = insn.copy(depends_on=insn.depends_on | from_writer_ids)
+
+            id_to_insn[insn_id] = insn
+
+    result = result.copy(instructions=[
+            id_to_insn[insn_id]
+            for insn_ids in kernel_insn_ids
+            for insn_id in insn_ids])
+
+    # }}}
 
     return result
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 8cf16bfd330b3e425023573788bc68bc8a7275a2..c317f2a4fbd186ccf069da74118504249956b4bc 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -35,25 +35,40 @@ from loopy.symbolic import (
 from loopy.diagnostic import LoopyError
 
 
-# {{{ assume
+__doc__ = """
+.. currentmodule:: loopy
 
-def assume(kernel, assumptions):
-    if isinstance(assumptions, str):
-        assumptions_set_str = "[%s] -> { : %s}" \
-                % (",".join(s for s in kernel.outer_params()),
-                    assumptions)
-        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
-                assumptions_set_str)
+.. autofunction:: split_iname
 
-    if not isinstance(assumptions, isl.BasicSet):
-        raise TypeError("'assumptions' must be a BasicSet or a string")
+.. autofunction:: chunk_iname
 
-    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)
+.. autofunction:: join_inames
 
-    return kernel.copy(
-            assumptions=old_assumptions.params() & new_assumptions.params())
+.. autofunction:: tag_inames
 
-# }}}
+.. autofunction:: duplicate_inames
+
+.. undocumented .. autofunction:: link_inames
+
+.. autofunction:: rename_iname
+
+.. autofunction:: remove_unused_inames
+
+.. autofunction:: set_loop_priority
+
+.. autofunction:: split_reduction_inward
+
+.. autofunction:: split_reduction_outward
+
+.. autofunction:: affine_map_inames
+
+.. autofunction:: realize_ilp
+
+.. autofunction:: find_unused_axis_tag
+
+.. autofunction:: make_reduction_inames_unique
+
+"""
 
 
 # {{{ set loop priority
@@ -106,7 +121,8 @@ class _InameSplitter(RuleAwareIdentityMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, tuple(new_inames),
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameSplitter, self).map_reduction(expr, expn_state)
 
@@ -269,8 +285,21 @@ def split_iname(kernel, split_iname, inner_length,
         within=None):
     """Split *split_iname* into two inames (an 'inner' one and an 'outer' one)
     so that ``split_iname == inner + outer*inner_length`` and *inner* is of
-    fixed length *inner_length*.
-
+    constant length *inner_length*.
+
+    :arg outer_iname: The new iname to use for the 'inner' (fixed-length)
+        loop. Defaults to a name derived from ``split_iname + "_outer"``
+    :arg inner_iname: The new iname to use for the 'inner' (fixed-length)
+        loop. Defaults to a name derived from ``split_iname + "_inner"``
+    :arg inner_length: a positive integer
+    :arg slabs:
+        A tuple ``(head_it_count, tail_it_count)`` indicating the
+        number of leading/trailing iterations of *outer_iname*
+        for which separate code should be generated.
+    :arg outer_tag: The iname tag (see :ref:`iname-tag`) to apply to
+        *outer_iname*.
+    :arg inner_tag: The iname tag (see :ref:`iname-tag`) to apply to
+        *inner_iname*.
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
     """
@@ -418,7 +447,8 @@ class _InameJoiner(RuleAwareSubstitutionMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, tuple(new_inames),
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameJoiner, self).map_reduction(expr, expn_state)
 
@@ -544,7 +574,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
 def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
     """Tag an iname
 
-    :arg iname_to_tag: a list of tuple ``(iname, new_tag)``. *new_tag* is given
+    :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given
         as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or
         as a string as shown in :ref:`iname-tags`. May also be a dictionary
         for backwards compatibility.
@@ -650,7 +680,8 @@ class _InameDuplicator(RuleAwareIdentityMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, new_inames,
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameDuplicator, self).map_reduction(expr, expn_state)
 
@@ -1021,6 +1052,106 @@ def remove_unused_inames(knl, inames=None):
 # }}}
 
 
+# {{{ split_reduction
+
+class _ReductionSplitter(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, within, inames, direction):
+        super(_ReductionSplitter, self).__init__(
+                rule_mapping_context)
+
+        self.within = within
+        self.inames = inames
+        self.direction = direction
+
+    def map_reduction(self, expr, expn_state):
+        if set(expr.inames) & set(expn_state.arg_context):
+            # FIXME
+            raise NotImplementedError()
+
+        if (self.inames <= set(expr.inames)
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
+            leftover_inames = set(expr.inames) - self.inames
+
+            from loopy.symbolic import Reduction
+            if self.direction == "in":
+                return Reduction(expr.operation, tuple(leftover_inames),
+                        Reduction(expr.operation, tuple(self.inames),
+                            self.rec(expr.expr, expn_state),
+                            expr.allow_simultaneous),
+                        expr.allow_simultaneous)
+            elif self.direction == "out":
+                return Reduction(expr.operation, tuple(self.inames),
+                        Reduction(expr.operation, tuple(leftover_inames),
+                            self.rec(expr.expr, expn_state),
+                            expr.allow_simultaneous))
+            else:
+                assert False
+        else:
+            return super(_ReductionSplitter, self).map_reduction(expr, expn_state)
+
+
+def _split_reduction(kernel, inames, direction, within=None):
+    if direction not in ["in", "out"]:
+        raise ValueError("invalid value for 'direction': %s" % direction)
+
+    if isinstance(inames, str):
+        inames = inames.split(",")
+    inames = set(inames)
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    rsplit = _ReductionSplitter(rule_mapping_context,
+            within, inames, direction)
+    return rule_mapping_context.finish_kernel(
+            rsplit.map_kernel(kernel))
+
+
+def split_reduction_inward(kernel, inames, within=None):
+    """Takes a reduction of the form::
+
+        sum([i,j,k], ...)
+
+    and splits it into two nested reductions::
+
+        sum([j,k], sum([i], ...))
+
+    In this case, *inames* would have been ``"i"`` indicating that
+    the iname ``i`` should be made the iname governing the inner reduction.
+
+    :arg inames: A list of inames, or a comma-separated string that can
+        be parsed into those
+    """
+
+    return _split_reduction(kernel, inames, "in", within)
+
+
+def split_reduction_outward(kernel, inames, within=None):
+    """Takes a reduction of the form::
+
+        sum([i,j,k], ...)
+
+    and splits it into two nested reductions::
+
+        sum([i], sum([j,k], ...))
+
+    In this case, *inames* would have been ``"i"`` indicating that
+    the iname ``i`` should be made the iname governing the outer reduction.
+
+    :arg inames: A list of inames, or a comma-separated string that can
+        be parsed into those
+    """
+
+    return _split_reduction(kernel, inames, "out", within)
+
+# }}}
+
+
 # {{{ affine map inames
 
 def affine_map_inames(kernel, old_inames, new_inames, equations):
@@ -1259,4 +1390,125 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
 
 # }}}
 
+
+# {{{ separate_loop_head_tail_slab
+
+# undocumented, because not super-useful
+def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count):
+    """Mark *iname* so that the separate code is generated for
+    the lower *head_it_count* and the upper *tail_it_count*
+    iterations of the loop on *iname*.
+    """
+
+    iname_slab_increments = kernel.iname_slab_increments.copy()
+    iname_slab_increments[iname] = (head_it_count, tail_it_count)
+
+    return kernel.copy(iname_slab_increments=iname_slab_increments)
+
+# }}}
+
+
+# {{{ make_reduction_inames_unique
+
+class _ReductionInameUniquifier(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, inames, within):
+        super(_ReductionInameUniquifier, self).__init__(rule_mapping_context)
+
+        self.inames = inames
+        self.old_to_new = []
+        self.within = within
+
+        self.iname_to_red_count = {}
+        self.iname_to_nonsimultaneous_red_count = {}
+
+    def map_reduction(self, expr, expn_state):
+        within = self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)
+
+        for iname in expr.inames:
+            self.iname_to_red_count[iname] = (
+                    self.iname_to_red_count.get(iname, 0) + 1)
+            if not expr.allow_simultaneous:
+                self.iname_to_nonsimultaneous_red_count[iname] = (
+                    self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1)
+
+        if within and not expr.allow_simultaneous:
+            subst_dict = {}
+
+            from pymbolic import var
+
+            new_inames = []
+            for iname in expr.inames:
+                if (
+                        not (self.inames is None or iname in self.inames)
+                        or
+                        self.iname_to_red_count[iname] <= 1):
+                    new_inames.append(iname)
+                    continue
+
+                new_iname = self.rule_mapping_context.make_unique_var_name(iname)
+                subst_dict[iname] = var(new_iname)
+                self.old_to_new.append((iname, new_iname))
+                new_inames.append(new_iname)
+
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+
+            from loopy.symbolic import Reduction
+            return Reduction(expr.operation, tuple(new_inames),
+                    self.rec(
+                        SubstitutionMapper(make_subst_func(subst_dict))(
+                            expr.expr),
+                        expn_state),
+                    expr.allow_simultaneous)
+        else:
+            return super(_ReductionInameUniquifier, self).map_reduction(
+                    expr, expn_state)
+
+
+def make_reduction_inames_unique(kernel, inames=None, within=None):
+    """
+    :arg inames: if not *None*, only apply to these inames
+    :arg within: a stack match as understood by
+        :func:`loopy.context_matching.parse_stack_match`.
+
+    .. versionadded:: 2016.2
+    """
+
+    name_gen = kernel.get_var_name_generator()
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    # {{{ change kernel
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, name_gen)
+    r_uniq = _ReductionInameUniquifier(rule_mapping_context,
+            inames, within=within)
+
+    kernel = rule_mapping_context.finish_kernel(
+            r_uniq.map_kernel(kernel))
+
+    # }}}
+
+    # {{{ duplicate the inames
+
+    for old_iname, new_iname in r_uniq.old_to_new:
+        from loopy.kernel.tools import DomainChanger
+        domch = DomainChanger(kernel, frozenset([old_iname]))
+
+        from loopy.isl_helpers import duplicate_axes
+        kernel = kernel.copy(
+                domains=domch.get_domains_with(
+                    duplicate_axes(domch.domain, [old_iname], [new_iname])))
+
+    # }}}
+
+    return kernel
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py
index 1567263cddc8e199f93652ef4c39ef737b86fb3f..f7600b212cbf4db6b58c91bb3603f5a310c6b2a6 100644
--- a/loopy/transform/parameter.py
+++ b/loopy/transform/parameter.py
@@ -28,6 +28,42 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext)
 import islpy as isl
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autofunction:: fix_parameters
+
+.. autofunction:: assume
+"""
+
+
+# {{{ assume
+
+def assume(kernel, assumptions):
+    """Include an assumption about :ref:`domain-parameters` in the kernel, e.g.
+    `n mod 4 = 0`.
+
+    :arg assumptions: a :class:`islpy.BasicSet` or a string representation of
+        the assumptions in :ref:`isl-syntax`.
+    """
+    if isinstance(assumptions, str):
+        assumptions_set_str = "[%s] -> { : %s}" \
+                % (",".join(s for s in kernel.outer_params()),
+                    assumptions)
+        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
+                assumptions_set_str)
+
+    if not isinstance(assumptions, isl.BasicSet):
+        raise TypeError("'assumptions' must be a BasicSet or a string")
+
+    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)
+
+    return kernel.copy(
+            assumptions=old_assumptions.params() & new_assumptions.params())
+
+# }}}
+
 
 # {{{ fix_parameter
 
@@ -99,6 +135,13 @@ def _fix_parameter(kernel, name, value):
 
 
 def fix_parameters(kernel, **value_dict):
+    """Fix the values of the arguments to specific constants.
+
+    *value_dict* consists of *name*/*value* pairs, where *name* will be fixed
+    to be *value*. *name* may refer to :ref:`domain-parameters` or
+    :ref:`arguments`.
+    """
+
     for name, value in six.iteritems(value_dict):
         kernel = _fix_parameter(kernel, name, value)
 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 7623fb8911e2a50b08309c0c6cb45a0dd372bfbb..e599c902227faf8d1292ece2307d097bc8fd7c19 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -289,7 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper):
 def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
         force_retain_argument=False):
     """Extract an assignment (to a temporary variable or an argument)
-    as a :ref:`substituiton-rule`. The temporary may be an array, in
+    as a :ref:`substitution-rule`. The temporary may be an array, in
     which case the array indices will become arguments to the substitution
     rule.
 
@@ -349,7 +349,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
 
     usage_to_definition = {}
 
-    for insn in kernel.instructions:
+    for insn in dep_kernel.instructions:
         if lhs_name not in insn.read_dependency_names():
             continue
 
diff --git a/loopy/version.py b/loopy/version.py
index 9ad8ac19bebff7a712e91900815057155205ae57..adc069663503b200bcdd1638c05ae0ffae5f14df 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v18-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 4fbb4e142b0680e28e88282409655c5a644a6f4e..5ed095315234339709309a1c55ec88c7fdab6bfa 100644
--- a/setup.py
+++ b/setup.py
@@ -40,8 +40,9 @@ setup(name="loo.py",
           "pytools>=2016.1",
           "pymbolic>=2016.2",
           "cgen>=2016.1",
-          "islpy>=2016.1",
+          "islpy>=2016.1.2",
           "six>=1.8.0",
+          "colorama",
           ],
 
       extras_require={
diff --git a/test/test_dg.py b/test/test_dg.py
index 63a961423d2f750a4c9a25fdcb5fb56a479d8a35..fafef86c35211183ebdaeb75acf2b664a36586a0 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -52,10 +52,10 @@ def test_dg_volume(ctx_factory):
             "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
             ],
             """
-                <> du_drst = sum(m, DrDsDt[n,m]*u[k,m])
-                <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m])
-                <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m])
-                <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m])
+                <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m])
+                <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m])
+                <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m])
+                <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m])
 
                 # volume flux
                 rhsu[k,n] = dot(drst_dx[k],dp_drst)
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9c6803e93f2e53a0c071e0372bf71256854de38a..6aeec63c49a72c784ad5cccf1ee7acc1fcae0f2a 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -30,6 +30,8 @@ import pyopencl as cl
 import pyopencl.array as cl_array
 import loopy as lp
 
+import logging
+
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
@@ -61,12 +63,16 @@ def check_float4(result, ref_result):
 
 
 def test_axpy(ctx_factory):
+    logging.basicConfig(level="INFO")
     ctx = ctx_factory()
 
     n = 3145182
 
     vec = cl_array.vec
 
+    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
+        pytest.skip("crashes on AMD 15.12")
+
     for dtype, check, a, b in [
             (np.complex64, None, 5, 7),
             (vec.float4, check_float4,
@@ -111,7 +117,8 @@ def test_axpy(ctx_factory):
             lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                     op_count=[np.dtype(dtype).itemsize*n*3/1e9],
                     op_label=["GBytes"],
-                    parameters={"a": a, "b": b, "n": n}, check_result=check)
+                    parameters={"a": a, "b": b, "n": n}, check_result=check,
+                    blacklist_ref_vendors=["Advanced Micro"])
 
 
 def test_transpose(ctx_factory):
@@ -463,7 +470,7 @@ def test_magma_fermi_matrix_mul(ctx_factory):
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
             op_count=[2*n**3/1e9], op_label=["GFlops"],
-            parameters={})
+            parameters={}, blacklist_ref_vendors="pocl")
 
 
 def test_image_matrix_mul(ctx_factory):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 667d7365d412d9be6d9a51c8d63c9470ffede2f9..606eec7667ca3d76a215c3b487e8c93bc371c36e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -529,6 +529,9 @@ def test_fuzz_code_generator(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
+        pytest.skip("crashes on AMD 15.12")
+
     #from expr_fuzz import get_fuzz_examples
     #for expr, var_values in get_fuzz_examples():
     for expr, var_values in generate_random_fuzz_examples(50):
@@ -941,6 +944,27 @@ def test_double_sum(ctx_factory):
 
     n = 20
 
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            [
+                "a = simul_reduce(sum, (i,j), i*j)",
+                "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))",
+                ],
+            assumptions="n>=1")
+
+    evt, (a, b) = knl(queue, n=n)
+
+    ref = sum(i*j for i in range(n) for j in range(n))
+    assert a.get() == ref
+    assert b.get() == ref
+
+
+def test_double_sum_made_unique(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 20
+
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
             [
@@ -949,9 +973,10 @@ def test_double_sum(ctx_factory):
                 ],
             assumptions="n>=1")
 
-    cknl = lp.CompiledKernel(ctx, knl)
+    knl = lp.make_reduction_inames_unique(knl)
+    print(knl)
 
-    evt, (a, b) = cknl(queue, n=n)
+    evt, (a, b) = knl(queue, n=n)
 
     ref = sum(i*j for i in range(n) for j in range(n))
     assert a.get() == ref
@@ -1095,8 +1120,8 @@ def test_arg_guessing_with_reduction(ctx_factory):
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
             """
-                a = 1.5 + sum((i,j), i*j)
-                d = 1.5 + sum((i,j), b[i,j])
+                a = 1.5 + simul_reduce(sum, (i,j), i*j)
+                d = 1.5 + simul_reduce(sum, (i,j), b[i,j])
                 b[i, j] = i*j
                 c[i+j, j] = b[j,i]
                 """,
@@ -1892,19 +1917,22 @@ def test_poisson(ctx_factory):
     sdim = 3
 
     knl = lp.make_kernel(
-            "{ [c,i,j,k,ell,ell2]: \
+            "{ [c,i,j,k,ell,ell2,ell3]: \
             0 <= c < nels and \
             0 <= i < nbf and \
             0 <= j < nbf and \
             0 <= k < nqp and \
-            0 <= ell < sdim and \
-            0 <= ell2 < sdim }",
+            0 <= ell,ell2 < sdim}",
             """
-            dpsi(bf,k0,dir) := sum(ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
-            Ael[c,i,j] = J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
+            dpsi(bf,k0,dir) := \
+                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
+            Ael[c,i,j] = \
+                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
             """,
             assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
 
+    print(knl)
+
     knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
 
     ref_knl = knl
@@ -1975,7 +2003,7 @@ def test_generate_c_snippet():
     u = var("u")
 
     from functools import partial
-    l_sum = partial(lp.Reduction, "sum")
+    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
 
     Instr = lp.Assignment  # noqa
 
@@ -2266,7 +2294,10 @@ def test_finite_difference_expr_subst(ctx_factory):
             lp.GlobalArg("u", shape="n+2"),
             ])
 
-    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl])
+    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
+            data_flow=[
+                ("f", 1, 0)
+                ])
 
     fused_knl = lp.set_options(fused_knl, write_cl=True)
     evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 4350a3878469cb7ccd16ba5f0dda2d287d7f2136..3eacbaa2850b12ab1130a0f4b02ac5698bc9fab9 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -179,7 +179,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
 
     hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
           fetch_bounding_box=True, default_tag="for",
-          init_expression="0")
+          init_expression="0", store_expression="base + buffer")
 
     if opt_level == 5:
         tap_hsv = hsv
@@ -242,7 +242,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
     hsv = hsv.copy(name="horizontalStrongVolumeKernel")
 
     results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
-            do_check=False, quiet=True)
+            quiet=True)
 
     elapsed = results["elapsed_wall"]
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index a00fce1776dd8d0722e59ae544ce439568a66d5d..f4b91b236a16a8a30f2962d92b83780343fe801a 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -47,16 +47,16 @@ def test_tim2d(ctx_factory):
     knl = lp.make_kernel(
             "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
             [
-                "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
-                "us(a,b) := sum(o, D[b,o]*u[e,a,o])",
+                "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
+                "us(a,b) := simul_reduce(sum, o, D[b,o]*u[e,a,o])",
 
                 #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",
 
                 "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
                 "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
                 "lap[e,i,j]  = "
-                "  sum(m, D[m,i]*Gux(m,j))"
-                "+ sum(m, D[m,j]*Guy(i,m))"
+                "  simul_reduce(sum, m, D[m,i]*Gux(m,j))"
+                "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"
 
             ],
             [