diff --git a/doc/conf.py b/doc/conf.py
index fee2a071c6b2e7c54dd415d135414e5c3e75d06a..7a03b0eeff927065439c60743b897befedb42eb6 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -149,7 +149,7 @@ html_static_path = ['_static']
 #html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+html_show_sourcelink = False
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
 #html_show_sphinx = True
@@ -219,4 +219,8 @@ man_pages = [
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'http://docs.python.org/': None}
+intersphinx_mapping = {
+    'http://docs.python.org/': None,
+    'http://documen.tician.de/islpy': None,
+    'http://documen.tician.de/pyopencl': None
+    }
diff --git a/doc/guide.rst b/doc/guide.rst
new file mode 100644
index 0000000000000000000000000000000000000000..026f96c9777cd5807dd950188be189e094e7ad5a
--- /dev/null
+++ b/doc/guide.rst
@@ -0,0 +1,10 @@
+.. _guide:
+
+What can loopy do?
+==================
+
+This will become an example-based guide to what loopy can do.
+
+Loopy's Representation of a Kernel
+----------------------------------
+
diff --git a/doc/index.rst b/doc/index.rst
index 0809b1555d942020ef21d98acd83081748483dac..032b11e5e0ba6d45b568108a0cc47fe4311f72bd 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -1,15 +1,13 @@
-.. loopy documentation master file, created by
-   sphinx-quickstart on Tue Aug  9 13:40:49 2011.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 Welcome to loopy's documentation!
 =================================
 
 Contents:
 
 .. toctree::
-   :maxdepth: 2
+    :maxdepth: 2
+
+    guide
+    reference
 
 Indices and tables
 ==================
diff --git a/doc/reference.rst b/doc/reference.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e657e47ed8586cb48dd21a173496d14b4d3b3fe7
--- /dev/null
+++ b/doc/reference.rst
@@ -0,0 +1,150 @@
+Reference Guide
+===============
+
+.. module:: loopy
+.. moduleauthor:: Andreas Kloeckner <inform@tiker.net>
+
+This guide defines all functionality exposed by loopy. If you would like
+a more gentle introduction, you may consider reading the example-based
+guide :ref:`guide` instead.
+
+.. _tags:
+
+Tags
+----
+
+===================== ====================================================
+Tag                   Meaning
+===================== ====================================================
+`None` | `"for"`      Sequential loop
+`"l.N"`               Local (intra-group) axis N
+`"l.auto"`            Automatically chosen local (intra-group) axis
+`"unr"`               Plain unrolling
+`"ilp"`               Unroll using instruction-level parallelism
+===================== ====================================================
+
+(Throughout this table, `N` must be replaced by an actual number.)
+
+.. _automatic-axes:
+
+Automatic Axis Assignment
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Automatic local axes are chosen as follows:
+
+#. For each instruction containing `"l.auto"` inames:
+    #. Find the lowest-numbered unused axis. If none exists,
+        use sequential unrolling instead.
+    #. Find the iname that has the smallest stride in any global
+        array access occurring in the instruction.
+    #. Assign the low-stride iname to the available axis, splitting
+        the iname if it is too long for the available axis size.
+
+If you need different behavior, use :func:`tag_dimensions` and
+:func:`split_dimension` to change the assignment of `"l.auto"` axes
+manually.
+
+.. _creating-kernels:
+
+Creating Kernels
+----------------
+
+.. _arguments:
+
+Arguments
+^^^^^^^^^
+
+.. autoclass:: ScalarArg
+    :members:
+    :undoc-members:
+
+.. autoclass:: ArrayArg
+    :members:
+    :undoc-members:
+
+.. autoclass:: ConstantArrayArg
+    :members:
+    :undoc-members:
+
+.. autoclass:: ImageArg
+    :members:
+    :undoc-members:
+
+.. _syntax:
+
+String Syntax
+^^^^^^^^^^^^^
+
+* Substitution rules
+
+* Instructions
+
+Kernels
+^^^^^^^
+
+.. autoclass:: LoopKernel
+
+Do not create :class:`LoopKernel` objects directly. Instead, use the following
+function, which takes the same arguments, but does some extra post-processing.
+
+.. autofunction:: make_kernel
+
+Wrangling dimensions
+--------------------
+
+.. autofunction:: split_dimension
+
+.. autofunction:: join_dimensions
+
+.. autofunction:: tag_dimensions
+
+Dealing with Substitution Rules
+-------------------------------
+
+.. autofunction:: extract_subst
+
+.. autofunction:: apply_subst
+
+Precomputation and Prefetching
+------------------------------
+
+.. autofunction:: precompute
+
+.. autofunction:: add_prefetch
+
+    Uses :func:`extract_subst` and :func:`precompute`.
+
+Finishing up
+------------
+
+.. autofunction:: generate_loop_schedules
+
+.. autofunction:: check_kernels
+
+.. autofunction:: generate_code
+
+Automatic Testing
+-----------------
+
+.. autofunction:: auto_test_vs_seq
+
+Troubleshooting
+---------------
+
+Printing :class:`LoopKernel` objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you're confused about things loopy is referring to in an error message or
+about the current state of the :class:`LoopKernel` you are transforming, the
+following always works::
+
+    print kernel
+
+(And it yields a human-readable--albeit terse--representation of *kernel*.)
+
+.. autofunction:: preprocess_kernel
+
+.. autofunction:: get_dot_dependency_graph
+
+Investigating Scheduler Problems
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh
index 22e22844cbc26c38fc6b783a0ddf83e3596f3b63..bcc0a79bed23995780efbf7fced1dfbe8cf306bd 100755
--- a/doc/upload-docs.sh
+++ b/doc/upload-docs.sh
@@ -1,3 +1,3 @@
 #! /bin/sh
 
-rsync --progress --verbose --archive --delete build/html/* buster:doc/loopy
+rsync --progress --verbose --archive --delete _build/html/* buster:doc/loopy
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 67b1b2d2157b5fdc8b15bd56907d5cbb6fe2740f..5d25efdedd0ae710b00ab26d6e771e40cc237473 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,7 +22,7 @@ class LoopyAdvisory(UserWarning):
 
 from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg
 
-from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph
+from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel
 from loopy.subst import extract_subst, apply_subst
 from loopy.cse import precompute
 from loopy.preprocess import preprocess_kernel
@@ -31,7 +31,7 @@ from loopy.codegen import generate_code
 from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq
 from loopy.check import check_kernels
 
-__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg",
+__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", "LoopKernel",
         "get_dot_dependency_graph",
         "preprocess_kernel", "generate_loop_schedules",
         "generate_code",
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 62be45ce016d45529020909781f5793a62f248f7..c397e951f8f35c06f3d4c1f1598c0d4cbe0583de 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -176,7 +176,7 @@ def make_initial_assignments(kernel):
 
 # {{{ main code generation entrypoint
 
-def generate_code(kernel):
+def generate_code(kernel, with_annotation=False):
     from cgen import (FunctionBody, FunctionDeclaration,
             POD, Value, ArrayOf, Module, Block,
             Line, Const, LiteralLines, Initializer)
@@ -187,7 +187,8 @@ def generate_code(kernel):
     from loopy.symbolic import LoopyCCodeMapper, pw_aff_to_expr
 
     ccm = LoopyCCodeMapper(kernel).copy_and_assign_many(
-            make_initial_assignments(kernel))
+            make_initial_assignments(kernel),
+            with_annotation=with_annotation)
 
     mod = Module()
 
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 335395db0ecf5345ff0a411c44b4da2829b56146..0b6c9604b7ae7b4d693155745831f7008aea01fb 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -11,10 +11,10 @@ import numpy as np
 
 class CompiledKernel:
     def __init__(self, context, kernel, size_args=None, options=[],
-             edit_code=False):
+             edit_code=False, with_annotation=False):
         self.kernel = kernel
         from loopy.codegen import generate_code
-        self.code = generate_code(kernel)
+        self.code = generate_code(kernel, with_annotation=with_annotation)
 
         if edit_code:
             from pytools import invoke_editor
@@ -227,7 +227,7 @@ def make_args(queue, kernel, seq_input_arrays, parameters):
 
 def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters,
         print_seq_code=False, print_code=True, warmup_rounds=2, timing_rounds=100,
-        edit_code=False, dump_binary=False):
+        edit_code=False, dump_binary=False, with_annotation=False):
     from time import time
 
     # {{{ set up CL context for sequential run
@@ -263,7 +263,8 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters,
         seq_sched_kernel = knl
         break
 
-    seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel)
+    seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel,
+            with_annotation=with_annotation)
     if print_seq_code:
         print "----------------------------------------------------------"
         print "Sequential Code:"
@@ -302,7 +303,9 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters,
         if args is None:
             args, output_arrays = make_args(queue, kernel, seq_input_arrays, parameters)
 
-        compiled = CompiledKernel(ctx, kernel, edit_code=edit_code)
+        compiled = CompiledKernel(ctx, kernel, edit_code=edit_code,
+                with_annotation=with_annotation)
+
         print "----------------------------------------------------------"
         print "Kernel #%d:" % i
         print "----------------------------------------------------------"
diff --git a/loopy/cse.py b/loopy/cse.py
index 124f462b9c891c5dfcdf68e50d9ee2c3812b6a07..c6150a621b32477984bb590221cbcf4993c4eeb7 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -37,24 +37,45 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 
 
 def get_footprint(kernel, subst_name, old_arg_names, arg_names,
-        sweep_inames, invocation_descriptors):
+        sweep_axes, invocation_descriptors):
     global_footprint_map = None
 
-    # {{{  find sweep inames referenced by arguments
+    # {{{ deal with argument names as sweep inames
 
-    processed_sweep_inames = set()
+    # An argument name as a sweep iname means that *all*
+    # inames contained in *all* uses of the rule will be
+    # made sweep inames.
+
+    sweep_inames = set()
 
     for invdesc in invocation_descriptors:
-        for iname in sweep_inames:
+        for iname in sweep_axes:
             if iname in old_arg_names:
                 arg_idx = old_arg_names.index(iname)
-                processed_sweep_inames.update(
+                sweep_inames.update(
                         get_dependencies(invdesc.args[arg_idx]))
             else:
-                processed_sweep_inames.add(iname)
+                sweep_inames.add(iname)
+
+    sweep_inames = list(sweep_inames)
+
+    # }}}
+
+    # {{{ see if we need extra storage dimensions
+
+    # Realize that by default our storage dimensions are our arguments. If
+    # we're given a sweep iname that no (usage-site) argument depends on, then
+    # this sweep isn't covered in our storage. This necessitates adding an
+    # extra storage dimension.
 
-    sweep_inames = list(processed_sweep_inames)
-    del processed_sweep_inames
+    # find inames used in argument dependencies
+
+    usage_arg_deps = set()
+    for invdesc in invocation_descriptors:
+        for arg in invdesc.args:
+            usage_arg_deps.update(get_dependencies(arg))
+
+    extra_storage_dims = list(set(sweep_inames) - usage_arg_deps)
 
     # }}}
 
@@ -72,36 +93,37 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names,
 
     # }}}
 
-    # {{{ construct arg mapping
+    # {{{ construct storage map
 
-    # map goes from substitution arguments to domain_dup_sweep
+    # The storage map goes from storage dimension to domain_dup_sweep.
+    # The first len(arg_names) storage dimensions are the rule's arguments.
 
     for invdesc in invocation_descriptors:
         map_space = domain_dup_sweep.get_space()
-        ln = len(arg_names)
+        stor_dim = len(arg_names) + len(extra_storage_dims)
         rn = map_space.dim(dim_type.out)
 
-        map_space = map_space.add_dims(dim_type.in_, ln)
+        map_space = map_space.add_dims(dim_type.in_, stor_dim)
         for i, iname in enumerate(arg_names):
             # arg names are initially primed, to be replaced with unprimed
             # base-0 versions below
 
             map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'")
 
-        # map_space: [arg_names] -> [domain](dup_sweep_index)[dup_sweep]
+        # map_space: [stor_dims] -> [domain](dup_sweep_index)[dup_sweep]
 
         set_space = map_space.move_dims(
                 dim_type.out, rn,
-                dim_type.in_, 0, ln).range()
+                dim_type.in_, 0, stor_dim).range()
 
-        # set_space: <domain>(dup_sweep_index)<dup_sweep><arg_names>
+        # set_space: <domain>(dup_sweep_index)<dup_sweep><stor_dims>
 
         footprint_map = None
 
         from loopy.symbolic import aff_from_expr
         for uarg_name, arg_val in zip(arg_names, invdesc.args):
             cns = isl.Constraint.equality_from_aff(
-                    aff_from_expr(set_space, 
+                    aff_from_expr(set_space,
                         var(uarg_name+"'") - prime_sweep_inames(arg_val)))
 
             cns_map = isl.BasicMap.from_constraint(cns)
@@ -112,7 +134,7 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names,
 
         footprint_map = footprint_map.move_dims(
                 dim_type.in_, 0,
-                dim_type.out, rn, ln)
+                dim_type.out, rn, stor_dim)
 
         # footprint_map is back in map_space
 
@@ -242,8 +264,37 @@ def simplify_via_aff(expr):
 
 
 
-def precompute(kernel, subst_name, dtype, sweep_inames=[],
-        new_arg_names=None, arg_name_to_tag={}, default_tag="l.auto"):
+def precompute(kernel, subst_name, dtype, sweep_axes=[],
+        storage_axes=None, new_arg_names=None, arg_name_to_tag={},
+        default_tag="l.auto"):
+    """Precompute the expression described in the substitution rule *subst_name*
+    and store it in a temporary array. A precomputation needs two things to operate,
+    a list of *sweep_axes* (order irrelevant) and an ordered list of *storage_axes*
+    (whose order will describe the axis ordering of the temporary array).
+
+    This function will then examine all usage sites of the substitution rule and
+    determine what the storage footprint of that sweep is.
+
+    The following cases can arise for each sweep axis:
+
+    * The axis is an iname that occurs within arguments specified at
+      usage sites of the substitution rule. This case is assumed covered
+      by the storage axes provided for the argument.
+
+    * The axis is an iname that occurs within the *value* of the rule, but not
+      within its arguments. A new, dedicated storage axis is allocated for
+      such an axis.
+
+    * The axis is a formal argument name of the substitution rule.
+      This is equivalent to specifying *all* inames occurring within
+      the so-named formal argument at *all* usage sites.
+
+    :arg sweep_axes: A :class:`list` of inames and/or rule argument names to be swept.
+    :arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes.
+
+    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
+    eliminated.
+    """
 
     subst = kernel.substitutions[subst_name]
     arg_names = subst.arguments
@@ -252,7 +303,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
     # {{{ gather up invocations
 
     invocation_descriptors = []
-    invocation_arg_deps = set()
 
     def gather_substs(expr, name, args, rec):
         if len(args) != len(subst.arguments):
@@ -264,8 +314,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
             raise RuntimeError("CSE arguments in '%s' do not consist "
                     "exclusively of inames" % expr)
 
-        invocation_arg_deps.update(arg_deps)
-
         invocation_descriptors.append(
                 InvocationDescriptor(expr=expr, args=args))
         return expr
@@ -285,12 +333,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
 
         scm(subst_expander(insn.expression))
 
-    allowable_sweep_inames = invocation_arg_deps | set(arg_names)
-    if not set(sweep_inames) <= allowable_sweep_inames:
-        raise RuntimeError("independent iname(s) '%s' do not occur as arg names "
-                "of subsitution rule or in arguments of invocation" % (",".join(
-                    set(sweep_inames)-allowable_sweep_inames)))
-
     # }}}
 
     # {{{ process ind_iname_to_tag argument
@@ -358,7 +400,7 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
     (non1_arg_names, new_domain,
                 arg_base_indices, non1_arg_base_indices, non1_shape) = \
                         get_footprint(kernel, subst_name, old_arg_names, arg_names,
-                                sweep_inames, invocation_descriptors)
+                                sweep_axes, invocation_descriptors)
 
     new_domain = new_domain.coalesce()
 
@@ -367,6 +409,15 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
         if hull_new_domain <= new_domain:
             new_domain = hull_new_domain
 
+    if len(new_domain.get_basic_sets()) > 1:
+        print("Substitution '%s' yielded a footprint that was not "
+                "obviously convex. Now computing convex hull. "
+                "This might take a *long* time." % subst_name)
+
+        hull_new_domain = new_domain.convex_hull()
+        if hull_new_domain <= new_domain:
+            new_domain = hull_new_domain
+
     if isinstance(new_domain, isl.Set):
         dom_bsets = new_domain.get_basic_sets()
         if len(dom_bsets) > 1:
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 539130fd088e0d8548400b4b6f9a5f2ab86a5d29..0729d6dbd846fb7a02a73f370022224187be1b8b 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -260,7 +260,7 @@ class Instruction(Record):
     :ivar boostable_into: a set of inames into which the instruction
         may need to be boosted, as a heuristic help for the scheduler.
 
-    The following two instance variables are only used until :func:`loopy.kernel.make_kernel` is
+    The following two instance variables are only used until :func:`loopy.make_kernel` is
     finished:
 
     :ivar temp_var_type: if not None, a type that will be assigned to the new temporary variable
@@ -480,7 +480,7 @@ class LoopKernel(Record):
 
     :ivar cache_manager:
 
-    The following instance variables are only used until :func:`loopy.kernel.make_kernel` is
+    The following instance variables are only used until :func:`loopy.make_kernel` is
     finished:
 
     :ivar iname_to_tag_requests:
@@ -814,8 +814,7 @@ class LoopKernel(Record):
 
     def find_readers(self):
         """
-        :return: a dict that maps variable names to ids of insns that
-            read that variable.
+        :return: a dict that maps variable names to ids of insns that read that variable.
         """
         result = {}
 
@@ -829,8 +828,7 @@ class LoopKernel(Record):
 
     def find_writers(self):
         """
-        :return: a dict that maps variable names to ids of insns that
-            write to that variable.
+        :return: a dict that maps variable names to ids of insns that write to that variable.
         """
         result = {}
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 59ac1fc05d366a776ee8dd62504a14e62039403d..0a374edf74e99ea1328717155516751bcb813c4a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -519,7 +519,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             # }}}
 
         if axis is None:
-            new_tag = None
+            new_tag = UnrollTag()
         else:
             new_tag = LocalIndexTag(axis)
             if desired_length > local_size[axis]:
diff --git a/loopy/subst.py b/loopy/subst.py
index c89bc586991792b0c6208e06dbc9743b88f5ff5d..636252304263587e59dd4718dea63d199acb2274 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -19,16 +19,12 @@ class ExprDescriptor(Record):
 
 def extract_subst(kernel, subst_name, template, parameters):
     """
-    :arg template: An expression against which all targeted subexpressions
-        must unify
+    :arg subst_name: The name of the substitution rule to be created.
+    :arg template: Unification template expression.
 
-        If None, a unification template will be chosen from among the targeted
-        CSEs. That CSE is chosen to depend on all the variables in
-        *parameters*.  It is an error if no such expression can be
-        found.
-
-        May contain '*' wildcards that will have to match exactly across all
-        unifications.
+    All targeted subexpressions must match ('unify with') *template*.
+    The template may contain '*' wildcards that will have to match exactly across all
+    unifications.
     """
 
     newly_created_var_names = set()
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index e1be279653f2ac95251c475e0fdc9c059832d377..c65f6e7ac202ecebece95b386e89dcc8561f973b 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -297,7 +297,7 @@ class ArrayAccessFinder(CombineMapper):
 
 class LoopyCCodeMapper(CCodeMapper):
     def __init__(self, kernel, cse_name_list=[], var_subst_map={},
-            with_annotation=True):
+            with_annotation=False):
         def constant_mapper(c):
             if isinstance(c, float):
                 # FIXME: type-variable
@@ -318,7 +318,8 @@ class LoopyCCodeMapper(CCodeMapper):
         if cse_name_list is None:
             cse_name_list = self.cse_name_list
         return LoopyCCodeMapper(self.kernel,
-                cse_name_list=cse_name_list, var_subst_map=var_subst_map)
+                cse_name_list=cse_name_list, var_subst_map=var_subst_map,
+                with_annotation=self.with_annotation)
 
     def copy_and_assign(self, name, value):
         var_subst_map = self.var_subst_map.copy()
diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py
index 136fccfbc8165235d40669c31c4de1844cff3fc9..0efc205147009d3095d96f30998b113238ef30c8 100644
--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
@@ -70,13 +70,14 @@ def test_laplacian_stiffness(ctx_factory):
         # no ILP across elements, precompute dPsiTransf
         knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
         knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-        knl = lp.precompute(knl, "dPsi", np.float32)
+        knl = lp.precompute(knl, "dPsi", np.float32,
+                sweep_inames=["K_inner"])
         knl = lp.add_prefetch(knl, "jacInv",
                 ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
         return knl
 
-    for variant in [variant_1, variant_2]:
-    #for variant in [variant_3]:
+    #for variant in [variant_1, variant_2, variant_3]:
+    for variant in [variant_3]:
         kernel_gen = lp.generate_loop_schedules(variant(knl),
                 loop_priority=["jacInv_dim_0", "jacInv_dim_1"])
         kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))