diff --git a/MEMO b/MEMO
index 87725fac86ce357147af22ed60355ab30797eb9a..66f5f2f4e9ca3ba8657585ba04e6387d80e5abbd 100644
--- a/MEMO
+++ b/MEMO
@@ -55,10 +55,11 @@ To-do
 - reg rolling
 
 - nbody GPU
+  -> pending better prefetch spec
 
 - Expose iname-duplicate-and-rename as a primitive.
 
-- String instructions?
+- add_prefetch gets a flag to separate out each access
 
 - Making parameters run-time varying, substituting values that
   depend on other inames?
@@ -68,6 +69,8 @@ To-do
 Future ideas
 ^^^^^^^^^^^^
 
+- String instructions?
+
 - How is intra-instruction ordering of ILP loops going to be determined?
   (taking into account that it could vary even per-instruction?)
 
diff --git a/doc/reference.rst b/doc/reference.rst
index a6c3bc988409f6f037b8a86ea7ec3d05410448b1..0e538771935d373519eb074bb484550241a3ec50 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -14,6 +14,8 @@ Expressions
 -----------
 
 * `if`
+* `reductions`
+  (including duplication of reduction inames)
 * complex-valued arithmetic
 
 Assignments and Substitution Rules
@@ -28,13 +30,21 @@ Tag                   Meaning
 `None` | `"for"`      Sequential loop
 `"l.N"`               Local (intra-group) axis N
 `"l.auto"`            Automatically chosen local (intra-group) axis
+`"g.N"`               Group-number axis N
 `"unr"`               Plain unrolling
-`"ilp"`               Unroll using instruction-level parallelism
-`"inn"`               Realize parallel iname as innermost loop
+`"ilp"` | `"ilp.unr"` Unroll using instruction-level parallelism
+`"ilp.seq"`           Realize parallel iname as innermost loop
 ===================== ====================================================
 
 (Throughout this table, `N` must be replaced by an actual number.)
 
+ILP is really three things combined:
+
+* Restricts loops to be innermost (excludes them from scheduling)
+* Duplicates reduction storage for any reductions nested around ILP usage
+* Causes a loop (unrolled or not) to be opened/generated for each
+  involved instruction
+
 .. _automatic-axes:
 
 Automatic Axis Assignment
diff --git a/loopy/check.py b/loopy/check.py
index 16b4c72845d490ce33c852334bfb0f4ec76f523d..c0bfeca134cd70a77aa78cbe1d0ffd78f75c4d7f 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -85,7 +85,7 @@ def check_for_inactive_iname_access(kernel):
 
 def check_for_write_races(kernel):
     from loopy.symbolic import DependencyMapper
-    from loopy.kernel import ParallelTag, GroupIndexTag, IlpTag
+    from loopy.kernel import ParallelTag, GroupIndexTag, IlpBaseTag
     depmap = DependencyMapper()
 
     for insn in kernel.instructions:
@@ -136,7 +136,7 @@ def check_for_write_races(kernel):
                 ilp_inames = set(
                         iname
                         for iname in kernel.insn_inames(insn)
-                        if isinstance(kernel.iname_to_tag.get(iname), IlpTag))
+                        if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag))
 
                 inames_without_write_dep = ilp_inames - (
                         assignee_inames & ilp_inames)
diff --git a/loopy/cse.py b/loopy/cse.py
index 3246a34e6267361f1e3d259eb18bbb05521a3fe8..82b43d25d0311bb9f8dff46e28eb8d054d7d8570 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -278,7 +278,7 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
     :arg sweep_axes: A :class:`list` of inames and/or rule argument names to be swept.
     :arg storage_dims: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes.
 
-    If `storage_dims` is not specified, it defaults to the arrangement
+    If `storage_axes` is not specified, it defaults to the arrangement
     `<direct sweep axes><arguments>` with the direct sweep axes being the
     slower-varying indices.
 
@@ -318,6 +318,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
     from loopy.symbolic import SubstitutionCallbackMapper
     scm = SubstitutionCallbackMapper([(subst_name, subst_instance)], gather_substs)
 
+    # We need to work on the fully expanded form of an expression.
+    # To that end, instantiate a substitutor.
     from loopy.symbolic import ParametrizedSubstitutor
     rules_except_mine = kernel.substitutions.copy()
     del rules_except_mine[subst_name]
@@ -328,7 +330,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
         # arguments. Therefore, fully expand each instruction and look at
         # the invocations in subst_name occurring there.
 
-        scm(subst_expander(insn.expression))
+        expanded_expr = subst_expander(insn.expression)
+        scm(expanded_expr)
 
     if not invocation_descriptors:
         raise RuntimeError("no invocations of '%s' found" % subst_name)
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 4ca9b5540b05c0abb0808d7933b365d3cb023b9b..8b5010c44964b09d31228a327f3ad61d2d2b4349 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -65,9 +65,16 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
     def __str__(self):
         return "l.auto"
 
-class IlpTag(ParallelTag):
+class IlpBaseTag(ParallelTag):
+    pass
+
+class UnrolledIlpTag(IlpBaseTag):
+    def __str__(self):
+        return "ilp.unr"
+
+class LoopedIlpTag(IlpBaseTag):
     def __str__(self):
-        return "ilp"
+        return "ilp.seq"
 
 class UnrollTag(IndexTag):
     def __str__(self):
@@ -87,8 +94,10 @@ def parse_tag(tag):
         return None
     elif tag in ["unr"]:
         return UnrollTag()
-    elif tag == "ilp":
-        return IlpTag()
+    elif tag in ["ilp", "ilp.unr"]:
+        return UnrolledIlpTag()
+    elif tag == "ilp.seq":
+        return LoopedIlpTag()
     elif tag.startswith("g."):
         return GroupIndexTag(int(tag[2:]))
     elif tag.startswith("l."):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 43e93ddd0e2e77e8c4cd57b3efa8f4b71c74cd64..40c702fb9dbe90262729dc1696383d33e2b9e172 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -132,17 +132,17 @@ def realize_reduction(kernel, insn_id_filter=None):
     new_insns = []
     new_temporary_variables = kernel.temporary_variables.copy()
 
-    from loopy.kernel import IlpTag
+    from loopy.kernel import IlpBaseTag
 
     def map_reduction(expr, rec):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
-        # {{{ see if this reduction is nested inside some ILP loops
+        # {{{ see if this reduction is nested around some ILP loops
 
         ilp_inames = [iname
                 for iname in temp_kernel.insn_inames(insn)
-                if isinstance(temp_kernel.iname_to_tag.get(iname), IlpTag)]
+                if isinstance(temp_kernel.iname_to_tag.get(iname), IlpBaseTag)]
 
         from loopy.isl_helpers import static_max_of_pw_aff
 
diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py
index 1c42082f2b9a77db3fa38f8f9f34822b84b0c9da..c90b556bd6a8e8265451bc4ed7ca41543654091c 100644
--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
@@ -83,7 +83,7 @@ def test_laplacian_stiffness(ctx_factory):
         knl = lp.split_dimension(knl, "K", Ncloc,
                 outer_iname="Ko", inner_iname="Kloc")
         knl = lp.precompute(knl, "dPsi.one", np.float32, ["dx_axis"], default_tag=None)
-        knl = lp.tag_dimensions(knl, {"j": "ilp"})
+        knl = lp.tag_dimensions(knl, {"j": "ilp.seq"})
 
         return knl, ["Ko", "Kloc"]
 
@@ -124,7 +124,7 @@ def test_laplacian_stiffness(ctx_factory):
     # Plug in variant name here
     #                        |
     #                        v
-    for variant in [variant_simple_gpu_prefetch]:
+    for variant in [variant_fig33]:
         var_knl, loop_prio = variant(knl)
         kernel_gen = lp.generate_loop_schedules(var_knl,
                 loop_priority=loop_prio)