diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index d8a982e6cc435d8b077bb6edab0e90f86fe00fac..72c6c5b0cc11b5d49c018f5e0f142132fa54d778 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -625,7 +625,7 @@ loop's tag to ``"unr"``:
 
 
 :func:`loopy.tag_inames` is a new transformation that assigns
-implementation tags to kernels.  ``"unr'`` is the first tag we've
+implementation tags to kernels.  ``"unr"`` is the first tag we've
 explicitly learned about. Technically, though, it is the second--``"for"``
 (or, equivalently, *None*), which is the default, instructs loopy to
 implement an iname using a for loop.
@@ -747,10 +747,12 @@ assumption:
     ...
 
 While these conditionals enable the generated code to deal with arbitrary
-*n*, they come at a performance cost. But there's still no reason to pay
-for them with *every* item processed. Loopy allows generating separate code
-for the last iteration of the loop, by using the *slabs* keyword argument
-to :func:`split_iname`:
+*n*, they come at a performance cost. Loopy allows generating separate code
+for the last iteration of the *i_outer* loop by using the *slabs* keyword
+argument to :func:`loopy.split_iname`. Since this last iteration of
+*i_outer* is the only one for which ``i_inner + 4*i_outer`` can reach or
+exceed *n*, only the (now separate) code for that iteration contains
+conditionals, enabling some cost savings:
 
 .. doctest::
 
@@ -759,16 +761,27 @@ to :func:`split_iname`:
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     <BLANKLINE>
-      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+    ...
+      /* bulk slab for 'i_outer' */
+      for (int i_outer = 0; i_outer <= (-2 + ((3 + n) / 4)); ++i_outer)
       {
         a[0 + i_outer * 4] = 0.0f;
-        if ((-2 + -4 * i_outer + n) >= 0)
-          a[1 + i_outer * 4] = 0.0f;
-        if ((-3 + -4 * i_outer + n) >= 0)
-          a[2 + i_outer * 4] = 0.0f;
-        if ((-4 + -4 * i_outer + n) >= 0)
-          a[3 + i_outer * 4] = 0.0f;
+        a[1 + i_outer * 4] = 0.0f;
+        a[2 + i_outer * 4] = 0.0f;
+        a[3 + i_outer * 4] = 0.0f;
       }
+      /* final slab for 'i_outer' */
+      for (int i_outer = (-1 + n + -1 * (3 * n / 4)); i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+        if ((-1 + n) >= 0)
+        {
+          a[0 + i_outer * 4] = 0.0f;
+          if ((-2 + -4 * i_outer + n) >= 0)
+            a[1 + i_outer * 4] = 0.0f;
+          if ((-3 + -4 * i_outer + n) >= 0)
+            a[2 + i_outer * 4] = 0.0f;
+          if ((4 + 4 * i_outer + -1 * n) == 0)
+            a[3 + i_outer * 4] = 0.0f;
+        }
     ...
 
 .. _specifying-arguments: