diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index f6e7ad9c2211d24582e5027777b584fc5ac64d98..1bdf70c29bf8ed8bbf42b1fc5edfdeb411f64aaa 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -178,7 +178,6 @@ by passing :attr:`loopy.Options.write_cl`.
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
-    <BLANKLINE>
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
     }
@@ -220,7 +219,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
     >>> evt, (out,) = knl(queue, a=x_vec_host)
     from __future__ import division
     ...
-    def invoke_loopy_kernel_loopy_kernel(cl_kernel, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
+    def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None):
         if allocator is None:
             allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
     <BLANKLINE>
@@ -228,9 +227,9 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
     <BLANKLINE>
         if n is None:
             if a is not None:
-                n = a.shape[0]
+                n = int(a.shape[0])
             elif out is not None:
-                n = out.shape[0]
+                n = int(out.shape[0])
     <BLANKLINE>
         # }}}
     ...
@@ -253,7 +252,6 @@ call :func:`loopy.generate_code`:
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
-    <BLANKLINE>
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
     }
@@ -369,7 +367,6 @@ Let us take a look at the generated code for the above kernel:
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
-    <BLANKLINE>
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
         {
@@ -419,7 +416,6 @@ Now the intended code is generated and our test passes.
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
-    <BLANKLINE>
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
           out[n * j + i] = a[n * i + j];
@@ -628,8 +624,9 @@ loop's tag to ``"unr"``:
     >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    #define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
     #define lid(N) ((int) get_local_id(N))
+    #define gid(N) ((int) get_group_id(N))
+    #define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
     ...
       for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
       {
@@ -707,7 +704,6 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     ...
     __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
     {
-    <BLANKLINE>
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
         a[lid(0) + gid(0) * 128] = 0.0f;
     }
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index be7840f27e92dc0339cd678e9dca40421462ac04..a9eb44f84d44015fa11ce83afe410e833bb8e214 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -310,7 +310,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     result = []
 
     for slab_name, slab in slabs:
-        if len(slabs) == 1:
+        if len(slabs) > 1:
             result.append(
                     codegen_state.ast_builder.emit_comment(
                         "%s slab for '%s'" % (slab_name, iname)))
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index f87b999e32890b28043fd3c6eeafc6488eebafc2..d9f420405ad4c0905dd8c47554bb2cf1f24bd87f 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -238,7 +238,7 @@ class CASTBuilder(ASTBuilderBase):
         base_storage_to_scope = {}
         base_storage_to_align_bytes = {}
 
-        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value
+        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value, Line
 
         class ConstRestrictPointer(Pointer):
             def get_decl_pair(self):
@@ -318,7 +318,12 @@ class CASTBuilder(ASTBuilderBase):
 
         # }}}
 
-        return base_storage_decls + temp_decls
+        result = base_storage_decls + temp_decls
+
+        if result:
+            result.append(Line())
+
+        return result
 
     @property
     def ast_block_class(self):