diff --git a/MEMO b/MEMO
index 8b9700092293a3f0d331e00bc28a64261a237f50..bb3ab941081a5f144e898950cee1a11dfee63f3f 100644
--- a/MEMO
+++ b/MEMO
@@ -16,13 +16,17 @@ Things to consider
 - Every loop in loopy is opened at most once.
   Too restrictive?
 
-- Why do CSEs necessarily have to duplicate the inames?
+- Why do precomputes necessarily have to duplicate the inames?
   -> because that would be necessary for a sequential prefetch
 
 - Cannot do slab decomposition on inames that share a tag with
   other inames
   -> Is that reasonable?
 
+- Entering a loop means:
+  - setting up conditionals related to it (slabs/bounds)
+  - allowing loops nested inside to depend on loop state
+
 - Not using all hw loop dimensions causes an error, as
   is the case for variant 3 in the rank_one test.
 
@@ -53,7 +57,6 @@ To-do
 - rename IndexTag -> InameTag
 
 - Data implementation tags
-  - retag semantics once strides have been computed
   - turn base_indices into offset
   - vectorization
   - automatic copies
@@ -63,7 +66,6 @@ To-do
   - automatic copies from an array with one set of tags
     to the same array with another set.
 
-
 - Make tests run on GPUs
 
 Fixes:
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 5ebce957c00099d8dfa524c16b09ec559275a363..5201d9be13b11ccedeaabae0300fec2fd744aa04 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -150,6 +150,11 @@ class CodeGenerationState(object):
 
         self.c_code_mapper = c_code_mapper
 
+    def copy(self, implemented_domain=None, c_code_mapper=None):
+        return CodeGenerationState(  # omitted (falsy) arguments keep self's values
+                implemented_domain=implemented_domain or self.implemented_domain,
+                c_code_mapper=c_code_mapper or self.c_code_mapper)
+
     def intersect(self, other):
         new_impl, new_other = isl.align_two(self.implemented_domain, other)
         return CodeGenerationState(
@@ -217,6 +222,8 @@ class POD(PODBase):
 # }}}
 
 
+# {{{ implemented data info
+
 class ImplementedDataInfo(Record):
     """
     .. attribute:: name
@@ -269,6 +276,8 @@ class ImplementedDataInfo(Record):
                 stride_for_name_and_axis=stride_for_name_and_axis,
                 allows_offset=allows_offset)
 
+# }}}
+
 
 # {{{ main code generation entrypoint
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 17641ec019e0eccaec74951070ddb401cf3d11f1..3beb9a2ee78b7d374997367162b51cc50d87410b 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -54,7 +54,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
     else:
         result = result.compute_divs()
 
-    result = isl.align_spaces(result, implemented_domain)
+    result, implemented_domain = isl.align_two(result, implemented_domain)
     result = result.gist(implemented_domain)
 
     if overapproximate:
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index a20ec818bd18e83719c5ea7b7f26fd6ff41f4956..bfc0273c04cefb0d29092e4de69b3de83eb60450 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -199,6 +199,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
     from loopy.isl_helpers import static_value_of_pw_aff
     lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff,
             constants_only=False)
+
+    # These bounds are 'implemented' by the hardware. Intersect the slab into
+    # the codegen state so that downstream conditional generators realize that.
     slab = make_slab(domain.get_space(), iname,
             lower_bound, lower_bound+hw_axis_size)
     codegen_state = codegen_state.intersect(slab)
diff --git a/loopy/compiled.py b/loopy/compiled.py
index d799186f5811b1e0d8c7ae3120ea3358fbe018f3..ae443679bc7c812770cdad7bcd7cd7b58896f647 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -748,7 +748,7 @@ class CompiledKernel:
         return kernel
 
     @memoize_method
-    def cl_kernel_info(self, arg_to_dtype_set, code_op=None):
+    def cl_kernel_info(self, arg_to_dtype_set=frozenset(), code_op=None):
         kernel = self.get_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 52d001ed46f94c30f24a7711471cd5034c591672..3bcf975d9e2fbdaaeef1c8a88f7281c6e36e2bd0 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -196,7 +196,6 @@ def loop_nest_map(kernel):
                 # ILP tags are special because they are parallel tags
                 # and therefore 'in principle' nest around everything.
                 # But they're realized by the scheduler as a loop
-                # (and the scheduler is the only custom
                 # at the innermost level, so we'll cut them some
                 # slack here.
                 continue
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 272b0130b53edc600c599e566856308fe108ba1e..01ea6b6fd4b5156c36082eff77f74285a215b99f 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1127,8 +1127,11 @@ def test_vector_ilp_with_prefetch(ctx_factory):
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
 
-    code, info = lp.generate_code(knl)
+    cknl = lp.CompiledKernel(ctx, knl)
+    cknl.cl_kernel_info()
+
     import re
+    code = cknl.get_code()
     assert len(list(re.finditer("barrier", code))) == 1