diff --git a/loopy/__init__.py b/loopy/__init__.py
index a350601418c3756eab6ea4e71723bd137f8e6bc0..78342b9e9c42b9383836c3be36966e20c72a5f24 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -502,7 +502,6 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
 
     kernel = extract_subst(kernel, rule_name, uni_template, parameters)
 
-
     if footprint_subscripts is not None:
         if not isinstance(footprint_subscripts, (list, tuple)):
             footprint_subscripts = [footprint_subscripts]
diff --git a/loopy/cse.py b/loopy/cse.py
index 640a69bb4db5180231f1cbaf71db8679f5ead1a3..54618ed71fce020a5265d1541c47feb1599af205 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -671,12 +671,18 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[],
                     found = True
                     break
 
-            if not invdesc.is_in_footprint:
+            if footprint_generators is None:
+                # We are only entitled to expect the expression among the
+                # invocation descriptors if they were generated by a scan
+                # of the code in the first place. If the user gave us
+                # the footprint generators, that isn't true.
+
+                assert found, expr
+
+            if not found or not invdesc.is_in_footprint:
                 left_unused_subst_rule_invocations[0] = True
                 return expr
 
-            assert found, expr
-
         else:
             # The current subsitution *was* found inside another substitution
             # rule. We can't dig up the corresponding invocation descriptor,
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 4f888ef09d0c4a66b23f90e06664fc0f36d89588..f1641256f70aeabfeb7c85447912266538124ff7 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -1,4 +1,3 @@
-
 from __future__ import division
 
 import numpy as np
@@ -38,6 +37,7 @@ def test_nbody(ctx_factory):
         return knl, []
 
     def variant_cpu(knl):
+        knl = lp.expand_subst(knl)
         knl = lp.split_dimension(knl, "i", 1024,
                 outer_tag="g.0", slabs=(0,1))
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
@@ -56,7 +56,7 @@ def test_nbody(ctx_factory):
 
     n = 3000
 
-    for variant in [variant_gpu]:
+    for variant in [variant_1, variant_cpu, variant_gpu]:
         variant_knl, loop_prio = variant(knl)
         kernel_gen = lp.generate_loop_schedules(variant_knl,
                 loop_priority=loop_prio)
diff --git a/test/test_sem.py b/test/test_sem.py
index a83a74c90e8ea0877637e2d8f67e102caa50b6a2..8a93d11c0a3f35c45751f013eb91eee572c88d65 100644
--- a/test/test_sem.py
+++ b/test/test_sem.py
@@ -10,6 +10,8 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 
 
+1/0 # not ready
+
 def test_laplacian(ctx_factory):
     1/0 # not adapted to new language