diff --git a/loopy/execution.py b/loopy/execution.py
index dac5b2ff80767ae00c126aba31c2851cfe3769ef..07e28f06d33e5884ac57c9505593c9ee916c3171 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -187,7 +187,12 @@ class KernelExecutorBase(object):
     def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
         from loopy import CACHING_ENABLED
 
-        cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set)
+        from loopy.preprocess import prepare_for_caching
+        # prepare_for_caching() gets run by preprocess, but the kernel at this
+        # stage is not guaranteed to be preprocessed.
+        cacheable_kernel = prepare_for_caching(self.kernel)
+        cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set)
+
         if CACHING_ENABLED:
             try:
                 return typed_and_scheduled_cache[cache_key]
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index a02fc58d97f370d45f36a465c38fa3caf3da9d41..531cc822e1bc76573ef6e0812970d16bd6df0b17 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -862,6 +862,16 @@ class ArrayBase(ImmutableRecord):
     def __repr__(self):
         return "<%s>" % self.__str__()
 
+    def update_persistent_hash_for_shape(self, key_hash, key_builder, shape):
+        # Only tuple shapes are hashed entry by entry; others go to rec() whole.
+        if not isinstance(shape, tuple):
+            key_builder.rec(key_hash, shape)
+            return
+        for axis_len in shape:
+            feed = (key_builder.rec if axis_len is None
+                    else key_builder.update_for_pymbolic_expression)
+            feed(key_hash, axis_len)
+
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
         :class:`pytools.persistent_dict.PersistentDict`.
@@ -869,14 +879,7 @@ class ArrayBase(ImmutableRecord):
 
         key_builder.rec(key_hash, self.name)
         key_builder.rec(key_hash, self.dtype)
-        if isinstance(self.shape, tuple):
-            for shape_i in self.shape:
-                if shape_i is None:
-                    key_builder.rec(key_hash, shape_i)
-                else:
-                    key_builder.update_for_pymbolic_expression(key_hash, shape_i)
-        else:
-            key_builder.rec(key_hash, self.shape)
+        self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape)
         key_builder.rec(key_hash, self.dim_tags)
         key_builder.rec(key_hash, self.offset)
         key_builder.rec(key_hash, self.dim_names)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 001dd06326edcad14d8ecd39e29229dd45de8ef2..94b31df12dae516d3539438b7e4ed66ed765e697 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -497,7 +497,8 @@ class TemporaryVariable(ArrayBase):
         """
 
         super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder)
-        key_builder.rec(key_hash, self.storage_shape)
+        self.update_persistent_hash_for_shape(key_hash, key_builder,
+                self.storage_shape)
         key_builder.rec(key_hash, self.base_indices)
 
         initializer = self.initializer
@@ -510,7 +511,7 @@ class TemporaryVariable(ArrayBase):
 # }}}
 
 
-# {{{ subsitution rule
+# {{{ substitution rule
 
 class SubstitutionRule(ImmutableRecord):
     """
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 57cf74b808ae1a7107e76a18a3876785ab8baabd..e05140ff5ba6ba32882607f99eeebfe4dfd80471 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -596,7 +596,8 @@ class SchedulerState(ImmutableRecord):
     .. attribute:: preschedule
 
         A sequence of schedule items that must be inserted into the
-        schedule, maintaining the same ordering
+        schedule, maintaining the same relative ordering. Newly scheduled
+        items may be interleaved with this sequence.
 
     .. attribute:: prescheduled_insn_ids
 
@@ -1073,28 +1074,6 @@ def generate_loop_schedules_internal(
                           % iname)
                 continue
 
-            if (
-                    not sched_state.within_subkernel
-                    and iname not in sched_state.prescheduled_inames):
-                # Avoid messing up some orderings such as picking:
-                #
-                # EnterLoop(temporary.reload)
-                # CallKernel
-                # ...
-                #
-                # instead of
-                #
-                # CallKernel
-                # EnterLoop(temporary.reload)
-                # ...
-                #
-                # This serves a heuristic to catch some bad decisions early, the
-                # scheduler will not allow the first variant regardless.
-                if debug_mode:
-                    print("scheduling '%s' prohibited because we are outside "
-                          "a subkernel" % iname)
-                continue
-
             currently_accessible_inames = (
                     active_inames_set | sched_state.parallel_inames)
             if (
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 7b3a67c6b0b11a3adc68d58a10f309a6ee21919e..45da8eb3e3d7ccd784f9825995483018b4d897eb 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -232,7 +232,8 @@ class TypeInferenceMapper(CombineMapper):
             # Codegen for complex types depends on exactly correct types.
             # Refuse temptation to guess.
             raise TypeInferenceFailure("Complex constant '%s' needs to "
-                    "be sized for type inference " % expr)
+                    "be sized (i.e. as numpy.complex64/128) for type inference "
+                    % expr)
         else:
             raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 48cb6980ab79bcfc640a19551d9a7708b6a2b20c..78633abbd41408ae700aa8516e8a9c6f70f018a9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2376,6 +2376,21 @@ def test_kernel_var_name_generator():
     assert vng("b") != "b"
 
 
+def test_execution_backend_can_cache_dtypes(ctx_factory):
+    # Invoking a kernel makes the execution backend use that kernel as part
+    # of the key for its type inference and scheduling cache. Check that a
+    # kernel whose dtypes may not have a target can still be used as such a
+    # key.
+
+    cl_ctx = ctx_factory()
+    cmd_queue = cl.CommandQueue(cl_ctx)
+
+    kernel = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i")
+    kernel = lp.add_dtypes(kernel, {"tmp": int})
+
+    kernel(cmd_queue)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])