diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 0fcde42a5e913489513cbc3c2f207b48ecbd7b0d..45738dd1af1d0f8ac56869ffa53e83a14d80f136 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2312,21 +2312,6 @@ preprocess_cache = WriteOncePersistentDict(
 def preprocess_single_kernel(kernel, callables_table, device=None):
     from loopy.kernel import KernelState
 
-    # {{{ cache retrieval
-
-    from loopy import CACHING_ENABLED
-    if CACHING_ENABLED:
-        input_kernel = kernel
-
-        try:
-            result = preprocess_cache[kernel]
-            logger.debug("%s: preprocess cache hit" % kernel.name)
-            return result
-        except KeyError:
-            pass
-
-    # }}}
-
     prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name)
 
     from loopy.check import check_identifiers_in_subst_rules
@@ -2369,27 +2354,27 @@ def preprocess_single_kernel(kernel, callables_table, device=None):
 
     prepro_logger.done()
 
-    # {{{ prepare for caching
-
-    # PicklableDtype instances for example need to know the target they're working
-    # towards in order to pickle and unpickle them. This is the first pass that
-    # uses caching, so we need to be ready to pickle. This means propagating
-    # this target information.
+    return kernel
 
-    if CACHING_ENABLED:
-        input_kernel = prepare_for_caching(input_kernel)
 
-    kernel = prepare_for_caching(kernel)
+def preprocess_program(program, device=None):
 
-    # }}}
+    # {{{ cache retrieval
 
+    from loopy import CACHING_ENABLED
     if CACHING_ENABLED:
-        preprocess_cache.store_if_not_present(input_kernel, kernel)
+        input_program = program
 
-    return kernel
+        try:
+            result = preprocess_cache[program]
+            logger.debug(f"program with entrypoints: {program.entrypoints}"
+                    " preprocess cache hit")
+            return result
+        except KeyError:
+            pass
 
+    # }}}
 
-def preprocess_program(program, device=None):
     from loopy.kernel import KernelState
     if program.state >= KernelState.PREPROCESSED:
         return program
@@ -2468,6 +2453,23 @@ def preprocess_program(program, device=None):
     # inline_kernels_with_gbarriers does not recursively inline the callees.
     program = inline_kernels_with_gbarriers(program)
 
+    # {{{ prepare for caching
+
+    # PicklableDtype instances, for example, need to know the target they are
+    # working towards in order to be pickled and unpickled. This is the first
+    # pass that uses caching, so we need to be ready to pickle, which means
+    # propagating this target information.
+
+    if CACHING_ENABLED:
+        input_program = prepare_for_caching(input_program)
+
+    program = prepare_for_caching(program)
+
+    # }}}
+
+    if CACHING_ENABLED:
+        preprocess_cache.store_if_not_present(input_program, program)
+
     return program