diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 32b233900a9cd6ba491502185ccd33ac5d7544d4..550288642a053888cf5125129471c8bed9736cfa 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1277,10 +1277,21 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         result = dict(
                 (key, getattr(self, key))
                 for key in self.__class__.fields
-                if hasattr(self, key))
+                if hasattr(self, key) and key != "instructions")
 
         result.pop("cache_manager", None)
 
+        # Make the instructions unpickle lazily, to support faster
+        # cache retrieval for execution.
+        from loopy.kernel.instruction import _get_insn_eq_key, _get_insn_hash_key
+        from loopy.tools import (
+                LazilyUnpicklingListWithEqAndPersistentHashing as LazyList)
+
+        result["instructions"] = LazyList(
+                self.instructions,
+                eq_key_getter=_get_insn_eq_key,
+                persistent_hash_key_getter=_get_insn_hash_key)
+
         # make sure that kernels are pickled with a cached hash key in place
         from loopy.tools import LoopyKeyBuilder
         LoopyKeyBuilder()(self)
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 95001c78bb1f3ef0c6e823589075ddb6e3fbb506..067e3de13331f30f915efb34c0dd5780ba4616ba 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1388,4 +1388,16 @@ class BarrierInstruction(_DataObliviousInstruction):
 # }}}
 
 
+# {{{ key getters
+
+def _get_insn_eq_key(insn):
+    return insn._key_builder.key()  # passed as eq_key_getter (LoopKernel pickling)
+
+
+def _get_insn_hash_key(insn):
+    return insn._key_builder.hash_key()  # passed as persistent_hash_key_getter
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 3634cc71aa73f9079ce4f0180f546cfc4f344b3d..4eecf52a3d68febc3f2348c196d3be9fb5e9c1b2 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -382,8 +382,11 @@ class CKernelExecutor(KernelExecutorBase):
         """
 
         self.compiler = compiler if compiler else CCompiler()
-        super(CKernelExecutor, self).__init__(kernel,
-                                              CExecutionWrapperGenerator())
+        super(CKernelExecutor, self).__init__(kernel)
+
+    def get_invoker_uncached(self, kernel, codegen_result):
+        generator = CExecutionWrapperGenerator()  # instantiate first, then call
+        return generator(kernel, codegen_result)
 
     @memoize_method
     def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
@@ -423,7 +426,7 @@ class CKernelExecutor(KernelExecutorBase):
                 kernel=kernel,
                 c_kernels=c_kernels,
                 implemented_data_info=codegen_result.implemented_data_info,
-                invoker=self.invoker(kernel, codegen_result))
+                invoker=self.get_invoker(kernel, codegen_result))
 
     # }}}
 
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 3a3ea0a70fe9a9229aa3499ad0bdbfeb87f751ed..75424b9c7ffa54cf2fed2ebcf15d4c439d185d55 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -625,9 +625,10 @@ class ExecutionWrapperGeneratorBase(object):
         :codegen_result: the loopy :class:`CodeGenerationResult` created
         by code generation
 
-        :returns: py_func, a python function that handles excution of this
-        kernel
+        :returns: A python callable that handles execution of this
+            kernel
         """
+
         options = kernel.options
         implemented_data_info = codegen_result.implemented_data_info
 
@@ -677,7 +678,7 @@ class ExecutionWrapperGeneratorBase(object):
                 with open(options.write_wrapper, "w") as outf:
                     outf.write(output)
 
-        return gen.get_function()
+        return gen.get_picklable_function()
 
 # }}}
 
@@ -697,6 +698,11 @@ typed_and_scheduled_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
+invoker_cache = WriteOncePersistentDict(  # looked up/filled by get_invoker
+        "loopy-invoker-cache-v1-"+DATA_MODEL_VERSION,
+        key_builder=LoopyKeyBuilder())
+
+
 # {{{ kernel executor
 
 class KernelExecutorBase(object):
@@ -707,7 +713,7 @@ class KernelExecutorBase(object):
     .. automethod:: __call__
     """
 
-    def __init__(self, kernel, invoker):
+    def __init__(self, kernel):
         """
         :arg kernel: a loopy.LoopKernel
         """
@@ -723,8 +729,6 @@ class KernelExecutorBase(object):
                 arg.dtype is None
                 for arg in kernel.args)
 
-        self.invoker = invoker
-
     def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
         from loopy.kernel.tools import add_dtypes
 
@@ -833,6 +837,29 @@ class KernelExecutorBase(object):
         code = generate_code_v2(kernel)
         return code.device_code()
 
+    def get_invoker_uncached(self, kernel, *args):
+        raise NotImplementedError()  # concrete executors override this
+
+    def get_invoker(self, kernel, *args):
+        """Return the invoker for *kernel*, using ``invoker_cache`` if enabled."""
+        from loopy import CACHING_ENABLED
+
+        # Key on the executor class too: each subclass builds its own invoker.
+        cache_key = (self.__class__.__name__, kernel)
+        if CACHING_ENABLED:
+            try:
+                return invoker_cache[cache_key]
+            except KeyError:
+                pass
+
+        # Lazy %-args: the message is only formatted if debug logging is on.
+        logger.debug("%s: invoker cache miss", kernel.name)
+        invoker = self.get_invoker_uncached(kernel, *args)
+
+        if CACHING_ENABLED:
+            invoker_cache.store_if_not_present(cache_key, invoker)
+        return invoker
+
     # }}}
 
     # {{{ call and info generator
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index bef3152d03c193c14b11ce6f9ba3f20fdfcff6ad..d33a92d6657cf1e61edeb5344c6ad49016ae640f 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -261,8 +261,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
             specific arguments.
         """
 
-        super(PyOpenCLKernelExecutor, self).__init__(
-            kernel, invoker=PyOpenCLExecutionWrapperGenerator())
+        super(PyOpenCLKernelExecutor, self).__init__(kernel)
 
         self.context = context
 
@@ -270,6 +269,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         if isinstance(kernel.target, PyOpenCLTarget):
             self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
 
+    def get_invoker_uncached(self, kernel, codegen_result):
+        make_invoker = PyOpenCLExecutionWrapperGenerator()  # callable instance
+        return make_invoker(kernel, codegen_result)
+
     @memoize_method
     def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
@@ -309,7 +312,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
                 kernel=kernel,
                 cl_kernels=cl_kernels,
                 implemented_data_info=codegen_result.implemented_data_info,
-                invoker=self.invoker(kernel, codegen_result))
+                invoker=self.get_invoker(kernel, codegen_result))
 
     def __call__(self, queue, **kwargs):
         """