diff --git a/loopy/execution.py b/loopy/execution.py
index 5680fdbfef614a0df1674a56842acd1869d14636..dac5b2ff80767ae00c126aba31c2851cfe3769ef 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -28,6 +28,13 @@ import numpy as np
 from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import LoopyError
 
+import logging
+logger = logging.getLogger(__name__)
+
+from pytools.persistent_dict import PersistentDict
+from loopy.tools import LoopyKeyBuilder
+from loopy.version import DATA_MODEL_VERSION
+
 
 # {{{ object array argument packing
 
@@ -113,6 +120,11 @@ class SeparateArrayPackingController(object):
 
 # {{{ KernelExecutorBase
 
+typed_and_scheduled_cache = PersistentDict(  # identifier ends in DATA_MODEL_VERSION
+        "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
+        key_builder=LoopyKeyBuilder())
+
+
 class KernelExecutorBase(object):
     """An object connecting a kernel to a :class:`pyopencl.Context`
     for execution.
@@ -137,15 +149,14 @@ class KernelExecutorBase(object):
                 arg.dtype is None
                 for arg in kernel.args)
 
-    @memoize_method
-    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
-        kernel = self.kernel
-
+    def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
         from loopy.kernel.tools import add_dtypes
 
-        if var_to_dtype_set:
+        kernel = self.kernel
+
+        if arg_to_dtype_set:
             var_to_dtype = {}
-            for var, dtype in var_to_dtype_set:
+            for var, dtype in arg_to_dtype_set:
                 try:
                     dest_name = kernel.impl_arg_to_arg[var].name
                 except KeyError:
@@ -172,10 +183,33 @@ class KernelExecutorBase(object):
 
         return kernel
 
+    @memoize_method
+    def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
+        """Return the typed, scheduled kernel, persistently cached if enabled."""
+        from loopy import CACHING_ENABLED
+
+        cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set)
+        if CACHING_ENABLED:
+            try:
+                return typed_and_scheduled_cache[cache_key]
+            except KeyError:
+                pass  # cache miss: fall through and build the kernel
+
+        logger.debug("%s: typed-and-scheduled cache miss", self.kernel.name)
+        kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
+
+        if CACHING_ENABLED:
+            typed_and_scheduled_cache[cache_key] = kernel
+
+        return kernel
+
     def arg_to_dtype_set(self, kwargs):
         if not self.has_runtime_typed_args:
             return None
 
+        from loopy.types import NumpyType
+        target = self.kernel.target
+
         impl_arg_to_arg = self.kernel.impl_arg_to_arg
         arg_to_dtype = {}
         for arg_name, val in six.iteritems(kwargs):
@@ -191,7 +225,7 @@ class KernelExecutorBase(object):
                 except AttributeError:
                     pass
                 else:
-                    arg_to_dtype[arg_name] = dtype
+                    arg_to_dtype[arg_name] = NumpyType(dtype, target)
 
         return frozenset(six.iteritems(arg_to_dtype))
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index e5305b703a4f03adcc886b03dece75c9273c4ca2..dccaca2ec104a4749289f7cd89c491292f618e3d 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1400,17 +1400,35 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         result.pop("cache_manager", None)
 
-        return result
+        # make sure that kernels are pickled with a cached hash key in place
+        from loopy.tools import LoopyKeyBuilder
+        LoopyKeyBuilder()(self)
+
+        return (result, self._pytools_persistent_hash_digest)
 
     def __setstate__(self, state):
+        attribs, p_hash_digest = state
+
         new_fields = set()
 
-        for k, v in six.iteritems(state):
+        for k, v in six.iteritems(attribs):
             setattr(self, k, v)
             new_fields.add(k)
 
         self.register_fields(new_fields)
 
+        if 0:
+            # {{{ check that 'reconstituted' object has same hash
+
+            from loopy.tools import LoopyKeyBuilder
+            LoopyKeyBuilder()(self)
+
+            assert p_hash_digest == self._pytools_persistent_hash_digest
+
+            # }}}
+        else:
+            self._pytools_persistent_hash_digest = p_hash_digest
+
         from loopy.kernel.tools import SetOperationCacheManager
         self.cache_manager = SetOperationCacheManager()
         self._kernel_executor_cache = {}
diff --git a/loopy/version.py b/loopy/version.py
index 4c6dfbcc22c0ea38668c663eb858161471a3999a..8516ce006bde8b8616172a72a766ec86dfcd44f1 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v62-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v63-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 150cb1cc4bc6ee13a7d516ab09c8824d76a2c6c9..67d943af3be4446834bf7262a91b8596b601ca85 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2017.1",
+          "pytools>=2017.3",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",