From bc5c2e2466f6eae9d22da5f6d2e21c41fc561301 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 16 Mar 2018 14:18:18 -0500 Subject: [PATCH 1/3] Improve cache retrieval speeds for execution by using lazy unpickling of instructions. This requires caching the generation of invokers (which itself should also give a speed boost), since invoker generation needed to analyze the instructions in the kernel. --- loopy/kernel/__init__.py | 13 +++++++++- loopy/kernel/instruction.py | 12 +++++++++ loopy/target/c/c_execution.py | 9 ++++--- loopy/target/execution.py | 39 +++++++++++++++++++++++++----- loopy/target/pyopencl_execution.py | 9 ++++--- 5 files changed, 69 insertions(+), 13 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 32b233900..550288642 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1277,10 +1277,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): result = dict( (key, getattr(self, key)) for key in self.__class__.fields - if hasattr(self, key)) + if hasattr(self, key) and key != "instructions") result.pop("cache_manager", None) + # Make the instructions lazily unpickling, to support faster + # cache retrieval for execution. + from loopy.kernel.instruction import _get_insn_eq_key, _get_insn_hash_key + from loopy.tools import ( + LazilyUnpicklingListWithEqAndPersistentHashing as LazyList) + + result["instructions"] = LazyList( + self.instructions, + eq_key_getter=_get_insn_eq_key, + persistent_hash_key_getter=_get_insn_hash_key) + # make sure that kernels are pickled with a cached hash key in place from loopy.tools import LoopyKeyBuilder LoopyKeyBuilder()(self) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 95001c78b..067e3de13 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1388,4 +1388,16 @@ class BarrierInstruction(_DataObliviousInstruction): # }}} +# {{{ key getters + +def _get_insn_eq_key(insn): + return insn._key_builder.key() + + +def _get_insn_hash_key(insn): + return insn._key_builder.hash_key() + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 3634cc71a..4eecf52a3 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -382,8 +382,11 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel, - CExecutionWrapperGenerator()) + super(CKernelExecutor, self).__init__(kernel) + + def get_invoker_uncached(self, kernel, codegen_result): + generator = CExecutionWrapperGenerator + return generator(kernel, codegen_result) @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): @@ -423,7 +426,7 @@ class CKernelExecutor(KernelExecutorBase): kernel=kernel, c_kernels=c_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.invoker(kernel, codegen_result)) + invoker=self.get_invoker(kernel, codegen_result)) # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3a3ea0a70..75424b9c7 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -625,9 +625,10 @@ class ExecutionWrapperGeneratorBase(object): :codegen_result: the loopy :class:`CodeGenerationResult` created by code generation - :returns: py_func, a python function that handles excution of this - kernel + :returns: A python callable that handles execution of this + kernel """ + options = kernel.options implemented_data_info = codegen_result.implemented_data_info @@ -677,7 +678,7 @@ class ExecutionWrapperGeneratorBase(object): with open(options.write_wrapper, "w") as outf: outf.write(output) - return gen.get_function() + return gen.get_picklable_function() # }}} @@ -697,6 +698,11 @@ typed_and_scheduled_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +invoker_cache = WriteOncePersistentDict( + "loopy-invoker-cache-v1-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + # {{{ kernel executor class KernelExecutorBase(object): @@ -707,7 +713,7 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel, invoker): + def __init__(self, kernel): """ :arg kernel: a loopy.LoopKernel """ @@ -723,8 +729,6 @@ class KernelExecutorBase(object): arg.dtype is None for arg in kernel.args) - self.invoker = invoker - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes @@ -833,6 +837,29 @@ class KernelExecutorBase(object): code = generate_code_v2(kernel) return code.device_code() + def get_invoker_uncached(self, kernel, *args): + raise NotImplementedError() + + def get_invoker(self, kernel, *args): + from loopy import CACHING_ENABLED + + cache_key = (self.__class__.__name__, kernel) + + if CACHING_ENABLED: + try: + return invoker_cache[cache_key] + except KeyError: + pass + + logger.debug("%s: invoker cache miss" % kernel.name) + + invoker = self.get_invoker_uncached(kernel, *args) + + if CACHING_ENABLED: + invoker_cache.store_if_not_present(cache_key, invoker) + + return invoker + # }}} # {{{ call and info generator diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index bef3152d0..d33a92d66 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -261,8 +261,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__( - kernel, invoker=PyOpenCLExecutionWrapperGenerator()) + super(PyOpenCLKernelExecutor, self).__init__(kernel) self.context = context @@ -270,6 +269,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): if isinstance(kernel.target, PyOpenCLTarget): self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) + def get_invoker_uncached(self, kernel, codegen_result): + generator = PyOpenCLExecutionWrapperGenerator() + return generator(kernel, codegen_result) + @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) @@ -309,7 +312,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kernel=kernel, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.invoker(kernel, codegen_result)) + invoker=self.get_invoker(kernel, codegen_result)) def __call__(self, queue, **kwargs): """ -- GitLab From 994a8e08f7cdfd68e3248724f3cc54e03976bd55 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 16 Mar 2018 14:20:16 -0500 Subject: [PATCH 2/3] Bump required pytools version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6c684f546..c84a63fe3 100644 --- a/setup.py +++ b/setup.py @@ -89,7 +89,7 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2018.2", + "pytools>=2018.3", "pymbolic>=2016.2", "genpy>=2016.1.2", "cgen>=2016.1", -- GitLab From 9376552f0ee3fd84bb8861abe778ac998f9742b7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 16 Mar 2018 16:25:08 -0500 Subject: [PATCH 3/3] Fix missing () --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 4eecf52a3..6b80bae20 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -385,7 +385,7 @@ class CKernelExecutor(KernelExecutorBase): super(CKernelExecutor, self).__init__(kernel) def get_invoker_uncached(self, kernel, codegen_result): - generator = CExecutionWrapperGenerator + generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method -- GitLab