From e1048b0a4c0c8512d0765e4eef09963dcc77ab0f Mon Sep 17 00:00:00 2001 From: Isuru Fernando <isuruf@gmail.com> Date: Sat, 9 Sep 2023 16:21:45 -0500 Subject: [PATCH] register_optimization_preambles for all kernels --- sumpy/e2e.py | 7 +++++++ sumpy/e2p.py | 4 ++++ sumpy/p2e.py | 2 ++ sumpy/p2p.py | 5 +++-- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 3dcf2727..0e218024 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -29,6 +29,7 @@ import pymbolic from loopy.version import MOST_RECENT_LANGUAGE_VERSION from sumpy.tools import KernelCacheMixin, to_complex_dtype +from sumpy.codegen import register_optimization_preambles from pytools import memoize_method import logging @@ -145,6 +146,7 @@ class E2EBase(KernelCacheMixin, ABC): # FIXME knl = self.get_kernel() knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0") + knl = register_optimization_preambles(knl, self.device) return knl @@ -279,6 +281,7 @@ class E2EFromCSR(E2EBase): # FIXME knl = self.get_kernel() knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0") + knl = register_optimization_preambles(knl, self.device) return knl @@ -518,6 +521,7 @@ class M2LUsingTranslationClassesDependentData(E2EFromCSR): knl = self.get_kernel(result_dtype) knl = self.tgt_expansion.m2l_translation.optimize_loopy_kernel( knl, self.tgt_expansion, self.src_expansion) + knl = register_optimization_preambles(knl, self.device) return knl @@ -627,6 +631,7 @@ class M2LGenerateTranslationClassesDependentData(E2EBase): knl = self.get_kernel(result_dtype) knl = lp.tag_inames(knl, "idim*:unr") knl = lp.tag_inames(knl, {"itr_class": "g.0"}) + knl = register_optimization_preambles(knl, self.device) return knl @@ -732,6 +737,7 @@ class M2LPreprocessMultipole(E2EBase): _, optimizations = self.get_inner_knl_and_optimizations(result_dtype) for optimization in optimizations: knl = optimization(knl) + knl = register_optimization_preambles(knl, self.device) return knl def __call__(self, queue, **kwargs): @@ -831,6 +837,7 @@ class M2LPostprocessLocal(E2EBase): for optimization in optimizations: knl = optimization(knl) knl = lp.add_inames_for_unused_hw_axes(knl) + knl = register_optimization_preambles(knl, self.device) return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 0ec1c48b..55af69b3 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -26,6 +26,7 @@ import numpy as np import loopy as lp from sumpy.tools import KernelCacheMixin, gather_loopy_arguments +from sumpy.codegen import register_optimization_preambles from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -198,6 +199,7 @@ class E2PFromSingleBox(E2PBase): knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") + knl = register_optimization_preambles(knl, self.device) return knl @@ -324,6 +326,8 @@ class E2PFromCSR(E2PBase): knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") + knl = register_optimization_preambles(knl, self.device) + return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/p2e.py b/sumpy/p2e.py index fe52f6b9..acc7e726 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -25,6 +25,7 @@ import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION from sumpy.tools import KernelCacheMixin, KernelComputation +from sumpy.codegen import register_optimization_preambles import logging logger = logging.getLogger(__name__) @@ -118,6 +119,7 @@ class P2EBase(KernelCacheMixin, KernelComputation): knl = self._allow_redundant_execution_of_knl_scaling(knl) knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") + knl = register_optimization_preambles(knl, self.device) return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 0b986c7d..9c4302d6 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -29,6 +29,7 @@ from loopy.version import MOST_RECENT_LANGUAGE_VERSION from sumpy.tools import ( KernelComputation, KernelCacheMixin, is_obj_array_like) +from sumpy.codegen import register_optimization_preambles __doc__ = """ @@ -190,7 +191,6 @@ class P2PBase(KernelCacheMixin, KernelComputation): knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") - from sumpy.codegen import register_optimization_preambles knl = register_optimization_preambles(knl, self.device) return knl @@ -411,6 +411,8 @@ class P2PMatrixSubsetGenerator(P2PBase): knl = self._allow_redundant_execution_of_knl_scaling(knl) knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") + knl = register_optimization_preambles(knl, self.device) + return knl def __call__(self, queue, targets, sources, tgtindices, srcindices, **kwargs): @@ -717,7 +719,6 @@ class P2PFromCSR(P2PBase): knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") - from sumpy.codegen import register_optimization_preambles knl = register_optimization_preambles(knl, self.device) return knl -- GitLab