From e1048b0a4c0c8512d0765e4eef09963dcc77ab0f Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Sat, 9 Sep 2023 16:21:45 -0500
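Subject: [PATCH] Call register_optimization_preambles for all kernels

Call register_optimization_preambles(knl, self.device) on the kernel
before it is returned in the E2E, E2P and P2E classes and in
P2PMatrixSubsetGenerator, matching what P2PBase and P2PFromCSR already
did. In sumpy/p2p.py the two function-local imports of
register_optimization_preambles are replaced by one module-level import.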

---
 sumpy/e2e.py | 7 +++++++
 sumpy/e2p.py | 4 ++++
 sumpy/p2e.py | 2 ++
 sumpy/p2p.py | 5 +++--
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/sumpy/e2e.py b/sumpy/e2e.py
index 3dcf2727..0e218024 100644
--- a/sumpy/e2e.py
+++ b/sumpy/e2e.py
@@ -29,6 +29,7 @@ import pymbolic
 
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from sumpy.tools import KernelCacheMixin, to_complex_dtype
+from sumpy.codegen import register_optimization_preambles
 from pytools import memoize_method
 
 import logging
@@ -145,6 +146,7 @@ class E2EBase(KernelCacheMixin, ABC):
         # FIXME
         knl = self.get_kernel()
         knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0")
+        knl = register_optimization_preambles(knl, self.device)
 
         return knl
 
@@ -279,6 +281,7 @@ class E2EFromCSR(E2EBase):
         # FIXME
         knl = self.get_kernel()
         knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0")
+        knl = register_optimization_preambles(knl, self.device)
 
         return knl
 
@@ -518,6 +521,7 @@ class M2LUsingTranslationClassesDependentData(E2EFromCSR):
         knl = self.get_kernel(result_dtype)
         knl = self.tgt_expansion.m2l_translation.optimize_loopy_kernel(
                 knl, self.tgt_expansion, self.src_expansion)
+        knl = register_optimization_preambles(knl, self.device)
 
         return knl
 
@@ -627,6 +631,7 @@ class M2LGenerateTranslationClassesDependentData(E2EBase):
         knl = self.get_kernel(result_dtype)
         knl = lp.tag_inames(knl, "idim*:unr")
         knl = lp.tag_inames(knl, {"itr_class": "g.0"})
+        knl = register_optimization_preambles(knl, self.device)
 
         return knl
 
@@ -732,6 +737,7 @@ class M2LPreprocessMultipole(E2EBase):
         _, optimizations = self.get_inner_knl_and_optimizations(result_dtype)
         for optimization in optimizations:
             knl = optimization(knl)
+        knl = register_optimization_preambles(knl, self.device)
         return knl
 
     def __call__(self, queue, **kwargs):
@@ -831,6 +837,7 @@ class M2LPostprocessLocal(E2EBase):
         for optimization in optimizations:
             knl = optimization(knl)
         knl = lp.add_inames_for_unused_hw_axes(knl)
+        knl = register_optimization_preambles(knl, self.device)
         return knl
 
     def __call__(self, queue, **kwargs):
diff --git a/sumpy/e2p.py b/sumpy/e2p.py
index 0ec1c48b..55af69b3 100644
--- a/sumpy/e2p.py
+++ b/sumpy/e2p.py
@@ -26,6 +26,7 @@ import numpy as np
 import loopy as lp
 
 from sumpy.tools import KernelCacheMixin, gather_loopy_arguments
+from sumpy.codegen import register_optimization_preambles
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
 
@@ -198,6 +199,7 @@ class E2PFromSingleBox(E2PBase):
         knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
+        knl = register_optimization_preambles(knl, self.device)
 
         return knl
 
@@ -324,6 +326,8 @@ class E2PFromCSR(E2PBase):
         knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
+        knl = register_optimization_preambles(knl, self.device)
+
         return knl
 
     def __call__(self, queue, **kwargs):
diff --git a/sumpy/p2e.py b/sumpy/p2e.py
index fe52f6b9..acc7e726 100644
--- a/sumpy/p2e.py
+++ b/sumpy/p2e.py
@@ -25,6 +25,7 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
 from sumpy.tools import KernelCacheMixin, KernelComputation
+from sumpy.codegen import register_optimization_preambles
 
 import logging
 logger = logging.getLogger(__name__)
@@ -118,6 +119,7 @@ class P2EBase(KernelCacheMixin, KernelComputation):
         knl = self._allow_redundant_execution_of_knl_scaling(knl)
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
+        knl = register_optimization_preambles(knl, self.device)
         return knl
 
     def __call__(self, queue, **kwargs):
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 0b986c7d..9c4302d6 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -29,6 +29,7 @@ from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
 from sumpy.tools import (
         KernelComputation, KernelCacheMixin, is_obj_array_like)
+from sumpy.codegen import register_optimization_preambles
 
 
 __doc__ = """
@@ -190,7 +191,6 @@ class P2PBase(KernelCacheMixin, KernelComputation):
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
 
-        from sumpy.codegen import register_optimization_preambles
         knl = register_optimization_preambles(knl, self.device)
 
         return knl
@@ -411,6 +411,8 @@ class P2PMatrixSubsetGenerator(P2PBase):
         knl = self._allow_redundant_execution_of_knl_scaling(knl)
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
+        knl = register_optimization_preambles(knl, self.device)
+
         return knl
 
     def __call__(self, queue, targets, sources, tgtindices, srcindices, **kwargs):
@@ -717,7 +719,6 @@ class P2PFromCSR(P2PBase):
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
 
-        from sumpy.codegen import register_optimization_preambles
         knl = register_optimization_preambles(knl, self.device)
 
         return knl
-- 
GitLab