# NOTE(review): this patch arrived with its line breaks and indentation
# collapsed onto one line. The layout below is reconstructed from the hunk
# line counts and from Python syntax; whitespace-only context lines and
# continuation-line indents are best guesses -- confirm against the sumpy
# repository before applying.
#
# sumpy/codegen.py -- adds two module-level functions after
# register_bessel_callables:
#
#   * _fp_contract_fast_preamble(preamble_info): a generator that yields a
#     single 2-tuple, ("fp_contract_fast_pocl",
#     "#pragma clang fp contract(fast)"). The preamble_info argument is not
#     read. Presumably the tuple is (identifier, preamble text) as expected
#     by loopy preamble generators -- TODO confirm.
#
#   * register_optimization_preambles(loopy_knl, device): registers the
#     generator above via lp.register_preamble_generators only when all of
#     the following hold:
#       - loopy_knl.target is an lp.PyOpenCLTarget,
#       - device.platform.name == "Portable Computing Language",
#       - device.type has the cl.device_type.GPU bit set.
#     Otherwise the kernel is returned as it was passed in. pyopencl is
#     imported inside the PyOpenCLTarget branch rather than at module level.
diff --git a/sumpy/codegen.py b/sumpy/codegen.py
index fa13d52696652f17d0be6a404ea9bf67f624d0dc..3b19fcb9ef7c821399cc3cf830cb8fe198625c6d 100644
--- a/sumpy/codegen.py
+++ b/sumpy/codegen.py
@@ -208,6 +208,20 @@ def register_bessel_callables(loopy_knl):
             Hankel1_01("hank1_01"))
     return loopy_knl
 
+
+def _fp_contract_fast_preamble(preamble_info):
+    yield ("fp_contract_fast_pocl", "#pragma clang fp contract(fast)")
+
+
+def register_optimization_preambles(loopy_knl, device):
+    if isinstance(loopy_knl.target, lp.PyOpenCLTarget):
+        import pyopencl as cl
+        if device.platform.name == "Portable Computing Language" and \
+                (device.type & cl.device_type.GPU):
+            loopy_knl = lp.register_preamble_generators(loopy_knl,
+                    [_fp_contract_fast_preamble])
+    return loopy_knl
+
 # }}}
 
 
# sumpy/p2p.py -- two hunks, one inside class P2PBase and one inside class
# P2PFromCSR (per the hunk headers; the enclosing method names are not
# visible in this patch). In both places the kernel held in `knl` is passed
# through register_optimization_preambles(knl, self.device) immediately
# before `return knl`. The helper is imported from sumpy.codegen at the call
# site rather than at the top of the module.
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 4be5916da17f041134e50c2015d607128074a0fd..d03d8bd1f80c5e4ca3f9b870c41308f8cd5d3fb4 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -190,6 +190,9 @@ class P2PBase(KernelCacheMixin, KernelComputation):
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
 
+        from sumpy.codegen import register_optimization_preambles
+        knl = register_optimization_preambles(knl, self.device)
+
         return knl
 
 
@@ -714,6 +717,9 @@ class P2PFromCSR(P2PBase):
         knl = lp.set_options(knl,
                 enforce_variable_access_ordered="no_check")
 
+        from sumpy.codegen import register_optimization_preambles
+        knl = register_optimization_preambles(knl, self.device)
+
         return knl
 
     def __call__(self, queue, **kwargs):