From 656dd729e74d9dff68268072bd161c1cfce98b35 Mon Sep 17 00:00:00 2001
From: Isuru Fernando
Date: Wed, 18 Oct 2023 11:28:44 -0500
Subject: [PATCH] improve fallback for vec types (#175)

* improve fallback for vec types

* fix flake8
---
 sumpy/p2p.py | 50 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 9c4302d6..c5acdd9d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -661,13 +661,14 @@ class P2PFromCSR(P2PBase):
         return loopy_knl
 
     def get_optimized_kernel(self, max_nsources_in_one_box,
-            max_ntargets_in_one_box, dtype_size):
+            max_ntargets_in_one_box, source_dtype, strength_dtype):
         if not self.is_gpu:
             knl = self.get_kernel(max_nsources_in_one_box,
                 max_ntargets_in_one_box)
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
             knl = self._allow_redundant_execution_of_knl_scaling(knl)
         else:
+            dtype_size = np.dtype(strength_dtype).alignment
             work_items_per_group = min(256, max_ntargets_in_one_box)
             total_local_mem = max_nsources_in_one_box * \
                 (self.dim + self.strength_count) * dtype_size
@@ -682,6 +683,10 @@ class P2PFromCSR(P2PBase):
             knl = lp.set_temporary_address_space(knl,
                 ["local_isrc", "local_isrc_strength"],
                 lp.AddressSpace.LOCAL)
+            local_arrays = ["local_isrc", "local_isrc_strength"]
+            local_array_isrc_axis = [1, 1]
+            local_array_sizes = [self.dim, self.strength_count]
+            local_array_dtypes = [source_dtype, strength_dtype]
             # By having a concatenated memory layout of the temporaries
             # and marking the first axis as vec, we are transposing
             # the arrays and also making the access of the source
@@ -689,15 +694,33 @@ class P2PFromCSR(P2PBase):
             # access of 256 bits (assuming double precision) which is
             # optimized for NVIDIA GPUs. On an NVIDIA Titan V, this
             # optimization led to an 8% speedup in performance.
-            knl = lp.concatenate_arrays(knl,
-                ["local_isrc", "local_isrc_strength"], "local_isrc")
-            count = self.strength_count + self.dim
-            if count in [2, 3, 4, 8, 16]:
-                knl = lp.tag_array_axes(knl, "local_isrc", "vec,C")
+            if strength_dtype == source_dtype:
+                knl = lp.concatenate_arrays(knl, local_arrays, "local_isrc")
+                local_arrays = ["local_isrc"]
+                local_array_sizes = [self.dim + self.strength_count]
+                local_array_dtypes = [source_dtype]
+            # We try to mark the local arrays (sources, strengths)
+            # as vec for the first dimension
+            for i, (array_name, array_size, array_dtype) in \
+                    enumerate(zip(local_arrays, local_array_sizes,
+                                  local_array_dtypes)):
+                if array_dtype not in [np.float32, np.float64]:
+                    # pyopencl does not support complex data type vectors
+                    continue
+                if array_size in [2, 3, 4, 8, 16]:
+                    knl = lp.tag_array_axes(knl, array_name, "vec,C")
+                else:
+                    # FIXME: check if CUDA
+                    n = 16 // dtype_size
+                    if n in [1, 2, 4, 8]:
+                        knl = lp.split_array_axis(knl, array_name, 0, n)
+                        knl = lp.tag_array_axes(knl, array_name, "C,vec,C")
+                        local_array_isrc_axis[i] = 2
             # We need to split isrc_prefetch and isrc_offset into chunks.
             nsources = (max_nsources_in_one_box + nprefetch - 1) // nprefetch
-            knl = lp.split_array_axis(knl, "local_isrc", 1, nsources)
+            for local_array, axis in zip(local_arrays, local_array_isrc_axis):
+                knl = lp.split_array_axis(knl, local_array, axis, nsources)
             knl = lp.split_iname(knl, "isrc_prefetch", nsources,
                 outer_iname="iprefetch")
             knl = lp.split_iname(knl, "isrc_prefetch_inner",
                 work_items_per_group)
@@ -709,7 +732,7 @@ class P2PFromCSR(P2PBase):
             # be as large as before.
             # Need to simplify before unprivatizing
             knl = lp.simplify_indices(knl)
             knl = lp.unprivatize_temporaries_with_inames(knl,
-                "iprefetch", only_var_names="local_isrc")
+                "iprefetch", only_var_names=local_arrays)
             knl = lp.add_inames_to_insn(knl, "inner",
                 "id:init_* or id:*_scaling or id:src_box_insn_*")
@@ -728,14 +751,19 @@ class P2PFromCSR(P2PBase):
         max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box")
 
         if self.is_gpu:
-            dtype_size = kwargs.get("sources")[0].dtype.alignment
+            source_dtype = kwargs.get("sources")[0].dtype
+            strength_dtype = kwargs.get("strength").dtype
         else:
-            dtype_size = None
+            # these are unused on non-GPU targets and would defeat the
+            # caching; set them to None to keep the kernel cached across dtypes
+            source_dtype = None
+            strength_dtype = None
 
         knl = self.get_cached_kernel_executor(
             max_nsources_in_one_box=max_nsources_in_one_box,
             max_ntargets_in_one_box=max_ntargets_in_one_box,
-            dtype_size=dtype_size,
+            source_dtype=source_dtype,
+            strength_dtype=strength_dtype,
         )
         return knl(queue, **kwargs)
 
-- 
GitLab
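
A note on the fallback in the middle hunk: an array axis can only be tagged
"vec" when its length is one of the vector lengths OpenCL actually defines
(2, 3, 4, 8, 16) and its dtype has a vector type in pyopencl, which complex
dtypes do not. Otherwise the patch first splits the axis by
n = 16 // dtype_size so that each vector fills a 16-byte load; the split
inserts a new axis, shifting the isrc axis from 1 to 2, which is why
local_array_isrc_axis is updated. The sketch below isolates that decision
logic; it is not part of the patch, and the helper plan_vectorization and
its return convention are made up for illustration.

    import numpy as np

    # Vector lengths for which OpenCL defines vector types (double2,
    # float4, ...). Anything else cannot be tagged "vec" directly.
    OPENCL_VEC_LENGTHS = (2, 3, 4, 8, 16)


    def plan_vectorization(axis_length, dtype):
        """Decide how to vectorize the leading axis of a local array.

        Returns ("vec", None) to tag axis 0 directly ("vec,C" in the
        patch), ("split", n) to split axis 0 by n and tag the inner
        axis as vec ("C,vec,C"), or ("none", None) to leave it scalar.
        """
        dtype = np.dtype(dtype)
        if dtype.kind != "f":
            # the patch only vectorizes real floating-point arrays;
            # pyopencl has no complex vector types
            return ("none", None)
        if axis_length in OPENCL_VEC_LENGTHS:
            return ("vec", None)
        # Fallback: choose n so one vector spans a 16-byte (128-bit)
        # load, e.g. n=2 for float64, n=4 for float32.
        n = 16 // dtype.alignment
        if n in (1, 2, 4, 8):
            return ("split", n)
        return ("none", None)


    # dim + strength_count == 4 doubles maps straight onto double4:
    print(plan_vectorization(4, np.float64))     # ('vec', None)
    # a length-6 axis is not a valid vector length; split into double2:
    print(plan_vectorization(6, np.float64))     # ('split', 2)
    # complex strengths stay unvectorized:
    print(plan_vectorization(6, np.complex128))  # ('none', None)

For instance, three float64 coordinates concatenated with one float64
strength give a length-4 leading axis that maps straight onto double4,
while an axis whose length is not a defined vector length falls back to
splitting, by n = 2 for float64 and n = 4 for float32. Whether the 128-bit
heuristic is also the right choice for CUDA targets is left open by the
patch's FIXME.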