From 07ca1ee6df51ee463fdf321c2b4b2e3c7673ddf3 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Fri, 6 Jan 2023 06:38:33 +0530
Subject: [PATCH] Use a vec array for local_isrc* for coalesced array access
 (#151)

* Merge local_isrc and local_isrc_strength and tag as vec for coalesced access

* Use new name

* Add an explanation about the optimization

* Back to loopy main with renamed transform

Co-authored-by: Andreas Kloeckner <inform@tiker.net>
---
 sumpy/p2p.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index c24fa8d9..385e90b0 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -656,6 +656,18 @@ class P2PFromCSR(P2PBase):
             knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"})
             knl = lp.set_temporary_address_space(knl,
                 ["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL)
+
+            # By having a concatenated memory layout of the temporaries
+            # and marking the first axis as vec, we are transposing the
+            # arrays and also making the access of the source
+            # coordinates and the strength for each source a coalesced
+            # access of 256 bits (assuming double precision), which is
+            # optimized for NVIDIA GPUs. On an NVIDIA Titan V, this
+            # optimization led to an 8% speedup in performance.
+            knl = lp.concatenate_arrays(knl,
+                ["local_isrc", "local_isrc_strength"], "local_isrc")
+            knl = lp.tag_array_axes(knl, "local_isrc", "vec,C")
+
             knl = lp.add_inames_for_unused_hw_axes(knl)
             # knl = lp.set_options(knl, write_code=True)
 
-- 
GitLab