From 07ca1ee6df51ee463fdf321c2b4b2e3c7673ddf3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando <isuruf@gmail.com> Date: Fri, 6 Jan 2023 06:38:33 +0530 Subject: [PATCH] Use a vec array for local_isrc* for coalesced array access (#151) * Merge local_isrc and local_isrc_strength and tag as vec for coalesced access * Use new name * Add an explanation about the optimization * Back to loopy main with renamed transform Co-authored-by: Andreas Kloeckner <inform@tiker.net> --- sumpy/p2p.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index c24fa8d9..385e90b0 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -656,6 +656,18 @@ class P2PFromCSR(P2PBase): knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"}) knl = lp.set_temporary_address_space(knl, ["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL) + + # By having a concatenated memory layout of the temporaries + # and marking the first axis as vec, we are transposing the + # arrays and also making the access of the source + # co-ordinates and the strength for each source a coalesced + # access of 256 bits (assuming double precision) which is + # optimized for NVIDIA GPUs. On an NVIDIA Titan V, this + # optimization led to an 8% speedup in the performance. + knl = lp.concatenate_arrays(knl, + ["local_isrc", "local_isrc_strength"], "local_isrc") + knl = lp.tag_array_axes(knl, "local_isrc", "vec,C") + knl = lp.add_inames_for_unused_hw_axes(knl) # knl = lp.set_options(knl, write_code=True) -- GitLab