diff --git a/sumpy/p2p.py b/sumpy/p2p.py index c24fa8d9154eeda184f0f394df212e51eab4c3e4..385e90b031a1b4ce4bbc478d82bfbdb8db1e609e 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -656,6 +656,18 @@ class P2PFromCSR(P2PBase): knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"}) knl = lp.set_temporary_address_space(knl, ["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL) + + # By having a concatenated memory layout of the temporaries + # and marking the first axis as vec, we are transposing the + # arrays and also making the access of the source + # co-ordinates and the strength for each source a coalesced + # access of 256 bits (assuming double precision) which is + # optimized for NVIDIA GPUs. On an NVIDIA Titan V, this + # optimization led to an 8% speedup in performance. + knl = lp.concatenate_arrays(knl, + ["local_isrc", "local_isrc_strength"], "local_isrc") + knl = lp.tag_array_axes(knl, "local_isrc", "vec,C") + knl = lp.add_inames_for_unused_hw_axes(knl) # knl = lp.set_options(knl, write_code=True)