diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 896d34c97637a42c8ef716d0734fcf84cb515211..4b0590f0079376f356a48b9a3f44675655b731c1 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -33,7 +33,7 @@ import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var -from pytools import memoize_in +from pytools import memoize_method, memoize_in from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -293,6 +293,97 @@ class P2PMatrixGenerator(P2PBase): # {{{ P2P matrix block writer +class P2PMatrixLinearIndex(object): + def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges): + self.queue = queue + self.tgtindices = tgtindices + self.tgtranges = tgtranges + self.srcindices = srcindices + self.srcranges = srcranges + + @memoize_method + def block_ranges(self): + @memoize_in(self, "linear_cumsum_knl") + def cumsum(): + loopy_knl = lp.make_kernel( + "{[i, j]: 0 <= i < nranges and 0 <= j <= i}", + """ + blkranges[0] = 0 + blkranges[i + 1] = reduce(sum, j, \ + (srcranges[j + 1] - srcranges[j]) * \ + (tgtranges[j + 1] - tgtranges[j])) \ + """, + [ + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"), + lp.ValueArg("nranges", None) + ], + name="linear_cumsum_knl", + default_offset=lp.auto, + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True, + force_outer_iname_for_scan="i") + return loopy_knl + + _, (blkranges,) = cumsum()(self.queue, + tgtranges=self.tgtranges, srcranges=self.srcranges); + + return blkranges + + @memoize_method + def linear_indices(self): + @memoize_in(self, "linear_index_knl") + def linear_index(): + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}" + ], + """ + for irange + <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange] + <> nsrcblock = srcranges[irange + 1] - srcranges[irange] + + for itgt, isrc + <> imat = blkranges[irange] + (nsrcblock * itgt + isrc) + + rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \ + {id_prefix=write_index} + colindices[imat] = srcindices[srcranges[irange] + isrc] \ + {id_prefix=write_index} + end + end + """, + [ + lp.GlobalArg("srcindices", None), + lp.GlobalArg("tgtindices", None), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.GlobalArg("blkranges", None, shape="nranges + 1"), + lp.GlobalArg("rowindices", None, shape="nresults"), + lp.GlobalArg("colindices", None, shape="nresults"), + lp.ValueArg("nresults", None), + lp.ValueArg("nranges", None), + "..." + ], + name="linear_index_knl", + default_offset=lp.auto, + silenced_warnings="write_race(write_index*)", + lang_version=MOST_RECENT_LANGUAGE_VERSION) + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + + return loopy_knl + + blkranges = self.block_ranges() + _, (rowindices, colindices) = linear_index()(self.queue, + tgtindices=self.tgtindices, srcindices=self.srcindices, + tgtranges=self.tgtranges, srcranges=self.srcranges, + blkranges=blkranges, nresults=blkranges[-1]) + + return rowindices, colindices, blkranges + + class P2PMatrixBlockGenerator(P2PBase): """Generator for a subset of P2P interaction matrix entries.""" @@ -362,71 +453,27 @@ class P2PMatrixBlockGenerator(P2PBase): knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0") return knl - def __call__(self, queue, targets, sources, tgtindices, srcindices, - tgtranges, srcranges, **kwargs): - @memoize_in(self, "block_cumsum_knl") - def cumsum(): - loopy_knl = lp.make_kernel( - "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}", - """ - blkranges[0] = 0.0 - blkranges[i] = reduce(sum, j, \ - (srcranges[j] - srcranges[j - 1]) * \ - (tgtranges[j] - tgtranges[j - 1])) \ - """, - [ - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"), - lp.ValueArg("nranges", None) - ], - name="block_cumsum_knl", - default_offset=lp.auto, - lang_version=MOST_RECENT_LANGUAGE_VERSION) - - loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True, - force_outer_iname_for_scan="i") - return loopy_knl - - @memoize_in(self, "block_linear_index_knl") - def linear_index(): - loopy_knl = lp.make_kernel([ - "{[irange]: 0 <= irange < nranges}", - "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}" - ], - """ - for irange - <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange] - <> nsrcblock = srcranges[irange + 1] - srcranges[irange] - - for itgt, isrc - <> imat = blkranges[irange] + (nsrcblock * itgt + isrc) - - rowindices[imat] = tgtindices[tgtranges[irange] + itgt] - colindices[imat] = srcindices[srcranges[irange] + isrc] - end - end - """, - [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.GlobalArg("blkranges", None, shape="nranges + 1"), - lp.GlobalArg("rowindices", None, shape="nresults"), - lp.GlobalArg("colindices", None, shape="nresults"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nresults", None), - lp.ValueArg("nranges", None) - ], - name="block_linear_prefix_knl", - default_offset=lp.auto, - lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") - - return loopy_knl - + def __call__(self, queue, targets, sources, index_set, **kwargs): + """Construct a set of blocks from the full P2P interaction matrix. + The relevant subset of indices for block :math:`i` is given by the + row indices `tgtindices[tgtranges[i]:tgtranges[i + 1]]` and the + column indices `srcindices[srcranges[i]:srcranges[i + 1]]`. + + :arg tgtindices: list of row indices for all blocks. + :arg srcindices: list of column indices for all blocks. + :arg tgtranges: list used to index `tgtindices` in each block. + :arg srcindices: list used to index `srcindices` in each block. + + :return: a tuple containing `(kernel_1_block, ... kernel_n_block, + rowindices, colindices, blkranges)`. The set `(rowindices, colindices)` + can be used to place each element of each block in the full matrix. + Each `kernel_i_block` is a one-dimensional array containing all the + blocks, i.e. the ith kernel evaluated at each of the input + source-target pairs. To retrieve a given block :math:`j`, one can + use `kernel_i_block[blkranges[j]:blkranges[j + 1]]`, which can then + be reshaped into the original block size using the `tgtranges` and + `srcranges` index lists. + """ from pytools.obj_array import is_obj_array knl = self.get_cached_optimized_kernel( targets_is_obj_array=( @@ -434,17 +481,10 @@ class P2PMatrixBlockGenerator(P2PBase): sources_is_obj_array=( is_obj_array(sources) or isinstance(sources, (tuple, list)))) - _, (blkranges,) = cumsum()(queue, - tgtranges=tgtranges, srcranges=srcranges) - _, (rowindices, colindices,) = linear_index()(queue, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, - blkranges=blkranges, nresults=blkranges[-1]) - evt, results = knl(queue, targets=targets, sources=sources, + rowindices, colindices, _ = index_set.linear_indices() + return knl(queue, targets=targets, sources=sources, tgtindices=rowindices, srcindices=colindices, **kwargs) - return evt, tuple(list(results) + [rowindices, colindices, blkranges]) - # }}} diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index 12198a396941af4f9acd9712e19c580f74c6f35d..984fbc5b79ba0939d2317956d6ec58b19435a093 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -161,6 +161,7 @@ def test_p2p_direct(ctx_getter, exclude_self, factor): mode_nr = 25 from sumpy.p2p import P2P + from sumpy.p2p import P2PMatrixLinearIndex from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator lpot = P2P(ctx, [lknl], exclude_self=exclude_self) mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self) @@ -190,11 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self, factor): eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps - _, (blk, rowindices, colindices, blkranges) = \ - blk_gen(queue, targets, sources, - tgtindices, srcindices, tgtranges, srcranges, - **extra_kwargs) + index_set = P2PMatrixLinearIndex(queue, + tgtindices, srcindices, tgtranges, srcranges) + _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs) + rowindices, colindices, _ = index_set.linear_indices() eps = 1.0e-10 * la.norm(mat) assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps