diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 38bec39d3b4dbd7efe6229c563f391c29fdd3efb..97756a55d5690219317d930f1ae43debc769a682 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -33,6 +33,7 @@ import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var +from pytools import memoize_in from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -306,68 +307,44 @@ class P2PMatrixBlockGenerator(P2PBase): arguments = ( self.get_default_src_tgt_arguments() + [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", None), - lp.ValueArg("ntgtindices", None), - lp.ValueArg("nranges", None) + lp.GlobalArg("srcindices", None, shape="nresults"), + lp.GlobalArg("tgtindices", None, shape="nresults"), + lp.ValueArg("nresults", None) ] + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntgtindices, nsrcindices") + [lp.GlobalArg("result_%d" % i, dtype, shape="nresults") for i, dtype in enumerate(self.value_dtypes)]) - loopy_knl = lp.make_kernel([ - "{[irange]: 0 <= irange < nranges}", - "{[itgt, isrc, idim]: \ - 0 <= itgt < ntgtblock and \ - 0 <= isrc < nsrcblock and \ - 0 <= idim < dim}", - ], + loopy_knl = lp.make_kernel( + "{[imat, idim]: 0 <= imat < nresults and 0 <= idim < dim}", self.get_kernel_scaling_assignments() + [""" - for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> ntgtblock = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> nsrcblock = srcend - srcstart - - for itgt, isrc - <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ - sources[idim, srcindices[srcstart + isrc]] + for imat + <> d[idim] = targets[idim, tgtindices[imat]] - \ + sources[idim, srcindices[imat]] """] + [""" - <> is_self = (srcindices[srcstart + isrc] == - target_to_source[tgtindices[tgtstart + itgt]]) + <> is_self = (srcindices[imat] == + target_to_source[tgtindices[imat]]) """ if self.exclude_self else ""] + loopy_insns + kernel_exprs + [""" - result_{i}[tgtstart + itgt, srcstart + isrc] = \ - knl_{i}_scaling * pair_result_{i} \ - {{id_prefix=write_p2p,inames=irange}} + result_{i}[imat] = \ + knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_p2p}} """.format(i=iknl) for iknl in range(len(self.kernels))] - + [""" - end - end - """], + + ["end"], arguments, - assumptions="nranges>=1", + assumptions="nresults>=1", silenced_warnings="write_race(write_p2p*)", name=self.name, fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.add_dtypes(loopy_knl, dict( - nsources=np.int32, - ntargets=np.int32, - ntgtindices=np.int32, - nsrcindices=np.int32)) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) + for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -382,11 +359,74 @@ class P2PMatrixBlockGenerator(P2PBase): if targets_is_obj_array: knl = lp.tag_array_axes(knl, "targets", "sep,C") - knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0") + knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0") return knl def __call__(self, queue, targets, sources, tgtindices, srcindices, tgtranges, srcranges, **kwargs): + @memoize_in(self, "block_cumsum_knl") + def cumsum(): + loopy_knl = lp.make_kernel( + "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}", + """ + blkprefix[0] = 0.0 + blkprefix[i] = reduce(sum, j, \ + (srcranges[j] - srcranges[j - 1]) * \ + (tgtranges[j] - tgtranges[j - 1])) \ + """, + [ + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("blkprefix", np.int32, shape="nranges + 1"), + lp.ValueArg("nranges", None) + ], + name="block_cumsum_knl", + default_offset=lp.auto, + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True, + force_outer_iname_for_scan="i") + return loopy_knl + + @memoize_in(self, "block_linear_index_knl") + def linear_index(): + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}" + ], + """ + for irange + <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange] + <> nsrcblock = srcranges[irange + 1] - srcranges[irange] + + for itgt, isrc + <> imat = blkprefix[irange] + (nsrcblock * itgt + isrc) + + rowindices[imat] = tgtindices[tgtranges[irange] + itgt] + colindices[imat] = srcindices[srcranges[irange] + isrc] + end + end + """, + [ + lp.GlobalArg("srcindices", None, shape="nsrcindices"), + lp.GlobalArg("tgtindices", None, shape="ntgtindices"), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.GlobalArg("blkprefix", None, shape="nranges + 1"), + lp.GlobalArg("rowindices", None, shape="nresults"), + lp.GlobalArg("colindices", None, shape="nresults"), + lp.ValueArg("nsrcindices", np.int32), + lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nresults", None), + lp.ValueArg("nranges", None) + ], + name="block_linear_prefix_knl", + default_offset=lp.auto, + lang_version=MOST_RECENT_LANGUAGE_VERSION) + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + + return loopy_knl + from pytools.obj_array import is_obj_array knl = self.get_cached_optimized_kernel( targets_is_obj_array=( @@ -394,9 +434,16 @@ class P2PMatrixBlockGenerator(P2PBase): sources_is_obj_array=( is_obj_array(sources) or isinstance(sources, (tuple, list)))) - return knl(queue, targets=targets, sources=sources, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, **kwargs) + _, (blkprefix,) = cumsum()(queue, + tgtranges=tgtranges, srcranges=srcranges) + _, (rowindices, colindices,) = linear_index()(queue, + tgtindices=tgtindices, srcindices=srcindices, + tgtranges=tgtranges, srcranges=srcranges, + blkprefix=blkprefix, nresults=blkprefix[-1]) + evt, results = knl(queue, targets=targets, sources=sources, + tgtindices=rowindices, srcindices=colindices, **kwargs) + + return evt, tuple(list(results) + [rowindices, colindices]) # }}} diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index eb85052b8d68e24ad46a3be8ad05663600047e0a..e784529e0baaea3416f0db64c100ea3d5836c934 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -63,15 +63,22 @@ def create_arguments(n, mode, target_radius=1.0): return targets, sources, centers, sigma, expansion_radii -def create_subset(indices, ranges, factor): - indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object) - for i in range(ranges.shape[0] - 1): - iidx = indices[np.s_[ranges[i]:ranges[i + 1]]] - indices_[i] = np.sort(np.random.choice(iidx, - size=int(factor * len(iidx)), replace=False)) +def create_index_subset(nnodes, nblks, factor): + indices = np.arange(0, nnodes) + ranges = np.arange(0, nnodes + 1, nnodes // nblks) - ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_]) - indices_ = np.hstack(indices_) + if abs(factor - 1.0) < 1.0e-14: + ranges_ = ranges + indices_ = indices + else: + indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object) + for i in range(ranges.shape[0] - 1): + iidx = indices[np.s_[ranges[i]:ranges[i + 1]]] + indices_[i] = np.sort(np.random.choice(iidx, + size=int(factor * len(iidx)), replace=False)) + + ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_]) + indices_ = np.hstack(indices_) return indices_, ranges_ @@ -139,11 +146,12 @@ def test_qbx_direct(ctx_getter): assert la.norm(blk[itgt, isrc] - mat[block]) < eps -@pytest.mark.parametrize("exclude_self", [True, False]) -def test_p2p_direct(ctx_getter, exclude_self): - # This evaluates a single layer potential on a circle. +@pytest.mark.parametrize("exclude_self", "factor", [ + (True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6) + ]) +def test_p2p_direct(ctx_getter, exclude_self, factor): + # This does a point-to-point kernel evaluation on a circle. logging.basicConfig(level=logging.INFO) - ctx = ctx_getter() queue = cl.CommandQueue(ctx) @@ -166,15 +174,8 @@ def test_p2p_direct(ctx_getter, exclude_self): h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices = np.arange(0, n) - tgtranges = np.arange(0, tgtindices.shape[0] + 1, - tgtindices.shape[0] // nblks) - tgtindices, tgtranges = create_subset(tgtindices, tgtranges, 0.6) - - srcindices = np.arange(0, n) - srcranges = np.arange(0, srcindices.shape[0] + 1, - srcindices.shape[0] // nblks) - srcindices, srcranges = create_subset(srcindices, srcranges, 0.6) + tgtindices, tgtranges = create_index_subset(n, nblks, factor) + srcindices, srcranges = create_index_subset(n, nblks, factor) assert tgtranges.shape == srcranges.shape extra_kwargs = {} @@ -190,17 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self): eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps - _, (blk,) = blk_gen(queue, targets, sources, - tgtindices, srcindices, tgtranges, srcranges, - **extra_kwargs) + _, (blk, rowindices, colindices) = blk_gen(queue, targets, sources, + tgtindices, srcindices, tgtranges, srcranges, **extra_kwargs) - for i in range(srcranges.shape[0] - 1): - itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] - isrc = np.s_[srcranges[i]:srcranges[i + 1]] - block = np.ix_(tgtindices[itgt], srcindices[isrc]) - - eps = 1.0e-10 * la.norm(mat[block]) - assert la.norm(blk[itgt, isrc] - mat[block]) < eps + eps = 1.0e-10 * la.norm(mat) + assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps # You can test individual routines by typing