diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 38bec39d3b4dbd7efe6229c563f391c29fdd3efb..c408360d7aac0985f49b5feaf7dc770d37790956 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -293,7 +293,10 @@ class P2PMatrixGenerator(P2PBase): # {{{ P2P matrix block writer class P2PMatrixBlockGenerator(P2PBase): - """Generator for a subset of P2P interaction matrix entries.""" + """Generator for a subset of P2P interaction matrix entries. + + .. automethod:: __call__ + """ default_name = "p2p_block" @@ -306,68 +309,44 @@ class P2PMatrixBlockGenerator(P2PBase): arguments = ( self.get_default_src_tgt_arguments() + [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", None), - lp.ValueArg("ntgtindices", None), - lp.ValueArg("nranges", None) + lp.GlobalArg("srcindices", None, shape="nresult"), + lp.GlobalArg("tgtindices", None, shape="nresult"), + lp.ValueArg("nresult", None) ] + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntgtindices, nsrcindices") + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult") for i, dtype in enumerate(self.value_dtypes)]) - loopy_knl = lp.make_kernel([ - "{[irange]: 0 <= irange < nranges}", - "{[itgt, isrc, idim]: \ - 0 <= itgt < ntgtblock and \ - 0 <= isrc < nsrcblock and \ - 0 <= idim < dim}", - ], + loopy_knl = lp.make_kernel( + "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}", self.get_kernel_scaling_assignments() + [""" - for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> ntgtblock = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> nsrcblock = srcend - srcstart - - for itgt, isrc - <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ - sources[idim, srcindices[srcstart + isrc]] + for imat + <> d[idim] = targets[idim, tgtindices[imat]] - \ + sources[idim, srcindices[imat]] """] + [""" - <> is_self = (srcindices[srcstart + isrc] == - target_to_source[tgtindices[tgtstart + itgt]]) + <> is_self = (srcindices[imat] == + target_to_source[tgtindices[imat]]) """ if self.exclude_self else ""] + loopy_insns + kernel_exprs + [""" - result_{i}[tgtstart + itgt, srcstart + isrc] = \ - knl_{i}_scaling * pair_result_{i} \ - {{id_prefix=write_p2p,inames=irange}} + result_{i}[imat] = \ + knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_p2p}} """.format(i=iknl) for iknl in range(len(self.kernels))] - + [""" - end - end - """], + + ["end"], arguments, - assumptions="nranges>=1", + assumptions="nresult>=1", silenced_warnings="write_race(write_p2p*)", name=self.name, fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.add_dtypes(loopy_knl, dict( - nsources=np.int32, - ntargets=np.int32, - ntgtindices=np.int32, - nsrcindices=np.int32)) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) + for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -382,11 +361,32 @@ class P2PMatrixBlockGenerator(P2PBase): if targets_is_obj_array: knl = lp.tag_array_axes(knl, "targets", "sep,C") - knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0") + knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0") return knl - def __call__(self, queue, targets, sources, tgtindices, srcindices, - tgtranges, srcranges, **kwargs): + def __call__(self, queue, targets, sources, index_set, **kwargs): + """Construct a set of blocks of the full P2P interaction matrix. + + The blocks are returned as one-dimensional arrays, for performance + and storage reasons. If the two-dimensional form is desired, it can + be obtained using the information in the `index_set` for a block + :math:`i` in the following way: + + .. code-block:: python + + blkranges = index_set.linear_ranges() + m = index_set.tgtranges[i + 1] - index_set.tgtranges[i] + n = index_set.srcranges[i + 1] - index_set.srcranges[i] + + block2d = result[blkranges[i]:blkranges[i + 1]].reshape(m, n) + + :arg targets: target point coordinates. + :arg sources: source point coordinates. + :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used + to define the various blocks. + :return: a tuple of one-dimensional arrays of kernel evaluations at + target-source pairs described by `index_set`. + """ from pytools.obj_array import is_obj_array knl = self.get_cached_optimized_kernel( targets_is_obj_array=( @@ -394,9 +394,9 @@ class P2PMatrixBlockGenerator(P2PBase): sources_is_obj_array=( is_obj_array(sources) or isinstance(sources, (tuple, list)))) + tgtindices, srcindices = index_set.linear_indices() return knl(queue, targets=targets, sources=sources, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, **kwargs) + tgtindices=tgtindices, srcindices=srcindices, **kwargs) # }}} diff --git a/sumpy/qbx.py b/sumpy/qbx.py index f3b2d6f78964d3d4f5a88248f7b9a08d32b722eb..985565f10af078da6909f97424ab6ecfdd9ef2d9 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -189,7 +189,10 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): # {{{ direct applier class LayerPotential(LayerPotentialBase): - """Direct applier for the layer potential.""" + """Direct applier for the layer potential. + + .. automethod:: __call__ + """ default_name = "qbx_apply" @@ -321,7 +324,10 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): # {{{ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): - """Generator for a subset of the layer potential matrix entries.""" + """Generator for a subset of the layer potential matrix entries. + + .. automethod:: __call__ + """ default_name = "qbx_block" @@ -335,68 +341,44 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): arguments = ( self.get_default_src_tgt_arguments() + [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", None), - lp.ValueArg("ntgtindices", None), - lp.ValueArg("nranges", None) + lp.GlobalArg("srcindices", None, shape="nresult"), + lp.GlobalArg("tgtindices", None, shape="nresult"), + lp.ValueArg("nresult", None) ] + - [lp.GlobalArg("result_%d" % i, - dtype, shape="ntgtindices, nsrcindices") - for i, dtype in enumerate(self.value_dtypes)]) + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult") + for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ - "{[irange]: 0 <= irange < nranges}", - "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", - "{[idim]: 0 <= idim < dim}" + "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}" ], self.get_kernel_scaling_assignments() + [""" - for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> ntgtblock = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> nsrcblock = srcend - srcstart - - for itgt, isrc - <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \ - src[idim, srcindices[srcstart + isrc]] \ - {dup=idim} - <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \ - center[idim, tgtindices[tgtstart + itgt]] \ - {dup=idim} - <> rscale = expansion_radii[tgtindices[tgtstart + itgt]] + for imat + <> a[idim] = center[idim, tgtindices[imat]] - \ + src[idim, srcindices[imat]] {dup=idim} + <> b[idim] = tgt[idim, tgtindices[imat]] - \ + center[idim, tgtindices[imat]] {dup=idim} + <> rscale = expansion_radii[tgtindices[imat]] """] + loopy_insns + kernel_exprs + [""" - result_{i}[tgtstart + itgt, srcstart + isrc] = \ - knl_{i}_scaling * pair_result_{i} \ - {{id_prefix=write_lpot,inames=irange}} + result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_lpot}} """.format(i=iknl) for iknl in range(len(self.expansions))] - + [""" - end - end - """], + + ["end"], arguments, name=self.name, - assumptions="nranges>=1", + assumptions="nresult>=1", default_offset=lp.auto, silenced_warnings="write_race(write_lpot*)", fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.add_dtypes(loopy_knl, dict( - nsources=np.int32, - ntargets=np.int32, - ntgtindices=np.int32, - nsrcindices=np.int32)) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) + for expn in self.expansions: loopy_knl = expn.prepare_loopy_kernel(loopy_knl) @@ -405,17 +387,28 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def get_optimized_kernel(self): loopy_knl = self.get_kernel() - loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0") return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, tgtranges, srcranges, **kwargs): + index_set, **kwargs): + """ + :arg targets: target point coordinates. + :arg sources: source point coordinates. + :arg centers: QBX target expansion centers. + :arg expansion_radii: radii for each expansion center. + :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used + to define the various blocks. + :return: a tuple of one-dimensional arrays of kernel evaluations at + target-source pairs described by `index_set`. + """ + knl = self.get_cached_optimized_kernel() + tgtindices, srcindices = index_set.linear_indices() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, **kwargs) + tgtindices=tgtindices, srcindices=srcindices, **kwargs) # }}} diff --git a/sumpy/tools.py b/sumpy/tools.py index 964c64cc440c807c02b3f8811c6a3f0de6c84b04..b80deddd00ce330bffcaadaf112929c89821c745 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -1,6 +1,9 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2018 Alexandru Fikl +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,10 +27,13 @@ THE SOFTWARE. import six from six.moves import range, zip -from pytools import memoize_method +from pytools import memoize_method, memoize_in import numpy as np import sumpy.symbolic as sym +import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION + import logging logger = logging.getLogger(__name__) @@ -292,6 +298,136 @@ class KernelComputation(object): # }}} +# {{{ + +class MatrixBlockIndex(object): + def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges): + """Keep track of different ways to index into matrix blocks. + + In this description, a block is given by a subset of indices of + the global matrix operator. For example, the :math:`i`-th block of + a matrix `mat` using the input arguments is obtained by: + + .. code-block:: python + + mat[np.ix_(tgtindices[tgtranges[i]:tgtranges[i + 1]], + srcindices[srcranges[i]:srcranges[i + 1]])] + + :arg tgtindices: a subset of the full matrix row index set. + :arg srcindices: a subset of the full matrix column index set. + :arg tgtranges: an array used to index into `tgtindices`. + :arg srcranges: an array used to index into `srcindices`. + """ + + self.queue = queue + self.tgtindices = tgtindices + self.tgtranges = tgtranges + self.srcindices = srcindices + self.srcranges = srcranges + + @memoize_method + def linear_ranges(self): + """ + :return: an array containing the cummulative sum of all block sizes. + It can be used to index into a linear representation of the + matrix blocks. + """ + + @memoize_in(self, "block_cumsum_knl") + def cumsum(): + loopy_knl = lp.make_kernel( + "{[i, j]: 0 <= i < nranges and 0 <= j <= i}", + """ + blkranges[0] = 0 + blkranges[i + 1] = reduce(sum, j, \ + (srcranges[j + 1] - srcranges[j]) * \ + (tgtranges[j + 1] - tgtranges[j])) \ + """, + [ + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"), + lp.ValueArg("nranges", None) + ], + name="block_cumsum_knl", + default_offset=lp.auto, + assumptions="nranges>=1", + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True, + force_outer_iname_for_scan="i") + return loopy_knl + + _, (blkranges,) = cumsum()(self.queue, + tgtranges=self.tgtranges, srcranges=self.srcranges) + + return blkranges + + @memoize_method + def linear_indices(self): + """ + :return: a tuple of `(rowindices, colindices)` that can be used in + conjunction with :func:`linear_ranges` to provide linear indexing + into a set of matrix blocks. These index arrays are just the + concatenated Cartesian products of all the block arrays described + by :attr:`tgtindices` and :arg:`srcindices`. + + They can be used to index directly into the matrix as follows: + + .. code-block:: python + + mat[rowindices[blkranges[i]:blkranges[i + 1]], + colindices[blkranges[i]:blkranges[i + 1]]] + """ + + @memoize_in(self, "block_index_knl") + def linear_index(): + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}" + ], + """ + for irange + <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange] + <> nsrcblock = srcranges[irange + 1] - srcranges[irange] + + for itgt, isrc + <> imat = blkranges[irange] + (nsrcblock * itgt + isrc) + + rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \ + {id_prefix=write_index} + colindices[imat] = srcindices[srcranges[irange] + isrc] \ + {id_prefix=write_index} + end + end + """, + [ + lp.GlobalArg('blkranges', None, shape="nranges + 1"), + lp.GlobalArg("rowindices", None, shape="nresults"), + lp.GlobalArg("colindices", None, shape="nresults"), + lp.ValueArg("nresults", None), + '...' + ], + name="block_index_knl", + default_offset=lp.auto, + assumptions='nranges>=1', + silenced_warnings="write_race(write_index*)", + lang_version=MOST_RECENT_LANGUAGE_VERSION) + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + + return loopy_knl + + blkranges = self.linear_ranges() + _, (rowindices, colindices) = linear_index()(self.queue, + tgtindices=self.tgtindices, srcindices=self.srcindices, + tgtranges=self.tgtranges, srcranges=self.srcranges, + blkranges=blkranges, nresults=blkranges[-1]) + + return rowindices, colindices + +# }}} + + # {{{ OrderedSet # Source: http://code.activestate.com/recipes/576694-orderedset/ diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index 4e2b3ad7c449ba9acb37955cd167d94fe3c80d83..892fef63edaf37d4286c07a7916fc13aabc269cd 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -31,6 +31,8 @@ import pyopencl as cl from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from sumpy.tools import MatrixBlockIndex + import logging logger = logging.getLogger(__name__) @@ -63,7 +65,28 @@ def create_arguments(n, mode, target_radius=1.0): return targets, sources, centers, sigma, expansion_radii -def test_qbx_direct(ctx_getter): +def create_index_subset(nnodes, nblks, factor): + indices = np.arange(0, nnodes) + ranges = np.arange(0, nnodes + 1, nnodes // nblks) + + if abs(factor - 1.0) < 1.0e-14: + ranges_ = ranges + indices_ = indices + else: + indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object) + for i in range(ranges.shape[0] - 1): + iidx = indices[np.s_[ranges[i]:ranges[i + 1]]] + indices_[i] = np.sort(np.random.choice(iidx, + size=int(factor * len(iidx)), replace=False)) + + ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_]) + indices_ = np.hstack(indices_) + + return indices_, ranges_ + + +@pytest.mark.parametrize('factor', [1.0, 0.6]) +def test_qbx_direct(ctx_getter, factor): # This evaluates a single layer potential on a circle. logging.basicConfig(level=logging.INFO) @@ -94,14 +117,8 @@ def test_qbx_direct(ctx_getter): h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices = np.arange(0, n) - tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) - tgtranges = np.arange(0, tgtindices.shape[0] + 1, - tgtindices.shape[0] // nblks) - srcindices = np.arange(0, n) - srcindices = np.random.choice(srcindices, size=int(0.8 * n)) - srcranges = np.arange(0, srcindices.shape[0] + 1, - srcindices.shape[0] // nblks) + tgtindices, tgtranges = create_index_subset(n, nblks, factor) + srcindices, srcranges = create_index_subset(n, nblks, factor) assert tgtranges.shape == srcranges.shape _, (mat,) = mat_gen(queue, targets, sources, centers, @@ -114,23 +131,21 @@ def test_qbx_direct(ctx_getter): eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps + index_set = MatrixBlockIndex(queue, + tgtindices, srcindices, tgtranges, srcranges) _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, tgtranges, srcranges) - - for i in range(srcranges.shape[0] - 1): - itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] - isrc = np.s_[srcranges[i]:srcranges[i + 1]] - block = np.ix_(tgtindices[itgt], srcindices[isrc]) + index_set) - eps = 1.0e-10 * la.norm(mat[block]) - assert la.norm(blk[itgt, isrc] - mat[block]) < eps + rowindices, colindices = index_set.linear_indices() + eps = 1.0e-10 * la.norm(mat) + assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps -@pytest.mark.parametrize("exclude_self", [True, False]) -def test_p2p_direct(ctx_getter, exclude_self): - # This evaluates a single layer potential on a circle. +@pytest.mark.parametrize(("exclude_self", "factor"), + [(True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)]) +def test_p2p_direct(ctx_getter, exclude_self, factor): + # This does a point-to-point kernel evaluation on a circle. logging.basicConfig(level=logging.INFO) - ctx = ctx_getter() queue = cl.CommandQueue(ctx) @@ -153,14 +168,8 @@ def test_p2p_direct(ctx_getter, exclude_self): h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices = np.arange(0, n) - tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) - tgtranges = np.arange(0, tgtindices.shape[0] + 1, - tgtindices.shape[0] // nblks) - srcindices = np.arange(0, n) - srcindices = np.random.choice(srcindices, size=int(0.8 * n)) - srcranges = np.arange(0, srcindices.shape[0] + 1, - srcindices.shape[0] // nblks) + tgtindices, tgtranges = create_index_subset(n, nblks, factor) + srcindices, srcranges = create_index_subset(n, nblks, factor) assert tgtranges.shape == srcranges.shape extra_kwargs = {} @@ -176,17 +185,13 @@ def test_p2p_direct(ctx_getter, exclude_self): eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps - _, (blk,) = blk_gen(queue, targets, sources, - tgtindices, srcindices, tgtranges, srcranges, - **extra_kwargs) - - for i in range(srcranges.shape[0] - 1): - itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] - isrc = np.s_[srcranges[i]:srcranges[i + 1]] - block = np.ix_(tgtindices[itgt], srcindices[isrc]) + index_set = MatrixBlockIndex(queue, + tgtindices, srcindices, tgtranges, srcranges) + _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs) - eps = 1.0e-10 * la.norm(mat[block]) - assert la.norm(blk[itgt, isrc] - mat[block]) < eps + rowindices, colindices = index_set.linear_indices() + eps = 1.0e-10 * la.norm(mat) + assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps # You can test individual routines by typing