From f8a7e8d41da3f4aa91c629df4a46a8c7347c1b8c Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Sat, 14 Apr 2018 19:12:55 -0500
Subject: [PATCH 01/11] simplify test data generation

---
 test/test_matrixgen.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 4e2b3ad7..eb85052b 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -63,6 +63,19 @@ def create_arguments(n, mode, target_radius=1.0):
     return targets, sources, centers, sigma, expansion_radii
 
 
+def create_subset(indices, ranges, factor):
+    indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
+    for i in range(ranges.shape[0] - 1):
+        iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
+        indices_[i] = np.sort(np.random.choice(iidx,
+            size=int(factor * len(iidx)), replace=False))
+
+    ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
+    indices_ = np.hstack(indices_)
+
+    return indices_, ranges_
+
+
 def test_qbx_direct(ctx_getter):
     # This evaluates a single layer potential on a circle.
     logging.basicConfig(level=logging.INFO)
@@ -154,13 +167,14 @@ def test_p2p_direct(ctx_getter, exclude_self):
         strengths = (sigma * h,)
 
         tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
         tgtranges = np.arange(0, tgtindices.shape[0] + 1,
                               tgtindices.shape[0] // nblks)
+        tgtindices, tgtranges = create_subset(tgtindices, tgtranges, 0.6)
+
         srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
         srcranges = np.arange(0, srcindices.shape[0] + 1,
                               srcindices.shape[0] // nblks)
+        srcindices, srcranges = create_subset(srcindices, srcranges, 0.6)
         assert tgtranges.shape == srcranges.shape
 
         extra_kwargs = {}
-- 
GitLab


From ba2c9b44cd494a49b2835464190dde8d95ecf42c Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Sat, 14 Apr 2018 21:25:44 -0500
Subject: [PATCH 02/11] p2p: rework block generator to not waste huge amounts
 of space

---
 sumpy/p2p.py           | 141 +++++++++++++++++++++++++++--------------
 test/test_matrixgen.py |  57 ++++++++---------
 2 files changed, 120 insertions(+), 78 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 38bec39d..97756a55 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -33,6 +33,7 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
+from pytools import memoize_in
 from sumpy.tools import KernelComputation, KernelCacheWrapper
 
 
@@ -306,68 +307,44 @@ class P2PMatrixBlockGenerator(P2PBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", None),
-                lp.ValueArg("ntgtindices", None),
-                lp.ValueArg("nranges", None)
+                lp.GlobalArg("srcindices", None, shape="nresults"),
+                lp.GlobalArg("tgtindices", None, shape="nresults"),
+                lp.ValueArg("nresults", None)
             ] +
-            [lp.GlobalArg("result_%d" % i, dtype,
-                shape="ntgtindices, nsrcindices")
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresults")
              for i, dtype in enumerate(self.value_dtypes)])
 
-        loopy_knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[itgt, isrc, idim]: \
-                0 <= itgt < ntgtblock and \
-                0 <= isrc < nsrcblock and \
-                0 <= idim < dim}",
-            ],
+        loopy_knl = lp.make_kernel(
+            "{[imat, idim]: 0 <= imat < nresults and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
             + ["""
-                for irange
-                    <> tgtstart = tgtranges[irange]
-                    <> tgtend = tgtranges[irange + 1]
-                    <> ntgtblock = tgtend - tgtstart
-                    <> srcstart = srcranges[irange]
-                    <> srcend = srcranges[irange + 1]
-                    <> nsrcblock = srcend - srcstart
-
-                    for itgt, isrc
-                        <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \
-                                     sources[idim, srcindices[srcstart + isrc]]
+                for imat
+                    <> d[idim] = targets[idim, tgtindices[imat]] - \
+                                 sources[idim, srcindices[imat]]
             """]
             + ["""
-                        <> is_self = (srcindices[srcstart + isrc] ==
-                                      target_to_source[tgtindices[tgtstart + itgt]])
+                    <> is_self = (srcindices[imat] ==
+                                  target_to_source[tgtindices[imat]])
                 """ if self.exclude_self else ""]
             + loopy_insns + kernel_exprs
             + ["""
-                        result_{i}[tgtstart + itgt, srcstart + isrc] = \
-                            knl_{i}_scaling * pair_result_{i} \
-                                {{id_prefix=write_p2p,inames=irange}}
+                    result_{i}[imat] = \
+                        knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_p2p}}
                 """.format(i=iknl)
                 for iknl in range(len(self.kernels))]
-            + ["""
-                    end
-                end
-            """],
+            + ["end"],
             arguments,
-            assumptions="nranges>=1",
+            assumptions="nresults>=1",
             silenced_warnings="write_race(write_p2p*)",
             name=self.name,
             fixed_parameters=dict(dim=self.dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-        loopy_knl = lp.add_dtypes(loopy_knl, dict(
-            nsources=np.int32,
-            ntargets=np.int32,
-            ntgtindices=np.int32,
-            nsrcindices=np.int32))
-
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
+
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
@@ -382,11 +359,74 @@ class P2PMatrixBlockGenerator(P2PBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
         return knl
 
     def __call__(self, queue, targets, sources, tgtindices, srcindices,
             tgtranges, srcranges, **kwargs):
+        @memoize_in(self, "block_cumsum_knl")
+        def cumsum():
+            loopy_knl = lp.make_kernel(
+                "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}",
+                """
+                blkprefix[0] = 0.0
+                blkprefix[i] = reduce(sum, j, \
+                    (srcranges[j] - srcranges[j - 1]) * \
+                    (tgtranges[j] - tgtranges[j - 1])) \
+                """,
+                [
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkprefix", np.int32, shape="nranges + 1"),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_cumsum_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
+                force_outer_iname_for_scan="i")
+            return loopy_knl
+
+        @memoize_in(self, "block_linear_index_knl")
+        def linear_index():
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkprefix[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt]
+                        colindices[imat] = srcindices[srcranges[irange] + isrc]
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                    lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkprefix", None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nsrcindices", np.int32),
+                    lp.ValueArg("ntgtindices", np.int32),
+                    lp.ValueArg("nresults", None),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_linear_prefix_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
@@ -394,9 +434,16 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+        _, (blkprefix,) = cumsum()(queue,
+            tgtranges=tgtranges, srcranges=srcranges)
+        _, (rowindices, colindices,) = linear_index()(queue,
+            tgtindices=tgtindices, srcindices=srcindices,
+            tgtranges=tgtranges, srcranges=srcranges,
+            blkprefix=blkprefix, nresults=blkprefix[-1])
+        evt, results = knl(queue, targets=targets, sources=sources,
+            tgtindices=rowindices, srcindices=colindices, **kwargs)
+
+        return evt, tuple(list(results) + [rowindices, colindices])
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index eb85052b..e784529e 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -63,15 +63,22 @@ def create_arguments(n, mode, target_radius=1.0):
     return targets, sources, centers, sigma, expansion_radii
 
 
-def create_subset(indices, ranges, factor):
-    indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
-    for i in range(ranges.shape[0] - 1):
-        iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
-        indices_[i] = np.sort(np.random.choice(iidx,
-            size=int(factor * len(iidx)), replace=False))
+def create_index_subset(nnodes, nblks, factor):
+    indices = np.arange(0, nnodes)
+    ranges = np.arange(0, nnodes + 1, nnodes // nblks)
 
-    ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
-    indices_ = np.hstack(indices_)
+    if abs(factor - 1.0) < 1.0e-14:
+        ranges_ = ranges
+        indices_ = indices
+    else:
+        indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
+        for i in range(ranges.shape[0] - 1):
+            iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
+            indices_[i] = np.sort(np.random.choice(iidx,
+                size=int(factor * len(iidx)), replace=False))
+
+        ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
+        indices_ = np.hstack(indices_)
 
     return indices_, ranges_
 
@@ -139,11 +146,12 @@ def test_qbx_direct(ctx_getter):
             assert la.norm(blk[itgt, isrc] - mat[block]) < eps
 
 
-@pytest.mark.parametrize("exclude_self", [True, False])
-def test_p2p_direct(ctx_getter, exclude_self):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize("exclude_self", "factor", [
+    (True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)
+    ])
+def test_p2p_direct(ctx_getter, exclude_self, factor):
+    # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
-
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
@@ -166,15 +174,8 @@ def test_p2p_direct(ctx_getter, exclude_self):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        tgtindices, tgtranges = create_subset(tgtindices, tgtranges, 0.6)
-
-        srcindices = np.arange(0, n)
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
-        srcindices, srcranges = create_subset(srcindices, srcranges, 0.6)
+        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
+        srcindices, srcranges = create_index_subset(n, nblks, factor)
         assert tgtranges.shape == srcranges.shape
 
         extra_kwargs = {}
@@ -190,17 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk,) = blk_gen(queue, targets, sources,
-                            tgtindices, srcindices, tgtranges, srcranges,
-                            **extra_kwargs)
+        _, (blk, rowindices, colindices) = blk_gen(queue, targets, sources,
+            tgtindices, srcindices, tgtranges, srcranges, **extra_kwargs)
 
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
-
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        eps = 1.0e-10 * la.norm(mat)
+        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
 
 # You can test individual routines by typing
-- 
GitLab


From f82035124152bae6c6b72313dc0fcd0e32f874e0 Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Sat, 14 Apr 2018 21:46:06 -0500
Subject: [PATCH 03/11] p2p: also return the block ranges so we can separate
 them

---
 sumpy/p2p.py           | 16 ++++++++--------
 test/test_matrixgen.py | 11 ++++++-----
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 97756a55..896d34c9 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -369,15 +369,15 @@ class P2PMatrixBlockGenerator(P2PBase):
             loopy_knl = lp.make_kernel(
                 "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}",
                 """
-                blkprefix[0] = 0.0
-                blkprefix[i] = reduce(sum, j, \
+                blkranges[0] = 0.0
+                blkranges[i] = reduce(sum, j, \
                     (srcranges[j] - srcranges[j - 1]) * \
                     (tgtranges[j] - tgtranges[j - 1])) \
                 """,
                 [
                     lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
                     lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkprefix", np.int32, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
                     lp.ValueArg("nranges", None)
                 ],
                 name="block_cumsum_knl",
@@ -400,7 +400,7 @@ class P2PMatrixBlockGenerator(P2PBase):
                     <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
 
                     for itgt, isrc
-                        <> imat = blkprefix[irange] + (nsrcblock * itgt + isrc)
+                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
 
                         rowindices[imat] = tgtindices[tgtranges[irange] + itgt]
                         colindices[imat] = srcindices[srcranges[irange] + isrc]
@@ -412,7 +412,7 @@ class P2PMatrixBlockGenerator(P2PBase):
                     lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
                     lp.GlobalArg("srcranges", None, shape="nranges + 1"),
                     lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkprefix", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
                     lp.GlobalArg("rowindices", None, shape="nresults"),
                     lp.GlobalArg("colindices", None, shape="nresults"),
                     lp.ValueArg("nsrcindices", np.int32),
@@ -434,16 +434,16 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        _, (blkprefix,) = cumsum()(queue,
+        _, (blkranges,) = cumsum()(queue,
             tgtranges=tgtranges, srcranges=srcranges)
         _, (rowindices, colindices,) = linear_index()(queue,
             tgtindices=tgtindices, srcindices=srcindices,
             tgtranges=tgtranges, srcranges=srcranges,
-            blkprefix=blkprefix, nresults=blkprefix[-1])
+            blkranges=blkranges, nresults=blkranges[-1])
         evt, results = knl(queue, targets=targets, sources=sources,
             tgtindices=rowindices, srcindices=colindices, **kwargs)
 
-        return evt, tuple(list(results) + [rowindices, colindices])
+        return evt, tuple(list(results) + [rowindices, colindices, blkranges])
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index e784529e..12198a39 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -146,9 +146,8 @@ def test_qbx_direct(ctx_getter):
             assert la.norm(blk[itgt, isrc] - mat[block]) < eps
 
 
-@pytest.mark.parametrize("exclude_self", "factor", [
-    (True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)
-    ])
+@pytest.mark.parametrize(("exclude_self", "factor"),
+    [(True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)])
 def test_p2p_direct(ctx_getter, exclude_self, factor):
     # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
@@ -191,8 +190,10 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk, rowindices, colindices) = blk_gen(queue, targets, sources,
-            tgtindices, srcindices, tgtranges, srcranges, **extra_kwargs)
+        _, (blk, rowindices, colindices, blkranges) = \
+            blk_gen(queue, targets, sources,
+                    tgtindices, srcindices, tgtranges, srcranges,
+                    **extra_kwargs)
 
         eps = 1.0e-10 * la.norm(mat)
         assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
-- 
GitLab


From d63f96d29e7d5910f4739fe1ffa94562534c64dd Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Tue, 17 Apr 2018 20:17:38 -0500
Subject: [PATCH 04/11] p2p: add a cache object that keeps track of the indices
 in the various forms

---
 sumpy/p2p.py           | 190 +++++++++++++++++++++++++----------------
 test/test_matrixgen.py |   9 +-
 2 files changed, 120 insertions(+), 79 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 896d34c9..4b0590f0 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -33,7 +33,7 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
-from pytools import memoize_in
+from pytools import memoize_method, memoize_in
 from sumpy.tools import KernelComputation, KernelCacheWrapper
 
 
@@ -293,6 +293,97 @@ class P2PMatrixGenerator(P2PBase):
 
 # {{{ P2P matrix block writer
 
+class P2PMatrixLinearIndex(object):
+    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
+        self.queue = queue
+        self.tgtindices = tgtindices
+        self.tgtranges = tgtranges
+        self.srcindices = srcindices
+        self.srcranges = srcranges
+
+    @memoize_method
+    def block_ranges(self):
+        @memoize_in(self, "linear_cumsum_knl")
+        def cumsum():
+            loopy_knl = lp.make_kernel(
+                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
+                """
+                blkranges[0] = 0
+                blkranges[i + 1] = reduce(sum, j, \
+                    (srcranges[j + 1] - srcranges[j]) * \
+                    (tgtranges[j + 1] - tgtranges[j])) \
+                """,
+                [
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="linear_cumsum_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
+                force_outer_iname_for_scan="i")
+            return loopy_knl
+
+        _, (blkranges,) = cumsum()(self.queue,
+            tgtranges=self.tgtranges, srcranges=self.srcranges);
+
+        return blkranges
+
+    @memoize_method
+    def linear_indices(self):
+        @memoize_in(self, "linear_index_knl")
+        def linear_index():
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
+                            {id_prefix=write_index}
+                        colindices[imat] = srcindices[srcranges[irange] + isrc] \
+                            {id_prefix=write_index}
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg("srcindices", None),
+                    lp.GlobalArg("tgtindices", None),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nresults", None),
+                    lp.ValueArg("nranges", None),
+                    "..."
+                ],
+                name="linear_index_knl",
+                default_offset=lp.auto,
+                silenced_warnings="write_race(write_index*)",
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
+        blkranges = self.block_ranges()
+        _, (rowindices, colindices) = linear_index()(self.queue,
+            tgtindices=self.tgtindices, srcindices=self.srcindices,
+            tgtranges=self.tgtranges, srcranges=self.srcranges,
+            blkranges=blkranges, nresults=blkranges[-1])
+
+        return rowindices, colindices, blkranges
+
+
 class P2PMatrixBlockGenerator(P2PBase):
     """Generator for a subset of P2P interaction matrix entries."""
 
@@ -362,71 +453,27 @@ class P2PMatrixBlockGenerator(P2PBase):
         knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
         return knl
 
-    def __call__(self, queue, targets, sources, tgtindices, srcindices,
-            tgtranges, srcranges, **kwargs):
-        @memoize_in(self, "block_cumsum_knl")
-        def cumsum():
-            loopy_knl = lp.make_kernel(
-                "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}",
-                """
-                blkranges[0] = 0.0
-                blkranges[i] = reduce(sum, j, \
-                    (srcranges[j] - srcranges[j - 1]) * \
-                    (tgtranges[j] - tgtranges[j - 1])) \
-                """,
-                [
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
-                    lp.ValueArg("nranges", None)
-                ],
-                name="block_cumsum_knl",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
-                force_outer_iname_for_scan="i")
-            return loopy_knl
-
-        @memoize_in(self, "block_linear_index_knl")
-        def linear_index():
-            loopy_knl = lp.make_kernel([
-                "{[irange]: 0 <= irange < nranges}",
-                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
-                ],
-                """
-                for irange
-                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
-                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
-
-                    for itgt, isrc
-                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
-
-                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt]
-                        colindices[imat] = srcindices[srcranges[irange] + isrc]
-                    end
-                end
-                """,
-                [
-                    lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                    lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("rowindices", None, shape="nresults"),
-                    lp.GlobalArg("colindices", None, shape="nresults"),
-                    lp.ValueArg("nsrcindices", np.int32),
-                    lp.ValueArg("ntgtindices", np.int32),
-                    lp.ValueArg("nresults", None),
-                    lp.ValueArg("nranges", None)
-                ],
-                name="block_linear_prefix_knl",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
-
-            return loopy_knl
-
+    def __call__(self, queue, targets, sources, index_set, **kwargs):
+        """Construct a set of blocks from the full P2P interaction matrix.
+        The relevant subset of indices for block :math:`i` is given by the
+        row indices `tgtindices[tgtranges[i]:tgtranges[i + 1]]` and the
+        column indices `srcindices[srcranges[i]:srcranges[i + 1]]`.
+
+        :arg tgtindices: list of row indices for all blocks.
+        :arg srcindices: list of column indices for all blocks.
+        :arg tgtranges: list used to index `tgtindices` in each block.
+        :arg srcindices: list used to index `srcindices` in each block.
+
+        :return: a tuple containing `(kernel_1_block, ... kernel_n_block,
+        rowindices, colindices, blkranges)`. The set `(rowindices, colindices)`
+        can be used to place each element of each block in the full matrix.
+        Each `kernel_i_block` is a one-dimensional array containing all the
+        blocks, i.e. the ith kernel evaluated at each of the input
+        source-target pairs. To retrieve a given block :math:`j`, one can
+        use `kernel_i_block[blkranges[j]:blkranges[j + 1]]`, which can then
+        be reshaped into the original block size using the `tgtranges` and
+        `srcranges` index lists.
+        """
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
@@ -434,17 +481,10 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        _, (blkranges,) = cumsum()(queue,
-            tgtranges=tgtranges, srcranges=srcranges)
-        _, (rowindices, colindices,) = linear_index()(queue,
-            tgtindices=tgtindices, srcindices=srcindices,
-            tgtranges=tgtranges, srcranges=srcranges,
-            blkranges=blkranges, nresults=blkranges[-1])
-        evt, results = knl(queue, targets=targets, sources=sources,
+        rowindices, colindices, _ = index_set.linear_indices()
+        return knl(queue, targets=targets, sources=sources,
             tgtindices=rowindices, srcindices=colindices, **kwargs)
 
-        return evt, tuple(list(results) + [rowindices, colindices, blkranges])
-
 # }}}
 
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 12198a39..984fbc5b 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -161,6 +161,7 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
     mode_nr = 25
 
     from sumpy.p2p import P2P
+    from sumpy.p2p import P2PMatrixLinearIndex
     from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
@@ -190,11 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk, rowindices, colindices, blkranges) = \
-            blk_gen(queue, targets, sources,
-                    tgtindices, srcindices, tgtranges, srcranges,
-                    **extra_kwargs)
+        index_set = P2PMatrixLinearIndex(queue,
+                tgtindices, srcindices, tgtranges, srcranges)
+        _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs)
 
+        rowindices, colindices, _ = index_set.linear_indices()
         eps = 1.0e-10 * la.norm(mat)
         assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
-- 
GitLab


From 1e01bbb613e778b9b3d9b4b3ab46b33475e76a2b Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Tue, 17 Apr 2018 21:12:40 -0500
Subject: [PATCH 05/11] p2p: move the index cache class to sumpy.tools

---
 sumpy/p2p.py           | 133 +++++++---------------------------------
 sumpy/tools.py         | 136 ++++++++++++++++++++++++++++++++++++++++-
 test/test_matrixgen.py |   6 +-
 3 files changed, 160 insertions(+), 115 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 4b0590f0..d1903a28 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -293,97 +293,6 @@ class P2PMatrixGenerator(P2PBase):
 
 # {{{ P2P matrix block writer
 
-class P2PMatrixLinearIndex(object):
-    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
-        self.queue = queue
-        self.tgtindices = tgtindices
-        self.tgtranges = tgtranges
-        self.srcindices = srcindices
-        self.srcranges = srcranges
-
-    @memoize_method
-    def block_ranges(self):
-        @memoize_in(self, "linear_cumsum_knl")
-        def cumsum():
-            loopy_knl = lp.make_kernel(
-                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
-                """
-                blkranges[0] = 0
-                blkranges[i + 1] = reduce(sum, j, \
-                    (srcranges[j + 1] - srcranges[j]) * \
-                    (tgtranges[j + 1] - tgtranges[j])) \
-                """,
-                [
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
-                    lp.ValueArg("nranges", None)
-                ],
-                name="linear_cumsum_knl",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
-                force_outer_iname_for_scan="i")
-            return loopy_knl
-
-        _, (blkranges,) = cumsum()(self.queue,
-            tgtranges=self.tgtranges, srcranges=self.srcranges);
-
-        return blkranges
-
-    @memoize_method
-    def linear_indices(self):
-        @memoize_in(self, "linear_index_knl")
-        def linear_index():
-            loopy_knl = lp.make_kernel([
-                "{[irange]: 0 <= irange < nranges}",
-                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
-                ],
-                """
-                for irange
-                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
-                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
-
-                    for itgt, isrc
-                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
-
-                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
-                            {id_prefix=write_index}
-                        colindices[imat] = srcindices[srcranges[irange] + isrc] \
-                            {id_prefix=write_index}
-                    end
-                end
-                """,
-                [
-                    lp.GlobalArg("srcindices", None),
-                    lp.GlobalArg("tgtindices", None),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("rowindices", None, shape="nresults"),
-                    lp.GlobalArg("colindices", None, shape="nresults"),
-                    lp.ValueArg("nresults", None),
-                    lp.ValueArg("nranges", None),
-                    "..."
-                ],
-                name="linear_index_knl",
-                default_offset=lp.auto,
-                silenced_warnings="write_race(write_index*)",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
-
-            return loopy_knl
-
-        blkranges = self.block_ranges()
-        _, (rowindices, colindices) = linear_index()(self.queue,
-            tgtindices=self.tgtindices, srcindices=self.srcindices,
-            tgtranges=self.tgtranges, srcranges=self.srcranges,
-            blkranges=blkranges, nresults=blkranges[-1])
-
-        return rowindices, colindices, blkranges
-
-
 class P2PMatrixBlockGenerator(P2PBase):
     """Generator for a subset of P2P interaction matrix entries."""
 
@@ -454,25 +363,27 @@ class P2PMatrixBlockGenerator(P2PBase):
         return knl
 
     def __call__(self, queue, targets, sources, index_set, **kwargs):
-        """Construct a set of blocks from the full P2P interaction matrix.
-        The relevant subset of indices for block :math:`i` is given by the
-        row indices `tgtindices[tgtranges[i]:tgtranges[i + 1]]` and the
-        column indices `srcindices[srcranges[i]:srcranges[i + 1]]`.
-
-        :arg tgtindices: list of row indices for all blocks.
-        :arg srcindices: list of column indices for all blocks.
-        :arg tgtranges: list used to index `tgtindices` in each block.
-        :arg srcindices: list used to index `srcindices` in each block.
-
-        :return: a tuple containing `(kernel_1_block, ... kernel_n_block,
-        rowindices, colindices, blkranges)`. The set `(rowindices, colindices)`
-        can be used to place each element of each block in the full matrix.
-        Each `kernel_i_block` is a one-dimensional array containing all the
-        blocks, i.e. the ith kernel evaluated at each of the input
-        source-target pairs. To retrieve a given block :math:`j`, one can
-        use `kernel_i_block[blkranges[j]:blkranges[j + 1]]`, which can then
-        be reshaped into the original block size using the `tgtranges` and
-        `srcranges` index lists.
+        """Construct a set of blocks of the full P2P interaction matrix.
+
+        The blocks are returned as one-dimensional arrays, for performance
+        and storage reasons. If the two-dimensional form is desired, it can
+        be obtained using the information in the `index_set` for a block
+        :math:`i` in the following way:
+
+        .. code-block:: python
+
+            blkranges = index_set.linear_ranges()
+            m = index_set.tgtranges[i + 1] - index_set.tgtranges[i]
+            n = index_set.srcranges[i + 1] - index_set.srcranges[i]
+
+            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(m, n)
+
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
+        to define the various blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+        target-source pairs described by `index_set`.
         """
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
@@ -481,7 +392,7 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        rowindices, colindices, _ = index_set.linear_indices()
+        rowindices, colindices = index_set.linear_indices()
         return knl(queue, targets=targets, sources=sources,
             tgtindices=rowindices, srcindices=colindices, **kwargs)
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 964c64cc..b891da0e 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -24,10 +24,13 @@ THE SOFTWARE.
 
 import six
 from six.moves import range, zip
-from pytools import memoize_method
+from pytools import memoize_method, memoize_in
 import numpy as np
 import sumpy.symbolic as sym
 
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -291,6 +294,137 @@ class KernelComputation(object):
 
 # }}}
 
+# {{{
+
+class MatrixBlockIndex(object):
+    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
+        """Keep track of different ways to index into matrix blocks.
+
+        In this description, a block is given by a subset of indices of
+        the global matrix operator. For example, the :math:`i`-th block of
+        a matrix `mat` using the input arguments is obtained by:
+
+        .. code-block:: python
+
+            mat[np.ix_(tgtindices[tgtranges[i]:tgtranges[i + 1]],
+                       srcindices[srcranges[i]:srcranges[i + 1]])]
+
+        :arg tgtindices: a subset of the full matrix row index set.
+        :arg srcindices: a subset of the full matrix column index set.
+        :arg tgtranges: an array used to index into `tgtindices`.
+        :arg srcranges: an array used to index into `srcindices`.
+        """
+
+        self.queue = queue
+        self.tgtindices = tgtindices
+        self.tgtranges = tgtranges
+        self.srcindices = srcindices
+        self.srcranges = srcranges
+
+    @memoize_method
+    def linear_ranges(self):
+        """
+        :return: an array containing the cummulative sum of all block sizes.
+            It can be used to index into a linear representation of the
+            matrix blocks.
+        """
+
+        @memoize_in(self, "block_cumsum_knl")
+        def cumsum():
+            loopy_knl = lp.make_kernel(
+                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
+                """
+                blkranges[0] = 0
+                blkranges[i + 1] = reduce(sum, j, \
+                    (srcranges[j + 1] - srcranges[j]) * \
+                    (tgtranges[j + 1] - tgtranges[j])) \
+                """,
+                [
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_cumsum_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
+                force_outer_iname_for_scan="i")
+            return loopy_knl
+
+        _, (blkranges,) = cumsum()(self.queue,
+            tgtranges=self.tgtranges, srcranges=self.srcranges);
+
+        return blkranges
+
+    @memoize_method
+    def linear_indices(self):
+        """
+        :return: a tuple of `(rowindices, colindices)` that can be used in
+            conjunction with :func:`linear_ranges` to provide linear indexing
+            into a set of matrix blocks. These index arrays are just the
+            concatenated Cartesian products of all the block arrays described
+            by :attr:`tgtindices` and :arg:`srcindices`.
+
+            They can be used to index directly into the matrix as follows:
+
+            .. code-block:: python
+
+                mat[rowindices[blkranges[i]:blkranges[i + 1]],
+                    colindices[blkranges[i]:blkranges[i + 1]]]
+        """
+
+        @memoize_in(self, "block_index_knl")
+        def linear_index():
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
+                            {id_prefix=write_index}
+                        colindices[imat] = srcindices[srcranges[irange] + isrc] \
+                            {id_prefix=write_index}
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg("srcindices", None),
+                    lp.GlobalArg("tgtindices", None),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nresults", None),
+                    lp.ValueArg("nranges", None),
+                    "..."
+                ],
+                name="block_index_knl",
+                default_offset=lp.auto,
+                silenced_warnings="write_race(write_index*)",
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
+        blkranges = self.linear_ranges()
+        _, (rowindices, colindices) = linear_index()(self.queue,
+            tgtindices=self.tgtindices, srcindices=self.srcindices,
+            tgtranges=self.tgtranges, srcranges=self.srcranges,
+            blkranges=blkranges, nresults=blkranges[-1])
+
+        return rowindices, colindices
+
+# }}}
 
 # {{{ OrderedSet
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 984fbc5b..d6e85c23 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -161,7 +161,7 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
     mode_nr = 25
 
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixLinearIndex
+    from sumpy.tools import MatrixBlockIndex
     from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
@@ -191,11 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        index_set = P2PMatrixLinearIndex(queue,
+        index_set = MatrixBlockIndex(queue,
                 tgtindices, srcindices, tgtranges, srcranges)
         _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs)
 
-        rowindices, colindices, _ = index_set.linear_indices()
+        rowindices, colindices = index_set.linear_indices()
         eps = 1.0e-10 * la.norm(mat)
         assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
-- 
GitLab


From 0885edb0008673b20640af6aca7f8edc82705822 Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Tue, 17 Apr 2018 21:23:49 -0500
Subject: [PATCH 06/11] qbx: use same method as p2p to construct the block
 matrices

---
 sumpy/p2p.py           | 16 +++++-----
 sumpy/qbx.py           | 70 ++++++++++++++----------------------------
 test/test_matrixgen.py | 31 ++++++++-----------
 3 files changed, 43 insertions(+), 74 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index d1903a28..9503a2a1 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -307,15 +307,15 @@ class P2PMatrixBlockGenerator(P2PBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nresults"),
-                lp.GlobalArg("tgtindices", None, shape="nresults"),
-                lp.ValueArg("nresults", None)
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
             ] +
-            [lp.GlobalArg("result_%d" % i, dtype, shape="nresults")
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
              for i, dtype in enumerate(self.value_dtypes)])
 
         loopy_knl = lp.make_kernel(
-            "{[imat, idim]: 0 <= imat < nresults and 0 <= idim < dim}",
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
             + ["""
                 for imat
@@ -335,7 +335,7 @@ class P2PMatrixBlockGenerator(P2PBase):
                 for iknl in range(len(self.kernels))]
             + ["end"],
             arguments,
-            assumptions="nresults>=1",
+            assumptions="nresult>=1",
             silenced_warnings="write_race(write_p2p*)",
             name=self.name,
             fixed_parameters=dict(dim=self.dim),
@@ -392,9 +392,9 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        rowindices, colindices = index_set.linear_indices()
+        tgtindices, srcindices = index_set.linear_indices()
         return knl(queue, targets=targets, sources=sources,
-            tgtindices=rowindices, srcindices=colindices, **kwargs)
+                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index f3b2d6f7..73f7ec3d 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -335,68 +335,44 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", None),
-                lp.ValueArg("ntgtindices", None),
-                lp.ValueArg("nranges", None)
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
             ] +
-            [lp.GlobalArg("result_%d" % i,
-                dtype, shape="ntgtindices, nsrcindices")
-            for i, dtype in enumerate(self.value_dtypes)])
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+             for i, dtype in enumerate(self.value_dtypes)])
 
         loopy_knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}",
-            "{[idim]: 0 <= idim < dim}"
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
             ],
             self.get_kernel_scaling_assignments()
             + ["""
-                for irange
-                <> tgtstart = tgtranges[irange]
-                <> tgtend = tgtranges[irange + 1]
-                <> ntgtblock = tgtend - tgtstart
-                <> srcstart = srcranges[irange]
-                <> srcend = srcranges[irange + 1]
-                <> nsrcblock = srcend - srcstart
-
-                for itgt, isrc
-                    <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \
-                                 src[idim, srcindices[srcstart + isrc]] \
-                                 {dup=idim}
-                    <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \
-                                 center[idim, tgtindices[tgtstart + itgt]] \
-                                 {dup=idim}
-                    <> rscale = expansion_radii[tgtindices[tgtstart + itgt]]
+                for imat
+                    <> a[idim] = center[idim, tgtindices[imat]] - \
+                                 src[idim, srcindices[imat]] {dup=idim}
+                    <> b[idim] = tgt[idim, tgtindices[imat]] - \
+                                 center[idim, tgtindices[imat]] {dup=idim}
+                    <> rscale = expansion_radii[tgtindices[imat]]
             """]
             + loopy_insns + kernel_exprs
             + ["""
-                    result_{i}[tgtstart + itgt, srcstart + isrc] = \
-                        knl_{i}_scaling * pair_result_{i} \
-                            {{id_prefix=write_lpot,inames=irange}}
+                    result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_lpot}}
                 """.format(i=iknl)
                 for iknl in range(len(self.expansions))]
-            + ["""
-                    end
-                end
-            """],
+            + ["end"],
             arguments,
             name=self.name,
-            assumptions="nranges>=1",
+            assumptions="nresult>=1",
             default_offset=lp.auto,
             silenced_warnings="write_race(write_lpot*)",
             fixed_parameters=dict(dim=self.dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-        loopy_knl = lp.add_dtypes(loopy_knl, dict(
-            nsources=np.int32,
-            ntargets=np.int32,
-            ntgtindices=np.int32,
-            nsrcindices=np.int32))
-
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
+
         for expn in self.expansions:
             loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
 
@@ -405,17 +381,17 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
     def get_optimized_kernel(self):
         loopy_knl = self.get_kernel()
 
-        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+        loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0")
         return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, expansion_radii,
-            tgtindices, srcindices, tgtranges, srcranges, **kwargs):
+                 index_set, **kwargs):
         knl = self.get_cached_optimized_kernel()
 
+        tgtindices, srcindices = index_set.linear_indices()
         return knl(queue, src=sources, tgt=targets, center=centers,
                 expansion_radii=expansion_radii,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index d6e85c23..a687ee16 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -31,6 +31,8 @@ import pyopencl as cl
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
+from sumpy.tools import MatrixBlockIndex
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -82,8 +84,8 @@ def create_index_subset(nnodes, nblks, factor):
 
     return indices_, ranges_
 
-
-def test_qbx_direct(ctx_getter):
+@pytest.mark.parametrize('factor', [1.0, 0.6])
+def test_qbx_direct(ctx_getter, factor):
     # This evaluates a single layer potential on a circle.
     logging.basicConfig(level=logging.INFO)
 
@@ -114,14 +116,8 @@ def test_qbx_direct(ctx_getter):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
+        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
+        srcindices, srcranges = create_index_subset(n, nblks, factor)
         assert tgtranges.shape == srcranges.shape
 
         _, (mat,) = mat_gen(queue, targets, sources, centers,
@@ -134,16 +130,14 @@ def test_qbx_direct(ctx_getter):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        index_set = MatrixBlockIndex(queue,
+            tgtindices, srcindices, tgtranges, srcranges)
         _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
-                            tgtindices, srcindices, tgtranges, srcranges)
+                            index_set)
 
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
-
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        rowindices, colindices = index_set.linear_indices()
+        eps = 1.0e-10 * la.norm(mat)
+        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
 
 @pytest.mark.parametrize(("exclude_self", "factor"),
@@ -161,7 +155,6 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
     mode_nr = 25
 
     from sumpy.p2p import P2P
-    from sumpy.tools import MatrixBlockIndex
     from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
-- 
GitLab


From c120504edd0e2740b54c0ae1a0ca82663a78b7aa Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Tue, 17 Apr 2018 21:26:18 -0500
Subject: [PATCH 07/11] qbx: add some docs

---
 sumpy/qbx.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 73f7ec3d..554771e7 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -386,6 +386,17 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
 
     def __call__(self, queue, targets, sources, centers, expansion_radii,
                  index_set, **kwargs):
+        """
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg centers: QBX target expansion centers.
+        :arg expansion_radii: radii for each expansion center.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
+        to define the various blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+        target-source pairs described by `index_set`.
+        """
+
         knl = self.get_cached_optimized_kernel()
 
         tgtindices, srcindices = index_set.linear_indices()
-- 
GitLab


From 11fa1a90635f5715ac296e4a361bfcb802fce739 Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Wed, 18 Apr 2018 16:47:12 -0500
Subject: [PATCH 08/11] tools: remove some globalargs that loopy can infer

---
 sumpy/tools.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/sumpy/tools.py b/sumpy/tools.py
index b891da0e..9d73d1a1 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -347,6 +347,7 @@ class MatrixBlockIndex(object):
                 ],
                 name="block_cumsum_knl",
                 default_offset=lp.auto,
+                assumptions="nranges>=1",
                 lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
             loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
@@ -397,19 +398,15 @@ class MatrixBlockIndex(object):
                 end
                 """,
                 [
-                    lp.GlobalArg("srcindices", None),
-                    lp.GlobalArg("tgtindices", None),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
+                    lp.GlobalArg('blkranges', None, shape="nranges + 1"),
                     lp.GlobalArg("rowindices", None, shape="nresults"),
                     lp.GlobalArg("colindices", None, shape="nresults"),
                     lp.ValueArg("nresults", None),
-                    lp.ValueArg("nranges", None),
-                    "..."
+                    '...'
                 ],
                 name="block_index_knl",
                 default_offset=lp.auto,
+                assumptions='nranges>=1',
                 silenced_warnings="write_race(write_index*)",
                 lang_version=MOST_RECENT_LANGUAGE_VERSION)
             loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
-- 
GitLab


From a7a171e9f4b268dd8c91dcfe668eb37a7c8bd9aa Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Wed, 18 Apr 2018 16:48:41 -0500
Subject: [PATCH 09/11] enable doc generation

---
 sumpy/p2p.py |  9 ++++++---
 sumpy/qbx.py | 10 ++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 9503a2a1..f0bba60f 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -294,7 +294,10 @@ class P2PMatrixGenerator(P2PBase):
 # {{{ P2P matrix block writer
 
 class P2PMatrixBlockGenerator(P2PBase):
-    """Generator for a subset of P2P interaction matrix entries."""
+    """Generator for a subset of P2P interaction matrix entries.
+
+    .. automethod:: __call__
+    """
 
     default_name = "p2p_block"
 
@@ -381,9 +384,9 @@ class P2PMatrixBlockGenerator(P2PBase):
         :arg targets: target point coordinates.
         :arg sources: source point coordinates.
         :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
-        to define the various blocks.
+            to define the various blocks.
         :return: a tuple of one-dimensional arrays of kernel evaluations at
-        target-source pairs described by `index_set`.
+            target-source pairs described by `index_set`.
         """
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 554771e7..985565f1 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -189,7 +189,10 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 # {{{ direct applier
 
 class LayerPotential(LayerPotentialBase):
-    """Direct applier for the layer potential."""
+    """Direct applier for the layer potential.
+
+    .. automethod:: __call__
+    """
 
     default_name = "qbx_apply"
 
@@ -321,7 +324,10 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 # {{{
 
 class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
-    """Generator for a subset of the layer potential matrix entries."""
+    """Generator for a subset of the layer potential matrix entries.
+
+    .. automethod:: __call__
+    """
 
     default_name = "qbx_block"
 
-- 
GitLab


From c654614108fcd644927828815d537224f2f6f1f8 Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Wed, 18 Apr 2018 16:51:54 -0500
Subject: [PATCH 10/11] add copyright

---
 sumpy/tools.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sumpy/tools.py b/sumpy/tools.py
index 9d73d1a1..4ddfb25f 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -1,6 +1,9 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2018 Alexandru Fikl
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
-- 
GitLab


From 992acb2ba693c9c876f3d8ce18bed503dd8696ca Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Wed, 18 Apr 2018 17:30:51 -0500
Subject: [PATCH 11/11] flake8 fixes

---
 sumpy/p2p.py           | 1 -
 sumpy/tools.py         | 4 +++-
 test/test_matrixgen.py | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index f0bba60f..c408360d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -33,7 +33,6 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
-from pytools import memoize_method, memoize_in
 from sumpy.tools import KernelComputation, KernelCacheWrapper
 
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 4ddfb25f..b80deddd 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -297,6 +297,7 @@ class KernelComputation(object):
 
 # }}}
 
+
 # {{{
 
 class MatrixBlockIndex(object):
@@ -358,7 +359,7 @@ class MatrixBlockIndex(object):
             return loopy_knl
 
         _, (blkranges,) = cumsum()(self.queue,
-            tgtranges=self.tgtranges, srcranges=self.srcranges);
+            tgtranges=self.tgtranges, srcranges=self.srcranges)
 
         return blkranges
 
@@ -426,6 +427,7 @@ class MatrixBlockIndex(object):
 
 # }}}
 
+
 # {{{ OrderedSet
 
 # Source: http://code.activestate.com/recipes/576694-orderedset/
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index a687ee16..892fef63 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -84,6 +84,7 @@ def create_index_subset(nnodes, nblks, factor):
 
     return indices_, ranges_
 
+
 @pytest.mark.parametrize('factor', [1.0, 0.6])
 def test_qbx_direct(ctx_getter, factor):
     # This evaluates a single layer potential on a circle.
-- 
GitLab