From 2df8cd3bffb71b32d8f85aa4915ac142a1276719 Mon Sep 17 00:00:00 2001
From: Alex Fikl <alexfikl@gmail.com>
Date: Sun, 18 Feb 2018 17:09:43 -0600
Subject: [PATCH] add block matrix generators

---
 sumpy/p2p.py           | 121 ++++++++++++++++++++++++++++++++++++++---
 sumpy/qbx.py           | 113 +++++++++++++++++++++++++++++++++++---
 test/test_matrixgen.py |  59 ++++++++++++++++++--
 3 files changed, 272 insertions(+), 21 deletions(-)

diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 8c37b0e0..b76e7890 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -120,10 +120,26 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
                     shape=(self.dim, "nsources")),
                 lp.GlobalArg("targets", None,
                    shape=(self.dim, "ntargets")),
-                lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None)
+                lp.ValueArg("nsources", np.int32),
+                lp.ValueArg("ntargets", np.int32)
+                ]
+
+    def get_domains(self):
+        return [
+                "{[itgt]: 0 <= itgt < ntargets}",
+                "{[isrc]: 0 <= isrc < nsources}",
+                "{[idim]: 0 <= idim < dim}"
                 ]
 
+    def get_loop_begin(self):
+        return ["for itgt, isrc"]
+
+    def get_loop_end(self):
+        return ["end"]
+
+    def get_assumptions(self):
+        return "nsources>=1 and ntargets>=1"
+
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
 
@@ -146,25 +162,24 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
                 + gather_loopy_source_arguments(self.kernels))
 
         loopy_knl = lp.make_kernel(
-                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
-                        and 0<=idim<dim}",
+                self.get_domains(),
                 self.get_kernel_scaling_assignments()
-                + ["for itgt, isrc"]
-                + loopy_insns
+                + self.get_loop_begin()
                 + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
                 + ["<> is_self = (isrc == target_to_source[itgt])"
                     if self.exclude_self else ""]
+                + loopy_insns
                 + [
                     lp.Assignment(id=None,
                         assignee="pair_result_%d" % i, expression=expr,
                         temp_var_type=lp.auto)
                     for i, expr in enumerate(exprs)
                 ]
-                + ["end"]
+                + self.get_loop_end()
                 + self.get_result_store_instructions(),
                 arguments,
+                assumptions=self.get_assumptions(),
                 name=self.name,
-                assumptions="nsources>=1 and ntargets>=1",
                 fixed_parameters=dict(
                     dim=self.dim,
                     nstrengths=self.strength_count,
@@ -186,7 +201,8 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
+        # FIXME: how to split when using blocks?
+        # knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
         return knl
 
 
@@ -266,6 +282,93 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
 # }}}
 
 
+# {{{
+
+class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
+    default_name = "p2p_block"
+
+    def get_src_tgt_arguments(self):
+        return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \
+            + [
+                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                lp.GlobalArg("srcranges", None, shape="nranges"),
+                lp.GlobalArg("tgtranges", None, shape="nranges"),
+                lp.ValueArg("nsrcindices", np.int32),
+                lp.ValueArg("ntgtindices", np.int32),
+                lp.ValueArg("nranges", None)
+            ]
+
+    def get_domains(self):
+        # FIXME: this doesn't work when separating j and k
+        return [
+                "{[i]: 0 <= i < nranges - 1}",
+                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return [
+                """
+                for i
+                    <> tgtstart = tgtranges[i]
+                    <> tgtend = tgtranges[i + 1]
+                    <> tgt_length = tgtend - tgtstart
+                    <> srcstart = srcranges[i]
+                    <> srcend = srcranges[i + 1]
+                    <> src_length = srcend - srcstart
+                    for j, k
+                        <> itgt = tgtindices[tgtstart + j]
+                        <> isrc = srcindices[srcstart + k]
+                """
+                ]
+
+    def get_loop_end(self):
+        return [
+                """
+                    end
+                end
+                """
+                ]
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
+                for i, dtype in enumerate(self.value_dtypes)
+                ]
+
+    def get_result_store_instructions(self):
+        # FIXME: doesn't work without inames=i. check how the loops are nested!
+        return [
+                """
+                result_{i}[tgtstart + j, srcstart + k] = \
+                        knl_{i}_scaling * pair_result_{i} {{inames=i}}
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))
+                ]
+
+    def get_assumptions(self):
+        return "nranges>=2"
+
+    def __call__(self, queue, targets, sources, tgtindices, srcindices,
+            tgtranges, srcranges, **kwargs):
+        from pytools.obj_array import is_obj_array
+        knl = self.get_optimized_kernel(
+                targets_is_obj_array=(
+                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
+                sources_is_obj_array=(
+                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+
+        return knl(queue, targets=targets, sources=sources,
+                tgtindices=tgtindices, srcindices=srcindices,
+                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+
+# }}}
+
+
 # {{{ P2P from CSR-like interaction list
 
 class P2PFromCSR(P2PComputationBase):
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index cde20ef2..64ea5578 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -116,10 +116,26 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 lp.GlobalArg("center", None,
                     shape=(self.dim, "ntargets"), order="C"),
                 lp.GlobalArg("expansion_radii", None, shape="ntargets"),
-                lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None),
+                lp.ValueArg("nsources", np.int32),
+                lp.ValueArg("ntargets", np.int32),
                 ]
 
+    def get_domains(self):
+        return [
+                "{[itgt]: 0 <= itgt < ntargets}",
+                "{[isrc]: 0 <= isrc < nsources}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return ["for itgt, isrc"]
+
+    def get_loop_end(self):
+        return ["end"]
+
+    def get_assumptions(self):
+        return "nsources>=1 and ntargets>=1"
+
     @memoize_method
     def get_kernel(self):
         from sumpy.symbolic import make_sym_vector
@@ -162,10 +178,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 + gather_loopy_source_arguments(self.kernels))
 
         loopy_knl = lp.make_kernel(
-                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
-                "and 0<=idim<dim}",
+                self.get_domains(),
                 self.get_kernel_scaling_assignments()
-                + ["for itgt, isrc"]
+                + self.get_loop_begin()
                 + [self.get_compute_a_and_b_vecs()]
                 + loopy_insns
                 + [
@@ -174,11 +189,11 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                         temp_var_type=lp.auto)
                     for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
                 ]
-                + ["end"]
+                + self.get_loop_end()
                 + self.get_result_store_instructions(),
                 arguments,
                 name=self.name,
-                assumptions="nsources>=1 and ntargets>=1",
+                assumptions=self.get_assumptions(),
                 default_offset=lp.auto,
                 silenced_warnings="write_race(write_lpot*)",
                 fixed_parameters=dict(dim=self.dim),
@@ -197,7 +212,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
     def get_optimized_kernel(self):
         # FIXME specialize/tune for GPU/CPU
         loopy_knl = self.get_kernel()
+        return loopy_knl
 
+        # FIXME: how to tune for blocks?
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
@@ -297,6 +314,88 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 
 # }}}
 
+
+# {{{
+
+class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
+    default_name = "qbx_block"
+
+    def get_src_tgt_arguments(self):
+        return LayerPotentialBase.get_src_tgt_arguments(self) \
+            + [
+                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                lp.GlobalArg("srcranges", None, shape="nranges"),
+                lp.GlobalArg("tgtranges", None, shape="nranges"),
+                lp.ValueArg("nsrcindices", np.int32),
+                lp.ValueArg("ntgtindices", np.int32),
+                lp.ValueArg("nranges", None)
+            ]
+
+    def get_domains(self):
+        # FIXME: this doesn't work when separating j and k
+        return [
+                "{[i]: 0 <= i < nranges - 1}",
+                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return [
+                """
+                for i
+                    <> tgtstart = tgtranges[i]
+                    <> tgtend = tgtranges[i + 1]
+                    <> tgt_length = tgtend - tgtstart
+                    <> srcstart = srcranges[i]
+                    <> srcend = srcranges[i + 1]
+                    <> src_length = srcend - srcstart
+                    for j, k
+                        <> itgt = tgtindices[tgtstart + j]
+                        <> isrc = srcindices[srcstart + k]
+                """
+                ]
+
+    def get_loop_end(self):
+        return [
+                """
+                    end
+                end
+                """
+                ]
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
+                for i, dtype in enumerate(self.value_dtypes)
+                ]
+
+    def get_result_store_instructions(self):
+        return [
+                """
+                result_KNLIDX[tgtstart + j, srcstart + k] = \
+                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=i}
+                """.replace("KNLIDX", str(iknl))
+                for iknl in range(len(self.expansions))
+                ]
+
+    def get_assumptions(self):
+        return "nranges>=2"
+
+    def __call__(self, queue, targets, sources, centers, expansion_radii,
+            tgtindices, srcindices, tgtranges, srcranges, **kwargs):
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, src=sources, tgt=targets, center=centers,
+                expansion_radii=expansion_radii,
+                tgtindices=tgtindices, srcindices=srcindices,
+                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+
+# }}}
+
 # }}}
 
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 244a555b..ed5cbae7 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -73,15 +73,19 @@ def test_qbx_direct(ctx_getter):
     from sumpy.kernel import LaplaceKernel
     lknl = LaplaceKernel(2)
 
+    nblks = 10
     order = 12
     mode_nr = 25
 
     from sumpy.qbx import LayerPotential
     from sumpy.qbx import LayerPotentialMatrixGenerator
+    from sumpy.qbx import LayerPotentialMatrixBlockGenerator
     from sumpy.expansion.local import LineTaylorLocalExpansion
+    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
     mat_gen = LayerPotentialMatrixGenerator(ctx,
             [LineTaylorLocalExpansion(lknl, order)])
-    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
+    blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
+            [LineTaylorLocalExpansion(lknl, order)])
 
     for n in [200, 300, 400]:
         targets, sources, centers, sigma, expansion_radii = \
@@ -90,16 +94,37 @@ def test_qbx_direct(ctx_getter):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
+        tgtindices = np.arange(0, n)
+        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
+        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
+                              tgtindices.shape[0] // nblks)
+        srcindices = np.arange(0, n)
+        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
+        srcranges = np.arange(0, srcindices.shape[0] + 1,
+                              srcindices.shape[0] // nblks)
+        assert tgtranges.shape == srcranges.shape
+
         _, (mat,) = mat_gen(queue, targets, sources, centers,
-                expansion_radii=expansion_radii)
+                expansion_radii)
         result_mat = mat.dot(strengths[0])
 
         _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
-                expansion_radii=expansion_radii)
+                expansion_radii)
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
+                            tgtindices, srcindices, tgtranges, srcranges)
+
+        for i in range(srcranges.shape[0] - 1):
+            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
+            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
+            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+
+            eps = 1.0e-10 * la.norm(mat[block])
+            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+
 
 @pytest.mark.parametrize("exclude_self", [True, False])
 def test_p2p_direct(ctx_getter, exclude_self):
@@ -112,12 +137,14 @@ def test_p2p_direct(ctx_getter, exclude_self):
     from sumpy.kernel import LaplaceKernel
     lknl = LaplaceKernel(2)
 
+    nblks = 10
     mode_nr = 25
 
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixGenerator
-    mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+    from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+    mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+    blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
 
     for n in [200, 300, 400]:
         targets, sources, _, sigma, _ = \
@@ -126,6 +153,16 @@ def test_p2p_direct(ctx_getter, exclude_self):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
+        tgtindices = np.arange(0, n)
+        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
+        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
+                              tgtindices.shape[0] // nblks)
+        srcindices = np.arange(0, n)
+        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
+        srcranges = np.arange(0, srcindices.shape[0] + 1,
+                              srcindices.shape[0] // nblks)
+        assert tgtranges.shape == srcranges.shape
+
         extra_kwargs = {}
         if exclude_self:
             extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
@@ -139,6 +176,18 @@ def test_p2p_direct(ctx_getter, exclude_self):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        _, (blk,) = blk_gen(queue, targets, sources,
+                            tgtindices, srcindices, tgtranges, srcranges,
+                            **extra_kwargs)
+
+        for i in range(srcranges.shape[0] - 1):
+            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
+            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
+            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+
+            eps = 1.0e-10 * la.norm(mat[block])
+            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+
 
 # You can test individual routines by typing
 # $ python test_kernels.py 'test_p2p(cl.create_some_context)'
-- 
GitLab