diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 38bec39d3b4dbd7efe6229c563f391c29fdd3efb..97756a55d5690219317d930f1ae43debc769a682 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -33,6 +33,7 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
+from pytools import memoize_in
 from sumpy.tools import KernelComputation, KernelCacheWrapper
 
 
@@ -306,68 +307,44 @@ class P2PMatrixBlockGenerator(P2PBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", None),
-                lp.ValueArg("ntgtindices", None),
-                lp.ValueArg("nranges", None)
+                lp.GlobalArg("srcindices", None, shape="nresults"),
+                lp.GlobalArg("tgtindices", None, shape="nresults"),
+                lp.ValueArg("nresults", None)
             ] +
-            [lp.GlobalArg("result_%d" % i, dtype,
-                shape="ntgtindices, nsrcindices")
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresults")
              for i, dtype in enumerate(self.value_dtypes)])
 
-        loopy_knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[itgt, isrc, idim]: \
-                0 <= itgt < ntgtblock and \
-                0 <= isrc < nsrcblock and \
-                0 <= idim < dim}",
-            ],
+        loopy_knl = lp.make_kernel(
+            "{[imat, idim]: 0 <= imat < nresults and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
             + ["""
-                for irange
-                    <> tgtstart = tgtranges[irange]
-                    <> tgtend = tgtranges[irange + 1]
-                    <> ntgtblock = tgtend - tgtstart
-                    <> srcstart = srcranges[irange]
-                    <> srcend = srcranges[irange + 1]
-                    <> nsrcblock = srcend - srcstart
-
-                    for itgt, isrc
-                        <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \
-                                     sources[idim, srcindices[srcstart + isrc]]
+                for imat
+                    <> d[idim] = targets[idim, tgtindices[imat]] - \
+                                 sources[idim, srcindices[imat]]
             """]
             + ["""
-                        <> is_self = (srcindices[srcstart + isrc] ==
-                                      target_to_source[tgtindices[tgtstart + itgt]])
+                    <> is_self = (srcindices[imat] ==
+                                  target_to_source[tgtindices[imat]])
                 """ if self.exclude_self else ""]
             + loopy_insns + kernel_exprs
             + ["""
-                        result_{i}[tgtstart + itgt, srcstart + isrc] = \
-                            knl_{i}_scaling * pair_result_{i} \
-                                {{id_prefix=write_p2p,inames=irange}}
+                    result_{i}[imat] = \
+                        knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_p2p}}
                 """.format(i=iknl)
                 for iknl in range(len(self.kernels))]
-            + ["""
-                    end
-                end
-            """],
+            + ["end"],
             arguments,
-            assumptions="nranges>=1",
+            assumptions="nresults>=1",
             silenced_warnings="write_race(write_p2p*)",
             name=self.name,
             fixed_parameters=dict(dim=self.dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-        loopy_knl = lp.add_dtypes(loopy_knl, dict(
-            nsources=np.int32,
-            ntargets=np.int32,
-            ntgtindices=np.int32,
-            nsrcindices=np.int32))
-
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
+
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
@@ -382,11 +359,74 @@ class P2PMatrixBlockGenerator(P2PBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
         return knl
 
     def __call__(self, queue, targets, sources, tgtindices, srcindices,
             tgtranges, srcranges, **kwargs):
+        @memoize_in(self, "block_cumsum_knl")
+        def cumsum():
+            loopy_knl = lp.make_kernel(
+                "{[i, j]: 1 <= i <= nranges and 1 <= j <= i}",
+                """
+                blkprefix[0] = 0.0
+                blkprefix[i] = reduce(sum, j, \
+                    (srcranges[j] - srcranges[j - 1]) * \
+                    (tgtranges[j] - tgtranges[j - 1])) \
+                """,
+                [
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkprefix", np.int32, shape="nranges + 1"),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_cumsum_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
+                force_outer_iname_for_scan="i")
+            return loopy_knl
+
+        @memoize_in(self, "block_linear_index_knl")
+        def linear_index():
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkprefix[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt]
+                        colindices[imat] = srcindices[srcranges[irange] + isrc]
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                    lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkprefix", None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nsrcindices", np.int32),
+                    lp.ValueArg("ntgtindices", np.int32),
+                    lp.ValueArg("nresults", None),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_linear_prefix_knl",
+                default_offset=lp.auto,
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
@@ -394,9 +434,16 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+        _, (blkprefix,) = cumsum()(queue,
+            tgtranges=tgtranges, srcranges=srcranges)
+        _, (rowindices, colindices,) = linear_index()(queue,
+            tgtindices=tgtindices, srcindices=srcindices,
+            tgtranges=tgtranges, srcranges=srcranges,
+            blkprefix=blkprefix, nresults=blkprefix[-1])
+        evt, results = knl(queue, targets=targets, sources=sources,
+            tgtindices=rowindices, srcindices=colindices, **kwargs)
+
+        return evt, tuple(list(results) + [rowindices, colindices])
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index eb85052b8d68e24ad46a3be8ad05663600047e0a..e784529e0baaea3416f0db64c100ea3d5836c934 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -63,15 +63,22 @@ def create_arguments(n, mode, target_radius=1.0):
     return targets, sources, centers, sigma, expansion_radii
 
 
-def create_subset(indices, ranges, factor):
-    indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
-    for i in range(ranges.shape[0] - 1):
-        iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
-        indices_[i] = np.sort(np.random.choice(iidx,
-            size=int(factor * len(iidx)), replace=False))
+def create_index_subset(nnodes, nblks, factor):
+    indices = np.arange(0, nnodes)
+    ranges = np.arange(0, nnodes + 1, nnodes // nblks)
 
-    ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
-    indices_ = np.hstack(indices_)
+    if abs(factor - 1.0) < 1.0e-14:
+        ranges_ = ranges
+        indices_ = indices
+    else:
+        indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
+        for i in range(ranges.shape[0] - 1):
+            iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
+            indices_[i] = np.sort(np.random.choice(iidx,
+                size=int(factor * len(iidx)), replace=False))
+
+        ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
+        indices_ = np.hstack(indices_)
 
     return indices_, ranges_
 
@@ -139,11 +146,12 @@ def test_qbx_direct(ctx_getter):
             assert la.norm(blk[itgt, isrc] - mat[block]) < eps
 
 
-@pytest.mark.parametrize("exclude_self", [True, False])
-def test_p2p_direct(ctx_getter, exclude_self):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize("exclude_self", "factor", [
+    (True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)
+    ])
+def test_p2p_direct(ctx_getter, exclude_self, factor):
+    # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
-
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
@@ -166,15 +174,8 @@ def test_p2p_direct(ctx_getter, exclude_self):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        tgtindices, tgtranges = create_subset(tgtindices, tgtranges, 0.6)
-
-        srcindices = np.arange(0, n)
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
-        srcindices, srcranges = create_subset(srcindices, srcranges, 0.6)
+        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
+        srcindices, srcranges = create_index_subset(n, nblks, factor)
         assert tgtranges.shape == srcranges.shape
 
         extra_kwargs = {}
@@ -190,17 +191,11 @@ def test_p2p_direct(ctx_getter, exclude_self):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk,) = blk_gen(queue, targets, sources,
-                            tgtindices, srcindices, tgtranges, srcranges,
-                            **extra_kwargs)
+        _, (blk, rowindices, colindices) = blk_gen(queue, targets, sources,
+            tgtindices, srcindices, tgtranges, srcranges, **extra_kwargs)
 
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
-
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        eps = 1.0e-10 * la.norm(mat)
+        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
 
 # You can test individual routines by typing