diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 38bec39d3b4dbd7efe6229c563f391c29fdd3efb..c408360d7aac0985f49b5feaf7dc770d37790956 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -293,7 +293,10 @@ class P2PMatrixGenerator(P2PBase):
 # {{{ P2P matrix block writer
 
 class P2PMatrixBlockGenerator(P2PBase):
-    """Generator for a subset of P2P interaction matrix entries."""
+    """Generator for a subset of P2P interaction matrix entries.
+
+    .. automethod:: __call__
+    """
 
     default_name = "p2p_block"
 
@@ -306,68 +309,44 @@ class P2PMatrixBlockGenerator(P2PBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", None),
-                lp.ValueArg("ntgtindices", None),
-                lp.ValueArg("nranges", None)
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
             ] +
-            [lp.GlobalArg("result_%d" % i, dtype,
-                shape="ntgtindices, nsrcindices")
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
              for i, dtype in enumerate(self.value_dtypes)])
 
-        loopy_knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[itgt, isrc, idim]: \
-                0 <= itgt < ntgtblock and \
-                0 <= isrc < nsrcblock and \
-                0 <= idim < dim}",
-            ],
+        loopy_knl = lp.make_kernel(
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
             + ["""
-                for irange
-                    <> tgtstart = tgtranges[irange]
-                    <> tgtend = tgtranges[irange + 1]
-                    <> ntgtblock = tgtend - tgtstart
-                    <> srcstart = srcranges[irange]
-                    <> srcend = srcranges[irange + 1]
-                    <> nsrcblock = srcend - srcstart
-
-                    for itgt, isrc
-                        <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \
-                                     sources[idim, srcindices[srcstart + isrc]]
+                for imat
+                    <> d[idim] = targets[idim, tgtindices[imat]] - \
+                                 sources[idim, srcindices[imat]]
             """]
             + ["""
-                        <> is_self = (srcindices[srcstart + isrc] ==
-                                      target_to_source[tgtindices[tgtstart + itgt]])
+                    <> is_self = (srcindices[imat] ==
+                                  target_to_source[tgtindices[imat]])
                 """ if self.exclude_self else ""]
             + loopy_insns + kernel_exprs
             + ["""
-                        result_{i}[tgtstart + itgt, srcstart + isrc] = \
-                            knl_{i}_scaling * pair_result_{i} \
-                                {{id_prefix=write_p2p,inames=irange}}
+                    result_{i}[imat] = \
+                        knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_p2p}}
                 """.format(i=iknl)
                 for iknl in range(len(self.kernels))]
-            + ["""
-                    end
-                end
-            """],
+            + ["end"],
             arguments,
-            assumptions="nranges>=1",
+            assumptions="nresult>=1",
             silenced_warnings="write_race(write_p2p*)",
             name=self.name,
             fixed_parameters=dict(dim=self.dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-        loopy_knl = lp.add_dtypes(loopy_knl, dict(
-            nsources=np.int32,
-            ntargets=np.int32,
-            ntgtindices=np.int32,
-            nsrcindices=np.int32))
-
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
+
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
@@ -382,11 +361,32 @@ class P2PMatrixBlockGenerator(P2PBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
         return knl
 
-    def __call__(self, queue, targets, sources, tgtindices, srcindices,
-            tgtranges, srcranges, **kwargs):
+    def __call__(self, queue, targets, sources, index_set, **kwargs):
+        """Construct a set of blocks of the full P2P interaction matrix.
+
+        The blocks are returned as one-dimensional arrays, for performance
+        and storage reasons. If the two-dimensional form is desired, it can
+        be obtained using the information in the `index_set` for a block
+        :math:`i` in the following way:
+
+        .. code-block:: python
+
+            blkranges = index_set.linear_ranges()
+            m = index_set.tgtranges[i + 1] - index_set.tgtranges[i]
+            n = index_set.srcranges[i + 1] - index_set.srcranges[i]
+
+            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(m, n)
+
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
+            to define the various blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+            target-source pairs described by `index_set`.
+        """
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
@@ -394,9 +394,9 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
+        tgtindices, srcindices = index_set.linear_indices()
         return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index f3b2d6f78964d3d4f5a88248f7b9a08d32b722eb..985565f10af078da6909f97424ab6ecfdd9ef2d9 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -189,7 +189,10 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 # {{{ direct applier
 
 class LayerPotential(LayerPotentialBase):
-    """Direct applier for the layer potential."""
+    """Direct applier for the layer potential.
+
+    .. automethod:: __call__
+    """
 
     default_name = "qbx_apply"
 
@@ -321,7 +324,10 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 # {{{
 
 class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
-    """Generator for a subset of the layer potential matrix entries."""
+    """Generator for a subset of the layer potential matrix entries.
+
+    .. automethod:: __call__
+    """
 
     default_name = "qbx_block"
 
@@ -335,68 +341,44 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         arguments = (
             self.get_default_src_tgt_arguments() +
             [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", None),
-                lp.ValueArg("ntgtindices", None),
-                lp.ValueArg("nranges", None)
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
             ] +
-            [lp.GlobalArg("result_%d" % i,
-                dtype, shape="ntgtindices, nsrcindices")
-            for i, dtype in enumerate(self.value_dtypes)])
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+             for i, dtype in enumerate(self.value_dtypes)])
 
         loopy_knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}",
-            "{[idim]: 0 <= idim < dim}"
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
             ],
             self.get_kernel_scaling_assignments()
             + ["""
-                for irange
-                <> tgtstart = tgtranges[irange]
-                <> tgtend = tgtranges[irange + 1]
-                <> ntgtblock = tgtend - tgtstart
-                <> srcstart = srcranges[irange]
-                <> srcend = srcranges[irange + 1]
-                <> nsrcblock = srcend - srcstart
-
-                for itgt, isrc
-                    <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \
-                                 src[idim, srcindices[srcstart + isrc]] \
-                                 {dup=idim}
-                    <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \
-                                 center[idim, tgtindices[tgtstart + itgt]] \
-                                 {dup=idim}
-                    <> rscale = expansion_radii[tgtindices[tgtstart + itgt]]
+                for imat
+                    <> a[idim] = center[idim, tgtindices[imat]] - \
+                                 src[idim, srcindices[imat]] {dup=idim}
+                    <> b[idim] = tgt[idim, tgtindices[imat]] - \
+                                 center[idim, tgtindices[imat]] {dup=idim}
+                    <> rscale = expansion_radii[tgtindices[imat]]
             """]
             + loopy_insns + kernel_exprs
             + ["""
-                    result_{i}[tgtstart + itgt, srcstart + isrc] = \
-                        knl_{i}_scaling * pair_result_{i} \
-                            {{id_prefix=write_lpot,inames=irange}}
+                    result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_lpot}}
                 """.format(i=iknl)
                 for iknl in range(len(self.expansions))]
-            + ["""
-                    end
-                end
-            """],
+            + ["end"],
             arguments,
             name=self.name,
-            assumptions="nranges>=1",
+            assumptions="nresult>=1",
             default_offset=lp.auto,
             silenced_warnings="write_race(write_lpot*)",
             fixed_parameters=dict(dim=self.dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-        loopy_knl = lp.add_dtypes(loopy_knl, dict(
-            nsources=np.int32,
-            ntargets=np.int32,
-            ntgtindices=np.int32,
-            nsrcindices=np.int32))
-
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
+
         for expn in self.expansions:
             loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
 
@@ -405,17 +387,28 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
     def get_optimized_kernel(self):
         loopy_knl = self.get_kernel()
 
-        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+        loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0")
         return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, expansion_radii,
-            tgtindices, srcindices, tgtranges, srcranges, **kwargs):
+                 index_set, **kwargs):
+        """
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg centers: QBX target expansion centers.
+        :arg expansion_radii: radii for each expansion center.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
+        to define the various blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+        target-source pairs described by `index_set`.
+        """
+
         knl = self.get_cached_optimized_kernel()
 
+        tgtindices, srcindices = index_set.linear_indices()
         return knl(queue, src=sources, tgt=targets, center=centers,
                 expansion_radii=expansion_radii,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 964c64cc440c807c02b3f8811c6a3f0de6c84b04..b80deddd00ce330bffcaadaf112929c89821c745 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -1,6 +1,9 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2018 Alexandru Fikl
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,10 +27,13 @@ THE SOFTWARE.
 
 import six
 from six.moves import range, zip
-from pytools import memoize_method
+from pytools import memoize_method, memoize_in
 import numpy as np
 import sumpy.symbolic as sym
 
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -292,6 +298,136 @@ class KernelComputation(object):
 # }}}
 
 
+# {{{
+
+class MatrixBlockIndex(object):
+    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
+        """Keep track of different ways to index into matrix blocks.
+
+        In this description, a block is given by a subset of indices of
+        the global matrix operator. For example, the :math:`i`-th block of
+        a matrix `mat` using the input arguments is obtained by:
+
+        .. code-block:: python
+
+            mat[np.ix_(tgtindices[tgtranges[i]:tgtranges[i + 1]],
+                       srcindices[srcranges[i]:srcranges[i + 1]])]
+
+        :arg tgtindices: a subset of the full matrix row index set.
+        :arg srcindices: a subset of the full matrix column index set.
+        :arg tgtranges: an array used to index into `tgtindices`.
+        :arg srcranges: an array used to index into `srcindices`.
+        """
+
+        self.queue = queue
+        self.tgtindices = tgtindices
+        self.tgtranges = tgtranges
+        self.srcindices = srcindices
+        self.srcranges = srcranges
+
+    @memoize_method
+    def linear_ranges(self):
+        """
+        :return: an array containing the cummulative sum of all block sizes.
+            It can be used to index into a linear representation of the
+            matrix blocks.
+        """
+
+        @memoize_in(self, "block_cumsum_knl")
+        def cumsum():
+            loopy_knl = lp.make_kernel(
+                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
+                """
+                blkranges[0] = 0
+                blkranges[i + 1] = reduce(sum, j, \
+                    (srcranges[j + 1] - srcranges[j]) * \
+                    (tgtranges[j + 1] - tgtranges[j])) \
+                """,
+                [
+                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
+                    lp.ValueArg("nranges", None)
+                ],
+                name="block_cumsum_knl",
+                default_offset=lp.auto,
+                assumptions="nranges>=1",
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
+                force_outer_iname_for_scan="i")
+            return loopy_knl
+
+        _, (blkranges,) = cumsum()(self.queue,
+            tgtranges=self.tgtranges, srcranges=self.srcranges)
+
+        return blkranges
+
+    @memoize_method
+    def linear_indices(self):
+        """
+        :return: a tuple of `(rowindices, colindices)` that can be used in
+            conjunction with :func:`linear_ranges` to provide linear indexing
+            into a set of matrix blocks. These index arrays are just the
+            concatenated Cartesian products of all the block arrays described
+            by :attr:`tgtindices` and :arg:`srcindices`.
+
+            They can be used to index directly into the matrix as follows:
+
+            .. code-block:: python
+
+                mat[rowindices[blkranges[i]:blkranges[i + 1]],
+                    colindices[blkranges[i]:blkranges[i + 1]]]
+        """
+
+        @memoize_in(self, "block_index_knl")
+        def linear_index():
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
+                            {id_prefix=write_index}
+                        colindices[imat] = srcindices[srcranges[irange] + isrc] \
+                            {id_prefix=write_index}
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg('blkranges', None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nresults", None),
+                    '...'
+                ],
+                name="block_index_knl",
+                default_offset=lp.auto,
+                assumptions='nranges>=1',
+                silenced_warnings="write_race(write_index*)",
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
+        blkranges = self.linear_ranges()
+        _, (rowindices, colindices) = linear_index()(self.queue,
+            tgtindices=self.tgtindices, srcindices=self.srcindices,
+            tgtranges=self.tgtranges, srcranges=self.srcranges,
+            blkranges=blkranges, nresults=blkranges[-1])
+
+        return rowindices, colindices
+
+# }}}
+
+
 # {{{ OrderedSet
 
 # Source: http://code.activestate.com/recipes/576694-orderedset/
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 4e2b3ad7c449ba9acb37955cd167d94fe3c80d83..892fef63edaf37d4286c07a7916fc13aabc269cd 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -31,6 +31,8 @@ import pyopencl as cl
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
+from sumpy.tools import MatrixBlockIndex
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -63,7 +65,28 @@ def create_arguments(n, mode, target_radius=1.0):
     return targets, sources, centers, sigma, expansion_radii
 
 
-def test_qbx_direct(ctx_getter):
+def create_index_subset(nnodes, nblks, factor):
+    indices = np.arange(0, nnodes)
+    ranges = np.arange(0, nnodes + 1, nnodes // nblks)
+
+    if abs(factor - 1.0) < 1.0e-14:
+        ranges_ = ranges
+        indices_ = indices
+    else:
+        indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
+        for i in range(ranges.shape[0] - 1):
+            iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
+            indices_[i] = np.sort(np.random.choice(iidx,
+                size=int(factor * len(iidx)), replace=False))
+
+        ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
+        indices_ = np.hstack(indices_)
+
+    return indices_, ranges_
+
+
+@pytest.mark.parametrize('factor', [1.0, 0.6])
+def test_qbx_direct(ctx_getter, factor):
     # This evaluates a single layer potential on a circle.
     logging.basicConfig(level=logging.INFO)
 
@@ -94,14 +117,8 @@ def test_qbx_direct(ctx_getter):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
+        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
+        srcindices, srcranges = create_index_subset(n, nblks, factor)
         assert tgtranges.shape == srcranges.shape
 
         _, (mat,) = mat_gen(queue, targets, sources, centers,
@@ -114,23 +131,21 @@ def test_qbx_direct(ctx_getter):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        index_set = MatrixBlockIndex(queue,
+            tgtindices, srcindices, tgtranges, srcranges)
         _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
-                            tgtindices, srcindices, tgtranges, srcranges)
-
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+                            index_set)
 
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        rowindices, colindices = index_set.linear_indices()
+        eps = 1.0e-10 * la.norm(mat)
+        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
 
-@pytest.mark.parametrize("exclude_self", [True, False])
-def test_p2p_direct(ctx_getter, exclude_self):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize(("exclude_self", "factor"),
+    [(True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)])
+def test_p2p_direct(ctx_getter, exclude_self, factor):
+    # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
-
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
@@ -153,14 +168,8 @@ def test_p2p_direct(ctx_getter, exclude_self):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
+        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
+        srcindices, srcranges = create_index_subset(n, nblks, factor)
         assert tgtranges.shape == srcranges.shape
 
         extra_kwargs = {}
@@ -176,17 +185,13 @@ def test_p2p_direct(ctx_getter, exclude_self):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk,) = blk_gen(queue, targets, sources,
-                            tgtindices, srcindices, tgtranges, srcranges,
-                            **extra_kwargs)
-
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+        index_set = MatrixBlockIndex(queue,
+                tgtindices, srcindices, tgtranges, srcranges)
+        _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs)
 
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        rowindices, colindices = index_set.linear_indices()
+        eps = 1.0e-10 * la.norm(mat)
+        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
 
 # You can test individual routines by typing