diff --git a/requirements.txt b/requirements.txt
index 0acdd3827c4b16f9f8109b4ef7aafd5347b690b8..3a15952739b666eab6234546f34a06ef1a094cad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy
-sympy==1.0
+sympy==1.1.1
 git+https://github.com/inducer/pymbolic
 git+https://github.com/inducer/islpy
 git+https://github.com/inducer/pyopencl
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index c408360d7aac0985f49b5feaf7dc770d37790956..a0e89f242f958bbaea411090582e885d942b2a1d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -375,15 +375,14 @@ class P2PMatrixBlockGenerator(P2PBase):
         .. code-block:: python
 
             blkranges = index_set.linear_ranges()
-            m = index_set.tgtranges[i + 1] - index_set.tgtranges[i]
-            n = index_set.srcranges[i + 1] - index_set.srcranges[i]
+            blkshape = index_set.block_shape(i)
 
-            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(m, n)
+            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(*blkshape)
 
         :arg targets: target point coordinates.
         :arg sources: source point coordinates.
-        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
-            to define the various blocks.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
+            to define the blocks.
         :return: a tuple of one-dimensional arrays of kernel evaluations at
             target-source pairs described by `index_set`.
         """
@@ -394,9 +393,11 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        tgtindices, srcindices = index_set.linear_indices()
-        return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
+        return knl(queue,
+                   targets=targets,
+                   sources=sources,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 985565f10af078da6909f97424ab6ecfdd9ef2d9..a0814e01c412727a908ec72758e62014b3278708 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -354,6 +354,9 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
             self.get_kernel_scaling_assignments()
             + ["""
                 for imat
+                    <> itgt = tgtindices[imat]
+                    <> isrc = srcindices[imat]
+
                     <> a[idim] = center[idim, tgtindices[imat]] - \
                                  src[idim, srcindices[imat]] {dup=idim}
                     <> b[idim] = tgt[idim, tgtindices[imat]] - \
@@ -397,18 +400,21 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         :arg sources: source point coordinates.
         :arg centers: QBX target expansion centers.
         :arg expansion_radii: radii for each expansion center.
-        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndex` object used
-        to define the various blocks.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
+            to define the blocks.
         :return: a tuple of one-dimensional arrays of kernel evaluations at
-        target-source pairs described by `index_set`.
+            target-source pairs described by `index_set`.
         """
 
         knl = self.get_cached_optimized_kernel()
 
-        tgtindices, srcindices = index_set.linear_indices()
-        return knl(queue, src=sources, tgt=targets, center=centers,
-                expansion_radii=expansion_radii,
-                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
+        return knl(queue,
+                   src=sources,
+                   tgt=targets,
+                   center=centers,
+                   expansion_radii=expansion_radii,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index b80deddd00ce330bffcaadaf112929c89821c745..469cb5e15238299092d55841f2bf43486bd11e0c 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -31,6 +31,9 @@ from pytools import memoize_method, memoize_in
 import numpy as np
 import sumpy.symbolic as sym
 
+import pyopencl as cl
+import pyopencl.array  # noqa
+
 import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
@@ -300,88 +303,202 @@ class KernelComputation(object):
 
 # {{{
 
-class MatrixBlockIndex(object):
-    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
-        """Keep track of different ways to index into matrix blocks.
 
-        In this description, a block is given by a subset of indices of
-        the global matrix operator. For example, the :math:`i`-th block of
-        a matrix `mat` using the input arguments is obtained by:
+def _to_host(x, queue=None):
+    if isinstance(x, cl.array.Array):
+        queue = queue or x.queue
+        return x.get(queue)
+    return x
 
-        .. code-block:: python
 
-            mat[np.ix_(tgtindices[tgtranges[i]:tgtranges[i + 1]],
-                       srcindices[srcranges[i]:srcranges[i + 1]])]
+class BlockIndexRanges(object):
+    """Convenience class for working with blocks of a global array.
 
-        :arg tgtindices: a subset of the full matrix row index set.
-        :arg srcindices: a subset of the full matrix column index set.
-        :arg tgtranges: an array used to index into `tgtindices`.
-        :arg srcranges: an array used to index into `srcindices`.
-        """
+    .. attribute:: indices
+
+        A list of not necessarily continuous or increasing integers
+        representing the indices of a global array. The individual blocks are
+        delimited using :attr:`ranges`.
+
+    .. attribute:: ranges
+
+        A list of nondecreasing integers used to index into :attr:`indices`.
+        A block :math:`i` can be retrieved using
+        `indices[ranges[i]:ranges[i + 1]]`.
+
+    .. automethod:: block_shape
+    .. automethod:: get
+    .. automethod:: take
+    """
 
-        self.queue = queue
-        self.tgtindices = tgtindices
-        self.tgtranges = tgtranges
-        self.srcindices = srcindices
-        self.srcranges = srcranges
+    def __init__(self, cl_context, indices, ranges):
+        self.cl_context = cl_context
+        self.indices = indices
+        self.ranges = ranges
 
+    @property
     @memoize_method
+    def _ranges(self):
+        with cl.CommandQueue(self.cl_context) as queue:
+            return _to_host(self.ranges, queue=queue)
+
+    @property
+    def nblocks(self):
+        return self.ranges.shape[0] - 1
+
+    def block_shape(self, i):
+        return (self._ranges[i + 1] - self._ranges[i],)
+
+    def block_indices(self, i):
+        return self.indices[self._ranges[i]:self._ranges[i + 1]]
+
+    def get(self, queue=None):
+        return BlockIndexRanges(self.cl_context,
+                                _to_host(self.indices, queue=queue),
+                                _to_host(self.ranges, queue=queue))
+
+    def take(self, x, i):
+        """Return the subset of a global array `x` that is defined by
+        the :attr:`indices` in block :math:`i`.
+        """
+
+        return x[self.block_indices(i)]
+
+
+class MatrixBlockIndexRanges(object):
+    """Keep track of different ways to index into matrix blocks.
+
+    .. attribute:: row
+
+        A :class:`BlockIndexRanges` encapsulating row block indices.
+
+    .. attribute:: col
+
+        A :class:`BlockIndexRanges` encapsulating column block indices.
+
+    .. automethod:: block_shape
+    .. automethod:: block_take
+    .. automethod:: get
+    .. automethod:: take
+
+    """
+
+    def __init__(self, cl_context, row, col):
+        self.cl_context = cl_context
+        self.row = row
+        self.col = col
+        assert self.row.nblocks == self.col.nblocks
+
+        self.blkranges = np.cumsum([0] + [
+            self.row.block_shape(i)[0] * self.col.block_shape(i)[0]
+            for i in range(self.row.nblocks)])
+
+        if isinstance(self.row.indices, cl.array.Array):
+            with cl.CommandQueue(self.cl_context) as queue:
+                self.blkranges = \
+                    cl.array.to_device(queue, self.blkranges).with_queue(None)
+
+    @property
+    def nblocks(self):
+        return self.row.nblocks
+
+    def block_shape(self, i):
+        return self.row.block_shape(i) + self.col.block_shape(i)
+
+    def block_indices(self, i):
+        return (self.row.block_indices(i),
+                self.col.block_indices(i))
+
+    @property
+    def linear_row_indices(self):
+        r, _ = self._linear_indices()
+        return r
+
+    @property
+    def linear_col_indices(self):
+        _, c = self._linear_indices()
+        return c
+
+    @property
     def linear_ranges(self):
+        return self.blkranges
+
+    def get(self, queue=None):
+        """Transfer data to the host. Only the initial given data is
+        transfered, not the arrays returned by :meth:`linear_row_indices` and
+        friends.
+
+        :return: a copy of `self` in which all data lives on the host, i.e.
+        all :class:`pyopencl.array.Array` instances are replaces by
+        :class:`numpy.ndarray` instances.
         """
-        :return: an array containing the cummulative sum of all block sizes.
-            It can be used to index into a linear representation of the
-            matrix blocks.
+        return MatrixBlockIndexRanges(self.cl_context,
+                row=self.row.get(queue=queue),
+                col=self.col.get(queue=queue))
+
+    def take(self, x, i):
+        """Retrieve a block from a global matrix.
+
+        :arg x: a 2D :class:`numpy.ndarray`.
+        :arg i: block index.
+        :return: requested block from the matrix.
         """
 
-        @memoize_in(self, "block_cumsum_knl")
-        def cumsum():
-            loopy_knl = lp.make_kernel(
-                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
-                """
-                blkranges[0] = 0
-                blkranges[i + 1] = reduce(sum, j, \
-                    (srcranges[j + 1] - srcranges[j]) * \
-                    (tgtranges[j + 1] - tgtranges[j])) \
-                """,
-                [
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
-                    lp.ValueArg("nranges", None)
-                ],
-                name="block_cumsum_knl",
-                default_offset=lp.auto,
-                assumptions="nranges>=1",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+        if isinstance(self.row.indices, cl.array.Array) or \
+                isinstance(self.col.indices, cl.array.Array):
+            raise ValueError("CL `Array`s are not supported."
+                    "Use MatrixBlockIndexRanges.get() and then view into matrices.")
 
-            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
-                force_outer_iname_for_scan="i")
-            return loopy_knl
+        irow, icol = self.block_indices(i)
+        return x[np.ix_(irow, icol)]
 
-        _, (blkranges,) = cumsum()(self.queue,
-            tgtranges=self.tgtranges, srcranges=self.srcranges)
+    def block_take(self, x, i):
+        """Retrieve a block from a linear representation of the matrix blocks.
+        A linear representation of the matrix blocks can be obtained, or
+        should be consistent with
+
+        .. code-block:: python
 
-        return blkranges
+            i = index.linear_row_indices()
+            j = index.linear_col_indices()
+            linear_blks = global_mat[i, j]
+
+            for k in range(index.nblocks):
+                assert np.allclose(index.block_take(linear_blks, k),
+                                   index.take(global_mat, k))
+
+        :arg x: a 1D :class:`numpy.ndarray`.
+        :arg i: block index.
+        :return: requested block, reshaped into a 2D array.
+        """
+
+        iblk = np.s_[self.blkranges[i]:self.blkranges[i + 1]]
+        return x[iblk].reshape(*self.block_shape(i))
 
     @memoize_method
-    def linear_indices(self):
+    def _linear_indices(self):
         """
-        :return: a tuple of `(rowindices, colindices)` that can be used in
-            conjunction with :func:`linear_ranges` to provide linear indexing
-            into a set of matrix blocks. These index arrays are just the
-            concatenated Cartesian products of all the block arrays described
-            by :attr:`tgtindices` and :arg:`srcindices`.
+        :return: a tuple of `(rowindices, colindices)` that can be
+            used to provide linear indexing into a set of matrix blocks. These
+            index arrays are just the concatenated Cartesian products of all
+            the block arrays described by :attr:`row` and :attr:`col`.
 
-            They can be used to index directly into the matrix as follows:
+            They can be used to index directly into a matrix as follows:
 
             .. code-block:: python
 
                 mat[rowindices[blkranges[i]:blkranges[i + 1]],
                     colindices[blkranges[i]:blkranges[i + 1]]]
+
+            The same block can be obtained more easily using
+
+            .. code-block:: python
+
+                index.view(mat, i).reshape(-1)
         """
 
         @memoize_in(self, "block_index_knl")
-        def linear_index():
+        def _build_index():
             loopy_knl = lp.make_kernel([
                 "{[irange]: 0 <= irange < nranges}",
                 "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
@@ -417,13 +534,16 @@ class MatrixBlockIndex(object):
 
             return loopy_knl
 
-        blkranges = self.linear_ranges()
-        _, (rowindices, colindices) = linear_index()(self.queue,
-            tgtindices=self.tgtindices, srcindices=self.srcindices,
-            tgtranges=self.tgtranges, srcranges=self.srcranges,
-            blkranges=blkranges, nresults=blkranges[-1])
-
-        return rowindices, colindices
+        with cl.CommandQueue(self.cl_context) as queue:
+            _, (rowindices, colindices) = _build_index()(queue,
+                tgtindices=self.row.indices,
+                srcindices=self.col.indices,
+                tgtranges=self.row.ranges,
+                srcranges=self.col.ranges,
+                blkranges=self.blkranges,
+                nresults=_to_host(self.blkranges[-1], queue=queue))
+            return (rowindices.with_queue(None),
+                    colindices.with_queue(None))
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 892fef63edaf37d4286c07a7916fc13aabc269cd..2f3aabc3082b99707ab86b5461390e866350fdc3 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -26,12 +26,16 @@ import sys
 import numpy as np
 import numpy.linalg as la
 
-import pytest
 import pyopencl as cl
+import pyopencl.array  # noqa
+
+from sumpy.tools import vector_to_device
+from sumpy.tools import MatrixBlockIndexRanges
+
+import pytest
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
-from sumpy.tools import MatrixBlockIndex
 
 import logging
 logger = logging.getLogger(__name__)
@@ -44,7 +48,7 @@ else:
     faulthandler.enable()
 
 
-def create_arguments(n, mode, target_radius=1.0):
+def _build_geometry(queue, n, mode, target_radius=1.0):
     # parametrize circle
     t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
     unit_circle = np.exp(1j * t)
@@ -62,10 +66,14 @@ def create_arguments(n, mode, target_radius=1.0):
     centers = unit_circle * (1.0 - radius)
     expansion_radii = radius * np.ones(n)
 
-    return targets, sources, centers, sigma, expansion_radii
+    return (cl.array.to_device(queue, targets),
+            cl.array.to_device(queue, sources),
+            vector_to_device(queue, centers),
+            cl.array.to_device(queue, expansion_radii),
+            cl.array.to_device(queue, sigma))
 
 
-def create_index_subset(nnodes, nblks, factor):
+def _build_block_index(queue, nnodes, nblks, factor):
     indices = np.arange(0, nnodes)
     ranges = np.arange(0, nnodes + 1, nnodes // nblks)
 
@@ -82,116 +90,159 @@ def create_index_subset(nnodes, nblks, factor):
         ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
         indices_ = np.hstack(indices_)
 
-    return indices_, ranges_
+    from sumpy.tools import BlockIndexRanges
+    return BlockIndexRanges(queue.context,
+                            cl.array.to_device(queue, indices_).with_queue(None),
+                            cl.array.to_device(queue, ranges_).with_queue(None))
 
 
 @pytest.mark.parametrize('factor', [1.0, 0.6])
-def test_qbx_direct(ctx_getter, factor):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize('lpot_id', [1, 2])
+def test_qbx_direct(ctx_getter, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(2)
-
+    ndim = 2
     nblks = 10
     order = 12
     mode_nr = 25
 
+    from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
+    if lpot_id == 1:
+        knl = LaplaceKernel(ndim)
+    elif lpot_id == 2:
+        knl = LaplaceKernel(ndim)
+        knl = DirectionalSourceDerivative(knl, dir_vec_name="dsource_vec")
+    else:
+        raise ValueError("unknow lpot_id")
+
+    from sumpy.expansion.local import LineTaylorLocalExpansion
+    lknl = LineTaylorLocalExpansion(knl, order)
+
     from sumpy.qbx import LayerPotential
+    lpot = LayerPotential(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixGenerator
+    mat_gen = LayerPotentialMatrixGenerator(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixBlockGenerator
-    from sumpy.expansion.local import LineTaylorLocalExpansion
-    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
-    mat_gen = LayerPotentialMatrixGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
-    blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
+    blk_gen = LayerPotentialMatrixBlockGenerator(ctx, [lknl])
 
     for n in [200, 300, 400]:
-        targets, sources, centers, sigma, expansion_radii = \
-                create_arguments(n, mode_nr)
+        targets, sources, centers, expansion_radii, sigma = \
+                _build_geometry(queue, n, mode_nr)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
-        srcindices, srcranges = create_index_subset(n, nblks, factor)
-        assert tgtranges.shape == srcranges.shape
-
-        _, (mat,) = mat_gen(queue, targets, sources, centers,
-                expansion_radii)
-        result_mat = mat.dot(strengths[0])
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
 
-        _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
-                expansion_radii)
+        extra_kwargs = {}
+        if lpot_id == 2:
+            extra_kwargs["dsource_vec"] = \
+                    vector_to_device(queue, np.ones((ndim, n)))
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                strengths=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
+
+        rowindices = index_set.linear_row_indices.get(queue)
+        colindices = index_set.linear_col_indices.get(queue)
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
+        assert la.norm(blk - mat[rowindices, colindices]) < eps
 
-        index_set = MatrixBlockIndex(queue,
-            tgtindices, srcindices, tgtranges, srcranges)
-        _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
-                            index_set)
-
-        rowindices, colindices = index_set.linear_indices()
-        eps = 1.0e-10 * la.norm(mat)
-        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
-
-@pytest.mark.parametrize(("exclude_self", "factor"),
-    [(True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)])
+@pytest.mark.parametrize("exclude_self", [True, False])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
 def test_p2p_direct(ctx_getter, exclude_self, factor):
-    # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
+
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(2)
-
+    ndim = 2
     nblks = 10
     mode_nr = 25
 
+    from sumpy.kernel import LaplaceKernel
+    lknl = LaplaceKernel(ndim)
+
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixGenerator
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixBlockGenerator
     blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
 
     for n in [200, 300, 400]:
-        targets, sources, _, sigma, _ = \
-            create_arguments(n, mode_nr, target_radius=1.2)
+        targets, sources, _, _, sigma = \
+            _build_geometry(queue, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices, tgtranges = create_index_subset(n, nblks, factor)
-        srcindices, srcranges = create_index_subset(n, nblks, factor)
-        assert tgtranges.shape == srcranges.shape
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
 
         extra_kwargs = {}
         if exclude_self:
-            extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
-
-        _, (mat,) = mat_gen(queue, targets, sources, **extra_kwargs)
-        result_mat = mat.dot(strengths[0])
-
-        _, (result_lpot,) = lpot(queue, targets, sources, strengths,
-                                 **extra_kwargs)
+            extra_kwargs["target_to_source"] = \
+                cl.array.arange(queue, 0, n, dtype=np.int)
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                strength=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        index_set = MatrixBlockIndex(queue,
-                tgtindices, srcindices, tgtranges, srcranges)
-        _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs)
-
-        rowindices, colindices = index_set.linear_indices()
-        eps = 1.0e-10 * la.norm(mat)
-        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
+        index_set = index_set.get(queue)
+        for i in range(index_set.nblocks):
+            assert la.norm(index_set.block_take(blk, i) -
+                           index_set.take(mat, i)) < eps
 
 
 # You can test individual routines by typing