diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index c408360d7aac0985f49b5feaf7dc770d37790956..7e012497681a050cd607650f22efdcfcb0c5b529 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -375,10 +375,9 @@ class P2PMatrixBlockGenerator(P2PBase):
         .. code-block:: python
 
             blkranges = index_set.linear_ranges()
-            m = index_set.tgtranges[i + 1] - index_set.tgtranges[i]
-            n = index_set.srcranges[i + 1] - index_set.srcranges[i]
+            blkshape = index_set.block_shape(i)
 
-            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(m, n)
+            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(*blkshape)
 
         :arg targets: target point coordinates.
         :arg sources: source point coordinates.
@@ -394,9 +393,11 @@ class P2PMatrixBlockGenerator(P2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        tgtindices, srcindices = index_set.linear_indices()
-        return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
+        return knl(queue,
+                   targets=targets,
+                   sources=sources,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index b67c104225f3cccacde2abb5d8fbc8cbff0668b0..dea09e742fa6f38d1bb085392b04b8b292f9e499 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -408,10 +408,13 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
 
         knl = self.get_cached_optimized_kernel()
 
-        tgtindices, srcindices = index_set.linear_indices()
-        return knl(queue, src=sources, tgt=targets, center=centers,
-                expansion_radii=expansion_radii,
-                tgtindices=tgtindices, srcindices=srcindices, **kwargs)
+        return knl(queue,
+                   src=sources,
+                   tgt=targets,
+                   center=centers,
+                   expansion_radii=expansion_radii,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index b80deddd00ce330bffcaadaf112929c89821c745..ce27a6ec843112b80c0b78341255ad1138654eea 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -31,6 +31,9 @@ from pytools import memoize_method, memoize_in
 import numpy as np
 import sumpy.symbolic as sym
 
+import pyopencl as cl
+import pyopencl.array  # noqa
+
 import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
@@ -300,88 +303,170 @@ class KernelComputation(object):
 
 # {{{
 
-class MatrixBlockIndex(object):
-    def __init__(self, queue, tgtindices, srcindices, tgtranges, srcranges):
-        """Keep track of different ways to index into matrix blocks.
 
-        In this description, a block is given by a subset of indices of
-        the global matrix operator. For example, the :math:`i`-th block of
-        a matrix `mat` using the input arguments is obtained by:
+def _to_host(x, queue=None):
+    if isinstance(x, cl.array.Array):
+        queue = queue or x.queue
+        return x.get(queue)
+    return x
 
-        .. code-block:: python
 
-            mat[np.ix_(tgtindices[tgtranges[i]:tgtranges[i + 1]],
-                       srcindices[srcranges[i]:srcranges[i + 1]])]
+class BlockIndex(object):
+    """Convenience class for working with block indices.
 
-        :arg tgtindices: a subset of the full matrix row index set.
-        :arg srcindices: a subset of the full matrix column index set.
-        :arg tgtranges: an array used to index into `tgtindices`.
-        :arg srcranges: an array used to index into `srcindices`.
-        """
+    .. automethod:: block_shape
+    .. automethod:: get
+    .. automethod:: view
+    """
 
-        self.queue = queue
-        self.tgtindices = tgtindices
-        self.tgtranges = tgtranges
-        self.srcindices = srcindices
-        self.srcranges = srcranges
+    def __init__(self, indices, ranges):
+        self.indices = indices
+        self.ranges = ranges
 
+    @property
+    def nblocks(self):
+        return self.ranges.shape[0] - 1
+
+    def block_shape(self, i):
+        return (self._ranges[i + 1] - self._ranges[i],)
+
+    def get(self, queue=None):
+        return BlockIndex(_to_host(self.indices, queue=queue),
+                          _to_host(self.ranges, queue=queue))
+
+    @property
     @memoize_method
+    def _ranges(self):
+        return _to_host(self.ranges)
+
+    def view(self, x, i):
+        return x[self.indices[self._ranges[i]:self._ranges[i + 1]]]
+
+
+class MatrixBlockIndex(object):
+    """Keep track of different ways to index into matrix blocks.
+
+    .. attribute:: row
+
+        A :class:`BlockIndex`.
+
+    .. attribute:: col
+
+        A :class:`BlockIndex`.
+
+    .. automethod:: block_shape
+    .. automethod:: get
+    .. automethod:: view
+    """
+
+    def __init__(self, queue, rowindices, colindices):
+        self.queue = queue
+        self.row = rowindices
+        self.col = colindices
+        assert self.row.nblocks == self.col.nblocks
+
+        self.blkranges = np.cumsum([0] + [
+            self.row.block_shape(i)[0] * self.col.block_shape(i)[0]
+            for i in range(self.row.nblocks)])
+        if self.queue:
+            self.blkranges = cl.array.to_device(self.queue, self.blkranges)
+
+    @property
+    def nblocks(self):
+        return self.row.nblocks
+
+    def block_shape(self, i):
+        return self.row.block_shape(i) + self.col.block_shape(i)
+
+    @property
+    def linear_row_indices(self):
+        r, _ = self._linear_indices()
+        return r
+
+    @property
+    def linear_col_indices(self):
+        _, c = self._linear_indices()
+        return c
+
+    @property
     def linear_ranges(self):
+        return self.blkranges
+
+    def get(self, queue=None):
+        """Transfer data to the host. Only the initial given data is
+        transfered, not the arrays returned by :meth:`linear_row_indices` and
+        friends.
+
+        :return: a copy of `self` in which all data lives on the host, i.e.
+        all :class:`pyopencl.array.Array` instances are replaces by
+        :class:`numpy.ndarray` instances.
         """
-        :return: an array containing the cummulative sum of all block sizes.
-            It can be used to index into a linear representation of the
-            matrix blocks.
+        queue = queue or self.queue
+
+        return MatrixBlockIndex(None,
+                rowindices=self.row.get(queue),
+                colindices=self.col.get(queue))
+
+    def view(self, x, i):
+        """Retrieve a block from a global matrix.
+
+        :arg x: a 2D :class:`numpy.ndarray`, i.e. a matrix.
+        :arg i: the block index.
+        :return: the requested block from the matrix.
         """
 
-        @memoize_in(self, "block_cumsum_knl")
-        def cumsum():
-            loopy_knl = lp.make_kernel(
-                "{[i, j]: 0 <= i < nranges and 0 <= j <= i}",
-                """
-                blkranges[0] = 0
-                blkranges[i + 1] = reduce(sum, j, \
-                    (srcranges[j + 1] - srcranges[j]) * \
-                    (tgtranges[j + 1] - tgtranges[j])) \
-                """,
-                [
-                    lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                    lp.GlobalArg("blkranges", np.int32, shape="nranges + 1"),
-                    lp.ValueArg("nranges", None)
-                ],
-                name="block_cumsum_knl",
-                default_offset=lp.auto,
-                assumptions="nranges>=1",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+        if isinstance(x, cl.array.Array) or \
+                isinstance(self.row.indices, cl.array.Array) or \
+                isinstance(self.col.indices, cl.array.Array):
+            raise ValueError("CL `Array`s are not supported."
+                             "Use MatrixBlockIndex.get() and then view into matrices.")
 
-            loopy_knl = lp.realize_reduction(loopy_knl, force_scan=True,
-                force_outer_iname_for_scan="i")
-            return loopy_knl
+        irow = self.row.indices[self.row.ranges[i]:self.row.ranges[i + 1]]
+        icol = self.col.indices[self.col.ranges[i]:self.col.ranges[i + 1]]
+
+        return x[np.ix_(irow, icol)]
 
-        _, (blkranges,) = cumsum()(self.queue,
-            tgtranges=self.tgtranges, srcranges=self.srcranges)
+    def view_block(self, x, i):
+        """Retrieve a block from a linear representation of the matrix blocks.
+
+        :arg x: a 1D :class:`numpy.ndarray`.
+        :arg i: the block index.
+        :return: the requested block, reshaped into a 2D array. The result
+            matches, in shape and value, the block returned by
+            :meth:`view` with the global matrix.
+        """
+        if isinstance(x, cl.array.Array) or \
+                isinstance(self.blkranges, cl.array.Array):
+            raise ValueError("CL `Array`s are not supported."
+                             "Use MatrixBlockIndex.get() and then view into matrices.")
 
-        return blkranges
+        iblk = np.s_[self.blkranges[i]:self.blkranges[i + 1]]
+        return x[iblk].reshape(*self.block_shape(i))
 
     @memoize_method
-    def linear_indices(self):
+    def _linear_indices(self):
         """
-        :return: a tuple of `(rowindices, colindices)` that can be used in
-            conjunction with :func:`linear_ranges` to provide linear indexing
-            into a set of matrix blocks. These index arrays are just the
-            concatenated Cartesian products of all the block arrays described
-            by :attr:`tgtindices` and :arg:`srcindices`.
+        :return: a tuple of `(rowindices, colindices)` that can be
+            used to provide linear indexing into a set of matrix blocks. These
+            index arrays are just the concatenated Cartesian products of all
+            the block arrays described by :attr:`row` and :attr:`col`.
 
-            They can be used to index directly into the matrix as follows:
+            They can be used to index directly into a matrix as follows:
 
             .. code-block:: python
 
                 mat[rowindices[blkranges[i]:blkranges[i + 1]],
                     colindices[blkranges[i]:blkranges[i + 1]]]
+
+            The same block can be obtained more easily using
+
+            .. code-block:: python
+
+                index.view(mat, i).reshape(-1)
         """
 
         @memoize_in(self, "block_index_knl")
-        def linear_index():
+        def _build_index():
             loopy_knl = lp.make_kernel([
                 "{[irange]: 0 <= irange < nranges}",
                 "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
@@ -417,11 +502,13 @@ class MatrixBlockIndex(object):
 
             return loopy_knl
 
-        blkranges = self.linear_ranges()
-        _, (rowindices, colindices) = linear_index()(self.queue,
-            tgtindices=self.tgtindices, srcindices=self.srcindices,
-            tgtranges=self.tgtranges, srcranges=self.srcranges,
-            blkranges=blkranges, nresults=blkranges[-1])
+        _, (rowindices, colindices) = _build_index()(self.queue,
+            tgtindices=self.row.indices,
+            srcindices=self.col.indices,
+            tgtranges=self.row.ranges,
+            srcranges=self.col.ranges,
+            blkranges=self.blkranges,
+            nresults=self.blkranges[-1].get(self.queue))
 
         return rowindices, colindices
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 5fde01edf40510c81bc67dd0d3ab93759a3fc857..6c9b1c365c93062dad368e571409531a304f9f76 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -27,9 +27,11 @@ import numpy as np
 import numpy.linalg as la
 
 import pyopencl as cl
+import pyopencl.array
 
 from pytools.obj_array import make_obj_array
 
+from sumpy.tools import vector_to_device
 from sumpy.tools import MatrixBlockIndex
 
 import pytest
@@ -48,7 +50,7 @@ else:
     faulthandler.enable()
 
 
-def _build_geometry(n, mode, target_radius=1.0):
+def _build_geometry(queue, n, mode, target_radius=1.0):
     # parametrize circle
     t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
     unit_circle = np.exp(1j * t)
@@ -66,10 +68,14 @@ def _build_geometry(n, mode, target_radius=1.0):
     centers = unit_circle * (1.0 - radius)
     expansion_radii = radius * np.ones(n)
 
-    return targets, sources, centers, sigma, expansion_radii
+    return (cl.array.to_device(queue, targets),
+            cl.array.to_device(queue, sources),
+            vector_to_device(queue, centers),
+            cl.array.to_device(queue, expansion_radii),
+            cl.array.to_device(queue, sigma))
 
 
-def _build_block_index(nnodes, nblks, factor):
+def _build_block_index(queue, nnodes, nblks, factor):
     indices = np.arange(0, nnodes)
     ranges = np.arange(0, nnodes + 1, nnodes // nblks)
 
@@ -86,128 +92,158 @@ def _build_block_index(nnodes, nblks, factor):
         ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
         indices_ = np.hstack(indices_)
 
-    return indices_, ranges_
+    from sumpy.tools import BlockIndex
+    return BlockIndex(cl.array.to_device(queue, indices_),
+                      cl.array.to_device(queue, ranges_))
 
 
 @pytest.mark.parametrize('factor', [1.0, 0.6])
 @pytest.mark.parametrize('lpot_id', [1, 2])
 def test_qbx_direct(ctx_getter, factor, lpot_id):
-    # This evaluates a single layer potential on a circle.
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
+    ndim = 2
+    nblks = 10
+    order = 12
+    mode_nr = 25
+
     from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
     if lpot_id == 1:
-        lknl = LaplaceKernel(2)
+        knl = LaplaceKernel(ndim)
     elif lpot_id == 2:
-        lknl = LaplaceKernel(2)
-        lknl = DirectionalSourceDerivative(lknl, dir_vec_name="dsource_vec")
+        knl = LaplaceKernel(ndim)
+        knl = DirectionalSourceDerivative(knl, dir_vec_name="dsource_vec")
     else:
         raise ValueError("unknow lpot_id")
 
-    nblks = 10
-    order = 12
-    mode_nr = 25
+    from sumpy.expansion.local import LineTaylorLocalExpansion
+    lknl = LineTaylorLocalExpansion(knl, order)
 
     from sumpy.qbx import LayerPotential
+    lpot = LayerPotential(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixGenerator
+    mat_gen = LayerPotentialMatrixGenerator(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixBlockGenerator
-    from sumpy.expansion.local import LineTaylorLocalExpansion
-    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
-    mat_gen = LayerPotentialMatrixGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
-    blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
+    blk_gen = LayerPotentialMatrixBlockGenerator(ctx, [lknl])
 
     for n in [200, 300, 400]:
-        targets, sources, centers, sigma, expansion_radii = \
-                _build_geometry(n, mode_nr)
+        targets, sources, centers, expansion_radii, sigma = \
+                _build_geometry(queue, n, mode_nr)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices, tgtranges = _build_block_index(n, nblks, factor)
-        srcindices, srcranges = _build_block_index(n, nblks, factor)
-        assert tgtranges.shape == srcranges.shape
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndex(queue, tgtindices, srcindices)
 
-        knl_kwargs = {}
+        extra_kwargs = {}
         if lpot_id == 2:
-            knl_kwargs["dsource_vec"] = make_obj_array([
-                np.ones(n) for _ in range(2)])
-
-        _, (mat,) = mat_gen(queue, targets, sources, centers,
-                expansion_radii, **knl_kwargs)
-        result_mat = mat.dot(strengths[0])
-
-        _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
-                expansion_radii, **knl_kwargs)
-
-        eps = 1.0e-10 * la.norm(result_lpot)
+            extra_kwargs["dsource_vec"] = \
+                    vector_to_device(queue, np.ones((ndim, n)))
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                strengths=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
+
+        rowindices = index_set.linear_row_indices.get(queue)
+        colindices = index_set.linear_col_indices.get(queue)
+
+        eps = 1.0e-13 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
+        assert la.norm(blk - mat[rowindices, colindices]) < eps
 
-        index_set = MatrixBlockIndex(queue,
-            tgtindices, srcindices, tgtranges, srcranges)
-        _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
-                            index_set, **knl_kwargs)
-
-        rowindices, colindices = index_set.linear_indices()
-        eps = 1.0e-10 * la.norm(mat)
-        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
 
-
-@pytest.mark.parametrize(("exclude_self", "factor"),
-    [(True, 1.0), (True, 0.6), (False, 1.0), (False, 0.6)])
+@pytest.mark.parametrize("exclude_self", [True, False])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
 def test_p2p_direct(ctx_getter, exclude_self, factor):
-    # This does a point-to-point kernel evaluation on a circle.
     logging.basicConfig(level=logging.INFO)
+
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(2)
-
+    ndim = 2
     nblks = 10
     mode_nr = 25
 
+    from sumpy.kernel import LaplaceKernel
+    lknl = LaplaceKernel(ndim)
+
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixGenerator
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixBlockGenerator
     blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
 
     for n in [200, 300, 400]:
-        targets, sources, _, sigma, _ = \
-            _build_geometry(n, mode_nr, target_radius=1.2)
+        targets, sources, _, _, sigma = \
+            _build_geometry(queue, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices, tgtranges = _build_block_index(n, nblks, factor)
-        srcindices, srcranges = _build_block_index(n, nblks, factor)
-        assert tgtranges.shape == srcranges.shape
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndex(queue, tgtindices, srcindices)
 
         extra_kwargs = {}
         if exclude_self:
-            extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
-
-        _, (mat,) = mat_gen(queue, targets, sources, **extra_kwargs)
-        result_mat = mat.dot(strengths[0])
-
-        _, (result_lpot,) = lpot(queue, targets, sources, strengths,
-                                 **extra_kwargs)
-
-        eps = 1.0e-10 * la.norm(result_lpot)
+            extra_kwargs["target_to_source"] = \
+                cl.array.arange(queue, 0, n, dtype=np.int)
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                strength=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
+
+        eps = 1.0e-13 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        index_set = MatrixBlockIndex(queue,
-                tgtindices, srcindices, tgtranges, srcranges)
-        _, (blk,) = blk_gen(queue, targets, sources, index_set, **extra_kwargs)
-
-        rowindices, colindices = index_set.linear_indices()
-        eps = 1.0e-10 * la.norm(mat)
-        assert la.norm(blk - mat[rowindices, colindices].reshape(-1)) < eps
+        index_set = index_set.get(queue)
+        for i in range(index_set.nblocks):
+            assert la.norm(index_set.view_block(blk, i) -
+                           index_set.view(mat, i)) < eps
 
 
 # You can test individual routines by typing