diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 81b69ec195582627ac29dd050e004e5ac325bc67..308375dc7296d3698fdb0ca0ca8dc093101d6707 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -88,7 +88,7 @@ Pylint:
   # Pylint won't find the Cython bits without this
   - PROJECT_INSTALL_FLAGS="--editable"
   - export PY_EXE=python3
-  - EXTRA_INSTALL="Cython pybind11 numpy mako matplotlib"
+  - EXTRA_INSTALL="Cython pybind11 numpy scipy mako matplotlib"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
   - ". ./prepare-and-run-pylint.sh pytential test/test_*.py"
   tags:
diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py
index 8ff76bb9e0f0f3f9b7a47d27b91d14210c68cd9c..0526329fe76ced9798b06e83fb881754e0c684c2 100644
--- a/pytential/linalg/proxy.py
+++ b/pytential/linalg/proxy.py
@@ -25,8 +25,10 @@ import numpy as np
 import numpy.linalg as la
 
 from pytools.obj_array import make_obj_array
-from pytools import memoize_method, memoize_in
+from pytools import memoize_method
+
 from sumpy.tools import BlockIndexRanges
+from boxtree.tools import DeviceDataRecord
 
 import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
@@ -37,31 +39,42 @@ Proxy Point Generation
 ~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: ProxyGenerator
-.. autofunction:: partition_by_nodes
+.. autoclass:: BlockProxyPoints
 
+.. autofunction:: partition_by_nodes
 .. autofunction:: gather_block_neighbor_points
-.. autofunction:: gather_block_interaction_points
 """
 
 
 # {{{ point index partitioning
 
+def make_block_index(actx, indices, ranges=None):
+    """Wrap a ``(indices, ranges)`` tuple into a ``BlockIndexRanges``."""
+    if ranges is None:
+        ranges = np.cumsum([0] + [r.size for r in indices])
+        indices = np.hstack(indices)
+
+    return BlockIndexRanges(actx.context,
+            actx.freeze(actx.from_numpy(indices)),
+            actx.freeze(actx.from_numpy(ranges)))
+
+
 def partition_by_nodes(actx, discr,
-        tree_kind="adaptive-level-restricted", max_nodes_in_box=None):
+        tree_kind="adaptive-level-restricted", max_particles_in_box=None):
     """Generate equally sized ranges of nodes. The partition is created at the
     lowest level of granularity, i.e. nodes. This results in balanced ranges
     of points, but will split elements across different ranges.
 
     :arg discr: a :class:`meshmode.discretization.Discretization`.
     :arg tree_kind: if not *None*, it is passed to :class:`boxtree.TreeBuilder`.
-    :arg max_nodes_in_box: passed to :class:`boxtree.TreeBuilder`.
+    :arg max_particles_in_box: passed to :class:`boxtree.TreeBuilder`.
 
     :return: a :class:`sumpy.tools.BlockIndexRanges`.
     """
 
-    if max_nodes_in_box is None:
+    if max_particles_in_box is None:
         # FIXME: this is just an arbitrary value
-        max_nodes_in_box = 32
+        max_particles_in_box = 32
 
     if tree_kind is not None:
         from boxtree import box_flags_enum
@@ -72,7 +85,7 @@ def partition_by_nodes(actx, discr,
         from meshmode.dof_array import flatten, thaw
         tree, _ = builder(actx.queue,
                 flatten(thaw(actx, discr.nodes())),
-                max_particles_in_box=max_nodes_in_box,
+                max_particles_in_box=max_particles_in_box,
                 kind=tree_kind)
 
         tree = tree.get(actx.queue)
@@ -80,32 +93,49 @@ def partition_by_nodes(actx, discr,
                        & box_flags_enum.HAS_CHILDREN == 0).nonzero()
 
         indices = np.empty(len(leaf_boxes), dtype=np.object)
+        ranges = None
+
         for i, ibox in enumerate(leaf_boxes):
             box_start = tree.box_source_starts[ibox]
             box_end = box_start + tree.box_source_counts_cumul[ibox]
             indices[i] = tree.user_source_ids[box_start:box_end]
-
-        ranges = actx.from_numpy(
-                np.cumsum([0] + [box.shape[0] for box in indices])
-                )
-        indices = actx.from_numpy(np.hstack(indices))
     else:
-        indices = actx.from_numpy(np.arange(0, discr.ndofs, dtype=np.int))
-        ranges = actx.from_numpy(np.arange(
-            0,
-            discr.ndofs + 1,
-            max_nodes_in_box, dtype=np.int))
+        indices = np.arange(0, discr.ndofs, dtype=np.int)
+        ranges = np.arange(0, discr.ndofs + 1, max_particles_in_box, dtype=np.int)
 
-    assert ranges[-1] == discr.ndofs
-
-    return BlockIndexRanges(actx.context,
-        actx.freeze(indices), actx.freeze(ranges))
+    return make_block_index(actx, indices, ranges=ranges)
 
 # }}}
 
 
 # {{{ proxy point generator
 
+class BlockProxyPoints(DeviceDataRecord):
+    """
+    .. attribute:: indices
+
+        A :class:`~sumpy.tools.BlockIndexRanges` describing which proxies
+        belong to which block.
+
+    .. attribute:: points
+
+        A concatenated list of all the proxy points. Can be sliced into
+        using :attr:`indices` (shape ``(dim, nproxies * nblocks)``).
+
+    .. attribute:: centers
+
+        A list of all the proxy ball centers (shape ``(dim, nblocks)``).
+
+    .. attribute:: radii
+
+        A list of all the proxy ball radii (shape ``(nblocks,)``).
+    """
+
+    @property
+    def nproxy(self):
+        return self.points[0].shape[0] // self.indices.nblocks
+
+
 def _generate_unit_sphere(ambient_dim, approx_npoints):
     """Generate uniform points on a unit sphere.
 
@@ -210,6 +240,8 @@ class ProxyGenerator:
     def nproxy(self):
         return self.ref_points.shape[1]
 
+    # {{{ proxy ball kernels
+
     @memoize_method
     def get_kernel(self):
         if self.radius_factor < 1.0:
@@ -218,44 +250,54 @@ class ProxyGenerator:
             radius_expr = "{f} * rqbx"
         radius_expr = radius_expr.format(f=self.radius_factor)
 
-        # NOTE: centers of mass are computed using a second-order approximation
         knl = lp.make_kernel([
             "{[irange]: 0 <= irange < nranges}",
-            "{[i]: 0 <= i < npoints}",
-            "{[idim]: 0 <= idim < dim}"
+            "{[i]: 0 <= i < npoints}", "{[j]: 0 <= j < npoints}",
+            "{[idim]: 0 <= idim < ndim}"
             ],
-            ["""
+            """
             for irange
                 <> ioffset = srcranges[irange]
                 <> npoints = srcranges[irange + 1] - srcranges[irange]
 
-                proxy_center[idim, irange] = 1.0 / npoints * \
-                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
-                        {{dup=idim:i}}
+                for idim
+                    <> bbox_max = \
+                        simul_reduce(max, i, sources[idim, srcindices[i + ioffset]])
+                    <> bbox_min = \
+                        simul_reduce(min, i, sources[idim, srcindices[i + ioffset]])
 
-                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
-                        (proxy_center[idim, irange] -
-                         sources[idim, srcindices[i + ioffset]]) ** 2)))
+                    proxy_center[idim, irange] = (bbox_max + bbox_min) / 2.0
+                end
 
-                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                <> rblk = simul_reduce(max, j, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         sources[idim, srcindices[j + ioffset]]) ** 2))) \
+                        {dup=idim}
+            """
+            + ("""
+                <> rqbx_int = simul_reduce(max, j, sqrt(simul_reduce(sum, idim, \
                         (proxy_center[idim, irange] -
-                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
-                         expansion_radii[srcindices[i + ioffset]])
-                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                         center_int[idim, srcindices[j + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[j + ioffset]]) \
+                         {dup=idim}
+                <> rqbx_ext = simul_reduce(max, j, sqrt(simul_reduce(sum, idim, \
                         (proxy_center[idim, irange] -
-                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
-                         expansion_radii[srcindices[i + ioffset]])
+                         center_ext[idim, srcindices[j + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[j + ioffset]]) \
+                         {dup=idim}
                 <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext
-
+            """ if self.radius_factor > 1.0e-14 else "<> rqbx = 0.0")
+            + """
                 proxy_radius[irange] = {radius_expr}
             end
-            """.format(radius_expr=radius_expr)],
-            [
-                lp.GlobalArg("sources", None,
-                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+            """.format(radius_expr=radius_expr),
+            ([] if self.radius_factor < 1.0e-4 else [
                 lp.GlobalArg("center_int", None,
                     shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
                 lp.GlobalArg("center_ext", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C")
+            ]) + [
+                lp.GlobalArg("sources", None,
                     shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
                 lp.GlobalArg("proxy_center", None,
                     shape=(self.ambient_dim, "nranges")),
@@ -265,8 +307,8 @@ class ProxyGenerator:
                 "..."
             ],
             name="find_proxy_radii_knl",
-            assumptions="dim>=1 and nranges>=1",
-            fixed_parameters=dict(dim=self.ambient_dim),
+            assumptions="ndim>=1 and nranges>=1",
+            fixed_parameters=dict(ndim=self.ambient_dim),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
         knl = lp.tag_inames(knl, "idim*:unr")
@@ -279,6 +321,38 @@ class ProxyGenerator:
 
         return knl
 
+    # }}}
+
+    def _get_proxy_centers_and_radii(self, actx, source_dd, indices, **kwargs):
+        from pytential import bind, sym
+        source_dd = sym.as_dofdesc(source_dd)
+        discr = self.places.get_discretization(
+                source_dd.geometry, source_dd.discr_stage)
+
+        from meshmode.dof_array import flatten, thaw
+        context = {
+                "sources": flatten(thaw(actx, discr.nodes())),
+                "srcindices": indices.indices,
+                "srcranges": indices.ranges
+                }
+
+        if self.radius_factor > 1.0e-14:
+            radii = bind(self.places, sym.expansion_radii(
+                self.ambient_dim, dofdesc=source_dd))(actx)
+            center_int = bind(self.places, sym.expansion_centers(
+                self.ambient_dim, -1, dofdesc=source_dd))(actx)
+            center_ext = bind(self.places, sym.expansion_centers(
+                self.ambient_dim, +1, dofdesc=source_dd))(actx)
+
+            context["center_int"] = flatten(center_int)
+            context["center_ext"] = flatten(center_ext)
+            context["expansion_radii"] = flatten(radii)
+
+        knl = self.get_kernel()
+        _, (centers, radii,) = knl(actx.queue, **context, **kwargs)
+
+        return centers, radii
+
     def __call__(self, actx, source_dd, indices, **kwargs):
         """Generate proxy points for each given range of source points in
         the discretization in *source_dd*.
@@ -300,60 +374,58 @@ class ProxyGenerator:
             distance ``pxyradii[i]`` from the range center ``pxycenters[i]``.
         """
 
-        def _affine_map(v, A, b):
-            return np.dot(A, v) + b
-
-        from pytential import bind, sym
-        source_dd = sym.as_dofdesc(source_dd)
-        discr = self.places.get_discretization(
-                source_dd.geometry, source_dd.discr_stage)
-
-        radii = bind(self.places, sym.expansion_radii(
-            self.ambient_dim, dofdesc=source_dd))(actx)
-        center_int = bind(self.places, sym.expansion_centers(
-            self.ambient_dim, -1, dofdesc=source_dd))(actx)
-        center_ext = bind(self.places, sym.expansion_centers(
-            self.ambient_dim, +1, dofdesc=source_dd))(actx)
+        # {{{ compute proxy centers and radii
 
-        from meshmode.dof_array import flatten, thaw
-        knl = self.get_kernel()
-        _, (centers_dev, radii_dev,) = knl(actx.queue,
-            sources=flatten(thaw(actx, discr.nodes())),
-            center_int=flatten(center_int),
-            center_ext=flatten(center_ext),
-            expansion_radii=flatten(radii),
-            srcindices=indices.indices,
-            srcranges=indices.ranges, **kwargs)
+        centers_dev, radii_dev = self._get_proxy_centers_and_radii(actx,
+                source_dd, indices, **kwargs)
 
         from pytential.utils import flatten_to_numpy
-        centers = flatten_to_numpy(actx, centers_dev)
+        centers = np.vstack(flatten_to_numpy(actx, centers_dev))
         radii = flatten_to_numpy(actx, radii_dev)
-        proxies = np.empty(indices.nblocks, dtype=np.object)
+
+        # }}}
+
+        # {{{ build proxy points for each block
+
+        def _affine_map(v, A, b):
+            return np.dot(A, v) + b
+
+        nproxy = self.nproxy * indices.nblocks
+        proxies = np.empty((self.ambient_dim, nproxy), dtype=centers.dtype)
+        pxy_nr_base = 0
+
         for i in range(indices.nblocks):
-            proxies[i] = _affine_map(self.ref_points,
+            bball = _affine_map(self.ref_points,
                     A=(radii[i] * np.eye(self.ambient_dim)),
                     b=centers[:, i].reshape(-1, 1))
 
-        pxyranges = actx.from_numpy(np.arange(
-            0,
-            proxies.shape[0] * proxies[0].shape[1] + 1,
-            proxies[0].shape[1],
-            dtype=indices.ranges.dtype))
+            proxies[:, pxy_nr_base:pxy_nr_base + self.nproxy] = bball
+            pxy_nr_base += self.nproxy
+
         proxies = make_obj_array([
-            actx.freeze(actx.from_numpy(np.hstack([p[idim] for p in proxies])))
-            for idim in range(self.ambient_dim)
+            actx.freeze(actx.from_numpy(p)) for p in proxies
             ])
         centers = make_obj_array([
-            actx.freeze(centers_dev[idim])
-            for idim in range(self.ambient_dim)
+            actx.freeze(actx.from_numpy(c)) for c in centers
             ])
+        pxyindices = np.arange(0, nproxy, dtype=indices.indices.dtype)
+        pxyranges = np.arange(0, nproxy + 1, self.nproxy)
+
+        assert proxies[0].size == pxyranges[-1]
+        return BlockProxyPoints(
+                indices=make_block_index(actx, pxyindices, pxyranges),
+                points=proxies,
+                centers=centers,
+                radii=actx.freeze(radii_dev),
+                )
+
+# }}}
 
-        assert pxyranges[-1] == proxies[0].shape[0]
-        return proxies, actx.freeze(pxyranges), centers, actx.freeze(radii_dev)
 
+# {{{ gather_block_neighbor_points
 
-def gather_block_neighbor_points(actx, discr, indices, pxycenters, pxyradii,
-        max_nodes_in_box=None):
+def gather_block_neighbor_points(actx, discr, indices, pxy,
+        max_particles_in_box=None):
     """Generate a set of neighboring points for each range of points in
     *discr*. Neighboring points of a range :math:`i` are defined
     as all the points inside the proxy ball :math:`i` that do not also
@@ -361,25 +433,17 @@ def gather_block_neighbor_points(actx, discr, indices, pxycenters, pxyradii,
 
     :arg discr: a :class:`meshmode.discretization.Discretization`.
     :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
-    :arg pxycenters: an array containing the center of each proxy ball.
-    :arg pxyradii: an array containing the radius of each proxy ball.
+    :arg pxy: a :class:`BlockProxyPoints`.
 
     :return: a :class:`sumpy.tools.BlockIndexRanges`.
     """
 
-    if max_nodes_in_box is None:
+    if max_particles_in_box is None:
         # FIXME: this is a fairly arbitrary value
-        max_nodes_in_box = 32
+        max_particles_in_box = 32
 
     indices = indices.get(actx.queue)
 
-    # NOTE: this is constructed for multiple reasons:
-    #   * TreeBuilder takes object arrays
-    #   * `srcindices` can be a small subset of nodes, so this will save
-    #   some work
-    #   * `srcindices` may reorder the array returned by nodes(), so this
-    #   makes sure that we have the same order in tree.user_source_ids
-    #   and friends
     from pytential.utils import flatten_to_numpy
     sources = flatten_to_numpy(actx, discr.nodes())
     sources = make_obj_array([
@@ -390,21 +454,18 @@ def gather_block_neighbor_points(actx, discr, indices, pxycenters, pxyradii,
     from boxtree import TreeBuilder
     builder = TreeBuilder(actx.context)
     tree, _ = builder(actx.queue, sources,
-            max_particles_in_box=max_nodes_in_box)
+            max_particles_in_box=max_particles_in_box)
 
     from boxtree.area_query import AreaQueryBuilder
     builder = AreaQueryBuilder(actx.context)
-    query, _ = builder(actx.queue, tree, pxycenters, pxyradii)
+    query, _ = builder(actx.queue, tree, pxy.centers, pxy.radii)
 
     # find nodes inside each proxy ball
     tree = tree.get(actx.queue)
     query = query.get(actx.queue)
 
-    pxycenters = np.vstack([
-        actx.to_numpy(pxycenters[idim])
-        for idim in range(discr.ambient_dim)
-        ])
-    pxyradii = actx.to_numpy(pxyradii)
+    pxycenters = np.vstack([actx.to_numpy(c) for c in pxy.centers])
+    pxyradii = actx.to_numpy(pxy.radii)
 
     nbrindices = np.empty(indices.nblocks, dtype=np.object)
     for iproxy in range(indices.nblocks):
@@ -418,8 +479,7 @@ def gather_block_neighbor_points(actx, discr, indices, pxycenters, pxyradii,
         iend = istart + tree.box_source_counts_cumul[iboxes]
         isources = np.hstack([np.arange(s, e)
                               for s, e in zip(istart, iend)])
-        nodes = np.vstack([tree.sources[idim][isources]
-                           for idim in range(discr.ambient_dim)])
+        nodes = np.vstack([s[isources] for s in tree.sources])
         isources = tree.user_source_ids[isources]
 
         # get nodes inside the ball but outside the current range
@@ -431,118 +491,7 @@ def gather_block_neighbor_points(actx, discr, indices, pxycenters, pxyradii,
 
         nbrindices[iproxy] = indices.indices[isources[mask]]
 
-    nbrranges = actx.from_numpy(np.cumsum([0] + [n.shape[0] for n in nbrindices]))
-    nbrindices = actx.from_numpy(np.hstack(nbrindices))
-
-    return BlockIndexRanges(actx.context,
-            actx.freeze(nbrindices), actx.freeze(nbrranges))
-
-
-def gather_block_interaction_points(actx, places, source_dd, indices,
-        radius_factor=None,
-        approx_nproxy=None,
-        max_nodes_in_box=None):
-    """Generate sets of interaction points for each given range of indices
-    in the *source* discretization. For each input range of indices,
-    the corresponding output range of points is consists of:
-
-    - a set of proxy points (or balls) around the range, which
-      model farfield interactions. These are constructed using
-      :class:`ProxyGenerator`.
-
-    - a set of neighboring points that are inside the proxy balls, but
-      do not belong to the given range, which model nearby interactions.
-      These are constructed with :func:`gather_block_neighbor_points`.
-
-    :arg places: a :class:`~pytential.GeometryCollection`.
-    :arg source_dd: geometry in *places* for which to generate the
-        interaction points. This is a
-        :class:`~pytential.symbolic.primitives.DOFDescriptor` describing
-        the exact discretization.
-    :arg indices: a :class:`sumpy.tools.BlockIndexRanges` on the
-        discretization described by *source_dd*.
-
-    :return: a tuple ``(nodes, ranges)``, where each value is a
-        :class:`pyopencl.array.Array`. For a range :math:`i`, we can
-        get the slice using ``nodes[ranges[i]:ranges[i + 1]]``.
-    """
-
-    @memoize_in(places, "concat_proxy_and_neighbors")
-    def knl():
-        loopy_knl = lp.make_kernel([
-            "{[irange, idim]: 0 <= irange < nranges and \
-                              0 <= idim < dim}",
-            "{[ipxy, ingb]: 0 <= ipxy < npxyblock and \
-                            0 <= ingb < nngbblock}"
-            ],
-            """
-            for irange
-                <> pxystart = pxyranges[irange]
-                <> pxyend = pxyranges[irange + 1]
-                <> npxyblock = pxyend - pxystart
-
-                <> ngbstart = nbrranges[irange]
-                <> ngbend = nbrranges[irange + 1]
-                <> nngbblock = ngbend - ngbstart
-
-                <> istart = pxyranges[irange] + nbrranges[irange]
-                nodes[idim, istart + ipxy] = \
-                    proxies[idim, pxystart + ipxy] \
-                    {id_prefix=write_pxy,nosync=write_ngb}
-                nodes[idim, istart + npxyblock + ingb] = \
-                    sources[idim, nbrindices[ngbstart + ingb]] \
-                    {id_prefix=write_ngb,nosync=write_pxy}
-                ranges[irange + 1] = ranges[irange] + npxyblock + nngbblock
-            end
-            """,
-            [
-                lp.GlobalArg("sources", None,
-                    shape=(lpot_source.ambient_dim, "nsources"), dim_tags="sep,C"),
-                lp.GlobalArg("proxies", None,
-                    shape=(lpot_source.ambient_dim, "nproxies"), dim_tags="sep,C"),
-                lp.GlobalArg("nbrindices", None,
-                    shape="nnbrindices"),
-                lp.GlobalArg("nodes", None,
-                    shape=(lpot_source.ambient_dim, "nproxies + nnbrindices")),
-                lp.ValueArg("nsources", np.int),
-                lp.ValueArg("nproxies", np.int),
-                lp.ValueArg("nnbrindices", np.int),
-                "..."
-            ],
-            name="concat_proxy_and_neighbors",
-            default_offset=lp.auto,
-            silenced_warnings="write_race(write_*)",
-            fixed_parameters=dict(dim=lpot_source.ambient_dim),
-            lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
-
-        return loopy_knl
-
-    lpot_source = places.get_geometry(source_dd.geometry)
-    generator = ProxyGenerator(places,
-            radius_factor=radius_factor,
-            approx_nproxy=approx_nproxy)
-    proxies, pxyranges, pxycenters, pxyradii = \
-            generator(actx, source_dd, indices)
-
-    discr = places.get_discretization(source_dd.geometry, source_dd.discr_stage)
-    neighbors = gather_block_neighbor_points(actx, discr,
-            indices, pxycenters, pxyradii,
-            max_nodes_in_box=max_nodes_in_box)
-
-    from meshmode.dof_array import flatten, thaw
-    ranges = actx.zeros(indices.nblocks + 1, dtype=np.int)
-    _, (nodes, ranges) = knl()(actx.queue,
-            sources=flatten(thaw(actx, discr.nodes())),
-            proxies=proxies,
-            pxyranges=pxyranges,
-            nbrindices=neighbors.indices,
-            nbrranges=neighbors.ranges,
-            ranges=ranges)
-
-    return actx.freeze(nodes), actx.freeze(ranges)
+    return make_block_index(actx, nbrindices)
 
 # }}}
 
diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py
new file mode 100644
index 0000000000000000000000000000000000000000..131aca9797e1bd6235e17abe661326c3447bc738
--- /dev/null
+++ b/pytential/linalg/skeletonization.py
@@ -0,0 +1,502 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018-2020 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from contextlib import contextmanager
+
+import numpy as np
+
+from pytools.obj_array import make_obj_array
+from pytools import Record
+
+from sumpy.tools import MatrixBlockIndexRanges
+from pytential.symbolic.mappers import IdentityMapper, LocationTagger
+
+
+# {{{ helpers
+
+def interp_decomp(A, rank, eps):
+    """Wrapper for :func:`~scipy.linalg.interpolative.interp_decomp` that
+    always has the same output signature.
+
+    :return: a tuple ``(k, idx, interp)`` containing the numerical rank,
+        the column indices and the resulting interpolation matrix.
+    """
+
+    import scipy.linalg.interpolative as sli    # pylint:disable=no-name-in-module
+    if rank is None:
+        k, idx, proj = sli.interp_decomp(A, eps)
+    else:
+        idx, proj = sli.interp_decomp(A, rank)
+        k = rank
+
+    # NOTE: fix should be in scipy 1.2.0
+    # https://github.com/scipy/scipy/pull/9125
+    if k == A.shape[1]:
+        proj = np.empty((k, 0), dtype=proj.dtype)
+
+    interp = sli.reconstruct_interp_matrix(idx, proj)
+    return k, idx, interp
+
+
+def make_block_diag(blk, blkindices):
+    """Construct a block diagonal matrix from a linear representation of
+    the matrix blocks.
+    """
+
+    nblocks = blkindices.nblocks
+    diag = np.full((nblocks, nblocks), 0, dtype=np.object)
+
+    for i in range(nblocks):
+        diag[i, i] = blkindices.block_take(blk, i)
+
+    return diag
+
+
+# }}}
+
+
+# {{{ evaluation
+
+class QBXForcedLimitReplacer(IdentityMapper):
+    def __init__(self, qbx_forced_limit=None):
+        super(QBXForcedLimitReplacer, self).__init__()
+        self.qbx_forced_limit = qbx_forced_limit
+
+    def map_int_g(self, expr):
+        return expr.copy(qbx_forced_limit=self.qbx_forced_limit)
+
+
+class LocationReplacer(LocationTagger):
+    def _default_dofdesc(self, dofdesc):
+        return self.default_where
+
+    def map_int_g(self, expr):
+        return type(expr)(
+                expr.kernel,
+                self.operand_rec(expr.density),
+                expr.qbx_forced_limit,
+                self.default_source, self.default_where,
+                kernel_arguments=dict(
+                    (name, self.operand_rec(arg_expr))
+                    for name, arg_expr in expr.kernel_arguments.items()
+                    ))
+
+
+class DOFDescriptorReplacer(LocationReplacer):
+    def __init__(self, default_source, default_target):
+        super(DOFDescriptorReplacer, self).__init__(
+                default_target, default_source=default_source)
+        self.operand_rec = LocationReplacer(
+                default_source, default_source=default_source)
+
+
+class BlockEvaluationWrangler:
+    """
+    .. automethod:: evaluate_source_farfield
+    .. automethod:: evaluate_target_farfield
+    .. automethod:: evaluate_nearfield
+    """
+
+    def __init__(self, exprs, input_exprs, domains,
+            context=None,
+            weighted_farfield=None,
+            farfield_block_builder=None,
+            nearfield_block_builder=None):
+        r"""
+        :arg exprs: an :class:`~numpy.ndarray` of expressions (layer potentials)
+            that correspond to the output blocks of the matrix.
+        :arg input_exprs: a :class:`list` of densities that correspond to the
+            input blocks of the matrix.
+        :arg domains: a :class:`list` of the same length as *input_exprs*
+            defining the domain of each input.
+
+        :arg context: a :class:`dict` with additional parameters required to
+            evaluated the expressions.
+
+        :arg weighted_farfield: a :class:`tuple` containing two :class:`bool`\ s
+            which turn on the weighing proxy interactions by the corresponding
+            quadrature weights.
+        :arg farfield_block_builder:
+            a :class:`pytential.symbolic.matrix.MatrixBlockBuilderBase` that
+            is used to evaluate farfield proxy interactions.
+        :arg nearfield_block_builder:
+            a :class:`pytential.symbolic.matrix.MatrixBlockBuilderBase` that
+            is used to evaluate nearfield neighbour interactions.
+        """
+        if context is None:
+            context = {}
+
+        self.exprs = exprs
+        self.input_exprs = input_exprs
+        self.domains = domains
+        self.context = context
+
+        if weighted_farfield is None:
+            weighted_source = True
+            weighted_target = False
+        elif isinstance(weighted_farfield, bool):
+            weighted_source = weighted_target = weighted_farfield
+        elif isinstance(weighted_farfield, (list, tuple)):
+            weighted_source, weighted_target = weighted_farfield
+        else:
+            raise ValueError("unknown value for weighting: `{}`".format(
+                weighted_farfield))
+
+        self.weighted_source = weighted_source
+        self.weighted_target = weighted_target
+
+        self.nearfield_block_builder = nearfield_block_builder
+        if self.nearfield_block_builder is None:
+            from pytential.symbolic.matrix import NearFieldBlockBuilder
+            self.nearfield_block_builder = NearFieldBlockBuilder
+
+        self.farfield_block_builder = farfield_block_builder
+        if self.farfield_block_builder is None:
+            from pytential.symbolic.matrix import FarFieldBlockBuilder
+            self.farfield_block_builder = FarFieldBlockBuilder
+
+    def _prepare_farfield_expr(self, places, expr, auto_where=None):
+        from pytential.symbolic.execution import _prepare_auto_where
+        auto_where = _prepare_auto_where(auto_where, places=places)
+
+        expr = QBXForcedLimitReplacer(qbx_forced_limit=None)(expr)
+        expr = DOFDescriptorReplacer(auto_where[0], auto_where[1])(expr)
+
+        return expr
+
+    def _evaluate(self, actx, places, builder_cls,
+            expr, idomain, index_set, auto_where, **kwargs):
+        domain = self.domains[idomain]
+        dep_source = places.get_geometry(domain.geometry)
+        dep_discr = places.get_discretization(domain.geometry, domain.discr_stage)
+
+        builder = builder_cls(actx,
+                dep_expr=self.input_exprs[idomain],
+                other_dep_exprs=(
+                    self.input_exprs[:idomain]
+                    + self.input_exprs[idomain+1:]),
+                dep_source=dep_source,
+                dep_discr=dep_discr,
+                places=places,
+                index_set=index_set,
+                context=self.context,
+                **kwargs)
+
+        return builder(expr)
+
+    def evaluate_source_farfield(self,
+            actx, places, ibrow, ibcol, index_set, auto_where=None):
+        expr = self._prepare_farfield_expr(
+                places, self.exprs[ibrow], auto_where=auto_where)
+
+        return self._evaluate(actx, places,
+                self.farfield_block_builder,
+                expr, ibcol, index_set, auto_where,
+                weighted=self.weighted_source,
+                exclude_self=False)
+
+    def evaluate_target_farfield(self,
+            actx, places, ibrow, ibcol, index_set, auto_where=None):
+        expr = self._prepare_farfield_expr(
+                places, self.exprs[ibrow], auto_where=auto_where)
+
+        return self._evaluate(actx, places,
+                self.farfield_block_builder,
+                expr, ibcol, index_set, auto_where,
+                weighted=self.weighted_target,
+                exclude_self=False)
+
+    def evaluate_nearfield(self,
+            actx, places, ibrow, ibcol, index_set, auto_where=None):
+        return self._evaluate(actx, places,
+                self.nearfield_block_builder,
+                self.exprs[ibrow], ibcol, index_set, auto_where)
+
+
+def make_block_evaluation_wrangler(places, exprs, input_exprs,
+        domains=None, context=None, auto_where=None,
+        _weighted_farfield=None,
+        _farfield_block_builder=None,
+        _nearfield_block_builder=None):
+
+    from pytential.symbolic.execution import _prepare_auto_where
+    auto_where = _prepare_auto_where(auto_where, places)
+    from pytential.symbolic.execution import _prepare_expr
+    exprs = _prepare_expr(places, exprs, auto_where=auto_where)
+
+    if not (isinstance(exprs, np.ndarray) and exprs.dtype.char == "O"):
+        exprs = make_obj_array([exprs])
+
+    try:
+        input_exprs = list(input_exprs)
+    except TypeError:
+        input_exprs = [input_exprs]
+
+    from pytential.symbolic.execution import _prepare_domains
+    domains = _prepare_domains(len(input_exprs), places, domains, auto_where[0])
+
+    if context is None:
+        context = {}
+
+    return BlockEvaluationWrangler(
+            exprs=exprs,
+            input_exprs=input_exprs,
+            domains=domains,
+            context=context,
+            weighted_farfield=_weighted_farfield,
+            farfield_block_builder=_farfield_block_builder,
+            nearfield_block_builder=_nearfield_block_builder)
+
+# }}}
+
+
+# {{{ skeletonize_block_by_proxy
+
+@contextmanager
+def add_to_geometry_collection(places, proxy):
+    # NOTE: this is a giant hack to keep all the caches in `places` and
+    # just add the proxy points to it, since otherwise all the DISCRETIZATION
+    # scope stuff would be recomputed all the time
+
+    try:
+        pxyplaces = places.merge({"proxy": proxy})
+        yield pxyplaces
+    finally:
+        pass
+
+    # try:
+    #     # NOTE: this needs to match `EvaluationMapper.map_common_subexpression`
+    #     previous_cse_cache = places._get_cache("cse")
+    #     places.places["proxy"] = proxy
+    #     yield places
+    # finally:
+    #     del places.places["proxy"]
+    #     # NOTE: this is meant to make sure that proxy-related things don't
+    #     # get cached over multiple runs and lead to some explosion of some sort
+    #     places.caches["cse"] = previous_cse_cache
+
+
+def make_block_proxy_skeleton(actx, ibrow, ibcol,
+        places, proxy_generator, wrangler, indices,
+        source_or_target=None,
+        max_particles_in_box=None):
+    """Builds a block matrix that can be used to skeletonize the
+    rows (targets) or columns (sources) of the symbolic matrix block
+    described by ``(ibrow, ibcol)``.
+    """
+
+    if source_or_target == "source":
+        from pytential.target import PointsTarget as ProxyPoints
+
+        swap_arg_order = lambda x, y: (x, y)        # noqa: E731
+        evaluate_farfield = wrangler.evaluate_source_farfield
+        block_stack = np.vstack
+    elif source_or_target == "target":
+        from pytential.source import PointPotentialSource as ProxyPoints
+
+        swap_arg_order = lambda x, y: (y, x)        # noqa: E731
+        evaluate_farfield = wrangler.evaluate_target_farfield
+        block_stack = np.hstack
+    else:
+        raise ValueError(f"unknown value: '{source_or_target}'")
+
+    # {{{ generate proxies
+
+    domain = wrangler.domains[ibcol]
+    dep_discr = places.get_discretization(domain.geometry, domain.discr_stage)
+    pxy = proxy_generator(actx, domain, indices)
+
+    # }}}
+
+    # {{{ evaluate (farfield) proxy interactions
+
+    with add_to_geometry_collection(places, ProxyPoints(pxy.points)) as pxyplaces:
+        pxyindices = MatrixBlockIndexRanges(actx.context,
+                *swap_arg_order(pxy.indices, indices))
+        pxymat = evaluate_farfield(actx, pxyplaces, ibrow, ibcol, pxyindices,
+                auto_where=swap_arg_order(domain, "proxy"))
+
+    if indices.nblocks == 1:
+        # NOTE: this doesn't really happen, because this function only gets
+        # called by skeletonize_block_by_proxy, which already takes care of it
+        return make_block_diag(pxymat, pxyindices.get(actx.queue))
+
+    # }}}
+
+    # {{{ evaluate (nearfield) neighbor interactions
+
+    from pytential.linalg.proxy import gather_block_neighbor_points
+    nbrindices = gather_block_neighbor_points(
+            actx, dep_discr, indices, pxy,
+            max_particles_in_box=max_particles_in_box)
+
+    nbrindices = MatrixBlockIndexRanges(actx.context,
+            *swap_arg_order(nbrindices, indices))
+    nbrmat = wrangler.evaluate_nearfield(actx, places, ibrow, ibcol, nbrindices)
+
+    # }}}
+
+    # {{{ concatenate everything to get the blocks ready for ID-ing
+
+    pxyindices = pxyindices.get(actx.queue)
+    nbrindices = nbrindices.get(actx.queue)
+
+    pxyblk = np.full((indices.nblocks, indices.nblocks), 0, dtype=np.object)
+    for i in range(indices.nblocks):
+        pxyblk[i, i] = block_stack(swap_arg_order(
+            pxyindices.block_take(pxymat, i),
+            nbrindices.block_take(nbrmat, i),
+            ))
+    # }}}
+
+    return pxyblk
+
+
+def _skeletonize_block_by_proxy_with_mats(actx,
+        ibcol, ibrow, places, proxy_generator, wrangler, blkindices,
+        id_eps=None, id_rank=None,
+        max_particles_in_box=None):
+    L = np.full((blkindices.nblocks, blkindices.nblocks), 0, dtype=np.object)
+    R = np.full((blkindices.nblocks, blkindices.nblocks), 0, dtype=np.object)
+
+    if blkindices.nblocks == 1:
+        L[0, 0] = np.eye(blkindices.row.indices.size)
+        R[0, 0] = np.eye(blkindices.col.indices.size)
+
+        return L, R, blkindices
+
+    # construct proxy matrices to skeletonize
+    src_mat = make_block_proxy_skeleton(actx, ibrow, ibcol,
+            places, proxy_generator, wrangler, blkindices.col,
+            source_or_target="source",
+            max_particles_in_box=max_particles_in_box)
+    tgt_mat = make_block_proxy_skeleton(actx, ibrow, ibcol,
+            places, proxy_generator, wrangler, blkindices.row,
+            source_or_target="target",
+            max_particles_in_box=max_particles_in_box)
+
+    src_skl_indices = np.empty(blkindices.nblocks, dtype=np.object)
+    tgt_skl_indices = np.empty(blkindices.nblocks, dtype=np.object)
+    skl_ranges = np.zeros(blkindices.nblocks + 1, dtype=np.int)
+
+    src_indices = blkindices.col.get(actx.queue)
+    tgt_indices = blkindices.row.get(actx.queue)
+
+    for i in range(blkindices.nblocks):
+        k = id_rank
+
+        # skeletonize target points
+        assert not np.any(np.isinf(tgt_mat[i, i])), \
+                np.where(np.isinf(tgt_mat[i, i]))
+        assert not np.any(np.isnan(tgt_mat[i, i])), \
+                np.where(np.isnan(tgt_mat[i, i]))
+
+        k, idx, interp = interp_decomp(tgt_mat[i, i].T, k, id_eps)
+        assert k > 0
+
+        L[i, i] = interp.T
+        tgt_skl_indices[i] = tgt_indices.block_indices(i)[idx[:k]]
+
+        # skeletonize source points
+        assert not np.any(np.isinf(src_mat[i, i])), \
+                np.where(np.isinf(src_mat[i, i]))
+        assert not np.any(np.isnan(src_mat[i, i])), \
+                np.where(np.isnan(src_mat[i, i]))
+
+        k, idx, interp = interp_decomp(src_mat[i, i], k, id_eps)
+        assert k > 0
+
+        R[i, i] = interp
+        src_skl_indices[i] = src_indices.block_indices(i)[idx[:k]]
+
+        skl_ranges[i + 1] = skl_ranges[i] + k
+        assert R[i, i].shape == (k, src_mat[i, i].shape[1])
+        assert L[i, i].shape == (tgt_mat[i, i].shape[0], k)
+
+    from pytential.linalg.proxy import make_block_index
+    src_skl_indices = make_block_index(actx, np.hstack(src_skl_indices), skl_ranges)
+    tgt_skl_indices = make_block_index(actx, np.hstack(tgt_skl_indices), skl_ranges)
+    skl_indices = MatrixBlockIndexRanges(actx.context,
+            tgt_skl_indices, src_skl_indices)
+
+    return L, R, skl_indices, src_mat, tgt_mat
+
+
+def skeletonize_block_by_proxy(actx,
+        ibcol, ibrow, places, proxy_generator, wrangler, blkindices,
+        id_eps=None, id_rank=None,
+        max_particles_in_box=None):
+    L, R, sklindices, _, _ = _skeletonize_block_by_proxy_with_mats(actx,
+            ibcol, ibrow, places, proxy_generator, wrangler, blkindices,
+            id_eps=id_eps,
+            id_rank=id_rank,
+            max_particles_in_box=max_particles_in_box)
+
+    return L, R, sklindices
+
+# }}}
+
+
+# {{{ skeletonize_by_proxy
+
+class SkeletonizedBlock(Record):
+    """
+    .. attribute:: L
+    .. attribute:: R
+    .. attribute:: sklindices
+    """
+
+
+def skeletonize_by_proxy(actx, places, proxy_generator, wrangler, blkindices,
+        id_eps=None, id_rank=None, max_particles_in_box=None):
+    r"""
+    :arg places: a :class:`~meshmode.array_context.ArrayContext`.
+    :arg proxy_generator: a :class:`~pytential.linalg.proxy.ProxyGenerator`.
+    :arg wrangler: a :class:`BlockEvaluationWrangler`.
+    :arg blkindices: a :class:`~sumpy.tools.MatrixBlockIndexRanges`.
+
+    :returns: a tuple ``(L, R, sklindices)`` encoding the block-by-block
+        decompression of the matrix represented *wrangler*. :math:`L` and
+        :math:`R` are :math:`n \times n` diagonal block matrix, where
+        :math:`n` is :attr:`~sumpy.tools.MatrixBlockIndexRanges.nblocks``. The
+        ``sklindices`` array contains the remaining (skeleton) nodes from
+        ``blkindices`` after compression.
+    """
+
+    nrows = len(wrangler.exprs)
+    ncols = len(wrangler.input_exprs)
+    skel = np.empty((nrows, ncols), dtype=np.object)
+
+    for ibrow in range(nrows):
+        for ibcol in range(ncols):
+            L, R, sklindices = skeletonize_block_by_proxy(actx,
+                    ibrow, ibcol, places, proxy_generator, wrangler, blkindices,
+                    id_eps=id_eps, id_rank=id_rank,
+                    max_particles_in_box=max_particles_in_box)
+
+            skel[ibrow, ibcol] = SkeletonizedBlock(L=L, R=R, sklindices=sklindices)
+
+    return skel
+
+# }}}
diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py
index 3a89914583f66b9fdc0e41dfa76556dc9f64cbbd..db2c59874ef4d0cb210e83c8d83d51bde71ac967 100644
--- a/pytential/symbolic/matrix.py
+++ b/pytential/symbolic/matrix.py
@@ -411,11 +411,13 @@ class MatrixBuilder(MatrixBuilderBase):
 
 class P2PMatrixBuilder(MatrixBuilderBase):
     def __init__(self, actx, dep_expr, other_dep_exprs,
-            dep_source, dep_discr, places, context, exclude_self=True):
+            dep_source, dep_discr, places, context,
+            weighted=False, exclude_self=False):
         super().__init__(actx,
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, context)
 
+        self.weighted = weighted
         self.exclude_self = exclude_self
 
     def map_int_g(self, expr):
@@ -456,8 +458,17 @@ class P2PMatrixBuilder(MatrixBuilderBase):
                 targets=flatten(thaw(actx, target_discr.nodes())),
                 sources=flatten(thaw(actx, source_discr.nodes())),
                 **kernel_args)
+        mat = actx.to_numpy(mat)
+
+        if self.weighted:
+            from pytential import bind, sym
+            waa = bind(self.places, sym.weights_and_area_elements(
+                source_discr.ambient_dim,
+                dofdesc=expr.source))(self.array_context)
 
-        return actx.to_numpy(mat).dot(rec_density)
+            mat[:, :] *= actx.to_numpy(flatten(waa))
+
+        return mat.dot(rec_density)
 
 # }}}
 
@@ -527,18 +538,22 @@ class NearFieldBlockBuilder(MatrixBlockBuilderBase):
         waa = bind(self.places, sym.weights_and_area_elements(
             source_discr.ambient_dim,
             dofdesc=expr.source))(actx)
-        waa = flatten(waa)
 
+        waa = flatten(waa)
         mat *= waa[self.index_set.linear_col_indices]
-        return rec_density * actx.to_numpy(mat)
+
+        return actx.to_numpy(mat) * rec_density
 
 
 class FarFieldBlockBuilder(MatrixBlockBuilderBase):
     def __init__(self, queue, dep_expr, other_dep_exprs, dep_source, dep_discr,
-            places, index_set, context, exclude_self=False):
+            places, index_set, context,
+            weighted=False, exclude_self=False):
         super().__init__(queue,
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, index_set, context)
+
+        self.weighted = weighted
         self.exclude_self = exclude_self
 
     def get_dep_variable(self):
@@ -554,9 +569,6 @@ class FarFieldBlockBuilder(MatrixBlockBuilderBase):
         target_discr = self.places.get_discretization(
                 expr.target.geometry, expr.target.discr_stage)
 
-        if source_discr is not target_discr:
-            raise NotImplementedError
-
         rec_density = self._blk_mapper.rec(expr.density)
         if is_zero(rec_density):
             return 0
@@ -583,14 +595,24 @@ class FarFieldBlockBuilder(MatrixBlockBuilderBase):
         mat_gen = P2PMatrixBlockGenerator(actx.context, (kernel,),
                 exclude_self=self.exclude_self)
 
-        from meshmode.dof_array import flatten, thaw
+        from pytential.utils import flatten_if_needed
         _, (mat,) = mat_gen(actx.queue,
-                targets=flatten(thaw(actx, target_discr.nodes())),
-                sources=flatten(thaw(actx, source_discr.nodes())),
+                targets=flatten_if_needed(actx, target_discr.nodes()),
+                sources=flatten_if_needed(actx, source_discr.nodes()),
                 index_set=self.index_set,
                 **kernel_args)
 
-        return rec_density * actx.to_numpy(mat)
+        if self.weighted:
+            from pytential import bind, sym
+            waa = bind(self.places, sym.weights_and_area_elements(
+                source_discr.ambient_dim,
+                dofdesc=expr.source))(actx)
+
+            from meshmode.dof_array import flatten
+            waa = flatten(waa)
+            mat *= waa[self.index_set.linear_col_indices]
+
+        return actx.to_numpy(mat) * rec_density
 
 # }}}
 
diff --git a/test/extra_matrix_data.py b/test/extra_matrix_data.py
index 7401d13860e81229e735b7e571ff95f9ce5b6fa2..6db88d2dbc5674720bbfba77bbda8bc9538dacb3 100644
--- a/test/extra_matrix_data.py
+++ b/test/extra_matrix_data.py
@@ -21,23 +21,106 @@ def max_block_error(mat, blk, index_set, p=None):
 
     return error
 
+
+def skeletonization_error(mat, skel, srcindices, p=None):
+    r"""Matrix errors are computed as follows:
+
+        .. math::
+
+            \begin{aligned}
+            \epsilon_{l, ij} = \|A_{ij} - L_{ii} S_{ij}\|_p, \\
+            \epsilon_{r, ij} = \|A_{ij} - S_{ij} R_{jj}\|_p, \\
+            \epsilon_f = \|A - L S R\|_p.
+            \end{aligned}
+
+    :arg mat: dense matrix.
+    :arg skel: a :class:`~pytential.linalg.skeletonization.SkeletonizedBlock`.
+    :arg p: norm type (follows :func:`~numpy.linalg.norm` rules for matrices).
+
+    :returns: a tuple ``(err_l, err_r, err_f)`` of the left, right and full
+        matrix errors.
+    """
+    from itertools import product
+
+    L = skel.L
+    R = skel.R
+    sklindices = skel.sklindices
+
+    def mnorm(x, y):
+        return la.norm(x - y, ord=p) / la.norm(x, ord=p)
+
+    # build block matrices
+    nblocks = srcindices.nblocks
+    S = np.full((nblocks, nblocks), 0, dtype=np.object)
+    A = np.full((nblocks, nblocks), 0, dtype=np.object)
+
+    def block_indices(blk, i):
+        return blk.indices[blk.ranges[i]:blk.ranges[i + 1]]
+
+    # compute max block error
+    err_l = np.zeros((nblocks, nblocks))
+    err_r = np.zeros((nblocks, nblocks))
+    for i, j in product(range(nblocks), repeat=2):
+        if i == j:
+            continue
+
+        # full matrix indices
+        f_tgt = block_indices(srcindices.row, i)
+        f_src = block_indices(srcindices.col, j)
+        # skeleton matrix indices
+        s_tgt = block_indices(sklindices.row, i)
+        s_src = block_indices(sklindices.col, j)
+
+        S[i, j] = mat[np.ix_(s_tgt, s_src)]
+        A[i, j] = mat[np.ix_(f_tgt, f_src)]
+
+        blk = mat[np.ix_(s_tgt, f_src)]
+        err_l[i, j] = mnorm(A[i, j], L[i, i].dot(blk))
+
+        blk = mat[np.ix_(f_tgt, s_src)]
+        err_r[i, j] = mnorm(A[i, j], blk.dot(R[j, j]))
+
+    # compute full matrix error
+    from pytential.symbolic.execution import _bmat
+    A = _bmat(A, dtypes=[mat.dtype])
+    L = _bmat(L, dtypes=[mat.dtype])
+    S = _bmat(S, dtypes=[mat.dtype])
+    R = _bmat(R, dtypes=[mat.dtype])
+
+    assert L.shape == (A.shape[0], S.shape[0])
+    assert R.shape == (S.shape[1], A.shape[1])
+    err_f = mnorm(A, L.dot(S.dot(R)))
+
+    return err_l, err_r, err_f
+
 # }}}
 
 
 # {{{ MatrixTestCase
 
 class MatrixTestCaseMixin:
+    # operators
+    op_type = "scalar"
+    # disable fmm for matrix tests
+    fmm_backend = None
+
     # partitioning
     approx_block_count = 10
     max_particles_in_box = None
     tree_kind = "adaptive-level-restricted"
     index_sparsity_factor = 1.0
 
-    # operators
-    op_type = "scalar"
+    # proxy
+    proxy_radius_factor = None
+    proxy_approx_count = None
 
-    # disable fmm for matrix tests
-    fmm_backend = None
+    # skeletonization
+    id_eps = 1.0e-8
+    skel_discr_stage = sym.QBX_SOURCE_STAGE2
+
+    weighted_farfield = None
+    farfield_block_builder = None
+    nearfield_block_builder = None
 
     def get_block_indices(self, actx, discr, matrix_indices=True):
         max_particles_in_box = self.max_particles_in_box
@@ -47,7 +130,7 @@ class MatrixTestCaseMixin:
         from pytential.linalg.proxy import partition_by_nodes
         indices = partition_by_nodes(actx, discr,
                 tree_kind=self.tree_kind,
-                max_nodes_in_box=max_particles_in_box)
+                max_particles_in_box=max_particles_in_box)
 
         if abs(self.index_sparsity_factor - 1.0) < 1.0e-14:
             if not matrix_indices:
diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py
index 3c6f0b29ab71a6874534cbfe57362c1bdcfc4f67..34ed458e9f217a4a8828999564c4dfc49f45cb51 100644
--- a/test/test_linalg_proxy.py
+++ b/test/test_linalg_proxy.py
@@ -29,9 +29,10 @@ import pyopencl as cl
 
 from pytential import bind, sym
 from pytential import GeometryCollection
+from pytential.utils import flatten_to_numpy
 
 from meshmode.array_context import PyOpenCLArrayContext
-from meshmode.mesh.generation import ellipse
+from meshmode.mesh.generation import ellipse, NArmedStarfish
 
 import pytest
 from pyopencl.tools import (  # noqa
@@ -44,6 +45,8 @@ logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 
+# {{{ plot_partition_indices
+
 def plot_partition_indices(actx, discr, indices, **kwargs):
     try:
         import matplotlib.pyplot as pt
@@ -52,7 +55,7 @@ def plot_partition_indices(actx, discr, indices, **kwargs):
 
     indices = indices.get(actx.queue)
     args = [
-        kwargs.get("tree_kind", "linear").replace("-", "_"),
+        str(kwargs.get("tree_kind", "linear")).replace("-", "_"),
         kwargs.get("discr_stage", "stage1"),
         discr.ambient_dim
         ]
@@ -62,7 +65,6 @@ def plot_partition_indices(actx, discr, indices, **kwargs):
     pt.savefig("test_partition_{1}_{3}d_ranges_{2}.png".format(*args))
     pt.clf()
 
-    from pytential.utils import flatten_to_numpy
     if discr.ambient_dim == 2:
         sources = flatten_to_numpy(actx, discr.nodes())
 
@@ -96,18 +98,27 @@ def plot_partition_indices(actx, discr, indices, **kwargs):
             ("marker", marker)
             ])
 
+# }}}
+
 
 PROXY_TEST_CASES = [
         extra.CurveTestCase(
             name="ellipse",
             target_order=7,
             curve_fn=partial(ellipse, 3.0)),
+        extra.CurveTestCase(
+            name="starfish",
+            target_order=4,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[32]),
         extra.TorusTestCase(
             target_order=2,
             resolutions=[0])
         ]
 
 
+# {{{ test_partition_points
+
 @pytest.mark.skip(reason="only useful with visualize=True")
 @pytest.mark.parametrize("tree_kind", ["adaptive", None])
 @pytest.mark.parametrize("case", PROXY_TEST_CASES)
@@ -134,10 +145,16 @@ def test_partition_points(ctx_factory, tree_kind, case, visualize=False):
 
     # }}}
 
+# }}}
+
+
+# {{{ test_proxy_generator
 
 @pytest.mark.parametrize("case", PROXY_TEST_CASES)
 @pytest.mark.parametrize("index_sparsity_factor", [1.0, 0.6])
-def test_proxy_generator(ctx_factory, case, index_sparsity_factor, visualize=False):
+@pytest.mark.parametrize("proxy_radius_factor", [0.0, 1.1])
+def test_proxy_generator(ctx_factory, case,
+        index_sparsity_factor, proxy_radius_factor, visualize=False):
     """Tests that the proxies generated are all at the correct radius from the
     points in the cluster, etc.
     """
@@ -146,10 +163,12 @@ def test_proxy_generator(ctx_factory, case, index_sparsity_factor, visualize=Fal
     queue = cl.CommandQueue(ctx)
     actx = PyOpenCLArrayContext(queue)
 
-    case = case.copy(index_sparsity_factor=index_sparsity_factor)
+    case = case.copy(
+            index_sparsity_factor=index_sparsity_factor,
+            proxy_radius_factor=proxy_radius_factor)
     logger.info("\n%s", case)
 
-    # {{{ generate proxies
+    # {{{ check proxies
 
     qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
     places = GeometryCollection(qbx, auto_where=case.name)
@@ -158,101 +177,119 @@ def test_proxy_generator(ctx_factory, case, index_sparsity_factor, visualize=Fal
     srcindices = case.get_block_indices(actx, density_discr, matrix_indices=False)
 
     from pytential.linalg.proxy import ProxyGenerator
-    generator = ProxyGenerator(places)
-    proxies, pxyranges, pxycenters, pxyradii = \
-            generator(actx, places.auto_source, srcindices)
+    proxies = ProxyGenerator(places,
+            radius_factor=case.proxy_radius_factor,
+            approx_nproxy=case.proxy_approx_count)(
+                    actx, places.auto_source, srcindices)
 
-    proxies = np.vstack([actx.to_numpy(p) for p in proxies])
-    pxyranges = actx.to_numpy(pxyranges)
-    pxycenters = np.vstack([actx.to_numpy(c) for c in pxycenters])
-    pxyradii = actx.to_numpy(pxyradii)
+    proxies = proxies.get(queue)
+    pxypoints = np.vstack(proxies.points)
+    pxycenters = np.vstack(proxies.centers)
+
+    srcindices = srcindices.get(queue)
+    nodes = np.vstack(flatten_to_numpy(actx, density_discr.nodes()))
 
     for i in range(srcindices.nblocks):
-        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+        isrc = srcindices.block_indices(i)
+        ipxy = proxies.indices.block_indices(i)
 
-        r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
-        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)
+        r = la.norm(pxypoints[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        p_error = la.norm(r - proxies.radii[i])
+
+        r = la.norm(nodes[:, isrc] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        n_error = la.norm(r - proxies.radii[i], np.inf)
+
+        assert p_error < 1.0e-14, f"block {i}"
+        assert n_error - proxies.radii[i] < 1.0e-14, f"block {i}"
 
     # }}}
 
     # {{{ visualization
 
-    srcindices = srcindices.get(queue)
-    if visualize:
-        ambient_dim = places.ambient_dim
-        if ambient_dim == 2:
+    if not visualize:
+        return
+
+    ambient_dim = places.ambient_dim
+    if ambient_dim == 2:
+        try:
             import matplotlib.pyplot as pt
+        except ImportError:
+            return
+
+        ci = np.vstack(flatten_to_numpy(actx,
+            bind(places, sym.expansion_centers(ambient_dim, -1))(actx)
+            ))
+        ce = np.vstack(flatten_to_numpy(actx,
+            bind(places, sym.expansion_centers(ambient_dim, +1))(actx)
+            ))
+        r = flatten_to_numpy(actx,
+                bind(places, sym.expansion_radii(ambient_dim))(actx)
+                )
+
+        fig = pt.figure(figsize=(10, 8))
+        for i in range(srcindices.nblocks):
+            isrc = srcindices.block_indices(i)
+            ipxy = proxies.indices.block_indices(i)
+
+            ax = pt.gca()
+            for j in isrc:
+                c = pt.Circle(ci[:, j], r[j], color="k", alpha=0.1)
+                ax.add_artist(c)
+                c = pt.Circle(ce[:, j], r[j], color="k", alpha=0.1)
+                ax.add_artist(c)
+
+            ax.plot(nodes[0], nodes[1], "ko", ms=2.0, alpha=0.5)
+            ax.plot(nodes[0, srcindices.indices], nodes[1, srcindices.indices],
+                    "o", ms=2.0)
+            ax.plot(nodes[0, isrc], nodes[1, isrc], "o", ms=1.0)
+            ax.plot(pxypoints[0, ipxy], pxypoints[1, ipxy], "o", ms=1.0)
+            ax.plot(pxycenters[0, i], pxycenters[1, i], "ko", ms=2.0)
+            ax.set_aspect("equal")
+            ax.set_xlim([-1.5, 1.5])
+            ax.set_ylim([-1.5, 1.5])
+
+            filename = f"test_proxy_generator_{ambient_dim}d_{i:04}"
+            fig.savefig(filename, dpi=300)
+            fig.clf()
+        pt.close(fig)
+    else:
+        from meshmode.discretization.visualization import make_visualizer
+        from meshmode.mesh.processing import ( # noqa
+                affine_map, merge_disjoint_meshes)
+        from meshmode.discretization import Discretization
+        from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+
+        from meshmode.mesh.generation import generate_icosphere
+        ref_mesh = generate_icosphere(1, proxies.nproxy)
 
-            from pytential.utils import flatten_to_numpy
-            density_nodes = np.vstack(flatten_to_numpy(actx, density_discr.nodes()))
-            ci = bind(places, sym.expansion_centers(ambient_dim, -1))(actx)
-            ci = np.vstack(flatten_to_numpy(actx, ci))
-            ce = bind(places, sym.expansion_centers(ambient_dim, +1))(actx)
-            ce = np.vstack(flatten_to_numpy(actx, ce))
-            r = bind(places, sym.expansion_radii(ambient_dim))(actx)
-            r = flatten_to_numpy(actx, r)
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
-
-                pt.figure(figsize=(10, 8))
-                axis = pt.gca()
-                for j in isrc:
-                    c = pt.Circle(ci[:, j], r[j], color="k", alpha=0.1)
-                    axis.add_artist(c)
-                    c = pt.Circle(ce[:, j], r[j], color="k", alpha=0.1)
-                    axis.add_artist(c)
-
-                pt.plot(density_nodes[0], density_nodes[1],
-                        "ko", ms=2.0, alpha=0.5)
-                pt.plot(density_nodes[0, srcindices.indices],
-                        density_nodes[1, srcindices.indices],
-                        "o", ms=2.0)
-                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
-                        "o", ms=2.0)
-                pt.plot(proxies[0, ipxy], proxies[1, ipxy],
-                        "o", ms=2.0)
-                pt.xlim([-1.5, 1.5])
-                pt.ylim([-1.5, 1.5])
-
-                filename = "test_proxy_generator_{}d_{:04}.png".format(
-                        ambient_dim, i)
-                pt.savefig(filename, dpi=300)
-                pt.clf()
-        else:
-            from meshmode.discretization.visualization import make_visualizer
-            from meshmode.mesh.processing import ( # noqa
-                    affine_map, merge_disjoint_meshes)
-            from meshmode.discretization import Discretization
-            from meshmode.discretization.poly_element import \
-                InterpolatoryQuadratureSimplexGroupFactory
-
-            from meshmode.mesh.generation import generate_icosphere
-            ref_mesh = generate_icosphere(1, generator.nproxy)
-
-            # NOTE: this does not plot the actual proxy points
-            for i in range(srcindices.nblocks):
-                mesh = affine_map(ref_mesh,
-                    A=(pxyradii[i] * np.eye(ambient_dim)),
-                    b=pxycenters[:, i].reshape(-1))
-
-                mesh = merge_disjoint_meshes([mesh, density_discr.mesh])
-                discr = Discretization(actx, mesh,
-                    InterpolatoryQuadratureSimplexGroupFactory(10))
-
-                vis = make_visualizer(actx, discr, 10)
-                filename = "test_proxy_generator_{}d_{:04}.vtu".format(
-                        ambient_dim, i)
-                vis.write_vtk_file(filename, [])
+        # NOTE: this does not plot the actual proxy points
+        for i in range(srcindices.nblocks):
+            mesh = affine_map(ref_mesh,
+                A=(proxies.radii[i] * np.eye(ambient_dim)),
+                b=pxycenters[:, i].reshape(-1))
+
+            mesh = merge_disjoint_meshes([mesh, density_discr.mesh])
+            discr = Discretization(actx, mesh,
+                InterpolatoryQuadratureSimplexGroupFactory(10))
+
+            vis = make_visualizer(actx, discr, 10)
+
+            filename = f"test_proxy_generator_{ambient_dim}d_{i:04}.vtu"
+            vis.write_vtk_file(filename, [])
 
     # }}}
 
+# }}}
+
+
+# {{{ test_neighbor_points
 
 @pytest.mark.parametrize("case", PROXY_TEST_CASES)
 @pytest.mark.parametrize("index_sparsity_factor", [1.0, 0.6])
-def test_interaction_points(ctx_factory,
-        case, index_sparsity_factor, visualize=False):
+@pytest.mark.parametrize("proxy_radius_factor", [0.0, 1.1])
+def test_neighbor_points(ctx_factory, case,
+        index_sparsity_factor, proxy_radius_factor, visualize=False):
     """Test that neighboring points (inside the proxy balls, but outside the
     current block/cluster) are actually inside.
     """
@@ -261,7 +298,9 @@ def test_interaction_points(ctx_factory,
     queue = cl.CommandQueue(ctx)
     actx = PyOpenCLArrayContext(queue)
 
-    case = case.copy(index_sparsity_factor=index_sparsity_factor)
+    case = case.copy(
+            index_sparsity_factor=index_sparsity_factor,
+            proxy_radius_factor=proxy_radius_factor)
     logger.info("\n%s", case)
 
     # {{{ check neighboring points
@@ -274,86 +313,272 @@ def test_interaction_points(ctx_factory,
 
     # generate proxy points
     from pytential.linalg.proxy import ProxyGenerator
-    generator = ProxyGenerator(places)
-    _, _, pxycenters, pxyradii = generator(actx, places.auto_source, srcindices)
+    proxies = ProxyGenerator(places,
+            radius_factor=case.proxy_radius_factor,
+            approx_nproxy=case.proxy_approx_count)(
+                    actx, places.auto_source, srcindices)
 
     # get neighboring points
-    from pytential.linalg.proxy import (  # noqa
-            gather_block_neighbor_points,
-            gather_block_interaction_points)
-    nbrindices = gather_block_neighbor_points(actx, density_discr,
-            srcindices, pxycenters, pxyradii)
-    nodes, ranges = gather_block_interaction_points(actx,
-            places, places.auto_source, srcindices)
+    from pytential.linalg.proxy import gather_block_neighbor_points
+    nbrindices = gather_block_neighbor_points(actx,
+            density_discr, srcindices, proxies)
 
     srcindices = srcindices.get(queue)
     nbrindices = nbrindices.get(queue)
 
+    proxies = proxies.get(queue)
+    pxycenters = np.vstack(proxies.centers)
+    nodes = np.vstack(flatten_to_numpy(actx, density_discr.nodes()))
+
     for i in range(srcindices.nblocks):
         isrc = srcindices.block_indices(i)
         inbr = nbrindices.block_indices(i)
 
         assert not np.any(np.isin(inbr, isrc))
 
+        r = la.norm(nodes[:, inbr] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        assert np.all(r - proxies.radii[i] < 0.0)
+
     # }}}
 
     # {{{ visualize
 
-    from pytential.utils import flatten_to_numpy
-    if visualize:
-        ambient_dim = places.ambient_dim
-        if ambient_dim == 2:
+    if not visualize:
+        return
+
+    ambient_dim = places.ambient_dim
+    if ambient_dim == 2:
+        try:
             import matplotlib.pyplot as pt
-            density_nodes = flatten_to_numpy(actx, density_discr.nodes())
-            nodes = flatten_to_numpy(actx, nodes)
-            ranges = actx.to_numpy(ranges)
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                inbr = nbrindices.block_indices(i)
-                iall = np.s_[ranges[i]:ranges[i + 1]]
-
-                pt.figure(figsize=(10, 8))
-                pt.plot(density_nodes[0], density_nodes[1],
-                        "ko", ms=2.0, alpha=0.5)
-                pt.plot(density_nodes[0][srcindices.indices],
-                        density_nodes[1][srcindices.indices],
-                        "o", ms=2.0)
-                pt.plot(density_nodes[0][isrc], density_nodes[1][isrc],
-                        "o", ms=2.0)
-                pt.plot(density_nodes[0][inbr], density_nodes[1][inbr],
-                        "o", ms=2.0)
-                pt.plot(nodes[0][iall], nodes[1][iall],
-                        "x", ms=2.0)
-                pt.xlim([-1.5, 1.5])
-                pt.ylim([-1.5, 1.5])
-
-                filename = f"test_area_query_{ambient_dim}d_{i:04}.png"
-                pt.savefig(filename, dpi=300)
-                pt.clf()
-        elif ambient_dim == 3:
-            from meshmode.discretization.visualization import make_visualizer
-            marker = np.empty(density_discr.ndofs)
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                inbr = nbrindices.block_indices(i)
-
-                marker.fill(0.0)
-                marker[srcindices.indices] = 0.0
-                marker[isrc] = -42.0
-                marker[inbr] = +42.0
-
-                from meshmode.dof_array import unflatten
-                marker_dev = unflatten(actx, density_discr, actx.from_numpy(marker))
-
-                vis = make_visualizer(actx, density_discr, 10)
-                filename = f"test_area_query_{ambient_dim}d_{i:04}.vtu"
-                vis.write_vtk_file(filename, [
-                    ("marker", marker_dev),
-                    ])
+        except ImportError:
+            return
+
+        pxycenters = np.vstack(proxies.centers)
+        pxyradii = proxies.radii
+
+        fig = pt.figure(figsize=(10, 10))
+        for i in range(srcindices.nblocks):
+            isrc = srcindices.block_indices(i)
+            inbr = nbrindices.block_indices(i)
+
+            ax = fig.gca()
+
+            ax.plot(nodes[0], nodes[1], "ko", ms=2.0, alpha=0.5)
+            ax.plot(nodes[0, srcindices.indices], nodes[1, srcindices.indices],
+                    "o", ms=2.0)
+            ax.plot(nodes[0, isrc], nodes[1, isrc], "o", ms=1.0)
+            ax.plot(nodes[0, inbr], nodes[1, inbr], "o", ms=1.0)
+            ax.plot(pxycenters[0, i], pxycenters[1, i], "ko", ms=2.0)
+            ax.axis("equal")
+
+            c = pt.Circle(pxycenters[:, i], pxyradii[i], color="k", alpha=0.1)
+            ax.add_artist(c)
+
+            pt.xlim([-1.5, 1.5])
+            pt.ylim([-1.5, 1.5])
+
+            filename = f"test_area_query_{ambient_dim}d_{i:04}"
+            fig.savefig(filename, dpi=300)
+            fig.clf()
+        pt.close(fig)
+    elif ambient_dim == 3:
+        from meshmode.discretization.visualization import make_visualizer
+        marker = np.empty(density_discr.ndofs)
+
+        for i in range(srcindices.nblocks):
+            isrc = srcindices.block_indices(i)
+            inbr = nbrindices.block_indices(i)
+
+            marker.fill(0.0)
+            marker[srcindices.indices] = 0.0
+            marker[isrc] = -42.0
+            marker[inbr] = +42.0
+
+            from meshmode.dof_array import unflatten
+            marker_dev = unflatten(actx, density_discr, actx.from_numpy(marker))
+
+            vis = make_visualizer(actx, density_discr, 10)
+            filename = "test_area_query_{}d_{:04}.vtu".format(ambient_dim, i)
+            vis.write_vtk_file(filename, [
+                ("marker", marker_dev),
+                ])
+
     # }}}
 
+# }}}
+
+
+# {{{ test_skeletonize_by_proxy
+
+@pytest.mark.parametrize("case", PROXY_TEST_CASES)
+def test_skeletonize_by_proxy(ctx_factory, case, visualize=False):
+    """Test single-level level skeletonization accuracy."""
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
+
+    case = case.copy(approx_block_count=6, op_type="scalar", id_eps=1.0e-8)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[0], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    srcindices = case.get_block_indices(actx, density_discr)
+
+    logger.info("nblocks %3d ndofs %7d", srcindices.nblocks, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ wranglers
+
+    from pytential.linalg.proxy import ProxyGenerator
+    from pytential.linalg.skeletonization import make_block_evaluation_wrangler
+    proxy_generator = ProxyGenerator(places,
+            radius_factor=case.proxy_radius_factor,
+            approx_nproxy=case.proxy_approx_count)
+
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+    wrangler = make_block_evaluation_wrangler(places, sym_op, sym_u,
+            domains=None,
+            context=case.knl_concrete_kwargs,
+            _weighted_farfield=case.weighted_farfield,
+            _farfield_block_builder=case.farfield_block_builder,
+            _nearfield_block_builder=case.nearfield_block_builder)
+
+    # }}}
+
+    # {{{ check proxy id decomposition
+
+    # dense matrix
+    from pytential.symbolic.execution import build_matrix
+    mat = actx.to_numpy(
+            build_matrix(actx, places, sym_op, sym_u,
+                context=case.knl_concrete_kwargs)
+            )
+
+    # skeleton
+    from pytential.linalg.skeletonization import \
+            _skeletonize_block_by_proxy_with_mats
+    L, R, sklindices, src, tgt = _skeletonize_block_by_proxy_with_mats(actx,
+            0, 0, places, proxy_generator, wrangler, srcindices,
+            id_eps=case.id_eps,
+            max_particles_in_box=case.max_particles_in_box)
+
+    srcindices = srcindices.get(queue)
+    sklindices = sklindices.get(queue)
+    for i in range(sklindices.nblocks):
+        # targets (rows)
+        bi = np.searchsorted(
+            srcindices.row.block_indices(i),
+            sklindices.row.block_indices(i),
+            )
+
+        A = tgt[i, i]
+        S = A[bi, :]
+        tgt_error = la.norm(A - L[i, i] @ S) / la.norm(A)
+
+        # sources (columns)
+        bj = np.searchsorted(
+            srcindices.col.block_indices(i),
+            sklindices.col.block_indices(i),
+            )
+
+        A = src[i, i]
+        S = A[:, bj]
+        src_error = la.norm(A - S @ R[i, i]) / la.norm(A)
+
+        logger.info("[%04d] id_eps %.5e src %.5e tgt %.5e rank %d/%d",
+                i, case.id_eps,
+                src_error, tgt_error, R[i, i].shape[0], R[i, i].shape[1])
+
+        assert src_error < 6 * case.id_eps
+        assert tgt_error < 6 * case.id_eps
+
+    # }}}
+
+    # {{{ check skeletonize
+
+    from pytential.linalg.skeletonization import SkeletonizedBlock
+    skeleton = SkeletonizedBlock(L=L, R=R, sklindices=sklindices)
+
+    blk_err_l, blk_err_r, err_f = extra.skeletonization_error(
+            mat, skeleton, srcindices)
+    err_l = la.norm(blk_err_l, np.inf)
+    err_r = la.norm(blk_err_r, np.inf)
+
+    logger.info("error: id_eps %.5e L %.5e R %.5e F %.5e",
+            case.id_eps, err_l, err_r, err_f)
+
+    # FIXME: why is the 3D error so large?
+    rtol = 5 * 10**places.ambient_dim * case.id_eps
+
+    assert err_l < rtol
+    assert err_r < rtol
+    assert err_f < rtol
+
+    # }}}
+
+    # {{{ visualize
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    pt.imshow(np.log10(blk_err_l + 1.0e-16))
+    pt.colorbar()
+    pt.savefig("test_skeletonize_by_proxy_err_l")
+    pt.clf()
+
+    pt.imshow(np.log10(blk_err_r + 1.0e-16))
+    pt.colorbar()
+    pt.savefig("test_skeletonize_by_proxy_err_r")
+    pt.clf()
+
+    def plot_skeleton(isrc, iskl, name):
+        pt.figure(figsize=(10, 10), dpi=300)
+        pt.plot(sources[0][isrc.indices], sources[1][isrc.indices],
+                "ko", alpha=0.5)
+
+        ax = pt.gca()
+        for i in range(srcindices.nblocks):
+            iblk = iskl.block_indices(i)
+            pt.plot(sources[0][iblk], sources[1][iblk], "o")
+
+            c = pt.Circle(pxycenters[:, i], pxyradii[i], color="k", alpha=0.1)
+            ax.add_artist(c)
+
+        ax.set_aspect("equal")
+        pt.xlim([-1.5, 1.5])
+        pt.ylim([-1.5, 1.5])
+        pt.savefig(f"test_skeletonize_by_proxy_{name}")
+        pt.clf()
+
+    pxy = proxy_generator(actx, wrangler.domains[0], srcindices.row).get(queue)
+    pxycenters = np.vstack(pxy.centers)
+    pxyradii = pxy.radii
+
+    if places.ambient_dim == 2:
+        sources = flatten_to_numpy(actx, density_discr.nodes())
+        srcindices = srcindices.get(queue)
+        sklindices = sklindices.get(queue)
+
+        plot_skeleton(srcindices.col, sklindices.col, "sources")
+        plot_skeleton(srcindices.row, sklindices.row, "targets")
+    else:
+        # TODO: would be nice to figure out a way to visualize some of these
+        # skeletonization results in 3D. Probably need to teach the
+        # visualizers to spit out point clouds
+        pass
+
+    # }}}
+
+# }}}
+
 
 if __name__ == "__main__":
     import sys
diff --git a/test/test_matrix.py b/test/test_matrix.py
index 9b04fbfd1ef3a38e414bdf7a08bb0bea7aa602a7..4ab317f70cda5f1799cc431039a90a6cb5617bb2 100644
--- a/test/test_matrix.py
+++ b/test/test_matrix.py
@@ -111,7 +111,8 @@ def test_build_matrix(ctx_factory, k, curve_fn, op_type, visualize=False):
     from pytential.symbolic.execution import build_matrix
     mat = actx.to_numpy(
             build_matrix(actx, places, sym_op, sym_u,
-            context=case.knl_concrete_kwargs))
+                context=case.knl_concrete_kwargs)
+            )
 
     if visualize:
         try: