diff --git a/doc/conf.py b/doc/conf.py
index 81b15205e10a3e747bec9b350f31f9979c920656..6df4af25e85a60f552c9d45b3e2da91a8ea2fef3 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -271,6 +271,7 @@ intersphinx_mapping = {
     'http://docs.python.org/': None,
     'http://documen.tician.de/boxtree/': None,
     'http://docs.scipy.org/doc/numpy/': None,
+    'http://documen.tician.de/meshmode/': None,
     'http://documen.tician.de/modepy/': None,
     'http://documen.tician.de/pyopencl/': None,
     'http://documen.tician.de/pytools/': None,
diff --git a/doc/index.rst b/doc/index.rst
index d56c8bb52279e161973f2c402419343d7729d878..772a936a9211e7b2db269a8b5c97e5749b2db8d0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -17,6 +17,7 @@ Contents
 
     discretization
     symbolic
+    linalg
     tools
     misc
 
diff --git a/doc/linalg.rst b/doc/linalg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d5e78a6e288d50419be802bf3ab235dff81c419d
--- /dev/null
+++ b/doc/linalg.rst
@@ -0,0 +1,9 @@
+Linear Algebra Routines
+=======================
+
+Hierarchical Direct Solver
+--------------------------
+
+.. automodule:: pytential.linalg.proxy
+
+.. vim: sw=4:tw=75
diff --git a/pytential/linalg/__init__.py b/pytential/linalg/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4a471df6aa7ab0893c8fa82fe2f3c1b319962c5
--- /dev/null
+++ b/pytential/linalg/__init__.py
@@ -0,0 +1,23 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2018 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..abb8e8dec7bc5b4b132258b4e8334221762c143c
--- /dev/null
+++ b/pytential/linalg/proxy.py
@@ -0,0 +1,692 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array # noqa
+from pyopencl.array import to_device
+
+from pytools.obj_array import make_obj_array
+from pytools import memoize_method, memoize
+from sumpy.tools import BlockIndexRanges
+
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+
+
+__doc__ = """
+Proxy Point Generation
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: ProxyGenerator
+
+.. autofunction:: partition_by_nodes
+.. autofunction:: partition_by_elements
+.. autofunction:: partition_from_coarse
+
+.. autofunction:: gather_block_neighbor_points
+.. autofunction:: gather_block_interaction_points
+"""
+
+
+# {{{ point index partitioning
+
+def _element_node_range(group, ielement):
+    istart = group.node_nr_base + group.nunit_nodes * ielement
+    iend = group.node_nr_base + group.nunit_nodes * (ielement + 1)
+
+    return np.arange(istart, iend)
+
+
+def partition_by_nodes(discr,
+                       use_tree=True,
+                       max_nodes_in_box=None):
+    """Generate equally sized ranges of nodes. The partition is created at the
+    lowest level of granularity, i.e. nodes. This results in balanced ranges
+    of points, but will split elements across different ranges.
+
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg use_tree: if ``True``, node partitions are generated using a
+        :class:`boxtree.TreeBuilder`, which leads to geometrically close
+        points to belong to the same partition. If ``False``, a simple linear
+        partition is constructed.
+    :arg max_nodes_in_box: passed to :class:`boxtree.TreeBuilder`.
+
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
+
+    if max_nodes_in_box is None:
+        # FIXME: this is just an arbitrary value
+        max_nodes_in_box = 32
+
+    with cl.CommandQueue(discr.cl_context) as queue:
+        if use_tree:
+            from boxtree import box_flags_enum
+            from boxtree import TreeBuilder
+
+            builder = TreeBuilder(discr.cl_context)
+
+            tree, _ = builder(queue, discr.nodes(),
+                max_particles_in_box=max_nodes_in_box)
+
+            tree = tree.get(queue)
+            leaf_boxes, = (tree.box_flags &
+                           box_flags_enum.HAS_CHILDREN == 0).nonzero()
+
+            indices = np.empty(len(leaf_boxes), dtype=np.object)
+            for i, ibox in enumerate(leaf_boxes):
+                box_start = tree.box_source_starts[ibox]
+                box_end = box_start + tree.box_source_counts_cumul[ibox]
+                indices[i] = tree.user_source_ids[box_start:box_end]
+
+            ranges = to_device(queue,
+                np.cumsum([0] + [box.shape[0] for box in indices]))
+            indices = to_device(queue, np.hstack(indices))
+        else:
+            indices = cl.array.arange(queue, 0, discr.nnodes,
+                                      dtype=np.int)
+            ranges = cl.array.arange(queue, 0, discr.nnodes + 1,
+                                     discr.nnodes // max_nodes_in_box,
+                                     dtype=np.int)
+        assert ranges[-1] == discr.nnodes
+
+        return BlockIndexRanges(discr.cl_context,
+                                indices.with_queue(None),
+                                ranges.with_queue(None))
+
+
+def partition_by_elements(discr,
+                          use_tree=True,
+                          max_elements_in_box=None):
+    """Generate equally sized ranges of points. The partition is created at the
+    element level, so that all the nodes belonging to an element belong to
+    the same range. This can result in slightly larger differences in size
+    between the ranges, but can be very useful when the individual partitions
+    need to be resampled, integrated, etc.
+
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg use_tree: if ``True``, node partitions are generated using a
+        :class:`boxtree.TreeBuilder`, which leads to geometrically close
+        points to belong to the same partition. If ``False``, a simple linear
+        partition is constructed.
+    :arg max_elements_in_box: passed to :class:`boxtree.TreeBuilder`.
+
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
+
+    if max_elements_in_box is None:
+        # NOTE: keep in sync with partition_by_nodes
+        max_nodes_in_box = 32
+
+        nunit_nodes = int(np.mean([g.nunit_nodes for g in discr.groups]))
+        max_elements_in_box = max_nodes_in_box // nunit_nodes
+
+    with cl.CommandQueue(discr.cl_context) as queue:
+        if use_tree:
+            from boxtree import box_flags_enum
+            from boxtree import TreeBuilder
+
+            builder = TreeBuilder(discr.cl_context)
+
+            from pytential.qbx.utils import element_centers_of_mass
+            elranges = np.cumsum([group.nelements for group in discr.mesh.groups])
+            elcenters = element_centers_of_mass(discr)
+
+            tree, _ = builder(queue, elcenters,
+                max_particles_in_box=max_elements_in_box)
+
+            groups = discr.groups
+            tree = tree.get(queue)
+            leaf_boxes, = (tree.box_flags &
+                           box_flags_enum.HAS_CHILDREN == 0).nonzero()
+
+            indices = np.empty(len(leaf_boxes), dtype=np.object)
+            for i, ibox in enumerate(leaf_boxes):
+                box_start = tree.box_source_starts[ibox]
+                box_end = box_start + tree.box_source_counts_cumul[ibox]
+
+                ielement = tree.user_source_ids[box_start:box_end]
+                igroup = np.digitize(ielement, elranges)
+
+                indices[i] = np.hstack([_element_node_range(groups[j], k)
+                                        for j, k in zip(igroup, ielement)])
+        else:
+            nelements = discr.mesh.nelements
+            elements = np.array_split(np.arange(0, nelements),
+                                      nelements // max_elements_in_box)
+
+            elranges = np.cumsum([g.nelements for g in discr.groups])
+            elgroups = [np.digitize(elements[i], elranges)
+                        for i in range(len(elements))]
+
+            indices = np.empty(len(elements), dtype=np.object)
+            for i in range(indices.shape[0]):
+                indices[i] = np.hstack([_element_node_range(discr.groups[j], k)
+                                        for j, k in zip(elgroups[i], elements[i])])
+
+        ranges = to_device(queue,
+                np.cumsum([0] + [b.shape[0] for b in indices]))
+        indices = to_device(queue, np.hstack(indices))
+        assert ranges[-1] == discr.nnodes
+
+        return BlockIndexRanges(discr.cl_context,
+                                indices.with_queue(None),
+                                ranges.with_queue(None))
+
+
+def partition_from_coarse(resampler, from_indices):
+    """Generate a partition of nodes from an existing partition on a
+    coarser discretization. The new partition is generated based on element
+    refinement relationships in *resampler*, so the existing partition
+    needs to be created using :func:`partition_by_elements`,
+    since we assume that each range contains all the nodes in an element.
+
+    The new partition will have the same number of ranges as the old partition.
+    The nodes inside each range in the new partition are all the nodes in
+    *resampler.to_discr* that were refined from elements in the same
+    range from *resampler.from_discr*.
+
+    :arg resampler: a
+        :class:`meshmode.discretization.connection.DirectDiscretizationConnection`.
+    :arg from_indices: a :class:`sumpy.tools.BlockIndexRanges`.
+
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
+
+    if not hasattr(resampler, "groups"):
+        raise ValueError("resampler must be a DirectDiscretizationConnection.")
+
+    with cl.CommandQueue(resampler.cl_context) as queue:
+        from_indices = from_indices.get(queue)
+
+        # construct ranges
+        from_discr = resampler.from_discr
+        from_grp_ranges = np.cumsum([0] +
+                [grp.nelements for grp in from_discr.mesh.groups])
+        from_el_ranges = np.hstack([
+            np.arange(grp.node_nr_base, grp.nnodes + 1, grp.nunit_nodes)
+            for grp in from_discr.groups])
+
+        # construct coarse element arrays in each from_range
+        el_indices = np.empty(from_indices.nblocks, dtype=np.object)
+        el_ranges = np.full(from_grp_ranges[-1], -1, dtype=np.int)
+        for i in range(from_indices.nblocks):
+            ifrom = from_indices.block_indices(i)
+            el_indices[i] = np.unique(np.digitize(ifrom, from_el_ranges)) - 1
+            el_ranges[el_indices[i]] = i
+        el_indices = np.hstack(el_indices)
+
+        # construct lookup table
+        to_el_table = [np.full(g.nelements, -1, dtype=np.int)
+                       for g in resampler.to_discr.groups]
+
+        for igrp, grp in enumerate(resampler.groups):
+            for batch in grp.batches:
+                to_el_table[igrp][batch.to_element_indices.get(queue)] = \
+                    from_grp_ranges[igrp] + batch.from_element_indices.get(queue)
+
+        # construct fine node index list
+        indices = [np.empty(0, dtype=np.int)
+                   for _ in range(from_indices.nblocks)]
+        for igrp in range(len(resampler.groups)):
+            to_element_indices = \
+                    np.where(np.isin(to_el_table[igrp], el_indices))[0]
+
+            for i, j in zip(el_ranges[to_el_table[igrp][to_element_indices]],
+                            to_element_indices):
+                indices[i] = np.hstack([indices[i],
+                    _element_node_range(resampler.to_discr.groups[igrp], j)])
+
+        ranges = to_device(queue,
+                np.cumsum([0] + [b.shape[0] for b in indices]))
+        indices = to_device(queue, np.hstack(indices))
+
+        return BlockIndexRanges(resampler.cl_context,
+                                indices.with_queue(None),
+                                ranges.with_queue(None))
+
+# }}}
+
+
+# {{{ proxy point generator
+
+def _generate_unit_sphere(ambient_dim, approx_npoints):
+    """Generate uniform points on a unit sphere.
+
+    :arg ambient_dim: dimension of the ambient space.
+    :arg approx_npoints: approximate number of points to generate. If the
+        ambient space is 3D, this will not generate the exact number of points.
+    :return: array of shape ``(ambient_dim, npoints)``, where ``npoints``
+        will not generally be the same as ``approx_npoints``.
+    """
+
+    if ambient_dim == 2:
+        t = np.linspace(0.0, 2.0 * np.pi, approx_npoints)
+        points = np.vstack([np.cos(t), np.sin(t)])
+    elif ambient_dim == 3:
+        # https://www.cmu.edu/biolphys/deserno/pdf/sphere_equi.pdf
+        # code by Matt Wala from
+        # https://github.com/mattwala/gigaqbx-accuracy-experiments/blob/d56ed063ffd7843186f4fe05d2a5b5bfe6ef420c/translation_accuracy.py#L23
+        a = 4.0 * np.pi / approx_npoints
+        m_theta = int(np.round(np.pi / np.sqrt(a)))
+        d_theta = np.pi / m_theta
+        d_phi = a / d_theta
+
+        points = []
+        for m in range(m_theta):
+            theta = np.pi * (m + 0.5) / m_theta
+            m_phi = int(np.round(2.0 * np.pi * np.sin(theta) / d_phi))
+
+            for n in range(m_phi):
+                phi = 2.0 * np.pi * n / m_phi
+                points.append(np.array([np.sin(theta) * np.cos(phi),
+                                        np.sin(theta) * np.sin(phi),
+                                        np.cos(theta)]))
+
+        for i in range(ambient_dim):
+            for sign in [-1, 1]:
+                pole = np.zeros(ambient_dim)
+                pole[i] = sign
+                points.append(pole)
+
+        points = np.array(points).T
+    else:
+        raise ValueError("ambient_dim > 3 not supported.")
+
+    return points
+
+
+class ProxyGenerator(object):
+    r"""
+    .. attribute:: ambient_dim
+    .. attribute:: nproxy
+
+        Number of proxy points in a single proxy ball.
+
+    .. attribute:: source
+
+        A :class:`pytential.qbx.QBXLayerPotentialSource`.
+
+    .. attribute:: ratio
+
+        A ratio used to compute the proxy ball radius. The radius
+        is computed in the :math:`\ell^2` norm, resulting in a circle or
+        sphere of proxy points. For QBX, we have two radii of interest
+        for a set of points: the radius :math:`r_{block}` of the
+        smallest ball containing all the points and the radius
+        :math:`r_{qbx}` of the smallest ball containing all the QBX
+        expansion balls in the block. If the ratio :math:`\theta \in
+        [0, 1]`, then the radius of the proxy ball is
+
+        .. math::
+
+            r = (1 - \theta) r_{block} + \theta r_{qbx}.
+
+        If the ratio :math:`\theta > 1`, the the radius is simply
+
+        .. math::
+
+            r = \theta r_{qbx}.
+
+    .. attribute:: ref_points
+
+        Reference points on a unit ball. Can be used to construct the points
+        of a proxy ball :math:`i` by translating them to ``center[i]`` and
+        scaling by ``radii[i]``, as obtained by :meth:`__call__`.
+
+    .. automethod:: __call__
+    """
+
+    def __init__(self, source, approx_nproxy=None, ratio=None):
+        self.source = source
+        self.ambient_dim = source.density_discr.ambient_dim
+        self.ratio = 1.1 if ratio is None else ratio
+
+        approx_nproxy = 32 if approx_nproxy is None else approx_nproxy
+        self.ref_points = \
+                _generate_unit_sphere(self.ambient_dim, approx_nproxy)
+
+    @property
+    def nproxy(self):
+        return self.ref_points.shape[1]
+
+    @memoize_method
+    def get_kernel(self):
+        if self.ratio < 1.0:
+            radius_expr = "(1.0 - {ratio}) * rblk + {ratio} * rqbx"
+        else:
+            radius_expr = "{ratio} * rqbx"
+        radius_expr = radius_expr.format(ratio=self.ratio)
+
+        # NOTE: centers of mass are computed using a second-order approximation
+        # that currently matches what is in `element_centers_of_mass`.
+        knl = lp.make_kernel([
+            "{[irange]: 0 <= irange < nranges}",
+            "{[i]: 0 <= i < npoints}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            ["""
+            for irange
+                <> ioffset = srcranges[irange]
+                <> npoints = srcranges[irange + 1] - srcranges[irange]
+
+                proxy_center[idim, irange] = 1.0 / npoints * \
+                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
+                        {{dup=idim:i}}
+
+                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         sources[idim, srcindices[i + ioffset]]) ** 2)))
+
+                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx = if(rqbx_ext < rqbx_int, rqbx_int, rqbx_ext)
+
+                proxy_radius[irange] = {radius_expr}
+            end
+            """.format(radius_expr=radius_expr)],
+            [
+                lp.GlobalArg("sources", None,
+                    shape=(self.ambient_dim, "nsources")),
+                lp.GlobalArg("center_int", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("center_ext", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("proxy_center", None,
+                    shape=(self.ambient_dim, "nranges")),
+                lp.GlobalArg("proxy_radius", None,
+                    shape="nranges"),
+                lp.ValueArg("nsources", np.int),
+                "..."
+            ],
+            name="find_proxy_radii_knl",
+            assumptions="dim>=1 and nranges>=1",
+            fixed_parameters=dict(dim=self.ambient_dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        knl = lp.tag_inames(knl, "idim*:unr")
+
+        return knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+
+        return knl
+
+    def __call__(self, queue, indices, **kwargs):
+        """Generate proxy points for each given range of source points in
+        the discretization in :attr:`source`.
+
+        :arg queue: a :class:`pyopencl.CommandQueue`.
+        :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+
+        :return: a tuple of ``(proxies, pxyranges, pxycenters, pxyranges)``,
+            where each element is a :class:`pyopencl.array.Array`. The
+            sizes of the arrays are as follows: ``pxycenters`` is of size
+            ``(2, nranges)``, ``pxyradii`` is of size ``(nranges,)``,
+            ``pxyranges`` is of size ``(nranges + 1,)`` and ``proxies`` is
+            of size ``(2, nranges * nproxy)``. The proxy points in a range
+            :math:`i` can be obtained by a slice
+            ``proxies[pxyranges[i]:pxyranges[i + 1]]`` and are all at a
+            distance ``pxyradii[i]`` from the range center ``pxycenters[i]``.
+        """
+
+        def _affine_map(v, A, b):
+            return np.dot(A, v) + b
+
+        from pytential.qbx.utils import get_centers_on_side
+
+        knl = self.get_kernel()
+        _, (centers_dev, radii_dev,) = knl(queue,
+            sources=self.source.density_discr.nodes(),
+            center_int=get_centers_on_side(self.source, -1),
+            center_ext=get_centers_on_side(self.source, +1),
+            expansion_radii=self.source._expansion_radii("nsources"),
+            srcindices=indices.indices,
+            srcranges=indices.ranges, **kwargs)
+        centers = centers_dev.get()
+        radii = radii_dev.get()
+
+        proxies = np.empty(indices.nblocks, dtype=np.object)
+        for i in range(indices.nblocks):
+            proxies[i] = _affine_map(self.ref_points,
+                    A=(radii[i] * np.eye(self.ambient_dim)),
+                    b=centers[:, i].reshape(-1, 1))
+
+        pxyranges = cl.array.arange(queue,
+                0,
+                proxies.shape[0] * proxies[0].shape[1] + 1,
+                proxies[0].shape[1],
+                dtype=indices.ranges.dtype)
+        proxies = make_obj_array([
+            cl.array.to_device(queue, np.hstack([p[idim] for p in proxies]))
+            for idim in range(self.ambient_dim)])
+        centers = make_obj_array([
+            centers_dev[idim].with_queue(queue).copy()
+            for idim in range(self.ambient_dim)])
+
+        assert pxyranges[-1] == proxies[0].shape[0]
+        return proxies, pxyranges, centers, radii_dev
+
+
+def gather_block_neighbor_points(discr, indices, pxycenters, pxyradii,
+                                 max_nodes_in_box=None):
+    """Generate a set of neighboring points for each range of points in
+    *discr*. Neighboring points of a range :math:`i` are defined
+    as all the points inside the proxy ball :math:`i` that do not also
+    belong to the range itself.
+
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+    :arg pxycenters: an array containing the center of each proxy ball.
+    :arg pxyradii: an array containing the radius of each proxy ball.
+
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
+
+    if max_nodes_in_box is None:
+        # FIXME: this is a fairly arbitrary value
+        max_nodes_in_box = 32
+
+    with cl.CommandQueue(discr.cl_context) as queue:
+        indices = indices.get(queue)
+
+        # NOTE: this is constructed for multiple reasons:
+        #   * TreeBuilder takes object arrays
+        #   * `srcindices` can be a small subset of nodes, so this will save
+        #   some work
+        #   * `srcindices` may reorder the array returned by nodes(), so this
+        #   makes sure that we have the same order in tree.user_source_ids
+        #   and friends
+        sources = discr.nodes().get(queue)
+        sources = make_obj_array([
+            cl.array.to_device(queue, sources[idim, indices.indices])
+            for idim in range(discr.ambient_dim)])
+
+        # construct tree
+        from boxtree import TreeBuilder
+        builder = TreeBuilder(discr.cl_context)
+        tree, _ = builder(queue, sources,
+                          max_particles_in_box=max_nodes_in_box)
+
+        from boxtree.area_query import AreaQueryBuilder
+        builder = AreaQueryBuilder(discr.cl_context)
+        query, _ = builder(queue, tree, pxycenters, pxyradii)
+
+        # find nodes inside each proxy ball
+        tree = tree.get(queue)
+        query = query.get(queue)
+
+        if isinstance(pxycenters[0], cl.array.Array):
+            pxycenters = np.vstack([pxycenters[idim].get(queue)
+                                    for idim in range(discr.ambient_dim)])
+        if isinstance(pxyradii, cl.array.Array):
+            pxyradii = pxyradii.get(queue)
+
+        nbrindices = np.empty(indices.nblocks, dtype=np.object)
+        for iproxy in range(indices.nblocks):
+            # get list of boxes intersecting the current ball
+            istart = query.leaves_near_ball_starts[iproxy]
+            iend = query.leaves_near_ball_starts[iproxy + 1]
+            iboxes = query.leaves_near_ball_lists[istart:iend]
+
+            # get nodes inside the boxes
+            istart = tree.box_source_starts[iboxes]
+            iend = istart + tree.box_source_counts_cumul[iboxes]
+            isources = np.hstack([np.arange(s, e)
+                                  for s, e in zip(istart, iend)])
+            nodes = np.vstack([tree.sources[idim][isources]
+                               for idim in range(discr.ambient_dim)])
+            isources = tree.user_source_ids[isources]
+
+            # get nodes inside the ball but outside the current range
+            center = pxycenters[:, iproxy].reshape(-1, 1)
+            radius = pxyradii[iproxy]
+            mask = (la.norm(nodes - center, axis=0) < radius) & \
+                   ((isources < indices.ranges[iproxy]) |
+                    (indices.ranges[iproxy + 1] <= isources))
+
+            nbrindices[iproxy] = indices.indices[isources[mask]]
+
+        nbrranges = to_device(queue,
+                np.cumsum([0] + [n.shape[0] for n in nbrindices]))
+        nbrindices = to_device(queue, np.hstack(nbrindices))
+
+        return BlockIndexRanges(discr.cl_context,
+                                nbrindices.with_queue(None),
+                                nbrranges.with_queue(None))
+
+
+def gather_block_interaction_points(source, indices,
+                                    ratio=None,
+                                    approx_nproxy=None,
+                                    max_nodes_in_box=None):
+    """Generate sets of interaction points for each given range of indices
+    in the *source* discretization. For each input range of indices,
+    the corresponding output range of points is consists of:
+
+    - a set of proxy points (or balls) around the range, which
+      model farfield interactions. These are constructed using
+      :class:`ProxyGenerator`.
+
+    - a set of neighboring points that are inside the proxy balls, but
+      do not belong to the given range, which model nearby interactions.
+      These are constructed with :func:`gather_block_neighbor_points`.
+
+    :arg source: a :class:`pytential.qbx.QBXLayerPotentialSource`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+
+    :return: a tuple ``(nodes, ranges)``, where each value is a
+        :class:`pyopencl.array.Array`. For a range :math:`i`, we can
+        get the slice using ``nodes[ranges[i]:ranges[i + 1]]``.
+    """
+
+    @memoize
+    def knl():
+        loopy_knl = lp.make_kernel([
+            "{[irange, idim]: 0 <= irange < nranges and \
+                              0 <= idim < dim}",
+            "{[ipxy, ingb]: 0 <= ipxy < npxyblock and \
+                            0 <= ingb < nngbblock}"
+            ],
+            """
+            for irange
+                <> pxystart = pxyranges[irange]
+                <> pxyend = pxyranges[irange + 1]
+                <> npxyblock = pxyend - pxystart
+
+                <> ngbstart = nbrranges[irange]
+                <> ngbend = nbrranges[irange + 1]
+                <> nngbblock = ngbend - ngbstart
+
+                <> istart = pxyranges[irange] + nbrranges[irange]
+                nodes[idim, istart + ipxy] = \
+                    proxies[idim, pxystart + ipxy] \
+                    {id_prefix=write_pxy,nosync=write_ngb}
+                nodes[idim, istart + npxyblock + ingb] = \
+                    sources[idim, nbrindices[ngbstart + ingb]] \
+                    {id_prefix=write_ngb,nosync=write_pxy}
+                ranges[irange + 1] = ranges[irange] + npxyblock + nngbblock
+            end
+            """,
+            [
+                lp.GlobalArg("sources", None,
+                    shape=(source.ambient_dim, "nsources")),
+                lp.GlobalArg("proxies", None,
+                    shape=(source.ambient_dim, "nproxies"), dim_tags="sep,C"),
+                lp.GlobalArg("nbrindices", None,
+                    shape="nnbrindices"),
+                lp.GlobalArg("nodes", None,
+                    shape=(source.ambient_dim, "nproxies + nnbrindices")),
+                lp.ValueArg("nsources", np.int),
+                lp.ValueArg("nproxies", np.int),
+                lp.ValueArg("nnbrindices", np.int),
+                "..."
+            ],
+            name="concat_proxy_and_neighbors",
+            default_offset=lp.auto,
+            silenced_warnings="write_race(write_*)",
+            fixed_parameters=dict(dim=source.ambient_dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+        return loopy_knl
+
+    with cl.CommandQueue(source.cl_context) as queue:
+        generator = ProxyGenerator(source,
+                                   ratio=ratio,
+                                   approx_nproxy=approx_nproxy)
+        proxies, pxyranges, pxycenters, pxyradii = generator(queue, indices)
+
+        neighbors = gather_block_neighbor_points(source.density_discr,
+                indices, pxycenters, pxyradii,
+                max_nodes_in_box=max_nodes_in_box)
+
+        ranges = cl.array.zeros(queue, indices.nblocks + 1, dtype=np.int)
+        _, (nodes, ranges) = knl()(queue,
+                sources=source.density_discr.nodes(),
+                proxies=proxies,
+                pxyranges=pxyranges,
+                nbrindices=neighbors.indices,
+                nbrranges=neighbors.ranges,
+                ranges=ranges)
+
+        return nodes.with_queue(None), ranges.with_queue(None)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8a063ca3b94f515e20ee9b130893f0f23243dc3
--- /dev/null
+++ b/test/test_linalg_proxy.py
@@ -0,0 +1,436 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import os
+import time
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+from pyopencl.array import to_device
+
+from sumpy.tools import BlockIndexRanges
+from meshmode.mesh.generation import ( # noqa
+        ellipse, NArmedStarfish, generate_torus, make_curve_mesh)
+
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+
+def _build_qbx_discr(queue,
+        ndim=2,
+        nelements=30,
+        target_order=7,
+        qbx_order=4,
+        curve_f=None):
+
+    if curve_f is None:
+        curve_f = NArmedStarfish(5, 0.25)
+
+    if ndim == 2:
+        mesh = make_curve_mesh(curve_f,
+                np.linspace(0, 1, nelements + 1),
+                target_order)
+    elif ndim == 3:
+        mesh = generate_torus(10.0, 2.0, order=target_order)
+    else:
+        raise ValueError("unsupported ambient dimension")
+
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+    from pytential.qbx import QBXLayerPotentialSource
+    density_discr = Discretization(
+            queue.context, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    qbx, _ = QBXLayerPotentialSource(density_discr,
+            fine_order=4 * target_order,
+            qbx_order=qbx_order,
+            fmm_order=False).with_refinement()
+
+    return qbx
+
+
+def _build_block_index(discr,
+                       nblks=10,
+                       factor=1.0,
+                       method='elements',
+                       use_tree=True):
+
+    from pytential.linalg.proxy import (
+            partition_by_nodes, partition_by_elements)
+
+    if method == 'elements':
+        factor = 1.0
+
+    if method == 'nodes':
+        nnodes = discr.nnodes
+    else:
+        nnodes = discr.mesh.nelements
+    max_particles_in_box = nnodes // nblks
+
+    # create index ranges
+    if method == 'nodes':
+        indices = partition_by_nodes(discr,
+                                     use_tree=use_tree,
+                                     max_nodes_in_box=max_particles_in_box)
+    elif method == 'elements':
+        indices = partition_by_elements(discr,
+                                        use_tree=use_tree,
+                                        max_elements_in_box=max_particles_in_box)
+    else:
+        raise ValueError('unknown method: {}'.format(method))
+
+    # randomly pick a subset of points
+    if abs(factor - 1.0) > 1.0e-14:
+        with cl.CommandQueue(discr.cl_context) as queue:
+            indices = indices.get(queue)
+
+            indices_ = np.empty(indices.nblocks, dtype=np.object)
+            for i in range(indices.nblocks):
+                iidx = indices.block_indices(i)
+                isize = int(factor * len(iidx))
+                isize = max(1, min(isize, len(iidx)))
+
+                indices_[i] = np.sort(
+                        np.random.choice(iidx, size=isize, replace=False))
+
+            ranges_ = to_device(queue,
+                    np.cumsum([0] + [r.shape[0] for r in indices_]))
+            indices_ = to_device(queue, np.hstack(indices_))
+
+            indices = BlockIndexRanges(discr.cl_context,
+                                       indices_.with_queue(None),
+                                       ranges_.with_queue(None))
+
+    return indices
+
+
+def _plot_partition_indices(queue, discr, indices, **kwargs):
+    import matplotlib.pyplot as pt
+    indices = indices.get(queue)
+
+    args = [
+        kwargs.get("method", "unknown"),
+        "tree" if kwargs.get("use_tree", False) else "linear",
+        kwargs.get("pid", "stage1"),
+        discr.ambient_dim
+        ]
+
+    pt.figure(figsize=(10, 8), dpi=300)
+    pt.plot(np.diff(indices.ranges))
+    pt.savefig("test_partition_{0}_{1}_{3}d_ranges_{2}.png".format(*args))
+    pt.clf()
+
+    if discr.ambient_dim == 2:
+        sources = discr.nodes().get(queue)
+
+        pt.figure(figsize=(10, 8), dpi=300)
+
+        if indices.indices.shape[0] != discr.nnodes:
+            pt.plot(sources[0], sources[1], 'ko', alpha=0.5)
+        for i in range(indices.nblocks):
+            isrc = indices.block_indices(i)
+            pt.plot(sources[0][isrc], sources[1][isrc], 'o')
+
+        pt.xlim([-1.5, 1.5])
+        pt.ylim([-1.5, 1.5])
+        pt.savefig("test_partition_{0}_{1}_{3}d_{2}.png".format(*args))
+        pt.clf()
+    elif discr.ambient_dim == 3:
+        from meshmode.discretization import NoninterpolatoryElementGroupError
+        try:
+            discr.groups[0].basis()
+        except NoninterpolatoryElementGroupError:
+            return
+
+        from meshmode.discretization.visualization import make_visualizer
+        marker = -42.0 * np.ones(discr.nnodes)
+
+        for i in range(indices.nblocks):
+            isrc = indices.block_indices(i)
+            marker[isrc] = 10.0 * (i + 1.0)
+
+        vis = make_visualizer(queue, discr, 10)
+
+        filename = "test_partition_{0}_{1}_{3}d_{2}.png".format(*args)
+        if os.path.isfile(filename):
+            os.remove(filename)
+
+        vis.write_vtk_file(filename, [
+            ("marker", cl.array.to_device(queue, marker))
+            ])
+
+
+@pytest.mark.parametrize("method", ["nodes", "elements"])
+@pytest.mark.parametrize("use_tree", [True, False])
+@pytest.mark.parametrize("ndim", [2, 3])
+def test_partition_points(ctx_factory, method, use_tree, ndim, visualize=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    qbx = _build_qbx_discr(queue, ndim=ndim)
+    _build_block_index(qbx.density_discr,
+                       method=method,
+                       use_tree=use_tree,
+                       factor=0.6)
+
+
+@pytest.mark.parametrize("use_tree", [True, False])
+@pytest.mark.parametrize("ndim", [2, 3])
+def test_partition_coarse(ctx_factory, use_tree, ndim, visualize=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    qbx = _build_qbx_discr(queue, ndim=ndim)
+    srcindices = _build_block_index(qbx.density_discr,
+            method="elements", use_tree=use_tree)
+
+    if visualize:
+        discr = qbx.resampler.from_discr
+        _plot_partition_indices(queue, discr, srcindices,
+                method="elements", use_tree=use_tree, pid="stage1")
+
+    from pytential.linalg.proxy import partition_from_coarse
+    resampler = qbx.direct_resampler
+
+    t_start = time.time()
+    srcindices_ = partition_from_coarse(resampler, srcindices)
+    t_end = time.time()
+    if visualize:
+        print('Time: {:.5f}s'.format(t_end - t_start))
+
+    srcindices = srcindices.get(queue)
+    srcindices_ = srcindices_.get(queue)
+
+    sources = resampler.from_discr.nodes().get(queue)
+    sources_ = resampler.to_discr.nodes().get(queue)
+
+    for i in range(srcindices.nblocks):
+        isrc = srcindices.block_indices(i)
+        isrc_ = srcindices_.block_indices(i)
+
+        for j in range(ndim):
+            assert np.min(sources_[j][isrc_]) <= np.min(sources[j][isrc])
+            assert np.max(sources_[j][isrc_]) >= np.max(sources[j][isrc])
+
+    if visualize:
+        discr = resampler.to_discr
+        _plot_partition_indices(queue, discr, srcindices_,
+                method="elements", use_tree=use_tree, pid="stage2")
+
+
+@pytest.mark.parametrize("ndim", [2, 3])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+def test_proxy_generator(ctx_factory, ndim, factor, visualize=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    qbx = _build_qbx_discr(queue, ndim=ndim)
+    srcindices = _build_block_index(qbx.density_discr,
+            method='nodes', factor=factor)
+
+    from pytential.linalg.proxy import ProxyGenerator
+    generator = ProxyGenerator(qbx, ratio=1.1)
+    proxies, pxyranges, pxycenters, pxyradii = generator(queue, srcindices)
+
+    proxies = np.vstack([p.get() for p in proxies])
+    pxyranges = pxyranges.get()
+    pxycenters = np.vstack([c.get() for c in pxycenters])
+    pxyradii = pxyradii.get()
+
+    for i in range(srcindices.nblocks):
+        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+
+        r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)
+
+    srcindices = srcindices.get(queue)
+    if visualize:
+        if qbx.ambient_dim == 2:
+            import matplotlib.pyplot as pt
+            from pytential.qbx.utils import get_centers_on_side
+
+            density_nodes = qbx.density_discr.nodes().get(queue)
+            ci = get_centers_on_side(qbx, -1)
+            ci = np.vstack([c.get(queue) for c in ci])
+            ce = get_centers_on_side(qbx, +1)
+            ce = np.vstack([c.get(queue) for c in ce])
+            r = qbx._expansion_radii("nsources").get(queue)
+
+            for i in range(srcindices.nblocks):
+                isrc = srcindices.block_indices(i)
+                ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+
+                pt.figure(figsize=(10, 8))
+                axis = pt.gca()
+                for j in isrc:
+                    c = pt.Circle(ci[:, j], r[j], color='k', alpha=0.1)
+                    axis.add_artist(c)
+                    c = pt.Circle(ce[:, j], r[j], color='k', alpha=0.1)
+                    axis.add_artist(c)
+
+                pt.plot(density_nodes[0], density_nodes[1],
+                        'ko', ms=2.0, alpha=0.5)
+                pt.plot(density_nodes[0, srcindices.indices],
+                        density_nodes[1, srcindices.indices],
+                        'o', ms=2.0)
+                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
+                        'o', ms=2.0)
+                pt.plot(proxies[0, ipxy], proxies[1, ipxy],
+                        'o', ms=2.0)
+                pt.xlim([-1.5, 1.5])
+                pt.ylim([-1.5, 1.5])
+
+                filename = "test_proxy_generator_{}d_{:04}.png".format(ndim, i)
+                pt.savefig(filename, dpi=300)
+                pt.clf()
+        else:
+            from meshmode.discretization.visualization import make_visualizer
+            from meshmode.mesh.processing import ( # noqa
+                    affine_map, merge_disjoint_meshes)
+            from meshmode.discretization import Discretization
+            from meshmode.discretization.poly_element import \
+                InterpolatoryQuadratureSimplexGroupFactory
+
+            from meshmode.mesh.generation import generate_icosphere
+            ref_mesh = generate_icosphere(1, generator.nproxy)
+
+            # NOTE: this does not plot the actual proxy points
+            for i in range(srcindices.nblocks):
+                mesh = affine_map(ref_mesh,
+                    A=(pxyradii[i] * np.eye(ndim)),
+                    b=pxycenters[:, i].reshape(-1))
+
+                mesh = merge_disjoint_meshes([mesh, qbx.density_discr.mesh])
+                discr = Discretization(ctx, mesh,
+                    InterpolatoryQuadratureSimplexGroupFactory(10))
+
+                vis = make_visualizer(queue, discr, 10)
+                filename = "test_proxy_generator_{}d_{:04}.vtu".format(ndim, i)
+                if os.path.isfile(filename):
+                    os.remove(filename)
+                vis.write_vtk_file(filename, [])
+
+
+@pytest.mark.parametrize("ndim", [2, 3])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+def test_interaction_points(ctx_factory, ndim, factor, visualize=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    qbx = _build_qbx_discr(queue, ndim=ndim)
+    srcindices = _build_block_index(qbx.density_discr,
+            method='nodes', factor=factor)
+
+    # generate proxy points
+    from pytential.linalg.proxy import ProxyGenerator
+    generator = ProxyGenerator(qbx)
+    _, _, pxycenters, pxyradii = generator(queue, srcindices)
+
+    from pytential.linalg.proxy import (  # noqa
+            gather_block_neighbor_points,
+            gather_block_interaction_points)
+    nbrindices = gather_block_neighbor_points(qbx.density_discr,
+            srcindices, pxycenters, pxyradii)
+    nodes, ranges = gather_block_interaction_points(qbx, srcindices)
+
+    srcindices = srcindices.get(queue)
+    nbrindices = nbrindices.get(queue)
+
+    for i in range(srcindices.nblocks):
+        isrc = srcindices.block_indices(i)
+        inbr = nbrindices.block_indices(i)
+
+        assert not np.any(np.isin(inbr, isrc))
+
+    if visualize:
+        if ndim == 2:
+            import matplotlib.pyplot as pt
+            density_nodes = qbx.density_discr.nodes().get(queue)
+            nodes = nodes.get(queue)
+            ranges = ranges.get(queue)
+
+            for i in range(srcindices.nblocks):
+                isrc = srcindices.block_indices(i)
+                inbr = nbrindices.block_indices(i)
+                iall = np.s_[ranges[i]:ranges[i + 1]]
+
+                pt.figure(figsize=(10, 8))
+                pt.plot(density_nodes[0], density_nodes[1],
+                        'ko', ms=2.0, alpha=0.5)
+                pt.plot(density_nodes[0, srcindices.indices],
+                        density_nodes[1, srcindices.indices],
+                        'o', ms=2.0)
+                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
+                        'o', ms=2.0)
+                pt.plot(density_nodes[0, inbr], density_nodes[1, inbr],
+                        'o', ms=2.0)
+                pt.plot(nodes[0, iall], nodes[1, iall],
+                        'x', ms=2.0)
+                pt.xlim([-1.5, 1.5])
+                pt.ylim([-1.5, 1.5])
+
+                filename = "test_area_query_{}d_{:04}.png".format(ndim, i)
+                pt.savefig(filename, dpi=300)
+                pt.clf()
+        elif ndim == 3:
+            from meshmode.discretization.visualization import make_visualizer
+            marker = np.empty(qbx.density_discr.nnodes)
+
+            for i in range(srcindices.nblocks):
+                isrc = srcindices.block_indices(i)
+                inbr = nbrindices.block_indices(i)
+
+                # TODO: some way to turn off some of the interpolations
+                # would help visualize this better.
+                marker.fill(0.0)
+                marker[srcindices.indices] = 0.0
+                marker[isrc] = -42.0
+                marker[inbr] = +42.0
+                marker_dev = cl.array.to_device(queue, marker)
+
+                vis = make_visualizer(queue, qbx.density_discr, 10)
+                filename = "test_area_query_{}d_{:04}.vtu".format(ndim, i)
+                if os.path.isfile(filename):
+                    os.remove(filename)
+
+                vis.write_vtk_file(filename, [
+                    ("marker", marker_dev),
+                    ])
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker