diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index effbd8ce64d935a2821477a5598d923b3e47a8a9..51b2d38286b498b85991f301c0958c99f476af05 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -64,7 +64,7 @@ Python 3 POCL Examples:
   - test -n "$SKIP_EXAMPLES" && exit
   - export PY_EXE=python3
   - export PYOPENCL_TEST=portable:pthread
-  - export EXTRA_INSTALL="Cython pybind11 numpy mako git+git://github.com/inducer/pytools pyvisfile matplotlib"
+  - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako git+git://github.com/inducer/pytools pyvisfile matplotlib"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
   - ". ./build-py-project-and-run-examples.sh"
   tags:
@@ -130,7 +130,7 @@ Pylint:
   # Pylint won't find the Cython bits without this
   - PROJECT_INSTALL_FLAGS="--editable"
   - export PY_EXE=python3
-  - EXTRA_INSTALL="Cython pybind11 numpy mako matplotlib"
+  - EXTRA_INSTALL="Cython pybind11 numpy scipy mako matplotlib"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
   - ". ./prepare-and-run-pylint.sh pytential test/test_*.py"
   tags:
diff --git a/doc/linalg.rst b/doc/linalg.rst
index d5e78a6e288d50419be802bf3ab235dff81c419d..d23eb16bb34d929a1d90fea3ba6e1550c2f2e7b0 100644
--- a/doc/linalg.rst
+++ b/doc/linalg.rst
@@ -4,6 +4,6 @@ Linear Algebra Routines
 Hierarchical Direct Solver
 --------------------------
 
-.. automodule:: pytential.linalg.proxy
+.. automodule:: pytential.linalg.hss
 
 .. vim: sw=4:tw=75
diff --git a/examples/cost.py b/examples/cost.py
index 9809cb2228e0cce79b16d76b13b25359fe01e25b..71c11680484b4c7321273c23d43946633a62cbf8 100644
--- a/examples/cost.py
+++ b/examples/cost.py
@@ -55,8 +55,6 @@ def starfish_lpot_source(queue, n_arms):
             pre_density_discr, OVSMP_FACTOR * TARGET_ORDER,
             **lpot_kwargs)
 
-    lpot_source, _ = lpot_source.with_refinement()
-
     return lpot_source
 
 # }}}
@@ -72,17 +70,16 @@ def test_geometries(queue):
         yield starfish_lpot_source(queue, n_arms)
 
 
-def get_bound_op(lpot_source):
+def get_bound_op(places):
     from sumpy.kernel import LaplaceKernel
-    sigma_sym = sym.var("sigma")
-    k_sym = LaplaceKernel(lpot_source.ambient_dim)
-    op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+    op = sym.S(LaplaceKernel(places.ambient_dim),
+            sym.var("sigma"),
+            qbx_forced_limit=+1)
 
-    return bind(lpot_source, op)
+    return bind(places, op)
 
 
-def get_test_density(queue, lpot_source):
-    density_discr = lpot_source.density_discr
+def get_test_density(queue, density_discr):
     nodes = density_discr.nodes().with_queue(queue)
     sigma = cl.clmath.sin(10 * nodes[0])
 
@@ -100,8 +97,13 @@ def calibrate_cost_model(ctx):
 
     for lpot_source in training_geometries(queue):
         lpot_source = lpot_source.copy(cost_model=cost_model)
-        bound_op = get_bound_op(lpot_source)
-        sigma = get_test_density(queue, lpot_source)
+
+        from pytential import GeometryCollection
+        places = GeometryCollection(lpot_source)
+        density_discr = places.get_discretization(places.auto_source.geometry)
+
+        bound_op = get_bound_op(places)
+        sigma = get_test_density(queue, density_discr)
 
         cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
 
@@ -126,8 +128,13 @@ def test_cost_model(ctx, cost_model):
 
     for lpot_source in test_geometries(queue):
         lpot_source = lpot_source.copy(cost_model=cost_model)
-        bound_op = get_bound_op(lpot_source)
-        sigma = get_test_density(queue, lpot_source)
+
+        from pytential import GeometryCollection
+        places = GeometryCollection(lpot_source)
+        density_discr = places.get_discretization(places.auto_source.geometry)
+
+        bound_op = get_bound_op(places)
+        sigma = get_test_density(queue, density_discr)
 
         cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
         model_result = (
diff --git a/examples/fmm-error.py b/examples/fmm-error.py
index c3350786b6464b3b20b5f9e8d740cf74185e9cf1..a6d19bb150499c121c38994fcaffdd3e1bca9f50 100644
--- a/examples/fmm-error.py
+++ b/examples/fmm-error.py
@@ -24,7 +24,6 @@ def main():
         kernel = HelmholtzKernel(2)
     else:
         kernel = LaplaceKernel(2)
-    #kernel = OneKernel()
 
     mesh = make_curve_mesh(
             #lambda t: ellipse(1, t),
@@ -41,16 +40,24 @@ def main():
             cl_ctx, mesh,
             InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
-    slow_qbx, _ = QBXLayerPotentialSource(
+    unaccel_qbx = QBXLayerPotentialSource(
             pre_density_discr, fine_order=2*target_order,
             qbx_order=qbx_order, fmm_order=False,
             target_association_tolerance=.05
-            ).with_refinement()
-    qbx = slow_qbx.copy(fmm_order=10)
-    density_discr = slow_qbx.density_discr
+            )
 
-    nodes = density_discr.nodes().with_queue(queue)
+    from pytential.target import PointsTarget
+    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600)
 
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "unaccel_qbx": unaccel_qbx,
+        "qbx": unaccel_qbx.copy(fmm_order=10),
+        "targets": PointsTarget(fplot.points)
+        })
+    density_discr = places.get_discretization("unaccel_qbx")
+
+    nodes = density_discr.nodes().with_queue(queue)
     angle = cl.clmath.atan2(nodes[1], nodes[0])
 
     from pytential import bind, sym
@@ -63,21 +70,20 @@ def main():
     if isinstance(kernel, HelmholtzKernel):
         sigma = sigma.astype(np.complex128)
 
-    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600)
-    from pytential.target import PointsTarget
+    fld_in_vol = bind(places, op, auto_where=("unaccel_qbx", "targets"))(
+            queue, sigma=sigma, k=k).get()
 
-    fld_in_vol = bind(
-            (slow_qbx, PointsTarget(fplot.points)),
-            op)(queue, sigma=sigma, k=k).get()
-
-    fmm_fld_in_vol = bind(
-            (qbx, PointsTarget(fplot.points)),
-            op)(queue, sigma=sigma, k=k).get()
+    fmm_fld_in_vol = bind(places, op, auto_where=("qbx", "targets"))(
+            queue, sigma=sigma, k=k).get()
 
     err = fmm_fld_in_vol-fld_in_vol
 
-    import matplotlib
-    matplotlib.use('Agg')
+    try:
+        import matplotlib
+    except ImportError:
+        return
+
+    matplotlib.use("Agg")
     im = fplot.show_scalar_in_matplotlib(np.log10(np.abs(err) + 1e-17))
 
     from matplotlib.colors import Normalize
@@ -89,7 +95,7 @@ def main():
     pt.gca().yaxis.set_major_formatter(NullFormatter())
 
     cb = pt.colorbar(shrink=0.9)
-    cb.set_label(r"$\log_{10}(\mathdefault{Error})$")
+    cb.set_label(r"$\log_{10}(\mathrm{Error})$")
 
     pt.savefig("fmm-error-order-%d.pdf" % qbx_order)
 
diff --git a/examples/helmholtz-dirichlet.py b/examples/helmholtz-dirichlet.py
index 847e5c3fdfec588b6c8d3e10d2b31bb237f74022..75115da4a896638c1a18fcc50ae3345704b801ce 100644
--- a/examples/helmholtz-dirichlet.py
+++ b/examples/helmholtz-dirichlet.py
@@ -7,7 +7,7 @@ from meshmode.discretization import Discretization
 from meshmode.discretization.poly_element import \
         InterpolatoryQuadratureSimplexGroupFactory
 
-from pytential import bind, sym, norm  # noqa
+from pytential import bind, sym
 from pytential.target import PointsTarget
 
 # {{{ set some constants for use below
@@ -23,7 +23,7 @@ k = 3
 # }}}
 
 
-def main():
+def main(mesh_name="ellipse", visualize=False):
     import logging
     logging.basicConfig(level=logging.WARNING)  # INFO for more progress info
 
@@ -33,12 +33,12 @@ def main():
     from meshmode.mesh.generation import ellipse, make_curve_mesh
     from functools import partial
 
-    if 0:
+    if mesh_name == "ellipse":
         mesh = make_curve_mesh(
                 partial(ellipse, 1),
                 np.linspace(0, 1, nelements+1),
                 mesh_order)
-    else:
+    elif mesh_name == "ellipse_array":
         base_mesh = make_curve_mesh(
                 partial(ellipse, 1),
                 np.linspace(0, 1, nelements+1),
@@ -58,11 +58,13 @@ def main():
 
         mesh = merge_disjoint_meshes(meshes, single_group=True)
 
-        if 0:
+        if visualize:
             from meshmode.mesh.visualization import draw_curve
             draw_curve(mesh)
             import matplotlib.pyplot as plt
             plt.show()
+    else:
+        raise ValueError("unknown mesh name: {}".format(mesh_name))
 
     pre_density_discr = Discretization(
             cl_ctx, mesh,
@@ -70,22 +72,31 @@ def main():
 
     from pytential.qbx import (
             QBXLayerPotentialSource, QBXTargetAssociationFailedException)
-    qbx, _ = QBXLayerPotentialSource(
+    qbx = QBXLayerPotentialSource(
             pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
             fmm_order=fmm_order
-            ).with_refinement()
-    density_discr = qbx.density_discr
+            )
+
+    from sumpy.visualization import FieldPlotter
+    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
+    targets = cl.array.to_device(queue, fplot.points)
+
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "qbx": qbx,
+        "qbx_high_target_assoc_tol": qbx.copy(target_association_tolerance=0.05),
+        "targets": PointsTarget(targets)
+        }, auto_where="qbx")
+    density_discr = places.get_discretization("qbx")
 
     # {{{ describe bvp
 
     from sumpy.kernel import LaplaceKernel, HelmholtzKernel
     kernel = HelmholtzKernel(2)
 
-    cse = sym.cse
-
     sigma_sym = sym.var("sigma")
     sqrt_w = sym.sqrt_jac_q_weight(2)
-    inv_sqrt_w_sigma = cse(sigma_sym/sqrt_w)
+    inv_sqrt_w_sigma = sym.cse(sigma_sym/sqrt_w)
 
     # Brakhage-Werner parameter
     alpha = 1j
@@ -94,17 +105,18 @@ def main():
     # +1 for exterior Dirichlet
     loc_sign = +1
 
+    k_sym = sym.var("k")
     bdry_op_sym = (-loc_sign*0.5*sigma_sym
             + sqrt_w*(
-                alpha*sym.S(kernel, inv_sqrt_w_sigma, k=sym.var("k"),
+                alpha*sym.S(kernel, inv_sqrt_w_sigma, k=k_sym,
                     qbx_forced_limit=+1)
-                - sym.D(kernel, inv_sqrt_w_sigma, k=sym.var("k"),
+                - sym.D(kernel, inv_sqrt_w_sigma, k=k_sym,
                     qbx_forced_limit="avg")
                 ))
 
     # }}}
 
-    bound_op = bind(qbx, bdry_op_sym)
+    bound_op = bind(places, bdry_op_sym)
 
     # {{{ fix rhs and solve
 
@@ -118,11 +130,11 @@ def main():
 
     bc = -u_incoming_func(nodes)
 
-    bvp_rhs = bind(qbx, sqrt_w*sym.var("bc"))(queue, bc=bc)
+    bvp_rhs = bind(places, sqrt_w*sym.var("bc"))(queue, bc=bc)
 
     from pytential.solve import gmres
     gmres_result = gmres(
-            bound_op.scipy_op(queue, "sigma", dtype=np.complex128, k=k),
+            bound_op.scipy_op(queue, sigma_sym.name, dtype=np.complex128, k=k),
             bvp_rhs, tol=1e-8, progress=True,
             stall_iterations=0,
             hard_failure=True)
@@ -131,51 +143,36 @@ def main():
 
     # {{{ postprocess/visualize
 
-    sigma = gmres_result.solution
-
-    repr_kwargs = dict(k=sym.var("k"), qbx_forced_limit=None)
+    repr_kwargs = dict(
+            source="qbx_high_target_assoc_tol",
+            target="targets",
+            qbx_forced_limit=None)
     representation_sym = (
-            alpha*sym.S(kernel, inv_sqrt_w_sigma, **repr_kwargs)
-            - sym.D(kernel, inv_sqrt_w_sigma, **repr_kwargs))
-
-    from sumpy.visualization import FieldPlotter
-    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
-
-    targets = cl.array.to_device(queue, fplot.points)
+            alpha*sym.S(kernel, inv_sqrt_w_sigma, k=k_sym, **repr_kwargs)
+            - sym.D(kernel, inv_sqrt_w_sigma, k=k_sym, **repr_kwargs))
 
     u_incoming = u_incoming_func(targets)
-
-    qbx_stick_out = qbx.copy(target_association_tolerance=0.05)
-
     ones_density = density_discr.zeros(queue)
     ones_density.fill(1)
-    indicator = bind(
-            (qbx_stick_out, PointsTarget(targets)),
-            sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=None))(
+
+    indicator = bind(places, sym.D(LaplaceKernel(2), sigma_sym, **repr_kwargs))(
             queue, sigma=ones_density).get()
 
     try:
-        fld_in_vol = bind(
-                (qbx_stick_out, PointsTarget(targets)),
-                representation_sym)(queue, sigma=sigma, k=k).get()
+        fld_in_vol = bind(places, representation_sym)(
+                queue, sigma=gmres_result.solution, k=k).get()
     except QBXTargetAssociationFailedException as e:
-        fplot.write_vtk_file(
-                "failed-targets.vts",
-                [
-                    ("failed", e.failed_target_flags.get(queue))
-                    ]
-                )
+        fplot.write_vtk_file("helmholtz-dirichlet-failed-targets.vts", [
+            ("failed", e.failed_target_flags.get(queue))
+            ])
         raise
 
     #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
-    fplot.write_vtk_file(
-            "potential-helm.vts",
-            [
-                ("potential", fld_in_vol),
-                ("indicator", indicator),
-                ("u_incoming", u_incoming.get()),
-                ]
-            )
+    fplot.write_vtk_file("helmholtz-dirichlet-potential.vts", [
+        ("potential", fld_in_vol),
+        ("indicator", indicator),
+        ("u_incoming", u_incoming.get()),
+        ])
 
     # }}}
 
diff --git a/examples/hss-accuracy-study.py b/examples/hss-accuracy-study.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec7afc2b101a7585fd6158c1d1151b3ab98a09cc
--- /dev/null
+++ b/examples/hss-accuracy-study.py
@@ -0,0 +1,429 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2018-2020 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array   # noqa
+
+from meshmode.mesh.generation import ( # noqa
+        ellipse, NArmedStarfish, generate_urchin, make_curve_mesh)
+
+from pytential import sym
+
+
+class HSSAccuracyTestCase(object):
+    """Parameters of an HSS accuracy study (*ambient_dim* is required)."""
+    # number of elements in the base mesh
+    nelements = 512
+    # element order
+    target_order = 4
+    # oversampling
+    source_ovsmp = 4
+    # qbx expansion order
+    qbx_order = 4
+
+    # representation: single (1) or double (2) -layer
+    lpot_id = 2
+    # wavenumber (!= 0 for Helmholtz)
+    k = 0
+    # exterior (+1) / interior (-1)
+    side = -1
+    # qbx expansion side (should match the problem side)
+    qbx_forced_limit = "avg"
+
+    # matrix discretization location id
+    matrix_discr_stage = sym.QBX_SOURCE_STAGE2
+    # matrix type
+    matrix_type = "qbx"
+    # add weights to P2P matrices, if used
+    weighted_farfield = None
+
+    # number of proxy points
+    proxy_approx_count = None
+    # proxy radius factor
+    proxy_radius_factor = 1.2
+    # max particles in box
+    max_particles_in_box = 64
+    # tree adaptivity
+    tree_kind = "adaptive-level-restricted"
+
+    def __init__(self, queue, **kwargs):
+        self.queue = queue
+        # keyword arguments override the class-level defaults above
+        for name, value in kwargs.items():
+            setattr(self, name, value)
+
+        # kernel
+        from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+        if self.k == 0:
+            self.kernel = LaplaceKernel(self.ambient_dim)
+            self.concrete_knl_kwargs = {}
+            self.sym_knl_kwargs = {}
+        else:
+            self.kernel = HelmholtzKernel(self.ambient_dim)
+            self.concrete_knl_kwargs = {"k": self.k}
+            self.sym_knl_kwargs = {"k": sym.var("k")}
+
+        # block matrix builders
+        from pytential.symbolic.matrix import (MatrixBuilder, P2PMatrixBuilder,
+                FarFieldBlockBuilder, NearFieldBlockBuilder)
+
+        self.mat_builder_kwargs = {}
+        if self.matrix_type == "qbx":
+            self.mat_builder = MatrixBuilder
+            self.nearfield_block_builder = NearFieldBlockBuilder
+            self.farfield_block_builder = FarFieldBlockBuilder
+        elif self.matrix_type == "p2p":
+            self.mat_builder = P2PMatrixBuilder
+            self.nearfield_block_builder = FarFieldBlockBuilder
+            self.farfield_block_builder = FarFieldBlockBuilder
+        else:
+            raise ValueError("unknown matrix type: {}".format(self.matrix_type))
+
+    def get_mesh(self, resolution, mesh_order):
+        if self.ambient_dim == 2:
+            curve_f = NArmedStarfish(5, 0.25)
+            mesh = make_curve_mesh(curve_f,
+                    np.linspace(0.0, 1.0, resolution + 1),
+                    mesh_order)
+        elif self.ambient_dim == 3:
+            mesh = generate_urchin(mesh_order, 1, 2, resolution, min_rad=0.5)
+        else:
+            raise ValueError('unsupported ambient dimension')
+
+        return mesh
+
+    def get_layer_potential(self, cl_context, resolution, mesh_order):
+        from meshmode.discretization import Discretization
+        from meshmode.discretization.poly_element import \
+                InterpolatoryQuadratureSimplexGroupFactory as GroupFactory
+        mesh = self.get_mesh(resolution, mesh_order)
+        discr = Discretization(
+                cl_context, mesh, GroupFactory(self.target_order))
+
+        from pytential.qbx import QBXLayerPotentialSource
+        qbx = QBXLayerPotentialSource(discr,
+                fine_order=self.source_ovsmp * self.target_order,
+                qbx_order=self.qbx_order,
+                fmm_order=False)
+
+        return qbx
+
+    def get_symbolic_operator(self):
+        lpot_kwargs = {"qbx_forced_limit": self.qbx_forced_limit}
+        # NOTE(review): sym_knl_kwargs is never used; confirm concrete values here
+        lpot_kwargs.update(self.concrete_knl_kwargs)
+        density_sym = sym.var("u")
+        if self.lpot_id == 1:
+            op_sym = sym.S(self.kernel, density_sym, **lpot_kwargs)
+        elif self.lpot_id == 2:
+            op_sym = sym.D(self.kernel, density_sym, **lpot_kwargs)
+
+            if self.qbx_forced_limit == "avg":
+                op_sym = 0.5 * self.side * density_sym + op_sym
+        else:
+            raise ValueError("Unknown lpot_id: {}".format(self.lpot_id))
+
+        return density_sym, op_sym
+
+    def get_partition(self, queue, places, dofdesc):
+        from pytential.linalg.hss import partition_by_nodes
+        discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
+        indices, partition = partition_by_nodes(queue, discr,
+                tree_kind=self.tree_kind,
+                max_particles_in_box=self.max_particles_in_box)
+        return indices, partition
+
+
+def compute_matrix_error(queue, mat, hss):
+    r"""
+    :return: a tuple ``(err_l, err_r, err_f)`` of the left, right and full
+        matrix compression errors. Each is measured in the Frobenius norm
+        and divided by the norm of the uncompressed block (or matrix):
+
+        .. math::
+
+            \begin{aligned}
+            \epsilon_l = \max_{i \ne j} \|A_{ij} - L_{ii} B_{ij}\|_F, \\
+            \epsilon_r = \max_{i \ne j} \|A_{ij} - B_{ij} R_{jj}\|_F, \\
+            \epsilon_f = \|A - (L B R + D)\|_F.
+            \end{aligned}
+    """
+    from itertools import product
+
+    L = hss.L
+    R = hss.R
+    D = hss.D
+
+    sklindices = hss.sklindices
+    indices = hss.indices
+
+    # compute max relative error over all off-diagonal blocks
+    err_l = -np.inf
+    err_r = -np.inf
+    nblocks = indices.nblocks
+    for i, j in product(range(nblocks), repeat=2):
+        if i == j:
+            continue
+
+        ftgt = indices.row.block_indices(i)
+        fsrc = indices.col.block_indices(j)
+
+        stgt = sklindices.row.block_indices(i)
+        ssrc = sklindices.col.block_indices(j)
+
+        blk = mat[np.ix_(ftgt, fsrc)]
+        lblk = mat[np.ix_(stgt, fsrc)]
+        rblk = mat[np.ix_(ftgt, ssrc)]
+
+        err_l = max(err_l, la.norm(blk - L[i, i].dot(lblk)) / la.norm(blk))
+        err_r = max(err_r, la.norm(blk - rblk.dot(R[j, j])) / la.norm(blk))
+
+    # compute full matrix error (np.object was removed in NumPy 1.24)
+    S = np.full((nblocks, nblocks), 0, dtype=object)
+    for i, j in product(range(nblocks), repeat=2):
+        if i == j:
+            continue
+
+        itgt = sklindices.row.block_indices(i)
+        isrc = sklindices.col.block_indices(j)
+        S[i, j] = mat[np.ix_(itgt, isrc)]
+
+    from pytential.symbolic.execution import _bmat
+    L = _bmat(L, dtype=mat.dtype)
+    S = _bmat(S, dtype=mat.dtype)
+    R = _bmat(R, dtype=mat.dtype)
+    D = _bmat(D, dtype=mat.dtype)
+    mat = mat[np.ix_(indices.row.indices, indices.col.indices)]
+
+    assert D.shape == mat.shape
+    assert L.shape == (mat.shape[0], S.shape[0])
+    assert R.shape == (S.shape[1], mat.shape[1])
+    err_f = la.norm(mat - L.dot(S.dot(R)) - D) / la.norm(mat)
+
+    return err_l, err_r, err_f
+
+
+# {{{ matrix reconstruction accuracy
+
+def matrix_reconstruction_accuracy(ctx_factory,
+        ambient_dim=2, visualize=True, **kwargs):
+    """Report (and optionally plot) compression error vs. requested ID tolerance."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    case = HSSAccuracyTestCase(queue, ambient_dim=ambient_dim, **kwargs)
+
+    # {{{ construct test case
+
+    qbx = case.get_layer_potential(ctx, case.nelements, case.target_order)
+
+    from pytential import GeometryCollection
+    dd = sym.DOFDescriptor("hss", discr_stage=case.matrix_discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_source = places.get_geometry(dd.geometry)
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    indices, partition = case.get_partition(queue, places, dd)
+
+    from pytential.symbolic.execution import _prepare_expr
+    density_sym, op_sym = case.get_symbolic_operator()
+    prep_op_sym = _prepare_expr(places, op_sym, auto_where=dd)
+
+    builder = case.mat_builder(queue,
+            dep_expr=density_sym,
+            other_dep_exprs=[],
+            dep_source=dep_source,
+            dep_discr=dep_discr,
+            places=places,
+            context=case.concrete_knl_kwargs,
+            **case.mat_builder_kwargs,
+            )
+    mat = builder(prep_op_sym)  # reference matrix the compression is compared to
+
+    # }}}
+
+    # {{{ compute errors
+
+    id_eps_array = 10.0 ** np.arange(-2, -17, -1)  # 1e-2 down to 1e-16
+    err_lbi = np.empty(id_eps_array.shape)
+    err_ibr = np.empty(id_eps_array.shape)
+    err_lbr = np.empty(id_eps_array.shape)
+
+    nelements = np.sum([g.nelements for g in dep_discr.groups])
+    print("levels: {}".format(partition.nlevels))
+    print("blocks: {}".format(indices.nblocks))
+    print("nelems: {}".format(nelements))
+    print("nnodes: {}".format(dep_discr.nnodes))
+
+    from pytential.linalg.hss import build_compressed_matrix
+    for i, id_eps in enumerate(id_eps_array):
+        hss = build_compressed_matrix(queue,
+                places, op_sym, density_sym,
+                auto_where=dd,
+                id_eps=id_eps,
+                context=case.concrete_knl_kwargs,
+
+                _proxy_radius_factor=case.proxy_radius_factor,
+                _proxy_approx_count=case.proxy_approx_count,
+                _tree_max_particles_in_box=case.max_particles_in_box,
+                _tree_kind=case.tree_kind,
+                _weighted_farfield=case.weighted_farfield,
+                _farfield_block_builder=case.farfield_block_builder,
+                _nearfield_block_builder=case.nearfield_block_builder)
+
+        err = compute_matrix_error(queue, mat, hss.levels[hss.nlevels - 1])
+
+        err_lbi[i] = err[0]
+        err_ibr[i] = err[1]
+        err_lbr[i] = err[2]
+
+        if i == 0:
+            D = hss.levels[hss.nlevels - 1].D
+            blocks = [D[k, k].shape[0] for k in range(D.shape[0])]
+            print("L1 sizes (min/mean/max): ",
+                    [np.min(blocks), int(np.mean(blocks)), np.max(blocks)])
+            print("        {:^11} {:^11} {:^11}".format('L', 'R', 'F'))
+
+        print('{:.1e} {:.5e} {:.5e} {:.5e}'.format(id_eps, *err))
+
+    if visualize:
+        import matplotlib.pyplot as pt
+
+        pt.figure(figsize=(10, 10), dpi=300)
+        pt.loglog(id_eps_array, err_lbi, "^-", label="Left")
+        pt.loglog(id_eps_array, err_ibr, "v-", label="Right")
+        pt.loglog(id_eps_array, err_lbr, "o-", label="Full")
+        pt.loglog(id_eps_array, id_eps_array, 'k--')  # reference: error == tolerance
+        pt.xlabel(r"$\epsilon_{ID}$")
+        pt.ylabel("$error$")
+        pt.legend()
+        pt.savefig("hss_matrix_reconstruction_accuracy.png")
+
+# }}}
+
+
+# {{{ residual accuracy
+
+def residual_accuracy(ctx_factory,
+        ambient_dim=2, visualize=True, **kwargs):
+    """Report (and plot) residual and solution error vs. the ID tolerance."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    case = HSSAccuracyTestCase(queue, ambient_dim=ambient_dim, **kwargs)
+
+    # {{{ construct test case
+
+    qbx = case.get_layer_potential(ctx, case.nelements, case.target_order)
+
+    from pytential import GeometryCollection
+    dd = sym.DOFDescriptor("hss", discr_stage=case.matrix_discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_source = places.get_geometry(dd.geometry)
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    indices, partition = case.get_partition(queue, places, dd)
+
+    from pytential.symbolic.execution import _prepare_expr
+    density_sym, op_sym = case.get_symbolic_operator()
+    prep_op_sym = _prepare_expr(places, op_sym, auto_where=dd)
+
+    builder = case.mat_builder(queue,
+            dep_expr=density_sym,
+            other_dep_exprs=[],
+            dep_source=dep_source,
+            dep_discr=dep_discr,
+            places=places,
+            context=case.concrete_knl_kwargs,
+            **case.mat_builder_kwargs,
+            )
+    mat = builder(prep_op_sym)
+
+    # }}}
+
+    # {{{ setup solution
+
+    # manufactured solution: sine of the polar angle of each node; b = A x
+    nodes = dep_discr.nodes().get(queue)
+    x = np.sin(np.angle(nodes[0, :] + 1.0j * nodes[1, :]))
+    b = mat.dot(x)
+
+    # desired tolerance array
+    id_eps_array = 10.0 ** np.arange(-2, -17, -1)
+    err_res = np.empty(id_eps_array.shape)
+    err_sol = np.empty(id_eps_array.shape)
+
+    kappa = la.cond(mat)
+    print("blocks: {}".format(indices.nblocks))
+    print("nnodes: {}".format(dep_discr.nnodes))
+    print("levels: {}".format(partition.nlevels))
+    print("cond:   {}".format(kappa))
+
+    print("        {:^11} {:^11}".format('R', 'S'))
+    # NOTE(review): the sibling study passes these options "_"-prefixed; confirm
+    from pytential.linalg.hss import build_compressed_matrix
+    for i, id_eps in enumerate(id_eps_array):
+        hss = build_compressed_matrix(queue,
+                places, op_sym, density_sym,
+                auto_where=dd, matrix_mode="backward",
+                id_eps=id_eps,
+                context=case.concrete_knl_kwargs,
+                proxy_radius_factor=case.proxy_radius_factor,
+                proxy_approx_count=case.proxy_approx_count,
+                tree_kind=case.tree_kind,
+                weighted_farfield=case.weighted_farfield,
+                tree_max_particles_in_box=case.max_particles_in_box,
+                farfield_block_builder=case.farfield_block_builder,
+                nearfield_block_builder=case.nearfield_block_builder)
+        xhat = hss.matvec(b)
+
+        err_res[i] = la.norm(mat.dot(xhat) - b) / la.norm(b)
+        err_sol[i] = la.norm(xhat - x) / la.norm(x)
+
+        print('{:.1e} {:.5e} {:.5e}'.format(id_eps, err_res[i], err_sol[i]))
+
+    if visualize:
+        import matplotlib.pyplot as pt
+
+        pt.figure(figsize=(10, 10), dpi=300)
+        pt.loglog(id_eps_array, err_res, "^-", label="Residual")
+        pt.loglog(id_eps_array, err_sol, "v-", label="Solution")
+        pt.loglog(id_eps_array, id_eps_array, 'k--')
+        pt.loglog(id_eps_array, kappa * id_eps_array, 'k--')
+        pt.xlabel(r"$\epsilon_{ID}$")
+        pt.ylabel("$error$")
+        pt.legend()
+        pt.savefig("hss_residual_accuracy.png")
+
+
+# }}}
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])  # run the statement given as the first CLI argument
+    else:
+        matrix_reconstruction_accuracy(cl._csc, ambient_dim=2)
+        residual_accuracy(cl._csc, ambient_dim=2)
diff --git a/examples/laplace-dirichlet-3d.py b/examples/laplace-dirichlet-3d.py
index db93fadff90cc35b5eed01cdb5ce1ae7180c769c..984f1de10694a493e1b708f35118897c21a25cc5 100644
--- a/examples/laplace-dirichlet-3d.py
+++ b/examples/laplace-dirichlet-3d.py
@@ -7,7 +7,7 @@ from meshmode.discretization import Discretization
 from meshmode.discretization.poly_element import \
         InterpolatoryQuadratureSimplexGroupFactory
 
-from pytential import bind, sym, norm  # noqa
+from pytential import bind, sym
 from pytential.target import PointsTarget
 
 # {{{ set some constants for use below
@@ -22,18 +22,18 @@ fmm_order = 3
 # }}}
 
 
-def main():
+def main(mesh_name="torus", visualize=False):
     import logging
     logging.basicConfig(level=logging.WARNING)  # INFO for more progress info
 
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    from meshmode.mesh.generation import generate_torus
+    if mesh_name == "torus":
+        rout = 10
+        rin = 1
 
-    rout = 10
-    rin = 1
-    if 1:
+        from meshmode.mesh.generation import generate_torus
         base_mesh = generate_torus(
                 rout, rin, 40, 4,
                 mesh_order)
@@ -52,11 +52,13 @@ def main():
 
         mesh = merge_disjoint_meshes(meshes, single_group=True)
 
-        if 0:
+        if visualize:
             from meshmode.mesh.visualization import draw_curve
             draw_curve(mesh)
             import matplotlib.pyplot as plt
             plt.show()
+    else:
+        raise ValueError("unknown mesh name: {}".format(mesh_name))
 
     pre_density_discr = Discretization(
             cl_ctx, mesh,
@@ -64,23 +66,32 @@ def main():
 
     from pytential.qbx import (
             QBXLayerPotentialSource, QBXTargetAssociationFailedException)
-    qbx, _ = QBXLayerPotentialSource(
+    qbx = QBXLayerPotentialSource(
             pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
             fmm_order=fmm_order,
-            ).with_refinement()
-    density_discr = qbx.density_discr
+            )
+
+    from sumpy.visualization import FieldPlotter
+    fplot = FieldPlotter(np.zeros(3), extent=20, npoints=50)
+    targets = cl.array.to_device(queue, fplot.points)
+
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "qbx": qbx,
+        "qbx_target_assoc": qbx.copy(target_association_tolerance=0.2),
+        "targets": PointsTarget(targets)
+        }, auto_where="qbx")
+    density_discr = places.get_discretization("qbx")
 
     # {{{ describe bvp
 
     from sumpy.kernel import LaplaceKernel
     kernel = LaplaceKernel(3)
 
-    cse = sym.cse
-
     sigma_sym = sym.var("sigma")
     #sqrt_w = sym.sqrt_jac_q_weight(3)
     sqrt_w = 1
-    inv_sqrt_w_sigma = cse(sigma_sym/sqrt_w)
+    inv_sqrt_w_sigma = sym.cse(sigma_sym/sqrt_w)
 
     # -1 for interior Dirichlet
     # +1 for exterior Dirichlet
@@ -88,13 +99,13 @@ def main():
 
     bdry_op_sym = (loc_sign*0.5*sigma_sym
             + sqrt_w*(
-                sym.S(kernel, inv_sqrt_w_sigma)
-                + sym.D(kernel, inv_sqrt_w_sigma)
+                sym.S(kernel, inv_sqrt_w_sigma, qbx_forced_limit=+1)
+                + sym.D(kernel, inv_sqrt_w_sigma, qbx_forced_limit="avg")
                 ))
 
     # }}}
 
-    bound_op = bind(qbx, bdry_op_sym)
+    bound_op = bind(places, bdry_op_sym)
 
     # {{{ fix rhs and solve
 
@@ -109,7 +120,7 @@ def main():
 
     bc = cl.array.to_device(queue, u_incoming_func(nodes))
 
-    bvp_rhs = bind(qbx, sqrt_w*sym.var("bc"))(queue, bc=bc)
+    bvp_rhs = bind(places, sqrt_w*sym.var("bc"))(queue, bc=bc)
 
     from pytential.solve import gmres
     gmres_result = gmres(
@@ -118,7 +129,8 @@ def main():
             stall_iterations=0,
             hard_failure=True)
 
-    sigma = bind(qbx, sym.var("sigma")/sqrt_w)(queue, sigma=gmres_result.solution)
+    sigma = bind(places, sym.var("sigma")/sqrt_w)(
+            queue, sigma=gmres_result.solution)
 
     # }}}
 
@@ -130,38 +142,27 @@ def main():
 
     # {{{ postprocess/visualize
 
-    repr_kwargs = dict(qbx_forced_limit=None)
+    repr_kwargs = dict(
+            source="qbx_target_assoc",
+            target="targets",
+            qbx_forced_limit=None)
     representation_sym = (
             sym.S(kernel, inv_sqrt_w_sigma, **repr_kwargs)
             + sym.D(kernel, inv_sqrt_w_sigma, **repr_kwargs))
 
-    from sumpy.visualization import FieldPlotter
-    fplot = FieldPlotter(np.zeros(3), extent=20, npoints=50)
-
-    targets = cl.array.to_device(queue, fplot.points)
-
-    qbx_stick_out = qbx.copy(target_stick_out_factor=0.2)
-
     try:
-        fld_in_vol = bind(
-                (qbx_stick_out, PointsTarget(targets)),
-                representation_sym)(queue, sigma=sigma).get()
+        fld_in_vol = bind(places, representation_sym)(
+                queue, sigma=sigma).get()
     except QBXTargetAssociationFailedException as e:
-        fplot.write_vtk_file(
-                "failed-targets.vts",
-                [
-                    ("failed", e.failed_target_flags.get(queue))
-                    ]
-                )
+        fplot.write_vtk_file("laplace-dirichlet-3d-failed-targets.vts", [
+            ("failed", e.failed_target_flags.get(queue)),
+            ])
         raise
 
     #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
-    fplot.write_vtk_file(
-            "potential-laplace-3d.vts",
-            [
-                ("potential", fld_in_vol),
-                ]
-            )
+    fplot.write_vtk_file("laplace-dirichlet-3d-potential.vts", [
+        ("potential", fld_in_vol),
+        ])
 
     # }}}
 
diff --git a/examples/layerpot-3d.py b/examples/layerpot-3d.py
index 28f0967e8aec28332902a128d0fb1efafb100d4e..ecace75de9c67dfdb4899f688737aeda14b05fc8 100644
--- a/examples/layerpot-3d.py
+++ b/examples/layerpot-3d.py
@@ -1,45 +1,42 @@
 from __future__ import division
+
 import numpy as np
 import pyopencl as cl
+
 from sumpy.visualization import FieldPlotter
-#from mayavi import mlab
 from sumpy.kernel import one_kernel_2d, LaplaceKernel, HelmholtzKernel  # noqa
 
-import faulthandler
+from pytential import bind, sym
 from six.moves import range
-faulthandler.enable()
-
-cl_ctx = cl.create_some_context()
-queue = cl.CommandQueue(cl_ctx)
 
 target_order = 5
 qbx_order = 3
 mode_nr = 4
-
-if 1:
-    cad_file_name = "geometries/ellipsoid.step"
-    h = 0.6
-else:
-    cad_file_name = "geometries/two-cylinders-smooth.step"
-    h = 0.4
-
 k = 0
-if k:
-    kernel = HelmholtzKernel(3)
-else:
-    kernel = LaplaceKernel(3)
-#kernel = OneKernel()
 
 
-def main():
+def main(mesh_name="ellipsoid"):
     import logging
     logger = logging.getLogger(__name__)
     logging.basicConfig(level=logging.WARNING)  # INFO for more progress info
 
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    if mesh_name == "ellipsoid":
+        cad_file_name = "geometries/ellipsoid.step"
+        h = 0.6
+    elif mesh_name == "two-cylinders":
+        cad_file_name = "geometries/two-cylinders-smooth.step"
+        h = 0.4
+    else:
+        raise ValueError("unknown mesh name: %s" % mesh_name)
+
     from meshmode.mesh.io import generate_gmsh, FileSource
     mesh = generate_gmsh(
             FileSource(cad_file_name), 2, order=2,
-            other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h])
+            other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h],
+            target_unit="MM")
 
     from meshmode.mesh.processing import perform_flips
     # Flip elements--gmsh generates inside-out geometry.
@@ -60,15 +57,28 @@ def main():
     density_discr = Discretization(
             cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
-    qbx, _ = QBXLayerPotentialSource(density_discr, 4*target_order, qbx_order,
+    qbx = QBXLayerPotentialSource(density_discr, 4*target_order, qbx_order,
             fmm_order=qbx_order + 3,
-            target_association_tolerance=0.15).with_refinement()
+            target_association_tolerance=0.15)
 
-    nodes = density_discr.nodes().with_queue(queue)
+    from pytential.target import PointsTarget
+    fplot = FieldPlotter(bbox_center, extent=3.5*bbox_size, npoints=150)
+
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "qbx": qbx,
+        "targets": PointsTarget(fplot.points)
+        }, auto_where="qbx")
+    density_discr = places.get_discretization("qbx")
 
+    nodes = density_discr.nodes().with_queue(queue)
     angle = cl.clmath.atan2(nodes[1], nodes[0])
 
-    from pytential import bind, sym
+    if k:
+        kernel = HelmholtzKernel(3)
+    else:
+        kernel = LaplaceKernel(3)
+
     #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None))
     op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None)
     #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)
@@ -83,29 +93,21 @@ def main():
     if isinstance(kernel, HelmholtzKernel):
         sigma = sigma.astype(np.complex128)
 
-    fplot = FieldPlotter(bbox_center, extent=3.5*bbox_size, npoints=150)
-
-    from pytential.target import PointsTarget
-    fld_in_vol = bind(
-            (qbx, PointsTarget(fplot.points)),
-            op)(queue, sigma=sigma, k=k).get()
+    fld_in_vol = bind(places, op, auto_where=("qbx", "targets"))(
+            queue, sigma=sigma, k=k).get()
 
     #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
-    fplot.write_vtk_file(
-            "potential-3d.vts",
-            [
-                ("potential", fld_in_vol)
-                ]
-            )
-
-    bdry_normals = bind(
-            density_discr,
+    fplot.write_vtk_file("layerpot-3d-potential.vts", [
+        ("potential", fld_in_vol)
+        ])
+
+    bdry_normals = bind(places,
             sym.normal(density_discr.ambient_dim))(queue).as_vector(dtype=object)
 
     from meshmode.discretization.visualization import make_visualizer
     bdry_vis = make_visualizer(queue, density_discr, target_order)
 
-    bdry_vis.write_vtk_file("source-3d.vtu", [
+    bdry_vis.write_vtk_file("layerpot-3d-density.vtu", [
         ("sigma", sigma),
         ("bdry_normals", bdry_normals),
         ])
diff --git a/examples/layerpot.py b/examples/layerpot.py
index 7b4737da00d6d1d1cc76e31f39932fc7c12783e8..e01a24eb8ffbc15e232839e7015a2f5feeae35a5 100644
--- a/examples/layerpot.py
+++ b/examples/layerpot.py
@@ -10,10 +10,9 @@ from sumpy.visualization import FieldPlotter
 from sumpy.kernel import one_kernel_2d, LaplaceKernel, HelmholtzKernel  # noqa
 
 from pytential import bind, sym
-
-import faulthandler
 from six.moves import range
-faulthandler.enable()
+
+from meshmode.mesh.generation import starfish, ellipse, drop # noqa
 
 target_order = 16
 qbx_order = 3
@@ -21,27 +20,18 @@ nelements = 60
 mode_nr = 3
 
 k = 0
-if k:
-    kernel = HelmholtzKernel(2)
-    kernel_kwargs = {"k": sym.var("k")}
-else:
-    kernel = LaplaceKernel(2)
-    kernel_kwargs = {}
-#kernel = OneKernel()
 
 
-def main():
+def main(curve_fn=starfish, visualize=True):
     import logging
     logging.basicConfig(level=logging.WARNING)  # INFO for more progress info
 
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    from meshmode.mesh.generation import (  # noqa
-            make_curve_mesh, starfish, ellipse, drop)
+    from meshmode.mesh.generation import make_curve_mesh
     mesh = make_curve_mesh(
-            #lambda t: ellipse(1, t),
-            starfish,
+            curve_fn,
             np.linspace(0, 1, nelements+1),
             target_order)
 
@@ -53,16 +43,31 @@ def main():
     pre_density_discr = Discretization(
             cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
-    qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4*target_order, qbx_order,
+    qbx = QBXLayerPotentialSource(pre_density_discr, 4*target_order, qbx_order,
             fmm_order=qbx_order+3,
-            target_association_tolerance=0.005).with_refinement()
+            target_association_tolerance=0.005)
 
-    density_discr = qbx.density_discr
+    from pytential.target import PointsTarget
+    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000)
+    targets_dev = cl.array.to_device(queue, fplot.points)
 
-    nodes = density_discr.nodes().with_queue(queue)
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "qbx": qbx,
+        "targets": PointsTarget(targets_dev),
+        }, auto_where="qbx")
+    density_discr = places.get_discretization("qbx")
 
+    nodes = density_discr.nodes().with_queue(queue)
     angle = cl.clmath.atan2(nodes[1], nodes[0])
 
+    if k:
+        kernel = HelmholtzKernel(2)
+        kernel_kwargs = {"k": sym.var("k")}
+    else:
+        kernel = LaplaceKernel(2)
+        kernel_kwargs = {}
+
     def op(**kwargs):
         kwargs.update(kernel_kwargs)
 
@@ -80,26 +85,19 @@ def main():
     if isinstance(kernel, HelmholtzKernel):
         sigma = sigma.astype(np.complex128)
 
-    bound_bdry_op = bind(qbx, op())
-    #mlab.figure(bgcolor=(1, 1, 1))
-    if 1:
-        fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000)
-        from pytential.target import PointsTarget
-
-        targets_dev = cl.array.to_device(queue, fplot.points)
-        fld_in_vol = bind(
-                (qbx, PointsTarget(targets_dev)),
-                op(qbx_forced_limit=None))(queue, sigma=sigma, k=k).get()
+    bound_bdry_op = bind(places, op())
+    if visualize:
+        fld_in_vol = bind(places, op(
+            source="qbx",
+            target="targets",
+            qbx_forced_limit=None))(queue, sigma=sigma, k=k).get()
 
         if enable_mayavi:
             fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
         else:
-            fplot.write_vtk_file(
-                    "potential-2d.vts",
-                    [
-                        ("potential", fld_in_vol)
-                        ]
-                    )
+            fplot.write_vtk_file("layerpot-potential.vts", [
+                ("potential", fld_in_vol)
+                ])
 
     if 0:
         def apply_op(density):
diff --git a/examples/scaling-study.py b/examples/scaling-study.py
index 3327e3c8c6ce71262018551008a203a04d68e70b..21a85019ff03214f314265999598ed000350e3d5 100644
--- a/examples/scaling-study.py
+++ b/examples/scaling-study.py
@@ -6,7 +6,7 @@ from meshmode.discretization import Discretization
 from meshmode.discretization.poly_element import \
         InterpolatoryQuadratureSimplexGroupFactory
 
-from pytential import bind, sym, norm  # noqa
+from pytential import bind, sym
 from pytential.target import PointsTarget
 
 # {{{ set some constants for use below
@@ -22,7 +22,7 @@ k = 0
 # }}}
 
 
-def make_mesh(nx, ny):
+def make_mesh(nx, ny, visualize=False):
     from meshmode.mesh.generation import ellipse, make_curve_mesh
     from functools import partial
 
@@ -43,7 +43,7 @@ def make_mesh(nx, ny):
 
     mesh = merge_disjoint_meshes(meshes, single_group=True)
 
-    if 0:
+    if visualize:
         from meshmode.mesh.visualization import draw_curve
         draw_curve(mesh)
         import matplotlib.pyplot as plt
@@ -52,14 +52,14 @@ def make_mesh(nx, ny):
     return mesh
 
 
-def timing_run(nx, ny):
+def timing_run(nx, ny, visualize=False):
     import logging
     logging.basicConfig(level=logging.WARNING)  # INFO for more progress info
 
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    mesh = make_mesh(nx=nx, ny=ny)
+    mesh = make_mesh(nx=nx, ny=ny, visualize=visualize)
 
     density_discr = Discretization(
             cl_ctx, mesh,
@@ -72,16 +72,33 @@ def timing_run(nx, ny):
             fmm_order=fmm_order
             )
 
+    places = {"qbx": qbx}
+    if visualize:
+        from sumpy.visualization import FieldPlotter
+        fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1500)
+        targets = PointsTarget(cl.array.to_device(queue, fplot.points))
+
+        places.update({
+            "plot-targets": targets,
+            "qbx-indicator": qbx.copy(
+                target_association_tolerance=0.05,
+                fmm_level_to_order=lambda lev: 7,
+                qbx_order=2),
+            "qbx-target-assoc": qbx.copy(target_association_tolerance=0.1)
+            })
+
+    from pytential import GeometryCollection
+    places = GeometryCollection(places, auto_where="qbx")
+    density_discr = places.get_discretization("qbx")
+
     # {{{ describe bvp
 
     from sumpy.kernel import HelmholtzKernel
     kernel = HelmholtzKernel(2)
 
-    cse = sym.cse
-
     sigma_sym = sym.var("sigma")
     sqrt_w = sym.sqrt_jac_q_weight(2)
-    inv_sqrt_w_sigma = cse(sigma_sym/sqrt_w)
+    inv_sqrt_w_sigma = sym.cse(sigma_sym/sqrt_w)
 
     # Brakhage-Werner parameter
     alpha = 1j
@@ -90,15 +107,14 @@ def timing_run(nx, ny):
     # +1 for exterior Dirichlet
     loc_sign = +1
 
-    bdry_op_sym = (-loc_sign*0.5*sigma_sym
-            + sqrt_w*(
-                alpha*sym.S(kernel, inv_sqrt_w_sigma, k=sym.var("k"))
-                - sym.D(kernel, inv_sqrt_w_sigma, k=sym.var("k"))
-                ))
+    k_sym = sym.var("k")
+    S_sym = sym.S(kernel, inv_sqrt_w_sigma, k=k_sym, qbx_forced_limit=+1)
+    D_sym = sym.D(kernel, inv_sqrt_w_sigma, k=k_sym, qbx_forced_limit="avg")
+    bdry_op_sym = -loc_sign*0.5*sigma_sym + sqrt_w*(alpha*S_sym + D_sym)
 
     # }}}
 
-    bound_op = bind(qbx, bdry_op_sym)
+    bound_op = bind(places, bdry_op_sym)
 
     # {{{ fix rhs and solve
 
@@ -115,74 +131,54 @@ def timing_run(nx, ny):
     repr_kwargs = dict(k=sym.var("k"), qbx_forced_limit=+1)
 
     sym_op = sym.S(kernel, sym.var("sigma"), **repr_kwargs)
-    bound_op = bind(qbx, sym_op)
+    bound_op = bind(places, sym_op)
 
-    print("FMM WARM-UP RUN 1: %d elements" % mesh.nelements)
+    print("FMM WARM-UP RUN 1: %5d elements" % mesh.nelements)
     bound_op(queue, sigma=sigma, k=k)
-    print("FMM WARM-UP RUN 2: %d elements" % mesh.nelements)
+    queue.finish()
+
+    print("FMM WARM-UP RUN 2: %5d elements" % mesh.nelements)
     bound_op(queue, sigma=sigma, k=k)
     queue.finish()
-    print("FMM TIMING RUN: %d elements" % mesh.nelements)
 
     from time import time
     t_start = time()
-
     bound_op(queue, sigma=sigma, k=k)
     queue.finish()
-    elapsed = time()-t_start
+    elapsed = time() - t_start
 
-    print("FMM TIMING RUN DONE: %d elements -> %g s"
+    print("FMM TIMING RUN:    %5d elements -> %g s"
             % (mesh.nelements, elapsed))
 
-    return (mesh.nelements, elapsed)
-
-    if 0:
-        from sumpy.visualization import FieldPlotter
-        fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1500)
-
-        targets = cl.array.to_device(queue, fplot.points)
-
-        qbx_tgt_tol = qbx.copy(target_association_tolerance=0.05)
-
-        indicator_qbx = qbx_tgt_tol.copy(
-                fmm_level_to_order=lambda lev: 7, qbx_order=2)
-
+    if visualize:
         ones_density = density_discr.zeros(queue)
         ones_density.fill(1)
-        indicator = bind(
-                (indicator_qbx, PointsTarget(targets)),
-                sym_op)(
+        indicator = bind(places, sym_op,
+                auto_where=("qbx-indicator", "plot-targets"))(
                 queue, sigma=ones_density).get()
 
-        qbx_stick_out = qbx.copy(target_stick_out_factor=0.1)
         try:
-            fld_in_vol = bind(
-                    (qbx_stick_out, PointsTarget(targets)),
-                    sym_op)(queue, sigma=sigma, k=k).get()
+            fld_in_vol = bind(places, sym_op,
+                    auto_where=("qbx-target-assoc", "plot-targets"))(
+                    queue, sigma=sigma, k=k).get()
         except QBXTargetAssociationFailedException as e:
-            fplot.write_vtk_file(
-                    "failed-targets.vts",
-                    [
-                        ("failed", e.failed_target_flags.get(queue))
-                        ]
-                    )
+            fplot.write_vtk_file("scaling-study-failed-targets.vts", [
+                ("failed", e.failed_target_flags.get(queue)),
+                ])
             raise
 
-        #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
-        fplot.write_vtk_file(
-                "potential-scaling.vts",
-                [
-                    ("potential", fld_in_vol),
-                    ("indicator", indicator)
-                    ]
-                )
+        fplot.write_vtk_file("scaling-study-potential.vts", [
+            ("potential", fld_in_vol),
+            ("indicator", indicator),
+            ])
+
+    return (mesh.nelements, elapsed)
 
     # }}}
 
 
 if __name__ == "__main__":
-    results = []
-    for nx, ny in [
+    grid_sizes = [
             (3, 3),
             (3, 4),
             (4, 4),
@@ -198,9 +194,14 @@ if __name__ == "__main__":
             (9, 9),
             (9, 10),
             (10, 10),
-            ]:
+            ]
 
-        results.append(timing_run(nx, ny))
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
 
-    for r in results:
-        print(r)
+    for nx, ny in grid_sizes:
+        npoints, t_elapsed = timing_run(nx, ny)
+        eoc.add_data_point(npoints, t_elapsed)
+    print(eoc.pretty_print(
+        abscissa_label="Elements",
+        error_label="Timing (s)"))
diff --git a/pytential/__init__.py b/pytential/__init__.py
index d28e8bdbcc8be2377ee575edeeb8e2b6ce0fb6e7..728ce196fd33a55566ea9f21e920e2b59332453b 100644
--- a/pytential/__init__.py
+++ b/pytential/__init__.py
@@ -25,8 +25,8 @@ THE SOFTWARE.
 import numpy as np
 
 import pytential.symbolic.primitives as sym
-from pytential.symbolic.execution import GeometryCollection # noqa
 from pytential.symbolic.execution import bind
+from pytential.symbolic.execution import GeometryCollection
 
 from pytools import memoize_on_first_arg
 
@@ -123,4 +123,4 @@ def norm(discr, queue, x, p=2):
         raise ValueError("unsupported norm order: %s" % p)
 
 
-__all__ = ["sym", "bind"]
+__all__ = ["sym", "bind", "GeometryCollection"]
diff --git a/pytential/linalg/hss.py b/pytential/linalg/hss.py
new file mode 100644
index 0000000000000000000000000000000000000000..94897f59d90082f9f01119d492252538abed896f
--- /dev/null
+++ b/pytential/linalg/hss.py
@@ -0,0 +1,1432 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018-2020 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+
+from pytools import memoize_method, Record
+from pytools.obj_array import is_obj_array, make_obj_array
+
+from pytential.symbolic.mappers import IdentityMapper
+from sumpy.tools import MatrixBlockIndexRanges, BlockIndexRanges
+from boxtree.tools import DeviceDataRecord
+
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+
+
+__doc__ = """
+Hierarchical Matrix Compression
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: build_compressed_matrix
+
+"""
+
+
+# {{{ helpers
+
+def _interp_decomp(A, rank, eps):
+    """Wrapper for :func:`~scipy.linalg.interpolative.interp_decomp` that
+    always has the same output signature.
+
+    :return: a tuple ``(k, idx, interp)`` containing the numerical rank,
+        the column indices and the resulting interpolation matrix.
+    """
+
+    import scipy.linalg.interpolative as sli    # pylint:disable=no-name-in-module
+    if rank is None:
+        k, idx, proj = sli.interp_decomp(A, eps)
+    else:
+        idx, proj = sli.interp_decomp(A, rank)
+        k = rank
+
+    # NOTE: fix should be in scipy 1.2.0
+    # https://github.com/scipy/scipy/pull/9125
+    if k == A.shape[1]:
+        proj = np.empty((k, 0), dtype=proj.dtype)
+
+    interp = sli.reconstruct_interp_matrix(idx, proj)
+    return k, idx, interp
+
+
+def _to_block_index(queue, indices, ranges=None):
+    """Wrap a ``(indices, ranges)`` tuple into a ``BlockIndexRanges``."""
+
+    if ranges is None:
+        ranges = np.cumsum([0] + [r.size for r in indices])
+
+    ranges = cl.array.to_device(queue, ranges).with_queue(None)
+    indices = cl.array.to_device(queue, np.hstack(indices)).with_queue(None)
+
+    return BlockIndexRanges(queue.context, indices, ranges)
+
+
+def _build_diag_block(blk, blkindices):
+    """Construct a block diagonal matrix from a linear representation of
+    the matrix blocks."""
+    # NOTE: builtin ``object``; the ``np.object`` alias is gone in NumPy >= 1.24
+    nblocks = blkindices.nblocks
+    diag = np.full((nblocks, nblocks), 0, dtype=object)
+    # NOTE: off-diagonal entries keep the integer 0 fill value
+    for i in range(nblocks):
+        diag[i, i] = blkindices.block_take(blk, i)
+
+    return diag
+
+
+def _level_ranges(r_blk, r_skl):
+    """Iterator over the block ranges between compression levels."""
+    assert r_blk.ranges.size == r_skl.ranges.size
+
+    nblocks = r_blk.ranges.size - 1
+    for iblk in range(nblocks):
+        i = np.s_[r_blk.ranges[iblk]:r_blk.ranges[iblk + 1]]
+        j = np.s_[r_skl.ranges[iblk]:r_skl.ranges[iblk + 1]]
+
+        yield i, j
+
+
+class QBXForcedLimitReplacer(IdentityMapper):
+    def __init__(self, qbx_forced_limit=None):
+        self.qbx_forced_limit = qbx_forced_limit
+
+    def map_int_g(self, expr):
+        return expr.copy(qbx_forced_limit=self.qbx_forced_limit)
+
+
+class BlockEvaluationWrangler(object):
+    def __init__(self, exprs, input_exprs, domains,
+            context=None,
+            weighted_farfield=None,
+            farfield_block_builder=None,
+            nearfield_block_builder=None):
+        self.exprs = exprs
+        self.input_exprs = input_exprs
+        self.domains = domains
+        self.context = context
+        # weighted_farfield is normalized to a (source, target) pair of flags
+        if weighted_farfield is None:
+            self.weighted_farfield = (True, False)
+        elif isinstance(weighted_farfield, bool):
+            self.weighted_farfield = (weighted_farfield, weighted_farfield)
+        elif isinstance(weighted_farfield, (list, tuple)):
+            self.weighted_farfield = tuple(weighted_farfield)
+        else:
+            raise ValueError("unknown value for weighting: `{}`".format(
+                weighted_farfield))
+        # NOTE: builder classes fall back to those in pytential.symbolic.matrix
+        self.nearfield_block_builder = nearfield_block_builder
+        if self.nearfield_block_builder is None:
+            from pytential.symbolic.matrix import NearFieldBlockBuilder
+            self.nearfield_block_builder = NearFieldBlockBuilder
+
+        self.farfield_block_builder = farfield_block_builder
+        if self.farfield_block_builder is None:
+            from pytential.symbolic.matrix import FarFieldBlockBuilder
+            self.farfield_block_builder = FarFieldBlockBuilder
+
+    def _evaluate(self, queue, places, builder_cls,
+            expr, idomain, index_set, auto_where, **kwargs):
+        domain = self.domains[idomain]
+        dep_source = places.get_geometry(domain.geometry)
+        dep_discr = places.get_discretization(domain.geometry, domain.discr_stage)
+
+        from pytential.symbolic.execution import _prepare_auto_where
+        auto_where = _prepare_auto_where(auto_where, places=places)
+        from pytential.symbolic.execution import _prepare_expr
+        expr = _prepare_expr(places, expr, auto_where=auto_where)
+
+        builder = builder_cls(queue,
+                dep_expr=self.input_exprs[idomain],
+                other_dep_exprs=(
+                    self.input_exprs[:idomain]
+                    + self.input_exprs[idomain+1:]),
+                dep_source=dep_source,
+                dep_discr=dep_discr,
+                places=places,
+                index_set=index_set,
+                context=self.context,
+                **kwargs)
+
+        return builder(expr)
+
+    def evaluate_source_farfield(self,
+            queue, places, ibrow, ibcol, index_set, auto_where=None):
+        expr = QBXForcedLimitReplacer()(self.exprs[ibcol])
+        return self._evaluate(queue, places,
+                self.farfield_block_builder,
+                expr, ibcol, index_set, auto_where,
+                weighted=self.weighted_farfield[0],
+                exclude_self=False)
+
+    def evaluate_target_farfield(self,
+            queue, places, ibrow, ibcol, index_set, auto_where=None):
+        expr = QBXForcedLimitReplacer()(self.exprs[ibcol])
+        return self._evaluate(queue, places,
+                self.farfield_block_builder,
+                expr, ibcol, index_set, auto_where,
+                weighted=self.weighted_farfield[1],
+                exclude_self=False)
+
+    def evaluate_nearfield(self,
+            queue, places, ibrow, ibcol, index_set, auto_where=None):
+        return self._evaluate(queue, places,
+                self.nearfield_block_builder,
+                self.exprs[ibrow], ibcol, index_set, auto_where)
+
+# }}}
+
+
+# {{{ node block partitioning
+
+class PartitionTreeLevel(object):
+    """
+    .. attribute:: nlevels
+
+        Total number of levels in the tree.
+
+    .. attribute:: level
+
+        Level for the current instance.
+
+    .. attribute:: size
+
+        Number of boxes at the current level.
+
+    .. attribute:: box_parent_ids
+
+        A mapping from box ids to their parent ids.
+
+    .. attribute:: partition_box_ids
+
+        A list of the boxes at the current level.
+
+    .. attribute:: partition_parent_ids
+
+        Parent ids for :attr:`partition_box_ids`.
+
+    .. attribute:: partition_parent_map
+
+        A map from :attr:`partition_parent_ids` to :attr:`partition_box_ids`.
+        Note that this map is one to many.
+
+    .. automethod:: cluster
+    """
+    def __init__(self, tree, partition_box_ids):
+        self.box_parent_ids = tree.box_parent_ids
+        self.box_levels = tree.box_levels
+        self.partition_box_ids = partition_box_ids
+
+        self.nlevels = tree.nlevels
+        self.level = getattr(tree, 'level', tree.nlevels) - 1
+
+    @property
+    def size(self):
+        return self.partition_box_ids.size
+
+    @property
+    @memoize_method
+    def partition_parent_ids(self):
+        # NOTE: the root box has itself as a parent
+        return self.box_parent_ids[self.partition_box_ids]
+
+    @property
+    @memoize_method
+    def partition_parent_map(self):
+        # NOTE: np.unique returns a sorted array
+        unique_parent_ids = np.unique(self.partition_parent_ids)
+        # find the index of each parent id
+        unique_parent_index = np.searchsorted(unique_parent_ids,
+                                              self.partition_parent_ids)
+
+        unique_parent_map = np.empty(unique_parent_ids.size,
+                                     dtype=object)
+        for i in range(unique_parent_ids.size):
+            unique_parent_map[i] = \
+                    np.where(unique_parent_index == i)[0]
+
+        return unique_parent_map
+
+    def cluster(self, x=None):
+        """Cluster the current level boxes into their parent boxes.
+
+        :arg x: a :class:`~sumpy.tools.MatrixBlockIndexRanges`,
+            :class:`~sumpy.tools.BlockIndexRanges` or a 2D
+            :class:`numpy.ndarray` of type ``object`` containing matrix
+            blocks. If *None*, *self* is clustered into its parent level.
+
+        :return: an object of the same type as *x* that has been clustered
+            accordingly. The clustering is always done using the
+            :attr:`partition_parent_map`.
+        """
+        if x is None:
+            if self.size == 1:
+                return self
+
+            y = PartitionTreeLevel(self, np.unique(self.partition_parent_ids))
+        elif isinstance(x, MatrixBlockIndexRanges):
+            if x.nblocks != self.size:
+                raise ValueError('partitions must match index ranges')
+
+            if self.size == 1:
+                return x
+
+            y = MatrixBlockIndexRanges(x.cl_context,
+                                       self.cluster(x.row),
+                                       self.cluster(x.col))
+        elif isinstance(x, BlockIndexRanges):
+            if x.nblocks != self.size:
+                raise ValueError('partitions must match index ranges')
+
+            if self.size == 1:
+                return x
+
+            with cl.CommandQueue(x.cl_context) as queue:
+                x_host = x.get(queue)
+
+                nblocks = self.partition_parent_map.size
+                indices = np.empty(x_host.indices.size, dtype=int)
+                ranges = np.zeros(nblocks + 1, dtype=int)
+
+                for i, ppm in enumerate(self.partition_parent_map):
+                    partition = np.hstack([x_host.block_indices(c)
+                                           for c in ppm])
+
+                    ranges[i + 1] = ranges[i] + partition.size
+                    indices[ranges[i]:ranges[i + 1]] = partition
+
+                ranges = cl.array.to_device(queue, ranges).with_queue(None)
+                indices = cl.array.to_device(queue, indices).with_queue(None)
+
+            y = BlockIndexRanges(x.cl_context, indices, ranges)
+        elif isinstance(x, np.ndarray) and x.ndim == 2:
+            from pytential.symbolic.execution import _bmat
+
+            if x.shape != (self.size, self.size):
+                raise ValueError('partitions must match matrix blocks')
+
+            if self.size == 1:
+                return x
+
+            nblocks = self.partition_parent_map.size
+            y = np.empty((nblocks, nblocks), dtype=object)
+            for i, ppm in enumerate(self.partition_parent_map):
+                y[i, i] = _bmat(x[np.ix_(ppm, ppm)])
+        else:
+            raise TypeError('type cannot be clustered: {}.'.format(type(x)))
+
+        return y
+
+
+def partition_by_nodes(queue, discr,
+                       tree_kind='adaptive-level-restricted',
+                       max_particles_in_box=None):
+    """Partition the nodes in *discr* into blocks. Since the
+    partitioning is done at the level of nodes, this will break up elements
+    in *discr* between different blocks.
+
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg tree_kind: passed as the *tree_kind* argument to
+        :class:`boxtree.TreeBuilder`. If *None*, a simple linear partitioning
+        of the nodes by index is performed. This should only be used if
+        spatially close nodes have adjacent indices as well (or for testing).
+    :arg max_particles_in_box: passed to :class:`boxtree.TreeBuilder`. In
+        the linear partitioning, no block has more nodes than this. If
+        *None*, a default of 32 is used.
+
+    :return: a tuple of ``(indices, partition)``, where indices is a
+        :class:`sumpy.tools.BlockIndexRanges` and partition is a
+        :class:`pytential.linalg.hss.PartitionTreeLevel`. The *partition*
+        holds minimal information about the tree that was used to build it
+        and is *None* if no tree was built, i.e. if *tree_kind* is *None*.
+    """
+
+    if max_particles_in_box is None:
+        # FIXME: this is just an arbitrary value
+        max_particles_in_box = 32
+
+    if tree_kind is not None:
+        from boxtree import box_flags_enum
+        from boxtree import TreeBuilder
+        builder = TreeBuilder(discr.cl_context)
+        tree_dev, _ = builder(queue,
+                discr.nodes(),
+                tree_kind=tree_kind,
+                extent_norm="l2",
+                max_particles_in_box=max_particles_in_box)
+
+        # get leaf boxes, i.e. the boxes without the HAS_CHILDREN flag
+        tree = tree_dev.get(queue)
+        leaf_boxes, = (tree.box_flags
+                       & box_flags_enum.HAS_CHILDREN == 0).nonzero()
+        nblocks = len(leaf_boxes)
+
+        # build list of indices in each box
+        # NOTE: the builtin `int` is used because the `np.int` alias was
+        # removed from recent numpy versions; it denotes the same dtype
+        indices = np.empty(discr.nnodes, dtype=int)
+        ranges = np.zeros(nblocks + 1, dtype=int)
+
+        for i, ibox in enumerate(leaf_boxes):
+            box_start = tree.box_source_starts[ibox]
+            box_end = box_start + tree.box_source_counts_cumul[ibox]
+            box_indices = tree.user_source_ids[box_start:box_end]
+
+            ranges[i + 1] = ranges[i] + box_indices.size
+            indices[ranges[i]:ranges[i + 1]] = box_indices
+
+        ranges = cl.array.to_device(queue, ranges)
+        indices = cl.array.to_device(queue, indices)
+
+        partition = PartitionTreeLevel(tree, leaf_boxes)
+    else:
+        # NOTE: split `[0, nnodes]` into `nblocks` nearly equal ranges. Using
+        # `linspace` (instead of a fixed stride) guarantees that the last
+        # range ends exactly at `nnodes`, even if `nnodes` is not divisible
+        # by the block size or is smaller than `max_particles_in_box`.
+        nblocks = max(1, -(-discr.nnodes // max_particles_in_box))
+
+        indices = cl.array.arange(queue, 0, discr.nnodes, dtype=int)
+        ranges = cl.array.to_device(queue,
+                np.linspace(0, discr.nnodes, nblocks + 1, dtype=int))
+
+        # TODO: would be nice to build a fake simple binary tree in this
+        # case, so that it still works for anything besides debugging
+        partition = None
+
+    assert ranges[-1] == discr.nnodes
+
+    # detach the arrays from the queue before storing them
+    indices = BlockIndexRanges(discr.cl_context,
+                               indices.with_queue(None),
+                               ranges.with_queue(None))
+
+    return indices, partition
+
+# }}}
+
+
+# {{{ proxy point generator
+
+def _generate_unit_sphere(ambient_dim, approx_npoints):
+    """Generate uniform points on a unit sphere.
+
+    :arg ambient_dim: dimension of the ambient space, either 2 or 3.
+    :arg approx_npoints: approximate number of points to generate. If the
+        ambient space is 3D, this will not generate the exact number of points.
+    :return: array of shape ``(ambient_dim, npoints)``, where ``npoints``
+        will not generally be the same as ``approx_npoints`` in 3D.
+    :raises ValueError: if *ambient_dim* is neither 2 nor 3.
+    """
+
+    if ambient_dim == 2:
+        # NOTE: the endpoint is excluded because the angles 0 and 2 pi map to
+        # the same point on the circle, which would duplicate a proxy point
+        t = np.linspace(0.0, 2.0 * np.pi, approx_npoints, endpoint=False)
+        points = np.vstack([np.cos(t), np.sin(t)])
+    elif ambient_dim == 3:
+        # https://www.cmu.edu/biolphys/deserno/pdf/sphere_equi.pdf
+        # code by Matt Wala from
+        # https://github.com/mattwala/gigaqbx-accuracy-experiments/blob/d56ed063ffd7843186f4fe05d2a5b5bfe6ef420c/translation_accuracy.py#L23
+        a = 4.0 * np.pi / approx_npoints
+        m_theta = int(np.round(np.pi / np.sqrt(a)))
+        d_theta = np.pi / m_theta
+        d_phi = a / d_theta
+
+        # rings of constant polar angle `theta`, each with `m_phi` points
+        points = []
+        for m in range(m_theta):
+            theta = np.pi * (m + 0.5) / m_theta
+            m_phi = int(np.round(2.0 * np.pi * np.sin(theta) / d_phi))
+
+            for n in range(m_phi):
+                phi = 2.0 * np.pi * n / m_phi
+                points.append(np.array([np.sin(theta) * np.cos(phi),
+                                        np.sin(theta) * np.sin(phi),
+                                        np.cos(theta)]))
+
+        # add the points where the sphere intersects each coordinate axis
+        # NOTE(review): for odd `m_theta`, the ring at `theta = pi / 2`
+        # already contains `(1, 0, 0)` up to rounding -- confirm that such
+        # near-duplicates are acceptable
+        for i in range(ambient_dim):
+            for sign in [-1, 1]:
+                pole = np.zeros(ambient_dim)
+                pole[i] = sign
+                points.append(pole)
+
+        points = np.array(points).T
+    else:
+        raise ValueError(
+                "ambient_dim must be 2 or 3, got {}.".format(ambient_dim))
+
+    return points
+
+
+class BlockProxyPoints(DeviceDataRecord):
+    """Proxy points for a set of blocks, as returned by
+    :class:`ProxyGenerator`.
+
+    .. attribute :: indices
+
+        A :class:`~sumpy.tools.BlockIndexRanges` describing which proxies
+        belong to which block.
+
+    .. attribute :: points
+
+        A concatenated list of all the proxy points. Can be sliced into
+        using :attr:`indices`. This is an object array with one entry per
+        dimension, each of size ``nproxies * nblocks``.
+
+    .. attribute :: centers
+
+        A list of all the proxy ball centers. This is an object array with
+        one entry per dimension, each of size ``nblocks``.
+
+    .. attribute :: radii
+
+        A list of all the proxy ball radii (shape ``(nblocks,)``).
+    """
+
+
+class ProxyGenerator(object):
+    r"""Generates a ball of proxy points around each block of points of a
+    discretization in :attr:`places`.
+
+    .. attribute:: ambient_dim
+    .. attribute:: nproxy
+
+        Number of proxy points generated for each block.
+
+    .. attribute:: places
+
+        A :class:`pytential.symbolic.execution.GeometryCollection`.
+
+    .. attribute:: radius_factor
+
+        A factor used to compute the proxy ball radius. The radius
+        is computed in the :math:`\ell^2` norm, resulting in a circle or
+        sphere of proxy points. For QBX, we have two radii of interest
+        for a set of points: the radius :math:`r_{block}` of the
+        smallest ball containing all the points and the radius
+        :math:`r_{qbx}` of the smallest ball containing all the QBX
+        expansion balls in the block. If the factor :math:`\theta \in
+        [0, 1)`, then the radius of the proxy ball is
+
+        .. math::
+
+            r = (1 - \theta) r_{block} + \theta r_{qbx}.
+
+        If the factor :math:`\theta \ge 1`, the radius is simply
+
+        .. math::
+
+            r = \theta r_{qbx}.
+
+    .. automethod:: __call__
+    """
+
+    def __init__(self, places, approx_nproxy=None, radius_factor=None):
+        """
+        :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`
+            or any object that can be used to construct one.
+        :arg approx_nproxy: approximate number of proxy points in each
+            block, see :func:`_generate_unit_sphere`. Defaults to 32.
+        :arg radius_factor: see :attr:`radius_factor`. Defaults to 1.0.
+        """
+        from pytential import GeometryCollection
+        if not isinstance(places, GeometryCollection):
+            places = GeometryCollection(places)
+
+        if radius_factor is None:
+            radius_factor = 1.0
+
+        if approx_nproxy is None:
+            approx_nproxy = 32
+
+        self.places = places
+        self.radius_factor = radius_factor
+        # reference points on the unit sphere; these are scaled and
+        # translated to each proxy ball in `__call__`
+        self.ref_points = _generate_unit_sphere(
+                places.ambient_dim, approx_nproxy)
+
+    @property
+    def nproxy(self):
+        return self.ref_points.shape[1]
+
+    @property
+    def ambient_dim(self):
+        return self.ref_points.shape[0]
+
+    @memoize_method
+    def get_kernel(self):
+        """Build the :mod:`loopy` kernel that computes the center and the
+        radius of the proxy ball of each index range. The center is the mean
+        of the source points in the range and the radius follows the formula
+        described in :attr:`radius_factor`.
+        """
+        if self.radius_factor < 1.0:
+            radius_expr = "(1.0 - {factor}) * rblk + {factor} * rqbx"
+        else:
+            radius_expr = "{factor} * rqbx"
+        radius_expr = radius_expr.format(factor=self.radius_factor)
+
+        # NOTE: centers of mass are computed using a second-order approximation
+        knl = lp.make_kernel([
+            "{[irange]: 0 <= irange < nranges}",
+            "{[i]: 0 <= i < npoints}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            ["""
+            for irange
+                <> ioffset = srcranges[irange]
+                <> npoints = srcranges[irange + 1] - srcranges[irange]
+
+                proxy_center[idim, irange] = 1.0 / npoints * \
+                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
+                        {{dup=idim:i}}
+
+                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         sources[idim, srcindices[i + ioffset]]) ** 2)))
+
+                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext
+
+                proxy_radius[irange] = {radius_expr}
+            end
+            """.format(radius_expr=radius_expr)],
+            [
+                lp.GlobalArg("sources", None,
+                    shape=(self.ambient_dim, "nsources")),
+                lp.GlobalArg("center_int", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("center_ext", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("proxy_center", None,
+                    shape=(self.ambient_dim, "nranges")),
+                lp.GlobalArg("proxy_radius", None,
+                    shape="nranges"),
+                # NOTE: the builtin `int` replaces the `np.int` alias, which
+                # was removed from recent numpy versions
+                lp.ValueArg("nsources", int),
+                "..."
+            ],
+            name="find_proxy_radii_knl",
+            assumptions="dim>=1 and nranges>=1",
+            fixed_parameters=dict(dim=self.ambient_dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        knl = lp.tag_inames(knl, "idim*:unr")
+
+        return knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        """Return the kernel from :meth:`get_kernel` with the ``irange``
+        iname split into chunks of 64 and the outer iname tagged ``g.0``.
+        """
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "irange", 64, outer_tag="g.0")
+
+        return knl
+
+    def __call__(self, queue, dofdesc, indices):
+        """Generate proxy points for each given range of source points in
+        the discretization in :attr:`places`.
+
+        :arg queue: a :class:`pyopencl.CommandQueue`.
+        :arg dofdesc: a descriptor for the discretization around which
+            the proxy points should be generated.
+        :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+        :return: a :class:`BlockProxyPoints`.
+        """
+
+        # {{{ generate proxy points
+
+        from pytential import bind, sym
+        dd = sym.as_dofdesc(dofdesc)
+        discr = self.places.get_discretization(dd.geometry, dd.discr_stage)
+
+        qbx_radii = bind(self.places, sym.expansion_radii(
+            self.ambient_dim, dofdesc=dd))(queue)
+        qbx_center_int = bind(self.places, sym.expansion_centers(
+            self.ambient_dim, -1, dofdesc=dd))(queue)
+        qbx_center_ext = bind(self.places, sym.expansion_centers(
+            self.ambient_dim, +1, dofdesc=dd))(queue)
+
+        knl = self.get_kernel()
+        _, (centers, radii,) = knl(queue,
+            sources=discr.nodes(),
+            center_int=qbx_center_int,
+            center_ext=qbx_center_ext,
+            expansion_radii=qbx_radii,
+            srcindices=indices.indices,
+            srcranges=indices.ranges)
+        centers_host = centers.get(queue)
+        radii_host = radii.get(queue)
+
+        def _affine_map(v, A, b):
+            return np.dot(A, v) + b
+
+        # scale the reference points by the radius and translate them to the
+        # center of each proxy ball
+        proxies = np.empty(indices.nblocks, dtype=object)
+        for i in range(indices.nblocks):
+            proxies[i] = _affine_map(self.ref_points,
+                    A=(radii_host[i] * np.eye(self.ambient_dim)),
+                    b=centers_host[:, i].reshape(-1, 1))
+        # }}}
+
+        # every block has the same number of proxies, so the ranges are
+        # uniformly spaced
+        pxyranges = cl.array.arange(queue,
+                0,
+                proxies.shape[0] * proxies[0].shape[1] + 1,
+                proxies[0].shape[1],
+                dtype=indices.ranges.dtype).with_queue(None)
+        proxies = make_obj_array([
+            cl.array.to_device(queue,
+                np.hstack([p[idim] for p in proxies])).with_queue(None)
+            for idim in range(self.ambient_dim)])
+        centers = make_obj_array([
+            centers[idim].with_queue(queue).copy().with_queue(None)
+            for idim in range(self.ambient_dim)])
+
+        # NOTE: detached from the queue like all the other stored arrays
+        pxyindices = cl.array.arange(queue, 0, proxies[0].size,
+            dtype=pxyranges.dtype).with_queue(None)
+        pxyindices = BlockIndexRanges(queue.context, pxyindices, pxyranges)
+
+        return BlockProxyPoints(
+                indices=pxyindices,
+                points=proxies,
+                centers=centers.T,
+                radii=radii.with_queue(None),
+                )
+
+
+def gather_block_neighbor_points(queue, discr, srcindices, proxy,
+                                 max_particles_in_box=None):
+    """Generate a set of neighboring points for each range of points in
+    *discr*. Neighboring points of a range :math:`i` are defined
+    as all the points inside the proxy ball :math:`i` that do not also
+    belong to the range itself.
+
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg srcindices: a :class:`sumpy.tools.BlockIndexRanges`. Only the
+        nodes it references are considered as potential neighbors.
+    :arg proxy: a :class:`BlockProxyPoints`.
+    :arg max_particles_in_box: passed to :class:`boxtree.TreeBuilder`. If
+        *None*, a default of 32 is used.
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
+
+    if max_particles_in_box is None:
+        # FIXME: this is a fairly arbitrary value
+        max_particles_in_box = 32
+
+    # NOTE: this is constructed for multiple reasons:
+    #   * TreeBuilder takes object arrays
+    #   * `srcindices` can be a small subset of nodes, so this will save
+    #   some work
+    #   * `srcindices` may reorder the array returned by nodes(), so this
+    #   makes sure that we have the same order in tree.user_source_ids
+    #   and friends
+    sources = discr.nodes().with_queue(queue)
+    sources = make_obj_array([
+        cl.array.take(sources[idim], srcindices.indices)
+        for idim in range(discr.ambient_dim)])
+
+    # construct tree
+    from boxtree import TreeBuilder
+    builder = TreeBuilder(discr.cl_context)
+    tree, _ = builder(queue, sources,
+                      max_particles_in_box=max_particles_in_box)
+
+    from boxtree.area_query import AreaQueryBuilder
+    builder = AreaQueryBuilder(discr.cl_context)
+    query, _ = builder(queue, tree, proxy.centers, proxy.radii)
+
+    # find nodes inside each proxy ball
+    tree = tree.get(queue)
+    query = query.get(queue)
+
+    srcindices = srcindices.get(queue)
+    proxy = proxy.get(queue)
+    pxycenters = np.vstack(proxy.centers)
+    pxyradii = proxy.radii
+
+    # NOTE: the builtin `object` replaces the `np.object` alias, which was
+    # removed from recent numpy versions
+    nbrindices = np.empty(srcindices.nblocks, dtype=object)
+    nbrranges = np.zeros(srcindices.nblocks + 1, dtype=srcindices.ranges.dtype)
+    for iproxy in range(srcindices.nblocks):
+        # get list of boxes intersecting the current ball
+        istart = query.leaves_near_ball_starts[iproxy]
+        iend = query.leaves_near_ball_starts[iproxy + 1]
+        iboxes = query.leaves_near_ball_lists[istart:iend]
+
+        # get nodes inside the boxes
+        istart = tree.box_source_starts[iboxes]
+        iend = istart + tree.box_source_counts_cumul[iboxes]
+        isources = np.hstack([np.arange(s, e)
+                              for s, e in zip(istart, iend)])
+        nodes = np.vstack([tree.sources[idim][isources]
+                           for idim in range(discr.ambient_dim)])
+        # map from tree order back to positions in `srcindices.indices`
+        isources = tree.user_source_ids[isources]
+
+        # get nodes inside the ball but outside the current range
+        center = pxycenters[:, iproxy].reshape(-1, 1)
+        radius = pxyradii[iproxy]
+        mask = ((la.norm(nodes - center, axis=0) < radius)
+                & ((isources < srcindices.ranges[iproxy])
+                    | (srcindices.ranges[iproxy + 1] <= isources)))
+
+        # translate the positions into node indices in `discr`
+        nbrindices[iproxy] = srcindices.indices[isources[mask]]
+        nbrranges[iproxy + 1] = nbrranges[iproxy] + nbrindices[iproxy].size
+
+    return _to_block_index(queue, nbrindices, nbrranges)
+
+# }}}
+
+
+# {{{ skeletonization
+
+def _build_source_skeleton_matrix(queue, places, proxy, wrangler, indices,
+        ibrow, ibcol, max_particles_in_box=None):
+    """Builds a block matrix that can be used to skeletonize the columns
+    (sources) of the symbolic matrix block described by ``(ibrow, ibcol)``.
+    The returned matrix is block diagonal and each diagonal block is
+    obtained by vertically stacking
+
+        .. math::
+
+            S_{ii} = \\begin{bmatrix} A_{proxy} \\\\ A_{nearby} \\end{bmatrix}
+
+    where :math:`A_{proxy}` contains interactions with a set of
+    proxy points and :math:`A_{nearby}` contains interactions with all
+    neighboring points. If *indices* contains a single block, only the proxy
+    interactions are returned.
+
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg proxy: a :class:`ProxyGenerator`.
+    :arg wrangler: a :class:`BlockEvaluationWrangler`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+    :arg max_particles_in_box: passed to
+        :func:`gather_block_neighbor_points`.
+    :return: a block matrix in the form of a 2D :class:`numpy.ndarray`
+        of ``dtype = object``.
+    """
+    from pytential.target import PointsTarget
+    from pytential.symbolic.execution import GeometryCollection
+
+    domain = wrangler.domains[ibcol]
+    source_lpot = places.get_geometry(domain.geometry)
+    source_discr = places.get_discretization(domain.geometry, domain.discr_stage)
+
+    pxy = proxy(queue, domain, indices)
+    pxyindices = MatrixBlockIndexRanges(queue.context, pxy.indices, indices)
+
+    # build proxy interaction matrices, with the proxies acting as targets
+    pxyplaces = (source_lpot, PointsTarget(pxy.points))
+    pxyplaces = GeometryCollection(pxyplaces, auto_where=(domain, "proxy"))
+
+    pxymat = wrangler.evaluate_source_farfield(queue,
+            pxyplaces, ibrow, ibcol, pxyindices)
+
+    # a single block has no neighbors
+    if indices.nblocks == 1:
+        return _build_diag_block(pxymat, pxyindices.get(queue))
+
+    # build neighbor interaction matrix blocks
+    nbrindices = gather_block_neighbor_points(
+            queue, source_discr, indices, pxy,
+            max_particles_in_box=max_particles_in_box)
+
+    nbrindices = MatrixBlockIndexRanges(queue.context, nbrindices, indices)
+    nbrmat = wrangler.evaluate_nearfield(queue,
+            places, ibrow, ibcol, nbrindices)
+
+    # concatenate matrix blocks
+    pxyindices = pxyindices.get(queue)
+    nbrindices = nbrindices.get(queue)
+
+    # NOTE: the builtin `object` replaces the `np.object` alias, which was
+    # removed from recent numpy versions
+    pxyblk = np.full((indices.nblocks, indices.nblocks), 0, dtype=object)
+    for i in range(indices.nblocks):
+        pxyblk[i, i] = np.vstack([
+            pxyindices.block_take(pxymat, i),
+            nbrindices.block_take(nbrmat, i)
+            ])
+
+    return pxyblk
+
+
+def _build_target_skeleton_matrix(queue, places, proxy, wrangler, indices,
+        ibrow, ibcol, max_particles_in_box=None):
+    """Builds a block matrix that can be used to skeletonize the rows
+    (targets) of the symbolic matrix block described by ``(ibrow, ibcol)``.
+    The returned matrix is block diagonal and each diagonal block is
+    obtained by horizontally stacking the interactions with all neighboring
+    points, followed by the interactions with a set of proxy points. If
+    *indices* contains a single block, only the proxy interactions are
+    returned.
+
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg proxy: a :class:`ProxyGenerator`.
+    :arg wrangler: a :class:`BlockEvaluationWrangler`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+    :arg max_particles_in_box: passed to
+        :func:`gather_block_neighbor_points`.
+    :return: a block matrix in the form of a 2D :class:`numpy.ndarray`
+        of ``dtype = object``.
+    """
+    from pytential.source import PointPotentialSource
+    from pytential.symbolic.execution import GeometryCollection
+
+    # NOTE(review): the target geometry is looked up using the column index
+    # `ibcol`, same as in the source version -- confirm that this is intended
+    # when the row and column domains differ
+    domain = wrangler.domains[ibcol]
+    target_lpot = places.get_geometry(domain.geometry)
+    target_discr = places.get_discretization(domain.geometry, domain.discr_stage)
+
+    pxy = proxy(queue, domain, indices)
+    pxyindices = MatrixBlockIndexRanges(queue.context, indices, pxy.indices)
+
+    # build proxy interaction matrix blocks, with the proxies acting as
+    # point sources
+    pxyplaces = (PointPotentialSource(queue.context, pxy.points), target_lpot)
+    pxyplaces = GeometryCollection(pxyplaces, auto_where=("proxy", domain))
+
+    pxymat = wrangler.evaluate_target_farfield(queue,
+            pxyplaces, ibrow, ibcol, pxyindices)
+
+    # a single block has no neighbors
+    if indices.nblocks == 1:
+        return _build_diag_block(pxymat, pxyindices.get(queue))
+
+    # build neighbor interaction matrix blocks
+    nbrindices = gather_block_neighbor_points(
+            queue, target_discr, indices, pxy,
+            max_particles_in_box=max_particles_in_box)
+
+    nbrindices = MatrixBlockIndexRanges(queue.context, indices, nbrindices)
+    nbrmat = wrangler.evaluate_nearfield(queue,
+            places, ibrow, ibcol, nbrindices)
+
+    # concatenate matrix blocks
+    pxyindices = pxyindices.get(queue)
+    nbrindices = nbrindices.get(queue)
+
+    # NOTE: the builtin `object` replaces the `np.object` alias, which was
+    # removed from recent numpy versions
+    pxyblk = np.full((indices.nblocks, indices.nblocks), 0, dtype=object)
+    for i in range(indices.nblocks):
+        pxyblk[i, i] = np.hstack([
+            nbrindices.block_take(nbrmat, i),
+            pxyindices.block_take(pxymat, i)
+            ])
+
+    return pxyblk
+
+
+def _skeletonize(queue, places, proxy, wrangler, blkindices, id_eps,
+        id_rank=None,
+        tree_max_particles_in_box=None):
+    r"""Skeletonize each block in *blkindices* using an interpolative
+    decomposition of its proxy and neighbor interactions.
+
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg proxy: a :class:`ProxyGenerator`.
+    :arg wrangler: a :class:`BlockEvaluationWrangler`.
+    :arg blkindices: a :class:`sumpy.tools.MatrixBlockIndexRanges`.
+    :arg id_eps: passed to the interpolative decomposition.
+    :arg id_rank: rank passed to the interpolative decomposition of the
+        targets in each block. The rank it returns is then used for the
+        sources, so that both skeletons have the same size.
+    :arg tree_max_particles_in_box: passed to the skeleton matrix builders.
+
+    :returns: a tuple ``(L, R, sklindices)`` encoding the block-by-block
+        compression of the matrix represented by *wrangler*.
+        :math:`L` and :math:`R` are :math:`n \times n`
+        block diagonal matrices, where :math:`n` is ``blkindices.nblocks``.
+        The ``sklindices`` array contains the remaining (skeleton) nodes from
+        ``blkindices`` after compression. If there is a single block, no
+        compression is performed: :math:`L` and :math:`R` are identity
+        matrices and *blkindices* is returned unchanged.
+    """
+
+    # NOTE: the builtins `object` and `int` replace the `np.object` and
+    # `np.int` aliases, which were removed from recent numpy versions
+    L = np.full((blkindices.nblocks, blkindices.nblocks), 0, dtype=object)
+    R = np.full((blkindices.nblocks, blkindices.nblocks), 0, dtype=object)
+
+    if blkindices.nblocks == 1:
+        L[0, 0] = np.eye(blkindices.row.indices.size)
+        R[0, 0] = np.eye(blkindices.col.indices.size)
+
+        return L, R, blkindices
+
+    # construct proxy matrices to skeletonize
+    src_mat = _build_source_skeleton_matrix(queue,
+            places, proxy, wrangler, blkindices.col, 0, 0,
+            max_particles_in_box=tree_max_particles_in_box)
+    tgt_mat = _build_target_skeleton_matrix(queue,
+            places, proxy, wrangler, blkindices.row, 0, 0,
+            max_particles_in_box=tree_max_particles_in_box)
+
+    src_skl_indices = np.empty(blkindices.nblocks, dtype=object)
+    tgt_skl_indices = np.empty(blkindices.nblocks, dtype=object)
+    skl_ranges = np.zeros(blkindices.nblocks + 1, dtype=int)
+
+    src_indices = blkindices.col.get(queue)
+    tgt_indices = blkindices.row.get(queue)
+
+    for i in range(blkindices.nblocks):
+        k = id_rank
+
+        assert not np.any(np.isnan(src_mat[i, i])), "block {}".format(i)
+        assert not np.any(np.isinf(src_mat[i, i])), "block {}".format(i)
+        assert not np.any(np.isnan(tgt_mat[i, i])), "block {}".format(i)
+        assert not np.any(np.isinf(tgt_mat[i, i])), "block {}".format(i)
+
+        # skeletonize target points
+        k, idx, interp = _interp_decomp(tgt_mat[i, i].T, k, id_eps)
+        assert k > 0
+
+        L[i, i] = interp.T
+        tgt_skl_indices[i] = tgt_indices.block_indices(i)[idx[:k]]
+
+        # skeletonize source points
+        # NOTE: the rank `k` found for the targets is reused here, since
+        # both skeletons share the same `skl_ranges`
+        k, idx, interp = _interp_decomp(src_mat[i, i], k, id_eps)
+        assert k > 0
+
+        R[i, i] = interp
+        src_skl_indices[i] = src_indices.block_indices(i)[idx[:k]]
+
+        skl_ranges[i + 1] = skl_ranges[i] + k
+        assert R[i, i].shape == (k, src_mat[i, i].shape[1])
+        assert L[i, i].shape == (tgt_mat[i, i].shape[0], k)
+
+    src_skl_indices = _to_block_index(queue, src_skl_indices, skl_ranges)
+    tgt_skl_indices = _to_block_index(queue, tgt_skl_indices, skl_ranges)
+    skl_indices = MatrixBlockIndexRanges(queue.context,
+                                         tgt_skl_indices,
+                                         src_skl_indices)
+
+    return L, R, skl_indices
+
+# }}}
+
+
+# {{{ compressed matrix builder
+
+class CompressedMatrixLevel(Record):
+    """Container for the data of one level of the compressed matrix.
+
+    .. attribute:: level
+    .. attribute:: L
+
+        Left skeletonization matrices at level :attr:`level`. This is a
+        block matrix of size :attr:`nblocks`.
+
+    .. attribute:: R
+
+        Right skeletonization matrices at level :attr:`level`. This is a
+        block matrix of size :attr:`nblocks`.
+
+    .. attribute:: S
+
+        Diagonal of the skeletonized matrix, if computing the inverse, or
+        *None* otherwise. This is a block matrix of size :attr:`nblocks`.
+
+    .. attribute:: D
+
+        Diagonal blocks of the matrix at level :attr:`level` or, if
+        computing the inverse, the inverses of these diagonal blocks.
+        This is a block matrix of size :attr:`nblocks`.
+
+    .. attribute:: indices
+
+        A :class:`sumpy.tools.MatrixBlockIndexRanges` representing the
+        indices at the current level.
+
+    .. attribute:: sklindices
+
+        A :class:`sumpy.tools.MatrixBlockIndexRanges` representing the
+        skeletonized indices at the current level, which is a subset of
+        :attr:`indices`.
+
+    .. attribute:: shape
+
+        Number of row and column indices in :attr:`indices`.
+
+    .. attribute:: sklshape
+
+        Number of row and column indices in :attr:`sklindices`.
+
+    .. attribute:: nblocks
+
+        Number of blocks in :attr:`indices`.
+    """
+    @property
+    def shape(self):
+        rows = self.indices.row
+        cols = self.indices.col
+        return (rows.indices.size, cols.indices.size)
+
+    @property
+    def sklshape(self):
+        rows = self.sklindices.row
+        cols = self.sklindices.col
+        return (rows.indices.size, cols.indices.size)
+
+    @property
+    def nblocks(self):
+        blocks = self.indices
+        return blocks.nblocks
+
+
+class CompressedMatrixBuilder(object):
+    def __init__(self, queue, places, proxy, wrangler,
+            matrix_mode='forward',
+            id_eps=None,
+
+            # debugging
+            id_rank=None,
+            max_level=None,
+            tree_kind='adaptive-level-restricted',
+            tree_max_particles_in_box=None):
+        """
+        :arg queue: a :class:`pyopencl.CommandQueue`.
+        :arg places: a
+            :class:`pytential.symbolic.execution.GeometryCollection`.
+        :arg proxy: a :class:`ProxyGenerator`.
+        :arg wrangler: a :class:`BlockEvaluationWrangler`.
+        :arg matrix_mode: ``'forward'`` to compress the matrix itself or
+            ``'backward'`` to also compute the data required to apply its
+            inverse.
+        :arg id_eps: passed to the interpolative decomposition.
+        :arg id_rank: passed to the interpolative decomposition.
+        :arg max_level: if given, an upper bound on the number of levels
+            that are used.
+        :arg tree_kind: passed to :func:`partition_by_nodes`.
+        :arg tree_max_particles_in_box: passed to :func:`partition_by_nodes`
+            and used when gathering neighboring points.
+        """
+        self.queue = queue
+        self.places = places
+        self.proxy = proxy
+        self.wrangler = wrangler
+        self.matrix_mode = matrix_mode
+        self.id_eps = id_eps
+
+        self.id_rank = id_rank
+        self.tree_kind = tree_kind
+        self.tree_max_particles_in_box = tree_max_particles_in_box
+
+        self.clusters = {}
+        for domain in wrangler.domains:
+            discr = self.places.get_discretization(
+                    domain.geometry, domain.discr_stage)
+            indices, partition = partition_by_nodes(self.queue, discr,
+                    tree_kind=self.tree_kind,
+                    max_particles_in_box=self.tree_max_particles_in_box)
+
+            self.clusters[domain.geometry] = (indices, partition)
+
+        # NOTE: the leaf level is described by the indices and the partition
+        # of the last domain in `wrangler.domains`
+        self.levels = None
+        self.leaf_indices = MatrixBlockIndexRanges(queue.context, indices, indices)
+        self.leaf_partition = partition
+
+        if max_level is None:
+            self.nlevels = self.leaf_partition.nlevels
+        else:
+            self.nlevels = min(max_level, self.leaf_partition.nlevels)
+
+        # TODO: probably a better idea to put this into PartitionTreeLevel
+        # NOTE: the builtin `object` replaces the `np.object` alias, which
+        # was removed from recent numpy versions
+        self.partitions = np.empty(self.nlevels, dtype=object)
+        self.partitions[-1] = self.leaf_partition
+        # each coarser level is obtained by clustering the next finer one
+        for i in range(self.nlevels - 2, -1, -1):
+            self.partitions[i] = self.partitions[i + 1].cluster()
+
+    @property
+    def dtype(self):
+        """The dtype of the first diagonal block at level 0.
+
+        :raises RuntimeError: if :meth:`compress` has not been called yet.
+        """
+        levels = self.levels
+        if levels is None:
+            raise RuntimeError('must call compress() first')
+
+        first_block = levels[0].D[0, 0]
+        return first_block.dtype
+
+    @property
+    def shape(self):
+        """Number of row and column indices at the leaf level."""
+        leaves = self.leaf_indices
+        nrows = leaves.row.indices.size
+        ncols = leaves.col.indices.size
+
+        return (nrows, ncols)
+
+    @property
+    def root(self):
+        """The first entry (level 0) of the compressed levels.
+
+        :raises RuntimeError: if :meth:`compress` has not been called yet.
+        """
+        levels = self.levels
+        if levels is None:
+            raise RuntimeError('must call compress() first')
+
+        return levels[0]
+
+    @property
+    def leaf(self):
+        """The last entry (finest level) of the compressed levels.
+
+        :raises RuntimeError: if :meth:`compress` has not been called yet.
+        """
+        levels = self.levels
+        if levels is None:
+            raise RuntimeError('must call compress() first')
+
+        return levels[-1]
+
+    def _evaluate_near_diagonal(self, level):
+        """Evaluate the interactions between the skeletons of all pairs of
+        distinct blocks that are grouped together by the partition at
+        *level* and cluster them into a block matrix.
+
+        :arg level: index into :attr:`partitions` and :attr:`levels`; the
+            level must have been compressed already.
+        :return: the result of clustering the evaluated block matrix with
+            the partition at *level*.
+        """
+        partition = self.partitions[level]
+        cmat = self.levels[level]
+
+        # make a list of block indices
+        from itertools import product
+        # NOTE: the builtin `sum` gives an integer even if the map is empty
+        nblocks = sum(p.size * (p.size - 1)
+                      for p in partition.partition_parent_map)
+        # NOTE: the builtins `int` and `object` replace the `np.int` and
+        # `np.object` aliases, which were removed from recent numpy versions
+        near_block_index = np.empty((nblocks, 2), dtype=int)
+
+        k = 0
+        for ppm in partition.partition_parent_map:
+            for i, j in product(ppm, repeat=2):
+                if i == j:
+                    continue
+
+                near_block_index[k] = (i, j)
+                k += 1
+
+        # collect entry indices
+        indices_host = cmat.sklindices.get(self.queue)
+        tgtindices = np.empty(nblocks, dtype=object)
+        srcindices = np.empty(nblocks, dtype=object)
+
+        for k, (i, j) in enumerate(near_block_index):
+            tgtindices[k] = indices_host.row.block_indices(i)
+            srcindices[k] = indices_host.col.block_indices(j)
+        near_indices = MatrixBlockIndexRanges(self.queue.context,
+                _to_block_index(self.queue, tgtindices),
+                _to_block_index(self.queue, srcindices))
+
+        # evaluate
+        sblk = self.wrangler.evaluate_nearfield(self.queue,
+                self.places, 0, 0, near_indices, self.wrangler.domains[0])
+
+        # scatter the evaluated entries into the off-diagonal blocks
+        near_indices = near_indices.get(self.queue)
+        S = np.full((cmat.nblocks, cmat.nblocks), 0, dtype=object)
+        for k, (i, j) in enumerate(near_block_index):
+            S[i, j] = near_indices.block_take(sblk, k)
+
+        if self.matrix_mode == 'forward':
+            # NOTE: filling in the diagonals in the forward case as well
+            # so that the clustering works for blocks that aren't actually
+            # clustered at this level. otherwise it wouldn't know the shape
+            for i in range(cmat.nblocks):
+                S[i, i] = np.zeros(indices_host.block_shape(i), dtype=sblk.dtype)
+        else:
+            for i in range(cmat.nblocks):
+                S[i, i] = cmat.S[i, i]
+
+        return partition.cluster(S)
+
+    def _compress_level(self, level):
+        """Skeletonize the blocks at *level* and evaluate the corresponding
+        diagonal blocks.
+
+        Unless *level* is the last (leaf) level, the next finer level
+        ``level + 1`` must have been compressed already, since its skeleton
+        indices are clustered to obtain the indices at *level*.
+
+        :return: a :class:`CompressedMatrixLevel`.
+        """
+        # {{{ skeletonize
+
+        if level == self.nlevels - 1:
+            indices = self.leaf_indices
+        else:
+            indices = self.levels[level + 1].sklindices
+            indices = self.partitions[level + 1].cluster(indices)
+
+        L, R, sklindices = _skeletonize(self.queue,
+                self.places, self.proxy, self.wrangler, indices,
+                id_eps=self.id_eps,
+                id_rank=self.id_rank,
+                tree_max_particles_in_box=self.tree_max_particles_in_box)
+
+        # }}}
+
+        # {{{ evaluate diagonal
+
+        if level == self.nlevels - 1:
+            D = self.wrangler.evaluate_nearfield(self.queue,
+                    self.places, 0, 0, indices, self.wrangler.domains[0])
+            D = _build_diag_block(D, indices.get(self.queue))
+        else:
+            D = self._evaluate_near_diagonal(level + 1)
+
+        # }}}
+
+        S = None
+        if level == 0:
+            # at level 0, the first diagonal block is moved into `S` and
+            # replaced by zeros
+            S = D[0, 0]
+            D[0, 0] = np.zeros_like(S)
+        elif self.matrix_mode == 'backward':
+            # NOTE: the builtin `object` replaces the `np.object` alias,
+            # which was removed from recent numpy versions
+            S = np.zeros(D.shape, dtype=object)
+            for i in range(indices.nblocks):
+                # NOTE: `D` holds the inverse diagonal blocks from here on
+                D[i, i] = la.inv(D[i, i])
+                S[i, i] = la.inv(R[i, i].dot(D[i, i].dot(L[i, i])))
+
+        return CompressedMatrixLevel(
+                level=level,
+                L=L, S=S, R=R, D=D,
+                indices=indices, sklindices=sklindices)
+
+    def compress(self):
+        """Compress all the levels, starting from the last (leaf) level and
+        ending with level 0, and store them in :attr:`levels`.
+
+        :return: *self*, with all the index ranges transferred to the host.
+        """
+        # recursively compress matrix
+        # NOTE: the builtin `object` replaces the `np.object` alias, which
+        # was removed from recent numpy versions
+        self.levels = np.empty(self.nlevels, dtype=object)
+        for i in range(self.nlevels - 1, -1, -1):
+            self.levels[i] = self._compress_level(i)
+
+        # put all the indices on the host
+        for i in range(self.nlevels):
+            self.levels[i].indices = self.levels[i].indices.get(self.queue)
+            self.levels[i].sklindices = self.levels[i].sklindices.get(self.queue)
+
+        return self
+
+    def _rec_matvec(self, x, level):
+        """Apply the compressed matrix at *level* to the vector *x*.
+
+        The input is first restricted to the skeleton of each block using
+        ``R``, then the coarser levels are applied recursively (with a dense
+        product with ``S`` at level 0) and finally the result is extended
+        back using ``L`` and the diagonal contribution ``D`` is added.
+
+        :arg x: a :class:`numpy.ndarray` indexed by the column indices at
+            *level*.
+        :return: a :class:`numpy.ndarray` of the same dtype as *x*, indexed
+            by the row indices at *level*.
+        """
+        cmat = self.levels[level]
+        L = cmat.L
+        S = cmat.S
+        R = cmat.R
+        D = cmat.D
+
+        # {{{ downsample input
+
+        y = np.empty(cmat.sklshape[0], dtype=x.dtype)
+        for k, (i, j) in enumerate(
+                _level_ranges(cmat.sklindices.row, cmat.indices.col)):
+            y[i] = R[k, k].dot(x[j])
+
+        # }}}
+
+        # {{{ recurse and do a full matvec at the root level
+
+        if level > 0:
+            y = self._rec_matvec(y, level - 1)
+        else:
+            y = S.dot(y)
+
+        # }}}
+
+        # {{{ upsample output
+
+        # NOTE: the output is indexed by the row ranges, so it is sized by
+        # the number of rows
+        # NOTE(review): `D[k, k]` is applied to `x[i]`, i.e. `x` is indexed
+        # using the row ranges -- this looks like it assumes that the row
+        # and column ranges coincide; confirm
+        b = np.empty(cmat.shape[0], dtype=x.dtype)
+        for k, (i, j) in enumerate(
+                _level_ranges(cmat.indices.row, cmat.sklindices.col)):
+            b[i] = L[k, k].dot(y[j]) + D[k, k].dot(x[i])
+
+        # }}}
+
+        return b
+
+    def _rec_imatvec(self, b, level):
+        cmat = self.levels[level]
+        L = cmat.L
+        S = cmat.S
+        R = cmat.R
+        D = cmat.D
+
+        # {{{ downsample input
+
+        if level > 0:
+            y = np.empty(cmat.sklshape[0], dtype=b.dtype)
+            for k, (i, j) in enumerate(
+                    _level_ranges(cmat.sklindices.row, cmat.indices.col)):
+                y[i] = S[k, k].dot(R[k, k].dot(D[k, k].dot(b[j])))
+        else:
+            y = b
+
+        # }}}
+
+        # {{{ recurse and solve at root level
+
+        if level > 0:
+            z = self._rec_imatvec(y, level - 1)
+        else:
+            z = la.solve(S, y)
+
+        # }}}
+
+        # {{{ upsample output
+
+        if level > 0:
+            x = np.empty(b.size, dtype=b.dtype)
+            for k, (i, j) in enumerate(
+                    _level_ranges(cmat.indices.row, cmat.sklindices.col)):
+                x[i] = b[i] - L[k, k].dot(y[j]) \
+                            + L[k, k].dot(S[k, k].dot(z[j]))
+                x[i] = D[k, k].dot(x[i])
+        else:
+            x = z
+
+        # }}}
+
+        return x
+
+    def matvec(self, x):
+        if x.size != self.shape[1]:
+            raise ValueError("shapes {} and {} are not aligned.".format(
+                self.shape, x.shape))
+
+        if isinstance(x, cl.array.Array):
+            x = x.get(self.queue)
+
+        # NOTE: we assume the given x is indexed linearly in [0, nnodes - 1],
+        # but the indices we have are not necessarily like that, so this
+        # reshuffles it to match the stored indexing
+        x = x[self.levels[-1].indices.col.indices]
+
+        if self.matrix_mode == 'forward':
+            b = self._rec_matvec(x, self.nlevels - 1)
+        else:
+            b = self._rec_imatvec(x, self.nlevels - 1)
+
+        b = b[np.argsort(self.levels[-1].indices.row.indices)]
+
+        return b
+
+    def __matmul__(self, x):
+        return self.matvec(x)
+
+    def dot(self, x):
+        return self.matvec(x)
+
+
+def build_compressed_matrix(queue,
+        places, exprs, input_exprs, domains=None,
+        auto_where=None, context=None,
+        matrix_mode=None, id_eps=None,
+
+        # debugging
+        _max_level=None,
+        _id_rank=None,
+        _proxy_radius_factor=None,
+        _proxy_approx_count=None,
+        _tree_kind="adaptive-level-restricted",
+        _tree_max_particles_in_box=None,
+        _weighted_farfield=None,
+        _nearfield_block_builder=None,
+        _farfield_block_builder=None):
+    """Implements matrix compression based on skeletonization, similar to
+    the methods described in [ho-greengard]_ and [martinsson-rokhlin]_.
+
+    .. [ho-greengard] K. L. Ho, L. Greengard,
+        "A Fast Direct Solver for Structured Linear Systems by Recursive
+        Skeletonization", SIAM J. Sci. Comp., Vol. 34, No. 5, 2012.
+        https://doi.org/10.1137/120866683
+
+    .. [martinsson-rokhlin] P.-G. Martinsson, V. Rokhlin,
+        "A Fast Direct Solver for Boundary Integral Equations in Two
+        Dimensions", JCP, Vol. 205, 2005.
+        https://doi.org/10.1016/j.jcp.2004.10.033
+
+    :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`.
+        Alternatively, any list or mapping that is a valid argument for
+        its constructor can also be used.
+    :arg exprs: an array of expressions corresponding to the output block
+        rows of the matrix.
+    :arg input_exprs: an array of expressions corresponding to the input
+        block columns of the matrix.
+    :arg domains: a list of discretization identifiers (from *places*) or
+        *None* (indicating the default domain).
+
+    :arg matrix_mode: *'forward'* or *'backward'*, used to construct the
+        compressed operator or its inverse.
+    :arg id_eps: desired compressed block tolerance.
+
+    :returns: an object supporting matrix-vector multiplication in the
+        following forms:
+
+        * using the ``matvec`` or ``dot`` methods, similar to
+          :class:`scipy.sparse.linalg.LinearOperator`.
+        * using ``@`` in recent versions of Python.
+    """
+
+    # {{{ validate parameters
+
+    from pytential.symbolic.execution import GeometryCollection
+    if not isinstance(places, GeometryCollection):
+        places = GeometryCollection(places, auto_where=auto_where)
+
+    if not is_obj_array(exprs):
+        exprs = make_obj_array([exprs])
+
+    try:
+        input_exprs = list(input_exprs)
+    except TypeError:
+        input_exprs = [input_exprs]
+
+    if len(exprs) != 1 or len(input_exprs) != 1:
+        raise NotImplementedError("only scalar operators are implemented")
+
+    from pytential.symbolic.execution import _prepare_auto_where
+    auto_where = _prepare_auto_where(auto_where, places)
+    from pytential.symbolic.execution import _prepare_domains
+    domains = _prepare_domains(len(input_exprs), places, domains, auto_where[0])
+
+    if context is None:
+        context = {}
+
+    if matrix_mode is None:
+        matrix_mode = "forward"
+
+    if matrix_mode not in ["forward", "backward"]:
+        raise ValueError("unknown matrix mode '{}'".format(matrix_mode))
+
+    if id_eps is None:
+        # NOTE: matches the default tolerance for :func:`pytential.solve.gmres`
+        # in practice (additional errors from compression)
+        id_eps = 1.0e-7
+
+    if _proxy_approx_count is None:
+        # NOTE: suggested in [gym2012] just before Remark 6.3 on page 24.
+        #
+        # [gym2012] A. Gillman, P. M. Young, P. G. Martinsson, A Direct
+        # Solver with O(N) Complexity for Integral Equations in 1D, 2012.
+        _proxy_approx_count = int(-3.0 * np.log(id_eps))
+
+    if _tree_max_particles_in_box is None:
+        # NOTE: just an arbitrary value that matches :func:`partition_by_nodes`
+        _tree_max_particles_in_box = 32
+
+    # }}}
+
+    # {{{ build helpers
+
+    proxy = ProxyGenerator(places,
+            approx_nproxy=_proxy_approx_count,
+            radius_factor=_proxy_radius_factor)
+
+    wrangler = BlockEvaluationWrangler(
+            exprs, input_exprs, domains,
+            context=context,
+            weighted_farfield=_weighted_farfield,
+            farfield_block_builder=_farfield_block_builder,
+            nearfield_block_builder=_nearfield_block_builder)
+
+    # }}}
+
+    # {{{ build and compress matrix
+
+    mat = CompressedMatrixBuilder(queue, places, proxy, wrangler,
+            matrix_mode=matrix_mode,
+            id_eps=id_eps,
+            id_rank=_id_rank,
+            max_level=_max_level,
+            tree_kind=_tree_kind,
+            tree_max_particles_in_box=_tree_max_particles_in_box)
+
+    mat = mat.compress()
+
+    # }}}
+
+    return mat
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py
deleted file mode 100644
index 9fd26658a9e91fce4f3c41edf5aa7b698171ac41..0000000000000000000000000000000000000000
--- a/pytential/linalg/proxy.py
+++ /dev/null
@@ -1,616 +0,0 @@
-from __future__ import division, absolute_import
-
-__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-import numpy as np
-import numpy.linalg as la
-
-import pyopencl as cl
-import pyopencl.array # noqa
-from pyopencl.array import to_device
-
-from pytools.obj_array import make_obj_array
-from pytools import memoize_method, memoize
-from sumpy.tools import BlockIndexRanges
-
-import loopy as lp
-from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-
-
-__doc__ = """
-Proxy Point Generation
-~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: ProxyGenerator
-
-.. autofunction:: partition_by_nodes
-.. autofunction:: partition_from_coarse
-
-.. autofunction:: gather_block_neighbor_points
-.. autofunction:: gather_block_interaction_points
-"""
-
-
-# {{{ point index partitioning
-
-def _element_node_range(group, ielement):
-    istart = group.node_nr_base + group.nunit_nodes * ielement
-    iend = group.node_nr_base + group.nunit_nodes * (ielement + 1)
-
-    return np.arange(istart, iend)
-
-
-def partition_by_nodes(discr,
-                       use_tree=True,
-                       max_nodes_in_box=None):
-    """Generate equally sized ranges of nodes. The partition is created at the
-    lowest level of granularity, i.e. nodes. This results in balanced ranges
-    of points, but will split elements across different ranges.
-
-    :arg discr: a :class:`meshmode.discretization.Discretization`.
-    :arg use_tree: if ``True``, node partitions are generated using a
-        :class:`boxtree.TreeBuilder`, which leads to geometrically close
-        points to belong to the same partition. If ``False``, a simple linear
-        partition is constructed.
-    :arg max_nodes_in_box: passed to :class:`boxtree.TreeBuilder`.
-
-    :return: a :class:`sumpy.tools.BlockIndexRanges`.
-    """
-
-    if max_nodes_in_box is None:
-        # FIXME: this is just an arbitrary value
-        max_nodes_in_box = 32
-
-    with cl.CommandQueue(discr.cl_context) as queue:
-        if use_tree:
-            from boxtree import box_flags_enum
-            from boxtree import TreeBuilder
-
-            builder = TreeBuilder(discr.cl_context)
-
-            tree, _ = builder(queue, discr.nodes(),
-                max_particles_in_box=max_nodes_in_box)
-
-            tree = tree.get(queue)
-            leaf_boxes, = (tree.box_flags
-                           & box_flags_enum.HAS_CHILDREN == 0).nonzero()
-
-            indices = np.empty(len(leaf_boxes), dtype=np.object)
-            for i, ibox in enumerate(leaf_boxes):
-                box_start = tree.box_source_starts[ibox]
-                box_end = box_start + tree.box_source_counts_cumul[ibox]
-                indices[i] = tree.user_source_ids[box_start:box_end]
-
-            ranges = to_device(queue,
-                np.cumsum([0] + [box.shape[0] for box in indices]))
-            indices = to_device(queue, np.hstack(indices))
-        else:
-            indices = cl.array.arange(queue, 0, discr.nnodes,
-                                      dtype=np.int)
-            ranges = cl.array.arange(queue, 0, discr.nnodes + 1,
-                                     discr.nnodes // max_nodes_in_box,
-                                     dtype=np.int)
-        assert ranges[-1] == discr.nnodes
-
-        return BlockIndexRanges(discr.cl_context,
-                                indices.with_queue(None),
-                                ranges.with_queue(None))
-
-
-def partition_from_coarse(resampler, from_indices):
-    """Generate a partition of nodes from an existing partition on a
-    coarser discretization. The new partition is generated based on element
-    refinement relationships in *resampler*, so the existing partition
-    needs to be created using :func:`partition_by_elements`,
-    since we assume that each range contains all the nodes in an element.
-
-    The new partition will have the same number of ranges as the old partition.
-    The nodes inside each range in the new partition are all the nodes in
-    *resampler.to_discr* that were refined from elements in the same
-    range from *resampler.from_discr*.
-
-    :arg resampler: a
-        :class:`meshmode.discretization.connection.DirectDiscretizationConnection`.
-    :arg from_indices: a :class:`sumpy.tools.BlockIndexRanges`.
-
-    :return: a :class:`sumpy.tools.BlockIndexRanges`.
-    """
-
-    if not hasattr(resampler, "groups"):
-        raise ValueError("resampler must be a DirectDiscretizationConnection.")
-
-    with cl.CommandQueue(resampler.cl_context) as queue:
-        from_indices = from_indices.get(queue)
-
-        # construct ranges
-        from_discr = resampler.from_discr
-        from_grp_ranges = np.cumsum(
-            [0] + [grp.nelements for grp in from_discr.mesh.groups])
-        from_el_ranges = np.hstack([
-            np.arange(grp.node_nr_base, grp.nnodes + 1, grp.nunit_nodes)
-            for grp in from_discr.groups])
-
-        # construct coarse element arrays in each from_range
-        el_indices = np.empty(from_indices.nblocks, dtype=np.object)
-        el_ranges = np.full(from_grp_ranges[-1], -1, dtype=np.int)
-        for i in range(from_indices.nblocks):
-            ifrom = from_indices.block_indices(i)
-            el_indices[i] = np.unique(np.digitize(ifrom, from_el_ranges)) - 1
-            el_ranges[el_indices[i]] = i
-        el_indices = np.hstack(el_indices)
-
-        # construct lookup table
-        to_el_table = [np.full(g.nelements, -1, dtype=np.int)
-                       for g in resampler.to_discr.groups]
-
-        for igrp, grp in enumerate(resampler.groups):
-            for batch in grp.batches:
-                to_el_table[igrp][batch.to_element_indices.get(queue)] = \
-                    from_grp_ranges[igrp] + batch.from_element_indices.get(queue)
-
-        # construct fine node index list
-        indices = [np.empty(0, dtype=np.int)
-                   for _ in range(from_indices.nblocks)]
-        for igrp in range(len(resampler.groups)):
-            to_element_indices = \
-                    np.where(np.isin(to_el_table[igrp], el_indices))[0]
-
-            for i, j in zip(el_ranges[to_el_table[igrp][to_element_indices]],
-                            to_element_indices):
-                indices[i] = np.hstack([indices[i],
-                    _element_node_range(resampler.to_discr.groups[igrp], j)])
-
-        ranges = to_device(queue,
-                np.cumsum([0] + [b.shape[0] for b in indices]))
-        indices = to_device(queue, np.hstack(indices))
-
-        return BlockIndexRanges(resampler.cl_context,
-                                indices.with_queue(None),
-                                ranges.with_queue(None))
-
-# }}}
-
-
-# {{{ proxy point generator
-
-def _generate_unit_sphere(ambient_dim, approx_npoints):
-    """Generate uniform points on a unit sphere.
-
-    :arg ambient_dim: dimension of the ambient space.
-    :arg approx_npoints: approximate number of points to generate. If the
-        ambient space is 3D, this will not generate the exact number of points.
-    :return: array of shape ``(ambient_dim, npoints)``, where ``npoints``
-        will not generally be the same as ``approx_npoints``.
-    """
-
-    if ambient_dim == 2:
-        t = np.linspace(0.0, 2.0 * np.pi, approx_npoints)
-        points = np.vstack([np.cos(t), np.sin(t)])
-    elif ambient_dim == 3:
-        # https://www.cmu.edu/biolphys/deserno/pdf/sphere_equi.pdf
-        # code by Matt Wala from
-        # https://github.com/mattwala/gigaqbx-accuracy-experiments/blob/d56ed063ffd7843186f4fe05d2a5b5bfe6ef420c/translation_accuracy.py#L23
-        a = 4.0 * np.pi / approx_npoints
-        m_theta = int(np.round(np.pi / np.sqrt(a)))
-        d_theta = np.pi / m_theta
-        d_phi = a / d_theta
-
-        points = []
-        for m in range(m_theta):
-            theta = np.pi * (m + 0.5) / m_theta
-            m_phi = int(np.round(2.0 * np.pi * np.sin(theta) / d_phi))
-
-            for n in range(m_phi):
-                phi = 2.0 * np.pi * n / m_phi
-                points.append(np.array([np.sin(theta) * np.cos(phi),
-                                        np.sin(theta) * np.sin(phi),
-                                        np.cos(theta)]))
-
-        for i in range(ambient_dim):
-            for sign in [-1, 1]:
-                pole = np.zeros(ambient_dim)
-                pole[i] = sign
-                points.append(pole)
-
-        points = np.array(points).T
-    else:
-        raise ValueError("ambient_dim > 3 not supported.")
-
-    return points
-
-
-class ProxyGenerator(object):
-    r"""
-    .. attribute:: ambient_dim
-    .. attribute:: nproxy
-
-        Number of proxy points in a single proxy ball.
-
-    .. attribute:: source
-
-        A :class:`pytential.qbx.QBXLayerPotentialSource`.
-
-    .. attribute:: ratio
-
-        A ratio used to compute the proxy ball radius. The radius
-        is computed in the :math:`\ell^2` norm, resulting in a circle or
-        sphere of proxy points. For QBX, we have two radii of interest
-        for a set of points: the radius :math:`r_{block}` of the
-        smallest ball containing all the points and the radius
-        :math:`r_{qbx}` of the smallest ball containing all the QBX
-        expansion balls in the block. If the ratio :math:`\theta \in
-        [0, 1]`, then the radius of the proxy ball is
-
-        .. math::
-
-            r = (1 - \theta) r_{block} + \theta r_{qbx}.
-
-        If the ratio :math:`\theta > 1`, the the radius is simply
-
-        .. math::
-
-            r = \theta r_{qbx}.
-
-    .. attribute:: ref_points
-
-        Reference points on a unit ball. Can be used to construct the points
-        of a proxy ball :math:`i` by translating them to ``center[i]`` and
-        scaling by ``radii[i]``, as obtained by :meth:`__call__`.
-
-    .. automethod:: __call__
-    """
-
-    def __init__(self, source, approx_nproxy=None, ratio=None):
-        self.source = source
-        self.ambient_dim = source.density_discr.ambient_dim
-        self.ratio = 1.1 if ratio is None else ratio
-
-        approx_nproxy = 32 if approx_nproxy is None else approx_nproxy
-        self.ref_points = \
-                _generate_unit_sphere(self.ambient_dim, approx_nproxy)
-
-    @property
-    def nproxy(self):
-        return self.ref_points.shape[1]
-
-    @memoize_method
-    def get_kernel(self):
-        if self.ratio < 1.0:
-            radius_expr = "(1.0 - {ratio}) * rblk + {ratio} * rqbx"
-        else:
-            radius_expr = "{ratio} * rqbx"
-        radius_expr = radius_expr.format(ratio=self.ratio)
-
-        # NOTE: centers of mass are computed using a second-order approximation
-        knl = lp.make_kernel([
-            "{[irange]: 0 <= irange < nranges}",
-            "{[i]: 0 <= i < npoints}",
-            "{[idim]: 0 <= idim < dim}"
-            ],
-            ["""
-            for irange
-                <> ioffset = srcranges[irange]
-                <> npoints = srcranges[irange + 1] - srcranges[irange]
-
-                proxy_center[idim, irange] = 1.0 / npoints * \
-                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
-                        {{dup=idim:i}}
-
-                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
-                        (proxy_center[idim, irange] -
-                         sources[idim, srcindices[i + ioffset]]) ** 2)))
-
-                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
-                        (proxy_center[idim, irange] -
-                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
-                         expansion_radii[srcindices[i + ioffset]])
-                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
-                        (proxy_center[idim, irange] -
-                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
-                         expansion_radii[srcindices[i + ioffset]])
-                <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext
-
-                proxy_radius[irange] = {radius_expr}
-            end
-            """.format(radius_expr=radius_expr)],
-            [
-                lp.GlobalArg("sources", None,
-                    shape=(self.ambient_dim, "nsources")),
-                lp.GlobalArg("center_int", None,
-                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
-                lp.GlobalArg("center_ext", None,
-                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
-                lp.GlobalArg("proxy_center", None,
-                    shape=(self.ambient_dim, "nranges")),
-                lp.GlobalArg("proxy_radius", None,
-                    shape="nranges"),
-                lp.ValueArg("nsources", np.int),
-                "..."
-            ],
-            name="find_proxy_radii_knl",
-            assumptions="dim>=1 and nranges>=1",
-            fixed_parameters=dict(dim=self.ambient_dim),
-            lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-        knl = lp.tag_inames(knl, "idim*:unr")
-        return knl
-
-    @memoize_method
-    def get_optimized_kernel(self):
-        knl = self.get_kernel()
-        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
-
-        return knl
-
-    def __call__(self, queue, indices, **kwargs):
-        """Generate proxy points for each given range of source points in
-        the discretization in :attr:`source`.
-
-        :arg queue: a :class:`pyopencl.CommandQueue`.
-        :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
-
-        :return: a tuple of ``(proxies, pxyranges, pxycenters, pxyranges)``,
-            where each element is a :class:`pyopencl.array.Array`. The
-            sizes of the arrays are as follows: ``pxycenters`` is of size
-            ``(2, nranges)``, ``pxyradii`` is of size ``(nranges,)``,
-            ``pxyranges`` is of size ``(nranges + 1,)`` and ``proxies`` is
-            of size ``(2, nranges * nproxy)``. The proxy points in a range
-            :math:`i` can be obtained by a slice
-            ``proxies[pxyranges[i]:pxyranges[i + 1]]`` and are all at a
-            distance ``pxyradii[i]`` from the range center ``pxycenters[i]``.
-        """
-
-        def _affine_map(v, A, b):
-            return np.dot(A, v) + b
-
-        from pytential import bind, sym
-        radii = bind(self.source,
-                sym.expansion_radii(self.source.ambient_dim))(queue)
-        center_int = bind(self.source,
-                sym.expansion_centers(self.source.ambient_dim, -1))(queue)
-        center_ext = bind(self.source,
-                sym.expansion_centers(self.source.ambient_dim, +1))(queue)
-
-        knl = self.get_kernel()
-        _, (centers_dev, radii_dev,) = knl(queue,
-            sources=self.source.density_discr.nodes(),
-            center_int=center_int,
-            center_ext=center_ext,
-            expansion_radii=radii,
-            srcindices=indices.indices,
-            srcranges=indices.ranges, **kwargs)
-        centers = centers_dev.get()
-        radii = radii_dev.get()
-
-        proxies = np.empty(indices.nblocks, dtype=np.object)
-        for i in range(indices.nblocks):
-            proxies[i] = _affine_map(self.ref_points,
-                    A=(radii[i] * np.eye(self.ambient_dim)),
-                    b=centers[:, i].reshape(-1, 1))
-
-        pxyranges = cl.array.arange(queue,
-                0,
-                proxies.shape[0] * proxies[0].shape[1] + 1,
-                proxies[0].shape[1],
-                dtype=indices.ranges.dtype)
-        proxies = make_obj_array([
-            cl.array.to_device(queue, np.hstack([p[idim] for p in proxies]))
-            for idim in range(self.ambient_dim)])
-        centers = make_obj_array([
-            centers_dev[idim].with_queue(queue).copy()
-            for idim in range(self.ambient_dim)])
-
-        assert pxyranges[-1] == proxies[0].shape[0]
-        return proxies, pxyranges, centers, radii_dev
-
-
-def gather_block_neighbor_points(discr, indices, pxycenters, pxyradii,
-                                 max_nodes_in_box=None):
-    """Generate a set of neighboring points for each range of points in
-    *discr*. Neighboring points of a range :math:`i` are defined
-    as all the points inside the proxy ball :math:`i` that do not also
-    belong to the range itself.
-
-    :arg discr: a :class:`meshmode.discretization.Discretization`.
-    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
-    :arg pxycenters: an array containing the center of each proxy ball.
-    :arg pxyradii: an array containing the radius of each proxy ball.
-
-    :return: a :class:`sumpy.tools.BlockIndexRanges`.
-    """
-
-    if max_nodes_in_box is None:
-        # FIXME: this is a fairly arbitrary value
-        max_nodes_in_box = 32
-
-    with cl.CommandQueue(discr.cl_context) as queue:
-        indices = indices.get(queue)
-
-        # NOTE: this is constructed for multiple reasons:
-        #   * TreeBuilder takes object arrays
-        #   * `srcindices` can be a small subset of nodes, so this will save
-        #   some work
-        #   * `srcindices` may reorder the array returned by nodes(), so this
-        #   makes sure that we have the same order in tree.user_source_ids
-        #   and friends
-        sources = discr.nodes().get(queue)
-        sources = make_obj_array([
-            cl.array.to_device(queue, sources[idim, indices.indices])
-            for idim in range(discr.ambient_dim)])
-
-        # construct tree
-        from boxtree import TreeBuilder
-        builder = TreeBuilder(discr.cl_context)
-        tree, _ = builder(queue, sources,
-                          max_particles_in_box=max_nodes_in_box)
-
-        from boxtree.area_query import AreaQueryBuilder
-        builder = AreaQueryBuilder(discr.cl_context)
-        query, _ = builder(queue, tree, pxycenters, pxyradii)
-
-        # find nodes inside each proxy ball
-        tree = tree.get(queue)
-        query = query.get(queue)
-
-        if isinstance(pxycenters[0], cl.array.Array):
-            pxycenters = np.vstack([pxycenters[idim].get(queue)
-                                    for idim in range(discr.ambient_dim)])
-        if isinstance(pxyradii, cl.array.Array):
-            pxyradii = pxyradii.get(queue)
-
-        nbrindices = np.empty(indices.nblocks, dtype=np.object)
-        for iproxy in range(indices.nblocks):
-            # get list of boxes intersecting the current ball
-            istart = query.leaves_near_ball_starts[iproxy]
-            iend = query.leaves_near_ball_starts[iproxy + 1]
-            iboxes = query.leaves_near_ball_lists[istart:iend]
-
-            # get nodes inside the boxes
-            istart = tree.box_source_starts[iboxes]
-            iend = istart + tree.box_source_counts_cumul[iboxes]
-            isources = np.hstack([np.arange(s, e)
-                                  for s, e in zip(istart, iend)])
-            nodes = np.vstack([tree.sources[idim][isources]
-                               for idim in range(discr.ambient_dim)])
-            isources = tree.user_source_ids[isources]
-
-            # get nodes inside the ball but outside the current range
-            center = pxycenters[:, iproxy].reshape(-1, 1)
-            radius = pxyradii[iproxy]
-            mask = ((la.norm(nodes - center, axis=0) < radius)
-                    & ((isources < indices.ranges[iproxy])
-                        | (indices.ranges[iproxy + 1] <= isources)))
-
-            nbrindices[iproxy] = indices.indices[isources[mask]]
-
-        nbrranges = to_device(queue,
-                np.cumsum([0] + [n.shape[0] for n in nbrindices]))
-        nbrindices = to_device(queue, np.hstack(nbrindices))
-
-        return BlockIndexRanges(discr.cl_context,
-                                nbrindices.with_queue(None),
-                                nbrranges.with_queue(None))
-
-
-def gather_block_interaction_points(source, indices,
-                                    ratio=None,
-                                    approx_nproxy=None,
-                                    max_nodes_in_box=None):
-    """Generate sets of interaction points for each given range of indices
-    in the *source* discretization. For each input range of indices,
-    the corresponding output range of points is consists of:
-
-    - a set of proxy points (or balls) around the range, which
-      model farfield interactions. These are constructed using
-      :class:`ProxyGenerator`.
-
-    - a set of neighboring points that are inside the proxy balls, but
-      do not belong to the given range, which model nearby interactions.
-      These are constructed with :func:`gather_block_neighbor_points`.
-
-    :arg source: a :class:`pytential.qbx.QBXLayerPotentialSource`.
-    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
-
-    :return: a tuple ``(nodes, ranges)``, where each value is a
-        :class:`pyopencl.array.Array`. For a range :math:`i`, we can
-        get the slice using ``nodes[ranges[i]:ranges[i + 1]]``.
-    """
-
-    @memoize
-    def knl():
-        loopy_knl = lp.make_kernel([
-            "{[irange, idim]: 0 <= irange < nranges and \
-                              0 <= idim < dim}",
-            "{[ipxy, ingb]: 0 <= ipxy < npxyblock and \
-                            0 <= ingb < nngbblock}"
-            ],
-            """
-            for irange
-                <> pxystart = pxyranges[irange]
-                <> pxyend = pxyranges[irange + 1]
-                <> npxyblock = pxyend - pxystart
-
-                <> ngbstart = nbrranges[irange]
-                <> ngbend = nbrranges[irange + 1]
-                <> nngbblock = ngbend - ngbstart
-
-                <> istart = pxyranges[irange] + nbrranges[irange]
-                nodes[idim, istart + ipxy] = \
-                    proxies[idim, pxystart + ipxy] \
-                    {id_prefix=write_pxy,nosync=write_ngb}
-                nodes[idim, istart + npxyblock + ingb] = \
-                    sources[idim, nbrindices[ngbstart + ingb]] \
-                    {id_prefix=write_ngb,nosync=write_pxy}
-                ranges[irange + 1] = ranges[irange] + npxyblock + nngbblock
-            end
-            """,
-            [
-                lp.GlobalArg("sources", None,
-                    shape=(source.ambient_dim, "nsources")),
-                lp.GlobalArg("proxies", None,
-                    shape=(source.ambient_dim, "nproxies"), dim_tags="sep,C"),
-                lp.GlobalArg("nbrindices", None,
-                    shape="nnbrindices"),
-                lp.GlobalArg("nodes", None,
-                    shape=(source.ambient_dim, "nproxies + nnbrindices")),
-                lp.ValueArg("nsources", np.int),
-                lp.ValueArg("nproxies", np.int),
-                lp.ValueArg("nnbrindices", np.int),
-                "..."
-            ],
-            name="concat_proxy_and_neighbors",
-            default_offset=lp.auto,
-            silenced_warnings="write_race(write_*)",
-            fixed_parameters=dict(dim=source.ambient_dim),
-            lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
-
-        return loopy_knl
-
-    with cl.CommandQueue(source.cl_context) as queue:
-        generator = ProxyGenerator(source,
-                                   ratio=ratio,
-                                   approx_nproxy=approx_nproxy)
-        proxies, pxyranges, pxycenters, pxyradii = generator(queue, indices)
-
-        neighbors = gather_block_neighbor_points(source.density_discr,
-                indices, pxycenters, pxyradii,
-                max_nodes_in_box=max_nodes_in_box)
-
-        ranges = cl.array.zeros(queue, indices.nblocks + 1, dtype=np.int)
-        _, (nodes, ranges) = knl()(queue,
-                sources=source.density_discr.nodes(),
-                proxies=proxies,
-                pxyranges=pxyranges,
-                nbrindices=neighbors.indices,
-                nbrranges=neighbors.ranges,
-                ranges=ranges)
-
-        return nodes.with_queue(None), ranges.with_queue(None)
-
-# }}}
-
-# vim: foldmethod=marker
diff --git a/pytential/log.py b/pytential/log.py
index 651afb4b5e3f6c50d0f85e7639dc333ae43dd4d6..ae49fa63fe69c970753163225db9f1be7253f1d2 100644
--- a/pytential/log.py
+++ b/pytential/log.py
@@ -50,8 +50,9 @@ LEVEL_TO_COLOR = {
 
 
 PYTENTIAL_LOG_FORMAT = (
-        "[$BOLD%(name)s$RESET][%(levelname)s]  %(message)s "
-        "($BOLD%(filename)s$RESET:%(lineno)d)"
+        "[$BOLD%(name)s$RESET][%(levelname)s] "
+        "($BOLD%(filename)s$RESET:%(lineno)d) "
+        " %(message)s"
 )
 
 # }}}
diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py
index f65c31d2cf948be729f63ed23b97b9958b39dcdf..a23dd3d711258cdf25387196c1c89145c062994d 100644
--- a/pytential/qbx/__init__.py
+++ b/pytential/qbx/__init__.py
@@ -27,7 +27,6 @@ import six
 
 import numpy as np
 from pytools import memoize_method
-from meshmode.discretization import Discretization
 from pytential.qbx.target_assoc import QBXTargetAssociationFailedException
 from pytential.source import LayerPotentialSourceBase
 
@@ -57,12 +56,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
     .. attribute :: fmm_order
 
     .. automethod :: __init__
-    .. automethod :: with_refinement
     .. automethod :: copy
 
-    .. attribute :: stage2_density_discr
-    .. attribute :: quad_stage2_density_discr
-
     See :ref:`qbxguts` for some information on the inner workings of this.
     """
 
@@ -74,14 +69,13 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             qbx_order=None,
             fmm_order=None,
             fmm_level_to_order=None,
-            to_refined_connection=None,
             expansion_factory=None,
             target_association_tolerance=_not_provided,
 
             # begin experimental arguments
             # FIXME default debug=False once everything has matured
             debug=True,
-            _refined_for_global_qbx=False,
+            _disable_refinement=False,
             _expansions_in_tree_have_extent=True,
             _expansion_stick_out_factor=0.5,
             _well_sep_is_n_away=2,
@@ -98,11 +92,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         """
         :arg fine_order: The total degree to which the (upsampled)
              underlying quadrature is exact.
-        :arg to_refined_connection: A connection used for resampling from
-             *density_discr* the fine density discretization.  It is assumed
-             that the fine density discretization given by
-             *to_refined_connection.to_discr* is *not* already upsampled. May
-             be *None*.
         :arg fmm_order: `False` for direct calculation. May not be given if
             *fmm_level_to_order* is given.
         :arg fmm_level_to_order: A function that takes arguments of
@@ -201,16 +190,13 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         self.target_association_tolerance = target_association_tolerance
         self.fmm_backend = fmm_backend
 
-        # Default values are lazily provided if these are None
-        self._to_refined_connection = to_refined_connection
-
         if expansion_factory is None:
             from sumpy.expansion import DefaultExpansionFactory
             expansion_factory = DefaultExpansionFactory()
         self.expansion_factory = expansion_factory
 
         self.debug = debug
-        self._refined_for_global_qbx = _refined_for_global_qbx
+        self._disable_refinement = _disable_refinement
         self._expansions_in_tree_have_extent = \
                 _expansions_in_tree_have_extent
         self._expansion_stick_out_factor = _expansion_stick_out_factor
@@ -242,7 +228,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             qbx_order=None,
             fmm_order=_not_provided,
             fmm_level_to_order=_not_provided,
-            to_refined_connection=None,
             expansion_factory=None,
             target_association_tolerance=_not_provided,
             _expansions_in_tree_have_extent=_not_provided,
@@ -257,7 +242,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             fmm_backend=None,
 
             debug=_not_provided,
-            _refined_for_global_qbx=_not_provided,
+            _disable_refinement=_not_provided,
             target_stick_out_factor=_not_provided,
             ):
 
@@ -305,19 +290,17 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                 qbx_order=qbx_order if qbx_order is not None else self.qbx_order,
 
                 target_association_tolerance=target_association_tolerance,
-                to_refined_connection=(
-                    to_refined_connection or self._to_refined_connection),
                 expansion_factory=(
                     expansion_factory or self.expansion_factory),
 
                 debug=(
                     # False is a valid value here
                     debug if debug is not _not_provided else self.debug),
-                _refined_for_global_qbx=(
+                _disable_refinement=(
                     # False is a valid value here
-                    _refined_for_global_qbx
-                    if _refined_for_global_qbx is not _not_provided
-                    else self._refined_for_global_qbx),
+                    _disable_refinement
+                    if _disable_refinement is not _not_provided
+                    else self._disable_refinement),
                 _expansions_in_tree_have_extent=(
                     # False is a valid value here
                     _expansions_in_tree_have_extent
@@ -352,84 +335,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
     # }}}
 
-    @property
-    def stage2_density_discr(self):
-        """The refined, interpolation-focused density discretization (no oversampling).
-        """
-        return (self._to_refined_connection.to_discr
-                if self._to_refined_connection is not None
-                else self.density_discr)
-
-    @property
-    @memoize_method
-    def refined_interp_to_ovsmp_quad_connection(self):
-        from meshmode.discretization.connection import make_same_mesh_connection
-
-        return make_same_mesh_connection(
-                self.quad_stage2_density_discr,
-                self.stage2_density_discr)
-
-    @property
-    @memoize_method
-    def quad_stage2_density_discr(self):
-        """The refined, quadrature-focused density discretization (with upsampling).
-        """
-        from meshmode.discretization.poly_element import (
-                QuadratureSimplexGroupFactory)
-
-        return Discretization(
-            self.density_discr.cl_context, self.stage2_density_discr.mesh,
-            QuadratureSimplexGroupFactory(self.fine_order),
-            self.real_dtype)
-
-    # {{{ weights and area elements
-
-    @memoize_method
-    def weights_and_area_elements(self):
-        from pytential import bind, sym
-        with cl.CommandQueue(self.cl_context) as queue:
-            return bind(self, sym.weights_and_area_elements(
-                self.ambient_dim,
-                dofdesc=sym.QBX_SOURCE_QUAD_STAGE2))(queue).with_queue(None)
-
-    # }}}
-
-    @property
-    @memoize_method
-    def resampler(self):
-        from meshmode.discretization.connection import \
-                ChainedDiscretizationConnection
-
-        conn = self.refined_interp_to_ovsmp_quad_connection
-
-        if self._to_refined_connection is not None:
-            return ChainedDiscretizationConnection(
-                    [self._to_refined_connection, conn])
-
-        return conn
-
-    @property
-    @memoize_method
-    def direct_resampler(self):
-        """
-        .. warning::
-
-            This always returns a
-            :class:`~meshmode.discretization.connection.DirectDiscretizationConnection`.
-            In case the geometry has been refined multiple times, a direct
-            connection can have a large number of groups and/or
-            interpolation batches, making it scale significantly worse than
-            the one returned by :attr:`resampler`.
-        """
-        from meshmode.discretization.connection import \
-                flatten_chained_connection
-
-        conn = self.resampler
-        with cl.CommandQueue(self.cl_context) as queue:
-            conn = flatten_chained_connection(queue, conn)
-
-        return conn
-
     @property
     @memoize_method
     def tree_code_container(self):
@@ -449,52 +354,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         return TargetAssociationCodeContainer(
                 self.cl_context, self.tree_code_container)
 
-    @memoize_method
-    def with_refinement(self, target_order=None, kernel_length_scale=None,
-            maxiter=None, visualize=False, refiner=None,
-            _expansion_disturbance_tolerance=None,
-            _force_stage2_uniform_refinement_rounds=None,
-            _scaled_max_curvature_threshold=None):
-        """
-        :arg refiner: If the mesh underlying :attr:`density_discr`
-            is itself the result of refinement, then its
-            :class:`meshmode.refinement.Refiner` instance may need to
-            be reused for continued refinement. This argument
-            provides the opportunity to pass in an existing refiner
-            that should be used for continued refinement.
-        :returns: a tuple ``(lpot_src, cnx)``, where ``lpot_src`` is a
-            :class:`QBXLayerPotentialSource` and ``cnx`` is a
-            :class:`meshmode.discretization.connection.DiscretizationConnection`
-            from the originally given to the refined geometry.
-        """
-        from pytential.qbx.refinement import refine_for_global_qbx
-
-        from meshmode.discretization.poly_element import (
-                InterpolatoryQuadratureSimplexGroupFactory)
-
-        if target_order is None:
-            target_order = self.density_discr.groups[0].order
-
-        with cl.CommandQueue(self.cl_context) as queue:
-            lpot, connection = refine_for_global_qbx(
-                    self,
-                    self.refiner_code_container.get_wrangler(queue),
-                    InterpolatoryQuadratureSimplexGroupFactory(target_order),
-                    kernel_length_scale=kernel_length_scale,
-                    maxiter=maxiter, visualize=visualize,
-                    expansion_disturbance_tolerance=_expansion_disturbance_tolerance,
-                    force_stage2_uniform_refinement_rounds=(
-                        _force_stage2_uniform_refinement_rounds),
-                    scaled_max_curvature_threshold=(
-                        _scaled_max_curvature_threshold),
-                    refiner=refiner)
-
-        return lpot, connection
-
     # {{{ internal API
 
     @memoize_method
-    def qbx_fmm_geometry_data(self, target_discrs_and_qbx_sides):
+    def qbx_fmm_geometry_data(self, places, name,
+            target_discrs_and_qbx_sides):
         """
         :arg target_discrs_and_qbx_sides:
             a tuple of *(discr, qbx_forced_limit)*
@@ -506,8 +370,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         """
         from pytential.qbx.geometry import QBXFMMGeometryData
 
-        return QBXFMMGeometryData(self.qbx_fmm_code_getter,
-                self, target_discrs_and_qbx_sides,
+        return QBXFMMGeometryData(places, name,
+                self.qbx_fmm_code_getter,
+                target_discrs_and_qbx_sides,
                 target_association_tolerance=self.target_association_tolerance,
                 tree_kind=self._tree_kind,
                 debug=self.debug)
@@ -595,7 +460,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
     def _dispatch_compute_potential_insn(self, queue, insn, bound_expr,
             evaluate, func, extra_args=None):
-        if not self._refined_for_global_qbx:
+        if self._disable_refinement:
             from warnings import warn
             warn(
                     "Executing global QBX without refinement. "
@@ -663,7 +528,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                 target_name_and_side_to_number[key] = \
                         len(target_discrs_and_qbx_sides)
 
-                target_discr = bound_expr.places.get_geometry(o.target_name)
+                target_discr = bound_expr.places.get_discretization(
+                        o.target_name.geometry, o.target_name.discr_stage)
                 if isinstance(target_discr, LayerPotentialSourceBase):
                     target_discr = target_discr.density_discr
 
@@ -690,7 +556,10 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         target_name_and_side_to_number, target_discrs_and_qbx_sides = (
                 self.get_target_discrs_and_qbx_sides(insn, bound_expr))
 
-        geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides)
+        geo_data = self.qbx_fmm_geometry_data(
+                bound_expr.places,
+                insn.source.geometry,
+                target_discrs_and_qbx_sides)
 
         # FIXME Exert more positive control over geo_data attribute lifetimes using
         # geo_data.<method>.clear_cache(geo_data).
@@ -701,8 +570,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         # FIXME don't compute *all* output kernels on all targets--respect that
         # some target discretizations may only be asking for derivatives (e.g.)
 
-        strengths = (evaluate(insn.density).with_queue(queue)
-                * self.weights_and_area_elements())
+        from pytential import bind, sym
+        waa = bind(bound_expr.places, sym.weights_and_area_elements(
+            self.ambient_dim, dofdesc=insn.source))(queue)
+        strengths = waa * evaluate(insn.density).with_queue(queue)
+
         out_kernels = tuple(knl for knl in insn.kernels)
         fmm_kernel = self.get_fmm_kernel(out_kernels)
         output_and_expansion_dtype = (
@@ -811,6 +683,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
     def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate,
             return_timing_data):
+        from pytential import bind, sym
         if return_timing_data:
             from pytential.source import UnableToCollectTimingData
             from warnings import warn
@@ -826,35 +699,38 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         for arg_name, arg_expr in six.iteritems(insn.kernel_arguments):
             kernel_args[arg_name] = evaluate(arg_expr)
 
-        strengths = (evaluate(insn.density).with_queue(queue)
-                * self.weights_and_area_elements())
+        waa = bind(bound_expr.places, sym.weights_and_area_elements(
+            self.ambient_dim, dofdesc=insn.source))(queue)
+        strengths = waa * evaluate(insn.density).with_queue(queue)
 
-        from pytential import bind, sym
-        expansion_radii = bind(self,
-                sym.expansion_radii(self.ambient_dim))(queue)
-        centers = {
-                -1: bind(self,
-                    sym.expansion_centers(self.ambient_dim, -1))(queue),
-                +1: bind(self,
-                    sym.expansion_centers(self.ambient_dim, +1))(queue)
-                }
+        source_discr = bound_expr.places.get_discretization(
+                insn.source.geometry, insn.source.discr_stage)
 
         # FIXME: Do this all at once
         result = []
         for o in insn.outputs:
-            target_discr = bound_expr.get_discretization(o.target_name)
-
-            is_self = self.density_discr is target_discr
+            source_dd = insn.source.copy(discr_stage=o.target_name.discr_stage)
+            target_discr = bound_expr.places.get_discretization(
+                    o.target_name.geometry, o.target_name.discr_stage)
+            density_discr = bound_expr.places.get_discretization(
+                    source_dd.geometry, source_dd.discr_stage)
 
+            is_self = density_discr is target_discr
             if is_self:
                 # QBXPreprocessor is supposed to have taken care of this
                 assert o.qbx_forced_limit is not None
                 assert abs(o.qbx_forced_limit) > 0
 
+                expansion_radii = bind(bound_expr.places, sym.expansion_radii(
+                    self.ambient_dim, dofdesc=o.target_name))(queue)
+                centers = bind(bound_expr.places, sym.expansion_centers(
+                    self.ambient_dim, o.qbx_forced_limit,
+                    dofdesc=o.target_name))(queue)
+
                 evt, output_for_each_kernel = lpot_applier(
                         queue, target_discr.nodes(),
-                        self.quad_stage2_density_discr.nodes(),
-                        centers[o.qbx_forced_limit],
+                        source_discr.nodes(),
+                        centers,
                         [strengths],
                         expansion_radii=expansion_radii,
                         **kernel_args)
@@ -869,17 +745,18 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
                 evt, output_for_each_kernel = p2p(queue,
                         target_discr.nodes(),
-                        self.quad_stage2_density_discr.nodes(),
+                        source_discr.nodes(),
                         [strengths], **kernel_args)
 
                 qbx_forced_limit = o.qbx_forced_limit
                 if qbx_forced_limit is None:
                     qbx_forced_limit = 0
 
+                target_discrs_and_qbx_sides = ((target_discr, qbx_forced_limit),)
                 geo_data = self.qbx_fmm_geometry_data(
-                        target_discrs_and_qbx_sides=(
-                            (target_discr, qbx_forced_limit),
-                        ))
+                        bound_expr.places,
+                        insn.source.geometry,
+                        target_discrs_and_qbx_sides=target_discrs_and_qbx_sides)
 
                 # center-related info is independent of targets
 
@@ -918,7 +795,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                     lpot_applier_on_tgt_subset(
                             queue,
                             targets=target_discr.nodes(),
-                            sources=self.quad_stage2_density_discr.nodes(),
+                            sources=source_discr.nodes(),
                             centers=geo_data.centers(),
                             expansion_radii=geo_data.expansion_radii(),
                             strengths=[strengths],
diff --git a/pytential/qbx/geometry.py b/pytential/qbx/geometry.py
index 72b054f52b47d0fdcc4ba91e638632a0d2e162a7..7a991ddfa2b17c6f5b6fa3dc81d1c8d9fdecab06 100644
--- a/pytential/qbx/geometry.py
+++ b/pytential/qbx/geometry.py
@@ -312,14 +312,19 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
 
     .. rubric :: Attributes
 
-    .. attribute:: code_getter
+    .. attribute:: places
 
-        The :class:`QBXFMMGeometryCodeGetter` for this object.
+        A :class:`~pytential.symbolic.execution.GeometryCollection`
+        containing the :class:`~pytential.qbx.QBXLayerPotentialSource`.
+
+    .. attribute:: source_dd
+
+        Descriptor (as returned by ``sym.as_dofdesc``) that identifies the
+        :class:`~pytential.qbx.QBXLayerPotentialSource` in :attr:`places`.
 
-    .. attribute:: lpot_source
+    .. attribute:: code_getter
 
-        The :class:`pytential.qbx.QBXLayerPotentialSource`
-        acting as the source geometry.
+        The :class:`QBXFMMGeometryCodeGetter` for this object.
 
     .. attribute:: target_discrs_and_qbx_sides
 
@@ -365,10 +370,11 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
     .. method:: m2l_rotation_angles()
     """
 
-    def __init__(self, code_getter, lpot_source,
+    def __init__(self, places, source_dd,
+            code_getter,
             target_discrs_and_qbx_sides,
             target_association_tolerance,
-            tree_kind, debug):
+            tree_kind, debug=None):
         """
         .. rubric:: Constructor arguments
 
@@ -380,14 +386,20 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
         :arg debug: a :class:`bool` flag for whether to enable
             potentially costly self-checks
         """
+        from pytential import sym
+        self.places = places
+        self.source_dd = sym.as_dofdesc(source_dd)
+        self.lpot_source = places.get_geometry(self.source_dd.geometry)
 
         self.code_getter = code_getter
-        self.lpot_source = lpot_source
-        self.target_discrs_and_qbx_sides = \
-                target_discrs_and_qbx_sides
+        self.target_discrs_and_qbx_sides = target_discrs_and_qbx_sides
         self.target_association_tolerance = target_association_tolerance
         self.tree_kind = tree_kind
-        self.debug = debug
+        self.debug = self.lpot_source.debug if debug is None else debug
+
+    @property
+    def ambient_dim(self):
+        return self.lpot_source.ambient_dim
 
     @property
     def cl_context(self):
@@ -395,7 +407,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
 
     @property
     def coord_dtype(self):
-        return self.lpot_source.quad_stage2_density_discr.nodes().dtype
+        return self.lpot_source.density_discr.real_dtype
 
     # {{{ centers/radii
 
@@ -409,13 +421,14 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
 
         ``coord_t [ambient_dim][ncenters]``
         """
+        from pytential import bind, sym
+        from pytools.obj_array import make_obj_array
 
         with cl.CommandQueue(self.cl_context) as queue:
-            from pytential.qbx.utils import get_interleaved_centers
-            from pytools.obj_array import make_obj_array
-            return make_obj_array([
-                ccomp.with_queue(None)
-                for ccomp in get_interleaved_centers(queue, self.lpot_source)])
+            centers = bind(self.places, sym.interleaved_expansion_centers(
+                self.ambient_dim,
+                dofdesc=self.source_dd.to_stage1()))(queue)
+            return make_obj_array([ax.with_queue(None) for ax in centers])
 
     @memoize_method
     def expansion_radii(self):
@@ -424,9 +437,13 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
 
         ``coord_t [ncenters]``
         """
+        from pytential import bind, sym
+
         with cl.CommandQueue(self.cl_context) as queue:
-            from pytential.qbx.utils import get_interleaved_radii
-            return get_interleaved_radii(queue, self.lpot_source)
+            return bind(self.places, sym.expansion_radii(
+                self.ambient_dim,
+                granularity=sym.GRANULARITY_CENTER,
+                dofdesc=self.source_dd.to_stage1()))(queue)
 
     # }}}
 
@@ -437,8 +454,6 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
         """Return a :class:`TargetInfo`. |cached|"""
 
         code_getter = self.code_getter
-        lpot_src = self.lpot_source
-
         with cl.CommandQueue(self.cl_context) as queue:
             ntargets = self.ncenters
             target_discr_starts = []
@@ -450,7 +465,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             target_discr_starts.append(ntargets)
 
             targets = cl.array.empty(
-                    self.cl_context, (lpot_src.ambient_dim, ntargets),
+                    self.cl_context, (self.ambient_dim, ntargets),
                     self.coord_dtype)
             code_getter.copy_targets_kernel()(
                     queue,
@@ -503,15 +518,19 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
         """
 
         code_getter = self.code_getter
-        lpot_src = self.lpot_source
+        lpot_source = self.lpot_source
         target_info = self.target_info()
 
         with cl.CommandQueue(self.cl_context) as queue:
-            nsources = lpot_src.quad_stage2_density_discr.nnodes
+            from pytential import sym
+            quad_stage2_discr = self.places.get_discretization(
+                    self.source_dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
+
+            nsources = quad_stage2_discr.nnodes
             nparticles = nsources + target_info.ntargets
 
             target_radii = None
-            if self.lpot_source._expansions_in_tree_have_extent:
+            if lpot_source._expansions_in_tree_have_extent:
                 target_radii = cl.array.zeros(queue, target_info.ntargets,
                         self.coord_dtype)
                 target_radii[:self.ncenters] = self.expansion_radii()
@@ -531,14 +550,14 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             refine_weights.finish()
 
             tree, _ = code_getter.build_tree()(queue,
-                    particles=lpot_src.quad_stage2_density_discr.nodes(),
+                    particles=quad_stage2_discr.nodes(),
                     targets=target_info.targets,
                     target_radii=target_radii,
-                    max_leaf_refine_weight=lpot_src._max_leaf_refine_weight,
+                    max_leaf_refine_weight=lpot_source._max_leaf_refine_weight,
                     refine_weights=refine_weights,
                     debug=self.debug,
-                    stick_out_factor=lpot_src._expansion_stick_out_factor,
-                    extent_norm=lpot_src._box_extent_norm,
+                    stick_out_factor=lpot_source._expansion_stick_out_factor,
+                    extent_norm=lpot_source._box_extent_norm,
                     kind=self.tree_kind)
 
             if self.debug:
@@ -612,7 +631,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             qbx_center_to_target_box = cl.array.empty(
                     queue, self.ncenters, tree.box_id_dtype)
 
-            if self.lpot_source.debug:
+            if self.debug:
                 qbx_center_to_target_box.fill(-1)
 
             evt, _ = qbx_center_to_target_box_lookup(
@@ -624,7 +643,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
                     user_target_from_tree_target=user_target_from_tree_target,
                     ncenters=self.ncenters)
 
-            if self.lpot_source.debug:
+            if self.debug:
                 assert 0 <= cl.array.min(qbx_center_to_target_box).get()
                 assert (
                         cl.array.max(qbx_center_to_target_box).get()
@@ -738,7 +757,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
         values from :class:`target_state` allowed. Targets occur in user order.
         """
         from pytential.qbx.target_assoc import associate_targets_to_qbx_centers
-        tgt_info = self.target_info()
+        target_info = self.target_info()
 
         from pytential.target import PointsTarget
 
@@ -747,7 +766,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
                     .target_side_preferences()[self.ncenters:].get(queue=queue))
 
             target_discrs_and_qbx_sides = [(
-                    PointsTarget(tgt_info.targets[:, self.ncenters:]),
+                    PointsTarget(target_info.targets[:, self.ncenters:]),
                     target_side_prefs.astype(np.int32))]
 
             target_association_wrangler = (
@@ -755,13 +774,15 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
                     .get_wrangler(queue))
 
             tgt_assoc_result = associate_targets_to_qbx_centers(
-                    self.lpot_source,
+                    self.places,
+                    self.source_dd,
                     target_association_wrangler,
                     target_discrs_and_qbx_sides,
                     target_association_tolerance=(
-                        self.target_association_tolerance))
+                        self.target_association_tolerance),
+                    debug=self.debug)
 
-            result = cl.array.empty(queue, tgt_info.ntargets,
+            result = cl.array.empty(queue, target_info.ntargets,
                     tgt_assoc_result.target_to_center.dtype)
             result[:self.ncenters].fill(target_state.NO_QBX_NEEDED)
             result[self.ncenters:] = tgt_assoc_result.target_to_center
@@ -872,6 +893,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             This only works for two-dimensional geometries.
         """
 
+        from pytential import sym
         import matplotlib.pyplot as pt
         pt.clf()
 
@@ -880,8 +902,12 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             raise ValueError("only 2-dimensional geometry info can be plotted")
 
         with cl.CommandQueue(self.cl_context) as queue:
+            stage2_density_discr = self.places.get_discretization(
+                    self.source_dd.geometry, sym.QBX_SOURCE_STAGE2)
+            quad_stage2_density_discr = self.places.get_discretization(
+                    self.source_dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
             from meshmode.discretization.visualization import draw_curve
-            draw_curve(self.lpot_source.quad_stage2_density_discr)
+            draw_curve(quad_stage2_density_discr)
 
             global_flags = self.global_qbx_flags().get(queue=queue)
 
@@ -967,7 +993,7 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface):
             #pt.legend()
             pt.savefig(
                     "geodata-stage2-nelem%d.pdf"
-                    % self.lpot_source.stage2_density_discr.mesh.nelements)
+                    % stage2_density_discr.mesh.nelements)
 
     # }}}
 
diff --git a/pytential/qbx/refinement.py b/pytential/qbx/refinement.py
index 5e1afca22e24766ec76510e78072f30b09d9f70f..b3c28ee6e7c843ced0a39bc27da6e2a1b503d3f2 100644
--- a/pytential/qbx/refinement.py
+++ b/pytential/qbx/refinement.py
@@ -82,7 +82,7 @@ Refiner driver
 
 .. autoclass:: RefinerWrangler
 
-.. autofunction:: refine_for_global_qbx
+.. autofunction:: refine_geometry_collection
 """
 
 # {{{ kernels
@@ -284,7 +284,7 @@ class RefinerWrangler(TreeWranglerBase):
 
     @log_process(logger)
     def check_expansion_disks_undisturbed_by_sources(self,
-            lpot_source, tree, peer_lists,
+            stage1_density_discr, tree, peer_lists,
             expansion_disturbance_tolerance,
             refine_flags,
             debug, wait_for=None):
@@ -309,9 +309,9 @@ class RefinerWrangler(TreeWranglerBase):
         unwrap_args = AreaQueryElementwiseTemplate.unwrap_args
 
         from pytential import bind, sym
-        center_danger_zone_radii = bind(lpot_source, sym.expansion_radii(
-            lpot_source.ambient_dim,
-            granularity=sym.GRANULARITY_CENTER))(self.queue)
+        center_danger_zone_radii = bind(stage1_density_discr,
+                sym.expansion_radii(stage1_density_discr.ambient_dim,
+                    granularity=sym.GRANULARITY_CENTER))(self.queue)
 
         evt = knl(
             *unwrap_args(
@@ -344,9 +344,9 @@ class RefinerWrangler(TreeWranglerBase):
         return found_panel_to_refine.get()[0] == 1
 
     @log_process(logger)
-    def check_sufficient_source_quadrature_resolution(
-            self, lpot_source, tree, peer_lists, refine_flags, debug,
-            wait_for=None):
+    def check_sufficient_source_quadrature_resolution(self,
+            stage2_density_discr, tree, peer_lists, refine_flags,
+            debug, wait_for=None):
 
         # Avoid generating too many kernels.
         from pytools import div_ceil
@@ -366,10 +366,10 @@ class RefinerWrangler(TreeWranglerBase):
         found_panel_to_refine.finish()
 
         from pytential import bind, sym
-        source_danger_zone_radii_by_panel = bind(lpot_source,
+        dd = sym.as_dofdesc(sym.GRANULARITY_ELEMENT).to_stage2()
+        source_danger_zone_radii_by_panel = bind(stage2_density_discr,
                 sym._source_danger_zone_radii(
-                    lpot_source.ambient_dim,
-                    dofdesc=sym.GRANULARITY_ELEMENT))(self.queue)
+                    stage2_density_discr.ambient_dim, dofdesc=dd))(self.queue)
         unwrap_args = AreaQueryElementwiseTemplate.unwrap_args
 
         evt = knl(
@@ -444,11 +444,13 @@ class RefinerWrangler(TreeWranglerBase):
 # }}}
 
 
+# {{{ stage1/stage2 refinement
+
 class RefinerNotConvergedWarning(UserWarning):
     pass
 
 
-def make_empty_refine_flags(queue, lpot_source, use_stage2_discr=False):
+def make_empty_refine_flags(queue, density_discr):
     """Return an array on the device suitable for use as element refine flags.
 
     :arg queue: An instance of :class:`pyopencl.CommandQueue`.
@@ -457,161 +459,150 @@ def make_empty_refine_flags(queue, lpot_source, use_stage2_discr=False):
     :returns: A :class:`pyopencl.array.Array` suitable for use as refine flags,
         initialized to zero.
     """
-    discr = (lpot_source.stage2_density_discr
-            if use_stage2_discr
-            else lpot_source.density_discr)
-    result = cl.array.zeros(queue, discr.mesh.nelements, np.int32)
+    result = cl.array.zeros(queue, density_discr.mesh.nelements, np.int32)
     result.finish()
     return result
 
 
-# {{{ main entry point
+def _warn_max_iterations(violated_criteria, expansion_disturbance_tolerance):
+    from warnings import warn
+    warn(
+            "QBX layer potential source refiner did not terminate "
+            "after %d iterations (the maximum). "
+            "You may call 'refine_geometry_collection()' manually "
+            "and pass 'visualize=True' to see what area of the geometry is "
+            "causing trouble. If the issue is disturbance of expansion disks, "
+            "you may pass a slightly increased value (currently: %g) for "
+            "'expansion_disturbance_tolerance'. As a last resort, "
+            "you may use Python's warning filtering mechanism to "
+            "not treat this warning as an error. The criteria triggering "
+            "refinement in each iteration were: %s. " % (
+                len(violated_criteria),
+                expansion_disturbance_tolerance,
+                ", ".join(
+                    "%d: %s" % (i+1, vc_text)
+                    for i, vc_text in enumerate(violated_criteria))),
+            RefinerNotConvergedWarning)
 
-def refine_for_global_qbx(lpot_source, wrangler,
-        group_factory, kernel_length_scale=None,
-        force_stage2_uniform_refinement_rounds=None,
-        scaled_max_curvature_threshold=None,
-        debug=None, maxiter=None,
-        visualize=None, expansion_disturbance_tolerance=None,
-        refiner=None):
-    """
-    Entry point for calling the refiner.
 
-    :arg lpot_source: An instance of :class:`QBXLayerPotentialSource`.
+def _visualize_refinement(queue, discr,
+        niter, stage_nr, stage_name, flags, visualize=False):
+    if not visualize:
+        return
 
-    :arg wrangler: An instance of :class:`RefinerWrangler`.
+    if stage_nr not in (1, 2):
+        raise ValueError("unexpected stage number")
 
-    :arg group_factory: An instance of
-        :class:`meshmode.mesh.discretization.ElementGroupFactory`. Used for
-        discretizing the coarse refined mesh.
+    flags = flags.get()
+    logger.info("for stage %s: splitting %d/%d stage-%d elements",
+            stage_name, np.sum(flags), discr.mesh.nelements, stage_nr)
 
-    :arg kernel_length_scale: The kernel length scale, or *None* if not
-        applicable. All panels are refined to below this size.
+    from meshmode.discretization.visualization import make_visualizer
+    vis = make_visualizer(queue, discr, 3)
 
-    :arg maxiter: The maximum number of refiner iterations.
+    assert len(flags) == discr.mesh.nelements
 
-    :returns: A tuple ``(lpot_source, *conn*)`` where ``lpot_source`` is the
-        refined layer potential source, and ``conn`` is a
-        :class:`meshmode.discretization.connection.DiscretizationConnection`
-        going from the original mesh to the refined mesh.
-    """
+    flags = flags.astype(bool)  # builtin bool: the np.bool alias is gone in NumPy 1.24
+    nodes_flags = np.zeros(discr.nnodes)
+    for grp in discr.groups:
+        meg = grp.mesh_el_group
+        grp.view(nodes_flags)[
+                flags[meg.element_nr_base:meg.nelements+meg.element_nr_base]] = 1
 
-    if maxiter is None:
-        maxiter = 10
+    nodes_flags = cl.array.to_device(queue, nodes_flags)
+    vis_data = [
+        ("refine_flags", nodes_flags),
+        ]
 
-    if debug is None:
-        # FIXME: Set debug=False by default once everything works.
-        debug = True
+    if 0:
+        from pytential import sym, bind
+        bdry_normals = bind(discr, sym.normal(discr.ambient_dim))(
+                queue).as_vector(dtype=object)
+        vis_data.append(("bdry_normals", bdry_normals),)
 
-    if expansion_disturbance_tolerance is None:
-        expansion_disturbance_tolerance = 0.025
+    vis.write_vtk_file("refinement-%s-%03d.vtu" % (stage_name, niter), vis_data)
 
-    if force_stage2_uniform_refinement_rounds is None:
-        force_stage2_uniform_refinement_rounds = 0
 
-    # TODO: Stop doing redundant checks by avoiding panels which no longer need
-    # refinement.
+def _make_quad_stage2_discr(lpot_source, stage2_density_discr):
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+            QuadratureSimplexGroupFactory
 
-    from meshmode.mesh.refinement import RefinerWithoutAdjacency
-    from meshmode.discretization.connection import (
-            ChainedDiscretizationConnection, make_same_mesh_connection)
+    return Discretization(
+            lpot_source.cl_context,
+            stage2_density_discr.mesh,
+            QuadratureSimplexGroupFactory(lpot_source.fine_order),
+            lpot_source.real_dtype)
 
-    if refiner is not None:
-        assert refiner.get_current_mesh() == lpot_source.density_discr.mesh
-    else:
-        # We may be handed a mesh that's already non-conforming, we don't rely
-        # on adjacency, and the no-adjacency refiner is faster.
-        refiner = RefinerWithoutAdjacency(lpot_source.density_discr.mesh)
 
-    connections = []
+def _make_temporary_collection(lpot_source,
+        stage1_density_discr=None,
+        stage2_density_discr=None):
+    from pytential import sym
+    from pytential import GeometryCollection
 
-    # {{{ first stage refinement
+    name = "_tmp_refine_source"
+    places = GeometryCollection(lpot_source, auto_where=name)
 
-    def visualize_refinement(niter, stage_nr, stage_name, flags):
-        if not visualize:
-            return
+    if stage1_density_discr is not None:
+        places._add_discr_to_cache(stage1_density_discr,
+                name, sym.QBX_SOURCE_STAGE1)
 
-        if stage_nr == 1:
-            discr = lpot_source.density_discr
-        elif stage_nr == 2:
-            discr = lpot_source.stage2_density_discr
-        else:
-            raise ValueError("unexpected stage number")
-
-        flags = flags.get()
-        logger.info("for stage %s: splitting %d/%d stage-%d elements",
-                stage_name, np.sum(flags), discr.mesh.nelements, stage_nr)
-
-        from meshmode.discretization.visualization import make_visualizer
-        vis = make_visualizer(wrangler.queue, discr, 3)
-
-        assert len(flags) == discr.mesh.nelements
-
-        flags = flags.astype(np.bool)
-        nodes_flags = np.zeros(discr.nnodes)
-        for grp in discr.groups:
-            meg = grp.mesh_el_group
-            grp.view(nodes_flags)[
-                    flags[meg.element_nr_base:meg.nelements+meg.element_nr_base]] = 1
-
-        nodes_flags = cl.array.to_device(wrangler.queue, nodes_flags)
-        vis_data = [
-            ("refine_flags", nodes_flags),
-            ]
-
-        if 0:
-            from pytential import sym, bind
-            bdry_normals = bind(discr, sym.normal(discr.ambient_dim))(
-                    wrangler.queue).as_vector(dtype=object)
-            vis_data.append(("bdry_normals", bdry_normals),)
-
-        vis.write_vtk_file("refinement-%s-%03d.vtu" % (stage_name, niter), vis_data)
-
-    def warn_max_iterations():
-        from warnings import warn
-        warn(
-                "QBX layer potential source refiner did not terminate "
-                "after %d iterations (the maximum). "
-                "You may pass 'visualize=True' to with_refinement() "
-                "to see what area of the geometry is causing trouble. "
-                "If the issue is disturbance of expansion disks, you may "
-                "pass a slightly increased value (currently: %g) for "
-                "_expansion_disturbance_tolerance in with_refinement(). "
-                "As a last resort, "
-                "you may use Python's warning filtering mechanism to "
-                "not treat this warning as an error. "
-                "The criteria triggering refinement in each iteration "
-                "were: %s. " % (
-                    len(violated_criteria),
-                    expansion_disturbance_tolerance,
-                    ", ".join(
-                        "%d: %s" % (i+1, vc_text)
-                        for i, vc_text in enumerate(violated_criteria))),
-                RefinerNotConvergedWarning)
+    if stage2_density_discr is not None:
+        quad_stage2_density_discr = \
+                _make_quad_stage2_discr(lpot_source, stage2_density_discr)
 
+        places._add_discr_to_cache(stage2_density_discr,
+                name, sym.QBX_SOURCE_STAGE2)
+        places._add_discr_to_cache(quad_stage2_density_discr,
+                name, sym.QBX_SOURCE_QUAD_STAGE2)
+
+    return places
+
+
+def _refine_qbx_stage1(lpot_source, density_discr,
+        wrangler, group_factory,
+        kernel_length_scale=None,
+        scaled_max_curvature_threshold=None,
+        expansion_disturbance_tolerance=None,
+        maxiter=None, debug=None, visualize=False):
+    from pytential import bind, sym
+    from meshmode.discretization.connection import ChainedDiscretizationConnection
+    if lpot_source._disable_refinement:
+        return density_discr, ChainedDiscretizationConnection([],
+                from_discr=density_discr)
+
+    from meshmode.mesh.refinement import RefinerWithoutAdjacency
+    refiner = RefinerWithoutAdjacency(density_discr.mesh)
+
+    # TODO: Stop doing redundant checks by avoiding panels which no longer need
+    # refinement.
+
+    connections = []
     violated_criteria = []
     iter_violated_criteria = ["start"]
-
     niter = 0
 
+    stage1_density_discr = density_discr
     while iter_violated_criteria:
         iter_violated_criteria = []
         niter += 1
 
         if niter > maxiter:
-            warn_max_iterations()
+            _warn_max_iterations(
+                    violated_criteria, expansion_disturbance_tolerance)
             break
 
-        refine_flags = make_empty_refine_flags(wrangler.queue, lpot_source)
+        refine_flags = make_empty_refine_flags(
+                wrangler.queue, stage1_density_discr)
 
         if kernel_length_scale is not None:
             with ProcessLogger(logger,
                     "checking kernel length scale to panel size ratio"):
 
-                from pytential import bind, sym
-                quad_resolution = bind(lpot_source, sym._quad_resolution(
-                    lpot_source.ambient_dim,
-                    dofdesc=sym.GRANULARITY_ELEMENT))(wrangler.queue)
+                quad_resolution = bind(stage1_density_discr,
+                        sym._quad_resolution(stage1_density_discr.ambient_dim,
+                            dofdesc=sym.GRANULARITY_ELEMENT))(wrangler.queue)
 
                 violates_kernel_length_scale = \
                         wrangler.check_element_prop_threshold(
@@ -621,16 +612,16 @@ def refine_for_global_qbx(lpot_source, wrangler,
 
                 if violates_kernel_length_scale:
                     iter_violated_criteria.append("kernel length scale")
-                    visualize_refinement(
-                            niter, 1, "kernel-length-scale", refine_flags)
+                    _visualize_refinement(wrangler.queue, stage1_density_discr,
+                            niter, 1, "kernel-length-scale", refine_flags,
+                            visualize=visualize)
 
         if scaled_max_curvature_threshold is not None:
             with ProcessLogger(logger,
                     "checking scaled max curvature threshold"):
-                from pytential import sym, bind
-                scaled_max_curv = bind(lpot_source,
-                    sym.ElementwiseMax(
-                        sym._scaled_max_curvature(lpot_source.ambient_dim),
+                scaled_max_curv = bind(stage1_density_discr,
+                    sym.ElementwiseMax(sym._scaled_max_curvature(
+                        stage1_density_discr.ambient_dim),
                         dofdesc=sym.GRANULARITY_ELEMENT))(wrangler.queue)
 
                 violates_scaled_max_curv = \
@@ -641,25 +632,32 @@ def refine_for_global_qbx(lpot_source, wrangler,
 
                 if violates_scaled_max_curv:
                     iter_violated_criteria.append("curvature")
-                    visualize_refinement(niter, 1, "curvature", refine_flags)
+                    _visualize_refinement(wrangler.queue, stage1_density_discr,
+                            niter, 1, "curvature", refine_flags,
+                            visualize=visualize)
 
         if not iter_violated_criteria:
             # Only start building trees once the simple length-based criteria
             # are happy.
+            places = _make_temporary_collection(lpot_source,
+                    stage1_density_discr=stage1_density_discr)
 
             # Build tree and auxiliary data.
             # FIXME: The tree should not have to be rebuilt at each iteration.
-            tree = wrangler.build_tree(lpot_source)
+            tree = wrangler.build_tree(places,
+                    sources_list=[places.auto_source.geometry])
             peer_lists = wrangler.find_peer_lists(tree)
 
             has_disturbed_expansions = \
                     wrangler.check_expansion_disks_undisturbed_by_sources(
-                            lpot_source, tree, peer_lists,
+                            stage1_density_discr, tree, peer_lists,
                             expansion_disturbance_tolerance,
                             refine_flags, debug)
             if has_disturbed_expansions:
                 iter_violated_criteria.append("disturbed expansions")
-                visualize_refinement(niter, 1, "disturbed-expansions", refine_flags)
+                _visualize_refinement(wrangler.queue, stage1_density_discr,
+                        niter, 1, "disturbed-expansions", refine_flags,
+                        visualize=visualize)
 
             del tree
             del peer_lists
@@ -668,44 +666,72 @@ def refine_for_global_qbx(lpot_source, wrangler,
             violated_criteria.append(" and ".join(iter_violated_criteria))
 
             conn = wrangler.refine(
-                    lpot_source.density_discr, refiner, refine_flags,
+                    stage1_density_discr, refiner, refine_flags,
                     group_factory, debug)
+            stage1_density_discr = conn.to_discr
             connections.append(conn)
-            lpot_source = lpot_source.copy(density_discr=conn.to_discr)
 
         del refine_flags
 
-    # }}}
+    conn = ChainedDiscretizationConnection(connections,
+            from_discr=density_discr)
 
-    # {{{ second stage refinement
+    return stage1_density_discr, conn
 
+
+def _refine_qbx_stage2(lpot_source, stage1_density_discr,
+        wrangler, group_factory,
+        expansion_disturbance_tolerance=None,
+        force_stage2_uniform_refinement_rounds=None,
+        maxiter=None, debug=None, visualize=False):
+    from meshmode.discretization.connection import ChainedDiscretizationConnection
+    if lpot_source._disable_refinement:
+        return stage1_density_discr, ChainedDiscretizationConnection([],
+                from_discr=stage1_density_discr)
+
+    from meshmode.mesh.refinement import RefinerWithoutAdjacency
+    refiner = RefinerWithoutAdjacency(stage1_density_discr.mesh)
+
+    # TODO: Stop doing redundant checks by avoiding panels which no longer need
+    # refinement.
+
+    connections = []
+    violated_criteria = []
     iter_violated_criteria = ["start"]
     niter = 0
-    fine_connections = []
-
-    stage2_density_discr = lpot_source.density_discr
 
+    stage2_density_discr = stage1_density_discr
     while iter_violated_criteria:
         iter_violated_criteria = []
         niter += 1
 
         if niter > maxiter:
-            warn_max_iterations()
+            _warn_max_iterations(
+                    violated_criteria, expansion_disturbance_tolerance)
             break
 
+        places = _make_temporary_collection(lpot_source,
+                stage1_density_discr=stage1_density_discr,
+                stage2_density_discr=stage2_density_discr)
+
         # Build tree and auxiliary data.
         # FIXME: The tree should not have to be rebuilt at each iteration.
-        tree = wrangler.build_tree(lpot_source, use_stage2_discr=True)
+        tree = wrangler.build_tree(places,
+                sources_list=[places.auto_source.geometry],
+                use_stage2_discr=True)
         peer_lists = wrangler.find_peer_lists(tree)
         refine_flags = make_empty_refine_flags(
-                wrangler.queue, lpot_source, use_stage2_discr=True)
+                wrangler.queue, stage2_density_discr)
 
-        has_insufficient_quad_res = \
+        has_insufficient_quad_resolution = \
                 wrangler.check_sufficient_source_quadrature_resolution(
-                        lpot_source, tree, peer_lists, refine_flags, debug)
-        if has_insufficient_quad_res:
+                        stage2_density_discr, tree, peer_lists, refine_flags,
+                        debug)
+        if has_insufficient_quad_resolution:
             iter_violated_criteria.append("insufficient quadrature resolution")
-            visualize_refinement(niter, 2, "quad-resolution", refine_flags)
+            _visualize_refinement(wrangler.queue, stage2_density_discr,
+                    niter, 2, "quad-resolution", refine_flags,
+                    visualize=visualize)
 
         if iter_violated_criteria:
             violated_criteria.append(" and ".join(iter_violated_criteria))
@@ -714,42 +740,223 @@ def refine_for_global_qbx(lpot_source, wrangler,
                     stage2_density_discr,
                     refiner, refine_flags, group_factory, debug)
             stage2_density_discr = conn.to_discr
-            fine_connections.append(conn)
-            lpot_source = lpot_source.copy(
-                    to_refined_connection=ChainedDiscretizationConnection(
-                        fine_connections))
+            connections.append(conn)
 
         del tree
         del refine_flags
         del peer_lists
 
-    for round in range(force_stage2_uniform_refinement_rounds):
+    for _ in range(force_stage2_uniform_refinement_rounds):
         conn = wrangler.refine(
                 stage2_density_discr,
                 refiner,
                 np.ones(stage2_density_discr.mesh.nelements, dtype=np.bool),
                 group_factory, debug)
         stage2_density_discr = conn.to_discr
-        fine_connections.append(conn)
-        lpot_source = lpot_source.copy(
-                to_refined_connection=ChainedDiscretizationConnection(
-                    fine_connections))
+        connections.append(conn)
+
+    conn = ChainedDiscretizationConnection(connections,
+            from_discr=stage1_density_discr)
+
+    return stage2_density_discr, conn
+
+
+def _refine_qbx_quad_stage2(lpot_source, stage2_density_discr):
+    from meshmode.discretization.connection import make_same_mesh_connection
+    discr = _make_quad_stage2_discr(lpot_source, stage2_density_discr)
+    conn = make_same_mesh_connection(discr, stage2_density_discr)
+
+    return discr, conn
+
+# }}}
+
+
+# {{{ _refine_for_global_qbx
+
+def _refine_for_global_qbx(places, dofdesc, wrangler,
+        group_factory=None,
+        kernel_length_scale=None,
+        force_stage2_uniform_refinement_rounds=None,
+        scaled_max_curvature_threshold=None,
+        expansion_disturbance_tolerance=None,
+        maxiter=None, debug=None, visualize=False,
+        _copy_collection=False):
+    """Entry point for calling the refiner. Once the refinement is complete,
+    the refined discretizations can be obtained from *places* by calling
+    :meth:`~pytential.symbolic.execution.GeometryCollection.get_discretization`.
+
+    :returns: a new version of the :class:`pytential.GeometryCollection`
+        *places* with the refined discretizations and connections in its cache.
+        Depending on *_copy_collection*, *places* is updated in-place
+        or copied.
+    """
+
+    from pytential import sym
+    dofdesc = sym.as_dofdesc(dofdesc)
+
+    from pytential.qbx import QBXLayerPotentialSource
+    lpot_source = places.get_geometry(dofdesc.geometry)
+    if not isinstance(lpot_source, QBXLayerPotentialSource):
+        raise ValueError("`%s` is not a `QBXLayerPotentialSource`" % (
+            dofdesc.geometry))
+    # {{{ default arguments
+
+    if maxiter is None:
+        maxiter = 10
+
+    if debug is None:
+        # Fall back to the debug setting of the layer potential source.
+        debug = lpot_source.debug
+
+    if expansion_disturbance_tolerance is None:
+        expansion_disturbance_tolerance = 0.025
+
+    if force_stage2_uniform_refinement_rounds is None:
+        force_stage2_uniform_refinement_rounds = 0
+
+    if group_factory is None:
+        from meshmode.discretization.poly_element import \
+                InterpolatoryQuadratureSimplexGroupFactory
+        group_factory = InterpolatoryQuadratureSimplexGroupFactory(
+                lpot_source.density_discr.groups[0].order)
 
     # }}}
 
-    lpot_source = lpot_source.copy(debug=debug, _refined_for_global_qbx=True)
+    # {{{ stage lookup and cache access
+
+    # FIXME: would be nice if this was an IntFlag or something ordered
+    stage_index_map = {
+            sym.QBX_SOURCE_STAGE1: 1,
+            sym.QBX_SOURCE_STAGE2: 2,
+            sym.QBX_SOURCE_QUAD_STAGE2: 3
+            }
+    if dofdesc.discr_stage not in stage_index_map:
+        raise ValueError("unknown discr stage: %s" % dofdesc.discr_stage)
+    stage_index = stage_index_map[dofdesc.discr_stage]
+    geometry = dofdesc.geometry
+
+    def add_to_cache(refine_discr, refine_conn, from_ds, to_ds):
+        places._add_discr_to_cache(refine_discr, geometry, to_ds)
+        places._add_conn_to_cache(refine_conn, geometry, from_ds, to_ds)
 
-    if len(connections) == 0:
-        # FIXME: This is inefficient
-        connection = make_same_mesh_connection(
-                lpot_source.density_discr,
-                lpot_source.density_discr)
-    else:
-        connection = ChainedDiscretizationConnection(connections)
+    def get_from_cache(from_ds, to_ds):
+        discr = places._get_discr_from_cache(geometry, to_ds)
+        conn = places._get_conn_from_cache(geometry, from_ds, to_ds)
+        return discr, conn
+
+    if _copy_collection:
+        places = places.copy()
+
+    # }}}
 
-    return lpot_source, connection
+    # {{{ refine up to the requested stage
+
+    discr = lpot_source.density_discr
+    if stage_index >= 1:
+        ds = (None, sym.QBX_SOURCE_STAGE1)
+        try:
+            discr, conn = get_from_cache(*ds)
+        except KeyError:
+            discr, conn = _refine_qbx_stage1(
+                    lpot_source, discr, wrangler, group_factory,
+                    kernel_length_scale=kernel_length_scale,
+                    scaled_max_curvature_threshold=(
+                        scaled_max_curvature_threshold),
+                    expansion_disturbance_tolerance=(
+                        expansion_disturbance_tolerance),
+                    maxiter=maxiter, debug=debug, visualize=visualize)
+            add_to_cache(discr, conn, *ds)
+
+    if stage_index >= 2:
+        ds = (sym.QBX_SOURCE_STAGE1, sym.QBX_SOURCE_STAGE2)
+        try:
+            discr, conn = get_from_cache(*ds)
+        except KeyError:
+            discr, conn = _refine_qbx_stage2(
+                    lpot_source, discr, wrangler, group_factory,
+                    expansion_disturbance_tolerance=(
+                        expansion_disturbance_tolerance),
+                    force_stage2_uniform_refinement_rounds=(
+                        force_stage2_uniform_refinement_rounds),
+                    maxiter=maxiter, debug=debug, visualize=visualize)
+            add_to_cache(discr, conn, *ds)
+
+    if stage_index >= 3:
+        ds = (sym.QBX_SOURCE_STAGE2, sym.QBX_SOURCE_QUAD_STAGE2)
+        try:
+            discr, conn = get_from_cache(*ds)
+        except KeyError:
+            discr, conn = _refine_qbx_quad_stage2(lpot_source, discr)
+            add_to_cache(discr, conn, *ds)
+
+    # }}}
+
+    return places
 
 # }}}
 
 
+# {{{ refine_geometry_collection
+
+def refine_geometry_collection(queue, places,
+        group_factory=None,
+        refine_discr_stage=None,
+        kernel_length_scale=None,
+        force_stage2_uniform_refinement_rounds=None,
+        scaled_max_curvature_threshold=None,
+        expansion_disturbance_tolerance=None,
+        maxiter=None,
+        debug=None, visualize=False):
+    """Entry point for refining all the
+    :class:`~pytential.qbx.QBXLayerPotentialSource` instances in *places*.
+    The :class:`~pytential.symbolic.execution.GeometryCollection` performs
+    on-demand refinement, but this function can be used to tweak the
+    parameters. Operates on, and returns, a copy of *places*.
+
+    :arg places: A :class:`~pytential.symbolic.execution.GeometryCollection`.
+    :arg refine_discr_stage: Defines up to which stage the refinement should
+        be performed. One of
+        :class:`~pytential.symbolic.primitives.QBX_SOURCE_STAGE1`,
+        :class:`~pytential.symbolic.primitives.QBX_SOURCE_STAGE2` or
+        :class:`~pytential.symbolic.primitives.QBX_SOURCE_QUAD_STAGE2`.
+    :arg group_factory: An instance of
+        :class:`meshmode.mesh.discretization.ElementGroupFactory`. Used for
+        discretizing the coarse refined mesh.
+
+    :arg kernel_length_scale: The kernel length scale, or *None* if not
+        applicable. All panels are refined to below this size.
+    :arg maxiter: The maximum number of refiner iterations.
+    """
+
+    from pytential import sym
+    if refine_discr_stage is None:
+        if force_stage2_uniform_refinement_rounds is not None:
+            refine_discr_stage = sym.QBX_SOURCE_STAGE2
+        else:
+            refine_discr_stage = sym.QBX_SOURCE_STAGE1
+
+    from pytential.qbx import QBXLayerPotentialSource
+    places = places.copy()
+    for geometry in places.places:
+        dofdesc = sym.as_dofdesc(geometry).copy(
+                discr_stage=refine_discr_stage)
+        lpot_source = places.get_geometry(dofdesc.geometry)
+        if not isinstance(lpot_source, QBXLayerPotentialSource):
+            continue
+
+        _refine_for_global_qbx(places, dofdesc,
+                lpot_source.refiner_code_container.get_wrangler(queue),
+                group_factory=group_factory,
+                kernel_length_scale=kernel_length_scale,
+                scaled_max_curvature_threshold=scaled_max_curvature_threshold,
+                expansion_disturbance_tolerance=expansion_disturbance_tolerance,
+                force_stage2_uniform_refinement_rounds=(
+                    force_stage2_uniform_refinement_rounds),
+                maxiter=maxiter, debug=debug, visualize=visualize,
+                _copy_collection=False)
+
+    return places
+
+# }}}
+
 # vim: foldmethod=marker:filetype=pyopencl
diff --git a/pytential/qbx/target_assoc.py b/pytential/qbx/target_assoc.py
index 27658c8a7043d27e2a9f520da9ca1cec4c2214fe..39b226ab491e72adfe1dba8ec714d73f090779f6 100644
--- a/pytential/qbx/target_assoc.py
+++ b/pytential/qbx/target_assoc.py
@@ -496,8 +496,12 @@ class TargetAssociationCodeContainer(TreeCodeContainerMixin):
 class TargetAssociationWrangler(TreeWranglerBase):
 
     @log_process(logger)
-    def mark_targets(self, tree, peer_lists, lpot_source, target_status,
-                     debug, wait_for=None):
+    def mark_targets(self, places, dofdesc,
+            tree, peer_lists, target_status,
+            debug, wait_for=None):
+        from pytential import bind, sym
+        ambient_dim = places.ambient_dim
+
         # Round up level count--this gets included in the kernel as
         # a stack bound. Rounding avoids too many kernel versions.
         from pytools import div_ceil
@@ -514,12 +518,12 @@ class TargetAssociationWrangler(TreeWranglerBase):
         found_target_close_to_panel.finish()
 
         # Perform a space invader query over the sources.
-        from pytential import bind, sym
         source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
         sources = [
                 axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
-        tunnel_radius_by_source = bind(lpot_source,
-                sym._close_target_tunnel_radii(lpot_source.ambient_dim))(self.queue)
+        tunnel_radius_by_source = bind(places,
+                sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
+                        self.queue)
 
         # Target-marking algorithm (TGTMARK):
         #
@@ -555,9 +559,6 @@ class TargetAssociationWrangler(TreeWranglerBase):
                 wait_for=wait_for)
         wait_for = [evt]
 
-        tunnel_radius_by_source = bind(lpot_source,
-            sym._close_target_tunnel_radii(lpot_source.ambient_dim))(self.queue)
-
         evt = knl(
             *unwrap_args(
                 tree, peer_lists,
@@ -587,9 +588,13 @@ class TargetAssociationWrangler(TreeWranglerBase):
         return (found_target_close_to_panel == 1).all().get()
 
     @log_process(logger)
-    def find_centers(self, tree, peer_lists, lpot_source,
-                     target_status, target_flags, target_assoc,
-                     target_association_tolerance, debug, wait_for=None):
+    def find_centers(self, places, dofdesc,
+            tree, peer_lists, target_status, target_flags, target_assoc,
+            target_association_tolerance,
+            debug, wait_for=None):
+        from pytential import bind, sym
+        ambient_dim = places.ambient_dim
+
         # Round up level count--this gets included in the kernel as
         # a stack bound. Rounding avoids too many kernel versions.
         from pytools import div_ceil
@@ -607,15 +612,15 @@ class TargetAssociationWrangler(TreeWranglerBase):
             marked_target_count = int(cl.array.sum(target_status).get())
 
         # Perform a space invader query over the centers.
-        from pytential import bind, sym
         center_slice = (
                 tree.sorted_target_ids[tree.qbx_user_center_slice]
                 .with_queue(self.queue))
         centers = [
                 axis.with_queue(self.queue)[center_slice] for axis in tree.sources]
-        expansion_radii_by_center = bind(lpot_source, sym.expansion_radii(
-            lpot_source.ambient_dim,
-            granularity=sym.GRANULARITY_CENTER))(self.queue)
+        expansion_radii_by_center = bind(places, sym.expansion_radii(
+            ambient_dim,
+            granularity=sym.GRANULARITY_CENTER,
+            dofdesc=dofdesc))(self.queue)
         expansion_radii_by_center_with_tolerance = \
                 expansion_radii_by_center * (1 + target_association_tolerance)
 
@@ -686,9 +691,12 @@ class TargetAssociationWrangler(TreeWranglerBase):
         cl.wait_for_events([evt])
 
     @log_process(logger)
-    def mark_panels_for_refinement(self, tree, peer_lists, lpot_source,
-                                   target_status, refine_flags, debug,
-                                   wait_for=None):
+    def mark_panels_for_refinement(self, places, dofdesc,
+            tree, peer_lists, target_status, refine_flags,
+            debug, wait_for=None):
+        from pytential import bind, sym
+        ambient_dim = places.ambient_dim
+
         # Round up level count--this gets included in the kernel as
         # a stack bound. Rounding avoids too many kernel versions.
         from pytools import div_ceil
@@ -705,12 +713,12 @@ class TargetAssociationWrangler(TreeWranglerBase):
         found_panel_to_refine.finish()
 
         # Perform a space invader query over the sources.
-        from pytential import bind, sym
         source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
         sources = [
                 axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
-        tunnel_radius_by_source = bind(lpot_source,
-                sym._close_target_tunnel_radii(lpot_source.ambient_dim))(self.queue)
+        tunnel_radius_by_source = bind(places,
+                sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
+                        self.queue)
 
         # See (TGTMARK) above for algorithm.
 
@@ -723,8 +731,9 @@ class TargetAssociationWrangler(TreeWranglerBase):
                 wait_for=wait_for)
         wait_for = [evt]
 
-        tunnel_radius_by_source = bind(lpot_source,
-                sym._close_target_tunnel_radii(lpot_source.ambient_dim))(self.queue)
+        tunnel_radius_by_source = bind(places,
+                sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
+                        self.queue)
 
         evt = knl(
             *unwrap_args(
@@ -781,26 +790,23 @@ class TargetAssociationWrangler(TreeWranglerBase):
         return QBXTargetAssociation(target_to_center=target_to_center)
 
 
-def associate_targets_to_qbx_centers(lpot_source, wrangler,
+def associate_targets_to_qbx_centers(places, geometry, wrangler,
         target_discrs_and_qbx_sides, target_association_tolerance,
         debug=True, wait_for=None):
     """
     Associate targets to centers in a layer potential source.
 
-    :arg lpot_source: An instance of :class:`QBXLayerPotentialSource`
-
+    :arg places: A :class:`~pytential.symbolic.execution.GeometryCollection`.
+    :arg geometry: Name of the source geometry in *places* for which to
+        associate targets.
     :arg wrangler: An instance of :class:`TargetAssociationWrangler`
-
     :arg target_discrs_and_qbx_sides:
-
-        a list of tuples ``(discr, sides)``, where
-        *discr* is a
+        a list of tuples ``(discr, sides)``, where *discr* is a
         :class:`pytential.discretization.Discretization`
         or a
         :class:`pytential.discretization.target.TargetBase` instance, and
-        *sides* is either a :class:`int` or
-        an array of (:class:`numpy.int8`) side requests for each
-        target.
+        *sides* is either a :class:`int` or an array of (:class:`numpy.int8`)
+        side requests for each target.
 
         The side request can take on the values in :ref:`qbx-side-request-table`.
 
@@ -811,16 +817,21 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler,
     :returns: A :class:`QBXTargetAssociation`.
     """
 
-    tree = wrangler.build_tree(lpot_source,
-            [discr for discr, _ in target_discrs_and_qbx_sides])
+    from pytential import sym
+    dofdesc = sym.as_dofdesc(geometry).to_stage1()
+
+    tree = wrangler.build_tree(places,
+            sources_list=[dofdesc.geometry],
+            targets_list=[discr for discr, _ in target_discrs_and_qbx_sides])
 
     peer_lists = wrangler.find_peer_lists(tree)
 
     target_status = cl.array.zeros(wrangler.queue, tree.nqbxtargets, dtype=np.int32)
     target_status.finish()
 
-    have_close_targets = wrangler.mark_targets(tree, peer_lists,
-           lpot_source, target_status, debug)
+    have_close_targets = wrangler.mark_targets(places, dofdesc,
+            tree, peer_lists, target_status,
+            debug)
 
     target_assoc = wrangler.make_default_target_association(tree.nqbxtargets)
 
@@ -829,8 +840,10 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler,
 
     target_flags = wrangler.make_target_flags(target_discrs_and_qbx_sides)
 
-    wrangler.find_centers(tree, peer_lists, lpot_source, target_status,
-            target_flags, target_assoc, target_association_tolerance, debug)
+    wrangler.find_centers(places, dofdesc,
+            tree, peer_lists, target_status,
+            target_flags, target_assoc, target_association_tolerance,
+            debug)
 
     center_not_found = (
         target_status == target_status_enum.MARKED_QBX_CENTER_PENDING)
@@ -860,7 +873,9 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler,
         refine_flags = cl.array.zeros(
                 wrangler.queue, tree.nqbxpanels, dtype=np.int32)
         have_panel_to_refine = wrangler.mark_panels_for_refinement(
-                tree, peer_lists, lpot_source, target_status, refine_flags, debug)
+                places, dofdesc,
+                tree, peer_lists, target_status, refine_flags,
+                debug)
 
         assert have_panel_to_refine
         raise QBXTargetAssociationFailedException(
diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py
index f5a74dc1605a38ae51415e07a41b6cc60e491c7f..b872152a2002ddec5678073e15255c25842db4ed 100644
--- a/pytential/qbx/utils.py
+++ b/pytential/qbx/utils.py
@@ -68,45 +68,6 @@ QBX_TREE_MAKO_DEFS = r"""//CL:mako//
 # }}}
 
 
-# {{{ make interleaved centers
-
-def get_interleaved_centers(queue, lpot_source):
-    """
-    Return an array of shape (dim, ncenters) in which interior centers are placed
-    next to corresponding exterior centers.
-    """
-    from pytential import bind, sym
-    int_centers = bind(lpot_source,
-            sym.expansion_centers(lpot_source.ambient_dim, -1))(queue)
-    ext_centers = bind(lpot_source,
-            sym.expansion_centers(lpot_source.ambient_dim, +1))(queue)
-
-    from pytential.symbolic.dof_connection import CenterGranularityConnection
-    interleaver = CenterGranularityConnection(lpot_source.density_discr)
-    return interleaver(queue, [int_centers, ext_centers])
-
-# }}}
-
-
-# {{{ make interleaved radii
-
-def get_interleaved_radii(queue, lpot_source):
-    """
-    Return an array of shape (dim, ncenters) in which interior centers are placed
-    next to corresponding exterior centers.
-    """
-    from pytential import bind, sym
-
-    radii = bind(lpot_source,
-            sym.expansion_radii(lpot_source.ambient_dim))(queue)
-
-    from pytential.symbolic.dof_connection import CenterGranularityConnection
-    interleaver = CenterGranularityConnection(lpot_source.density_discr)
-    return interleaver(queue, radii)
-
-# }}}
-
-
 # {{{ tree code container
 
 class TreeCodeContainer(object):
@@ -159,13 +120,15 @@ class TreeWranglerBase(object):
         self.code_container = code_container
         self.queue = queue
 
-    def build_tree(self, lpot_source, targets_list=(),
+    def build_tree(self, places, targets_list=(), sources_list=(),
                    use_stage2_discr=False):
         tb = self.code_container.build_tree()
         plfilt = self.code_container.particle_list_filter()
-        from pytential.qbx.utils import build_tree_with_qbx_metadata
+
         return build_tree_with_qbx_metadata(
-                self.queue, tb, plfilt, lpot_source, targets_list=targets_list,
+                self.queue, places, tb, plfilt,
+                sources_list=sources_list,
+                targets_list=targets_list,
                 use_stage2_discr=use_stage2_discr)
 
     def find_peer_lists(self, tree):
@@ -263,41 +226,61 @@ MAX_REFINE_WEIGHT = 64
 
 
 @log_process(logger)
-def build_tree_with_qbx_metadata(
-        queue, tree_builder, particle_list_filter, lpot_source, targets_list=(),
+def build_tree_with_qbx_metadata(queue, places,
+        tree_builder, particle_list_filter,
+        sources_list=(), targets_list=(),
         use_stage2_discr=False):
     """Return a :class:`TreeWithQBXMetadata` built from the given layer
     potential source. This contains particles of four different types:
 
        * source particles either from
-         ``lpot_source.density_discr`` or
-         ``lpot_source.stage2_density_discr``
-       * centers from ``lpot_source.centers()``
+         :class:`~pytential.symbolic.primitives.QBX_SOURCE_STAGE1` or
+         :class:`~pytential.symbolic.primitives.QBX_SOURCE_QUAD_STAGE2`.
+       * centers from
+         :class:`~pytential.symbolic.primitives.QBX_SOURCE_STAGE1`.
        * targets from ``targets_list``.
 
     :arg queue: An instance of :class:`pyopencl.CommandQueue`
-
-    :arg lpot_source: An instance of
-        :class:`pytential.qbx.NewQBXLayerPotentialSource`.
-
+    :arg places: An instance of
+        :class:`~pytential.symbolic.execution.GeometryCollection`.
     :arg targets_list: A list of :class:`pytential.target.TargetBase`
 
-    :arg use_stage2_discr: If *True*, builds a tree with sources
-        from ``lpot_source.stage2_density_discr``. If *False* (default),
-        they are from ``lpot_source.density_discr``.
+    :arg use_stage2_discr: If *True*, builds a tree with stage 2 sources.
+        If *False*, the tree is built with stage 1 sources.
     """
+
     # The ordering of particles is as follows:
     # - sources go first
     # - then centers
     # - then targets
 
-    if use_stage2_discr:
-        density_discr = lpot_source.quad_stage2_density_discr
-    else:
-        density_discr = lpot_source.density_discr
+    from pytential import bind, sym
+    stage1_density_discrs = []
+    density_discrs = []
+    for source_name in sources_list:
+        dd = sym.as_dofdesc(source_name)
+
+        discr = places.get_discretization(dd.geometry)
+        stage1_density_discrs.append(discr)
+
+        if use_stage2_discr:
+            discr = places.get_discretization(
+                    dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
+        density_discrs.append(discr)
+
+    # TODO: update code to work for multiple source discretizations
+    if len(sources_list) != 1:
+        raise RuntimeError("can only build a tree for a single source")
+
+    def _make_centers(discr):
+        return bind(discr, sym.interleaved_expansion_centers(
+            discr.ambient_dim))(queue)
+
+    stage1_density_discr = stage1_density_discrs[0]
+    density_discr = density_discrs[0]
 
     sources = density_discr.nodes()
-    centers = get_interleaved_centers(queue, lpot_source)
+    centers = _make_centers(stage1_density_discr)
     targets = (tgt.nodes() for tgt in targets_list)
 
     particles = tuple(
diff --git a/pytential/source.py b/pytential/source.py
index 25e9d1385f5282d41c332e8bcf93290e5785e90a..7ed794abfb2d1fa1ff32d21117e13ccfa3069a8c 100644
--- a/pytential/source.py
+++ b/pytential/source.py
@@ -150,7 +150,8 @@ class PointPotentialSource(PotentialSource):
         # FIXME: Do this all at once
         result = []
         for o in insn.outputs:
-            target_discr = bound_expr.get_discretization(o.target_name)
+            target_discr = bound_expr.places.get_discretization(
+                    o.target_name.geometry, o.target_name.discr_stage)
 
             # no on-disk kernel caching
             if p2p is None:
@@ -165,14 +166,6 @@ class PointPotentialSource(PotentialSource):
         timing_data = {}
         return result, timing_data
 
-    @memoize_method
-    def weights_and_area_elements(self):
-        with cl.CommandQueue(self.cl_context) as queue:
-            result = cl.array.empty(queue, self.nnodes, dtype=self.real_dtype)
-            result.fill(1)
-
-        return result.with_queue(None)
-
 # }}}
 
 
@@ -182,15 +175,9 @@ class LayerPotentialSourceBase(PotentialSource):
     """A discretization of a layer potential using panel-based geometry, with
     support for refinement and upsampling.
 
-    .. rubric:: Discretizations
-
-    .. attribute:: density_discr
-    .. attribute:: stage2_density_discr
-    .. attribute:: quad_stage2_density_discr
-    .. attribute:: resampler
-
     .. rubric:: Discretization data
 
+    .. attribute:: density_discr
     .. attribute:: cl_context
     .. attribute:: ambient_dim
     .. attribute:: dim
@@ -199,23 +186,13 @@ class LayerPotentialSourceBase(PotentialSource):
 
     .. rubric:: Execution
 
+    .. automethod:: cost_model_compute_potential_insn
+    .. automethod:: exec_compute_potential_insn
     """
 
     def __init__(self, density_discr):
         self.density_discr = density_discr
 
-    @property
-    def stage2_density_discr(self):
-        raise NotImplementedError
-
-    @property
-    def quad_stage2_density_discr(self):
-        raise NotImplementedError
-
-    @property
-    def resampler(self):
-        raise NotImplementedError
-
     @property
     def ambient_dim(self):
         return self.density_discr.ambient_dim
diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py
index 86b10d24ad0e163506fc2e2b4dd004a6b7bf7c73..3fcfbf2ec6eb4c42cb2a6d1af24bdb0bb1eca297 100644
--- a/pytential/symbolic/compiler.py
+++ b/pytential/symbolic/compiler.py
@@ -438,7 +438,7 @@ class OperatorCompiler(IdentityMapper):
 
     def op_group_features(self, expr):
         from pytential.symbolic.primitives import hashable_kernel_args
-        lpot_source = self.places.get_geometry(expr.source)
+        lpot_source = self.places.get_geometry(expr.source.geometry)
         return (
                 lpot_source.op_group_features(expr)
                 + hashable_kernel_args(expr.kernel_arguments))
@@ -516,6 +516,11 @@ class OperatorCompiler(IdentityMapper):
         self.assigned_names.add(name)
         return name
 
+    def make_assign(self, name, expr, priority):
+        return Assign(names=[name], exprs=[expr],
+                dep_mapper_factory=self.dep_mapper_factory,
+                priority=priority)
+
     def assign_to_new_var(self, expr, priority=0, prefix=None):
         from pymbolic.primitives import Variable, Subscript
 
@@ -535,9 +540,12 @@ class OperatorCompiler(IdentityMapper):
     # {{{ map_xxx routines
 
     def map_common_subexpression(self, expr):
-        if expr.scope != cse_scope.EXPRESSION:
-            from warnings import warn
-            warn("mishandling CSE scope")
+        # NOTE: EXPRESSION and DISCRETIZATION scopes are handled in
+        # execution.py::EvaluationMapperBase so that they can be cached
+        # with a longer lifetime
+        if expr.scope != cse_scope.EVALUATION:
+            return expr
+
         try:
             return self.expr_to_var[expr.child]
         except KeyError:
@@ -560,11 +568,6 @@ class OperatorCompiler(IdentityMapper):
             self.expr_to_var[expr.child] = cse_var
             return cse_var
 
-    def make_assign(self, name, expr, priority):
-        return Assign(names=[name], exprs=[expr],
-                dep_mapper_factory=self.dep_mapper_factory,
-                priority=priority)
-
     def map_int_g(self, expr, name_hint=None):
         try:
             return self.expr_to_var[expr]
diff --git a/pytential/symbolic/dof_connection.py b/pytential/symbolic/dof_connection.py
index 8cc5d1e4e8202dee9a20fed4013f4f106fd75a60..df97dc9b887b026b52cce5a7100d9f378abf7fb9 100644
--- a/pytential/symbolic/dof_connection.py
+++ b/pytential/symbolic/dof_connection.py
@@ -26,6 +26,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
 import pyopencl as cl
 import pyopencl.array # noqa
 from pytools import memoize
@@ -214,10 +215,13 @@ def connection_from_dds(places, from_dd, to_dd):
     from_dd = sym.as_dofdesc(from_dd)
     to_dd = sym.as_dofdesc(to_dd)
 
-    from pytential.symbolic.execution import GeometryCollection
+    from pytential import GeometryCollection
     if not isinstance(places, GeometryCollection):
         places = GeometryCollection(places)
-    from_discr = places.get_geometry(from_dd)
+
+    lpot = places.get_geometry(from_dd.geometry)
+    from_discr = places.get_discretization(from_dd.geometry, from_dd.discr_stage)
+    to_discr = places.get_discretization(to_dd.geometry, to_dd.discr_stage)
 
     if from_dd.geometry != to_dd.geometry:
         raise ValueError("cannot interpolate between different geometries")
@@ -228,27 +232,36 @@ def connection_from_dds(places, from_dd, to_dd):
     connections = []
     if from_dd.discr_stage is not to_dd.discr_stage:
         from pytential.qbx import QBXLayerPotentialSource
-        if not isinstance(from_discr, QBXLayerPotentialSource):
+        if not isinstance(lpot, QBXLayerPotentialSource):
             raise ValueError("can only interpolate on a "
                     "`QBXLayerPotentialSource`")
 
-        if to_dd.discr_stage is not sym.QBX_SOURCE_QUAD_STAGE2:
-            # TODO: can probably extend this to project from a QUAD_STAGE2
-            # using L2ProjectionInverseDiscretizationConnection
-            raise ValueError("can only interpolate to "
-                "`QBX_SOURCE_QUAD_STAGE2`")
-
-        if from_dd.discr_stage is sym.QBX_SOURCE_QUAD_STAGE2:
-            pass
-        elif from_dd.discr_stage is sym.QBX_SOURCE_STAGE2:
-            connections.append(
-                    from_discr.refined_interp_to_ovsmp_quad_connection)
-        else:
-            connections.append(from_discr.resampler)
+        # if to_dd.discr_stage is not sym.QBX_SOURCE_QUAD_STAGE2:
+        #     # TODO: can probably extend this to project from a QUAD_STAGE2
+        #     # using L2ProjectionInverseDiscretizationConnection
+        #     raise ValueError("can only interpolate to "
+        #         "`QBX_SOURCE_QUAD_STAGE2`")
+
+        # FIXME: would be nice if these were ordered by themselves
+        stage_name_to_index_map = {
+                None: 0,
+                sym.QBX_SOURCE_STAGE1: 1,
+                sym.QBX_SOURCE_STAGE2: 2,
+                sym.QBX_SOURCE_QUAD_STAGE2: 3
+                }
+        stage_index_to_name_map = dict([(i, name) for name, i in
+                    six.iteritems(stage_name_to_index_map)])
+
+        from_stage = stage_name_to_index_map[from_dd.discr_stage]
+        to_stage = stage_name_to_index_map[to_dd.discr_stage]
+
+        for istage in range(from_stage, to_stage):
+            conn = places._get_conn_from_cache(from_dd.geometry,
+                    stage_index_to_name_map[istage],
+                    stage_index_to_name_map[istage + 1])
+            connections.append(conn)
 
     if from_dd.granularity is not to_dd.granularity:
-        to_discr = places.get_discretization(to_dd)
-
         if to_dd.granularity is sym.GRANULARITY_NODE:
             pass
         elif to_dd.granularity is sym.GRANULARITY_CENTER:
@@ -259,7 +272,15 @@ def connection_from_dds(places, from_dd, to_dd):
         else:
             raise ValueError("invalid to_dd granularity: %s" % to_dd.granularity)
 
-    return DOFConnection(connections, from_dd=from_dd, to_dd=to_dd)
+    if from_dd.granularity is not to_dd.granularity:
+        conn = DOFConnection(connections, from_dd=from_dd, to_dd=to_dd)
+    else:
+        from meshmode.discretization.connection import \
+                ChainedDiscretizationConnection
+        conn = ChainedDiscretizationConnection(connections,
+                from_discr=from_discr)
+
+    return conn
 
 # }}}
 
diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py
index da854e74da7c9a1ee73f786f1e533d4427243cf5..31e50e7e17c88eb015eb6866db032e787dffb296 100644
--- a/pytential/symbolic/execution.py
+++ b/pytential/symbolic/execution.py
@@ -41,6 +41,9 @@ from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pytools import memoize_in
 from pytential import sym
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 __doc__ = """
 .. autoclass :: BoundExpression
@@ -49,7 +52,7 @@ __doc__ = """
 
 # FIXME caches: fix up queues
 
-# {{{ evaluation mapper
+# {{{ evaluation mapper base (shared between actual eval and cost model)
 
 def mesh_el_view(mesh, group_nr, global_array):
     """Return a view of *global_array* of shape
@@ -92,13 +95,13 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
     def map_max(self, expr):
         return self._map_minmax(
                 cl.array.maximum,
-                super(EvaluationMapper, self).map_max,
+                super(EvaluationMapperBase, self).map_max,
                 expr)
 
     def map_min(self, expr):
         return self._map_minmax(
                 cl.array.minimum,
-                super(EvaluationMapper, self).map_min,
+                super(EvaluationMapperBase, self).map_min,
                 expr)
 
     def map_node_sum(self, expr):
@@ -150,7 +153,8 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
 
             return result
 
-        discr = self.bound_expr.get_discretization(expr.dofdesc)
+        discr = self.places.get_discretization(
+                expr.dofdesc.geometry, expr.dofdesc.discr_stage)
         operand = self.rec(expr.operand)
         assert operand.shape == (discr.nnodes,)
 
@@ -176,8 +180,8 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
         return self._map_elementwise_reduction("max", expr)
 
     def map_ones(self, expr):
-        discr = self.bound_expr.get_discretization(expr.dofdesc)
-
+        discr = self.places.get_discretization(
+                expr.dofdesc.geometry, expr.dofdesc.discr_stage)
         result = (discr
                 .empty(queue=self.queue, dtype=discr.real_dtype)
                 .with_queue(self.queue))
@@ -186,12 +190,14 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
         return result
 
     def map_node_coordinate_component(self, expr):
-        discr = self.bound_expr.get_discretization(expr.dofdesc)
+        discr = self.places.get_discretization(
+                expr.dofdesc.geometry, expr.dofdesc.discr_stage)
         return discr.nodes()[expr.ambient_axis] \
                 .with_queue(self.queue)
 
     def map_num_reference_derivative(self, expr):
-        discr = self.bound_expr.get_discretization(expr.dofdesc)
+        discr = self.places.get_discretization(
+                expr.dofdesc.geometry, expr.dofdesc.discr_stage)
 
         from pytools import flatten
         ref_axes = flatten([axis] * mult for axis, mult in expr.ref_axes)
@@ -201,19 +207,20 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
                         .with_queue(self.queue)
 
     def map_q_weight(self, expr):
-        discr = self.bound_expr.get_discretization(expr.dofdesc)
+        discr = self.places.get_discretization(
+                expr.dofdesc.geometry, expr.dofdesc.discr_stage)
         return discr.quad_weights(self.queue) \
                 .with_queue(self.queue)
 
     def map_inverse(self, expr):
-        bound_op_cache = self.bound_expr.places.get_cache("bound_op")
+        bound_op_cache = self.bound_expr.places._get_cache("bound_op")
 
         try:
             bound_op = bound_op_cache[expr]
         except KeyError:
             bound_op = bind(
                     expr.expression,
-                    self.places.get_geometry(expr.dofdesc),
+                    self.places.get_geometry(expr.dofdesc.geometry),
                     self.bound_expr.iprec)
             bound_op_cache[expr] = bound_op
 
@@ -229,17 +236,30 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
     def map_interpolation(self, expr):
         operand = self.rec(expr.operand)
 
-        if isinstance(operand, cl.array.Array):
-            from pytential.symbolic.dof_connection import connection_from_dds
-
-            conn = connection_from_dds(self.places,
-                    expr.from_dd, expr.to_dd)
-            return conn(self.queue, operand).with_queue(self.queue)
+        if isinstance(operand, (cl.array.Array, list)):
+            conn = self.places.get_connection(expr.from_dd, expr.to_dd)
+            return conn(self.queue, operand)
         elif isinstance(operand, (int, float, complex, np.number)):
             return operand
         else:
             raise TypeError("cannot interpolate `{}`".format(type(operand)))
 
+    def map_common_subexpression(self, expr):
+        if expr.scope == sym.cse_scope.EXPRESSION:
+            cache = self.bound_expr._get_cache("cse")
+        elif expr.scope == sym.cse_scope.DISCRETIZATION:
+            cache = self.places._get_cache("cse")
+        else:
+            return self.rec(expr.child)
+
+        try:
+            rec = cache[expr.child]
+        except KeyError:
+            rec = self.rec(expr.child)
+            cache[expr.child] = rec
+
+        return rec
+
     # }}}
 
     def exec_assign(self, queue, insn, bound_expr, evaluate):
@@ -290,7 +310,7 @@ class EvaluationMapperBase(PymbolicEvaluationMapper):
                         *args, queue=self.queue)
 
         else:
-            return EvaluationMapperBase.map_call(self, expr)
+            return super(EvaluationMapperBase, self).map_call(expr)
 
 # }}}
 
@@ -305,7 +325,7 @@ class EvaluationMapper(EvaluationMapperBase):
         self.timing_data = timing_data
 
     def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
-        source = bound_expr.places.get_geometry(insn.source)
+        source = bound_expr.places.get_geometry(insn.source.geometry)
 
         return_timing_data = self.timing_data is not None
 
@@ -324,7 +344,7 @@ class EvaluationMapper(EvaluationMapperBase):
 # }}}
 
 
-# {{{ cost model mapper
+# {{{ cost model evaluation mapper
 
 class CostModelMapper(EvaluationMapperBase):
     """Mapper for evaluating cost models.
@@ -348,7 +368,7 @@ class CostModelMapper(EvaluationMapperBase):
         self.modeled_cost = {}
 
     def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
-        source = bound_expr.places.get_geometry(insn.source)
+        source = bound_expr.places.get_geometry(insn.source.geometry)
 
         result, cost_model_result = (
                 source.cost_model_compute_potential_insn(
@@ -428,7 +448,7 @@ class MatVecOp:
 def _prepare_domains(nresults, places, domains, default_domain):
     """
     :arg nresults: number of results.
-    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`.
     :arg domains: recommended domains.
     :arg default_domain: default value for domains which are not provided.
 
@@ -451,9 +471,36 @@ def _prepare_domains(nresults, places, domains, default_domain):
     return domains
 
 
-def _prepare_expr(places, expr):
+def _prepare_auto_where(auto_where, places=None):
     """
-    :arg places: :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg auto_where: a 2-tuple, single identifier or `None` used as a hint
+        to determine the default geometries.
+    :arg places: a :class:`GeometryCollection`,
+        whose :attr:`GeometryCollection.auto_where` is used by default if
+        provided and `auto_where` is `None`.
+    :return: a tuple ``(source, target)`` of
+        :class:`~pytential.symbolic.primitives.DOFDescriptor` instances
+        denoting the default source and target geometries.
+    """
+
+    if auto_where is None:
+        if places is None:
+            auto_source = sym.DEFAULT_SOURCE
+            auto_target = sym.DEFAULT_TARGET
+        else:
+            auto_source, auto_target = places.auto_where
+    elif isinstance(auto_where, (list, tuple)):
+        auto_source, auto_target = auto_where
+    else:
+        auto_source = auto_where
+        auto_target = auto_source
+
+    return (sym.as_dofdesc(auto_source), sym.as_dofdesc(auto_target))
+
+
+def _prepare_expr(places, expr, auto_where=None):
+    """
+    :arg places: :class:`~pytential.symbolic.execution.GeometryCollection`.
     :arg expr: a symbolic expression.
     :return: processed symbolic expressions, tagged with the appropriate
         `where` identifier from places, etc.
@@ -462,19 +509,19 @@ def _prepare_expr(places, expr):
     from pytential.source import LayerPotentialSourceBase
     from pytential.symbolic.mappers import (
             ToTargetTagger,
-            DerivativeBinder,
-            InterpolationPreprocessor)
+            DerivativeBinder)
 
-    expr = ToTargetTagger(*places.auto_where)(expr)
+    auto_source, auto_target = _prepare_auto_where(auto_where, places=places)
+    expr = ToTargetTagger(auto_source, auto_target)(expr)
     expr = DerivativeBinder()(expr)
 
     for name, place in six.iteritems(places.places):
         if isinstance(place, LayerPotentialSourceBase):
             expr = place.preprocess_optemplate(name, places, expr)
 
-    # NOTE: only insert interpolation operators after the layer potential
-    # operators were preprocessed to avoid any confusion
+    from pytential.symbolic.mappers import InterpolationPreprocessor
     expr = InterpolationPreprocessor(places)(expr)
+
     return expr
 
 # }}}
@@ -482,6 +529,22 @@ def _prepare_expr(places, expr):
 
 # {{{ geometry collection
 
+def _is_valid_identifier(name):
+    if six.PY2:
+        # https://docs.python.org/2.7/reference/lexical_analysis.html#identifiers
+        import re
+        is_identifier = re.match(r"^[^\d\W]\w*\Z", name) is not None
+    else:
+        is_identifier = name.isidentifier()
+
+    import keyword
+    return is_identifier and not keyword.iskeyword(name)
+
+
+_GEOMETRY_COLLECTION_DISCR_CACHE_NAME = "refined_qbx_discrs"
+_GEOMETRY_COLLECTION_CONNS_CACHE_NAME = "refined_qbx_conns"
+
+
 class GeometryCollection(object):
     """A mapping from symbolic identifiers ("place IDs", typically strings)
     to 'geometries', where a geometry can be a
@@ -492,11 +555,18 @@ class GeometryCollection(object):
     of subsets of them, as well as related common subexpressions such as
     metric terms.
 
-    .. automethod:: get_discretization
     .. automethod:: get_geometry
+    .. automethod:: get_connection
+    .. automethod:: get_discretization
+
     .. automethod:: copy
+    .. automethod:: merge
 
-    .. method:: get_cache
+    Refinement of :class:`QBXLayerPotentialSource` entries is performed
+    on demand, or it may be performed by explicitly calling
+    :func:`pytential.qbx.refinement.refine_geometry_collection`,
+    which allows more customization of the refinement process through
+    parameters.
     """
 
     def __init__(self, places, auto_where=None):
@@ -505,7 +575,8 @@ class GeometryCollection(object):
             geometry objects. Supported objects are
             :class:`~pytential.source.PotentialSource`,
             :class:`~potential.target.TargetBase` and
-            :class:`~meshmode.discretization.Discretization`.
+            :class:`~meshmode.discretization.Discretization`. If this is
+            a mapping, the keys that are strings must be valid Python identifiers.
         :arg auto_where: location identifier for each geometry object, used
             to denote specific discretizations, e.g. in the case where
             *places* is a :class:`~pytential.source.LayerPotentialSourceBase`.
@@ -520,51 +591,71 @@ class GeometryCollection(object):
         from pytential.qbx import QBXLayerPotentialSource
         from meshmode.discretization import Discretization
 
-        # {{{ define default source and target descriptors
-
-        if isinstance(auto_where, (list, tuple)):
-            auto_source, auto_target = auto_where
-        else:
-            auto_source, auto_target = auto_where, None
-
-        if auto_source is None:
-            auto_source = sym.DEFAULT_SOURCE
-        if auto_target is None:
-            auto_target = sym.DEFAULT_TARGET
-
-        auto_source = sym.as_dofdesc(auto_source)
-        auto_target = sym.as_dofdesc(auto_target)
-        self.auto_where = (auto_source, auto_target)
-
-        # }}}
-
         # {{{ construct dict
 
         self.places = {}
+        self.caches = {}
+
+        auto_source, auto_target = _prepare_auto_where(auto_where)
         if isinstance(places, QBXLayerPotentialSource):
             self.places[auto_source.geometry] = places
-            self.places[auto_target.geometry] = \
-                    self._get_lpot_discretization(places, auto_target)
-        elif isinstance(places, (Discretization, PotentialSource)):
-            self.places[auto_source.geometry] = places
-            self.places[auto_target.geometry] = places
+            auto_target = auto_source
         elif isinstance(places, TargetBase):
             self.places[auto_target.geometry] = places
+            auto_source = auto_target
+        if isinstance(places, (Discretization, PotentialSource)):
+            self.places[auto_source.geometry] = places
+            self.places[auto_target.geometry] = places
         elif isinstance(places, tuple):
             source_discr, target_discr = places
             self.places[auto_source.geometry] = source_discr
             self.places[auto_target.geometry] = target_discr
         else:
-            self.places = places.copy()
+            self.places = places
+
+        self.auto_where = (auto_source, auto_target)
+
+        # }}}
+
+        # {{{ validate
 
+        # check allowed identifiers
+        for name in self.places:
+            if not isinstance(name, str):
+                continue
+            if not _is_valid_identifier(name):
+                raise ValueError("`{}` is not a valid identifier".format(name))
+
+        # check allowed types
         for p in six.itervalues(self.places):
             if not isinstance(p, (PotentialSource, TargetBase, Discretization)):
                 raise TypeError("Must pass discretization, targets or "
                         "layer potential sources as 'places'.")
 
-        # }}}
+        # check cl_context
+        from pytools import is_single_valued
+        cl_contexts = []
+        for p in six.itervalues(self.places):
+            if isinstance(p, (PotentialSource, Discretization)):
+                cl_contexts.append(p.cl_context)
+            elif isinstance(p, TargetBase):
+                nodes = p.nodes()[0]
+                if isinstance(nodes, cl.array.Array) and nodes.queue is not None:
+                    cl_contexts.append(nodes.queue.context)
 
-        self.caches = {}
+        if not is_single_valued(cl_contexts):
+            raise RuntimeError("All 'places' must have the same CL context.")
+
+        self.cl_context = cl_contexts[0]
+
+        # check ambient_dim
+        ambient_dims = [p.ambient_dim for p in six.itervalues(self.places)]
+        if not is_single_valued(ambient_dims):
+            raise RuntimeError("All 'places' must have the same ambient dimension.")
+
+        self.ambient_dim = ambient_dims[0]
+
+        # }}}
 
     @property
     def auto_source(self):
@@ -574,14 +665,76 @@ class GeometryCollection(object):
     def auto_target(self):
         return self.auto_where[1]
 
-    def _get_lpot_discretization(self, lpot, dofdesc):
-        if dofdesc.discr_stage == sym.QBX_SOURCE_STAGE2:
-            return lpot.stage2_density_discr
-        if dofdesc.discr_stage == sym.QBX_SOURCE_QUAD_STAGE2:
-            return lpot.quad_stage2_density_discr
-        return lpot.density_discr
+    # {{{ cache handling
+
+    def _get_cache(self, name):
+        return self.caches.setdefault(name, {})
+
+    def _get_discr_from_cache(self, geometry, discr_stage):
+        cache = self._get_cache(_GEOMETRY_COLLECTION_DISCR_CACHE_NAME)
+        key = (geometry, discr_stage)
+
+        if key not in cache:
+            raise KeyError("cached discretization does not exist on `{}`"
+                    "for stage `{}`".format(geometry, discr_stage))
 
-    def get_discretization(self, dofdesc):
+        return cache[key]
+
+    def _add_discr_to_cache(self, discr, geometry, discr_stage):
+        cache = self._get_cache(_GEOMETRY_COLLECTION_DISCR_CACHE_NAME)
+        key = (geometry, discr_stage)
+
+        if key in cache:
+            raise RuntimeError("trying to overwrite the cache")
+
+        cache[key] = discr
+
+    def _get_conn_from_cache(self, geometry, from_stage, to_stage):
+        cache = self._get_cache(_GEOMETRY_COLLECTION_CONNS_CACHE_NAME)
+        key = (geometry, from_stage, to_stage)
+
+        if key not in cache:
+            raise KeyError("cached connection does not exist on `{}` "
+                    "from `{}` to `{}`".format(geometry, from_stage, to_stage))
+
+        return cache[key]
+
+    def _add_conn_to_cache(self, conn, geometry, from_stage, to_stage):
+        cache = self._get_cache(_GEOMETRY_COLLECTION_CONNS_CACHE_NAME)
+        key = (geometry, from_stage, to_stage)
+
+        if key in cache:
+            raise RuntimeError("trying to overwrite the cache")
+
+        cache[key] = conn
+
+    def _get_qbx_discretization(self, geometry, discr_stage):
+        lpot_source = self.get_geometry(geometry)
+
+        try:
+            discr = self._get_discr_from_cache(geometry, discr_stage)
+        except KeyError:
+            from pytential import sym
+            from pytential.qbx.refinement import _refine_for_global_qbx
+
+            with cl.CommandQueue(lpot_source.cl_context) as queue:
+                # NOTE: this adds the required discretizations to the cache
+                dofdesc = sym.DOFDescriptor(geometry, discr_stage)
+                _refine_for_global_qbx(self, dofdesc,
+                        lpot_source.refiner_code_container.get_wrangler(queue),
+                        _copy_collection=False)
+
+            discr = self._get_discr_from_cache(geometry, discr_stage)
+
+        return discr
+
+    # }}}
+
+    def get_connection(self, from_dd, to_dd):
+        from pytential.symbolic.dof_connection import connection_from_dds
+        return connection_from_dds(self, from_dd, to_dd)
+
+    def get_discretization(self, geometry, discr_stage=None):
         """
         :arg dofdesc: a :class:`~pytential.symbolic.primitives.DOFDescriptor`
             specifying the desired discretization.
@@ -592,35 +745,48 @@ class GeometryCollection(object):
             the corresponding :class:`~meshmode.discretization.Discretization`
             in its attributes instead.
         """
-
-        dofdesc = sym.as_dofdesc(dofdesc)
-        if dofdesc.geometry in self.places:
-            discr = self.places[dofdesc.geometry]
-        else:
-            raise KeyError('geometry not in the collection: {}'.format(
-                dofdesc.geometry))
+        if discr_stage is None:
+            discr_stage = sym.QBX_SOURCE_STAGE1
+        discr = self.get_geometry(geometry)
 
         from pytential.qbx import QBXLayerPotentialSource
         from pytential.source import LayerPotentialSourceBase
 
         if isinstance(discr, QBXLayerPotentialSource):
-            return self._get_lpot_discretization(discr, dofdesc)
+            return self._get_qbx_discretization(geometry, discr_stage)
         elif isinstance(discr, LayerPotentialSourceBase):
             return discr.density_discr
         else:
             return discr
 
-    def get_geometry(self, dofdesc):
-        dofdesc = sym.as_dofdesc(dofdesc)
-        return self.places[dofdesc.geometry]
+    def get_geometry(self, geometry):
+        try:
+            return self.places[geometry]
+        except KeyError:
+            raise KeyError("geometry not in the collection: '{}'".format(
+                geometry))
+
+    def copy(self, places=None, auto_where=None):
+        places = self.places if places is None else places
+        return type(self)(
+                places=places.copy(),
+                auto_where=self.auto_where if auto_where is None else auto_where)
 
-    def copy(self):
-        return GeometryCollection(
-                self.places,
-                auto_where=self.auto_where)
+    def merge(self, places):
+        """Merges two geometry collections and returns the new collection.
 
-    def get_cache(self, name):
-        return self.caches.setdefault(name, {})
+        :arg places: A :class:`dict` or :class:`GeometryCollection` to
+            merge with the current collection. If it is empty, a copy of the
+            current collection is returned.
+        """
+
+        new_places = self.places.copy()
+        if places:
+            if isinstance(places, GeometryCollection):
+                places = places.places
+            new_places.update(places)
+
+        return self.copy(places=new_places)
 
     def __repr__(self):
         return "%s(%s)" % (type(self).__name__, repr(self.places))
@@ -635,7 +801,7 @@ class GeometryCollection(object):
 
 class BoundExpression(object):
     """An expression readied for evaluation by binding it to a
-    :class:`GeometryCollection`.
+    :class:`~pytential.symbolic.execution.GeometryCollection`.
 
     .. automethod :: get_modeled_cost
     .. automethod :: scipy_op
@@ -653,8 +819,8 @@ class BoundExpression(object):
         from pytential.symbolic.compiler import OperatorCompiler
         self.code = OperatorCompiler(self.places)(sym_op_expr)
 
-    def get_discretization(self, where):
-        return self.places.get_discretization(where)
+    def _get_cache(self, name):
+        return self.caches.setdefault(name, {})
 
     def get_modeled_cost(self, queue, **args):
         cost_model_mapper = CostModelMapper(self, queue, args)
@@ -689,7 +855,9 @@ class BoundExpression(object):
             if dom_name is None:
                 size = 1
             else:
-                size = self.places.get_geometry(dom_name).nnodes
+                discr = self.places.get_discretization(
+                        dom_name.geometry, dom_name.discr_stage)
+                size = discr.nnodes
 
             starts_and_ends.append((total_dofs, total_dofs+size))
             total_dofs += size
@@ -733,7 +901,7 @@ class BoundExpression(object):
 
 def bind(places, expr, auto_where=None):
     """
-    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`.
         Alternatively, any list or mapping that is a valid argument for its
         constructor can also be used.
     :arg auto_where: for simple source-to-self or source-to-target
@@ -744,10 +912,10 @@ def bind(places, expr, auto_where=None):
         in the form of a :mod:`numpy` object array
     :returns: a :class:`BoundExpression`
     """
-
     if not isinstance(places, GeometryCollection):
         places = GeometryCollection(places, auto_where=auto_where)
-    expr = _prepare_expr(places, expr)
+        auto_where = places.auto_where
+    expr = _prepare_expr(places, expr, auto_where=auto_where)
 
     return BoundExpression(places, expr)
 
@@ -756,14 +924,27 @@ def bind(places, expr, auto_where=None):
 
 # {{{ matrix building
 
-def _bmat(blocks, dtypes):
+def _bmat(blocks, dtype=None, shape=None):
     from pytools import single_valued
     from pytential.symbolic.matrix import is_zero
 
     nrows = blocks.shape[0]
     ncolumns = blocks.shape[1]
 
-    # "block row starts"/"block column starts"
+    if dtype is None:
+        dtype = [blocks[ibrow, ibcol].dtype
+                 for ibcol in range(ncolumns)
+                 for ibrow in range(nrows)
+                 if not is_zero(blocks[ibrow, ibcol])]
+        dtype = np.find_common_type(dtype, [])
+
+    if blocks.size == 1:
+        if shape is None:
+            return blocks[0, 0]
+        else:
+            return np.zeros(shape, dtype=dtype)
+
+    # "block row starts" / "block column starts"
     brs = np.cumsum([0]
             + [single_valued(blocks[ibrow, ibcol].shape[0]
                              for ibcol in range(ncolumns)
@@ -776,8 +957,12 @@ def _bmat(blocks, dtypes):
                              if not is_zero(blocks[ibrow, ibcol]))
              for ibcol in range(ncolumns)])
 
-    result = np.zeros((brs[-1], bcs[-1]),
-                      dtype=np.find_common_type(dtypes, []))
+    if shape is None:
+        shape = (brs[-1], bcs[-1])
+    else:
+        assert shape == (brs[-1], bcs[-1])
+
+    result = np.zeros(shape, dtype=dtype)
     for ibcol in range(ncolumns):
         for ibrow in range(nrows):
             result[brs[ibrow]:brs[ibrow + 1], bcs[ibcol]:bcs[ibcol + 1]] = \
@@ -790,7 +975,7 @@ def build_matrix(queue, places, exprs, input_exprs, domains=None,
         auto_where=None, context=None):
     """
     :arg queue: a :class:`pyopencl.CommandQueue`.
-    :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`.
+    :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`.
         Alternatively, any list or mapping that is a valid argument for its
         constructor can also be used.
     :arg exprs: an array of expressions corresponding to the output block
@@ -810,10 +995,11 @@ def build_matrix(queue, places, exprs, input_exprs, domains=None,
     if context is None:
         context = {}
 
+    from pytential import GeometryCollection
     from pytools.obj_array import is_obj_array, make_obj_array
     if not isinstance(places, GeometryCollection):
         places = GeometryCollection(places, auto_where=auto_where)
-    exprs = _prepare_expr(places, exprs)
+    exprs = _prepare_expr(places, exprs, auto_where=auto_where)
 
     if not is_obj_array(exprs):
         exprs = make_obj_array([exprs])
@@ -833,13 +1019,17 @@ def build_matrix(queue, places, exprs, input_exprs, domains=None,
 
     dtypes = []
     for ibcol in range(nblock_columns):
+        dep_source = places.get_geometry(domains[ibcol].geometry)
+        dep_discr = places.get_discretization(
+                domains[ibcol].geometry, domains[ibcol].discr_stage)
+
         mbuilder = MatrixBuilder(
                 queue,
                 dep_expr=input_exprs[ibcol],
                 other_dep_exprs=(input_exprs[:ibcol]
                                  + input_exprs[ibcol + 1:]),
-                dep_source=places.get_geometry(domains[ibcol]),
-                dep_discr=places.get_discretization(domains[ibcol]),
+                dep_source=dep_source,
+                dep_discr=dep_discr,
                 places=places,
                 context=context)
 
@@ -851,7 +1041,8 @@ def build_matrix(queue, places, exprs, input_exprs, domains=None,
             if isinstance(block, np.ndarray):
                 dtypes.append(block.dtype)
 
-    return cl.array.to_device(queue, _bmat(blocks, dtypes))
+    dtypes = np.find_common_type(dtypes, [])
+    return cl.array.to_device(queue, _bmat(blocks, dtype=dtypes))
 
 # }}}
 
diff --git a/pytential/symbolic/mappers.py b/pytential/symbolic/mappers.py
index 66c0bba0f9f851c805357cf1f1d61af8c9b6415d..09dcbe15ec1f1ec732f44da4d7f740d5dae0a893 100644
--- a/pytential/symbolic/mappers.py
+++ b/pytential/symbolic/mappers.py
@@ -430,8 +430,8 @@ class DerivativeBinder(DerivativeBinderBase, IdentityMapper):
 
 class UnregularizedPreprocessor(IdentityMapper):
 
-    def __init__(self, source_name, places):
-        self.source_name = source_name
+    def __init__(self, geometry, places):
+        self.geometry = geometry
         self.places = places
 
     def map_int_g(self, expr):
@@ -460,17 +460,23 @@ class InterpolationPreprocessor(IdentityMapper):
     a :class:`~pytential.symbolic.primitives.Interpolation`. This is used to
 
     * do differentiation on
-    :attr:`~pytential.source.LayerPotentialSource.quad_stage2_density_discr`,
-    by performing it on
-    :attr:`~pytential.source.LayerPotentialSource.stage2_density_discr` and
-    upsampling.
+    :class:`~pytential.symbolic.primitives.QBX_SOURCE_QUAD_STAGE2`,
+    by performing it on :attr:`from_discr_stage` and upsampling.
     * upsample layer potential sources to
-    :attr:`~pytential.source.LayerPotentialSource.quad_stage2_density_discr`,
+    :class:`~pytential.symbolic.primitives.QBX_SOURCE_QUAD_STAGE2`.
     """
 
-    def __init__(self, places):
+    def __init__(self, places, from_discr_stage=None):
+        """
+        .. attribute:: from_discr_stage
+
+            Sets the stage on which to compute the data before interpolation.
+            For valid values, see
+            :attr:`~pytential.symbolic.primitives.DOFDescriptor.discr_stage`.
+        """
         self.places = places
-        self.from_discr_stage = prim.QBX_SOURCE_STAGE2
+        self.from_discr_stage = (prim.QBX_SOURCE_STAGE2
+                if from_discr_stage is None else from_discr_stage)
         self.tagger = DiscretizationStageTagger(self.from_discr_stage)
 
     def map_num_reference_derivative(self, expr):
@@ -479,7 +485,7 @@ class InterpolationPreprocessor(IdentityMapper):
             return expr
 
         from pytential.qbx import QBXLayerPotentialSource
-        lpot_source = self.places.get_geometry(to_dd)
+        lpot_source = self.places.get_geometry(to_dd.geometry)
         if not isinstance(lpot_source, QBXLayerPotentialSource):
             return expr
 
@@ -487,27 +493,37 @@ class InterpolationPreprocessor(IdentityMapper):
         return prim.interp(from_dd, to_dd, self.rec(self.tagger(expr)))
 
     def map_int_g(self, expr):
-        from_dd = expr.source
-        if from_dd.discr_stage is not None:
-            return expr
+        if expr.target.discr_stage is None:
+            expr = expr.copy(target=expr.target.to_stage1())
 
         from pytential.qbx import QBXLayerPotentialSource
-        lpot_source = self.places.get_geometry(from_dd)
+        lpot_source = self.places.get_geometry(expr.source.geometry)
         if not isinstance(lpot_source, QBXLayerPotentialSource):
             return expr
 
-        to_dd = from_dd.copy(discr_stage=prim.QBX_SOURCE_QUAD_STAGE2)
-        density = prim.interp(from_dd, to_dd, self.rec(expr.density))
-        kernel_arguments = dict(
-                (name, prim.interp(from_dd, to_dd, self.rec(arg_expr)))
-                for name, arg_expr in expr.kernel_arguments.items())
+        if expr.source.discr_stage is None:
+            from_dd = expr.source.to_stage1()
+            to_dd = from_dd.to_quad_stage2()
+            density = prim.interp(from_dd, to_dd, self.rec(expr.density))
+
+            from_dd = from_dd.copy(discr_stage=self.from_discr_stage)
+            kernel_arguments = dict(
+                    (name, prim.interp(from_dd, to_dd,
+                        self.rec(self.tagger(arg_expr))))
+                    for name, arg_expr in expr.kernel_arguments.items())
+        else:
+            to_dd = expr.source
+
+            density = expr.density
+            kernel_arguments = dict(
+                    (name, self.rec(arg_expr))
+                    for name, arg_expr in expr.kernel_arguments.items())
 
         return expr.copy(
                 kernel=expr.kernel,
                 density=density,
                 kernel_arguments=kernel_arguments,
-                source=to_dd,
-                target=expr.target)
+                source=to_dd)
 
 # }}}
 
@@ -515,16 +531,18 @@ class InterpolationPreprocessor(IdentityMapper):
 # {{{ QBX preprocessor
 
 class QBXPreprocessor(IdentityMapper):
-    def __init__(self, source_name, places):
-        self.source_name = source_name
+    def __init__(self, geometry, places):
+        self.geometry = geometry
         self.places = places
 
     def map_int_g(self, expr):
-        if expr.source.geometry != self.source_name:
+        if expr.source.geometry != self.geometry:
             return expr
 
-        source_discr = self.places.get_discretization(expr.source)
-        target_discr = self.places.get_discretization(expr.target)
+        source_discr = self.places.get_discretization(
+                expr.source.geometry, expr.source.discr_stage)
+        target_discr = self.places.get_discretization(
+                expr.target.geometry, expr.target.discr_stage)
 
         if expr.qbx_forced_limit == 0:
             raise ValueError("qbx_forced_limit == 0 was a bad idea and "
diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py
index 9f9e0e4a268b7f365003b9c293b7b08e246bf01c..ae3718d771c29bc8037a4fd83d09b73116724037 100644
--- a/pytential/symbolic/matrix.py
+++ b/pytential/symbolic/matrix.py
@@ -32,127 +32,76 @@ import pyopencl.array  # noqa
 import six
 from six.moves import intern
 
+from pytools import memoize_method
 from pytential.symbolic.mappers import EvaluationMapperBase
-import pytential.symbolic.primitives as sym
-from pytential.symbolic.execution import bind
 
 
 # {{{ helpers
 
 def is_zero(x):
-    return isinstance(x, (int, float, complex, np.number)) and x == 0
+    return ((x is None)
+            or (isinstance(x, (int, float, complex, np.number)) and x == 0))
 
 
-def _get_layer_potential_args(mapper, expr, source):
+def _get_layer_potential_args(mapper, expr, include_args=None):
     """
-    :arg mapper: a :class:`pytential.symbolic.matrix.MatrixBuilderBase`.
+    :arg mapper: a :class:`~pytential.symbolic.matrix.MatrixBuilderBase`.
     :arg expr: symbolic layer potential expression.
-    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
 
     :return: a mapping of kernel arguments evaluated by the *mapper*.
     """
 
     kernel_args = {}
     for arg_name, arg_expr in six.iteritems(expr.kernel_arguments):
+        if (include_args is not None
+                and arg_name not in include_args):
+            continue
+
         kernel_args[arg_name] = mapper.rec(arg_expr)
 
     return kernel_args
 
+# }}}
 
-def _get_kernel_args(mapper, kernel, expr, source):
-    """
-    :arg mapper: a :class:`pytential.symbolic.matrix.MatrixBuilderBase`.
-    :arg kernel: a :class:`sumpy.kernel.Kernel`.
-    :arg expr: symbolic layer potential expression.
-    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
-
-    :return: a mapping of kernel arguments evaluated by the *mapper*.
-    """
 
-    # NOTE: copied from pytential.symbolic.primitives.IntG
-    inner_kernel_args = kernel.get_args() + kernel.get_source_args()
-    inner_kernel_args = set(arg.loopy_arg.name for arg in inner_kernel_args)
+# {{{ base classes for matrix builders
 
-    kernel_args = {}
-    for arg_name, arg_expr in six.iteritems(expr.kernel_arguments):
-        if arg_name not in inner_kernel_args:
-            continue
-        kernel_args[arg_name] = mapper.rec(arg_expr)
+class MatrixBuilderBase(EvaluationMapperBase):
+    r"""Construct a matrix block from a given symbolic representation of
+    an integral equation. For example, let the integral equation be
+    given by:
 
-    return kernel_args
+        .. math::
 
+            b_i = \mathcal{S}_{ij}[\sigma_j]
 
-def _get_weights_and_area_elements(queue, source, source_discr):
-    """
-    :arg queue: a :class:`pyopencl.CommandQueue`.
-    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
-    :arg source_discr: a :class:`meshmode.discretization.Discretization`.
+    This class evaluates a specific :math:`\mathcal{S}_{ij}`.
 
-    :return: quadrature weights for each node in *source_discr*.
-    """
+    .. attribute:: dep_expr
 
-    if source.quad_stage2_density_discr is source_discr:
-        waa = source.weights_and_area_elements().with_queue(queue)
-    else:
-        # NOTE: copied from `weights_and_area_elements`, but using the
-        # discretization given by `where` and no interpolation
-        area = bind(source_discr,
-                sym.area_element(source.ambient_dim, source.dim))(queue)
-        qweight = bind(source_discr, sym.QWeight())(queue)
-        waa = area * qweight
+        Symbolic expression for the column input, i.e. the name of the
+        :math:`\sigma_j`.
 
-    return waa
+    .. attribute:: other_dep_exprs
 
+        A list of the remaining input expressions, which can be empty.
 
-def _get_centers_and_expansion_radii(queue, source, target_discr, qbx_forced_limit):
-    """
-    :arg queue: a :class:`pyopencl.CommandQueue`.
-    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
-    :arg target_discr: a :class:`meshmode.discretization.Discretization`.
-    :arg qbx_forced_limit: an integer (*+1* or *-1*).
+    .. attribute:: dep_source
 
-    :return: a tuple of `(centers, radii)` for each node in *target_discr*.
-    """
+        A :class:`~pytential.source.LayerPotentialSourceBase`, where the
+        :attr:`dep_expr` is defined.
 
-    if source.density_discr is target_discr:
-        # NOTE: skip expensive target association
-        centers = bind(source,
-            sym.expansion_centers(source.ambient_dim, qbx_forced_limit))(queue)
-        radii = bind(source,
-            sym.expansion_radii(source.ambient_dim))(queue)
-    else:
-        from pytential.qbx.utils import get_interleaved_centers
-        centers = get_interleaved_centers(queue, source)
-        radii = bind(source, sym.expansion_radii(
-            source.ambient_dim,
-            granularity=sym.GRANULARITY_CENTER))(queue)
-
-        # NOTE: using a very small tolerance to make sure all the stage2
-        # targets are associated to a center. We can't use the user provided
-        # source.target_association_tolerance here because it will likely be
-        # way too small.
-        target_association_tolerance = 1.0e-1
-
-        from pytential.qbx.target_assoc import associate_targets_to_qbx_centers
-        code_container = source.target_association_code_container
-        assoc = associate_targets_to_qbx_centers(
-                source,
-                code_container.get_wrangler(queue),
-                [(target_discr, qbx_forced_limit)],
-                target_association_tolerance=target_association_tolerance)
-
-        centers = [cl.array.take(c, assoc.target_to_center, queue=queue)
-                   for c in centers]
-        radii = cl.array.take(radii, assoc.target_to_center, queue=queue)
-
-    return centers, radii
+    .. attribute:: dep_discr
 
-# }}}
+        The exact :class:`~meshmode.discretization.Discretization` for
+        the :attr:`dep_expr`.
 
+    .. attribute:: places
 
-# {{{ base class for matrix builders
+        A :class:`~pytential.symbolic.execution.GeometryCollection` of all
+        the domains in the integral equation definition.
+    """
 
-class MatrixBuilderBase(EvaluationMapperBase):
     def __init__(self, queue, dep_expr, other_dep_exprs,
             dep_source, dep_discr, places, context):
         """
@@ -161,11 +110,11 @@ class MatrixBuilderBase(EvaluationMapperBase):
             that the builder is evaluating.
         :arg other_dep_exprs: symbolic expressions for the remaining input
             block columns.
-        :arg dep_source: a :class:`pytential.source.LayerPotentialSourceBase`
+        :arg dep_source: a :class:`~pytential.source.LayerPotentialSourceBase`
             for the given *dep_expr*.
-        :arg dep_discr: a concerete :class:`meshmode.discretization.Discretization`
+        :arg dep_discr: a concrete :class:`~meshmode.discretization.Discretization`
             for the given *dep_expr*.
-        :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`
+        :arg places: a :class:`~pytential.symbolic.execution.GeometryCollection`
             for all the sources and targets the builder is expected to
             encounter.
         """
@@ -178,12 +127,10 @@ class MatrixBuilderBase(EvaluationMapperBase):
         self.dep_discr = dep_discr
         self.places = places
 
-        self.dep_nnodes = dep_discr.nnodes
-
     # {{{
 
     def get_dep_variable(self):
-        return np.eye(self.dep_nnodes, dtype=np.float64)
+        return np.eye(self.dep_discr.nnodes, dtype=np.float64)
 
     def is_kind_vector(self, x):
         return len(x.shape) == 1
@@ -278,6 +225,7 @@ class MatrixBuilderBase(EvaluationMapperBase):
             return vecs_and_scalars
 
     def map_num_reference_derivative(self, expr):
+        from pytential import bind, sym
         rec_operand = self.rec(expr.operand)
 
         assert isinstance(rec_operand, np.ndarray)
@@ -292,6 +240,7 @@ class MatrixBuilderBase(EvaluationMapperBase):
         return bind(self.places, op)(self.queue, u=rec_operand).get()
 
     def map_node_coordinate_component(self, expr):
+        from pytential import bind, sym
         op = sym.NodeCoordinateComponent(expr.ambient_axis, dofdesc=expr.dofdesc)
         return bind(self.places, op)(self.queue).get()
 
@@ -305,8 +254,9 @@ class MatrixBuilderBase(EvaluationMapperBase):
         if isinstance(rec_arg, np.ndarray):
             rec_arg = cl.array.to_device(self.queue, rec_arg)
 
+        from pytential import bind, sym
         op = expr.function(sym.var("u"))
-        result = bind(self.dep_source, op)(self.queue, u=rec_arg)
+        result = bind(self.places, op)(self.queue, u=rec_arg)
 
         if isinstance(result, cl.array.Array):
             result = result.get()
@@ -324,21 +274,50 @@ class MatrixBlockBuilderBase(MatrixBuilderBase):
     to evaluate linear combinations of layer potential operators.
     For example, they do not support composition of operators because we
     assume that each operator acts directly on the density.
+
+    .. attribute:: index_set
+
+        A :class:`sumpy.tools.MatrixBlockIndexRanges` class describing
+        which blocks are going to be evaluated.
     """
 
     def __init__(self, queue, dep_expr, other_dep_exprs,
             dep_source, dep_discr, places, index_set, context):
         """
-        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` class
-            describing which blocks are going to be evaluated.
         """
 
         super(MatrixBlockBuilderBase, self).__init__(queue,
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, context)
-
         self.index_set = index_set
-        self.dep_nnodes = index_set.col.indices.size
+
+    @property
+    @memoize_method
+    def _mat_mapper(self):
+        # mat_mapper is used to compute any kernel arguments that need to
+        # be computed on the full discretization, ignoring our index_set,
+        # e.g. the normal in a double layer potential
+
+        return MatrixBuilder(self.queue,
+                self.dep_expr,
+                self.other_dep_exprs,
+                self.dep_source,
+                self.dep_discr,
+                self.places, self.context)
+
+    @property
+    @memoize_method
+    def _blk_mapper(self):
+        # blk_mapper is used to recursively compute the density to
+        # a layer potential operator to ensure there is no composition
+
+        return MatrixBlockBuilderBase(self.queue,
+                self.dep_expr,
+                self.other_dep_exprs,
+                self.dep_source,
+                self.dep_discr,
+                self.places,
+                self.index_set, self.context)
 
     def get_dep_variable(self):
         return 1.0
@@ -368,30 +347,48 @@ class MatrixBuilder(MatrixBuilderBase):
                 dep_source, dep_discr, places, context)
 
     def map_interpolation(self, expr):
+        from pytential import sym
+
         if expr.to_dd.discr_stage != sym.QBX_SOURCE_QUAD_STAGE2:
             raise RuntimeError("can only interpolate to QBX_SOURCE_QUAD_STAGE2")
-
         operand = self.rec(expr.operand)
+
         if isinstance(operand, (int, float, complex, np.number)):
             return operand
         elif isinstance(operand, np.ndarray) and operand.ndim == 1:
-            from pytential.symbolic.dof_connection import connection_from_dds
-            conn = connection_from_dds(self.places,
-                    expr.from_dd, expr.to_dd)
-
-            operand = cl.array.to_device(self.queue, operand)
-            return conn(self.queue, operand).get(self.queue)
+            conn = self.places.get_connection(expr.from_dd, expr.to_dd)
+            return conn(self.queue,
+                    cl.array.to_device(self.queue, operand)).get(self.queue)
         elif isinstance(operand, np.ndarray) and operand.ndim == 2:
-            resampler = self.places.get_geometry(expr.from_dd).direct_resampler
-            mat = resampler.full_resample_matrix(self.queue).get(self.queue)
+            cache = self.places._get_cache("direct_resampler")
+            key = (expr.from_dd.geometry,
+                    expr.from_dd.discr_stage,
+                    expr.to_dd.discr_stage)
+
+            try:
+                mat = cache[key]
+            except KeyError:
+                from meshmode.discretization.connection import \
+                    flatten_chained_connection
+
+                conn = self.places.get_connection(expr.from_dd, expr.to_dd)
+                conn = flatten_chained_connection(self.queue, conn)
+                mat = conn.full_resample_matrix(self.queue).get(self.queue)
+
+                # FIXME: the resample matrix is slow to compute and very big
+                # to store, so caching it may not be the best idea
+                cache[key] = mat
+
             return mat.dot(operand)
         else:
-            raise RuntimeError('unknown operand type: {}'.format(type(operand)))
+            raise RuntimeError("unknown operand type: {}".format(type(operand)))
 
     def map_int_g(self, expr):
-        lpot_source = self.places.get_geometry(expr.source)
-        source_discr = self.places.get_discretization(expr.source)
-        target_discr = self.places.get_discretization(expr.target)
+        lpot_source = self.places.get_geometry(expr.source.geometry)
+        source_discr = self.places.get_discretization(
+                expr.source.geometry, expr.source.discr_stage)
+        target_discr = self.places.get_discretization(
+                expr.target.geometry, expr.target.discr_stage)
 
         rec_density = self.rec(expr.density)
         if is_zero(rec_density):
@@ -402,7 +399,7 @@ class MatrixBuilder(MatrixBuilderBase):
             raise NotImplementedError("layer potentials on non-variables")
 
         kernel = expr.kernel
-        kernel_args = _get_layer_potential_args(self, expr, lpot_source)
+        kernel_args = _get_layer_potential_args(self, expr)
 
         from sumpy.expansion.local import LineTaylorLocalExpansion
         local_expn = LineTaylorLocalExpansion(kernel, lpot_source.qbx_order)
@@ -412,8 +409,14 @@ class MatrixBuilder(MatrixBuilderBase):
                 self.queue.context, (local_expn,))
 
         assert abs(expr.qbx_forced_limit) > 0
-        centers, radii = _get_centers_and_expansion_radii(self.queue,
-                lpot_source, target_discr, expr.qbx_forced_limit)
+        from pytential import bind, sym
+        radii = bind(self.places, sym.expansion_radii(
+            source_discr.ambient_dim,
+            dofdesc=expr.target))(self.queue)
+        centers = bind(self.places, sym.expansion_centers(
+            source_discr.ambient_dim,
+            expr.qbx_forced_limit,
+            dofdesc=expr.target))(self.queue)
 
         _, (mat,) = mat_gen(self.queue,
                 targets=target_discr.nodes(),
@@ -423,7 +426,9 @@ class MatrixBuilder(MatrixBuilderBase):
                 **kernel_args)
         mat = mat.get()
 
-        waa = _get_weights_and_area_elements(self.queue, lpot_source, source_discr)
+        waa = bind(self.places, sym.weights_and_area_elements(
+            source_discr.ambient_dim,
+            dofdesc=expr.source))(self.queue)
         mat[:, :] *= waa.get(self.queue)
         mat = mat.dot(rec_density)
 
@@ -436,17 +441,24 @@ class MatrixBuilder(MatrixBuilderBase):
 
 class P2PMatrixBuilder(MatrixBuilderBase):
     def __init__(self, queue, dep_expr, other_dep_exprs,
-            dep_source, dep_discr, places, context, exclude_self=True):
+            dep_source, dep_discr, places, context,
+            weighted=False, exclude_self=True):
         super(P2PMatrixBuilder, self).__init__(queue,
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, context)
 
+        self.weighted = weighted
         self.exclude_self = exclude_self
 
+        from pytential.source import LayerPotentialSourceBase
+        if not isinstance(dep_source, LayerPotentialSourceBase):
+            self.weighted = False
+
     def map_int_g(self, expr):
-        source = self.places.get_geometry(expr.source)
-        source_discr = self.places.get_discretization(expr.source)
-        target_discr = self.places.get_discretization(expr.target)
+        source_discr = self.places.get_discretization(
+                expr.source.geometry, expr.source.discr_stage)
+        target_discr = self.places.get_discretization(
+                expr.target.geometry, expr.target.discr_stage)
 
         rec_density = self.rec(expr.density)
         if is_zero(rec_density):
@@ -456,8 +468,15 @@ class P2PMatrixBuilder(MatrixBuilderBase):
         if not self.is_kind_matrix(rec_density):
             raise NotImplementedError("layer potentials on non-variables")
 
+        # NOTE: copied from pytential.symbolic.primitives.IntG
+        # NOTE: P2P evaluation only uses the inner kernel, so it should not
+        # get other kernel_args, e.g. normal vectors in a double layer
         kernel = expr.kernel.get_base_kernel()
-        kernel_args = _get_kernel_args(self, kernel, expr, source)
+        kernel_args = kernel.get_args() + kernel.get_source_args()
+        kernel_args = set(arg.loopy_arg.name for arg in kernel_args)
+
+        kernel_args = _get_layer_potential_args(self,
+                expr, include_args=kernel_args)
         if self.exclude_self:
             kernel_args["target_to_source"] = \
                 cl.array.arange(self.queue, 0, target_discr.nnodes, dtype=np.int)
@@ -470,8 +489,15 @@ class P2PMatrixBuilder(MatrixBuilderBase):
                 targets=target_discr.nodes(),
                 sources=source_discr.nodes(),
                 **kernel_args)
-
         mat = mat.get()
+
+        if self.weighted:
+            from pytential import bind, sym
+            waa = bind(self.places, sym.weights_and_area_elements(
+                source_discr.ambient_dim,
+                dofdesc=expr.source))(self.queue)
+            mat[:, :] *= waa.get(self.queue)
+
         mat = mat.dot(rec_density)
 
         return mat
@@ -487,19 +513,6 @@ class NearFieldBlockBuilder(MatrixBlockBuilderBase):
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, index_set, context)
 
-        # NOTE: we need additional mappers to redirect some operations:
-        #   * mat_mapper is used to compute any kernel arguments that need to
-        #   be computed on the full discretization, ignoring our index_set,
-        #   e.g the normal in a double layer potential
-        #   * blk_mapper is used to recursively compute the density to
-        #   a layer potential operator to ensure there is no composition
-        self.mat_mapper = MatrixBuilderBase(queue,
-                dep_expr, other_dep_exprs, dep_source, dep_discr,
-                places, context)
-        self.blk_mapper = MatrixBlockBuilderBase(queue,
-                dep_expr, other_dep_exprs, dep_source, dep_discr,
-                places, index_set, context)
-
     def get_dep_variable(self):
         tgtindices = self.index_set.linear_row_indices.get(self.queue)
         srcindices = self.index_set.linear_col_indices.get(self.queue)
@@ -507,33 +520,41 @@ class NearFieldBlockBuilder(MatrixBlockBuilderBase):
         return np.equal(tgtindices, srcindices).astype(np.float64)
 
     def map_int_g(self, expr):
-        source = self.places.get_geometry(expr.source)
-        source_discr = self.places.get_discretization(expr.source)
-        target_discr = self.places.get_discretization(expr.target)
+        lpot_source = self.places.get_geometry(expr.source.geometry)
+        source_discr = self.places.get_discretization(
+                expr.source.geometry, expr.source.discr_stage)
+        target_discr = self.places.get_discretization(
+                expr.target.geometry, expr.target.discr_stage)
 
         if source_discr is not target_discr:
-            raise NotImplementedError()
+            raise NotImplementedError
 
-        rec_density = self.blk_mapper.rec(expr.density)
+        rec_density = self._blk_mapper.rec(expr.density)
         if is_zero(rec_density):
             return 0
 
         if not np.isscalar(rec_density):
-            raise NotImplementedError()
+            raise NotImplementedError
 
         kernel = expr.kernel
-        kernel_args = _get_layer_potential_args(self.mat_mapper, expr, None)
+        kernel_args = _get_layer_potential_args(self._mat_mapper, expr)
 
         from sumpy.expansion.local import LineTaylorLocalExpansion
-        local_expn = LineTaylorLocalExpansion(kernel, source.qbx_order)
+        local_expn = LineTaylorLocalExpansion(kernel, lpot_source.qbx_order)
 
         from sumpy.qbx import LayerPotentialMatrixBlockGenerator
         mat_gen = LayerPotentialMatrixBlockGenerator(
                 self.queue.context, (local_expn,))
 
         assert abs(expr.qbx_forced_limit) > 0
-        centers, radii = _get_centers_and_expansion_radii(self.queue,
-                source, target_discr, expr.qbx_forced_limit)
+        from pytential import bind, sym
+        radii = bind(self.places, sym.expansion_radii(
+            source_discr.ambient_dim,
+            dofdesc=expr.target))(self.queue)
+        centers = bind(self.places, sym.expansion_centers(
+            source_discr.ambient_dim,
+            expr.qbx_forced_limit,
+            dofdesc=expr.target))(self.queue)
 
         _, (mat,) = mat_gen(self.queue,
                 targets=target_discr.nodes(),
@@ -543,7 +564,9 @@ class NearFieldBlockBuilder(MatrixBlockBuilderBase):
                 index_set=self.index_set,
                 **kernel_args)
 
-        waa = _get_weights_and_area_elements(self.queue, source, source_discr)
+        waa = bind(self.places, sym.weights_and_area_elements(
+            source_discr.ambient_dim,
+            dofdesc=expr.source))(self.queue)
         mat *= waa[self.index_set.linear_col_indices]
         mat = rec_density * mat.get(self.queue)
 
@@ -552,19 +575,14 @@ class NearFieldBlockBuilder(MatrixBlockBuilderBase):
 
 class FarFieldBlockBuilder(MatrixBlockBuilderBase):
     def __init__(self, queue, dep_expr, other_dep_exprs, dep_source, dep_discr,
-            places, index_set, context, exclude_self=False):
+            places, index_set, context,
+            weighted=False, exclude_self=True):
         super(FarFieldBlockBuilder, self).__init__(queue,
                 dep_expr, other_dep_exprs, dep_source, dep_discr,
                 places, index_set, context)
 
-        # NOTE: same mapper issues as in the NearFieldBlockBuilder
+        self.weighted = weighted
         self.exclude_self = exclude_self
-        self.mat_mapper = MatrixBuilderBase(queue,
-                dep_expr, other_dep_exprs, dep_source, dep_discr,
-                places, context)
-        self.blk_mapper = MatrixBlockBuilderBase(queue,
-                dep_expr, other_dep_exprs, dep_source, dep_discr,
-                places, index_set, context)
 
     def get_dep_variable(self):
         tgtindices = self.index_set.linear_row_indices.get(self.queue)
@@ -573,22 +591,27 @@ class FarFieldBlockBuilder(MatrixBlockBuilderBase):
         return np.equal(tgtindices, srcindices).astype(np.float64)
 
     def map_int_g(self, expr):
-        source = self.places.get_geometry(expr.source)
-        source_discr = self.places.get_discretization(expr.source)
-        target_discr = self.places.get_discretization(expr.target)
+        source_discr = self.places.get_discretization(
+                expr.source.geometry, expr.source.discr_stage)
+        target_discr = self.places.get_discretization(
+                expr.target.geometry, expr.target.discr_stage)
 
-        if source_discr is not target_discr:
-            raise NotImplementedError()
-
-        rec_density = self.blk_mapper.rec(expr.density)
+        rec_density = self._blk_mapper.rec(expr.density)
         if is_zero(rec_density):
             return 0
 
         if not np.isscalar(rec_density):
-            raise NotImplementedError()
+            raise NotImplementedError
 
+        # NOTE: copied from pytential.symbolic.primitives.IntG
+        # NOTE: P2P evaluation only uses the inner kernel, so it should not
+        # get other kernel_args, e.g. normal vectors in a double layer
         kernel = expr.kernel.get_base_kernel()
-        kernel_args = _get_kernel_args(self.mat_mapper, kernel, expr, source)
+        kernel_args = kernel.get_args() + kernel.get_source_args()
+        kernel_args = set(arg.loopy_arg.name for arg in kernel_args)
+
+        kernel_args = _get_layer_potential_args(self._mat_mapper,
+                expr, include_args=kernel_args)
         if self.exclude_self:
             kernel_args["target_to_source"] = \
                 cl.array.arange(self.queue, 0, target_discr.nnodes, dtype=np.int)
@@ -602,9 +625,15 @@ class FarFieldBlockBuilder(MatrixBlockBuilderBase):
                 sources=source_discr.nodes(),
                 index_set=self.index_set,
                 **kernel_args)
-        mat = rec_density * mat.get(self.queue)
 
-        return mat
+        if self.weighted:
+            from pytential import bind, sym
+            waa = bind(self.places, sym.weights_and_area_elements(
+                source_discr.ambient_dim,
+                dofdesc=expr.source))(self.queue)
+            mat *= waa[self.index_set.linear_col_indices]
+
+        return rec_density * mat.get(self.queue)
 
 # }}}
 
diff --git a/pytential/symbolic/pde/maxwell/waveguide.py b/pytential/symbolic/pde/maxwell/waveguide.py
index 4d716fba24c86ec0ae0706b1368187500d314bc0..3a1d3a63dfa932969c2363ac2c09da0a98e3b89f 100644
--- a/pytential/symbolic/pde/maxwell/waveguide.py
+++ b/pytential/symbolic/pde/maxwell/waveguide.py
@@ -436,7 +436,11 @@ class Dielectric2DBoundaryOperatorBase(L2WeightedPDEOperator):
         if use_l2_weighting is None:
             use_l2_weighting = False
 
+        from sumpy.kernel import HelmholtzKernel
+        self.kernel = HelmholtzKernel(2, allow_evanescent=True)
+
         super(Dielectric2DBoundaryOperatorBase, self).__init__(
+                self.kernel,
                 use_l2_weighting=use_l2_weighting)
 
         if mode == "te":
@@ -483,9 +487,6 @@ class Dielectric2DBoundaryOperatorBase(L2WeightedPDEOperator):
                 sym.cse((k_expr**2-beta**2)**0.5, "K%d" % i)
                 for i, k_expr in enumerate(self.domain_k_exprs)]
 
-        from sumpy.kernel import HelmholtzKernel
-        self.kernel = HelmholtzKernel(2, allow_evanescent=True)
-
         # {{{ build bc list
 
         # list of tuples, where each tuple consists of BCTermDescriptor instances
@@ -629,7 +630,7 @@ class Dielectric2DBoundaryOperatorBase(L2WeightedPDEOperator):
                 assert False, raw_potential_op
         elif term.direction == self.dir_normal:
             potential_op = sym.normal_derivative(
-                    potential_op, interface_id)
+                    2, potential_op, dofdesc=interface_id)
 
             if raw_potential_op is sym.S:
                 # S'
@@ -686,6 +687,14 @@ class DielectricSRep2DBoundaryOperator(Dielectric2DBoundaryOperatorBase):
             ``i_interface`` is the number of the enclosed domain, starting from 0.
         """
         result = np.zeros((2, 2, len(self.interfaces)), dtype=np.object)
+        sides = {
+                self.side_out: "o",
+                self.side_in: "i"
+                }
+        fields = {
+                self.field_kind_e: "E",
+                self.field_kind_h: "H"
+                }
 
         i_unknown = 0
         for side in self.sides:
@@ -704,15 +713,8 @@ class DielectricSRep2DBoundaryOperator(Dielectric2DBoundaryOperatorBase):
                         dens = sym.cse(
                                 dens/self.get_sqrt_weight(interface_id),
                                 "dens_{side}_{field}_{dom}".format(
-                                    side={
-                                        self.side_out: "o",
-                                        self.side_in: "i"}
-                                    [side],
-                                    field={
-                                        self.field_kind_e: "E",
-                                        self.field_kind_h: "H"
-                                        }
-                                    [field_kind],
+                                    side=sides[side],
+                                    field=fields[field_kind],
                                     dom=i_interface))
 
                     result[side, field_kind, i_interface] = dens
@@ -720,7 +722,7 @@ class DielectricSRep2DBoundaryOperator(Dielectric2DBoundaryOperatorBase):
         assert i_unknown == len(unknown)
         return result
 
-    def representation(self, unknown, i_domain):
+    def representation(self, unknown, i_domain, qbx_forced_limit=None):
         """
         :return: a symbolic expression for the representation of the PDE solution
             in domain number *i_domain*.
@@ -749,7 +751,8 @@ class DielectricSRep2DBoundaryOperator(Dielectric2DBoundaryOperatorBase):
                             self.kernel,
                             my_unk,
                             source=interface_id,
-                            k=self.domain_K_exprs[i_domain])
+                            k=self.domain_K_exprs[i_domain],
+                            qbx_forced_limit=qbx_forced_limit)
 
             result.append(field_result)
 
diff --git a/pytential/symbolic/primitives.py b/pytential/symbolic/primitives.py
index 98cad9f116776d12e6bd4a6b4e796c4307af7bf4..1799e09b491105927f8f35d608603d32c1ef1ecd 100644
--- a/pytential/symbolic/primitives.py
+++ b/pytential/symbolic/primitives.py
@@ -240,6 +240,10 @@ def _deprecate_kwargs(oldkey, newkey):
     return super_wrapper
 
 
+class _NoArgSentinel(object):
+    pass
+
+
 # {{{ dof descriptors
 
 class DEFAULT_SOURCE:  # noqa: N801
@@ -251,22 +255,22 @@ class DEFAULT_TARGET:  # noqa: N801
 
 
 class QBX_SOURCE_STAGE1:   # noqa: N801
-    """Symbolic identifier for the base `stage1` discretization
-    :attr:`pytential.source.LayerPotentialSourceBase.density_discr`.
+    """Symbolic identifier for the Stage 1 discretization of a
+    :class:`pytential.source.QBXLayerPotentialSource`.
     """
     pass
 
 
 class QBX_SOURCE_STAGE2:   # noqa: N801
-    """Symbolic identifier for the `stage2` discretization
-    :attr:`pytential.source.LayerPotentialSourceBase.stage2_density_discr`.
+    """Symbolic identifier for the Stage 2 discretization of a
+    :class:`pytential.source.QBXLayerPotentialSource`.
     """
     pass
 
 
 class QBX_SOURCE_QUAD_STAGE2:   # noqa: N801
-    """Symbolic identifier for the `stage2` discretization
-    :attr:`pytential.source.LayerPotentialSourceBase.quad_stage2_density_discr`.
+    """Symbolic identifier for the upsampled Stage 2 discretization of a
+    :class:`pytential.source.QBXLayerPotentialSource`.
     """
     pass
 
@@ -322,25 +326,25 @@ class DOFDescriptor(object):
         if granularity is None:
             granularity = GRANULARITY_NODE
 
-        if discr_stage is not None:
-            if not (discr_stage == QBX_SOURCE_STAGE1
-                    or discr_stage == QBX_SOURCE_STAGE2
-                    or discr_stage == QBX_SOURCE_QUAD_STAGE2):
-                raise ValueError('unknown discr stage tag: "{}"'.format(discr_stage))
+        if not (discr_stage is None
+                or discr_stage == QBX_SOURCE_STAGE1
+                or discr_stage == QBX_SOURCE_STAGE2
+                or discr_stage == QBX_SOURCE_QUAD_STAGE2):
+            raise ValueError("unknown discr stage tag: '{}'".format(discr_stage))
 
         if not (granularity == GRANULARITY_NODE
                 or granularity == GRANULARITY_CENTER
                 or granularity == GRANULARITY_ELEMENT):
-            raise ValueError('unknown granularity: "{}"'.format(granularity))
+            raise ValueError("unknown granularity: '{}'".format(granularity))
 
         self.geometry = geometry
         self.discr_stage = discr_stage
         self.granularity = granularity
 
-    def copy(self, geometry=None, discr_stage=None, granularity=None):
+    def copy(self, geometry=None, discr_stage=_NoArgSentinel, granularity=None):
         if isinstance(geometry, DOFDescriptor):
             discr_stage = geometry.discr_stage \
-                    if discr_stage is None else discr_stage
+                    if discr_stage is _NoArgSentinel else discr_stage
             geometry = geometry.geometry
 
         return type(self)(
@@ -349,7 +353,16 @@ class DOFDescriptor(object):
                 granularity=(self.granularity
                     if granularity is None else granularity),
                 discr_stage=(self.discr_stage
-                    if discr_stage is None else discr_stage))
+                    if discr_stage is _NoArgSentinel else discr_stage))
+
+    def to_stage1(self):
+        return self.copy(discr_stage=QBX_SOURCE_STAGE1)
+
+    def to_stage2(self):
+        return self.copy(discr_stage=QBX_SOURCE_STAGE2)
+
+    def to_quad_stage2(self):
+        return self.copy(discr_stage=QBX_SOURCE_QUAD_STAGE2)
 
     def __hash__(self):
         return hash((type(self),
@@ -366,9 +379,9 @@ class DOFDescriptor(object):
 
     def __repr__(self):
         discr_stage = self.discr_stage \
-                if self.discr_stage is None else self.discr_stage.__name__,
+                if self.discr_stage is None else self.discr_stage.__name__
         granularity = self.granularity.__name__
-        return '{}(geometry={}, stage={}, granularity={})'.format(
+        return "{}(geometry={}, stage={}, granularity={})".format(
                 type(self).__name__, self.geometry, discr_stage, granularity)
 
     def __str__(self):
@@ -443,7 +456,7 @@ def make_sym_mv(name, num_components):
     return MultiVector(make_sym_vector(name, num_components))
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def make_sym_surface_mv(name, ambient_dim, dim, dofdesc=None):
     par_grad = parametrization_derivative_matrix(ambient_dim, dim, dofdesc)
 
@@ -512,7 +525,7 @@ class DiscretizationProperty(Expression):
 
     init_arg_names = ("dofdesc",)
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, dofdesc=None):
         """
         :arg dofdesc: |dofdesc-blurb|
@@ -522,7 +535,7 @@ class DiscretizationProperty(Expression):
 
     @property
     def where(self):
-        warn('`where` is deprecated. use `dofdesc` instead.',
+        warn("`where` is deprecated. use `dofdesc` instead.",
              DeprecationWarning, stacklevel=2)
         return self.dofdesc
 
@@ -542,7 +555,7 @@ class NodeCoordinateComponent(DiscretizationProperty):
 
     init_arg_names = ("ambient_axis", "dofdesc")
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, ambient_axis, dofdesc=None):
         """
         :arg dofdesc: |dofdesc-blurb|
@@ -556,7 +569,7 @@ class NodeCoordinateComponent(DiscretizationProperty):
     mapper_method = intern("map_node_coordinate_component")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def nodes(ambient_dim, dofdesc=None):
     """Return a :class:`pymbolic.geometric_algebra.MultiVector` of node
     locations.
@@ -576,7 +589,7 @@ class NumReferenceDerivative(DiscretizationProperty):
 
     init_arg_names = ("ref_axes", "operand", "dofdesc")
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __new__(cls, ref_axes=None, operand=None, dofdesc=None):
         # If the constructor is handed a multivector object, return an
         # object array of the operator applied to each of the
@@ -590,7 +603,7 @@ class NumReferenceDerivative(DiscretizationProperty):
         else:
             return DiscretizationProperty.__new__(cls)
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, ref_axes, operand, dofdesc=None):
         """
         :arg ref_axes: a :class:`tuple` of tuples indicating indices of
@@ -627,7 +640,7 @@ class NumReferenceDerivative(DiscretizationProperty):
     mapper_method = intern("map_num_reference_derivative")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def reference_jacobian(func, output_dim, dim, dofdesc=None):
     """Return a :class:`np.array` representing the Jacobian of a vector function
     with respect to the reference coordinates.
@@ -642,7 +655,7 @@ def reference_jacobian(func, output_dim, dim, dofdesc=None):
     return jac
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def parametrization_derivative_matrix(ambient_dim, dim, dofdesc=None):
     """Return a :class:`np.array` representing the derivative of the
     reference-to-global parametrization.
@@ -655,7 +668,7 @@ def parametrization_derivative_matrix(ambient_dim, dim, dofdesc=None):
             "pd_matrix", cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def parametrization_derivative(ambient_dim, dim, dofdesc=None):
     """Return a :class:`pymbolic.geometric_algebra.MultiVector` representing
     the derivative of the reference-to-global parametrization.
@@ -667,7 +680,7 @@ def parametrization_derivative(ambient_dim, dim, dofdesc=None):
     return product(MultiVector(vec) for vec in par_grad.T)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def pseudoscalar(ambient_dim, dim=None, dofdesc=None):
     """
     Same as the outer product of all parametrization derivative columns.
@@ -681,14 +694,14 @@ def pseudoscalar(ambient_dim, dim=None, dofdesc=None):
             "pseudoscalar", cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def area_element(ambient_dim, dim=None, dofdesc=None):
     return cse(
             sqrt(pseudoscalar(ambient_dim, dim, dofdesc).norm_squared()),
             "area_element", cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def sqrt_jac_q_weight(ambient_dim, dim=None, dofdesc=None):
     return cse(
             sqrt(
@@ -697,7 +710,7 @@ def sqrt_jac_q_weight(ambient_dim, dim=None, dofdesc=None):
             "sqrt_jac_q_weight", cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def normal(ambient_dim, dim=None, dofdesc=None):
     """Exterior unit normals."""
 
@@ -714,7 +727,7 @@ def normal(ambient_dim, dim=None, dofdesc=None):
             scope=cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def mean_curvature(ambient_dim, dim=None, dofdesc=None):
     """(Numerical) mean curvature."""
 
@@ -735,13 +748,13 @@ def mean_curvature(ambient_dim, dim=None, dofdesc=None):
         s_op = shape_operator(ambient_dim, dim=dim, dofdesc=dofdesc)
         kappa = -0.5 * sum(s_op[i, i] for i in range(s_op.shape[0]))
     else:
-        raise NotImplementedError('not available in {}D for {}D surfaces'
+        raise NotImplementedError("not available in {}D for {}D surfaces"
                 .format(ambient_dim, dim))
 
     return kappa
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def first_fundamental_form(ambient_dim, dim=None, dofdesc=None):
     if dim is None:
         dim = ambient_dim - 1
@@ -756,7 +769,7 @@ def first_fundamental_form(ambient_dim, dim=None, dofdesc=None):
             "fundform1")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def second_fundamental_form(ambient_dim, dim=None, dofdesc=None):
     """Compute the second fundamental form of a surface. This is in reference
     to the reference-to-global mapping in use for each element.
@@ -796,7 +809,7 @@ def second_fundamental_form(ambient_dim, dim=None, dofdesc=None):
     return result
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def shape_operator(ambient_dim, dim=None, dofdesc=None):
     if dim is None:
         dim = ambient_dim - 1
@@ -819,7 +832,7 @@ def shape_operator(ambient_dim, dim=None, dofdesc=None):
             "shape_operator")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _panel_size(ambient_dim, dim=None, dofdesc=None):
     # A broken quasi-1D approximation of 1D element size. Do not use.
 
@@ -867,7 +880,7 @@ def _small_mat_eigenvalues(mat):
                 "eigenvalue formula for %dx%d matrices" % (m, n))
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _equilateral_parametrization_derivative_matrix(ambient_dim, dim=None,
         dofdesc=None):
     if dim is None:
@@ -886,7 +899,7 @@ def _equilateral_parametrization_derivative_matrix(ambient_dim, dim=None,
             "equilateral_pder_mat")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _simplex_mapping_max_stretch_factor(ambient_dim, dim=None, dofdesc=None,
         with_elementwise_max=True):
     """Return the largest factor by which the reference-to-global
@@ -935,7 +948,7 @@ def _simplex_mapping_max_stretch_factor(ambient_dim, dim=None, dofdesc=None,
     return cse(result, "mapping_max_stretch", cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _max_curvature(ambient_dim, dim=None, dofdesc=None):
     # An attempt at a 'max curvature' criterion.
 
@@ -956,7 +969,7 @@ def _max_curvature(ambient_dim, dim=None, dofdesc=None):
                 "dimensions" % ambient_dim)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _scaled_max_curvature(ambient_dim, dim=None, dofdesc=None):
     """An attempt at a unit-less, scale-invariant quantity that characterizes
     'how much curviness there is on an element'. Values seem to hover around 1
@@ -982,7 +995,7 @@ def _expansion_radii_factor(ambient_dim, dim):
     return 0.5 * dim_fudge_factor
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _quad_resolution(ambient_dim, dim=None, granularity=None, dofdesc=None):
     """This measures the quadrature resolution across the
     mesh. In a 1D uniform mesh of uniform 'parametrization speed', it
@@ -1003,13 +1016,9 @@ def _quad_resolution(ambient_dim, dim=None, granularity=None, dofdesc=None):
     return interp(from_dd, to_dd, stretch)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _source_danger_zone_radii(ambient_dim, dim=None,
         granularity=None, dofdesc=None):
-    dofdesc = as_dofdesc(dofdesc)
-    if dofdesc.discr_stage is None:
-        dofdesc = dofdesc.copy(discr_stage=QBX_SOURCE_STAGE2)
-
     # This should be the expression of the expansion radii, but
     #
     # - in reference to the stage 2 discretization
@@ -1021,31 +1030,27 @@ def _source_danger_zone_radii(ambient_dim, dim=None,
     #   - Setting this equal to half the expansion radius will not provide
     #     a refinement 'buffer layer' at a 2x coarsening fringe.
 
-    factor = 0.75 * _expansion_radii_factor(ambient_dim, dim)
-    return factor * _quad_resolution(ambient_dim, dim=dim,
-            granularity=granularity, dofdesc=dofdesc)
+    return 0.75 * expansion_radii(ambient_dim,
+            dim=dim, granularity=granularity, dofdesc=dofdesc)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def _close_target_tunnel_radii(ambient_dim, dim=None,
         granularity=None, dofdesc=None):
-    factor = 0.5 * _expansion_radii_factor(ambient_dim, dim)
-
-    return factor * _quad_resolution(ambient_dim, dim=dim,
-            granularity=granularity, dofdesc=dofdesc)
+    return 0.5 * expansion_radii(ambient_dim,
+            dim=dim, granularity=granularity, dofdesc=dofdesc)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def expansion_radii(ambient_dim, dim=None, granularity=None, dofdesc=None):
     factor = _expansion_radii_factor(ambient_dim, dim)
-
     return cse(factor * _quad_resolution(ambient_dim, dim=dim,
         granularity=granularity, dofdesc=dofdesc),
         "expansion_radii",
         cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def expansion_centers(ambient_dim, side, dim=None, dofdesc=None):
     x = nodes(ambient_dim, dofdesc=dofdesc)
     normals = normal(ambient_dim, dim=dim, dofdesc=dofdesc)
@@ -1058,7 +1063,19 @@ def expansion_centers(ambient_dim, side, dim=None, dofdesc=None):
             cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
+def interleaved_expansion_centers(ambient_dim, dim=None, dofdesc=None):
+    centers = [
+            expansion_centers(ambient_dim, -1, dim=dim, dofdesc=dofdesc),
+            expansion_centers(ambient_dim, +1, dim=dim, dofdesc=dofdesc)
+            ]
+
+    source = as_dofdesc(dofdesc)
+    target = source.copy(granularity=GRANULARITY_CENTER)
+    return interp(source, target, centers)
+
+
+@_deprecate_kwargs("where", "dofdesc")
 def h_max(ambient_dim, dim=None, dofdesc=None):
     """Defines a maximum element size in the discretization."""
 
@@ -1070,7 +1087,7 @@ def h_max(ambient_dim, dim=None, dofdesc=None):
             cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def weights_and_area_elements(ambient_dim, dim=None, dofdesc=None):
     """Combines :func:`area_element` and :class:`QWeight`."""
 
@@ -1156,7 +1173,7 @@ class NodeMax(SingleScalarOperandExpression):
     mapper_method = "map_node_max"
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def integral(ambient_dim, dim, operand, dofdesc=None):
     """A volume integral of *operand*."""
 
@@ -1170,7 +1187,7 @@ class SingleScalarOperandExpressionWithWhere(Expression):
 
     init_arg_names = ("operand", "dofdesc")
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __new__(cls, operand=None, dofdesc=None):
         # If the constructor is handed a multivector object, return an
         # object array of the operator applied to each of the
@@ -1184,14 +1201,14 @@ class SingleScalarOperandExpressionWithWhere(Expression):
         else:
             return Expression.__new__(cls)
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, operand, dofdesc=None):
         self.operand = operand
         self.dofdesc = as_dofdesc(dofdesc)
 
     @property
     def where(self):
-        warn('`where` is deprecated. use `dofdesc` instead.',
+        warn("`where` is deprecated. use `dofdesc` instead.",
              DeprecationWarning, stacklevel=2)
         return self.dofdesc
 
@@ -1230,13 +1247,13 @@ class Ones(Expression):
 
     init_arg_names = ("dofdesc",)
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, dofdesc=None):
         self.dofdesc = as_dofdesc(dofdesc)
 
     @property
     def where(self):
-        warn('`where` is deprecated. use `dofdesc` instead.',
+        warn("`where` is deprecated. use `dofdesc` instead.",
              DeprecationWarning, stacklevel=2)
         return self.dofdesc
 
@@ -1246,20 +1263,20 @@ class Ones(Expression):
     mapper_method = intern("map_ones")
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def ones_vec(dim, dofdesc=None):
     from pytools.obj_array import make_obj_array
     return MultiVector(
                 make_obj_array(dim*[Ones(dofdesc)]))
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def area(ambient_dim, dim, dofdesc=None):
     return cse(integral(ambient_dim, dim, Ones(dofdesc), dofdesc), "area",
             cse_scope.DISCRETIZATION)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def mean(ambient_dim, dim, operand, dofdesc=None):
     return (
             integral(ambient_dim, dim, operand, dofdesc)
@@ -1270,7 +1287,7 @@ class IterativeInverse(Expression):
 
     init_arg_names = ("expression", "rhs", "variable_name", "extra_vars", "dofdesc")
 
-    @_deprecate_kwargs('where', 'dofdesc')
+    @_deprecate_kwargs("where", "dofdesc")
     def __init__(self, expression, rhs, variable_name, extra_vars={},
             dofdesc=None):
         self.expression = expression
@@ -1281,7 +1298,7 @@ class IterativeInverse(Expression):
 
     @property
     def where(self):
-        warn('`where` is deprecated. use `dofdesc` instead.',
+        warn("`where` is deprecated. use `dofdesc` instead.",
              DeprecationWarning, stacklevel=2)
         return self.dofdesc
 
@@ -1376,10 +1393,6 @@ def hashable_kernel_args(kernel_arguments):
     return tuple(hashable_args)
 
 
-class _NoArgSentinel(object):
-    pass
-
-
 class IntG(Expression):
     r"""
     .. math::
@@ -1649,7 +1662,7 @@ def S(kernel, density,
             kernel_arguments, **kwargs)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def tangential_derivative(ambient_dim, operand, dim=None, dofdesc=None):
     pder = (
             pseudoscalar(ambient_dim, dim, dofdesc)
@@ -1661,7 +1674,7 @@ def tangential_derivative(ambient_dim, operand, dim=None, dofdesc=None):
             (d.dnabla(ambient_dim) * d(operand)) >> pder)
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def normal_derivative(ambient_dim, operand, dim=None, dofdesc=None):
     d = Derivative()
     return d.resolve(
@@ -1769,7 +1782,7 @@ def Dp(kernel, *args, **kwargs):
 
 # {{{ conventional vector calculus
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def tangential_onb(ambient_dim, dim=None, dofdesc=None):
     """Return a matrix of shape ``(ambient_dim, dim)`` with orthogonal columns
     spanning the tangential space of the surface of *dofdesc*.
@@ -1797,7 +1810,7 @@ def tangential_onb(ambient_dim, dim=None, dofdesc=None):
     return orth_pd_mat
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def xyz_to_tangential(xyz_vec, dofdesc=None):
     ambient_dim = len(xyz_vec)
     tonb = tangential_onb(ambient_dim, dofdesc=dofdesc)
@@ -1807,7 +1820,7 @@ def xyz_to_tangential(xyz_vec, dofdesc=None):
         ])
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def tangential_to_xyz(tangential_vec, dofdesc=None):
     ambient_dim = len(tangential_vec) + 1
     tonb = tangential_onb(ambient_dim, dofdesc=dofdesc)
@@ -1816,13 +1829,13 @@ def tangential_to_xyz(tangential_vec, dofdesc=None):
         for i in range(ambient_dim - 1))
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def project_to_tangential(xyz_vec, dofdesc=None):
     return tangential_to_xyz(
             cse(xyz_to_tangential(xyz_vec, dofdesc), dofdesc))
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def n_dot(vec, dofdesc=None):
     nrm = normal(len(vec), dofdesc).as_vector()
 
@@ -1841,7 +1854,7 @@ def cross(vec_a, vec_b):
         for i in range(3)])
 
 
-@_deprecate_kwargs('where', 'dofdesc')
+@_deprecate_kwargs("where", "dofdesc")
 def n_cross(vec, dofdesc=None):
     return cross(normal(3, dofdesc).as_vector(), vec)
 
diff --git a/pytential/unregularized.py b/pytential/unregularized.py
index 4e8e3098e79e7211c2727be5472e2ed9c0d5bc7f..6f0125cd56d6af95bf9576fea3f24e08d114c65f 100644
--- a/pytential/unregularized.py
+++ b/pytential/unregularized.py
@@ -87,15 +87,6 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
             expansion_factory = DefaultExpansionFactory()
         self.expansion_factory = expansion_factory
 
-    @memoize_method
-    def weights_and_area_elements(self):
-        from pytential import bind, sym
-        with cl.CommandQueue(self.cl_context) as queue:
-            waa = bind(self,
-                    sym.weights_and_area_elements(self.ambient_dim))(queue)
-
-            return waa.with_queue(None)
-
     def copy(
             self,
             density_discr=None,
@@ -153,14 +144,17 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
         for arg_name, arg_expr in six.iteritems(insn.kernel_arguments):
             kernel_args[arg_name] = evaluate(arg_expr)
 
-        strengths = (evaluate(insn.density).with_queue(queue)
-                * self.weights_and_area_elements())
+        from pytential import bind, sym
+        waa = bind(bound_expr.places, sym.weights_and_area_elements(
+            self.ambient_dim, dofdesc=insn.source))(queue)
+        strengths = waa * evaluate(insn.density).with_queue(queue)
 
         result = []
         p2p = None
 
         for o in insn.outputs:
-            target_discr = bound_expr.get_discretization(o.target_name)
+            target_discr = bound_expr.places.get_discretization(
+                    o.target_name.geometry, o.target_name.discr_stage)
 
             if p2p is None:
                 p2p = self.get_p2p(insn.kernels)
@@ -221,7 +215,7 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
                 continue
 
             target_name_to_index[o.target_name] = len(targets)
-            targets.append(bound_expr.places.get_geometry(o.target_name))
+            targets.append(bound_expr.places.get_geometry(o.target_name.geometry))
 
         targets = tuple(targets)
 
@@ -231,8 +225,10 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
 
         geo_data = self.fmm_geometry_data(targets)
 
-        strengths = (evaluate(insn.density).with_queue(queue)
-                * self.weights_and_area_elements())
+        from pytential import bind, sym
+        waa = bind(bound_expr.places, sym.weights_and_area_elements(
+            self.ambient_dim, dofdesc=insn.source))(queue)
+        strengths = waa * evaluate(insn.density).with_queue(queue)
 
         out_kernels = tuple(knl for knl in insn.kernels)
         fmm_kernel = self.get_fmm_kernel(out_kernels)
diff --git a/requirements.txt b/requirements.txt
index 625deb28d7e5a04253bc3ffb5c12b2ba263362b5..164afcc3fbdefc16941d0361abe4497f4d217907 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,6 @@ git+https://github.com/inducer/pyopencl
 git+https://github.com/inducer/islpy
 git+https://github.com/inducer/loopy
 git+https://gitlab.tiker.net/inducer/boxtree
-git+https://github.com/inducer/meshmode
+git+https://gitlab.tiker.net/inducer/meshmode
 git+https://gitlab.tiker.net/inducer/sumpy
 git+https://gitlab.tiker.net/inducer/pyfmmlib
diff --git a/test/extra_curve_data.py b/test/extra_curve_data.py
index 4d2dacca6bbae937119f59757cdd738b4f0f95e3..a6968526b03eec90fc59b23c49acc4b334b8d217 100644
--- a/test/extra_curve_data.py
+++ b/test/extra_curve_data.py
@@ -38,6 +38,9 @@ class Curve(object):
     def __add__(self, other):
         return CompositeCurve(self, other)
 
+    def __call__(self, ts):
+        raise NotImplementedError
+
 
 class CompositeCurve(Curve):
     """
diff --git a/test/extra_geometry_tools.py b/test/extra_geometry_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f215bb2222d7152d92c3790f4e7e90dfc0c8ed9
--- /dev/null
+++ b/test/extra_geometry_tools.py
@@ -0,0 +1,561 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2014 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+from functools import partial
+
+import numpy as np
+
+from pytential import sym
+from pytools.obj_array import make_obj_array
+
+from meshmode.mesh.generation import make_curve_mesh, ellipse
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+
+
+# {{{ helpers
+
+def refine_mesh_uniformly(mesh, iterations, with_adjacency=False):
+    if with_adjacency:
+        from meshmode.mesh.refinement import refine_uniformly
+        mesh = refine_uniformly(mesh, iterations)
+    else:
+        # TODO: add this to meshmode
+        from meshmode.mesh.refinement import RefinerWithoutAdjacency
+        refiner = RefinerWithoutAdjacency(mesh)
+        for _ in range(iterations):
+            refiner.refine_uniformly()
+        mesh = refiner.get_current_mesh()
+
+    return mesh
+
+
+def make_circular_point_group(ambient_dim, npoints, radius,
+        center=np.array([0., 0.]), func=lambda x: x):
+    t = func(np.linspace(0, 1, npoints, endpoint=False)) * (2 * np.pi)
+    center = np.asarray(center)
+    result = np.zeros((ambient_dim, npoints))
+    result[:2, :] = center[:, np.newaxis] + radius*np.vstack((np.cos(t), np.sin(t)))
+    return result
+
+# }}}
+
+
+# {{{ operators
+
+class IntEqOperatorBase(object):
+    nvariables = -1
+
+    def __init__(self, kernel, sign, kernel_arguments):
+        self.sign = sign
+        self.kernel = kernel
+        self.kernel_arguments = kernel_arguments
+
+    def get_density_var(self, name):
+        if self.nvariables == 1:
+            return sym.var(name)
+        return sym.make_sym_vector(name, self.nvariables)
+
+    def _lpot(self, op, density, qbx_forced_limit):
+        return op(self.kernel, density,
+                kernel_arguments=self.kernel_arguments,
+                qbx_forced_limit=qbx_forced_limit)
+
+    def _S(self, density, qbx_forced_limit):
+        return self._lpot(sym.S, density, qbx_forced_limit)
+
+    def _Sp(self, density, qbx_forced_limit):
+        return self._lpot(sym.Sp, density, qbx_forced_limit)
+
+    def _D(self, density, qbx_forced_limit):
+        return self._lpot(sym.D, density, qbx_forced_limit)
+
+    def representation(self, density, qbx_forced_limit=None):
+        raise NotImplementedError
+
+    def operator(self, density):
+        raise NotImplementedError
+
+
+class ScalarSingleLayerOperator(IntEqOperatorBase):
+    nvariables = 1
+
+    def representation(self, density, qbx_forced_limit=None):
+        return self._S(density, qbx_forced_limit)
+
+    def operator(self, density):
+        return self._S(density, +1)
+
+
+class ScalarDoubleLayerOperator(IntEqOperatorBase):
+    nvariables = 1
+
+    def representation(self, density, qbx_forced_limit=None):
+        return self._D(density, qbx_forced_limit)
+
+    def operator(self, density):
+        return 0.5 * self.sign * density + self._D(density, "avg")
+
+
+class ScalarMixedOperator(IntEqOperatorBase):
+    nvariables = 1
+    alpha = 0.3
+    beta = 0.5
+
+    def representation(self, density, qbx_forced_limit=None):
+        return (self._S(self.alpha * density, qbx_forced_limit)
+                + self._D(self.beta * density, qbx_forced_limit))
+
+    def operator(self, density):
+        return (0.5 * self.beta * self.sign * density
+                + self._S(self.alpha * density, +1)
+                + self._D(self.beta * density, "avg"))
+
+
+class VectorMixedOperator(IntEqOperatorBase):
+    nvariables = 2
+    alpha = 0.4
+    beta = 0.3
+
+    def representation(self, density, qbx_forced_limit=None):
+        u0 = density[0]
+        u1 = density[1]
+        return make_obj_array([
+            self._Sp(u0, qbx_forced_limit)
+            + self._D(u1, qbx_forced_limit),
+            self._S(self.alpha * u0, qbx_forced_limit)
+            + self._D(self.beta * u0, qbx_forced_limit)
+            ])
+
+    def operator(self, density):
+        u0 = density[0]
+        u1 = density[1]
+        return make_obj_array([
+            -0.5 * self.sign * u0 + self._Sp(u0, "avg")
+            + 0.5 * self.sign * u1 + self._D(u1, "avg"),
+            self._S(self.alpha * u0, +1)
+            + 0.5 * self.sign * u0 + self._D(self.beta * u0, "avg")
+            ])
+
+# }}}
+
+
+# {{{ base classes
+
+class TestCaseBase(object):
+    fmm_backend = "sumpy"
+    gmres_tol = 1.0e-14
+
+    def __init__(self, knl_class, knl_kwargs=None, **kwargs):
+        """Store the kernel class and expose all keyword values as attributes."""
+        self.knl_class = knl_class
+        # a fresh dict per instance; a `{}` default would be shared by all cases
+        self.knl_kwargs = {} if knl_kwargs is None else knl_kwargs
+        self.knl_kwargs_syms = dict((k, sym.var(k)) for k in self.knl_kwargs)
+
+        for k, v in six.iteritems(self.knl_kwargs):
+            setattr(self, k, v)
+        for k, v in six.iteritems(kwargs):
+            setattr(self, k, v)
+
+    def __str__(self):
+        def is_valid(cls, k):
+            v = getattr(cls, k)
+            return k[0] != "_" \
+                    and (not callable(v) or isinstance(v, type)) \
+                    and not isinstance(v, property)
+
+        instance_attrs = {
+                k: getattr(self, k) for k in dir(self) if is_valid(self, k)
+                }
+        width = len(max(list(instance_attrs.keys()), key=len))
+        fmt = "%%%ds : %%s" % width
+        header_attrs = {
+                "class": type(self).__name__,
+                "name": instance_attrs.pop("name"),
+                "-" * width: "-" * width
+                }
+        return "\n".join([
+            "\t%s" % "\n\t".join(fmt % (k, v) for k, v in header_attrs.items()),
+            "\t%s" % "\n\t".join(fmt % (k, v) for k, v in instance_attrs.items()),
+        ])
+
+    @property
+    def ambient_dim(self):
+        raise NotImplementedError
+
+    @property
+    def name(self):
+        raise NotImplementedError
+
+    @property
+    def qbx_order(self):
+        raise NotImplementedError
+
+    @property
+    def target_order(self):
+        raise NotImplementedError
+
+
+class IntEqTestCase(TestCaseBase):
+    source_ovsmp = 4
+    use_refinement = True
+    fmm_backend = None
+
+    inner_radius = None
+    outer_radius = None
+
+    fmm_tol = None
+    fmm_order = None
+
+    # used in `test_scalar_int_eq`
+    check_gradient = False
+    check_tangential_deriv = False
+
+    def __init__(self, knl_class_or_helmholtz_k=0,
+            op_type="dirichlet", side=+1, knl_kwargs=None, **kwargs):
+        """Pick the kernel: a class as is, 0 for Laplace, else Helmholtz *k*."""
+        knl_kwargs = {} if knl_kwargs is None else knl_kwargs
+        if not isinstance(knl_class_or_helmholtz_k, type):
+            if knl_class_or_helmholtz_k == 0:
+                knl_class = LaplaceKernel
+            else:
+                knl_kwargs = {"k": knl_class_or_helmholtz_k}
+                knl_class = HelmholtzKernel
+        else:
+            knl_class = knl_class_or_helmholtz_k
+
+        self.op_type = op_type.lower()
+        self.side = side
+        super(IntEqTestCase, self).__init__(
+                knl_class, knl_kwargs=knl_kwargs, **kwargs)
+
+    def get_mesh(self, resolution, mesh_order):
+        raise NotImplementedError
+
+    def get_operator(self, ambient_dim):
+        knl = self.knl_class(ambient_dim)
+
+        if knl.is_complex_valued:
+            self.dtype = np.complex128
+        else:
+            self.dtype = np.float64
+
+        if self.side in [+1, "scat"]:
+            sign = +1
+        else:
+            sign = -1
+
+        from pytential.symbolic.pde.scalar import (
+                DirichletOperator,
+                NeumannOperator,
+                BiharmonicClampedPlateOperator,
+                )
+
+        if self.op_type == "single":
+            op = ScalarSingleLayerOperator(knl, sign,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "double":
+            op = ScalarDoubleLayerOperator(knl, sign,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "scalar_mixed":
+            op = ScalarMixedOperator(knl, sign,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "vector_mixed":
+            op = VectorMixedOperator(knl, sign,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "dirichlet":
+            op = DirichletOperator(knl, sign,
+                    use_l2_weighting=True,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "neumann":
+            op = NeumannOperator(knl, sign,
+                    use_l2_weighting=True,
+                    use_improved_operator=False,
+                    kernel_arguments=self.knl_kwargs_syms)
+        elif self.op_type == "clamped_plate":
+            op = BiharmonicClampedPlateOperator(knl, sign)
+        else:
+            raise ValueError("unknown operator type `{}`".format(self.op_type))
+
+        return op
+
+    def get_test_sources_and_targets(self, ctx, ambient_dim,
+            nsources=10, ntargets=20):
+        if self.side == -1:
+            test_src_radius = self.outer_radius
+            test_tgt_radius = self.inner_radius
+        elif self.side == +1:
+            test_src_radius = self.inner_radius
+            test_tgt_radius = self.outer_radius
+        elif self.side == "scat":
+            test_src_radius = self.outer_radius
+            test_tgt_radius = self.outer_radius
+        else:
+            raise ValueError("unknown side `{}`".format(self.side))
+
+        from pytential.source import PointPotentialSource
+        point_sources = make_circular_point_group(
+                ambient_dim, nsources, test_src_radius,
+                func=lambda x: x**1.5)
+        point_source = PointPotentialSource(ctx, point_sources)
+
+        from pytential.target import PointsTarget
+        test_targets = make_circular_point_group(
+                ambient_dim, ntargets, test_tgt_radius)
+        point_target = PointsTarget(test_targets)
+
+        return point_source, point_target
+
+    def get_layer_potential(self, ctx, resolution, mesh_order):
+        """Return a QBX layer potential source on the :meth:`get_mesh` mesh."""
+        from meshmode.discretization import Discretization
+        from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory as GroupFactory
+
+        mesh = self.get_mesh(resolution, mesh_order)
+        pre_density_discr = Discretization(
+                ctx, mesh, GroupFactory(self.target_order))
+
+        from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
+        kwargs = {}
+        # `fmm_tol` / `fmm_order` default to None on the class, so check their
+        # values rather than their presence: tolerance first, then fixed order.
+        if self.fmm_backend is None:
+            kwargs["fmm_order"] = False
+        elif getattr(self, "fmm_tol", None) is not None:
+            kwargs["fmm_order"] = SimpleExpansionOrderFinder(self.fmm_tol)
+        elif getattr(self, "fmm_order", None) is not None:
+            kwargs["fmm_order"] = self.fmm_order
+        else:
+            kwargs["fmm_order"] = self.qbx_order + 5
+
+        from pytential.qbx import QBXLayerPotentialSource
+        return QBXLayerPotentialSource(
+                pre_density_discr,
+                fine_order=self.source_ovsmp * self.target_order,
+                qbx_order=self.qbx_order,
+
+                _disable_refinement=not self.use_refinement,
+                _box_extent_norm=getattr(self, "box_extent_norm", None),
+                _from_sep_smaller_crit=getattr(self, "from_sep_smaller_crit", None),
+                fmm_backend=self.fmm_backend,
+                **kwargs)
+
+# }}}
+
+
+# {{{ 2D
+
+class CurveIntEqTestCase(IntEqTestCase):
+    name = "curve2d"
+    ambient_dim = 2
+
+    qbx_order = 5
+    target_order = 5
+    # resolutions for convergence study
+    resolutions = [40, 50, 60]
+
+    # default to a circle (an ellipse with aspect ratio 1)
+    curve_fn = partial(ellipse, 1.0)
+
+    # smallest and largest radius that can fit the geometry
+    inner_radius = 0.5
+    outer_radius = 1.5
+
+    # used in `test_scalar_int_eq`
+    check_tangential_deriv = True
+
+    def get_mesh(self, nelements, mesh_order):
+        return make_curve_mesh(
+                self.curve_fn,
+                np.linspace(0.0, 1.0, nelements + 1),
+                mesh_order)
+
+
+class StarfishIntEqTestCase(CurveIntEqTestCase):
+    name = "starfish"
+    n_arms = 5
+    amplitude = 0.25
+
+    @property
+    def curve_fn(self):
+        from meshmode.mesh.generation import NArmedStarfish
+        return NArmedStarfish(self.n_arms, self.amplitude)
+
+# }}}
+
+
+# {{{ 3D
+
+class FMMLIBIntEqTestCase(IntEqTestCase):
+    ambient_dim = 3
+    gmres_tol = 1e-7
+
+    fmm_backend = "fmmlib"
+    use_refinement = False
+
+    @property
+    def target_order(self):
+        return self.qbx_order
+
+
+class EllipsoidIntEqTestCase(FMMLIBIntEqTestCase):
+    name = "ellipsoid"
+    qbx_order = 4
+
+    resolutions = [2, 0.8]
+    fmm_order = 13
+
+    # smallest and largest radius that can fit the geometry
+    inner_radius = 0.4
+    outer_radius = 5
+
+    # used in `test_scalar_int_eq`
+    check_gradient = True
+
+    def get_mesh(self, resolution, mesh_order):
+        from meshmode.mesh.io import generate_gmsh, FileSource
+        mesh = generate_gmsh(
+                FileSource("ellipsoid.step"), 2, order=mesh_order,
+                other_options=[
+                    "-string",
+                    "Mesh.CharacteristicLengthMax = %g;" % resolution])
+
+        # flip elements -- gmsh generates inside-out geometry
+        from meshmode.mesh.processing import perform_flips
+        return perform_flips(mesh, np.ones(mesh.nelements))
+
+
+class SphereIntEqTestCase(IntEqTestCase):
+    name = "sphere"
+    ambient_dim = 3
+
+    qbx_order = 4
+    target_order = 8
+
+    resolutions = [1, 2]
+
+    fmm_backend = "fmmlib"
+    fmm_tol = 1.0e-4
+    gmres_tol = 1.0e-7
+
+    use_refinement = False
+
+    # smallest and largest radius that can fit the geometry
+    inner_radius = 0.4
+    outer_radius = 5
+
+    def get_mesh(self, iterations, mesh_order):
+        from meshmode.mesh.generation import generate_icosphere
+        mesh = generate_icosphere(1, mesh_order)
+
+        return refine_mesh_uniformly(mesh, iterations)
+
+
+class TorusIntEqTestCase(IntEqTestCase):
+    name = "torus"
+    ambient_dim = 3
+
+    qbx_order = 4
+    target_order = 8
+
+    r_major = 1.0
+    r_minor = 0.5
+
+    def get_mesh(self, iterations, mesh_order):
+        from meshmode.mesh.generation import generate_torus
+        mesh = generate_torus(self.r_major, self.r_minor, order=mesh_order)
+
+        return refine_mesh_uniformly(mesh, iterations)
+
+
+class MergedCubesIntEqTestCase(FMMLIBIntEqTestCase):
+    name = "merged-cubes"
+    ambient_dim = 3
+
+    qbx_order = 4
+    resolutions = [1.4]
+
+    inner_radius = 0.4
+    outer_radius = 12
+
+    def get_mesh(self, resolution, mesh_order):
+        from meshmode.mesh.io import generate_gmsh, FileSource
+        mesh = generate_gmsh(
+                FileSource("merged-cubes.step"), 2, order=mesh_order,
+                other_options=[
+                    "-string",
+                    "Mesh.CharacteristicLengthMax = %g;" % resolution])
+
+        # flip elements -- gmsh generates inside-out geometry
+        from meshmode.mesh.processing import perform_flips
+        return perform_flips(mesh, np.ones(mesh.nelements))
+
+
+class ManyEllipsoidIntEqTestCase(FMMLIBIntEqTestCase):
+    name = "ellipsoid"
+    qbx_order = 4
+
+    resolutions = [2, 1]
+
+    # this should sit in the area just outside the middle ellipsoid
+    inner_radius = 0.4
+    outer_radius = 5
+
+    nx = 2
+    ny = 2
+    nz = 2
+
+    def get_mesh(self, resolution, mesh_order):
+        """Merge an nx*ny*nz lattice of affinely mapped ellipsoid meshes."""
+        from meshmode.mesh.io import generate_gmsh, FileSource
+        base_mesh = generate_gmsh(
+                FileSource("ellipsoid.step"), 2, order=mesh_order,
+                other_options=[
+                    "-string",
+                    "Mesh.CharacteristicLengthMax = %g;" % resolution])
+
+        # flip elements -- gmsh generates inside-out geometry.
+        from meshmode.mesh.processing import perform_flips
+        base_mesh = perform_flips(base_mesh, np.ones(base_mesh.nelements))
+
+        from meshmode.mesh.processing import affine_map, merge_disjoint_meshes
+        from meshmode.mesh.tools import rand_rotation_matrix
+        pitch = 10  # lattice spacing between neighboring copies
+        meshes = [
+                affine_map(
+                    base_mesh,
+                    A=rand_rotation_matrix(3),
+                    b=pitch*np.array([
+                        (ix - self.nx//2),
+                        (iy - self.ny//2),
+                        (iz - self.nz//2)]))
+                for ix in range(self.nx)
+                for iy in range(self.ny)
+                for iz in range(self.nz)
+                ]
+        return merge_disjoint_meshes(meshes, single_group=True)
+
+
+# }}}
diff --git a/test/extra_matrix_tools.py b/test/extra_matrix_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6bab8d2f85ad78f27364374ff6d30694b13e29b
--- /dev/null
+++ b/test/extra_matrix_tools.py
@@ -0,0 +1,158 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = """
+Copyright (C) 2018-2020 Alexandru Fikl
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+
+from pytential import sym
+from pytential.symbolic.matrix import (
+        FarFieldBlockBuilder, NearFieldBlockBuilder,
+        P2PMatrixBuilder, MatrixBuilder)
+
+from sumpy.tools import BlockIndexRanges
+from meshmode.mesh.generation import NArmedStarfish
+
+import extra_geometry_tools as eq
+
+
+# {{{ helpers
+
+def block_max_error(mat, blk, index_set, visualize=False):
+    """Return the max over blocks of ``|mat_i - blk_i| / |mat_i|``."""
+    error = -np.inf
+    for i in range(index_set.nblocks):
+        mat_i = index_set.take(mat, i)
+        blk_i = index_set.block_take(blk, i)
+        error_i = la.norm(mat_i - blk_i) / la.norm(mat_i)
+        if visualize:  # report this block's own error, not the running max
+            print('block[{:04}]: {:.5e}'.format(i, error_i))
+        error = max(error, error_i)
+    return error
+
+
+def build_block_index(queue, discr,
+        max_particles_in_box=None, nblocks=10, factor=1.0):
+    """Partition *discr* into blocks, optionally subsampling each by *factor*."""
+    if max_particles_in_box is None:
+        max_particles_in_box = discr.nnodes // nblocks
+
+    # create index ranges
+    from pytential.linalg.hss import partition_by_nodes
+    indices, partition = partition_by_nodes(queue, discr,
+            max_particles_in_box=max_particles_in_box)
+
+    if abs(factor - 1.0) < 1.0e-14:  # nothing to subsample
+        return indices, partition
+
+    # randomly pick a subset of points
+    indices = indices.get(queue)
+    # builtin `object`: the `np.object` alias was removed in NumPy 1.24
+    indices_ = np.empty(indices.nblocks, dtype=object)
+    for i in range(indices.nblocks):
+        iidx = indices.block_indices(i)
+        isize = int(factor * len(iidx))
+        isize = max(1, min(isize, len(iidx)))
+
+        indices_[i] = np.sort(
+                np.random.choice(iidx, size=isize, replace=False))
+
+    ranges_ = cl.array.to_device(queue,
+            np.cumsum([0] + [r.shape[0] for r in indices_]))
+    indices_ = cl.array.to_device(queue, np.hstack(indices_))
+
+    indices = BlockIndexRanges(discr.cl_context,
+                               indices_.with_queue(None),
+                               ranges_.with_queue(None))
+
+    return indices, partition
+
+# }}}
+
+
+# {{{ tests
+
+class MatrixTestCaseMixin(object):
+    """Mixin holding matrix-construction settings for test cases."""
+    proxy_radius_factor = None
+    max_particles_in_box = None
+    # tolerance for the "id" (presumably interpolative decomposition)
+    id_eps = 1.0e-8
+    # kind of matrix; the builder properties below branch on "qbx"
+    matrix_type = "qbx"
+    # whether farfield proxy interactions get weights
+    weighted_farfield = None
+
+    # fraction of the points that is randomly kept (1.0 keeps all)
+    partition_factor = 1.0
+    # discretization stage the compression is performed on
+    discr_stage = sym.QBX_SOURCE_STAGE2
+
+    @property
+    def nblocks(self):
+        raise NotImplementedError
+
+    @property
+    def farfield_block_builder(self):
+        return FarFieldBlockBuilder
+
+    @property
+    def nearfield_block_builder(self):
+        is_qbx = self.matrix_type == "qbx"
+        return NearFieldBlockBuilder if is_qbx else FarFieldBlockBuilder
+
+    @property
+    def dense_matrix_builder(self):
+        is_qbx = self.matrix_type == "qbx"
+        return MatrixBuilder if is_qbx else P2PMatrixBuilder
+
+
+class CurveHSSTestCase(MatrixTestCaseMixin, eq.CurveIntEqTestCase):
+    # settings for the "curve_hss" case on an NArmedStarfish(5, 0.25) curve
+    name = "curve_hss"
+    curve_fn = NArmedStarfish(5, 0.25)
+    resolutions = [32, 48, 64]
+
+    qbx_order = 4
+    target_order = 4
+    proxy_radius_factor = 1.1
+    # plain attribute that shadows the mixin's `nblocks` property
+    nblocks = 8
+
+
+class TorusHSSTestCase(MatrixTestCaseMixin, eq.TorusIntEqTestCase):
+    # settings for the "torus_hss" case
+    name = "torus_hss"
+    resolutions = [0, 1, 2]
+    qbx_order = 4
+    target_order = 2
+    proxy_radius_factor = 1.1
+    # plain attribute that shadows the mixin's `nblocks` property
+    nblocks = 8
+
+# }}}
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index 55bf804f95cd8def1972297691e9d92ce0e1d645..2ebac61c3113b962172ce2e27825c64059402ea1 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -36,6 +36,8 @@ from pytools import one
 from sumpy.kernel import LaplaceKernel, HelmholtzKernel
 
 from pytential import bind, sym, norm  # noqa
+from pytential import GeometryCollection
+
 from pytential.qbx.cost import CostModel
 
 
@@ -86,14 +88,11 @@ def get_lpot_source(queue, dim):
             pre_density_discr, OVSMP_FACTOR*target_order,
             **lpot_kwargs)
 
-    lpot_source, _ = lpot_source.with_refinement()
-
     return lpot_source
 
 
-def get_density(queue, lpot_source):
-    density_discr = lpot_source.density_discr
-    nodes = density_discr.nodes().with_queue(queue)
+def get_density(queue, discr):
+    nodes = discr.nodes().with_queue(queue)
     return cl.clmath.sin(10 * nodes[0])
 
 # }}}
@@ -111,13 +110,17 @@ def test_timing_data_gathering(ctx_factory):
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     lpot_source = get_lpot_source(queue, 2)
-    sigma = get_density(queue, lpot_source)
+    places = GeometryCollection(lpot_source)
+
+    dofdesc = places.auto_source.to_stage1()
+    density_discr = places.get_discretization(dofdesc.geometry)
+    sigma = get_density(queue, density_discr)
 
     sigma_sym = sym.var("sigma")
     k_sym = LaplaceKernel(lpot_source.ambient_dim)
     sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
 
-    op_S = bind(lpot_source, sym_op_S)
+    op_S = bind(places, sym_op_S)
 
     timing_data = {}
     op_S.eval(queue, dict(sigma=sigma), timing_data=timing_data)
@@ -138,26 +141,26 @@ def test_cost_model(ctx_factory, dim, use_target_specific_qbx):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
-    lpot_source = (
-            get_lpot_source(queue, dim)
-            .copy(
-                _use_target_specific_qbx=use_target_specific_qbx,
-                cost_model=CostModel()))
+    lpot_source = get_lpot_source(queue, dim).copy(
+            _use_target_specific_qbx=use_target_specific_qbx,
+            cost_model=CostModel())
+    places = GeometryCollection(lpot_source)
 
-    sigma = get_density(queue, lpot_source)
+    density_discr = places.get_discretization(places.auto_source.geometry)
+    sigma = get_density(queue, density_discr)
 
     sigma_sym = sym.var("sigma")
     k_sym = LaplaceKernel(lpot_source.ambient_dim)
 
     sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
-    op_S = bind(lpot_source, sym_op_S)
+    op_S = bind(places, sym_op_S)
     cost_S = op_S.get_modeled_cost(queue, sigma=sigma)
     assert len(cost_S) == 1
 
     sym_op_S_plus_D = (
             sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
             + sym.D(k_sym, sigma_sym, qbx_forced_limit="avg"))
-    op_S_plus_D = bind(lpot_source, sym_op_S_plus_D)
+    op_S_plus_D = bind(places, sym_op_S_plus_D)
     cost_S_plus_D = op_S_plus_D.get_modeled_cost(queue, sigma=sigma)
     assert len(cost_S_plus_D) == 2
 
@@ -177,20 +180,24 @@ def test_cost_model_metadata_gathering(ctx_factory):
 
     lpot_source = get_lpot_source(queue, 2).copy(
             fmm_level_to_order=fmm_level_to_order)
+    places = GeometryCollection(lpot_source)
 
-    sigma = get_density(queue, lpot_source)
+    density_discr = places.get_discretization(places.auto_source.geometry)
+    sigma = get_density(queue, density_discr)
 
     sigma_sym = sym.var("sigma")
     k_sym = HelmholtzKernel(2, "k")
     k = 2
 
     sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1, k=sym.var("k"))
-    op_S = bind(lpot_source, sym_op_S)
+    op_S = bind(places, sym_op_S)
 
     cost_S = one(op_S.get_modeled_cost(queue, sigma=sigma, k=k).values())
 
     geo_data = lpot_source.qbx_fmm_geometry_data(
-            target_discrs_and_qbx_sides=((lpot_source.density_discr, 1),))
+            places,
+            places.auto_source,
+            target_discrs_and_qbx_sides=((density_discr, 1),))
 
     tree = geo_data.tree()
 
@@ -452,14 +459,18 @@ def test_cost_model_correctness(ctx_factory, dim, off_surface,
         targets = lpot_source.density_discr
         target_discrs_and_qbx_sides = ((targets, 1),)
         qbx_forced_limit = 1
+    places = GeometryCollection((lpot_source, targets))
+
+    source_dd = places.auto_source
+    density_discr = places.get_discretization(source_dd.geometry)
 
     # Construct bound op, run cost model.
     sigma_sym = sym.var("sigma")
     k_sym = LaplaceKernel(lpot_source.ambient_dim)
     sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=qbx_forced_limit)
 
-    op_S = bind((lpot_source, targets), sym_op_S)
-    sigma = get_density(queue, lpot_source)
+    op_S = bind(places, sym_op_S)
+    sigma = get_density(queue, density_discr)
 
     from pytools import one
     cost_S = one(op_S.get_modeled_cost(queue, sigma=sigma).values())
@@ -468,11 +479,15 @@ def test_cost_model_correctness(ctx_factory, dim, off_surface,
     # high-level interface, so call the FMM driver directly.
     from pytential.qbx.fmm import drive_fmm
     geo_data = lpot_source.qbx_fmm_geometry_data(
+            places, source_dd.geometry,
             target_discrs_and_qbx_sides=target_discrs_and_qbx_sides)
 
     wrangler = ConstantOneQBXExpansionWrangler(
             queue, geo_data, use_target_specific_qbx)
-    nnodes = lpot_source.quad_stage2_density_discr.nnodes
+
+    quad_stage2_density_discr = places.get_discretization(
+            source_dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
+    nnodes = quad_stage2_density_discr.nnodes
     src_weights = np.ones(nnodes)
 
     timing_data = {}
@@ -533,16 +548,18 @@ def test_cost_model_order_varying_by_level(ctx_factory):
             cost_model=CostModel(
                 calibration_params=CONSTANT_ONE_PARAMS),
             fmm_level_to_order=level_to_order_constant)
+    places = GeometryCollection(lpot_source)
 
+    density_discr = places.get_discretization(places.auto_source.geometry)
     sigma_sym = sym.var("sigma")
 
     k_sym = LaplaceKernel(2)
     sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
 
-    sigma = get_density(queue, lpot_source)
+    sigma = get_density(queue, density_discr)
 
     cost_constant = one(
-            bind(lpot_source, sym_op)
+            bind(places, sym_op)
             .get_modeled_cost(queue, sigma=sigma).values())
 
     # }}}
diff --git a/test/test_global_qbx.py b/test/test_global_qbx.py
index fd19c9733b5c822613b7ec264e162ab1b6e08726..9ab5b6926e2f88767bd0b01594f1a05c686174e9 100644
--- a/test/test_global_qbx.py
+++ b/test/test_global_qbx.py
@@ -43,6 +43,7 @@ from meshmode.mesh.generation import (  # noqa
 from extra_curve_data import horseshoe
 
 from pytential import bind, sym
+from pytential import GeometryCollection
 
 import logging
 logger = logging.getLogger(__name__)
@@ -80,61 +81,62 @@ def iter_elements(discr):
             discr_nodes_idx += discr_group.nunit_nodes
 
 
-def run_source_refinement_test(ctx_factory, mesh, order, helmholtz_k=None):
+def run_source_refinement_test(ctx_factory, mesh, order,
+        helmholtz_k=None, visualize=False):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
+    # {{{ initial geometry
+
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import (
             InterpolatoryQuadratureSimplexGroupFactory)
-
-    factory = InterpolatoryQuadratureSimplexGroupFactory(order)
-
-    discr = Discretization(cl_ctx, mesh, factory)
-
-    from pytential.qbx.refinement import (
-            RefinerCodeContainer, refine_for_global_qbx)
-
-    from pytential.qbx.utils import TreeCodeContainer
+    discr = Discretization(cl_ctx, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(order))
 
     lpot_source = QBXLayerPotentialSource(discr,
             qbx_order=order,  # not used in refinement
             fine_order=order)
-    del discr
+    places = GeometryCollection(lpot_source)
 
+    # }}}
+
+    # {{{ refined geometry
+
+    kernel_length_scale = 5 / helmholtz_k if helmholtz_k else None
     expansion_disturbance_tolerance = 0.025
-    refiner_extra_kwargs = {
-            "expansion_disturbance_tolerance": expansion_disturbance_tolerance,
-            }
-    if helmholtz_k is not None:
-        refiner_extra_kwargs["kernel_length_scale"] = 5/helmholtz_k
-
-    lpot_source, conn = refine_for_global_qbx(
-            lpot_source,
-            RefinerCodeContainer(
-                cl_ctx, TreeCodeContainer(cl_ctx)).get_wrangler(queue),
-            factory, **refiner_extra_kwargs)
-
-    discr_nodes = lpot_source.density_discr.nodes().get(queue)
-    fine_discr_nodes = \
-            lpot_source.quad_stage2_density_discr.nodes().get(queue)
-
-    int_centers = bind(lpot_source,
+
+    from pytential.qbx.refinement import refine_geometry_collection
+    places = refine_geometry_collection(queue, places,
+            kernel_length_scale=kernel_length_scale,
+            expansion_disturbance_tolerance=expansion_disturbance_tolerance,
+            visualize=visualize)
+
+    # }}}
+
+    dd = places.auto_source
+    stage1_density_discr = places.get_discretization(dd.geometry)
+    stage1_density_nodes = stage1_density_discr.nodes().get(queue)
+
+    quad_stage2_density_discr = places.get_discretization(
+            dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
+    quad_stage2_density_nodes = quad_stage2_density_discr.nodes().get(queue)
+
+    int_centers = bind(places,
         sym.expansion_centers(lpot_source.ambient_dim, -1))(queue)
     int_centers = np.array([axis.get(queue) for axis in int_centers])
-    ext_centers = bind(lpot_source,
+    ext_centers = bind(places,
         sym.expansion_centers(lpot_source.ambient_dim, +1))(queue)
     ext_centers = np.array([axis.get(queue) for axis in ext_centers])
 
-    expansion_radii = bind(lpot_source,
+    expansion_radii = bind(places,
         sym.expansion_radii(lpot_source.ambient_dim))(queue).get()
-    source_danger_zone_radii = bind(lpot_source, sym._source_danger_zone_radii(
-        lpot_source.ambient_dim,
-        dofdesc=sym.GRANULARITY_ELEMENT))(queue).get()
 
-    quad_res = bind(lpot_source, sym._quad_resolution(
-        lpot_source.ambient_dim,
-        dofdesc=sym.GRANULARITY_ELEMENT))(queue)
+    dd = dd.copy(granularity=sym.GRANULARITY_ELEMENT)
+    source_danger_zone_radii = bind(places, sym._source_danger_zone_radii(
+        lpot_source.ambient_dim, dofdesc=dd.to_stage2()))(queue).get()
+    quad_res = bind(places, sym._quad_resolution(
+        lpot_source.ambient_dim, dofdesc=dd))(queue)
 
     # {{{ check if satisfying criteria
 
@@ -147,7 +149,7 @@ def run_source_refinement_test(ctx_factory, mesh, order, helmholtz_k=None):
         my_ext_centers = ext_centers[:, centers_panel.discr_slice]
         all_centers = np.append(my_int_centers, my_ext_centers, axis=-1)
 
-        nodes = discr_nodes[:, sources_panel.discr_slice]
+        nodes = stage1_density_nodes[:, sources_panel.discr_slice]
 
         # =distance(centers of panel 1, panel 2)
         dist = (
@@ -172,7 +174,7 @@ def run_source_refinement_test(ctx_factory, mesh, order, helmholtz_k=None):
         my_ext_centers = ext_centers[:, centers_panel.discr_slice]
         all_centers = np.append(my_int_centers, my_ext_centers, axis=-1)
 
-        nodes = fine_discr_nodes[:, sources_panel.discr_slice]
+        nodes = quad_stage2_density_nodes[:, sources_panel.discr_slice]
 
         # =distance(centers of panel 1, panel 2)
         dist = (
@@ -192,10 +194,10 @@ def run_source_refinement_test(ctx_factory, mesh, order, helmholtz_k=None):
         # Check wavenumber to panel size ratio.
         assert quad_res[panel.element_nr] * helmholtz_k <= 5
 
-    for i, panel_1 in enumerate(iter_elements(lpot_source.density_discr)):
-        for panel_2 in iter_elements(lpot_source.density_discr):
+    for i, panel_1 in enumerate(iter_elements(stage1_density_discr)):
+        for panel_2 in iter_elements(stage1_density_discr):
             check_disk_undisturbed_by_sources(panel_1, panel_2)
-        for panel_2 in iter_elements(lpot_source.quad_stage2_density_discr):
+        for panel_2 in iter_elements(quad_stage2_density_discr):
             check_sufficient_quadrature_resolution(panel_1, panel_2)
         if helmholtz_k is not None:
             check_quad_res_to_helmholtz_k_ratio(panel_1)
@@ -246,17 +248,12 @@ def test_target_association(ctx_factory, curve_name, curve_f, nelements,
     from meshmode.discretization.poly_element import \
             InterpolatoryQuadratureSimplexGroupFactory
     factory = InterpolatoryQuadratureSimplexGroupFactory(order)
-
     discr = Discretization(cl_ctx, mesh, factory)
 
-    lpot_source, conn = QBXLayerPotentialSource(discr,
+    lpot_source = QBXLayerPotentialSource(discr,
             qbx_order=order,  # not used in target association
-            fine_order=order).with_refinement()
-    del discr
-
-    from pytential.qbx.utils import get_interleaved_centers
-    centers = np.array([ax.get(queue)
-            for ax in get_interleaved_centers(queue, lpot_source)])
+            fine_order=order)
+    places = GeometryCollection(lpot_source)
 
     # }}}
 
@@ -264,20 +261,24 @@ def test_target_association(ctx_factory, curve_name, curve_f, nelements,
 
     from pyopencl.clrandom import PhiloxGenerator
     rng = PhiloxGenerator(cl_ctx, seed=RNG_SEED)
-    nsources = lpot_source.density_discr.nnodes
-    noise = rng.uniform(queue, nsources, dtype=np.float, a=0.01, b=1.0)
-    tunnel_radius = bind(lpot_source,
-        sym._close_target_tunnel_radii(lpot_source.ambient_dim))(queue)
-
-    def targets_from_sources(sign, dist):
-        from pytential import sym, bind
-        dim = 2
-        nodes = bind(lpot_source.density_discr, sym.nodes(dim))(queue)
-        normals = bind(lpot_source.density_discr, sym.normal(dim))(queue)
+
+    dd = places.auto_source.to_stage1()
+    centers = bind(places, sym.interleaved_expansion_centers(
+        lpot_source.ambient_dim, dofdesc=dd))(queue)
+    centers = np.array([ax.get(queue) for ax in centers])
+
+    tunnel_radius = bind(places, sym._close_target_tunnel_radii(
+        lpot_source.ambient_dim, dofdesc=dd))(queue)
+
+    density_discr = places.get_discretization(dd.geometry)
+    noise = rng.uniform(queue, density_discr.nnodes, dtype=np.float, a=0.01, b=1.0)
+
+    def targets_from_sources(sign, dist, dim=2):
+        nodes = bind(places, sym.nodes(dim, dofdesc=dd))(queue)
+        normals = bind(places, sym.normal(dim, dofdesc=dd))(queue)
         return (nodes + normals * sign * dist).as_vector(np.object)
 
     from pytential.target import PointsTarget
-
     int_targets = PointsTarget(targets_from_sources(-1, noise * tunnel_radius))
     ext_targets = PointsTarget(targets_from_sources(+1, noise * tunnel_radius))
     far_targets = PointsTarget(targets_from_sources(+1, FAR_TARGET_DIST_FROM_SOURCE))
@@ -285,9 +286,9 @@ def test_target_association(ctx_factory, curve_name, curve_f, nelements,
     # Create target discretizations.
     target_discrs = (
         # On-surface targets, interior
-        (lpot_source.density_discr, -1),
+        (density_discr, -1),
         # On-surface targets, exterior
-        (lpot_source.density_discr, +1),
+        (density_discr, +1),
         # Interior close targets
         (int_targets, -2),
         # Exterior close targets
@@ -313,22 +314,22 @@ def test_target_association(ctx_factory, curve_name, curve_f, nelements,
             TargetAssociationCodeContainer, associate_targets_to_qbx_centers)
 
     from pytential.qbx.utils import TreeCodeContainer
-
     code_container = TargetAssociationCodeContainer(
             cl_ctx, TreeCodeContainer(cl_ctx))
 
     target_assoc = (associate_targets_to_qbx_centers(
-            lpot_source,
+            places,
+            places.auto_source,
             code_container.get_wrangler(queue),
             target_discrs,
             target_association_tolerance=1e-10)
         .get(queue=queue))
 
-    expansion_radii = bind(lpot_source, sym.expansion_radii(
+    expansion_radii = bind(places, sym.expansion_radii(
         lpot_source.ambient_dim,
         granularity=sym.GRANULARITY_CENTER))(queue).get()
     surf_targets = np.array(
-            [axis.get(queue) for axis in lpot_source.density_discr.nodes()])
+            [axis.get(queue) for axis in density_discr.nodes()])
     int_targets = np.array([axis.get(queue) for axis in int_targets.nodes()])
     ext_targets = np.array([axis.get(queue) for axis in ext_targets.nodes()])
 
@@ -336,7 +337,7 @@ def test_target_association(ctx_factory, curve_name, curve_f, nelements,
         import matplotlib.pyplot as plt
         from meshmode.mesh.visualization import draw_curve
 
-        draw_curve(lpot_source.density_discr.mesh)
+        draw_curve(density_discr.mesh)
 
         targets = int_targets
         tgt_slice = surf_int_slice
@@ -433,6 +434,7 @@ def test_target_association_failure(ctx_factory):
     lpot_source = QBXLayerPotentialSource(discr,
             qbx_order=order,  # not used in target association
             fine_order=order)
+    places = GeometryCollection(lpot_source)
 
     # }}}
 
@@ -460,7 +462,8 @@ def test_target_association_failure(ctx_factory):
 
     with pytest.raises(QBXTargetAssociationFailedException):
         associate_targets_to_qbx_centers(
-            lpot_source,
+            places,
+            places.auto_source,
             code_container.get_wrangler(queue),
             targets,
             target_association_tolerance=1e-10)
diff --git a/test/test_layer_pot.py b/test/test_layer_pot.py
index 74b918164c0110ee9b10f4929e9e8b77bd32e77c..c7eeb200c97195a51881a28fcb4092d94d257599 100644
--- a/test/test_layer_pot.py
+++ b/test/test_layer_pot.py
@@ -36,7 +36,9 @@ from meshmode.mesh.generation import (  # noqa
         ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
         make_curve_mesh, NArmedStarfish)
 from sumpy.visualization import FieldPlotter
+
 from pytential import bind, sym, norm
+from pytential import GeometryCollection
 
 import logging
 logger = logging.getLogger(__name__)
@@ -79,7 +81,7 @@ def test_geometry(ctx_factory):
 # {{{ test off-surface eval
 
 @pytest.mark.parametrize("use_fmm", [True, False])
-def test_off_surface_eval(ctx_factory, use_fmm, do_plot=False):
+def test_off_surface_eval(ctx_factory, use_fmm, visualize=False):
     logging.basicConfig(level=logging.INFO)
 
     cl_ctx = ctx_factory()
@@ -108,32 +110,32 @@ def test_off_surface_eval(ctx_factory, use_fmm, do_plot=False):
 
     pre_density_discr = Discretization(
             cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))
-    qbx, _ = QBXLayerPotentialSource(
+    qbx = QBXLayerPotentialSource(
             pre_density_discr,
             4*target_order,
             qbx_order,
             fmm_order=fmm_order,
-            ).with_refinement()
+            )
+
+    from pytential.target import PointsTarget
+    fplot = FieldPlotter(np.zeros(2), extent=0.54, npoints=30)
+    targets = PointsTarget(fplot.points)
 
-    density_discr = qbx.density_discr
+    places = GeometryCollection((qbx, targets))
+    density_discr = places.get_discretization(places.auto_source.geometry)
 
     from sumpy.kernel import LaplaceKernel
     op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=-2)
 
     sigma = density_discr.zeros(queue) + 1
+    fld_in_vol = bind(places, op)(queue, sigma=sigma)
+    fld_in_vol_exact = -1
 
-    fplot = FieldPlotter(np.zeros(2), extent=0.54, npoints=30)
-    from pytential.target import PointsTarget
-    fld_in_vol = bind(
-            (qbx, PointsTarget(fplot.points)),
-            op)(queue, sigma=sigma)
-
-    err = cl.clmath.fabs(fld_in_vol - (-1))
-
+    err = cl.clmath.fabs(fld_in_vol - fld_in_vol_exact)
     linf_err = cl.array.max(err).get()
     print("l_inf error:", linf_err)
 
-    if do_plot:
+    if visualize:
         fplot.show_scalar_in_matplotlib(fld_in_vol.get())
         import matplotlib.pyplot as pt
         pt.colorbar()
@@ -171,41 +173,48 @@ def test_off_surface_eval_vs_direct(ctx_factory,  do_plot=False):
 
     pre_density_discr = Discretization(
             cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))
-    direct_qbx, _ = QBXLayerPotentialSource(
+    direct_qbx = QBXLayerPotentialSource(
             pre_density_discr, 4*target_order, qbx_order,
             fmm_order=False,
             target_association_tolerance=0.05,
-            ).with_refinement()
-    fmm_qbx, _ = QBXLayerPotentialSource(
+            )
+    fmm_qbx = QBXLayerPotentialSource(
             pre_density_discr, 4*target_order, qbx_order,
             fmm_order=qbx_order + 3,
             _expansions_in_tree_have_extent=True,
             target_association_tolerance=0.05,
-            ).with_refinement()
+            )
 
     fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
     from pytential.target import PointsTarget
     ptarget = PointsTarget(fplot.points)
     from sumpy.kernel import LaplaceKernel
 
-    op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=None)
+    places = GeometryCollection({
+        "direct_qbx": direct_qbx,
+        "fmm_qbx": fmm_qbx,
+        "target": ptarget})
+
+    direct_density_discr = places.get_discretization("direct_qbx")
+    fmm_density_discr = places.get_discretization("fmm_qbx")
 
     from pytential.qbx import QBXTargetAssociationFailedException
+    op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=None)
     try:
-        direct_density_discr = direct_qbx.density_discr
         direct_sigma = direct_density_discr.zeros(queue) + 1
-        direct_fld_in_vol = bind((direct_qbx, ptarget), op)(
-                queue, sigma=direct_sigma)
-
+        direct_fld_in_vol = bind(places, op,
+                auto_where=("direct_qbx", "target"))(
+                        queue, sigma=direct_sigma)
     except QBXTargetAssociationFailedException as e:
         fplot.show_scalar_in_matplotlib(e.failed_target_flags.get(queue))
         import matplotlib.pyplot as pt
         pt.show()
         raise
 
-    fmm_density_discr = fmm_qbx.density_discr
     fmm_sigma = fmm_density_discr.zeros(queue) + 1
-    fmm_fld_in_vol = bind((fmm_qbx, ptarget), op)(queue, sigma=fmm_sigma)
+    fmm_fld_in_vol = bind(places, op,
+            auto_where=("fmm_qbx", "target"))(
+                    queue, sigma=fmm_sigma)
 
     err = cl.clmath.fabs(fmm_fld_in_vol - direct_fld_in_vol)
 
@@ -246,23 +255,29 @@ def test_unregularized_with_ones_kernel(ctx_factory):
             InterpolatoryQuadratureSimplexGroupFactory(order))
 
     from pytential.unregularized import UnregularizedLayerPotentialSource
-    lpot_src = UnregularizedLayerPotentialSource(discr)
-
-    from sumpy.kernel import one_kernel_2d
+    lpot_source = UnregularizedLayerPotentialSource(discr)
+    from pytential.target import PointsTarget
+    targets = PointsTarget(np.zeros((2, 1), dtype=float))
 
-    expr = sym.IntG(one_kernel_2d, sym.var("sigma"), qbx_forced_limit=None)
+    places = GeometryCollection({
+        sym.DEFAULT_SOURCE: lpot_source,
+        sym.DEFAULT_TARGET: lpot_source,
+        "target_non_self": targets})
 
-    from pytential.target import PointsTarget
-    op_self = bind(lpot_src, expr)
-    op_nonself = bind((lpot_src, PointsTarget(np.zeros((2, 1), dtype=float))), expr)
+    from sumpy.kernel import one_kernel_2d
+    sigma_sym = sym.var("sigma")
+    op = sym.IntG(one_kernel_2d, sigma_sym, qbx_forced_limit=None)
 
-    with cl.CommandQueue(cl_ctx) as queue:
-        sigma = cl.array.zeros(queue, discr.nnodes, dtype=float)
-        sigma.fill(1)
-        sigma.finish()
+    sigma = cl.array.zeros(queue, discr.nnodes, dtype=float)
+    sigma.fill(1)
+    sigma.finish()
 
-        result_self = op_self(queue, sigma=sigma)
-        result_nonself = op_nonself(queue, sigma=sigma)
+    result_self = bind(places, op,
+            auto_where=places.auto_where)(
+                    queue, sigma=sigma)
+    result_nonself = bind(places, op,
+            auto_where=(places.auto_source, "target_non_self"))(
+                    queue, sigma=sigma)
 
     assert np.allclose(result_self.get(), 2 * np.pi)
     assert np.allclose(result_nonself.get(), 2 * np.pi)
@@ -276,6 +291,8 @@ def test_unregularized_off_surface_fmm_vs_direct(ctx_factory):
     target_order = 8
     fmm_order = 4
 
+    # {{{ geometry
+
     mesh = make_curve_mesh(WobblyCircle.random(8, seed=30),
                 np.linspace(0, 1, nelements+1),
                 target_order)
@@ -299,19 +316,35 @@ def test_unregularized_off_surface_fmm_vs_direct(ctx_factory):
     fplot = FieldPlotter(np.zeros(2), extent=5, npoints=100)
     from pytential.target import PointsTarget
     ptarget = PointsTarget(fplot.points)
-    from sumpy.kernel import LaplaceKernel
 
+    from pytential import GeometryCollection
+    places = GeometryCollection({
+        "unregularized_direct": direct,
+        "unregularized_fmm": fmm,
+        "targets": ptarget})
+
+    # }}}
+
+    # {{{ check
+
+    from sumpy.kernel import LaplaceKernel
     op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=None)
 
-    direct_fld_in_vol = bind((direct, ptarget), op)(queue, sigma=sigma)
-    fmm_fld_in_vol = bind((fmm, ptarget), op)(queue, sigma=sigma)
+    direct_fld_in_vol = bind(places, op,
+            auto_where=("unregularized_direct", "targets"))(
+                    queue, sigma=sigma)
+    fmm_fld_in_vol = bind(places, op,
+            auto_where=("unregularized_fmm", "targets"))(queue, sigma=sigma)
 
     err = cl.clmath.fabs(fmm_fld_in_vol - direct_fld_in_vol)
 
     linf_err = cl.array.max(err).get()
     print("l_inf error:", linf_err)
+
     assert linf_err < 5e-3
 
+    # }}}
+
 # }}}
 
 
@@ -348,12 +381,15 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
                 InterpolatoryQuadratureSimplexGroupFactory(3))
 
         from pytential.qbx import QBXLayerPotentialSource
-        qbx, _ = QBXLayerPotentialSource(
+        qbx = QBXLayerPotentialSource(
                 pre_discr, fine_order=4*target_order,
                 qbx_order=qbx_order,
                 fmm_order=qbx_order + 5,
                 fmm_backend="fmmlib"
-                ).with_refinement()
+                )
+
+        places = GeometryCollection(qbx)
+        density_discr = places.get_discretization(places.auto_source.geometry)
 
         from sumpy.kernel import LaplaceKernel
         knl = LaplaceKernel(3)
@@ -365,7 +401,7 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
                 sym.cse(sym.tangential_to_xyz(density_sym), "jxyz"),
                 qbx_forced_limit=qbx_forced_limit)))
 
-        x, y, z = qbx.density_discr.nodes().with_queue(queue)
+        x, y, z = density_discr.nodes().with_queue(queue)
         m = cl.clmath
 
         if relation == "nxcurls":
@@ -379,8 +415,7 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
             # conjure up some globally smooth functions, interpret their values
             # in the tangential coordinate system, and be done. Instead, generate
             # an XYZ function and project it.
-            density = bind(
-                    qbx,
+            density = bind(places,
                     sym.xyz_to_tangential(sym.make_sym_vector("jxyz", 3)))(
                             queue,
                             jxyz=sym.make_obj_array([
@@ -412,13 +447,13 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
         else:
             raise ValueError("unexpected value of 'relation': %s" % relation)
 
-        bound_jump_identity = bind(qbx, jump_identity_sym)
+        bound_jump_identity = bind(places, jump_identity_sym)
         jump_identity = bound_jump_identity(queue, density=density)
 
-        h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+        h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
         err = (
-                norm(qbx, queue, jump_identity, np.inf)
-                / norm(qbx, queue, density, np.inf))
+                norm(density_discr, queue, jump_identity, np.inf)
+                / norm(density_discr, queue, density, np.inf))
         print("ERROR", h_max, err)
 
         eoc_rec.add_data_point(h_max, err)
@@ -426,15 +461,15 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
         # {{{ visualization
 
         if visualize and relation == "nxcurls":
-            nxcurlS_ext = bind(qbx, nxcurlS(+1))(queue, density=density)
-            nxcurlS_avg = bind(qbx, nxcurlS("avg"))(queue, density=density)
-            jtxyz = bind(qbx, sym.tangential_to_xyz(density_sym))(
+            nxcurlS_ext = bind(places, nxcurlS(+1))(queue, density=density)
+            nxcurlS_avg = bind(places, nxcurlS("avg"))(queue, density=density)
+            jtxyz = bind(places, sym.tangential_to_xyz(density_sym))(
                     queue, density=density)
 
             from meshmode.discretization.visualization import make_visualizer
             bdry_vis = make_visualizer(queue, qbx.density_discr, target_order+3)
 
-            bdry_normals = bind(qbx, sym.normal(3))(queue)\
+            bdry_normals = bind(places, sym.normal(3))(queue)\
                     .as_vector(dtype=object)
 
             bdry_vis.write_vtk_file("source-%s.vtu" % nel_factor, [
@@ -445,16 +480,16 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False):
                 ])
 
         if visualize and relation == "sp":
-            sp_ext = bind(qbx, sym.Sp(knl, density_sym, qbx_forced_limit=+1))(
-                    queue, density=density)
-            sp_avg = bind(qbx, sym.Sp(knl, density_sym, qbx_forced_limit="avg"))(
-                    queue, density=density)
+            op = sym.Sp(knl, density_sym, qbx_forced_limit=+1)
+            sp_ext = bind(places, op)(queue, density=density)
+            op = sym.Sp(knl, density_sym, qbx_forced_limit="avg")
+            sp_avg = bind(places, op)(queue, density=density)
 
             from meshmode.discretization.visualization import make_visualizer
             bdry_vis = make_visualizer(queue, qbx.density_discr, target_order+3)
 
-            bdry_normals = bind(qbx, sym.normal(3))(queue)\
-                    .as_vector(dtype=object)
+            bdry_normals = bind(places,
+                    sym.normal(3))(queue).as_vector(dtype=object)
 
             bdry_vis.write_vtk_file("source-%s.vtu" % nel_factor, [
                 ("density", density),
diff --git a/test/test_layer_pot_eigenvalues.py b/test/test_layer_pot_eigenvalues.py
index 5737009ca3d30daa14dd56c39fd5446ea0dde795..af2278ebeb402190cd921255404bd06ed35a0d26 100644
--- a/test/test_layer_pot_eigenvalues.py
+++ b/test/test_layer_pot_eigenvalues.py
@@ -35,7 +35,9 @@ from functools import partial
 from meshmode.mesh.generation import (  # noqa
         ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
         make_curve_mesh, NArmedStarfish)
+
 from pytential import bind, sym, norm
+from pytential import GeometryCollection
 
 import logging
 logger = logging.getLogger(__name__)
@@ -62,7 +64,7 @@ except ImportError:
             (2, 7, 5, True),
             ])
 def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
-        force_direct):
+        force_direct, visualize=False):
     logging.basicConfig(level=logging.INFO)
 
     print("ellipse_aspect: %s, mode_nr: %d, qbx_order: %d" % (
@@ -107,28 +109,30 @@ def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
         pre_density_discr = Discretization(
                 cl_ctx, mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(target_order))
-        qbx, _ = QBXLayerPotentialSource(
+        qbx = QBXLayerPotentialSource(
                 pre_density_discr, 4*target_order,
                 qbx_order, fmm_order=fmm_order,
                 _expansions_in_tree_have_extent=True,
-                ).with_refinement()
+                )
+        places = GeometryCollection(qbx)
 
-        density_discr = qbx.density_discr
+        density_discr = places.get_discretization(places.auto_source.geometry)
         nodes = density_discr.nodes().with_queue(queue)
 
-        if 0:
+        if visualize:
             # plot geometry, centers, normals
-
-            centers = bind(qbx,
+            centers = bind(places,
                     sym.expansion_centers(qbx.ambient_dim, +1))(queue)
+            normal = bind(places,
+                    sym.normal(qbx.ambient_dim))(queue).as_vector(np.object)
 
             nodes_h = nodes.get()
             centers_h = [centers[0].get(), centers[1].get()]
+            normals_h = [normal[0].get(), normal[1].get()]
+
             pt.plot(nodes_h[0], nodes_h[1], "x-")
             pt.plot(centers_h[0], centers_h[1], "o")
-            normal = bind(qbx, sym.normal(ambient_dim=2))(queue).as_vector(np.object)
-            pt.quiver(nodes_h[0], nodes_h[1],
-                    normal[0].get(), normal[1].get())
+            pt.quiver(nodes_h[0], nodes_h[1], normals_h[0], normals_h[1])
             pt.gca().set_aspect("equal")
             pt.show()
 
@@ -146,10 +150,11 @@ def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
 
         # {{{ single layer
 
-        sigma = cl.clmath.cos(mode_nr*angle)/J
+        sigma_sym = sym.var("sigma")
+        s_sigma_op = sym.S(lap_knl, sigma_sym, qbx_forced_limit=+1)
 
-        s_sigma_op = bind(qbx, sym.S(lap_knl, sym.var("sigma"), qbx_forced_limit=+1))
-        s_sigma = s_sigma_op(queue=queue, sigma=sigma)
+        sigma = cl.clmath.cos(mode_nr*angle)/J
+        s_sigma = bind(places, s_sigma_op)(queue=queue, sigma=sigma)
 
         # SIGN BINGO! :)
         s_eigval = 1/(2*mode_nr) * (1 + (-1)**mode_nr * ellipse_fraction)
@@ -160,11 +165,11 @@ def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
         if 0:
             #pt.plot(s_sigma.get(), label="result")
             #pt.plot(s_sigma_ref.get(), label="ref")
-            pt.plot((s_sigma_ref-s_sigma).get(), label="err")
+            pt.plot((s_sigma_ref - s_sigma).get(), label="err")
             pt.legend()
             pt.show()
 
-        h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+        h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
         s_err = (
                 norm(density_discr, queue, s_sigma - s_sigma_ref)
                 / norm(density_discr, queue, s_sigma_ref))
@@ -174,11 +179,10 @@ def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
 
         # {{{ double layer
 
-        sigma = cl.clmath.cos(mode_nr*angle)
+        d_sigma_op = sym.D(lap_knl, sigma_sym, qbx_forced_limit="avg")
 
-        d_sigma_op = bind(qbx,
-                sym.D(lap_knl, sym.var("sigma"), qbx_forced_limit="avg"))
-        d_sigma = d_sigma_op(queue=queue, sigma=sigma)
+        sigma = cl.clmath.cos(mode_nr*angle)
+        d_sigma = bind(places, d_sigma_op)(queue=queue, sigma=sigma)
 
         # SIGN BINGO! :)
         d_eigval = -(-1)**mode_nr * 1/2*ellipse_fraction
@@ -206,11 +210,10 @@ def test_ellipse_eigenvalues(ctx_factory, ellipse_aspect, mode_nr, qbx_order,
         if ellipse_aspect == 1:
             # {{{ S'
 
-            sigma = cl.clmath.cos(mode_nr*angle)
+            sp_sigma_op = sym.Sp(lap_knl, sym.var("sigma"), qbx_forced_limit="avg")
 
-            sp_sigma_op = bind(qbx,
-                    sym.Sp(lap_knl, sym.var("sigma"), qbx_forced_limit="avg"))
-            sp_sigma = sp_sigma_op(queue=queue, sigma=sigma)
+            sigma = cl.clmath.cos(mode_nr*angle)
+            sp_sigma = bind(places, sp_sigma_op)(queue=queue, sigma=sigma)
             sp_eigval = 0
 
             sp_sigma_ref = sp_eigval*sigma
@@ -291,13 +294,14 @@ def test_sphere_eigenvalues(ctx_factory, mode_m, mode_n, qbx_order,
         pre_density_discr = Discretization(
                 cl_ctx, mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(target_order))
-        qbx, _ = QBXLayerPotentialSource(
+        qbx = QBXLayerPotentialSource(
                 pre_density_discr, 4*target_order,
                 qbx_order, fmm_order=6,
                 fmm_backend=fmm_backend,
-                ).with_refinement()
+                )
+        places = GeometryCollection(qbx)
 
-        density_discr = qbx.density_discr
+        density_discr = places.get_discretization(places.auto_source.geometry)
         nodes = density_discr.nodes().with_queue(queue)
         r = cl.clmath.sqrt(nodes[0]**2 + nodes[1]**2 + nodes[2]**2)
         phi = cl.clmath.acos(nodes[2]/r)
@@ -311,18 +315,19 @@ def test_sphere_eigenvalues(ctx_factory, mode_m, mode_n, qbx_order,
 
         # {{{ single layer
 
-        s_sigma_op = bind(qbx, sym.S(lap_knl, sym.var("sigma"), qbx_forced_limit=+1))
+        s_sigma_op = bind(places,
+                sym.S(lap_knl, sym.var("sigma"), qbx_forced_limit=+1))
         s_sigma = s_sigma_op(queue=queue, sigma=ymn)
         s_eigval = 1/(2*mode_n + 1)
 
-        h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+        h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
         s_eoc_rec.add_data_point(h_max, rel_err(s_sigma, s_eigval*ymn))
 
         # }}}
 
         # {{{ double layer
 
-        d_sigma_op = bind(qbx,
+        d_sigma_op = bind(places,
                 sym.D(lap_knl, sym.var("sigma"), qbx_forced_limit="avg"))
         d_sigma = d_sigma_op(queue=queue, sigma=ymn)
         d_eigval = -1/(2*(2*mode_n + 1))
@@ -332,7 +337,7 @@ def test_sphere_eigenvalues(ctx_factory, mode_m, mode_n, qbx_order,
 
         # {{{ S'
 
-        sp_sigma_op = bind(qbx,
+        sp_sigma_op = bind(places,
                  sym.Sp(lap_knl, sym.var("sigma"), qbx_forced_limit="avg"))
         sp_sigma = sp_sigma_op(queue=queue, sigma=ymn)
         sp_eigval = -1/(2*(2*mode_n + 1))
@@ -343,7 +348,7 @@ def test_sphere_eigenvalues(ctx_factory, mode_m, mode_n, qbx_order,
 
         # {{{ D'
 
-        dp_sigma_op = bind(qbx,
+        dp_sigma_op = bind(places,
                 sym.Dp(lap_knl, sym.var("sigma"), qbx_forced_limit="avg"))
         dp_sigma = dp_sigma_op(queue=queue, sigma=ymn)
         dp_eigval = -(mode_n*(mode_n+1))/(2*mode_n + 1)
diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py
index 9151f510d7841f4a19f07893016e92332ae181e4..7b0cbc65cd87a2061d06894cb405b4341967ba7c 100644
--- a/test/test_layer_pot_identity.py
+++ b/test/test_layer_pot_identity.py
@@ -36,8 +36,11 @@ from meshmode.mesh.generation import (  # noqa
         ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
         NArmedStarfish,
         make_curve_mesh)
+
 # from sumpy.visualization import FieldPlotter
 from pytential import bind, sym, norm
+from pytential import GeometryCollection
+
 from sumpy.kernel import LaplaceKernel, HelmholtzKernel
 
 import logging
@@ -314,27 +317,29 @@ def test_identity_convergence(ctx_factory,  case, visualize=False):
                 cl_ctx, mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
-        refiner_extra_kwargs = {}
-
-        if case.k != 0:
-            refiner_extra_kwargs["kernel_length_scale"] = 5/case.k
-
-        qbx, _ = QBXLayerPotentialSource(
+        qbx = QBXLayerPotentialSource(
                 pre_density_discr, 4*target_order,
                 case.qbx_order,
                 fmm_order=case.fmm_order,
                 fmm_backend=case.fmm_backend,
+                target_association_tolerance=1.0e-1,
                 _expansions_in_tree_have_extent=True,
                 _expansion_stick_out_factor=getattr(
                     case, "_expansion_stick_out_factor", 0),
-                ).with_refinement(**refiner_extra_kwargs)
+                )
+        places = GeometryCollection(qbx)
 
-        density_discr = qbx.density_discr
+        from pytential.qbx.refinement import refine_geometry_collection
+        kernel_length_scale = 5 / case.k if case.k else None
+        places = refine_geometry_collection(queue, places,
+                kernel_length_scale=kernel_length_scale)
 
         # {{{ compute values of a solution to the PDE
 
+        density_discr = places.get_discretization(places.auto_source.geometry)
+
         nodes_host = density_discr.nodes().get(queue)
-        normal = bind(density_discr, sym.normal(d))(queue).as_vector(np.object)
+        normal = bind(places, sym.normal(d))(queue).as_vector(np.object)
         normal_host = [normal[j].get() for j in range(d)]
 
         if k != 0:
@@ -378,7 +383,7 @@ def test_identity_convergence(ctx_factory,  case, visualize=False):
         key = (case.qbx_order, case.geometry.mesh_name, resolution,
                 case.expr.zero_op_name)
 
-        bound_op = bind(qbx, case.expr.get_zero_op(k_sym, **knl_kwargs))
+        bound_op = bind(places, case.expr.get_zero_op(k_sym, **knl_kwargs))
         error = bound_op(
                 queue, u=u_dev, dn_u=dn_u_dev, grad_u=grad_u_dev, k=case.k)
         if 0:
@@ -388,15 +393,15 @@ def test_identity_convergence(ctx_factory,  case, visualize=False):
         linf_error_norm = norm(density_discr, queue, error, p=np.inf)
         print("--->", key, linf_error_norm)
 
-        h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+        h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
         eoc_rec.add_data_point(h_max, linf_error_norm)
 
         if visualize:
             from meshmode.discretization.visualization import make_visualizer
             bdry_vis = make_visualizer(queue, density_discr, target_order)
 
-            bdry_normals = bind(density_discr, sym.normal(mesh.ambient_dim))(queue)\
-                    .as_vector(dtype=object)
+            bdry_normals = bind(places, sym.normal(mesh.ambient_dim))(queue)\
+                    .as_vector(dtype=np.object)
 
             bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [
                 ("u", u_dev),
diff --git a/test/test_linalg_hss.py b/test/test_linalg_hss.py
new file mode 100644
index 0000000000000000000000000000000000000000..01edf036ed0b412e4c637becc3388007c2bc105d
--- /dev/null
+++ b/test/test_linalg_hss.py
@@ -0,0 +1,863 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+
+from pytential import sym
+from pytential import GeometryCollection
+
+from sumpy.tools import MatrixBlockIndexRanges
+
+from extra_matrix_tools import build_block_index
+from extra_matrix_tools import CurveHSSTestCase, TorusHSSTestCase
+
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+try:
+    import matplotlib.pyplot as pt
+    USE_MATPLOTLIB = True
+except ImportError:
+    USE_MATPLOTLIB = False
+
+import logging
+from pytential.log import set_up_logging
+
+
+logger = logging.getLogger(__name__)
+set_up_logging([__name__], logging.INFO)
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase, TorusHSSTestCase])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+def test_proxy_generator(ctx_factory, cls, factor, visualize=False, **kwargs):
+    """Check proxy points lie on, and block nodes inside, each proxy ball."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if not USE_MATPLOTLIB:
+        visualize = False  # the 2D plots below use the module-level "pt"
+
+    target_order = 2 if cls.ambient_dim == 3 else 7
+    case = cls(partition_factor=factor, target_order=target_order, **kwargs)
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    logger.info("nblocks %3d nnodes %7d", case.nblocks, dep_discr.nnodes)
+
+    indices, _ = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+
+    # }}}
+
+    # {{{ check proxies
+
+    from pytential.linalg.hss import ProxyGenerator
+    generator = ProxyGenerator(places, radius_factor=case.proxy_radius_factor)
+    proxy = generator(queue, dd, indices)
+
+    indices = indices.get(queue)
+    nodes = dep_discr.nodes().get(queue)
+    proxy = proxy.get(queue)
+
+    pxyranges = proxy.indices.ranges
+    pxypoints = np.vstack(proxy.points)
+    pxycenters = np.vstack(proxy.centers)
+    pxyradii = proxy.radii
+
+    # proxies must sit on the proxy sphere; block nodes strictly inside it
+    for i in range(case.nblocks):
+        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+        isrc = indices.block_indices(i)
+
+        r = la.norm(pxypoints[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)
+
+        r = la.norm(nodes[:, isrc] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        assert np.all(r < pxyradii[i])
+
+    # }}}
+
+    if not visualize:
+        return
+
+    if case.ambient_dim == 2:
+        srcnodes = nodes[:, indices.indices]
+
+        from pytential import bind
+        radii = bind(places,
+                sym.expansion_radii(case.ambient_dim, dofdesc=dd))(queue)
+        center_int = bind(places,
+                sym.expansion_centers(case.ambient_dim, -1, dofdesc=dd))(queue)
+        center_ext = bind(places,
+                sym.expansion_centers(case.ambient_dim, +1, dofdesc=dd))(queue)
+
+        radii = radii.get(queue)
+        ce = np.vstack([c.get(queue) for c in center_ext])
+        ci = np.vstack([c.get(queue) for c in center_int])
+
+        fig = pt.figure(figsize=(10, 10), dpi=300)
+        for i in range(indices.nblocks):
+            isrc = indices.block_indices(i)
+            ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+
+            axis = pt.gca()
+            for j in isrc:
+                c = pt.Circle(ci[:, j], radii[j], color='k', alpha=0.1)
+                axis.add_artist(c)
+                c = pt.Circle(ce[:, j], radii[j], color='k', alpha=0.1)
+                axis.add_artist(c)
+
+            axis.plot(nodes[0], nodes[1], 'ko', ms=2.0, alpha=0.5)
+            axis.plot(srcnodes[0], srcnodes[1], 'o', ms=2.0)
+            axis.plot(nodes[0, isrc], nodes[1, isrc], 'o', ms=2.0)
+            axis.plot(pxypoints[0, ipxy], pxypoints[1, ipxy], 'o', ms=2.0)
+            axis.set_xlim([-1.5, 1.5])
+            axis.set_ylim([-1.5, 1.5])
+
+            filename = "test_proxy_generator_{}d_{:04}.png".format(
+                    case.ambient_dim, i)
+            fig.savefig(filename)
+            fig.clf()
+
+    if case.ambient_dim == 3:
+        from meshmode.discretization.visualization import make_visualizer
+        from meshmode.mesh.processing import ( # noqa
+                affine_map, merge_disjoint_meshes)
+        from meshmode.discretization import Discretization
+        from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+
+        from meshmode.mesh.generation import generate_icosphere
+        ref_mesh = generate_icosphere(1, generator.nproxy)
+
+        # NOTE: this draws a scaled icosphere, not the actual proxy points
+        for i in range(case.nblocks):
+            mesh = affine_map(ref_mesh,
+                A=(pxyradii[i] * np.eye(case.ambient_dim)),
+                b=pxycenters[:, i].reshape(-1))
+
+            mesh = merge_disjoint_meshes([mesh, dep_discr.mesh])
+            discr = Discretization(ctx, mesh,
+                InterpolatoryQuadratureSimplexGroupFactory(case.target_order))
+
+            vis = make_visualizer(queue, discr, 10)
+            filename = "test_proxy_generator_{}d_{:04}.vtu".format(
+                    case.ambient_dim, i)
+            vis.write_vtk_file(filename, [], overwrite=True)
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase, TorusHSSTestCase])
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+@pytest.mark.parametrize("nblocks", [10, 1])
+def test_neighboring_points(ctx_factory, cls, factor, nblocks,
+        visualize=False, **kwargs):
+    """Check neighbors lie outside each block but inside its proxy ball."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if not USE_MATPLOTLIB:
+        visualize = False  # the 2D plots below use the module-level "pt"
+
+    target_order = 2 if cls.ambient_dim == 3 else 4
+    case = cls(partition_factor=factor,
+            nblocks=nblocks,
+            target_order=target_order,
+            **kwargs)
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    logger.info("nblocks %3d nnodes %7d", case.nblocks, dep_discr.nnodes)
+
+    indices, _ = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+
+    # }}}
+
+    # {{{ generate proxy points
+
+    from pytential.linalg.hss import ProxyGenerator
+    generator = ProxyGenerator(places, radius_factor=case.proxy_radius_factor)
+    proxy = generator(queue, dd, indices)
+
+    # }}}
+
+    # {{{ check neighbors
+
+    from pytential.linalg.hss import gather_block_neighbor_points
+    nbrindices = gather_block_neighbor_points(queue, dep_discr, indices, proxy)
+
+    srcindices = indices.get(queue)
+    nbrindices = nbrindices.get(queue)
+    proxy = proxy.get(queue)
+
+    nodes = dep_discr.nodes().get(queue)
+    pxycenters = np.vstack(proxy.centers)
+    pxyradii = proxy.radii
+    pxyranges = proxy.indices.ranges
+
+    # neighbors must be disjoint from the block and inside its proxy ball
+    for i in range(srcindices.nblocks):
+        isrc = srcindices.block_indices(i)
+        inbr = nbrindices.block_indices(i)
+        assert not np.any(np.isin(inbr, isrc))
+
+        r = la.norm(nodes[:, inbr] - pxycenters[:, i].reshape(-1, 1), axis=0)
+        assert np.all(r < pxyradii[i])
+
+    # }}}
+
+    if not visualize:
+        return
+
+    if case.ambient_dim == 2:
+        srcnodes = nodes[:, srcindices.indices]
+        pxypoints = np.vstack(proxy.points)
+
+        pt.figure(figsize=(10, 10), dpi=300)
+        for i in range(srcindices.nblocks):
+            isrc = srcindices.block_indices(i)
+            inbr = nbrindices.block_indices(i)
+            ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
+
+            pt.plot(nodes[0], nodes[1], 'ko', ms=2.0, alpha=0.5)
+            pt.plot(srcnodes[0], srcnodes[1], 'o', ms=2.0)
+            pt.plot(nodes[0, isrc], nodes[1, isrc], 'o', ms=2.0)
+            pt.plot(nodes[0, inbr], nodes[1, inbr], 'o', ms=2.0)
+            pt.plot(pxypoints[0, ipxy], pxypoints[1, ipxy], 'o', ms=2.0)
+            pt.plot(pxycenters[0, i], pxycenters[1, i], 'ko', ms=6.0)
+            pt.xlim([-1.5, 1.5])
+            pt.ylim([-1.5, 1.5])
+
+            filename = "test_neighboring_points_{}d_{:04}.png".format(
+                    case.ambient_dim, i)
+            pt.savefig(filename)
+            pt.clf()
+
+    if case.ambient_dim == 3:
+        from meshmode.discretization.visualization import make_visualizer
+        from meshmode.discretization import Discretization
+        from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+        discr = Discretization(ctx, dep_discr.mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(case.target_order))
+
+        marker = np.empty(discr.nnodes)
+        for i in range(srcindices.nblocks):
+            isrc = srcindices.block_indices(i)
+            inbr = nbrindices.block_indices(i)
+
+            # TODO: some way to turn off some of the interpolations
+            # would help visualize this better.
+            # 0: other nodes, -42: the block's own nodes, +42: its neighbors
+            marker.fill(0.0)
+            marker[isrc] = -42.0
+            marker[inbr] = +42.0
+            marker_dev = cl.array.to_device(queue, marker)
+
+            vis = make_visualizer(queue, discr, 10)
+            filename = "test_neighboring_points_{}d_{:04}.vtu".format(
+                    case.ambient_dim, i)
+            vis.write_vtk_file(filename, [
+                ("marker", marker_dev),
+                ], overwrite=True)
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase, TorusHSSTestCase])
+def test_skeletonize(ctx_factory, cls, visualize=False, **kwargs):
+    """Smoke-test recursive skeletonization down to a single block."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if cls.ambient_dim == 3 or not USE_MATPLOTLIB:
+        visualize = False  # the plots below are 2D-only and need matplotlib
+
+    target_order = 2 if cls.ambient_dim == 3 else 4
+    case = cls(nblocks=6,
+            op_type="single",
+            target_order=target_order,
+            **kwargs)
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+    indices, partition = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+    blkindices = MatrixBlockIndexRanges(ctx, indices, indices)
+
+    logger.info("nblocks %3d nnodes %7d", indices.nblocks, dep_discr.nnodes)
+
+    # }}}
+
+    # {{{ operators
+
+    op = case.get_operator(places.ambient_dim)
+
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
+
+    # }}}
+
+    # {{{ wranglers
+
+    from pytential.linalg.hss import ProxyGenerator, BlockEvaluationWrangler
+    proxy = ProxyGenerator(places, radius_factor=case.proxy_radius_factor)
+
+    from pytools.obj_array import make_obj_array
+    wrangler = BlockEvaluationWrangler(
+            exprs=make_obj_array([sym_op]),
+            input_exprs=[sym_density],
+            domains=[places.auto_source],
+            context=case.knl_kwargs,
+            weighted_farfield=case.weighted_farfield,
+            farfield_block_builder=case.farfield_block_builder,
+            nearfield_block_builder=case.nearfield_block_builder)
+
+    # }}}
+
+    # {{{ recursive skeletonization
+
+    from pytential.linalg.hss import _skeletonize
+
+    if visualize:
+        sources = dep_discr.nodes().get(queue)
+
+    while True:
+        _, _, sklindices = _skeletonize(
+                queue, places, proxy, wrangler, blkindices,
+                id_eps=case.id_eps,
+                tree_max_particles_in_box=case.max_particles_in_box)
+
+        logger.info('Level:           %d / %d',
+            partition.level, partition.nlevels)
+        logger.info('\tSources:     %s', blkindices.row.indices.shape)
+        logger.info('\tTargets:     %s', blkindices.col.indices.shape)
+        logger.info('\tSkeletons:   %s', sklindices.col.indices.shape)
+        logger.info('\tBoxes:       %s', partition.partition_box_ids)
+        logger.info('\tParents:     %s', partition.partition_parent_map)
+
+        if visualize:
+            blkindices_ = blkindices.get(queue)
+            sklindices_ = sklindices.get(queue)
+
+            pt.figure(figsize=(10, 10), dpi=300)
+            pt.plot(sources[0][blkindices_.row.indices],
+                    sources[1][blkindices_.row.indices], 'ko', alpha=0.5)
+            for i in range(blkindices.nblocks):
+                isrc = sklindices_.row.block_indices(i)
+                pt.plot(sources[0][isrc], sources[1][isrc], 'o')
+            pt.savefig('test_skeletonize_run_{:02}_sources.png'
+                       .format(partition.level))
+            pt.clf()
+
+            pt.plot(sources[0][blkindices_.col.indices],
+                    sources[1][blkindices_.col.indices], 'ko', alpha=0.5)
+            for i in range(blkindices.nblocks):
+                itgt = sklindices_.col.block_indices(i)
+                pt.plot(sources[0][itgt], sources[1][itgt], 'o')
+            pt.savefig('test_skeletonize_run_{:02}_targets.png'
+                       .format(partition.level))
+            pt.close()  # release this level's figure instead of leaking it
+
+        if blkindices.nblocks == 1:
+            break
+
+        # cluster the skeleton indices into the blocks of the next level
+        blkindices = partition.cluster(sklindices)
+        partition = partition.cluster()
+
+    # }}}
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase, TorusHSSTestCase])
+@pytest.mark.parametrize(
+        ("weighted_farfield", "matrix_type"),
+        [(None, "qbx"), (False, "p2p")])
+def test_hss_compression(ctx_factory,
+        cls, weighted_farfield, matrix_type, visualize=False, **kwargs):
+    """Check each level of an HSS-compressed operator against the dense matrix.
+
+    Verifies the nesting of skeleton indices between levels, the accuracy of
+    the compressed off-diagonal blocks and the entries of the diagonal blocks.
+    """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if cls.ambient_dim == 3 or not USE_MATPLOTLIB:
+        visualize = False
+
+    case = cls(matrix_type=matrix_type,
+            weighted_farfield=weighted_farfield,
+            id_eps=1.0e-8, nblocks=16,
+            proxy_radius_factor=1.2,
+            op_type="single",
+            **kwargs)
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_source = places.get_geometry(dd.geometry)
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+    indices, partition = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+
+    logger.info("nblocks %3d nnodes %7d", indices.nblocks, dep_discr.nnodes)
+
+    # }}}
+
+    # {{{ operators
+
+    op = case.get_operator(places.ambient_dim)
+
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
+
+    # }}}
+
+    # {{{ build compressed matrix
+
+    rtol = 10.0 ** case.ambient_dim * case.id_eps
+
+    from pytential.linalg.hss import build_compressed_matrix
+    hss = build_compressed_matrix(queue,
+            places=places,
+            exprs=sym_op,
+            input_exprs=sym_density,
+            context=case.knl_kwargs,
+
+            matrix_mode="forward",
+            id_eps=case.id_eps,
+
+            _proxy_radius_factor=case.proxy_radius_factor,
+            _tree_kind='adaptive',
+            _tree_max_particles_in_box=dep_discr.nnodes // case.nblocks,
+            _weighted_farfield=case.weighted_farfield,
+            _farfield_block_builder=case.farfield_block_builder,
+            _nearfield_block_builder=case.nearfield_block_builder)
+
+    from pytential.symbolic.execution import _prepare_expr
+    sym_op = _prepare_expr(places, sym_op)
+    mat = case.dense_matrix_builder(queue,
+            dep_expr=sym_density,
+            other_dep_exprs=[],
+            dep_source=dep_source,
+            dep_discr=dep_discr,
+            places=places,
+            context=case.knl_kwargs,
+            )(sym_op)
+
+    # }}}
+
+    # {{{ check compressed matrix
+
+    def take(A, idx, i, j):
+        itgt = idx.row.block_indices(i)
+        isrc = idx.col.block_indices(j)
+        return A[np.ix_(itgt, isrc)]
+
+    if visualize:
+        sources = dep_discr.nodes().get(queue)
+
+    for level in range(hss.nlevels - 1, -1, -1):
+        cmat = hss.levels[level]
+        partition = hss.partitions[level]
+
+        findices = cmat.indices.get(queue)
+        sindices = cmat.sklindices.get(queue)
+
+        # {{{ plot skeletonized geometry
+
+        logger.info('Level %s', level)
+        logger.info('    shape:      %s', cmat.shape)
+        logger.info('    nblocks:    %s x %s', cmat.nblocks, cmat.nblocks)
+
+        if visualize:
+            pt.figure(figsize=(10, 8), dpi=300)
+            pt.plot(sources[0][findices.row.indices],
+                    sources[1][findices.row.indices], 'ko', alpha=0.5)
+            for i in range(cmat.nblocks):
+                isrc = sindices.row.block_indices(i)
+                pt.plot(sources[0][isrc], sources[1][isrc], 'o')
+            pt.savefig('test_hss_compression_skeleton_{:02}_{}.png'.format(
+                level, case.matrix_type))
+            pt.close()
+
+        # }}}
+
+        # {{{ check clustering of points
+
+        logger.info('    level-information:')
+        logger.info('        box_ids:    %s', partition.partition_box_ids)
+        logger.info('        parents:    %s', partition.partition_parent_ids)
+        # logger.info('        parent-map: %s', partition.partition_parent_map)
+
+        if level < hss.nlevels - 1:
+            csindices = hss.levels[level + 1].sklindices.get(queue)
+            cpartition = hss.partitions[level + 1]
+
+            for k, ppm in enumerate(cpartition.partition_parent_map):
+                row_indices_c = np.hstack([csindices.row.block_indices(i)
+                                           for i in ppm])
+                row_indices_p = findices.row.block_indices(k)
+                assert np.all(np.isin(row_indices_p, row_indices_c))
+
+                col_indices_c = np.hstack([csindices.col.block_indices(i)
+                                           for i in ppm])
+                col_indices_p = findices.col.block_indices(k)
+                assert np.all(np.isin(col_indices_p, col_indices_c))
+
+        # }}}
+
+        # {{{ check compression accuracy for off-diagonal blocks
+
+        err_tgt = np.zeros((cmat.nblocks, cmat.nblocks))
+        err_src = np.zeros((cmat.nblocks, cmat.nblocks))
+        err_blk = np.zeros((cmat.nblocks, cmat.nblocks))
+
+        err_mat = np.zeros(mat.shape)
+        for i in range(cmat.nblocks):
+            for j in range(cmat.nblocks):
+                if i == j:
+                    continue
+
+                ftgt = findices.row.block_indices(i)
+                fsrc = findices.col.block_indices(j)
+
+                stgt = sindices.row.block_indices(i)
+                ssrc = sindices.col.block_indices(j)
+
+                # full decomposition
+                fblk = np.ix_(ftgt, fsrc)
+                sblk = np.ix_(stgt, ssrc)
+                err_mat[fblk] = np.abs(mat[fblk]
+                        - cmat.L[i, i].dot(mat[sblk].dot(cmat.R[j, j])))
+                err_blk[i, j] = la.norm(err_mat[fblk]) / la.norm(mat[fblk])
+
+                # column decomposition
+                sblk = np.ix_(ftgt, ssrc)
+                err_src[i, j] = la.norm(mat[fblk]
+                        - mat[sblk].dot(cmat.R[j, j])) / la.norm(mat[fblk])
+
+                # row decomposition
+                sblk = np.ix_(stgt, fsrc)
+                err_tgt[i, j] = la.norm(mat[fblk]
+                        - cmat.L[i, i].dot(mat[sblk])) / la.norm(mat[fblk])
+
+        err_max = np.array([
+                np.max(err_blk), np.max(err_src), np.max(err_tgt)
+                ])
+
+        logger.info('    off-diagonal-errors:')
+        logger.info('        rtol:       %.5e', rtol)
+        logger.info('        error:      %-11s %-11s %-11s',
+            'both', 'source', 'target')
+        logger.info('        max:        %.5e %.5e %.5e', *err_max)
+
+        assert np.all(err_max < rtol)
+
+        if visualize and level > 0:
+            # full error matrix
+            pt.figure(figsize=(10, 10), dpi=300)
+            pt.imshow(np.log10(err_mat + 1.0e-16), cmap='brg')
+            pt.colorbar()
+            pt.savefig('test_hss_compression_error_{:02}_{}.png'
+                    .format(level, case.matrix_type))
+            pt.close()
+
+            # block errors
+            from pytential.linalg.hss import ProxyGenerator
+            generator = ProxyGenerator(places,
+                    radius_factor=0.01)
+            pxy_close = generator(queue, dd, cmat.indices.row).get(queue)
+            pxy_close.centers = np.vstack(pxy_close.centers)
+
+            generator = ProxyGenerator(places,
+                    radius_factor=case.proxy_radius_factor)
+            pxy = generator(queue, dd, cmat.indices.row).get(queue)
+            pxy.centers = np.vstack(pxy.centers)
+
+            fig, (ax1, ax2) = pt.subplots(1, 2, figsize=(18, 10), dpi=300)
+            im = ax1.imshow(np.log10(err_blk + 1.0e-16),
+                            cmap='brg', origin='upper')
+            fig.colorbar(im, ax=ax1, pad=0.1, orientation='horizontal')
+
+            imax, jmax = np.unravel_index(np.argmax(err_blk), err_blk.shape)
+            for i in range(cmat.nblocks):
+                if i == imax or i == jmax:
+                    c = pt.Circle(pxy_close.centers[:, i], pxy_close.radii[i],
+                            color='r', alpha=0.4)
+                else:
+                    c = pt.Circle(pxy_close.centers[:, i], pxy_close.radii[i],
+                            alpha=0.25)
+                ax2.add_artist(c)
+
+                c = pt.Circle(pxy.centers[:, i], pxy.radii[i],
+                              color='k', alpha=0.1)
+                ax2.add_artist(c)
+                ax2.text(pxy_close.centers[0, i],
+                         pxy_close.centers[1, i], "{}".format(i))
+
+            ax2.plot(sources[0], sources[1], 'k')
+            ax2.set_xlim([-1.5, 1.5])
+            ax2.set_ylim([-1.5, 1.5])
+            ax2.set_aspect('equal')
+            fig.savefig('test_hss_compression_block_{:02}_{}.png'
+                        .format(level, case.matrix_type))
+            pt.close(fig)
+
+            # block ranks
+            rrank = np.array([(cmat.L[i, i].shape[1], cmat.L[i, i].shape[0])
+                              for i in range(cmat.nblocks)]).T
+
+            pt.figure(figsize=(10, 8), dpi=300)
+            pt.plot(rrank[0, :], 'o', label="Rank")
+            pt.plot(rrank[1, :], 'k--', label="Size")
+            pt.legend()
+            pt.savefig('test_hss_compression_rank_{:02}_{}.png'
+                       .format(level, case.matrix_type))
+            pt.close()
+
+        del err_blk, err_src, err_tgt, err_mat
+
+        # }}}
+
+        # {{{ check diagonal block clustering
+
+        err_max = -np.inf
+        if level == hss.nlevels - 1:
+            for i in range(cmat.nblocks):
+                A = take(mat, findices, i, i)
+                err_max = max(err_max, la.norm(A - cmat.D[i, i]) / la.norm(A))
+        elif level > 0:
+            cpartition = hss.partitions[level + 1]
+            csindices = hss.levels[level + 1].sklindices.get(queue)
+            rranges = csindices.row.ranges
+            cranges = csindices.col.ranges
+
+            for i in range(cmat.nblocks):
+                ppm = cpartition.partition_parent_map[i]
+                r0 = np.min(rranges[ppm])
+                c0 = np.min(cranges[ppm])
+
+                A = take(mat, findices, i, i)
+                for j in ppm:
+                    itgt = np.s_[rranges[j] - r0:rranges[j + 1] - r0]
+                    isrc = np.s_[cranges[j] - c0:cranges[j + 1] - c0]
+                    A[itgt, isrc] = 0.0
+
+                err_max = max(err_max, la.norm(A - cmat.D[i, i]) / la.norm(A))
+
+        logger.info('    clustered-diagonal-errors:')
+        logger.info('        max:        %.5e', err_max)
+
+        # NOTE: diagonals are just evaluated, so this should be exact
+        assert err_max < 1.0e-15
+
+        # }}}
+
+    # }}}
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase, TorusHSSTestCase])
+@pytest.mark.parametrize(("op_type", "matrix_mode"),
+        [("single", "forward"), ("double", "backward")])
+@pytest.mark.parametrize(("weighted_farfield", "matrix_type"),
+        [(None, "qbx"), (False, "p2p")])
+def test_hss_apply(ctx_factory, cls,
+        op_type, matrix_mode,
+        matrix_type, weighted_farfield,
+        visualize=False, **kwargs):
+    """Compare the action of an HSS-compressed operator to the dense matrix.
+
+    In "forward" mode the matrix-vector products are compared; otherwise
+    *hss* is applied to ``mat.dot(x)`` and is expected to recover ``x``.
+    """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if cls.ambient_dim == 3 or not USE_MATPLOTLIB:
+        visualize = False
+
+    case = cls(
+            op_type=op_type,
+            matrix_mode=matrix_mode,
+            matrix_type=matrix_type,
+            weighted_farfield=weighted_farfield,
+            nblocks=16,
+            proxy_radius_factor=1.1,
+            **kwargs)
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    dep_source = places.get_geometry(dd.geometry)
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+    indices, partition = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+
+    logger.info("nblocks %3d nnodes %7d", indices.nblocks, dep_discr.nnodes)
+
+    # }}}
+
+    # {{{ operators
+
+    op = case.get_operator(places.ambient_dim)
+
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
+
+    # }}}
+
+    # {{{ build compress matrix
+
+    rtol = 5 * 10.0 ** (case.ambient_dim + 1) * case.id_eps
+
+    from pytential.linalg.hss import build_compressed_matrix
+    hss = build_compressed_matrix(queue,
+            matrix_mode=case.matrix_mode,
+            places=places,
+            exprs=sym_op,
+            input_exprs=sym_density,
+            id_eps=case.id_eps,
+            context=case.knl_kwargs,
+
+            _proxy_radius_factor=case.proxy_radius_factor,
+            _tree_kind='adaptive',
+            _tree_max_particles_in_box=dep_discr.nnodes // case.nblocks,
+            _weighted_farfield=case.weighted_farfield,
+            _farfield_block_builder=case.farfield_block_builder,
+            _nearfield_block_builder=case.nearfield_block_builder)
+
+    from pytential.symbolic.execution import _prepare_expr
+    sym_op = _prepare_expr(places, sym_op)
+    mat = case.dense_matrix_builder(queue,
+            dep_expr=sym_density,
+            other_dep_exprs=[],
+            dep_source=dep_source,
+            dep_discr=dep_discr,
+            places=places,
+            context=case.knl_kwargs,
+            )(sym_op)
+
+    # }}}
+
+    # {{{ test
+
+    if case.matrix_mode == "forward":
+        x = np.random.rand(mat.shape[1])
+        b_org = mat.dot(x)
+        b_hss = hss.dot(x)
+        error = la.norm(b_hss - b_org, np.inf) / la.norm(b_org, np.inf)
+    else:
+        x_org = np.random.rand(mat.shape[1])
+        b = mat.dot(x_org)
+        x_hss = hss.dot(b)
+        error = la.norm(x_hss - x_org, np.inf) / la.norm(x_org, np.inf)
+
+    kappa = la.cond(mat)
+    logger.info("conditioning:  %.5e", kappa)
+    logger.info('error:         %.5e', error)
+
+    if visualize:
+        pt.figure(figsize=(10, 8), dpi=300)
+
+        if case.matrix_mode == "forward":
+            pt.plot(b_org, label='Full')
+            pt.plot(b_hss, '--', label='HSS')
+        else:
+            pt.plot(x_org, label="Full")
+            pt.plot(x_hss, '--', label="HSS")
+
+        pt.legend()
+        pt.savefig('test_linalg_hss_{}.png'.format(case.matrix_mode))
+        pt.close()
+
+    assert error < rtol
+
+    # }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) <= 1:
+        from pytest import main
+        main([__file__])  # no argument given: run every test in this file
+    else:
+        exec(sys.argv[1])  # otherwise run the statement given as argv[1]
+
+# vim: fdm=marker
diff --git a/test/test_linalg_hss_int_eq.py b/test/test_linalg_hss_int_eq.py
new file mode 100644
index 0000000000000000000000000000000000000000..6147c045ce799daef42a17b362a4064fa55247c3
--- /dev/null
+++ b/test/test_linalg_hss_int_eq.py
@@ -0,0 +1,220 @@
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+import pyopencl.array
+
+from pytential import bind, sym
+from pytential import GeometryCollection
+
+from extra_matrix_tools import build_block_index
+from extra_matrix_tools import CurveHSSTestCase
+
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+try:
+    import matplotlib.pyplot as pt
+    USE_MATPLOTLIB = True
+except ImportError:
+    USE_MATPLOTLIB = False
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.parametrize("cls", [CurveHSSTestCase])
+@pytest.mark.parametrize("op_type", ["single", "double"])
+def test_integral_equation(ctx_factory, cls, op_type, visualize=False):
+    """Solve an integral equation with the HSS inverse and log the errors."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if not USE_MATPLOTLIB:
+        visualize = False
+
+    case = cls(
+            knl_class_or_helmholtz_k=0, side=-1,
+            op_type=op_type,
+            matrix_mode="backward",
+            matrix_type="qbx",
+            weighted_farfield=None,
+            id_eps=1.0e-10,
+            proxy_radius_factor=1.0,
+            inner_radius=0.25, outer_radius=2.0,
+            target_order=4,
+            resolutions=[128])
+
+    logger.info("\n%s", str(case))
+
+    # {{{ geometry
+
+    qbx = case.get_layer_potential(ctx, case.resolutions[0], case.target_order)
+    point_sources, point_targets = case.get_test_sources_and_targets(
+            ctx, case.ambient_dim, nsources=10, ntargets=128)
+
+    dd = sym.as_dofdesc(case.name).copy(discr_stage=case.discr_stage)
+    places = GeometryCollection({
+        case.name: qbx,
+        "point_sources": point_sources,
+        "point_targets": point_targets
+        }, auto_where=dd)
+
+    dep_source = places.get_geometry(dd.geometry)
+    dep_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+    indices, partition = build_block_index(queue, dep_discr,
+            max_particles_in_box=case.max_particles_in_box,
+            nblocks=case.nblocks,
+            factor=case.partition_factor)
+
+    logger.info("nblocks %3d nnodes %7d", indices.nblocks, dep_discr.nnodes)
+
+    if visualize and case.ambient_dim == 2:
+        nodes = dep_discr.nodes().get(queue)
+        sources = point_sources.nodes()
+        targets = point_targets.nodes()
+
+        pt.figure(figsize=(10, 10), dpi=300)
+        pt.plot(nodes[0], nodes[1], 'o', label="Boundary")
+        pt.plot(sources[0], sources[1], 'o', label='Sources')
+        pt.plot(targets[0], targets[1], 'o', label='Targets')
+        pt.gca().set_aspect("equal")
+        pt.legend()
+        pt.savefig("test_linalg_hss_int_eq.png")
+        pt.close()
+
+    if visualize and case.ambient_dim == 3:
+        normals = bind(places,
+                sym.normal(case.ambient_dim).as_vector(),
+                auto_where=dd)(queue)
+
+        from meshmode.discretization.visualization import make_visualizer
+        boundary_visualizer = make_visualizer(queue, dep_discr,
+                case.target_order + 3)
+        boundary_visualizer.write_vtk_file("test_linalg_hss_normals.vtu", [
+            ("normal", normals)
+            ], overwrite=True)
+
+    # }}}
+
+    # {{{ operators
+
+    op = case.get_operator(places.ambient_dim)
+
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
+    sym_repr = op.representation(sym_density, qbx_forced_limit=2 * case.side)
+
+    sym_p2p = sym.IntG(
+            case.knl_class(case.ambient_dim),
+            op.get_density_var("charges"),
+            qbx_forced_limit=None,
+            **case.knl_kwargs_syms)
+
+    # }}}
+
+    # {{{ generate test data
+
+    charges = np.random.randn(point_sources.nnodes)
+    charges[-1] = -np.sum(charges[:-1])
+    charges = charges.astype(case.dtype)
+    charges = cl.array.to_device(queue, charges)
+
+    x_test = bind(places, sym_p2p,
+            auto_where=("point_sources", "point_targets"))(
+                    queue, charges=charges, **case.knl_kwargs).get(queue)
+    b = bind(places, sym_p2p,
+            auto_where=("point_sources", dd))(
+                    queue, charges=charges, **case.knl_kwargs).get(queue)
+    logger.info("evaluated test data...")
+
+    # }}}
+
+    # {{{ compute matrices
+
+    # TODO(review): the errors below are only logged, never asserted; the
+    # tolerance left commented out here was 10.0 ** case.ambient_dim * case.id_eps
+
+    from pytential.linalg.hss import build_compressed_matrix
+    hss = build_compressed_matrix(queue,
+            matrix_mode=case.matrix_mode,
+            places=places,
+            exprs=sym_op,
+            input_exprs=sym_density,
+            id_eps=case.id_eps,
+            context=case.knl_kwargs,
+
+            _proxy_radius_factor=case.proxy_radius_factor,
+            _tree_max_particles_in_box=case.max_particles_in_box,
+            _weighted_farfield=case.weighted_farfield,
+            _farfield_block_builder=case.farfield_block_builder,
+            _nearfield_block_builder=case.nearfield_block_builder)
+    logger.info("constructed hss matrix")
+
+    from pytential.symbolic.execution import _prepare_expr
+    sym_op = _prepare_expr(places, sym_op)
+    mat = case.dense_matrix_builder(queue,
+            dep_expr=sym_density,
+            other_dep_exprs=[],
+            dep_source=dep_source,
+            dep_discr=dep_discr,
+            places=places,
+            context=case.knl_kwargs,
+            )(sym_op)
+    logger.info("constructed full matrix %d x %d...", *mat.shape)
+
+    # }}}
+
+    # {{{ solve
+
+    # apply inverse of compressed matrix
+    x_hss = hss.dot(b)
+    logger.info("solved with hss...")
+
+    # compute error vs full matrix solution
+    from scipy.sparse.linalg import gmres
+    x_mat, _ = gmres(mat, b, x0=x_hss, tol=1.0e-4 * case.id_eps)
+    logger.info("solved with full matrix...")
+
+    error = la.norm(x_mat - x_hss, np.inf) / la.norm(x_mat, np.inf)
+    logger.info("conditioning: %.5e", la.cond(mat))
+    logger.info("error: %.5e", error)
+
+    # interpolate solution to stage1
+    x_hss_dev = cl.array.to_device(queue, x_hss)
+    x_hss_stage1 = x_hss_dev
+    if dd.discr_stage is not sym.QBX_SOURCE_STAGE1:
+        logger.info("downsampling to stage1...")
+
+        from pytential.symbolic.dof_connection import connection_from_dds
+        conn = connection_from_dds(places, dd.to_stage1(), dd)
+
+        from meshmode.discretization.connection import \
+            L2ProjectionInverseDiscretizationConnection
+        downsampler = L2ProjectionInverseDiscretizationConnection(conn)
+
+        x_hss_stage1 = downsampler(queue, x_hss_stage1)
+
+    # compute error vs exact solution
+    x_tgt = bind(places, sym_repr, auto_where=(dd.geometry, "point_targets"))(
+            queue, u=x_hss_stage1, **case.knl_kwargs).get(queue)
+    logger.info("evaluated approximate solution...")
+
+    error = la.norm(x_test - x_tgt, np.inf) / la.norm(x_test, np.inf)
+    logger.info("error: %.5e", error)
+
+    # }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) <= 1:
+        from pytest import main
+        main([__file__])  # no argument given: run every test in this file
+    else:
+        exec(sys.argv[1])  # otherwise run the statement given as argv[1]
+
+# vim: fdm=marker
diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py
deleted file mode 100644
index 1707a666026fa639984e96b59daebbd017b61639..0000000000000000000000000000000000000000
--- a/test/test_linalg_proxy.py
+++ /dev/null
@@ -1,362 +0,0 @@
-from __future__ import division, absolute_import, print_function
-
-__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-import numpy as np
-import numpy.linalg as la
-
-import pyopencl as cl
-from pyopencl.array import to_device
-
-from pytential import bind, sym
-from sumpy.tools import BlockIndexRanges
-from meshmode.mesh.generation import ( # noqa
-        ellipse, NArmedStarfish, generate_torus, make_curve_mesh)
-
-import pytest
-from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl
-        as pytest_generate_tests)
-
-
-def _build_qbx_discr(queue,
-        ndim=2,
-        nelements=30,
-        target_order=7,
-        qbx_order=4,
-        curve_f=None):
-
-    if curve_f is None:
-        curve_f = NArmedStarfish(5, 0.25)
-
-    if ndim == 2:
-        mesh = make_curve_mesh(curve_f,
-                np.linspace(0, 1, nelements + 1),
-                target_order)
-    elif ndim == 3:
-        mesh = generate_torus(10.0, 2.0, order=target_order)
-    else:
-        raise ValueError("unsupported ambient dimension")
-
-    from meshmode.discretization import Discretization
-    from meshmode.discretization.poly_element import \
-            InterpolatoryQuadratureSimplexGroupFactory
-    from pytential.qbx import QBXLayerPotentialSource
-    density_discr = Discretization(
-            queue.context, mesh,
-            InterpolatoryQuadratureSimplexGroupFactory(target_order))
-
-    qbx, _ = QBXLayerPotentialSource(density_discr,
-            fine_order=4 * target_order,
-            qbx_order=qbx_order,
-            fmm_order=False).with_refinement()
-
-    return qbx
-
-
-def _build_block_index(discr,
-                       nblks=10,
-                       factor=1.0,
-                       use_tree=True):
-
-    from pytential.linalg.proxy import partition_by_nodes
-
-    nnodes = discr.nnodes
-    max_particles_in_box = nnodes // nblks
-
-    # create index ranges
-    indices = partition_by_nodes(discr,
-                                 use_tree=use_tree,
-                                 max_nodes_in_box=max_particles_in_box)
-
-    # randomly pick a subset of points
-    if abs(factor - 1.0) > 1.0e-14:
-        with cl.CommandQueue(discr.cl_context) as queue:
-            indices = indices.get(queue)
-
-            indices_ = np.empty(indices.nblocks, dtype=np.object)
-            for i in range(indices.nblocks):
-                iidx = indices.block_indices(i)
-                isize = int(factor * len(iidx))
-                isize = max(1, min(isize, len(iidx)))
-
-                indices_[i] = np.sort(
-                        np.random.choice(iidx, size=isize, replace=False))
-
-            ranges_ = to_device(queue,
-                    np.cumsum([0] + [r.shape[0] for r in indices_]))
-            indices_ = to_device(queue, np.hstack(indices_))
-
-            indices = BlockIndexRanges(discr.cl_context,
-                                       indices_.with_queue(None),
-                                       ranges_.with_queue(None))
-
-    return indices
-
-
-def _plot_partition_indices(queue, discr, indices, **kwargs):
-    import matplotlib.pyplot as pt
-    indices = indices.get(queue)
-
-    args = [
-        kwargs.get("method", "unknown"),
-        "tree" if kwargs.get("use_tree", False) else "linear",
-        kwargs.get("pid", "stage1"),
-        discr.ambient_dim
-        ]
-
-    pt.figure(figsize=(10, 8), dpi=300)
-    pt.plot(np.diff(indices.ranges))
-    pt.savefig("test_partition_{0}_{1}_{3}d_ranges_{2}.png".format(*args))
-    pt.clf()
-
-    if discr.ambient_dim == 2:
-        sources = discr.nodes().get(queue)
-
-        pt.figure(figsize=(10, 8), dpi=300)
-
-        if indices.indices.shape[0] != discr.nnodes:
-            pt.plot(sources[0], sources[1], 'ko', alpha=0.5)
-        for i in range(indices.nblocks):
-            isrc = indices.block_indices(i)
-            pt.plot(sources[0][isrc], sources[1][isrc], 'o')
-
-        pt.xlim([-1.5, 1.5])
-        pt.ylim([-1.5, 1.5])
-        pt.savefig("test_partition_{0}_{1}_{3}d_{2}.png".format(*args))
-        pt.clf()
-    elif discr.ambient_dim == 3:
-        from meshmode.discretization import NoninterpolatoryElementGroupError
-        try:
-            discr.groups[0].basis()
-        except NoninterpolatoryElementGroupError:
-            return
-
-        from meshmode.discretization.visualization import make_visualizer
-        marker = -42.0 * np.ones(discr.nnodes)
-
-        for i in range(indices.nblocks):
-            isrc = indices.block_indices(i)
-            marker[isrc] = 10.0 * (i + 1.0)
-
-        vis = make_visualizer(queue, discr, 10)
-
-        filename = "test_partition_{0}_{1}_{3}d_{2}.png".format(*args)
-        vis.write_vtk_file(filename, [
-            ("marker", cl.array.to_device(queue, marker))
-            ])
-
-
-@pytest.mark.parametrize("use_tree", [True, False])
-@pytest.mark.parametrize("ndim", [2, 3])
-def test_partition_points(ctx_factory, use_tree, ndim, visualize=False):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    qbx = _build_qbx_discr(queue, ndim=ndim)
-    _build_block_index(qbx.density_discr,
-                       use_tree=use_tree,
-                       factor=0.6)
-
-
-@pytest.mark.parametrize("ndim", [2, 3])
-@pytest.mark.parametrize("factor", [1.0, 0.6])
-def test_proxy_generator(ctx_factory, ndim, factor, visualize=False):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    qbx = _build_qbx_discr(queue, ndim=ndim)
-    srcindices = _build_block_index(qbx.density_discr,
-            factor=factor)
-
-    from pytential.linalg.proxy import ProxyGenerator
-    generator = ProxyGenerator(qbx, ratio=1.1)
-    proxies, pxyranges, pxycenters, pxyradii = generator(queue, srcindices)
-
-    proxies = np.vstack([p.get() for p in proxies])
-    pxyranges = pxyranges.get()
-    pxycenters = np.vstack([c.get() for c in pxycenters])
-    pxyradii = pxyradii.get()
-
-    for i in range(srcindices.nblocks):
-        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
-
-        r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
-        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)
-
-    srcindices = srcindices.get(queue)
-    if visualize:
-        if qbx.ambient_dim == 2:
-            import matplotlib.pyplot as pt
-
-            density_nodes = qbx.density_discr.nodes().get(queue)
-            ci = bind(qbx, sym.expansion_centers(qbx.ambient_dim, -1))(queue)
-            ci = np.vstack([c.get(queue) for c in ci])
-            ce = bind(qbx, sym.expansion_centers(qbx.ambient_dim, +1))(queue)
-            ce = np.vstack([c.get(queue) for c in ce])
-            r = bind(qbx, sym.expansion_radii(qbx.ambient_dim))(queue).get()
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]
-
-                pt.figure(figsize=(10, 8))
-                axis = pt.gca()
-                for j in isrc:
-                    c = pt.Circle(ci[:, j], r[j], color='k', alpha=0.1)
-                    axis.add_artist(c)
-                    c = pt.Circle(ce[:, j], r[j], color='k', alpha=0.1)
-                    axis.add_artist(c)
-
-                pt.plot(density_nodes[0], density_nodes[1],
-                        'ko', ms=2.0, alpha=0.5)
-                pt.plot(density_nodes[0, srcindices.indices],
-                        density_nodes[1, srcindices.indices],
-                        'o', ms=2.0)
-                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
-                        'o', ms=2.0)
-                pt.plot(proxies[0, ipxy], proxies[1, ipxy],
-                        'o', ms=2.0)
-                pt.xlim([-1.5, 1.5])
-                pt.ylim([-1.5, 1.5])
-
-                filename = "test_proxy_generator_{}d_{:04}.png".format(ndim, i)
-                pt.savefig(filename, dpi=300)
-                pt.clf()
-        else:
-            from meshmode.discretization.visualization import make_visualizer
-            from meshmode.mesh.processing import ( # noqa
-                    affine_map, merge_disjoint_meshes)
-            from meshmode.discretization import Discretization
-            from meshmode.discretization.poly_element import \
-                InterpolatoryQuadratureSimplexGroupFactory
-
-            from meshmode.mesh.generation import generate_icosphere
-            ref_mesh = generate_icosphere(1, generator.nproxy)
-
-            # NOTE: this does not plot the actual proxy points
-            for i in range(srcindices.nblocks):
-                mesh = affine_map(ref_mesh,
-                    A=(pxyradii[i] * np.eye(ndim)),
-                    b=pxycenters[:, i].reshape(-1))
-
-                mesh = merge_disjoint_meshes([mesh, qbx.density_discr.mesh])
-                discr = Discretization(ctx, mesh,
-                    InterpolatoryQuadratureSimplexGroupFactory(10))
-
-                vis = make_visualizer(queue, discr, 10)
-                filename = "test_proxy_generator_{}d_{:04}.vtu".format(ndim, i)
-                vis.write_vtk_file(filename, [])
-
-
-@pytest.mark.parametrize("ndim", [2, 3])
-@pytest.mark.parametrize("factor", [1.0, 0.6])
-def test_interaction_points(ctx_factory, ndim, factor, visualize=False):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    qbx = _build_qbx_discr(queue, ndim=ndim)
-    srcindices = _build_block_index(qbx.density_discr,
-            factor=factor)
-
-    # generate proxy points
-    from pytential.linalg.proxy import ProxyGenerator
-    generator = ProxyGenerator(qbx)
-    _, _, pxycenters, pxyradii = generator(queue, srcindices)
-
-    from pytential.linalg.proxy import (  # noqa
-            gather_block_neighbor_points,
-            gather_block_interaction_points)
-    nbrindices = gather_block_neighbor_points(qbx.density_discr,
-            srcindices, pxycenters, pxyradii)
-    nodes, ranges = gather_block_interaction_points(qbx, srcindices)
-
-    srcindices = srcindices.get(queue)
-    nbrindices = nbrindices.get(queue)
-
-    for i in range(srcindices.nblocks):
-        isrc = srcindices.block_indices(i)
-        inbr = nbrindices.block_indices(i)
-
-        assert not np.any(np.isin(inbr, isrc))
-
-    if visualize:
-        if ndim == 2:
-            import matplotlib.pyplot as pt
-            density_nodes = qbx.density_discr.nodes().get(queue)
-            nodes = nodes.get(queue)
-            ranges = ranges.get(queue)
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                inbr = nbrindices.block_indices(i)
-                iall = np.s_[ranges[i]:ranges[i + 1]]
-
-                pt.figure(figsize=(10, 8))
-                pt.plot(density_nodes[0], density_nodes[1],
-                        'ko', ms=2.0, alpha=0.5)
-                pt.plot(density_nodes[0, srcindices.indices],
-                        density_nodes[1, srcindices.indices],
-                        'o', ms=2.0)
-                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
-                        'o', ms=2.0)
-                pt.plot(density_nodes[0, inbr], density_nodes[1, inbr],
-                        'o', ms=2.0)
-                pt.plot(nodes[0, iall], nodes[1, iall],
-                        'x', ms=2.0)
-                pt.xlim([-1.5, 1.5])
-                pt.ylim([-1.5, 1.5])
-
-                filename = "test_area_query_{}d_{:04}.png".format(ndim, i)
-                pt.savefig(filename, dpi=300)
-                pt.clf()
-        elif ndim == 3:
-            from meshmode.discretization.visualization import make_visualizer
-            marker = np.empty(qbx.density_discr.nnodes)
-
-            for i in range(srcindices.nblocks):
-                isrc = srcindices.block_indices(i)
-                inbr = nbrindices.block_indices(i)
-
-                marker.fill(0.0)
-                marker[srcindices.indices] = 0.0
-                marker[isrc] = -42.0
-                marker[inbr] = +42.0
-                marker_dev = cl.array.to_device(queue, marker)
-
-                vis = make_visualizer(queue, qbx.density_discr, 10)
-                filename = "test_area_query_{}d_{:04}.vtu".format(ndim, i)
-                vis.write_vtk_file(filename, [
-                    ("marker", marker_dev),
-                    ])
-
-
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) > 1:
-        exec(sys.argv[1])
-    else:
-        from pytest import main
-        main([__file__])
-
-# vim: fdm=marker
diff --git a/test/test_matrix.py b/test/test_matrix.py
index 66441657cf0b39a0256d84500fb43120e7362486..112b7330ba7e7bdc4f6030ee648fd025ff5e2fff 100644
--- a/test/test_matrix.py
+++ b/test/test_matrix.py
@@ -2,7 +2,7 @@ from __future__ import division, absolute_import, print_function
 
 __copyright__ = """
 Copyright (C) 2015 Andreas Kloeckner
-Copyright (C) 2018 Alexandru Fikl
+Copyright (C) 2018-2020 Alexandru Fikl
 """
 
 __license__ = """
@@ -35,199 +35,109 @@ import pyopencl.array   # noqa
 
 from pytools.obj_array import make_obj_array, is_obj_array
 
+from sumpy.tools import vector_to_device, vector_from_device
+from sumpy.tools import MatrixBlockIndexRanges
 from sumpy.symbolic import USE_SYMENGINE
-from meshmode.mesh.generation import (  # noqa
-        ellipse, NArmedStarfish, make_curve_mesh, generate_torus)
 
 from pytential import bind, sym
+from pytential import GeometryCollection
+
+from meshmode.mesh.generation import ellipse, NArmedStarfish
 
 import pytest
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
+from extra_geometry_tools import (
+        CurveIntEqTestCase,
+        StarfishIntEqTestCase,
+        TorusIntEqTestCase
+        )
+from extra_matrix_tools import build_block_index, block_max_error
 
-def _build_qbx_discr(queue,
-        ndim=2,
-        nelements=30,
-        target_order=7,
-        qbx_order=4,
-        curve_f=None):
-
-    if curve_f is None:
-        curve_f = NArmedStarfish(5, 0.25)
-
-    if ndim == 2:
-        mesh = make_curve_mesh(curve_f,
-                np.linspace(0, 1, nelements + 1),
-                target_order)
-    elif ndim == 3:
-        mesh = generate_torus(10.0, 2.0, order=target_order)
-    else:
-        raise ValueError("unsupported ambient dimension")
-
-    from meshmode.discretization import Discretization
-    from meshmode.discretization.poly_element import \
-            InterpolatoryQuadratureSimplexGroupFactory
-    from pytential.qbx import QBXLayerPotentialSource
-    density_discr = Discretization(
-            queue.context, mesh,
-            InterpolatoryQuadratureSimplexGroupFactory(target_order))
-
-    qbx, _ = QBXLayerPotentialSource(density_discr,
-            fine_order=4 * target_order,
-            qbx_order=qbx_order,
-            fmm_order=False).with_refinement()
-
-    return qbx
+try:
+    import matplotlib.pyplot as pt
+    USE_MATPLOTLIB = True
+except ImportError:
+    USE_MATPLOTLIB = False
 
+import logging
+logger = logging.getLogger(__name__)
 
-def _build_block_index(discr, nblks=10, factor=1.0):
-    nnodes = discr.nnodes
-    max_particles_in_box = nnodes // nblks
 
-    from pytential.linalg.proxy import partition_by_nodes
-    indices = partition_by_nodes(discr, use_tree=True,
-                                 max_nodes_in_box=max_particles_in_box)
-
-    # randomly pick a subset of points
-    from sumpy.tools import MatrixBlockIndexRanges, BlockIndexRanges
-    if abs(factor - 1.0) > 1.0e-14:
-        with cl.CommandQueue(discr.cl_context) as queue:
-            indices = indices.get(queue)
-
-            indices_ = np.empty(indices.nblocks, dtype=np.object)
-            for i in range(indices.nblocks):
-                iidx = indices.block_indices(i)
-                isize = int(factor * len(iidx))
-                isize = max(1, min(isize, len(iidx)))
-
-                indices_[i] = np.sort(
-                        np.random.choice(iidx, size=isize, replace=False))
+@pytest.mark.skipif(USE_SYMENGINE,
+        reason="https://gitlab.tiker.net/inducer/sumpy/issues/25")
+@pytest.mark.parametrize("k", [0, 42])
+@pytest.mark.parametrize("op_type", ["scalar_mixed", "vector_mixed"])
+@pytest.mark.parametrize("curve_fn", [
+    partial(ellipse, 3),
+    NArmedStarfish(5, 0.25)
+    ])
+def test_matrix_build(ctx_factory, k, op_type, curve_fn, visualize=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-            ranges_ = cl.array.to_device(queue,
-                    np.cumsum([0] + [r.shape[0] for r in indices_]))
-            indices_ = cl.array.to_device(queue, np.hstack(indices_))
+    # prevent cache explosion
+    from sympy.core.cache import clear_cache
+    clear_cache()
 
-            indices = BlockIndexRanges(discr.cl_context,
-                                       indices_.with_queue(None),
-                                       ranges_.with_queue(None))
+    if not USE_MATPLOTLIB:
+        visualize = False
 
-    indices = MatrixBlockIndexRanges(indices.cl_context,
-                                     indices, indices)
+    case = CurveIntEqTestCase(
+            knl_class_or_helmholtz_k=k,
+            curve_fn=curve_fn,
+            op_type=op_type,
+            qbx_order=4,
+            target_order=7)
 
-    return indices
+    logger.info("\n%s", str(case))
 
+    # {{{ geometry
 
-def _build_op(lpot_id,
-              k=0,
-              ndim=2,
-              source=sym.DEFAULT_SOURCE,
-              target=sym.DEFAULT_TARGET,
-              qbx_forced_limit="avg"):
+    nelements = 32
+    qbx = case.get_layer_potential(ctx, nelements, case.target_order)
 
-    from sumpy.kernel import LaplaceKernel, HelmholtzKernel
-    if k:
-        knl = HelmholtzKernel(ndim)
-        knl_kwargs = {"k": k}
-    else:
-        knl = LaplaceKernel(ndim)
-        knl_kwargs = {}
-
-    lpot_kwargs = {
-            "qbx_forced_limit": qbx_forced_limit,
-            "source": source,
-            "target": target}
-    lpot_kwargs.update(knl_kwargs)
-    if lpot_id == 1:
-        # scalar single-layer potential
-        u_sym = sym.var("u")
-        op = sym.S(knl, u_sym, **lpot_kwargs)
-    elif lpot_id == 2:
-        # scalar combination of layer potentials
-        u_sym = sym.var("u")
-        op = sym.S(knl, 0.3 * u_sym, **lpot_kwargs) \
-             + sym.D(knl, 0.5 * u_sym, **lpot_kwargs)
-    elif lpot_id == 3:
-        # vector potential
-        u_sym = sym.make_sym_vector("u", 2)
-        u0_sym, u1_sym = u_sym
-
-        op = make_obj_array([
-            sym.Sp(knl, u0_sym, **lpot_kwargs)
-            + sym.D(knl, u1_sym, **lpot_kwargs),
-            sym.S(knl, 0.4 * u0_sym, **lpot_kwargs)
-            + 0.3 * sym.D(knl, u0_sym, **lpot_kwargs)
-            ])
-    else:
-        raise ValueError("Unknown lpot_id: {}".format(lpot_id))
+    from pytential import GeometryCollection
+    from pytential.qbx.refinement import refine_geometry_collection
+    places = GeometryCollection(qbx, auto_where=case.name)
 
-    op = 0.5 * u_sym + op
+    k = getattr(case, "k", 0)
+    kernel_length_scale = 5.0 / k if k != 0 else None
+    places = refine_geometry_collection(queue, places,
+            kernel_length_scale=kernel_length_scale)
 
-    return op, u_sym, knl_kwargs
+    dd = places.auto_source
+    density_discr = places.get_discretization(dd.geometry)
 
+    # }}}
 
-def _max_block_error(mat, blk, index_set):
-    error = -np.inf
-    for i in range(index_set.nblocks):
-        mat_i = index_set.take(mat, i)
-        blk_i = index_set.block_take(blk, i)
+    # {{{ operator
 
-        error = max(error, la.norm(mat_i - blk_i) / la.norm(mat_i))
+    op = case.get_operator(places.ambient_dim)
 
-    return error
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
+    bound_op = bind(places, sym_op)
 
+    # }}}
 
-@pytest.mark.skipif(USE_SYMENGINE,
-        reason="https://gitlab.tiker.net/inducer/sumpy/issues/25")
-@pytest.mark.parametrize("k", [0, 42])
-@pytest.mark.parametrize("curve_f", [
-    partial(ellipse, 3),
-    NArmedStarfish(5, 0.25)])
-@pytest.mark.parametrize("lpot_id", [2, 3])
-def test_matrix_build(ctx_factory, k, curve_f, lpot_id, visualize=False):
-    cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
-
-    # prevent cache 'splosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
-
-    target_order = 7
-    qbx_order = 4
-    nelements = 30
-    mesh = make_curve_mesh(curve_f,
-            np.linspace(0, 1, nelements + 1),
-            target_order)
-
-    from meshmode.discretization import Discretization
-    from meshmode.discretization.poly_element import \
-            InterpolatoryQuadratureSimplexGroupFactory
-    pre_density_discr = Discretization(
-            cl_ctx, mesh,
-            InterpolatoryQuadratureSimplexGroupFactory(target_order))
-
-    from pytential.qbx import QBXLayerPotentialSource
-    qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4 * target_order,
-            qbx_order,
-            # Don't use FMM for now
-            fmm_order=False).with_refinement()
-    density_discr = qbx.density_discr
-
-    op, u_sym, knl_kwargs = _build_op(lpot_id, k=k)
-    bound_op = bind(qbx, op)
+    # {{{ construct matrix
 
     from pytential.symbolic.execution import build_matrix
-    mat = build_matrix(queue, qbx, op, u_sym).get()
+    mat = build_matrix(queue, places, sym_op, sym_density,
+            context=case.knl_kwargs).get()
 
     if visualize:
         from sumpy.tools import build_matrix as build_matrix_via_matvec
-        mat2 = bound_op.scipy_op(queue, "u", dtype=mat.dtype, **knl_kwargs)
+        mat2 = bound_op.scipy_op(queue, "u", dtype=mat.dtype, **op.kernel_arguments)
         mat2 = build_matrix_via_matvec(mat2)
-        print(la.norm((mat - mat2).real, "fro") / la.norm(mat2.real, "fro"),
-              la.norm((mat - mat2).imag, "fro") / la.norm(mat2.imag, "fro"))
 
-        import matplotlib.pyplot as pt
+        err_r = la.norm((mat - mat2).real, "fro") / la.norm(mat2.real, "fro")
+        err_i = la.norm((mat - mat2).imag, "fro") / la.norm(mat2.imag, "fro")
+        logger.info("error: real %.5e imag %.5e", err_r, err_i)
+
         pt.subplot(121)
         pt.imshow(np.log10(np.abs(1.0e-20 + (mat - mat2).real)))
         pt.colorbar()
@@ -237,7 +147,6 @@ def test_matrix_build(ctx_factory, k, curve_f, lpot_id, visualize=False):
         pt.show()
 
     if visualize:
-        import matplotlib.pyplot as pt
         pt.subplot(121)
         pt.imshow(mat.real)
         pt.colorbar()
@@ -246,165 +155,123 @@ def test_matrix_build(ctx_factory, k, curve_f, lpot_id, visualize=False):
         pt.colorbar()
         pt.show()
 
-    from sumpy.tools import vector_to_device, vector_from_device
+    # }}}
+
+    # {{{ check by multiplying against random densities
+
+    nsamples = 5
     np.random.seed(12)
-    for i in range(5):
-        if is_obj_array(u_sym):
+    for i in range(nsamples):
+        if is_obj_array(sym_density):
             u = make_obj_array([
                 np.random.randn(density_discr.nnodes)
-                for _ in range(len(u_sym))
+                for _ in range(len(sym_density))
                 ])
         else:
             u = np.random.randn(density_discr.nnodes)
-
         u_dev = vector_to_device(queue, u)
+
         res_matvec = np.hstack(
                 list(vector_from_device(
-                    queue, bound_op(queue, u=u_dev))))
+                    queue, bound_op(queue, u=u_dev, **case.knl_kwargs))))
 
         res_mat = mat.dot(np.hstack(list(u)))
 
         abs_err = la.norm(res_mat - res_matvec, np.inf)
         rel_err = abs_err / la.norm(res_matvec, np.inf)
 
-        print("AbsErr {:.5e} RelErr {:.5e}".format(abs_err, rel_err))
-        assert rel_err < 1e-13
+        logger.info("AbsErr %.5e RelErr %.5e", abs_err, rel_err)
+        assert rel_err < 1e-13, 'iteration: {}'.format(i)
 
 
-@pytest.mark.parametrize("ndim", [2, 3])
+@pytest.mark.parametrize("cls", [
+    StarfishIntEqTestCase, TorusIntEqTestCase
+    ])
+@pytest.mark.parametrize("matrix_type", ["qbx", "p2p"])
+@pytest.mark.parametrize("op_type", ["single", "scalar_mixed"])
 @pytest.mark.parametrize("factor", [1.0, 0.6])
-@pytest.mark.parametrize("lpot_id", [1, 2])
-def test_p2p_block_builder(ctx_factory, factor, ndim, lpot_id,
-                           visualize=False):
+def test_block_builder(ctx_factory,
+        cls, matrix_type, op_type, factor, visualize=False):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    # prevent cache explosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
+    if not USE_MATPLOTLIB:
+        visualize = False
 
-    place_ids = (
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_SOURCE,
-                discr_stage=sym.QBX_SOURCE_STAGE1),
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_TARGET,
-                discr_stage=sym.QBX_SOURCE_STAGE1),
-            )
-    target_order = 2 if ndim == 3 else 7
+    if matrix_type not in ["qbx", "p2p"]:
+        raise ValueError("unknown matrix type: {}".format(matrix_type))
 
-    qbx = _build_qbx_discr(queue, target_order=target_order, ndim=ndim)
-    op, u_sym, _ = _build_op(lpot_id, ndim=ndim,
-            source=place_ids[0],
-            target=place_ids[1])
-    index_set = _build_block_index(qbx.density_discr, factor=factor)
+    target_order = 7 if cls.ambient_dim == 2 else 2
+    resolution = 32 if cls.ambient_dim == 2 else 0
 
-    from pytential.symbolic.execution import GeometryCollection
-    from pytential.symbolic.execution import _prepare_expr
-    places = GeometryCollection(qbx, auto_where=place_ids)
-    expr = _prepare_expr(places, op)
+    case = cls(
+            op_type=op_type,
+            partiton_factor=factor,
+            qbx_order=4,
+            target_order=target_order,
+            )
 
-    from pytential.symbolic.matrix import P2PMatrixBuilder
-    mbuilder = P2PMatrixBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            context={},
-            exclude_self=True)
-    mat = mbuilder(expr)
+    logger.info("\n%s", str(case))
 
-    from pytential.symbolic.matrix import FarFieldBlockBuilder
-    mbuilder = FarFieldBlockBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            index_set=index_set,
-            context={},
-            exclude_self=True)
-    blk = mbuilder(expr)
+    # {{{ geometry
 
-    index_set = index_set.get(queue)
-    if visualize and ndim == 2:
-        blk_full = np.zeros_like(mat)
-        mat_full = np.zeros_like(mat)
+    qbx = case.get_layer_potential(ctx, resolution, case.target_order)
 
-        for i in range(index_set.nblocks):
-            itgt, isrc = index_set.block_indices(i)
+    source_dd = sym.as_dofdesc(case.name).to_stage2()
+    target_dd = sym.as_dofdesc(case.name).to_stage1()
+    places = GeometryCollection(qbx, auto_where=(source_dd, target_dd))
 
-            blk_full[np.ix_(itgt, isrc)] = index_set.block_take(blk, i)
-            mat_full[np.ix_(itgt, isrc)] = index_set.take(mat, i)
+    dep_source = places.get_geometry(source_dd.geometry)
+    dep_discr = places.get_discretization(
+            source_dd.geometry, source_dd.discr_stage)
 
-        import matplotlib.pyplot as mp
-        _, (ax1, ax2) = mp.subplots(1, 2,
-                figsize=(10, 8), dpi=300, constrained_layout=True)
-        ax1.imshow(blk_full)
-        ax1.set_title('FarFieldBlockBuilder')
-        ax2.imshow(mat_full)
-        ax2.set_title('P2PMatrixBuilder')
-        mp.savefig("test_p2p_block_{}d_{:.1f}.png".format(ndim, factor))
+    index_set, _ = build_block_index(queue, dep_discr, factor=factor)
+    index_set = MatrixBlockIndexRanges(ctx, index_set, index_set)
 
-    assert _max_block_error(mat, blk, index_set) < 1.0e-14
+    # }}}
 
+    # {{{ operators
 
-@pytest.mark.parametrize("factor", [1.0, 0.6])
-@pytest.mark.parametrize("ndim", [2, 3])
-@pytest.mark.parametrize("lpot_id", [1, 2])
-def test_qbx_block_builder(ctx_factory, factor, ndim, lpot_id,
-                           visualize=False):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
+    op = case.get_operator(places.ambient_dim)
 
-    # prevent cache explosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
 
-    place_ids = (
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_SOURCE,
-                discr_stage=sym.QBX_SOURCE_STAGE2),
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_TARGET,
-                discr_stage=sym.QBX_SOURCE_STAGE2),
-            )
-    target_order = 2 if ndim == 3 else 7
+    from pytential.symbolic.execution import _prepare_expr
+    expr = _prepare_expr(places, sym_op)
 
-    qbx = _build_qbx_discr(queue, target_order=target_order, ndim=ndim)
-    op, u_sym, _ = _build_op(lpot_id, ndim=ndim,
-            source=place_ids[0],
-            target=place_ids[1],
-            qbx_forced_limit="avg")
+    # }}}
 
-    from pytential.symbolic.execution import GeometryCollection, _prepare_expr
-    places = GeometryCollection(qbx, auto_where=place_ids)
-    expr = _prepare_expr(places, op)
-    density_discr = places.get_discretization(place_ids[0])
-    index_set = _build_block_index(density_discr, factor=factor)
+    # {{{ check blocks are correctly constructed
 
-    from pytential.symbolic.matrix import NearFieldBlockBuilder
-    mbuilder = NearFieldBlockBuilder(queue,
-            dep_expr=u_sym,
+    mat_kwargs = dict(
+            dep_expr=sym_density,
             other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
+            dep_source=dep_source,
+            dep_discr=dep_discr,
             places=places,
-            index_set=index_set,
-            context={})
-    blk = mbuilder(expr)
+            context=case.knl_kwargs,
+            )
 
-    from pytential.symbolic.matrix import MatrixBuilder
-    mbuilder = MatrixBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            context={})
+    if matrix_type == "qbx":
+        from pytential.symbolic.matrix import MatrixBuilder
+        mbuilder = MatrixBuilder(queue, **mat_kwargs)
+    else:
+        from pytential.symbolic.matrix import P2PMatrixBuilder
+        mbuilder = P2PMatrixBuilder(queue, **mat_kwargs)
     mat = mbuilder(expr)
 
+    if matrix_type == "qbx":
+        from pytential.symbolic.matrix import NearFieldBlockBuilder
+        bbuilder = NearFieldBlockBuilder(queue, index_set=index_set, **mat_kwargs)
+    else:
+        from pytential.symbolic.matrix import FarFieldBlockBuilder
+        bbuilder = FarFieldBlockBuilder(queue, index_set=index_set, **mat_kwargs)
+
+    blk = bbuilder(expr)
+
+    # }}}
+
     index_set = index_set.get(queue)
     if visualize:
         blk_full = np.zeros_like(mat)
@@ -416,109 +283,100 @@ def test_qbx_block_builder(ctx_factory, factor, ndim, lpot_id,
             blk_full[np.ix_(itgt, isrc)] = index_set.block_take(blk, i)
             mat_full[np.ix_(itgt, isrc)] = index_set.take(mat, i)
 
-        import matplotlib.pyplot as mp
-        _, (ax1, ax2) = mp.subplots(1, 2,
+        _, (ax1, ax2) = pt.subplots(1, 2,
                 figsize=(10, 8), constrained_layout=True)
         ax1.imshow(mat_full)
-        ax1.set_title('MatrixBuilder')
+        ax1.set_title(type(mbuilder).__name__)
         ax2.imshow(blk_full)
-        ax2.set_title('NearFieldBlockBuilder')
-        mp.savefig("test_qbx_block_builder.png", dpi=300)
+        ax2.set_title(type(bbuilder).__name__)
+        pt.savefig("test_{}_block_builder.png".format(matrix_type), dpi=300)
 
-    assert _max_block_error(mat, blk, index_set) < 1.0e-14
+    assert block_max_error(mat, blk, index_set) < 1.0e-14
 
 
 @pytest.mark.parametrize(('source_discr_stage', 'target_discr_stage'),
         [(sym.QBX_SOURCE_STAGE1, sym.QBX_SOURCE_STAGE1),
-         (sym.QBX_SOURCE_QUAD_STAGE2, sym.QBX_SOURCE_QUAD_STAGE2)])
+         (sym.QBX_SOURCE_STAGE2, sym.QBX_SOURCE_STAGE2)])
 def test_build_matrix_places(ctx_factory,
-        source_discr_stage, target_discr_stage, visualize=False):
+        source_discr_stage, target_discr_stage):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    # prevent cache explosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
+    # {{{ geometry
 
-    qbx_forced_limit = -1
-    place_ids = (
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_SOURCE,
-                discr_stage=source_discr_stage),
-            sym.DOFDescriptor(
-                geometry=sym.DEFAULT_TARGET,
-                discr_stage=target_discr_stage),
-            )
+    case = CurveIntEqTestCase(
+            side=-1,
+            op_type="single",
+            target_order=2, qbx_order=4)
+
+    nelements = 8
+    qbx = case.get_layer_potential(ctx, nelements, case.target_order)
+
+    source_dd = sym.as_dofdesc(case.name).copy(discr_stage=source_discr_stage)
+    target_dd = sym.as_dofdesc(case.name).copy(discr_stage=target_discr_stage)
+    places = GeometryCollection(qbx, auto_where=(source_dd, target_dd))
+
+    dep_source = places.get_geometry(source_dd.geometry)
+    dep_target = places.get_discretization(
+            target_dd.geometry, target_dd.discr_stage)
+    dep_discr = places.get_discretization(
+            source_dd.geometry, source_dd.discr_stage)
 
-    # build test operators
-    qbx = _build_qbx_discr(queue, nelements=8, target_order=2, ndim=2,
-                           curve_f=partial(ellipse, 1.0))
+    index_set, _ = build_block_index(queue, dep_discr, factor=0.6)
+    index_set = MatrixBlockIndexRanges(ctx, index_set, index_set)
 
-    op, u_sym, _ = _build_op(lpot_id=1, ndim=2,
-            source=place_ids[0],
-            target=place_ids[1],
-            qbx_forced_limit=qbx_forced_limit)
+    # }}}
 
-    from pytential.symbolic.execution import GeometryCollection
-    places = GeometryCollection(qbx, auto_where=place_ids)
-    source_discr = places.get_discretization(place_ids[0])
-    target_discr = places.get_discretization(place_ids[1])
+    # {{{ operators
 
-    index_set = _build_block_index(source_discr, factor=0.6)
+    op = case.get_operator(places.ambient_dim)
+
+    sym_density = op.get_density_var("u")
+    sym_op = op.operator(sym_density)
 
     from pytential.symbolic.execution import _prepare_expr
-    op = _prepare_expr(places, op)
+    expr = _prepare_expr(places, sym_op)
 
-    # build full QBX matrix
-    from pytential.symbolic.matrix import MatrixBuilder
-    mbuilder = MatrixBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            context={})
-    qbx_mat = mbuilder(op)
+    # }}}
 
-    # build full p2p matrix
-    from pytential.symbolic.matrix import P2PMatrixBuilder
-    mbuilder = P2PMatrixBuilder(queue,
-            dep_expr=u_sym,
+    # {{{ check matching shapes
+
+    mat_kwargs = dict(
+            dep_expr=sym_density,
             other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
+            dep_source=dep_source,
+            dep_discr=dep_discr,
             places=places,
-            context={})
-    p2p_mat = mbuilder(op)
+            context=case.knl_kwargs,
+            )
+
+    # qbx
+    from pytential.symbolic.matrix import MatrixBuilder
+    mbuilder = MatrixBuilder(queue, **mat_kwargs)
+    qbx_mat = mbuilder(expr)
 
-    assert p2p_mat.shape == (target_discr.nnodes, source_discr.nnodes)
+    assert qbx_mat.shape == (dep_target.nnodes, dep_discr.nnodes)
 
-    # build block qbx and p2p matrices
     from pytential.symbolic.matrix import NearFieldBlockBuilder
-    mbuilder = NearFieldBlockBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            index_set=index_set,
-            context={})
-    mat = mbuilder(op)
-    if place_ids[0].discr_stage is not None:
-        assert _max_block_error(qbx_mat, mat, index_set.get(queue)) < 1.0e-14
+    mbuilder = NearFieldBlockBuilder(queue, index_set=index_set, **mat_kwargs)
+    qbx_blk = mbuilder(expr)
+
+    assert block_max_error(qbx_mat, qbx_blk, index_set.get(queue)) < 1.0e-14
+    del qbx_mat
+    del qbx_blk
+
+    # p2p
+    from pytential.symbolic.matrix import P2PMatrixBuilder
+    mbuilder = P2PMatrixBuilder(queue, **mat_kwargs)
+    p2p_mat = mbuilder(expr)
+
+    assert p2p_mat.shape == (dep_target.nnodes, dep_discr.nnodes)
 
     from pytential.symbolic.matrix import FarFieldBlockBuilder
-    mbuilder = FarFieldBlockBuilder(queue,
-            dep_expr=u_sym,
-            other_dep_exprs=[],
-            dep_source=places.get_geometry(place_ids[0]),
-            dep_discr=places.get_discretization(place_ids[0]),
-            places=places,
-            index_set=index_set,
-            context={},
-            exclude_self=True)
-    mat = mbuilder(op)
-    assert _max_block_error(p2p_mat, mat, index_set.get(queue)) < 1.0e-14
+    mbuilder = FarFieldBlockBuilder(queue, index_set=index_set, **mat_kwargs)
+    p2p_blk = mbuilder(expr)
+
+    assert block_max_error(p2p_mat, p2p_blk, index_set.get(queue)) < 1.0e-14
 
 
 if __name__ == "__main__":
diff --git a/test/test_maxwell.py b/test/test_maxwell.py
index 5adee9d328edd080755bd8e2a206e478b4980d63..2e950193e17daf410cf8ef9c4d4fa74cae915670 100644
--- a/test/test_maxwell.py
+++ b/test/test_maxwell.py
@@ -218,7 +218,8 @@ class EHField(object):
     #tc_int,
     tc_ext,
     ])
-def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
+def test_pec_mfie_extinction(ctx_factory, case,
+        use_plane_wave=False, visualize=False):
     """For (say) is_interior=False (the 'exterior' MFIE), this test verifies
     extinction of the combined (incoming + scattered) field on the interior
     of the scatterer.
@@ -252,32 +253,23 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
     rng = cl.clrandom.PhiloxGenerator(cl_ctx, seed=12)
     src_j = rng.normal(queue, (3, test_source.nnodes), dtype=np.float64)
 
-    def eval_inc_field_at(tgt):
-        if 0:
+    def eval_inc_field_at(places, source=None, target=None):
+        if source is None:
+            source = "test_source"
+
+        if use_plane_wave:
             # plane wave
-            return bind(
-                    tgt,
+            return bind(places,
                     get_sym_maxwell_plane_wave(
                         amplitude_vec=np.array([1, 1, 1]),
                         v=np.array([1, 0, 0]),
-                        omega=case.k)
-                    )(queue)
+                        omega=case.k),
+                    auto_where=target)(queue)
         else:
             # point source
-            return bind(
-                    (test_source, tgt),
-                    get_sym_maxwell_point_source(mfie.kernel, j_sym, mfie.k)
-                    )(queue, j=src_j, k=case.k)
-
-    pde_test_inc = EHField(
-            vector_from_device(queue, eval_inc_field_at(calc_patch_tgt)))
-
-    source_maxwell_resids = [
-            calc_patch.norm(x, np.inf) / calc_patch.norm(pde_test_inc.e, np.inf)
-            for x in frequency_domain_maxwell(
-                calc_patch, pde_test_inc.e, pde_test_inc.h, case.k)]
-    print("Source Maxwell residuals:", source_maxwell_resids)
-    assert max(source_maxwell_resids) < 1e-6
+            return bind(places,
+                    get_sym_maxwell_point_source(mfie.kernel, j_sym, mfie.k),
+                    auto_where=(source, target))(queue, j=src_j, k=case.k)
 
     # }}}
 
@@ -297,35 +289,73 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
     from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
 
     for resolution in case.resolutions:
+        places = {}
         scat_mesh = case.get_mesh(resolution, case.target_order)
         observation_mesh = case.get_observation_mesh(case.target_order)
 
         pre_scat_discr = Discretization(
                 cl_ctx, scat_mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(case.target_order))
-        qbx, _ = QBXLayerPotentialSource(
+        qbx = QBXLayerPotentialSource(
                 pre_scat_discr, fine_order=4*case.target_order,
                 qbx_order=case.qbx_order,
                 fmm_level_to_order=SimpleExpansionOrderFinder(
                     case.fmm_tolerance),
-                fmm_backend=case.fmm_backend
-                ).with_refinement(_expansion_disturbance_tolerance=0.05)
-        h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+                fmm_backend=case.fmm_backend,
+                )
 
         scat_discr = qbx.density_discr
         obs_discr = Discretization(
                 cl_ctx, observation_mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(case.target_order))
 
-        inc_field_scat = EHField(eval_inc_field_at(scat_discr))
-        inc_field_obs = EHField(eval_inc_field_at(obs_discr))
+        places.update({
+            sym.DEFAULT_SOURCE: qbx,
+            sym.DEFAULT_TARGET: qbx.density_discr,
+            "test_source": test_source,
+            "scat_discr": scat_discr,
+            "obs_discr": obs_discr,
+            "patch_target": calc_patch_tgt,
+            })
+
+        if visualize:
+            qbx_tgt_tol = qbx.copy(target_association_tolerance=0.2)
+
+            fplot = make_field_plotter_from_bbox(
+                    find_bounding_box(scat_discr.mesh), h=(0.05, 0.05, 0.3),
+                    extend_factor=0.3)
+            fplot_tgt = PointsTarget(cl.array.to_device(queue, fplot.points))
+
+            places.update({
+                "qbx_target_tol": qbx_tgt_tol,
+                "plot_targets": fplot_tgt,
+                })
+
+        from pytential import GeometryCollection
+        places = GeometryCollection(places)
+        density_discr = places.get_discretization(places.auto_source.geometry)
 
         # {{{ system solve
 
+        h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
+
+        pde_test_inc = EHField(vector_from_device(queue,
+            eval_inc_field_at(places, target="patch_target")))
+
+        source_maxwell_resids = [
+                calc_patch.norm(x, np.inf) / calc_patch.norm(pde_test_inc.e, np.inf)
+                for x in frequency_domain_maxwell(
+                    calc_patch, pde_test_inc.e, pde_test_inc.h, case.k)]
+        print("Source Maxwell residuals:", source_maxwell_resids)
+        assert max(source_maxwell_resids) < 1e-6
+
+        inc_field_scat = EHField(eval_inc_field_at(places, target="scat_discr"))
+        inc_field_obs = EHField(eval_inc_field_at(places, target="obs_discr"))
+
         inc_xyz_sym = EHField(sym.make_sym_vector("inc_fld", 6))
 
-        bound_j_op = bind(qbx, mfie.j_operator(loc_sign, jt_sym))
-        j_rhs = bind(qbx, mfie.j_rhs(inc_xyz_sym.h))(
+        bound_j_op = bind(places, mfie.j_operator(loc_sign, jt_sym))
+        j_rhs = bind(places, mfie.j_rhs(inc_xyz_sym.h))(
                 queue, inc_fld=inc_field_scat.field, **knl_kwargs)
 
         gmres_settings = dict(
@@ -340,8 +370,8 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
 
         jt = gmres_result.solution
 
-        bound_rho_op = bind(qbx, mfie.rho_operator(loc_sign, rho_sym))
-        rho_rhs = bind(qbx, mfie.rho_rhs(jt_sym, inc_xyz_sym.e))(
+        bound_rho_op = bind(places, mfie.rho_operator(loc_sign, rho_sym))
+        rho_rhs = bind(places, mfie.rho_rhs(jt_sym, inc_xyz_sym.e))(
                 queue, jt=jt, inc_fld=inc_field_scat.field, **knl_kwargs)
 
         gmres_result = gmres(
@@ -352,20 +382,21 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
 
         # }}}
 
-        jxyz = bind(qbx, sym.tangential_to_xyz(jt_sym))(queue, jt=jt)
+        jxyz = bind(places, sym.tangential_to_xyz(jt_sym))(queue, jt=jt)
 
         # {{{ volume eval
 
         sym_repr = mfie.scattered_volume_field(jt_sym, rho_sym)
 
-        def eval_repr_at(tgt, source=None):
+        def eval_repr_at(places, source=None, target=None):
             if source is None:
-                source = qbx
+                source = sym.DEFAULT_SOURCE
 
-            return bind((source, tgt), sym_repr)(queue, jt=jt, rho=rho, **knl_kwargs)
+            return bind(places, sym_repr, auto_where=(source, target))(
+                    queue, jt=jt, rho=rho, **knl_kwargs)
 
-        pde_test_repr = EHField(
-                vector_from_device(queue, eval_repr_at(calc_patch_tgt)))
+        pde_test_repr = EHField(vector_from_device(queue,
+            eval_repr_at(places, target="patch_target")))
 
         maxwell_residuals = [
                 calc_patch.norm(x, np.inf) / calc_patch.norm(pde_test_repr.e, np.inf)
@@ -384,12 +415,12 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
         pec_bc_e = sym.n_cross(bc_repr.e + inc_xyz_sym.e)
         pec_bc_h = sym.normal(3).as_vector().dot(bc_repr.h + inc_xyz_sym.h)
 
-        eh_bc_values = bind(qbx, sym.join_fields(pec_bc_e, pec_bc_h))(
+        eh_bc_values = bind(places, sym.join_fields(pec_bc_e, pec_bc_h))(
                     queue, jt=jt, rho=rho, inc_fld=inc_field_scat.field,
                     **knl_kwargs)
 
         def scat_norm(f):
-            return norm(qbx, queue, f, p=np.inf)
+            return norm(density_discr, queue, f, p=np.inf)
 
         e_bc_residual = scat_norm(eh_bc_values[:3]) / scat_norm(inc_field_scat.e)
         h_bc_residual = scat_norm(eh_bc_values[3]) / scat_norm(inc_field_scat.h)
@@ -406,8 +437,9 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
             from meshmode.discretization.visualization import make_visualizer
             bdry_vis = make_visualizer(queue, scat_discr, case.target_order+3)
 
-            bdry_normals = bind(scat_discr, sym.normal(3))(queue)\
-                    .as_vector(dtype=object)
+            bdry_normals = bind(places,
+                    sym.normal(3, dofdesc="scat_discr")
+                    )(queue).as_vector(dtype=object)
 
             bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [
                 ("j", jxyz),
@@ -419,17 +451,10 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
                 ("h_bc_residual", eh_bc_values[3]),
                 ])
 
-            fplot = make_field_plotter_from_bbox(
-                    find_bounding_box(scat_discr.mesh), h=(0.05, 0.05, 0.3),
-                    extend_factor=0.3)
-
             from pytential.qbx import QBXTargetAssociationFailedException
-
-            qbx_tgt_tol = qbx.copy(target_association_tolerance=0.2)
-
-            fplot_tgt = PointsTarget(cl.array.to_device(queue, fplot.points))
             try:
-                fplot_repr = eval_repr_at(fplot_tgt, source=qbx_tgt_tol)
+                fplot_repr = eval_repr_at(places,
+                        target="plot_targets", source="qbx_target_tol")
             except QBXTargetAssociationFailedException as e:
                 fplot.write_vtk_file(
                         "failed-targets.vts",
@@ -439,9 +464,8 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
                 raise
 
             fplot_repr = EHField(vector_from_device(queue, fplot_repr))
-
-            fplot_inc = EHField(
-                    vector_from_device(queue, eval_inc_field_at(fplot_tgt)))
+            fplot_inc = EHField(vector_from_device(queue,
+                eval_inc_field_at(places, target="plot_targets")))
 
             fplot.write_vtk_file(
                     "potential-%s.vts" % resolution,
@@ -457,7 +481,7 @@ def test_pec_mfie_extinction(ctx_factory, case, visualize=False):
 
         # {{{ error in E, H
 
-        obs_repr = EHField(eval_repr_at(obs_discr))
+        obs_repr = EHField(eval_repr_at(places, target="obs_discr"))
 
         def obs_norm(f):
             return norm(obs_discr, queue, f, p=np.inf)
diff --git a/test/test_scalar_int_eq.py b/test/test_scalar_int_eq.py
index 82faf4cc118e97e1c8dd3464a711cec484ea77a4..bc197395771f2ed3e82b709bbe9ef00fb4ca7839 100644
--- a/test/test_scalar_int_eq.py
+++ b/test/test_scalar_int_eq.py
@@ -22,24 +22,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-
+from functools import partial
 import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
 import pyopencl.clmath  # noqa
+
 import pytest
-from pytools import Record
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
-from functools import partial
 from meshmode.mesh.generation import (  # noqa
         ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
         make_curve_mesh)
 from meshmode.discretization.visualization import make_visualizer
+
 from sumpy.symbolic import USE_SYMENGINE
+
 from pytential import bind, sym
 from pytential.qbx import QBXTargetAssociationFailedException
+from pytential import GeometryCollection
 from sumpy.kernel import LaplaceKernel, HelmholtzKernel, BiharmonicKernel
 
 import logging
@@ -434,7 +436,7 @@ class BetterplaneIntEqTestCase(IntEqTestCase):
 
 # {{{ test backend
 
-def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
+def run_int_eq_test(cl_ctx, queue, case, resolution, visualize=False):
     mesh = case.get_mesh(resolution, case.target_order)
     print("%d elements" % mesh.nelements)
 
@@ -464,27 +466,72 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
         else:
             qbx_lpot_kwargs["fmm_order"] = case.qbx_order + 5
 
+    if case.prob_side == -1:
+        test_src_geo_radius = case.outer_radius
+        test_tgt_geo_radius = case.inner_radius
+    elif case.prob_side == +1:
+        test_src_geo_radius = case.inner_radius
+        test_tgt_geo_radius = case.outer_radius
+    elif case.prob_side == "scat":
+        test_src_geo_radius = case.outer_radius
+        test_tgt_geo_radius = case.outer_radius
+    else:
+        raise ValueError("unknown problem_side")
+
+    # {{{ construct geometries
+
     qbx = QBXLayerPotentialSource(
             pre_density_discr,
             fine_order=source_order,
             qbx_order=case.qbx_order,
 
+            _disable_refinement=not case.use_refinement,
             _box_extent_norm=getattr(case, "box_extent_norm", None),
             _from_sep_smaller_crit=getattr(case, "from_sep_smaller_crit", None),
             _from_sep_smaller_min_nsources_cumul=30,
             fmm_backend=case.fmm_backend, **qbx_lpot_kwargs)
 
+    from pytential.source import PointPotentialSource
+    point_sources = make_circular_point_group(
+            mesh.ambient_dim, 10, test_src_geo_radius,
+            func=lambda x: x**1.5)
+    point_source = PointPotentialSource(cl_ctx, point_sources)
+
+    from pytential.target import PointsTarget
+    test_targets = make_circular_point_group(
+            mesh.ambient_dim, 20, test_tgt_geo_radius)
+    point_target = PointsTarget(test_targets)
+
+    if visualize:
+        vis_grid_spacing = (0.1, 0.1, 0.1)[:qbx.ambient_dim]
+        if hasattr(case, "vis_grid_spacing"):
+            vis_grid_spacing = case.vis_grid_spacing
+
+        vis_extend_factor = 0.2
+        if hasattr(case, "vis_extend_factor"):
+            vis_extend_factor = case.vis_extend_factor
+
+        from sumpy.visualization import make_field_plotter_from_bbox  # noqa
+        from meshmode.mesh.processing import find_bounding_box
+        fplot = make_field_plotter_from_bbox(
+                find_bounding_box(mesh),
+                h=vis_grid_spacing,
+                extend_factor=vis_extend_factor)
+
+        from pytential.target import PointsTarget
+        plot_targets = PointsTarget(fplot.points)
+
     if case.use_refinement:
         if case.knl_class == HelmholtzKernel and \
                 getattr(case, "refine_on_helmholtz_k", True):
             refiner_extra_kwargs["kernel_length_scale"] = 5/case.k
 
         if hasattr(case, "scaled_max_curvature_threshold"):
-            refiner_extra_kwargs["_scaled_max_curvature_threshold"] = \
+            refiner_extra_kwargs["scaled_max_curvature_threshold"] = \
                     case.scaled_max_curvature_threshold
 
         if hasattr(case, "expansion_disturbance_tolerance"):
-            refiner_extra_kwargs["_expansion_disturbance_tolerance"] = \
+            refiner_extra_kwargs["expansion_disturbance_tolerance"] = \
                     case.expansion_disturbance_tolerance
 
         if hasattr(case, "refinement_maxiter"):
@@ -492,21 +539,47 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
         #refiner_extra_kwargs["visualize"] = True
 
+    places = {
+        sym.DEFAULT_SOURCE: qbx,
+        sym.DEFAULT_TARGET: qbx,
+        "point_source": point_source,
+        "point_target": point_target
+        }
+    if visualize:
+        places.update({
+            "qbx_target_tol": qbx.copy(target_association_tolerance=0.15),
+            "plot_targets": plot_targets
+            })
+
+    places = GeometryCollection(places)
+    if case.use_refinement:
+        from pytential.qbx.refinement import refine_geometry_collection
+        places = refine_geometry_collection(queue, places,
+                **refiner_extra_kwargs)
+
+    dd = sym.as_dofdesc(sym.DEFAULT_SOURCE).to_stage1()
+    density_discr = places.get_discretization(dd.geometry)
+
+    if case.use_refinement:
         print("%d elements before refinement" % pre_density_discr.mesh.nelements)
-        qbx, _ = qbx.with_refinement(**refiner_extra_kwargs)
+
+        discr = places.get_discretization(dd.geometry, sym.QBX_SOURCE_STAGE1)
         print("%d stage-1 elements after refinement"
-                % qbx.density_discr.mesh.nelements)
+                % discr.mesh.nelements)
+
+        discr = places.get_discretization(dd.geometry, sym.QBX_SOURCE_STAGE2)
         print("%d stage-2 elements after refinement"
-                % qbx.stage2_density_discr.mesh.nelements)
+                % discr.mesh.nelements)
+
+        discr = places.get_discretization(dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
         print("quad stage-2 elements have %d nodes"
-                % qbx.quad_stage2_density_discr.groups[0].nunit_nodes)
+                % discr.groups[0].nunit_nodes)
 
-    density_discr = qbx.density_discr
+    # }}}
 
     if hasattr(case, "visualize_geometry") and case.visualize_geometry:
-        bdry_normals = bind(
-                density_discr, sym.normal(mesh.ambient_dim)
-                )(queue).as_vector(dtype=object)
+        bdry_normals = bind(places, sym.normal(mesh.ambient_dim))(
+                queue).as_vector(dtype=object)
 
         bdry_vis = make_visualizer(queue, density_discr, case.target_order)
         bdry_vis.write_vtk_file("geometry.vtu", [
@@ -515,27 +588,25 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
     # {{{ plot geometry
 
-    if 0:
+    if visualize:
         if mesh.ambient_dim == 2:
             # show geometry, centers, normals
             nodes_h = density_discr.nodes().get(queue=queue)
+            normal = bind(places, sym.normal(2))(queue).as_vector(object)
+
             pt.plot(nodes_h[0], nodes_h[1], "x-")
-            normal = bind(density_discr, sym.normal(2))(queue).as_vector(np.object)
             pt.quiver(nodes_h[0], nodes_h[1],
                     normal[0].get(queue), normal[1].get(queue))
             pt.gca().set_aspect("equal")
             pt.show()
-
         elif mesh.ambient_dim == 3:
-            bdry_vis = make_visualizer(queue, density_discr, case.target_order+3)
-
-            bdry_normals = bind(density_discr, sym.normal(3))(queue)\
-                    .as_vector(dtype=object)
+            bdry_normals = bind(places, sym.normal(3))(
+                    queue).as_vector(dtype=object)
 
+            bdry_vis = make_visualizer(queue, density_discr, case.target_order+3)
             bdry_vis.write_vtk_file("pre-solve-source-%s.vtu" % resolution, [
                 ("bdry_normals", bdry_normals),
                 ])
-
         else:
             raise ValueError("invalid mesh dim")
 
@@ -577,24 +648,6 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
     # {{{ set up test data
 
-    if case.prob_side == -1:
-        test_src_geo_radius = case.outer_radius
-        test_tgt_geo_radius = case.inner_radius
-    elif case.prob_side == +1:
-        test_src_geo_radius = case.inner_radius
-        test_tgt_geo_radius = case.outer_radius
-    elif case.prob_side == "scat":
-        test_src_geo_radius = case.outer_radius
-        test_tgt_geo_radius = case.outer_radius
-    else:
-        raise ValueError("unknown problem_side")
-
-    point_sources = make_circular_point_group(
-            mesh.ambient_dim, 10, test_src_geo_radius,
-            func=lambda x: x**1.5)
-    test_targets = make_circular_point_group(
-            mesh.ambient_dim, 20, test_tgt_geo_radius)
-
     np.random.seed(22)
     source_charges = np.random.randn(point_sources.shape[1])
     source_charges[-1] = -np.sum(source_charges[:-1])
@@ -607,28 +660,24 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
     # {{{ establish BCs
 
-    from pytential.source import PointPotentialSource
-    from pytential.target import PointsTarget
-
-    point_source = PointPotentialSource(cl_ctx, point_sources)
-
     pot_src = sym.IntG(
         # FIXME: qbx_forced_limit--really?
         knl, sym.var("charges"), qbx_forced_limit=None, **knl_kwargs_syms)
 
-    test_direct = bind((point_source, PointsTarget(test_targets)), pot_src)(
+    test_direct = bind(places, pot_src,
+            auto_where=("point_source", "point_target"))(
             queue, charges=source_charges_dev, **concrete_knl_kwargs)
 
     if case.bc_type == "dirichlet":
-        bc = bind((point_source, density_discr), pot_src)(
-                queue, charges=source_charges_dev, **concrete_knl_kwargs)
+        bc = bind(places, pot_src,
+                auto_where=("point_source", sym.DEFAULT_TARGET))(
+                        queue, charges=source_charges_dev, **concrete_knl_kwargs)
 
     elif case.bc_type == "neumann":
-        bc = bind(
-                (point_source, density_discr),
-                sym.normal_derivative(
-                    qbx.ambient_dim, pot_src, dofdesc=sym.DEFAULT_TARGET)
-                )(queue, charges=source_charges_dev, **concrete_knl_kwargs)
+        bc = bind(places, sym.normal_derivative(
+            qbx.ambient_dim, pot_src, dofdesc=sym.DEFAULT_TARGET),
+            auto_where=("point_source", sym.DEFAULT_TARGET))(
+                    queue, charges=source_charges_dev, **concrete_knl_kwargs)
 
     elif case.bc_type == "clamped_plate":
         bc_u = bind((point_source, density_discr), pot_src)(
@@ -644,8 +693,8 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
     # {{{ solve
 
-    bound_op = bind(qbx, op_u)
-    rhs = bind(density_discr, op.prepare_rhs(op.get_density_var("bc")))(queue, bc=bc)
+    bound_op = bind(places, op_u)
+    rhs = bind(places, op.prepare_rhs(op.get_density_var("bc")))(queue, bc=bc)
 
     try:
         from pytential.solve import gmres
@@ -677,23 +726,19 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
                 bound_op.scipy_op(
                     queue, arg_name="u", dtype=dtype, **concrete_knl_kwargs))
         w, v = la.eig(mat)
-        if 0:
+        if visualize:
             pt.imshow(np.log10(1e-20+np.abs(mat)))
             pt.colorbar()
             pt.show()
 
-        #assert abs(s[-1]) < 1e-13, "h
-        #assert abs(s[-2]) > 1e-7
-        #from pudb import set_trace; set_trace()
-
     # }}}
 
     if case.prob_side != "scat":
         # {{{ error check
 
-        points_target = PointsTarget(test_targets)
-        bound_tgt_op = bind((qbx, points_target),
-                op.representation(op.get_density_var("u")))
+        bound_tgt_op = bind(places,
+                op.representation(op.get_density_var("u")),
+                auto_where=(sym.DEFAULT_SOURCE, "point_target"))
 
         test_via_bdry = bound_tgt_op(queue, u=weighted_u, **concrete_knl_kwargs)
 
@@ -729,22 +774,23 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
     # {{{ test gradient
 
     if case.check_gradient and case.prob_side != "scat":
-        bound_grad_op = bind((qbx, points_target),
+        bound_grad_op = bind(places,
                 op.representation(
                     op.get_density_var("u"),
                     map_potentials=lambda pot: sym.grad(mesh.ambient_dim, pot),
-                    qbx_forced_limit=None))
+                    qbx_forced_limit=None),
+                auto_where=(sym.DEFAULT_SOURCE, "point_target"))
 
         #print(bound_t_deriv_op.code)
 
         grad_from_src = bound_grad_op(
                 queue, u=weighted_u, **concrete_knl_kwargs)
 
-        grad_ref = (bind(
-                (point_source, points_target),
-                sym.grad(mesh.ambient_dim, pot_src)
-                )(queue, charges=source_charges_dev, **concrete_knl_kwargs)
-                )
+        grad_ref = bind(places,
+                sym.grad(mesh.ambient_dim, pot_src),
+                auto_where=("point_source", "point_target"))(queue,
+                        charges=source_charges_dev,
+                        **concrete_knl_kwargs)
 
         grad_err = (grad_from_src - grad_ref)
 
@@ -759,24 +805,23 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
     # {{{ test tangential derivative
 
     if case.check_tangential_deriv and case.prob_side != "scat":
-        bound_t_deriv_op = bind(qbx,
+        bound_t_deriv_op = bind(places,
                 op.representation(
                     op.get_density_var("u"),
-                    map_potentials=lambda pot: sym.tangential_derivative(2, pot),
+                    map_potentials=lambda pot:
+                    sym.tangential_derivative(qbx.ambient_dim, pot),
                     qbx_forced_limit=loc_sign))
 
-        #print(bound_t_deriv_op.code)
-
         tang_deriv_from_src = bound_t_deriv_op(
                 queue, u=weighted_u, **concrete_knl_kwargs).as_scalar().get()
 
-        tang_deriv_ref = (bind(
-                (point_source, density_discr),
-                sym.tangential_derivative(2, pot_src)
-                )(queue, charges=source_charges_dev, **concrete_knl_kwargs)
-                .as_scalar().get())
+        tang_deriv_ref = bind(places,
+                sym.tangential_derivative(qbx.ambient_dim, pot_src),
+                auto_where=("point_source", sym.DEFAULT_TARGET))(queue,
+                        charges=source_charges_dev,
+                        **concrete_knl_kwargs).as_scalar().get()
 
-        if 0:
+        if visualize:
             pt.plot(tang_deriv_ref.real)
             pt.plot(tang_deriv_from_src.real)
             pt.show()
@@ -795,44 +840,24 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
     # {{{ any-D file plotting
 
     if visualize:
-        bdry_vis = make_visualizer(queue, density_discr, case.target_order+3)
-
-        bdry_normals = bind(density_discr, sym.normal(qbx.ambient_dim))(queue)\
-                .as_vector(dtype=object)
+        bdry_normals = bind(places, sym.normal(qbx.ambient_dim))(
+                queue).as_vector(dtype=object)
 
         sym_sqrt_j = sym.sqrt_jac_q_weight(density_discr.ambient_dim)
-        u = bind(density_discr, op.get_density_var("u")/sym_sqrt_j)(queue,
-                u=weighted_u)
+        u = bind(places, op.get_density_var("u") / sym_sqrt_j)(queue, u=weighted_u)
 
+        bdry_vis = make_visualizer(queue, density_discr, case.target_order+3)
         bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [
             ("u", u),
             ("bc", bc),
             #("bdry_normals", bdry_normals),
             ])
 
-        from sumpy.visualization import make_field_plotter_from_bbox  # noqa
-        from meshmode.mesh.processing import find_bounding_box
-
-        vis_grid_spacing = (0.1, 0.1, 0.1)[:qbx.ambient_dim]
-        if hasattr(case, "vis_grid_spacing"):
-            vis_grid_spacing = case.vis_grid_spacing
-        vis_extend_factor = 0.2
-        if hasattr(case, "vis_extend_factor"):
-            vis_grid_spacing = case.vis_grid_spacing
-
-        fplot = make_field_plotter_from_bbox(
-                find_bounding_box(mesh),
-                h=vis_grid_spacing,
-                extend_factor=vis_extend_factor)
-
-        qbx_tgt_tol = qbx.copy(target_association_tolerance=0.15)
-        from pytential.target import PointsTarget
-
         try:
-            solved_pot = bind(
-                    (qbx_tgt_tol, PointsTarget(fplot.points)),
-                    op.representation(op.get_density_var("u"))
-                    )(queue, u=weighted_u, **concrete_knl_kwargs)
+            solved_pot = bind(places,
+                    op.representation(op.get_density_var("u")),
+                    auto_where=("qbx_target_tol", "plot_targets"))(
+                            queue, u=weighted_u, **concrete_knl_kwargs)
         except QBXTargetAssociationFailedException as e:
             fplot.write_vtk_file(
                     "failed-targets.vts",
@@ -843,17 +868,21 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
         ones_density = density_discr.zeros(queue)
         ones_density.fill(1)
-        indicator = bind(
-                (qbx_tgt_tol, PointsTarget(fplot.points)),
-                -sym.D(LaplaceKernel(density_discr.ambient_dim),
-                    op.get_density_var("sigma"),
-                    qbx_forced_limit=None))(
-                queue, sigma=ones_density).get()
+
+        indicator = -sym.D(LaplaceKernel(qbx.ambient_dim),
+                op.get_density_var("sigma"),
+                qbx_forced_limit=None)
+        indicator = bind(places, indicator,
+                auto_where=("qbx_target_tol", "plot_targets"))(
+                        queue, sigma=ones_density).get()
 
         solved_pot = solved_pot.get()
 
-        true_pot = bind((point_source, PointsTarget(fplot.points)), pot_src)(
-                queue, charges=source_charges_dev, **concrete_knl_kwargs).get()
+        true_pot = bind(places, pot_src,
+                auto_where=("point_source", "plot_targets"))(
+                        queue,
+                        charges=source_charges_dev,
+                        **concrete_knl_kwargs).get()
 
         #fplot.show_scalar_in_mayavi(solved_pot.real, max_val=5)
         if case.prob_side == "scat":
@@ -877,11 +906,8 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):
 
     # }}}
 
-    class Result(Record):
-        pass
-
-    h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
-    return Result(
+    h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
+    return dict(
             h_max=h_max,
             rel_err_2=rel_err_2,
             rel_err_inf=rel_err_inf,
@@ -936,17 +962,17 @@ def test_integral_equation(ctx_factory, case, visualize=False):
         result = run_int_eq_test(cl_ctx, queue, case, resolution,
                 visualize=visualize)
 
-        if result.rel_err_2 is not None:
+        if result["rel_err_2"] is not None:
             have_error_data = True
-            eoc_rec_target.add_data_point(result.h_max, result.rel_err_2)
+            eoc_rec_target.add_data_point(result["h_max"], result["rel_err_2"])
 
-        if result.rel_td_err_inf is not None:
-            eoc_rec_td.add_data_point(result.h_max, result.rel_td_err_inf)
+        if result["rel_td_err_inf"] is not None:
+            eoc_rec_td.add_data_point(result["h_max"], result["rel_td_err_inf"])
 
     if case.bc_type == "dirichlet":
         tgt_order = case.qbx_order
     elif case.bc_type == "neumann":
-        tgt_order = case.qbx_order-1
+        tgt_order = case.qbx_order - 1
     elif case.bc_type == "clamped_plate":
         tgt_order = case.qbx_order
     else:
diff --git a/test/test_stokes.py b/test/test_stokes.py
index e0b95000632d3637c61b43e62cba2597bb2c6475..19167efe102673c5a5350b89002b9b6f28b01509 100644
--- a/test/test_stokes.py
+++ b/test/test_stokes.py
@@ -39,13 +39,14 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
 from pytential import bind, sym, norm  # noqa
+from pytential import GeometryCollection
 from pytential.solve import gmres
 import logging
 
 
 def run_exterior_stokes_2d(ctx_factory, nelements,
         mesh_order=4, target_order=4, qbx_order=4,
-        fmm_order=10, mu=1, circle_rad=1.5, do_plot=False):
+        fmm_order=10, mu=1, circle_rad=1.5, visualize=False):
 
     # This program tests an exterior Stokes flow in 2D using the
     # compound representation given in Hsiao & Kress,
@@ -59,6 +60,8 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
 
     ovsmp_target_order = 4*target_order
 
+    # {{{ geometries
+
     from meshmode.mesh.generation import (  # noqa
             make_curve_mesh, starfish, ellipse, drop)
     mesh = make_curve_mesh(
@@ -71,16 +74,47 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
 
     from pytential.qbx import QBXLayerPotentialSource
     target_association_tolerance = 0.05
-    qbx, _ = QBXLayerPotentialSource(
+    qbx = QBXLayerPotentialSource(
             coarse_density_discr, fine_order=ovsmp_target_order, qbx_order=qbx_order,
             fmm_order=fmm_order,
             target_association_tolerance=target_association_tolerance,
             _expansions_in_tree_have_extent=True,
-            ).with_refinement()
+            )
+
+    def circle_mask(test_points, radius):
+        return (test_points[0, :]**2 + test_points[1, :]**2 > radius**2)
+
+    def outside_circle(test_points, radius):
+        mask = circle_mask(test_points, radius)
+        return np.array([
+            row[mask]
+            for row in test_points])
+
+    from pytential.target import PointsTarget
+    nsamp = 30
+    eval_points_1d = np.linspace(-3., 3., nsamp)
+    eval_points = np.zeros((2, len(eval_points_1d)**2))
+    eval_points[0, :] = np.tile(eval_points_1d, len(eval_points_1d))
+    eval_points[1, :] = np.repeat(eval_points_1d, len(eval_points_1d))
+    eval_points = outside_circle(eval_points, radius=circle_rad)
+    point_targets = PointsTarget(eval_points)
+
+    fplot = FieldPlotter(np.zeros(2), extent=6, npoints=100)
+    plot_targets = PointsTarget(outside_circle(fplot.points, radius=circle_rad))
+
+    places = GeometryCollection({
+        sym.DEFAULT_SOURCE: qbx,
+        sym.DEFAULT_TARGET: qbx.density_discr,
+        "point_target": point_targets,
+        "plot_target": plot_targets,
+        })
 
-    density_discr = qbx.density_discr
-    normal = bind(density_discr, sym.normal(2).as_vector())(queue)
-    path_length = bind(density_discr, sym.integral(2, 1, 1))(queue)
+    density_discr = places.get_discretization(sym.DEFAULT_SOURCE)
+
+    normal = bind(places, sym.normal(2).as_vector())(queue)
+    path_length = bind(places, sym.integral(2, 1, 1))(queue)
+
+    # }}}
 
     # {{{ describe bvp
 
@@ -104,13 +138,13 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
     bdry_op_sym = (
             -loc_sign * 0.5 * sigma_sym
             - stresslet_obj.apply(sigma_sym, nvec_sym, mu_sym,
-                qbx_forced_limit='avg')
+                qbx_forced_limit="avg")
             + stokeslet_obj.apply(meanless_sigma_sym, mu_sym,
-                qbx_forced_limit='avg') - (0.5/np.pi) * int_sigma)
+                qbx_forced_limit="avg") - (0.5/np.pi) * int_sigma)
 
     # }}}
 
-    bound_op = bind(qbx, bdry_op_sym)
+    bound_op = bind(places, bdry_op_sym)
 
     # {{{ fix rhs and solve
 
@@ -152,9 +186,9 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
     omega = [
             cl.array.to_device(queue, (strength/path_length)*np.ones(len(nodes[0]))),
             cl.array.to_device(queue, np.zeros(len(nodes[0])))]
-    bvp_rhs = bind(
-            qbx, sym.make_sym_vector("bc", dim) + u_A_sym_bdry
-            )(queue, bc=bc, mu=mu, omega=omega)
+    bvp_rhs = bind(places,
+            sym.make_sym_vector("bc", dim) + u_A_sym_bdry)(queue,
+                    bc=bc, mu=mu, omega=omega)
     gmres_result = gmres(
             bound_op.scipy_op(queue, "sigma", np.float64, mu=mu, normal=normal),
             bvp_rhs,
@@ -169,7 +203,7 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
 
     sigma = gmres_result.solution
     sigma_int_val_sym = sym.make_sym_vector("sigma_int_val", 2)
-    int_val = bind(qbx, sym.integral(2, 1, sigma_sym))(queue, sigma=sigma)
+    int_val = bind(places, sym.integral(2, 1, sigma_sym))(queue, sigma=sigma)
     int_val = -int_val/(2 * np.pi)
     print("int_val = ", int_val)
 
@@ -182,35 +216,22 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
                 meanless_sigma_sym, mu_sym, qbx_forced_limit=2)
             - u_A_sym_vol + sigma_int_val_sym)
 
-    nsamp = 30
-    eval_points_1d = np.linspace(-3., 3., nsamp)
-    eval_points = np.zeros((2, len(eval_points_1d)**2))
-    eval_points[0, :] = np.tile(eval_points_1d, len(eval_points_1d))
-    eval_points[1, :] = np.repeat(eval_points_1d, len(eval_points_1d))
-
-    def circle_mask(test_points, radius):
-        return (test_points[0, :]**2 + test_points[1, :]**2 > radius**2)
-
-    def outside_circle(test_points, radius):
-        mask = circle_mask(test_points, radius)
-        return np.array([
-            row[mask]
-            for row in test_points])
-
-    eval_points = outside_circle(eval_points, radius=circle_rad)
-    from pytential.target import PointsTarget
-    vel = bind(
-            (qbx, PointsTarget(eval_points)),
-            representation_sym)(queue, sigma=sigma, mu=mu, normal=normal,
-                    sigma_int_val=int_val, omega=omega)
+    where = (sym.DEFAULT_SOURCE, "point_target")
+    vel = bind(places, representation_sym, auto_where=where)(queue,
+            sigma=sigma,
+            mu=mu,
+            normal=normal,
+            sigma_int_val=int_val,
+            omega=omega)
     print("@@@@@@@@")
 
-    fplot = FieldPlotter(np.zeros(2), extent=6, npoints=100)
-    plot_pts = outside_circle(fplot.points, radius=circle_rad)
-    plot_vel = bind(
-          (qbx, PointsTarget(plot_pts)),
-          representation_sym)(queue, sigma=sigma, mu=mu, normal=normal,
-                  sigma_int_val=int_val, omega=omega)
+    plot_vel = bind(places, representation_sym,
+            auto_where=(sym.DEFAULT_SOURCE, "plot_target"))(queue,
+                    sigma=sigma,
+                    mu=mu,
+                    normal=normal,
+                    sigma_int_val=int_val,
+                    omega=omega)
 
     def get_obj_array(obj_array):
         return make_obj_array([
@@ -251,9 +272,7 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
     print("max rel error at sampled points: ",
             max(abs(rel_err[0])), max(abs(rel_err[1])))
 
-    if do_plot:
-        import matplotlib
-        matplotlib.use("Agg")
+    if visualize:
         import matplotlib.pyplot as plt
 
         full_pot = np.zeros_like(fplot.points) * float("nan")
@@ -270,7 +289,7 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
 
     # }}}
 
-    h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue)
+    h_max = bind(places, sym.h_max(qbx.ambient_dim))(queue)
     return h_max, l2_err
 
 
diff --git a/test/test_symbolic.py b/test/test_symbolic.py
index 12145d692b12616037f54c20839fa579510071f5..37b86c12a485b419b05f8b6878923870f1169a41 100644
--- a/test/test_symbolic.py
+++ b/test/test_symbolic.py
@@ -219,10 +219,9 @@ def test_layer_potential_construction(lpot_class, ambient_dim=2):
 
 
 @pytest.mark.parametrize(("name", "source_discr_stage", "target_granularity"), [
-    ("default", None, None),
-    ("default-explicit", sym.QBX_SOURCE_STAGE1, sym.GRANULARITY_NODE),
+    ("default_explicit", sym.QBX_SOURCE_STAGE1, sym.GRANULARITY_NODE),
     ("stage2", sym.QBX_SOURCE_STAGE2, sym.GRANULARITY_NODE),
-    ("stage2-center", sym.QBX_SOURCE_STAGE2, sym.GRANULARITY_CENTER),
+    ("stage2_center", sym.QBX_SOURCE_STAGE2, sym.GRANULARITY_CENTER),
     ("quad", sym.QBX_SOURCE_QUAD_STAGE2, sym.GRANULARITY_NODE)
     ])
 def test_interpolation(ctx_factory, name, source_discr_stage, target_granularity):
@@ -233,6 +232,16 @@ def test_interpolation(ctx_factory, name, source_discr_stage, target_granularity
     target_order = 7
     qbx_order = 4
 
+    where = sym.as_dofdesc("test_interpolation")
+    from_dd = sym.DOFDescriptor(
+            geometry=where.geometry,
+            discr_stage=source_discr_stage,
+            granularity=sym.GRANULARITY_NODE)
+    to_dd = sym.DOFDescriptor(
+            geometry=where.geometry,
+            discr_stage=sym.QBX_SOURCE_QUAD_STAGE2,
+            granularity=target_granularity)
+
     mesh = make_curve_mesh(starfish,
             np.linspace(0.0, 1.0, nelements + 1),
             target_order)
@@ -240,44 +249,36 @@ def test_interpolation(ctx_factory, name, source_discr_stage, target_granularity
             InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
     from pytential.qbx import QBXLayerPotentialSource
-    qbx, _ = QBXLayerPotentialSource(discr,
+    qbx = QBXLayerPotentialSource(discr,
             fine_order=4 * target_order,
             qbx_order=qbx_order,
-            fmm_order=False).with_refinement()
+            fmm_order=False)
 
-    where = 'test-interpolation'
-    from_dd = sym.DOFDescriptor(
-            geometry=where,
-            discr_stage=source_discr_stage,
-            granularity=sym.GRANULARITY_NODE)
-    to_dd = sym.DOFDescriptor(
-            geometry=where,
-            discr_stage=sym.QBX_SOURCE_QUAD_STAGE2,
-            granularity=target_granularity)
+    from pytential import GeometryCollection
+    places = GeometryCollection(qbx, auto_where=where)
 
     sigma_sym = sym.var("sigma")
     op_sym = sym.sin(sym.interp(from_dd, to_dd, sigma_sym))
-    bound_op = bind(qbx, op_sym, auto_where=where)
+    bound_op = bind(places, op_sym, auto_where=where)
 
-    target_nodes = qbx.quad_stage2_density_discr.nodes().get(queue)
-    if source_discr_stage == sym.QBX_SOURCE_STAGE2:
-        source_nodes = qbx.stage2_density_discr.nodes().get(queue)
-    elif source_discr_stage == sym.QBX_SOURCE_QUAD_STAGE2:
-        source_nodes = target_nodes
-    else:
-        source_nodes = qbx.density_discr.nodes().get(queue)
+    def nodes(stage):
+        density_discr = places.get_discretization(where.geometry, stage)
+        return density_discr.nodes().get(queue)
+
+    target_nodes = nodes(sym.QBX_SOURCE_QUAD_STAGE2)
+    source_nodes = nodes(source_discr_stage)
 
     sigma_dev = cl.array.to_device(queue, la.norm(source_nodes, axis=0))
     sigma_target = np.sin(la.norm(target_nodes, axis=0))
     sigma_target_interp = bound_op(queue, sigma=sigma_dev).get(queue)
 
-    if name in ('default', 'default-explicit', 'stage2', 'quad'):
+    if name in ("default_explicit", "stage2", "quad"):
         error = la.norm(sigma_target_interp - sigma_target) / la.norm(sigma_target)
         assert error < 1.0e-10
-    elif name in ('stage2-center',):
+    elif name in ("stage2_center",):
         assert len(sigma_target_interp) == 2 * len(sigma_target)
     else:
-        raise ValueError('unknown test case name: {}'.format(name))
+        raise ValueError("unknown test case name: {}".format(name))
 
 
 # You can test individual routines by typing
diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py
index 258a4d75a716b0b7b8f3068b35861ba941c0d4d3..551bb38c5a5e73a30f4ed72124c866b89349330e 100644
--- a/test/test_target_specific_qbx.py
+++ b/test/test_target_specific_qbx.py
@@ -37,7 +37,9 @@ from meshmode.mesh.generation import (  # noqa
         NArmedStarfish,
         make_curve_mesh)
 
-from pytential import bind, sym, norm  # noqa
+from pytential import bind, sym
+from pytential import GeometryCollection
+
 from sumpy.kernel import LaplaceKernel, HelmholtzKernel
 
 import logging
@@ -154,13 +156,7 @@ def test_target_specific_qbx(ctx_factory, op, helmholtz_k, qbx_order):
             InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
     from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
-
-    refiner_extra_kwargs = {}
-
-    if helmholtz_k != 0:
-        refiner_extra_kwargs["kernel_length_scale"] = 5 / abs(helmholtz_k)
-
-    qbx, _ = QBXLayerPotentialSource(
+    qbx = QBXLayerPotentialSource(
             pre_density_discr, 4*target_order,
             qbx_order=qbx_order,
             fmm_level_to_order=SimpleExpansionOrderFinder(fmm_tol),
@@ -168,10 +164,20 @@ def test_target_specific_qbx(ctx_factory, op, helmholtz_k, qbx_order):
             _expansions_in_tree_have_extent=True,
             _expansion_stick_out_factor=0.9,
             _use_target_specific_qbx=False,
-            ).with_refinement(**refiner_extra_kwargs)
+            )
+
+    kernel_length_scale = 5 / abs(helmholtz_k) if helmholtz_k else None
+    places = {
+        "qbx": qbx,
+        "qbx_target_specific": qbx.copy(_use_target_specific_qbx=True)
+        }
 
-    density_discr = qbx.density_discr
+    from pytential.qbx.refinement import refine_geometry_collection
+    places = GeometryCollection(places, auto_where="qbx")
+    places = refine_geometry_collection(queue, places,
+            kernel_length_scale=kernel_length_scale)
 
+    density_discr = places.get_discretization("qbx")
     nodes = density_discr.nodes().with_queue(queue)
     u_dev = clmath.sin(nodes[0])
 
@@ -195,11 +201,10 @@ def test_target_specific_qbx(ctx_factory, op, helmholtz_k, qbx_order):
 
     expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs)
 
-    bound_op = bind(qbx, expr)
+    bound_op = bind(places, expr)
     pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k).get()
 
-    qbx = qbx.copy(_use_target_specific_qbx=True)
-    bound_op = bind(qbx, expr)
+    bound_op = bind(places, expr, auto_where="qbx_target_specific")
     pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k).get()
 
     assert np.allclose(pot_tsqbx, pot_ref, atol=1e-13, rtol=1e-13)
diff --git a/test/test_tools.py b/test/test_tools.py
index bf40f5f9224aae2ab6ea72945cc0e86c2c0b64d8..d2f107848cb9785a74528e64e1bd1ea0cbee6582 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from functools import partial
+
 import pytest
 
 import numpy as np
@@ -94,6 +96,77 @@ def test_interpolatory_error_reporting(ctx_factory):
         print("AREA", integral(vol_discr, queue, one), 0.25**2*np.pi)
 
 
+def test_geometry_collection_caching(ctx_factory):
+    """Check that `GeometryCollection` fills its discretization cache lazily.
+
+    A few shifted copies of one curve each get a `QBXLayerPotentialSource`;
+    every `discr_stage` must be missing from the cache until `sym.nodes` is
+    evaluated on it, and available right after.
+    """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    ndim = 2
+    nelements = 1024
+    target_order = 7
+    qbx_order = 4
+    ngeometry = 3
+
+    # construct discretizations
+    from meshmode.mesh.generation import ellipse, make_curve_mesh
+    from meshmode.mesh.processing import affine_map
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+
+    discrs = []
+    radius = 1.0
+    for k in range(ngeometry):
+        if k == 0:
+            # NOTE(review): `ellipse` may take an aspect ratio here -- confirm
+            mesh = make_curve_mesh(partial(ellipse, radius),
+                    np.linspace(0.0, 1.0, nelements + 1),
+                    target_order)
+        else:
+            mesh = affine_map(discrs[0].mesh,
+                    b=np.array([3 * k * radius, 0]))
+
+        discr = Discretization(ctx, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+        discrs.append(discr)
+
+    # construct qbx source
+    from pytential.qbx import QBXLayerPotentialSource
+
+    sources = ["source_{}".format(k) for k in range(ngeometry)]
+    lpots = [
+        QBXLayerPotentialSource(density_discr,
+            fine_order=2 * target_order,
+            qbx_order=qbx_order,
+            fmm_order=False)
+        for density_discr in discrs]
+
+    # construct a geometry collection
+    from pytential import GeometryCollection
+    places = GeometryCollection(dict(zip(sources, lpots)))
+
+    # check on-demand refinement
+    from pytential import bind, sym
+    discr_stages = [sym.QBX_SOURCE_STAGE1, sym.QBX_SOURCE_STAGE2,
+            sym.QBX_SOURCE_QUAD_STAGE2]
+
+    for source_name in sources:
+        for discr_stage in discr_stages:
+            with pytest.raises(KeyError):
+                places._get_discr_from_cache(source_name, discr_stage)
+
+            dofdesc = sym.DOFDescriptor(source_name, discr_stage=discr_stage)
+            bind(places, sym.nodes(ndim, dofdesc=dofdesc))(queue)
+
+            discr = places._get_discr_from_cache(source_name, discr_stage)
+            assert discr is not None
+
+
 # You can test individual routines by typing
 # $ python test_tools.py 'test_routine()'
 
diff --git a/test/too_slow_test_helmholtz.py b/test/too_slow_test_helmholtz.py
index 52c304ebd6dd61e4c0392138b3413c67a5c64ff8..b9786d3e10065de391980915952adeb7b0863ec3 100644
--- a/test/too_slow_test_helmholtz.py
+++ b/test/too_slow_test_helmholtz.py
@@ -40,7 +40,7 @@ from meshmode.discretization.poly_element import \
 from six.moves import range
 
 from pytential import bind, sym, norm  # noqa
-from pytential.symbolic.pde.scalar import (  # noqa
+from pytential.symbolic.pde.maxwell.waveguide import (  # noqa
         DielectricSRep2DBoundaryOperator as SRep,
         DielectricSDRep2DBoundaryOperator as SDRep)
 
@@ -66,6 +66,8 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
     if bdry_ovsmp_quad_order is None:
         bdry_ovsmp_quad_order = 4*bdry_quad_order
 
+    # {{{ geometries
+
     from meshmode.mesh.generation import ellipse, make_curve_mesh
     from functools import partial
     mesh = make_curve_mesh(
@@ -79,6 +81,50 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
 
     logger.info("%d elements" % mesh.nelements)
 
+    from pytential.qbx import QBXLayerPotentialSource
+    qbx = QBXLayerPotentialSource(
+            density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
+            fmm_order=fmm_order
+            )
+
+    from pytential.target import PointsTarget
+    targets_0 = PointsTarget(make_obj_array(list(np.array([
+        [3.2 + t, -4]
+        for t in [0, 0.5, 1]
+        ]).T.copy())))
+    targets_1 = PointsTarget(make_obj_array(list(np.array([
+        [-0.3 * t, -0.2 * t]
+        for t in [0, 0.5, 1]
+        ]).T.copy())))
+
+    if visualize:
+        low_order_qbx = QBXLayerPotentialSource(
+                density_discr,
+                fine_order=bdry_ovsmp_quad_order, qbx_order=2,
+                fmm_order=3,
+                )
+
+        from sumpy.visualization import FieldPlotter
+        fplot = FieldPlotter(np.zeros(2), extent=5, npoints=300)
+        targets_plot = PointsTarget(fplot.points)
+
+    places = {
+        sym.DEFAULT_SOURCE: qbx,
+        sym.DEFAULT_TARGET: qbx.density_discr,
+        "targets0": targets_0,
+        "targets1": targets_1
+        }
+    if visualize:
+        places.update({
+            "qbx-low-order": low_order_qbx,
+            "targets-plot": targets_plot
+            })
+
+    from pytential import GeometryCollection
+    places = GeometryCollection(places)
+
+    # }}}
+
     # from meshmode.discretization.visualization import make_visualizer
     # bdry_vis = make_visualizer(queue, density_discr, 20)
 
@@ -94,25 +140,16 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
     pde_op = op_class(
             mode,
             k_vacuum=1,
-            interfaces=((0, 1, sym.DEFAULT_SOURCE),),
             domain_k_exprs=(k0, k1),
             beta=beta,
+            interfaces=((0, 1, sym.DEFAULT_SOURCE),),
             use_l2_weighting=use_l2_weighting)
 
     op_unknown_sym = pde_op.make_unknown("unknown")
 
     representation0_sym = pde_op.representation(op_unknown_sym, 0)
     representation1_sym = pde_op.representation(op_unknown_sym, 1)
-
-    from pytential.qbx import QBXLayerPotentialSource
-    qbx = QBXLayerPotentialSource(
-            density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
-            fmm_order=fmm_order
-            ).with_refinement()
-
-    #print(sym.pretty(pde_op.operator(op_unknown_sym)))
-    #1/0
-    bound_pde_op = bind(qbx, pde_op.operator(op_unknown_sym))
+    bound_pde_op = bind(places, pde_op.operator(op_unknown_sym))
 
     e_factor = float(pde_op.ez_enabled)
     h_factor = float(pde_op.hz_enabled)
@@ -142,10 +179,11 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
     pot_p2p = P2P(cl_ctx, [kernel], exclude_self=False)
     pot_p2p_grad = P2P(cl_ctx, kernel_grad, exclude_self=False)
 
-    normal = bind(density_discr, sym.normal())(queue).as_vector(np.object)
-    tangent = bind(
-        density_discr,
-        sym.pseudoscalar()/sym.area_element())(queue).as_vector(np.object)
+    normal = bind(places, sym.normal(qbx.ambient_dim))(
+            queue).as_vector(np.object)
+    tangent = bind(places,
+            sym.pseudoscalar(qbx.ambient_dim)/sym.area_element(qbx.ambient_dim))(
+                    queue).as_vector(np.object)
 
     _, (E0,) = pot_p2p(queue, density_discr.nodes(), e_sources_0, [e_strengths_0],
                     out_host=False, k=K0)
@@ -181,7 +219,7 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
     H0_dttarget = (grad0_H0*tangent[0] + grad1_H0*tangent[1])  # noqa
     H1_dttarget = (grad0_H1*tangent[0] + grad1_H1*tangent[1])  # noqa
 
-    sqrt_w = bind(density_discr, sym.sqrt_jac_q_weight())(queue)
+    sqrt_w = bind(places, sym.sqrt_jac_q_weight(qbx.ambient_dim))(queue)
 
     bvp_rhs = np.zeros(len(pde_op.bcs), dtype=np.object)
     for i_bc, terms in enumerate(pde_op.bcs):
@@ -243,33 +281,30 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
 
     # }}}
 
-    targets_0 = make_obj_array(list(np.array([
-        [3.2 + t, -4]
-        for t in [0, 0.5, 1]
-        ]).T.copy()))
-    targets_1 = make_obj_array(list(np.array([
-        [t*-0.3, t*-0.2]
-        for t in [0, 0.5, 1]
-        ]).T.copy()))
-
-    from pytential.target import PointsTarget
     from sumpy.tools import vector_from_device
-    F0_tgt = vector_from_device(queue, bind(  # noqa
-            (qbx, PointsTarget(targets_0)),
-            representation0_sym)(queue, unknown=unknown, K0=K0, K1=K1))
-    F1_tgt = vector_from_device(queue, bind(  # noqa
-            (qbx, PointsTarget(targets_1)),
-            representation1_sym)(queue, unknown=unknown, K0=K0, K1=K1))
-
-    _, (E0_tgt_true,) = pot_p2p(queue, targets_0, e_sources_0, [e_strengths_0],
-                    out_host=True, k=K0)
-    _, (E1_tgt_true,) = pot_p2p(queue, targets_1, e_sources_1, [e_strengths_1],
-                    out_host=True, k=K1)
-
-    _, (H0_tgt_true,) = pot_p2p(queue, targets_0, h_sources_0, [h_strengths_0],
-                    out_host=True, k=K0)
-    _, (H1_tgt_true,) = pot_p2p(queue, targets_1, h_sources_1, [h_strengths_1],
-                    out_host=True, k=K1)
+    F0_tgt = bind(places, representation0_sym,
+            auto_where=(sym.DEFAULT_SOURCE, "targets0"))(
+                    queue, unknown=unknown, K0=K0, K1=K1)
+    F0_tgt = vector_from_device(queue, F0_tgt)
+
+    F1_tgt = bind(places, representation1_sym,
+            auto_where=(sym.DEFAULT_SOURCE, "targets1"))(
+                    queue, unknown=unknown, K0=K0, K1=K1)
+    F1_tgt = vector_from_device(queue, F1_tgt)
+
+    _, (E0_tgt_true,) = pot_p2p(queue,
+            targets_0.nodes(), e_sources_0, [e_strengths_0],
+            out_host=True, k=K0)
+    _, (E1_tgt_true,) = pot_p2p(queue,
+            targets_1.nodes(), e_sources_1, [e_strengths_1],
+            out_host=True, k=K1)
+
+    _, (H0_tgt_true,) = pot_p2p(queue,
+            targets_0.nodes(), h_sources_0, [h_strengths_0],
+            out_host=True, k=K0)
+    _, (H1_tgt_true,) = pot_p2p(queue,
+            targets_1.nodes(), h_sources_1, [h_strengths_1],
+            out_host=True, k=K1)
 
     err_F0_total = 0  # noqa
     err_F1_total = 0  # noqa
@@ -313,15 +348,12 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
         i_field += 1
 
     if visualize:
-        from sumpy.visualization import FieldPlotter
-        fplot = FieldPlotter(np.zeros(2), extent=5, npoints=300)
-        from pytential.target import PointsTarget
-        fld0 = bind(
-                (qbx, PointsTarget(fplot.points)),
-                representation0_sym)(queue, unknown=unknown, K0=K0)
-        fld1 = bind(
-                (qbx, PointsTarget(fplot.points)),
-                representation1_sym)(queue, unknown=unknown, K1=K1)
+        fld0 = bind(places, representation0_sym,
+                auto_where=(sym.DEFAULT_SOURCE, "targets-plot"))(
+                        queue, unknown=unknown, K0=K0)
+        fld1 = bind(places, representation1_sym,
+                auto_where=(sym.DEFAULT_SOURCE, "targets-plot"))(
+                        queue, unknown=unknown, K1=K1)
 
         comp_fields = []
         i_field = 0
@@ -337,15 +369,11 @@ def run_dielectric_test(cl_ctx, queue, nelements, qbx_order,
 
             i_field += 0
 
-        low_order_qbx = QBXLayerPotentialSource(
-                density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=2,
-                fmm_order=3).with_refinement()
         from sumpy.kernel import LaplaceKernel
-        from pytential.target import PointsTarget
         ones = (cl.array.empty(queue, (density_discr.nnodes,), dtype=np.float64)
                 .fill(1))
-        ind_func = - bind((low_order_qbx, PointsTarget(fplot.points)),
-                sym.D(LaplaceKernel(2), sym.var("u")))(
+        ind_func = - bind(places, sym.D(LaplaceKernel(2), sym.var("u")),
+                auto_where=("qbx-low-order", "targets-plot"))(
                         queue, u=ones).get()
 
         _, (e_fld0_true,) = pot_p2p(