diff --git a/.gitignore b/.gitignore index 3d5b80428d4d3d96051e085c47612d07a4aeecc0..c5d29d7c34f52c54cf724c91f0f86a9e7b4c96df 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ examples/*.pdf .cache tags + +pytential/_git_rev.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 334a41d0fb0b56e6582d71e9242c09d9729fdb0e..9526444ebafca74473618978b184ea0ebad235e9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,7 +2,7 @@ Python 3.5 POCL: script: - export PY_EXE=python3.5 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 numpy scipy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -16,7 +16,7 @@ Python 3.6 POCL: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 numpy scipy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -26,7 +26,21 @@ Python 3.6 POCL: except: - tags -Python 3.5 Conda: +Python 3.6 POCL Examples: + script: + - export PY_EXE=python3.6 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh + - ". ./build-py-project-and-run-examples.sh" + tags: + - python3.6 + - pocl + - large-node + except: + - tags + +Python 3.6 Conda: script: - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine - CONDA_ENVIRONMENT=.test-conda-env-py3.yml @@ -43,7 +57,7 @@ Python 2.7 POCL: script: - export PY_EXE=python2.7 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 scipy numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -53,9 +67,28 @@ Python 2.7 POCL: except: - tags +Python 3.6 Conda Apple: + script: + - export LC_ALL=en_US.UTF-8 + - export LANG=en_US.UTF-8 + - export PYTEST_ADDOPTS=-k-slowtest + - CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml + - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + - ". ./build-and-test-py-project-within-miniconda.sh" + + # https://gitlab.tiker.net/inducer/pytential/issues/112 + allow_failure: true + + tags: + - apple + except: + - tags + retry: 2 + Documentation: script: - - EXTRA_INSTALL="numpy mako" + - EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh - ". ./build-docs.sh" tags: @@ -66,7 +99,7 @@ Documentation: Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh pytential test" + - ". 
./prepare-and-run-flake8.sh pytential test examples" tags: - python3.5 except: diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c0601cb2bf81ce546a590c71a8ea43fb87f9004 --- /dev/null +++ b/.test-conda-env-py3-macos.yml @@ -0,0 +1,20 @@ +name: test-conda-env-py3-macos +channels: +- conda-forge +- defaults +dependencies: +- git +- conda-forge::numpy +- conda-forge::sympy +- scipy +- pocl=1.0 +- islpy +- pyopencl +- python=3.6 +- symengine=0.3.0 +- python-symengine=0.3.0 +- pyfmmlib +- osx-pocl-opencl +# for OpenMP support in pyfmmlib +- libgfortran>=3.0.1 +# things not in here: loopy boxtree pymbolic meshmode sumpy diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index b4f204c7f1ee3b43208c3fe5728e1b72bf78c21a..16e053730d0b14a6374f60d48cb28c4a9308d598 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -4,12 +4,13 @@ channels: - defaults dependencies: - git +- scipy - conda-forge::numpy - conda-forge::sympy -- pocl=0.13 +- pocl=1.0 - islpy - pyopencl -- python=3.5 +- python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 - pyfmmlib diff --git a/README.rst b/README.rst index cbd540486d43975eb2691f59aa8117f4dd02d697..dfc5fe49b46f4a9ea13debdc136e9fbf3848c79d 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,19 @@ -pytential -========= +pytential: 2D/3D Layer Potential Evaluation +=========================================== + +.. image:: https://gitlab.tiker.net/inducer/pytential/badges/master/pipeline.svg + :target: https://gitlab.tiker.net/inducer/pytential/commits/master +.. image:: https://badge.fury.io/py/pytential.png + :target: http://pypi.python.org/pypi/pytential pytential helps you accurately evaluate layer potentials (and, sooner or later, volume potentials). It also knows how to set up meshes and solve integral equations. +See `here `_ +for easy, self-contained installation instructions for Linux and macOS. + It relies on * `numpy `_ for arrays @@ -20,14 +28,12 @@ and, indirectly, * `PyOpenCL `_ as computational infrastructure -PyOpenCL is likely the only package you'll have to install -by hand, all the others will be installed automatically. - .. image:: https://badge.fury.io/py/pytential.png :target: http://pypi.python.org/pypi/pytential Resources: +* `installation instructions `_ * `documentation `_ * `wiki home page `_ * `source code via git `_ diff --git a/doc/conf.py b/doc/conf.py index 81b15205e10a3e747bec9b350f31f9979c920656..6df4af25e85a60f552c9d45b3e2da91a8ea2fef3 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -271,6 +271,7 @@ intersphinx_mapping = { 'http://docs.python.org/': None, 'http://documen.tician.de/boxtree/': None, 'http://docs.scipy.org/doc/numpy/': None, + 'http://documen.tician.de/meshmode/': None, 'http://documen.tician.de/modepy/': None, 'http://documen.tician.de/pyopencl/': None, 'http://documen.tician.de/pytools/': None, diff --git a/doc/discretization.rst b/doc/discretization.rst index ad2eb738a04b14f72044a72f07120787a007f535..ac4ca52679581b7b54076584418b8c088dd3841d 100644 --- a/doc/discretization.rst +++ b/doc/discretization.rst @@ -16,7 +16,7 @@ and you can start computing. .. automodule:: pytential.qbx Unregularized discretization -------- +---------------------------- .. 
automodule:: pytential.unregularized diff --git a/doc/index.rst b/doc/index.rst index d56c8bb52279e161973f2c402419343d7729d878..772a936a9211e7b2db269a8b5c97e5749b2db8d0 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -17,6 +17,7 @@ Contents discretization symbolic + linalg tools misc diff --git a/doc/linalg.rst b/doc/linalg.rst new file mode 100644 index 0000000000000000000000000000000000000000..d5e78a6e288d50419be802bf3ab235dff81c419d --- /dev/null +++ b/doc/linalg.rst @@ -0,0 +1,9 @@ +Linear Algebra Routines +======================= + +Hierarchical Direct Solver +-------------------------- + +.. automodule:: pytential.linalg.proxy + +.. vim: sw=4:tw=75 diff --git a/doc/misc.rst b/doc/misc.rst index 14f3f76468c290284e4e4cf4624485ab4d332a28..198b87da15c8c033552fe8ee881082991a07bc03 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -4,14 +4,15 @@ Installation and Usage Installing :mod:`pytential` --------------------------- -This set of instructions is intended for 64-bit Linux computers. -MacOS support is in the works. +This set of instructions is intended for 64-bit Linux and macOS computers. #. Make sure your system has the basics to build software. On Debian derivatives (Ubuntu and many more), installing ``build-essential`` should do the trick. + On macOS, run ``xcode-select --install`` to install build tools. + Everywhere else, just making sure you have the ``g++`` package should be enough. @@ -30,7 +31,9 @@ MacOS support is in the works. #. ``conda config --add channels conda-forge`` -#. ``conda install git pip pocl=0.13 islpy pyopencl sympy pyfmmlib pytest`` +#. (*macOS only*) ``conda install osx-pocl-opencl pocl pyopencl`` + +#. ``conda install git pip pocl islpy pyopencl sympy pyfmmlib pytest`` #. Type the following command:: @@ -45,14 +48,6 @@ You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or After this, you should be able to run the `tests `_ or `examples `_. -.. note:: - - You may have noticed that we prescribed pocl version 0.13 above. That's - because newer versions have a `bug - `_ that we haven't - tracked down just yet. Until this bug is found, we discourage the use of - pocl 0.14 as results may be silently inaccurate. - Troubleshooting the Installation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/symbolic.rst b/doc/symbolic.rst index 128fc84c68452907a4fe9e7676ee2b2e7c0e7358..82e38d0fb9c87211e42ea7f6d957f65869261b06 100644 --- a/doc/symbolic.rst +++ b/doc/symbolic.rst @@ -13,6 +13,8 @@ Binding an operator to a discretization .. currentmodule:: pytential +.. autoclass:: GeometryCollection + .. 
autofunction:: bind PDE operators diff --git a/examples/fmm-error.py b/examples/fmm-error.py index fea97c99fd54398e4e6b3dd6c2092c19e60a31f5..c3350786b6464b3b20b5f9e8d740cf74185e9cf1 100644 --- a/examples/fmm-error.py +++ b/examples/fmm-error.py @@ -6,89 +6,93 @@ from meshmode.mesh.generation import ( # noqa from sumpy.visualization import FieldPlotter from sumpy.kernel import LaplaceKernel, HelmholtzKernel -import faulthandler -faulthandler.enable() -import logging -logging.basicConfig(level=logging.INFO) +def main(): + import logging + logging.basicConfig(level=logging.WARNING) # INFO for more progress info -cl_ctx = cl.create_some_context() -queue = cl.CommandQueue(cl_ctx) + cl_ctx = cl.create_some_context() + queue = cl.CommandQueue(cl_ctx) -target_order = 16 -qbx_order = 3 -nelements = 60 -mode_nr = 0 + target_order = 16 + qbx_order = 3 + nelements = 60 + mode_nr = 0 -k = 0 -if k: - kernel = HelmholtzKernel("k") -else: - kernel = LaplaceKernel() -#kernel = OneKernel() + k = 0 + if k: + kernel = HelmholtzKernel(2) + else: + kernel = LaplaceKernel(2) + #kernel = OneKernel() -mesh = make_curve_mesh( - #lambda t: ellipse(1, t), - starfish, - np.linspace(0, 1, nelements+1), - target_order) + mesh = make_curve_mesh( + #lambda t: ellipse(1, t), + starfish, + np.linspace(0, 1, nelements+1), + target_order) -from pytential.qbx import QBXLayerPotentialSource -from meshmode.discretization import Discretization -from meshmode.discretization.poly_element import \ - InterpolatoryQuadratureSimplexGroupFactory + from pytential.qbx import QBXLayerPotentialSource + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory -density_discr = Discretization( - cl_ctx, mesh, - InterpolatoryQuadratureSimplexGroupFactory(target_order)) + pre_density_discr = Discretization( + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) -qbx = QBXLayerPotentialSource( - density_discr, fine_order=2*target_order, - qbx_order=qbx_order, fmm_order=qbx_order) -slow_qbx = QBXLayerPotentialSource( - density_discr, fine_order=2*target_order, - qbx_order=qbx_order, fmm_order=False) + slow_qbx, _ = QBXLayerPotentialSource( + pre_density_discr, fine_order=2*target_order, + qbx_order=qbx_order, fmm_order=False, + target_association_tolerance=.05 + ).with_refinement() + qbx = slow_qbx.copy(fmm_order=10) + density_discr = slow_qbx.density_discr -nodes = density_discr.nodes().with_queue(queue) + nodes = density_discr.nodes().with_queue(queue) -angle = cl.clmath.atan2(nodes[1], nodes[0]) + angle = cl.clmath.atan2(nodes[1], nodes[0]) -from pytential import bind, sym -d = sym.Derivative() -#op = d.nabla[0] * d(sym.S(kernel, sym.var("sigma"))) -#op = sym.D(kernel, sym.var("sigma")) -op = sym.S(kernel, sym.var("sigma")) + from pytential import bind, sym + #op = sym.d_dx(sym.S(kernel, sym.var("sigma")), qbx_forced_limit=None) + #op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None) + op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None) -sigma = cl.clmath.cos(mode_nr*angle) + sigma = cl.clmath.cos(mode_nr*angle) -if isinstance(kernel, HelmholtzKernel): - sigma = sigma.astype(np.complex128) + if isinstance(kernel, HelmholtzKernel): + sigma = sigma.astype(np.complex128) -bound_bdry_op = bind(qbx, op) + fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600) + from pytential.target import PointsTarget -fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600) -from pytential.target import PointsTarget + fld_in_vol = bind( 
+ (slow_qbx, PointsTarget(fplot.points)), + op)(queue, sigma=sigma, k=k).get() -fld_in_vol = bind( - (slow_qbx, PointsTarget(fplot.points)), - op)(queue, sigma=sigma, k=k).get() + fmm_fld_in_vol = bind( + (qbx, PointsTarget(fplot.points)), + op)(queue, sigma=sigma, k=k).get() -fmm_fld_in_vol = bind( - (qbx, PointsTarget(fplot.points)), - op)(queue, sigma=sigma, k=k).get() + err = fmm_fld_in_vol-fld_in_vol -err = fmm_fld_in_vol-fld_in_vol -im = fplot.show_scalar_in_matplotlib(np.log10(np.abs(err))) + import matplotlib + matplotlib.use('Agg') + im = fplot.show_scalar_in_matplotlib(np.log10(np.abs(err) + 1e-17)) -from matplotlib.colors import Normalize -im.set_norm(Normalize(vmin=-6, vmax=0)) + from matplotlib.colors import Normalize + im.set_norm(Normalize(vmin=-12, vmax=0)) -import matplotlib.pyplot as pt -from matplotlib.ticker import NullFormatter -pt.gca().xaxis.set_major_formatter(NullFormatter()) -pt.gca().yaxis.set_major_formatter(NullFormatter()) + import matplotlib.pyplot as pt + from matplotlib.ticker import NullFormatter + pt.gca().xaxis.set_major_formatter(NullFormatter()) + pt.gca().yaxis.set_major_formatter(NullFormatter()) -cb = pt.colorbar(shrink=0.9) -cb.set_label(r"$\log_{10}(\mathdefault{Error})$") + cb = pt.colorbar(shrink=0.9) + cb.set_label(r"$\log_{10}(\mathdefault{Error})$") -pt.savefig("fmm-error-order-%d.pdf" % qbx_order) + pt.savefig("fmm-error-order-%d.pdf" % qbx_order) + + +if __name__ == "__main__": + main() diff --git a/examples/blob-2d.step b/examples/geometries/blob-2d.step similarity index 100% rename from examples/blob-2d.step rename to examples/geometries/blob-2d.step diff --git a/examples/circle.step b/examples/geometries/circle.step similarity index 100% rename from examples/circle.step rename to examples/geometries/circle.step diff --git a/examples/circles.step b/examples/geometries/circles.step similarity index 100% rename from examples/circles.step rename to examples/geometries/circles.step diff --git a/examples/ellipsoid.step b/examples/geometries/ellipsoid.step similarity index 100% rename from examples/ellipsoid.step rename to examples/geometries/ellipsoid.step diff --git a/examples/molecule.step b/examples/geometries/molecule.step similarity index 100% rename from examples/molecule.step rename to examples/geometries/molecule.step diff --git a/examples/two-balls.step b/examples/geometries/two-balls.step similarity index 100% rename from examples/two-balls.step rename to examples/geometries/two-balls.step diff --git a/examples/two-cylinders-smooth.step b/examples/geometries/two-cylinders-smooth.step similarity index 100% rename from examples/two-cylinders-smooth.step rename to examples/geometries/two-cylinders-smooth.step diff --git a/examples/helmholtz-dirichlet.py b/examples/helmholtz-dirichlet.py index d6d9baa6c609399028305edb249565d0457d878d..847e5c3fdfec588b6c8d3e10d2b31bb237f74022 100644 --- a/examples/helmholtz-dirichlet.py +++ b/examples/helmholtz-dirichlet.py @@ -25,7 +25,7 @@ k = 3 def main(): import logging - logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.WARNING) # INFO for more progress info cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) @@ -96,8 +96,10 @@ def main(): bdry_op_sym = (-loc_sign*0.5*sigma_sym + sqrt_w*( - alpha*sym.S(kernel, inv_sqrt_w_sigma, k=sym.var("k")) - - sym.D(kernel, inv_sqrt_w_sigma, k=sym.var("k")) + alpha*sym.S(kernel, inv_sqrt_w_sigma, k=sym.var("k"), + qbx_forced_limit=+1) + - sym.D(kernel, inv_sqrt_w_sigma, k=sym.var("k"), + qbx_forced_limit="avg") )) # }}} @@ 
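Editor's sketch: the ``qbx_forced_limit`` arguments threaded through the operators above select which one-sided limit of a layer potential QBX computes. A minimal illustration of the values used in this patch, assuming the two-dimensional Laplace setup from ``fmm-error.py`` (this shows the convention as used in these examples, not an exhaustive list)::

    from sumpy.kernel import LaplaceKernel
    from pytential import sym

    kernel = LaplaceKernel(2)
    sigma = sym.var("sigma")

    # one-sided on-surface limits: +1 from the exterior, -1 from the interior
    op_ext = sym.S(kernel, sigma, qbx_forced_limit=+1)
    op_int = sym.S(kernel, sigma, qbx_forced_limit=-1)

    # average of the two one-sided limits (used for D above)
    op_avg = sym.D(kernel, sigma, qbx_forced_limit="avg")

    # off-surface (volume) targets: no forced limit
    op_vol = sym.S(kernel, sigma, qbx_forced_limit=None)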
-167,7 +169,7 @@ def main(): #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) fplot.write_vtk_file( - "potential.vts", + "potential-helm.vts", [ ("potential", fld_in_vol), ("indicator", indicator), diff --git a/examples/laplace-dirichlet-3d.py b/examples/laplace-dirichlet-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..4166dddfa8d8e1606885e167fa65a8881fe64484 --- /dev/null +++ b/examples/laplace-dirichlet-3d.py @@ -0,0 +1,170 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.clmath # noqa + +from meshmode.discretization import Discretization +from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + +from pytential import bind, sym, norm # noqa +from pytential.target import PointsTarget + +# {{{ set some constants for use below + +nelements = 20 +bdry_quad_order = 4 +mesh_order = bdry_quad_order +qbx_order = bdry_quad_order +bdry_ovsmp_quad_order = 4*bdry_quad_order +fmm_order = 3 + +# }}} + + +def main(): + import logging + logging.basicConfig(level=logging.WARNING) # INFO for more progress info + + cl_ctx = cl.create_some_context() + queue = cl.CommandQueue(cl_ctx) + + from meshmode.mesh.generation import generate_torus + + rout = 10 + rin = 1 + if 1: + base_mesh = generate_torus( + rout, rin, 40, 4, + mesh_order) + + from meshmode.mesh.processing import affine_map, merge_disjoint_meshes + # nx = 1 + # ny = 1 + nz = 1 + dz = 0 + meshes = [ + affine_map( + base_mesh, + A=np.diag([1, 1, 1]), + b=np.array([0, 0, iz*dz])) + for iz in range(nz)] + + mesh = merge_disjoint_meshes(meshes, single_group=True) + + if 0: + from meshmode.mesh.visualization import draw_curve + draw_curve(mesh) + import matplotlib.pyplot as plt + plt.show() + + pre_density_discr = Discretization( + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order)) + + from pytential.qbx import ( + QBXLayerPotentialSource, QBXTargetAssociationFailedException) + qbx, _ = QBXLayerPotentialSource( + pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order, + fmm_order=fmm_order + ).with_refinement() + density_discr = qbx.density_discr + + # {{{ describe bvp + + from sumpy.kernel import LaplaceKernel + kernel = LaplaceKernel(3) + + cse = sym.cse + + sigma_sym = sym.var("sigma") + #sqrt_w = sym.sqrt_jac_q_weight(3) + sqrt_w = 1 + inv_sqrt_w_sigma = cse(sigma_sym/sqrt_w) + + # -1 for interior Dirichlet + # +1 for exterior Dirichlet + loc_sign = +1 + + bdry_op_sym = (loc_sign*0.5*sigma_sym + + sqrt_w*( + sym.S(kernel, inv_sqrt_w_sigma) + + sym.D(kernel, inv_sqrt_w_sigma) + )) + + # }}} + + bound_op = bind(qbx, bdry_op_sym) + + # {{{ fix rhs and solve + + nodes = density_discr.nodes().with_queue(queue) + source = np.array([rout, 0, 0]) + + def u_incoming_func(x): + # return 1/cl.clmath.sqrt( (x[0] - source[0])**2 + # +(x[1] - source[1])**2 + # +(x[2] - source[2])**2 ) + return 1.0/la.norm(x.get()-source[:, None], axis=0) + + bc = cl.array.to_device(queue, u_incoming_func(nodes)) + + bvp_rhs = bind(qbx, sqrt_w*sym.var("bc"))(queue, bc=bc) + + from pytential.solve import gmres + gmres_result = gmres( + bound_op.scipy_op(queue, "sigma", dtype=np.float64), + bvp_rhs, tol=1e-14, progress=True, + stall_iterations=0, + hard_failure=True) + + sigma = bind(qbx, sym.var("sigma")/sqrt_w)(queue, sigma=gmres_result.solution) + + # }}} + + from meshmode.discretization.visualization import make_visualizer + bdry_vis = make_visualizer(queue, density_discr, 20) + bdry_vis.write_vtk_file("laplace.vtu", [ + 
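For context on ``bdry_op_sym`` in ``laplace-dirichlet-3d.py`` above: the solution is represented as :math:`u = (S + D)\sigma`, and under the usual jump relation for the double-layer potential the exterior Dirichlet condition :math:`u = g` becomes the second-kind integral equation

.. math::

    \frac{1}{2}\sigma + (S + D)\sigma = g \quad \text{on } \Gamma,

which is ``loc_sign*0.5*sigma_sym + sqrt_w*(S(...) + D(...))`` with ``loc_sign = +1`` and ``sqrt_w = 1`` in this example; the interior problem flips the sign of the first term.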
("sigma", sigma), + ]) + + # {{{ postprocess/visualize + + repr_kwargs = dict(qbx_forced_limit=None) + representation_sym = ( + sym.S(kernel, inv_sqrt_w_sigma, **repr_kwargs) + + sym.D(kernel, inv_sqrt_w_sigma, **repr_kwargs)) + + from sumpy.visualization import FieldPlotter + fplot = FieldPlotter(np.zeros(3), extent=20, npoints=50) + + targets = cl.array.to_device(queue, fplot.points) + + qbx_stick_out = qbx.copy(target_stick_out_factor=0.2) + + try: + fld_in_vol = bind( + (qbx_stick_out, PointsTarget(targets)), + representation_sym)(queue, sigma=sigma).get() + except QBXTargetAssociationFailedException as e: + fplot.write_vtk_file( + "failed-targets.vts", + [ + ("failed", e.failed_target_flags.get(queue)) + ] + ) + raise + + #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) + fplot.write_vtk_file( + "potential-laplace-3d.vts", + [ + ("potential", fld_in_vol), + ] + ) + + # }}} + + +if __name__ == "__main__": + main() diff --git a/examples/layerpot-3d.py b/examples/layerpot-3d.py index 0a35ebd7fa418b8cf517ada57bc7dee5e39f1471..28f0967e8aec28332902a128d0fb1efafb100d4e 100644 --- a/examples/layerpot-3d.py +++ b/examples/layerpot-3d.py @@ -9,10 +9,6 @@ import faulthandler from six.moves import range faulthandler.enable() -import logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) @@ -21,10 +17,10 @@ qbx_order = 3 mode_nr = 4 if 1: - cad_file_name = "ellipsoid.step" + cad_file_name = "geometries/ellipsoid.step" h = 0.6 else: - cad_file_name = "two-cylinders-smooth.step" + cad_file_name = "geometries/two-cylinders-smooth.step" h = 0.4 k = 0 @@ -34,76 +30,86 @@ else: kernel = LaplaceKernel(3) #kernel = OneKernel() -from meshmode.mesh.io import generate_gmsh, FileSource -mesh = generate_gmsh( - FileSource(cad_file_name), 2, order=2, - other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h]) -from meshmode.mesh.processing import perform_flips -# Flip elements--gmsh generates inside-out geometry. -mesh = perform_flips(mesh, np.ones(mesh.nelements)) +def main(): + import logging + logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.WARNING) # INFO for more progress info + + from meshmode.mesh.io import generate_gmsh, FileSource + mesh = generate_gmsh( + FileSource(cad_file_name), 2, order=2, + other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h]) + + from meshmode.mesh.processing import perform_flips + # Flip elements--gmsh generates inside-out geometry. 
+ mesh = perform_flips(mesh, np.ones(mesh.nelements)) + + from meshmode.mesh.processing import find_bounding_box + bbox_min, bbox_max = find_bounding_box(mesh) + bbox_center = 0.5*(bbox_min+bbox_max) + bbox_size = max(bbox_max-bbox_min) / 2 + + logger.info("%d elements" % mesh.nelements) -from meshmode.mesh.processing import find_bounding_box -bbox_min, bbox_max = find_bounding_box(mesh) -bbox_center = 0.5*(bbox_min+bbox_max) -bbox_size = max(bbox_max-bbox_min) / 2 + from pytential.qbx import QBXLayerPotentialSource + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory -logger.info("%d elements" % mesh.nelements) + density_discr = Discretization( + cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) -from pytential.qbx import QBXLayerPotentialSource -from meshmode.discretization import Discretization -from meshmode.discretization.poly_element import \ - InterpolatoryQuadratureSimplexGroupFactory + qbx, _ = QBXLayerPotentialSource(density_discr, 4*target_order, qbx_order, + fmm_order=qbx_order + 3, + target_association_tolerance=0.15).with_refinement() -density_discr = Discretization( - cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) + nodes = density_discr.nodes().with_queue(queue) -qbx, _ = QBXLayerPotentialSource(density_discr, 4*target_order, qbx_order, - fmm_order=qbx_order + 3, - target_association_tolerance=0.15).with_refinement() + angle = cl.clmath.atan2(nodes[1], nodes[0]) -nodes = density_discr.nodes().with_queue(queue) + from pytential import bind, sym + #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)) + op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None) + #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None) -angle = cl.clmath.atan2(nodes[1], nodes[0]) + sigma = cl.clmath.cos(mode_nr*angle) + if 0: + sigma = 0*angle + from random import randrange + for i in range(5): + sigma[randrange(len(sigma))] = 1 -from pytential import bind, sym -#op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)) -op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None) -#op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None) + if isinstance(kernel, HelmholtzKernel): + sigma = sigma.astype(np.complex128) -sigma = cl.clmath.cos(mode_nr*angle) -if 0: - sigma = 0*angle - from random import randrange - for i in range(5): - sigma[randrange(len(sigma))] = 1 + fplot = FieldPlotter(bbox_center, extent=3.5*bbox_size, npoints=150) -if isinstance(kernel, HelmholtzKernel): - sigma = sigma.astype(np.complex128) + from pytential.target import PointsTarget + fld_in_vol = bind( + (qbx, PointsTarget(fplot.points)), + op)(queue, sigma=sigma, k=k).get() -fplot = FieldPlotter(bbox_center, extent=3.5*bbox_size, npoints=150) + #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) + fplot.write_vtk_file( + "potential-3d.vts", + [ + ("potential", fld_in_vol) + ] + ) -from pytential.target import PointsTarget -fld_in_vol = bind( - (qbx, PointsTarget(fplot.points)), - op)(queue, sigma=sigma, k=k).get() + bdry_normals = bind( + density_discr, + sym.normal(density_discr.ambient_dim))(queue).as_vector(dtype=object) -#fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) -fplot.write_vtk_file( - "potential.vts", - [ - ("potential", fld_in_vol) - ] - ) + from meshmode.discretization.visualization import make_visualizer + bdry_vis = make_visualizer(queue, density_discr, target_order) -bdry_normals = bind( - density_discr, - 
sym.normal(density_discr.ambient_dim))(queue).as_vector(dtype=object) + bdry_vis.write_vtk_file("source-3d.vtu", [ + ("sigma", sigma), + ("bdry_normals", bdry_normals), + ]) -from meshmode.discretization.visualization import make_visualizer -bdry_vis = make_visualizer(queue, density_discr, target_order) -bdry_vis.write_vtk_file("source.vtu", [ - ("sigma", sigma), - ("bdry_normals", bdry_normals), - ]) +if __name__ == "__main__": + main() diff --git a/examples/layerpot.py b/examples/layerpot.py index 0371b1ff557f57a7b2f839be6571bded120728bb..7b4737da00d6d1d1cc76e31f39932fc7c12783e8 100644 --- a/examples/layerpot.py +++ b/examples/layerpot.py @@ -15,13 +15,6 @@ import faulthandler from six.moves import range faulthandler.enable() -import logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -cl_ctx = cl.create_some_context() -queue = cl.CommandQueue(cl_ctx) - target_order = 16 qbx_order = 3 nelements = 60 @@ -36,96 +29,107 @@ else: kernel_kwargs = {} #kernel = OneKernel() -from meshmode.mesh.generation import ( # noqa - make_curve_mesh, starfish, ellipse, drop) -mesh = make_curve_mesh( - #lambda t: ellipse(1, t), - starfish, - np.linspace(0, 1, nelements+1), - target_order) -from pytential.qbx import QBXLayerPotentialSource -from meshmode.discretization import Discretization -from meshmode.discretization.poly_element import \ - InterpolatoryQuadratureSimplexGroupFactory +def main(): + import logging + logging.basicConfig(level=logging.WARNING) # INFO for more progress info + + cl_ctx = cl.create_some_context() + queue = cl.CommandQueue(cl_ctx) + + from meshmode.mesh.generation import ( # noqa + make_curve_mesh, starfish, ellipse, drop) + mesh = make_curve_mesh( + #lambda t: ellipse(1, t), + starfish, + np.linspace(0, 1, nelements+1), + target_order) + + from pytential.qbx import QBXLayerPotentialSource + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + + pre_density_discr = Discretization( + cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) -pre_density_discr = Discretization( - cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) + qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4*target_order, qbx_order, + fmm_order=qbx_order+3, + target_association_tolerance=0.005).with_refinement() -qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4*target_order, qbx_order, - fmm_order=qbx_order+3, - target_association_tolerance=0.005).with_refinement() + density_discr = qbx.density_discr -density_discr = qbx.density_discr + nodes = density_discr.nodes().with_queue(queue) -nodes = density_discr.nodes().with_queue(queue) + angle = cl.clmath.atan2(nodes[1], nodes[0]) -angle = cl.clmath.atan2(nodes[1], nodes[0]) + def op(**kwargs): + kwargs.update(kernel_kwargs) + #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), **kwargs)) + return sym.D(kernel, sym.var("sigma"), **kwargs) + #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None, **kwargs) -def op(**kwargs): - kwargs.update(kernel_kwargs) + sigma = cl.clmath.cos(mode_nr*angle) + if 0: + sigma = 0*angle + from random import randrange + for i in range(5): + sigma[randrange(len(sigma))] = 1 - #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), **kwargs)) - return sym.D(kernel, sym.var("sigma"), **kwargs) - #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None, **kwargs) + if isinstance(kernel, HelmholtzKernel): + sigma = sigma.astype(np.complex128) + bound_bdry_op = 
bind(qbx, op()) + #mlab.figure(bgcolor=(1, 1, 1)) + if 1: + fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000) + from pytential.target import PointsTarget -sigma = cl.clmath.cos(mode_nr*angle) -if 0: - sigma = 0*angle - from random import randrange - for i in range(5): - sigma[randrange(len(sigma))] = 1 + targets_dev = cl.array.to_device(queue, fplot.points) + fld_in_vol = bind( + (qbx, PointsTarget(targets_dev)), + op(qbx_forced_limit=None))(queue, sigma=sigma, k=k).get() -if isinstance(kernel, HelmholtzKernel): - sigma = sigma.astype(np.complex128) + if enable_mayavi: + fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) + else: + fplot.write_vtk_file( + "potential-2d.vts", + [ + ("potential", fld_in_vol) + ] + ) -bound_bdry_op = bind(qbx, op()) -#mlab.figure(bgcolor=(1, 1, 1)) -if 1: - fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000) - from pytential.target import PointsTarget + if 0: + def apply_op(density): + return bound_bdry_op( + queue, sigma=cl.array.to_device(queue, density), k=k).get() - targets_dev = cl.array.to_device(queue, fplot.points) - fld_in_vol = bind( - (qbx, PointsTarget(targets_dev)), - op(qbx_forced_limit=None))(queue, sigma=sigma, k=k).get() + from sumpy.tools import build_matrix + n = len(sigma) + mat = build_matrix(apply_op, dtype=np.float64, shape=(n, n)) + + import matplotlib.pyplot as pt + pt.imshow(mat) + pt.colorbar() + pt.show() if enable_mayavi: - fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) - else: - fplot.write_vtk_file( - "potential.vts", - [ - ("potential", fld_in_vol) - ] - ) - -if 0: - def apply_op(density): - return bound_bdry_op( - queue, sigma=cl.array.to_device(queue, density), k=k).get() - - from sumpy.tools import build_matrix - n = len(sigma) - mat = build_matrix(apply_op, dtype=np.float64, shape=(n, n)) - - import matplotlib.pyplot as pt - pt.imshow(mat) - pt.colorbar() - pt.show() + # {{{ plot boundary field -if enable_mayavi: - # {{{ plot boundary field + fld_on_bdry = bound_bdry_op(queue, sigma=sigma, k=k).get() - fld_on_bdry = bound_bdry_op(queue, sigma=sigma, k=k).get() + nodes_host = density_discr.nodes().get(queue=queue) + mlab.points3d(nodes_host[0], nodes_host[1], + fld_on_bdry.real, scale_factor=0.03) - nodes_host = density_discr.nodes().get(queue=queue) - mlab.points3d(nodes_host[0], nodes_host[1], fld_on_bdry.real, scale_factor=0.03) + # }}} - # }}} + if enable_mayavi: + mlab.colorbar() + mlab.show() -if enable_mayavi: - mlab.colorbar() - mlab.show() + +if __name__ == "__main__": + main() diff --git a/examples/perf-model.py b/examples/perf-model.py deleted file mode 100644 index 3a87d63113ecedda8378485a4869e2771f0aaa4e..0000000000000000000000000000000000000000 --- a/examples/perf-model.py +++ /dev/null @@ -1,18 +0,0 @@ -# ------------------------------ -nlevels = 6 -nboxes = 1365 -nsources = 60 -ntargets = 12040 -form_mp = 60*p_fmm -prop_upward = 1365*p_fmm**2 -part_direct = 196560 -m2l = 31920*p_fmm**2 -mp_eval = 0 -form_local = 65000*p_fmm -prop_downward = 1365*p_fmm**2 -eval_part = 12040*p_fmm -ncenters = 2040 -qbxl_direct = 2370940*p_qbx -qbx_m2l = 35339*p_fmm*p_qbx -qbx_l2l = 2040*p_fmm*p_qbx -qbx_eval = 1902*p_qbx diff --git a/examples/scaling-study.py b/examples/scaling-study.py index 183fc915cb286ccb7731a76e9c1ed6cc3869efd5..3327e3c8c6ce71262018551008a203a04d68e70b 100644 --- a/examples/scaling-study.py +++ b/examples/scaling-study.py @@ -16,8 +16,8 @@ bdry_quad_order = 4 mesh_order = bdry_quad_order qbx_order = bdry_quad_order bdry_ovsmp_quad_order = 4*bdry_quad_order -fmm_order = 25 -k = 25 
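The ``if 0:`` branch of ``layerpot.py`` above shows how to materialize the bound operator as a dense matrix, which can be useful for inspecting small problems. As a sketch, reusing ``bound_bdry_op``, ``sigma``, and ``k`` from the example::

    from sumpy.tools import build_matrix

    def apply_op(density):
        # one matrix-vector product through the bound layer-potential operator
        return bound_bdry_op(
                queue, sigma=cl.array.to_device(queue, density), k=k).get()

    n = len(sigma)
    mat = build_matrix(apply_op, dtype=np.float64, shape=(n, n))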
+fmm_order = 10 +k = 0 # }}} @@ -54,7 +54,7 @@ def make_mesh(nx, ny): def timing_run(nx, ny): import logging - logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.WARNING) # INFO for more progress info cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) @@ -154,6 +154,7 @@ def timing_run(nx, ny): sym_op)( queue, sigma=ones_density).get() + qbx_stick_out = qbx.copy(target_stick_out_factor=0.1) try: fld_in_vol = bind( (qbx_stick_out, PointsTarget(targets)), @@ -169,7 +170,7 @@ def timing_run(nx, ny): #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5) fplot.write_vtk_file( - "potential.vts", + "potential-scaling.vts", [ ("potential", fld_in_vol), ("indicator", indicator) diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d0f56efd2f7e29d95e945154962d5f00fdd98cb1 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,7 @@ +# Experiments + +What you find in this directory are experiments that *may* have done something +useful at some point (or not). Unlike `examples`, they are not being tested on +an ongoing basis. + +So if what you find here breaks for you, you get to keep both pieces. diff --git a/examples/cahn-hilliard.py b/experiments/cahn-hilliard.py similarity index 100% rename from examples/cahn-hilliard.py rename to experiments/cahn-hilliard.py diff --git a/examples/find-photonic-mode-sk.py b/experiments/find-photonic-mode-sk.py similarity index 100% rename from examples/find-photonic-mode-sk.py rename to experiments/find-photonic-mode-sk.py diff --git a/examples/find-photonic-mode.py b/experiments/find-photonic-mode.py similarity index 100% rename from examples/find-photonic-mode.py rename to experiments/find-photonic-mode.py diff --git a/examples/helmholtz-expression-tree.py b/experiments/helmholtz-expression-tree.py similarity index 100% rename from examples/helmholtz-expression-tree.py rename to experiments/helmholtz-expression-tree.py diff --git a/examples/maxwell.py b/experiments/maxwell.py similarity index 100% rename from examples/maxwell.py rename to experiments/maxwell.py diff --git a/examples/maxwell_sphere.py b/experiments/maxwell_sphere.py similarity index 100% rename from examples/maxwell_sphere.py rename to experiments/maxwell_sphere.py diff --git a/examples/poisson.py b/experiments/poisson.py similarity index 100% rename from examples/poisson.py rename to experiments/poisson.py diff --git a/examples/qbx-tangential-deriv-jump.py b/experiments/qbx-tangential-deriv-jump.py similarity index 100% rename from examples/qbx-tangential-deriv-jump.py rename to experiments/qbx-tangential-deriv-jump.py diff --git a/examples/stokes-2d-interior.py b/experiments/stokes-2d-interior.py similarity index 100% rename from examples/stokes-2d-interior.py rename to experiments/stokes-2d-interior.py diff --git a/examples/two-domain-helmholtz.py b/experiments/two-domain-helmholtz.py similarity index 100% rename from examples/two-domain-helmholtz.py rename to experiments/two-domain-helmholtz.py diff --git a/pytential/__init__.py b/pytential/__init__.py index 1d07499d48607e5eeb10899dfde365cdb601292b..d28e8bdbcc8be2377ee575edeeb8e2b6ce0fb6e7 100644 --- a/pytential/__init__.py +++ b/pytential/__init__.py @@ -25,6 +25,7 @@ THE SOFTWARE. 
import numpy as np import pytential.symbolic.primitives as sym +from pytential.symbolic.execution import GeometryCollection # noqa from pytential.symbolic.execution import bind from pytools import memoize_on_first_arg diff --git a/pytential/linalg/__init__.py b/pytential/linalg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4a471df6aa7ab0893c8fa82fe2f3c1b319962c5 --- /dev/null +++ b/pytential/linalg/__init__.py @@ -0,0 +1,23 @@ +from __future__ import division + +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc3e0aa9786e1db5b4ec3ab49c86d4e1273cabc --- /dev/null +++ b/pytential/linalg/proxy.py @@ -0,0 +1,692 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import numpy.linalg as la + +import pyopencl as cl +import pyopencl.array # noqa +from pyopencl.array import to_device + +from pytools.obj_array import make_obj_array +from pytools import memoize_method, memoize +from sumpy.tools import BlockIndexRanges + +import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION + + +__doc__ = """ +Proxy Point Generation +~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: ProxyGenerator + +.. autofunction:: partition_by_nodes +.. 
autofunction:: partition_by_elements +.. autofunction:: partition_from_coarse + +.. autofunction:: gather_block_neighbor_points +.. autofunction:: gather_block_interaction_points +""" + + +# {{{ point index partitioning + +def _element_node_range(group, ielement): + istart = group.node_nr_base + group.nunit_nodes * ielement + iend = group.node_nr_base + group.nunit_nodes * (ielement + 1) + + return np.arange(istart, iend) + + +def partition_by_nodes(discr, + use_tree=True, + max_nodes_in_box=None): + """Generate equally sized ranges of nodes. The partition is created at the + lowest level of granularity, i.e. nodes. This results in balanced ranges + of points, but will split elements across different ranges. + + :arg discr: a :class:`meshmode.discretization.Discretization`. + :arg use_tree: if ``True``, node partitions are generated using a + :class:`boxtree.TreeBuilder`, which leads to geometrically close + points to belong to the same partition. If ``False``, a simple linear + partition is constructed. + :arg max_nodes_in_box: passed to :class:`boxtree.TreeBuilder`. + + :return: a :class:`sumpy.tools.BlockIndexRanges`. + """ + + if max_nodes_in_box is None: + # FIXME: this is just an arbitrary value + max_nodes_in_box = 32 + + with cl.CommandQueue(discr.cl_context) as queue: + if use_tree: + from boxtree import box_flags_enum + from boxtree import TreeBuilder + + builder = TreeBuilder(discr.cl_context) + + tree, _ = builder(queue, discr.nodes(), + max_particles_in_box=max_nodes_in_box) + + tree = tree.get(queue) + leaf_boxes, = (tree.box_flags + & box_flags_enum.HAS_CHILDREN == 0).nonzero() + + indices = np.empty(len(leaf_boxes), dtype=np.object) + for i, ibox in enumerate(leaf_boxes): + box_start = tree.box_source_starts[ibox] + box_end = box_start + tree.box_source_counts_cumul[ibox] + indices[i] = tree.user_source_ids[box_start:box_end] + + ranges = to_device(queue, + np.cumsum([0] + [box.shape[0] for box in indices])) + indices = to_device(queue, np.hstack(indices)) + else: + indices = cl.array.arange(queue, 0, discr.nnodes, + dtype=np.int) + ranges = cl.array.arange(queue, 0, discr.nnodes + 1, + discr.nnodes // max_nodes_in_box, + dtype=np.int) + assert ranges[-1] == discr.nnodes + + return BlockIndexRanges(discr.cl_context, + indices.with_queue(None), + ranges.with_queue(None)) + + +def partition_by_elements(discr, + use_tree=True, + max_elements_in_box=None): + """Generate equally sized ranges of points. The partition is created at the + element level, so that all the nodes belonging to an element belong to + the same range. This can result in slightly larger differences in size + between the ranges, but can be very useful when the individual partitions + need to be resampled, integrated, etc. + + :arg discr: a :class:`meshmode.discretization.Discretization`. + :arg use_tree: if ``True``, node partitions are generated using a + :class:`boxtree.TreeBuilder`, which leads to geometrically close + points to belong to the same partition. If ``False``, a simple linear + partition is constructed. + :arg max_elements_in_box: passed to :class:`boxtree.TreeBuilder`. + + :return: a :class:`sumpy.tools.BlockIndexRanges`. 
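A usage sketch for the node partitioning above (``density_discr`` stands for any :class:`meshmode.discretization.Discretization`; ``max_nodes_in_box=128`` is an arbitrary value for illustration)::

    from pytential.linalg.proxy import partition_by_nodes

    indices = partition_by_nodes(density_discr,
            use_tree=True, max_nodes_in_box=128)

    with cl.CommandQueue(density_discr.cl_context) as queue:
        host_indices = indices.get(queue)

    for i in range(host_indices.nblocks):
        block = host_indices.block_indices(i)   # node ids in range i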
+ """ + + if max_elements_in_box is None: + # NOTE: keep in sync with partition_by_nodes + max_nodes_in_box = 32 + + nunit_nodes = int(np.mean([g.nunit_nodes for g in discr.groups])) + max_elements_in_box = max_nodes_in_box // nunit_nodes + + with cl.CommandQueue(discr.cl_context) as queue: + if use_tree: + from boxtree import box_flags_enum + from boxtree import TreeBuilder + + builder = TreeBuilder(discr.cl_context) + + from pytential.qbx.utils import element_centers_of_mass + elranges = np.cumsum([group.nelements for group in discr.mesh.groups]) + elcenters = element_centers_of_mass(discr) + + tree, _ = builder(queue, elcenters, + max_particles_in_box=max_elements_in_box) + + groups = discr.groups + tree = tree.get(queue) + leaf_boxes, = (tree.box_flags + & box_flags_enum.HAS_CHILDREN == 0).nonzero() + + indices = np.empty(len(leaf_boxes), dtype=np.object) + for i, ibox in enumerate(leaf_boxes): + box_start = tree.box_source_starts[ibox] + box_end = box_start + tree.box_source_counts_cumul[ibox] + + ielement = tree.user_source_ids[box_start:box_end] + igroup = np.digitize(ielement, elranges) + + indices[i] = np.hstack([_element_node_range(groups[j], k) + for j, k in zip(igroup, ielement)]) + else: + nelements = discr.mesh.nelements + elements = np.array_split(np.arange(0, nelements), + nelements // max_elements_in_box) + + elranges = np.cumsum([g.nelements for g in discr.groups]) + elgroups = [np.digitize(elements[i], elranges) + for i in range(len(elements))] + + indices = np.empty(len(elements), dtype=np.object) + for i in range(indices.shape[0]): + indices[i] = np.hstack([_element_node_range(discr.groups[j], k) + for j, k in zip(elgroups[i], elements[i])]) + + ranges = to_device(queue, + np.cumsum([0] + [b.shape[0] for b in indices])) + indices = to_device(queue, np.hstack(indices)) + assert ranges[-1] == discr.nnodes + + return BlockIndexRanges(discr.cl_context, + indices.with_queue(None), + ranges.with_queue(None)) + + +def partition_from_coarse(resampler, from_indices): + """Generate a partition of nodes from an existing partition on a + coarser discretization. The new partition is generated based on element + refinement relationships in *resampler*, so the existing partition + needs to be created using :func:`partition_by_elements`, + since we assume that each range contains all the nodes in an element. + + The new partition will have the same number of ranges as the old partition. + The nodes inside each range in the new partition are all the nodes in + *resampler.to_discr* that were refined from elements in the same + range from *resampler.from_discr*. + + :arg resampler: a + :class:`meshmode.discretization.connection.DirectDiscretizationConnection`. + :arg from_indices: a :class:`sumpy.tools.BlockIndexRanges`. + + :return: a :class:`sumpy.tools.BlockIndexRanges`. 
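A sketch of refining an element-based partition; here ``resampler`` is assumed to be a :class:`~meshmode.discretization.connection.DirectDiscretizationConnection` between a coarse and a refined discretization, e.g. one produced during QBX refinement::

    from pytential.linalg.proxy import (
            partition_by_elements, partition_from_coarse)

    coarse_indices = partition_by_elements(resampler.from_discr)
    fine_indices = partition_from_coarse(resampler, coarse_indices)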
+ """ + + if not hasattr(resampler, "groups"): + raise ValueError("resampler must be a DirectDiscretizationConnection.") + + with cl.CommandQueue(resampler.cl_context) as queue: + from_indices = from_indices.get(queue) + + # construct ranges + from_discr = resampler.from_discr + from_grp_ranges = np.cumsum( + [0] + [grp.nelements for grp in from_discr.mesh.groups]) + from_el_ranges = np.hstack([ + np.arange(grp.node_nr_base, grp.nnodes + 1, grp.nunit_nodes) + for grp in from_discr.groups]) + + # construct coarse element arrays in each from_range + el_indices = np.empty(from_indices.nblocks, dtype=np.object) + el_ranges = np.full(from_grp_ranges[-1], -1, dtype=np.int) + for i in range(from_indices.nblocks): + ifrom = from_indices.block_indices(i) + el_indices[i] = np.unique(np.digitize(ifrom, from_el_ranges)) - 1 + el_ranges[el_indices[i]] = i + el_indices = np.hstack(el_indices) + + # construct lookup table + to_el_table = [np.full(g.nelements, -1, dtype=np.int) + for g in resampler.to_discr.groups] + + for igrp, grp in enumerate(resampler.groups): + for batch in grp.batches: + to_el_table[igrp][batch.to_element_indices.get(queue)] = \ + from_grp_ranges[igrp] + batch.from_element_indices.get(queue) + + # construct fine node index list + indices = [np.empty(0, dtype=np.int) + for _ in range(from_indices.nblocks)] + for igrp in range(len(resampler.groups)): + to_element_indices = \ + np.where(np.isin(to_el_table[igrp], el_indices))[0] + + for i, j in zip(el_ranges[to_el_table[igrp][to_element_indices]], + to_element_indices): + indices[i] = np.hstack([indices[i], + _element_node_range(resampler.to_discr.groups[igrp], j)]) + + ranges = to_device(queue, + np.cumsum([0] + [b.shape[0] for b in indices])) + indices = to_device(queue, np.hstack(indices)) + + return BlockIndexRanges(resampler.cl_context, + indices.with_queue(None), + ranges.with_queue(None)) + +# }}} + + +# {{{ proxy point generator + +def _generate_unit_sphere(ambient_dim, approx_npoints): + """Generate uniform points on a unit sphere. + + :arg ambient_dim: dimension of the ambient space. + :arg approx_npoints: approximate number of points to generate. If the + ambient space is 3D, this will not generate the exact number of points. + :return: array of shape ``(ambient_dim, npoints)``, where ``npoints`` + will not generally be the same as ``approx_npoints``. + """ + + if ambient_dim == 2: + t = np.linspace(0.0, 2.0 * np.pi, approx_npoints) + points = np.vstack([np.cos(t), np.sin(t)]) + elif ambient_dim == 3: + # https://www.cmu.edu/biolphys/deserno/pdf/sphere_equi.pdf + # code by Matt Wala from + # https://github.com/mattwala/gigaqbx-accuracy-experiments/blob/d56ed063ffd7843186f4fe05d2a5b5bfe6ef420c/translation_accuracy.py#L23 + a = 4.0 * np.pi / approx_npoints + m_theta = int(np.round(np.pi / np.sqrt(a))) + d_theta = np.pi / m_theta + d_phi = a / d_theta + + points = [] + for m in range(m_theta): + theta = np.pi * (m + 0.5) / m_theta + m_phi = int(np.round(2.0 * np.pi * np.sin(theta) / d_phi)) + + for n in range(m_phi): + phi = 2.0 * np.pi * n / m_phi + points.append(np.array([np.sin(theta) * np.cos(phi), + np.sin(theta) * np.sin(phi), + np.cos(theta)])) + + for i in range(ambient_dim): + for sign in [-1, 1]: + pole = np.zeros(ambient_dim) + pole[i] = sign + points.append(pole) + + points = np.array(points).T + else: + raise ValueError("ambient_dim > 3 not supported.") + + return points + + +class ProxyGenerator(object): + r""" + .. attribute:: ambient_dim + .. attribute:: nproxy + + Number of proxy points in a single proxy ball. 
+
+    .. attribute:: source
+
+        A :class:`pytential.qbx.QBXLayerPotentialSource`.
+
+    .. attribute:: ratio
+
+        A ratio used to compute the proxy ball radius. The radius
+        is computed in the :math:`\ell^2` norm, resulting in a circle or
+        sphere of proxy points. For QBX, we have two radii of interest
+        for a set of points: the radius :math:`r_{block}` of the
+        smallest ball containing all the points and the radius
+        :math:`r_{qbx}` of the smallest ball containing all the QBX
+        expansion balls in the block. If the ratio :math:`\theta \in
+        [0, 1]`, then the radius of the proxy ball is
+
+        .. math::
+
+            r = (1 - \theta) r_{block} + \theta r_{qbx}.
+
+        If the ratio :math:`\theta > 1`, then the radius is simply
+
+        .. math::
+
+            r = \theta r_{qbx}.
+
+    .. attribute:: ref_points
+
+        Reference points on a unit ball. Can be used to construct the points
+        of a proxy ball :math:`i` by translating them to ``center[i]`` and
+        scaling by ``radii[i]``, as obtained by :meth:`__call__`.
+
+    .. automethod:: __call__
+    """
+
+    def __init__(self, source, approx_nproxy=None, ratio=None):
+        self.source = source
+        self.ambient_dim = source.density_discr.ambient_dim
+        self.ratio = 1.1 if ratio is None else ratio
+
+        approx_nproxy = 32 if approx_nproxy is None else approx_nproxy
+        self.ref_points = \
+                _generate_unit_sphere(self.ambient_dim, approx_nproxy)
+
+    @property
+    def nproxy(self):
+        return self.ref_points.shape[1]
+
+    @memoize_method
+    def get_kernel(self):
+        if self.ratio < 1.0:
+            radius_expr = "(1.0 - {ratio}) * rblk + {ratio} * rqbx"
+        else:
+            radius_expr = "{ratio} * rqbx"
+        radius_expr = radius_expr.format(ratio=self.ratio)
+
+        # NOTE: centers of mass are computed using a second-order approximation
+        # that currently matches what is in `element_centers_of_mass`.
+        knl = lp.make_kernel([
+            "{[irange]: 0 <= irange < nranges}",
+            "{[i]: 0 <= i < npoints}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            ["""
+            for irange
+                <> ioffset = srcranges[irange]
+                <> npoints = srcranges[irange + 1] - srcranges[irange]
+
+                proxy_center[idim, irange] = 1.0 / npoints * \
+                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
+                        {{dup=idim:i}}
+
+                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         sources[idim, srcindices[i + ioffset]]) ** 2)))
+
+                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
+                        (proxy_center[idim, irange] -
+                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
+                         expansion_radii[srcindices[i + ioffset]])
+                <> rqbx = if(rqbx_ext < rqbx_int, rqbx_int, rqbx_ext)
+
+                proxy_radius[irange] = {radius_expr}
+            end
+            """.format(radius_expr=radius_expr)],
+            [
+                lp.GlobalArg("sources", None,
+                    shape=(self.ambient_dim, "nsources")),
+                lp.GlobalArg("center_int", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("center_ext", None,
+                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
+                lp.GlobalArg("proxy_center", None,
+                    shape=(self.ambient_dim, "nranges")),
+                lp.GlobalArg("proxy_radius", None,
+                    shape="nranges"),
+                lp.ValueArg("nsources", np.int),
+                "..."
+            ],
+            name="find_proxy_radii_knl",
+            assumptions="dim>=1 and nranges>=1",
+            fixed_parameters=dict(dim=self.ambient_dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        knl = lp.tag_inames(knl, "idim*:unr")
+
+        return knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+
+        return knl
+
+    def __call__(self, queue, indices, **kwargs):
+        """Generate proxy points for each given range of source points in
+        the discretization in :attr:`source`.
+
+        :arg queue: a :class:`pyopencl.CommandQueue`.
+        :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+
+        :return: a tuple of ``(proxies, pxyranges, pxycenters, pxyradii)``,
+            where each element is a :class:`pyopencl.array.Array`. The
+            sizes of the arrays are as follows: ``pxycenters`` is of size
+            ``(ambient_dim, nranges)``, ``pxyradii`` is of size
+            ``(nranges,)``, ``pxyranges`` is of size ``(nranges + 1,)``
+            and ``proxies`` is of size ``(ambient_dim, nranges * nproxy)``.
+            The proxy points in a range :math:`i` can be obtained by a slice
+            ``proxies[pxyranges[i]:pxyranges[i + 1]]`` and are all at a
+            distance ``pxyradii[i]`` from the range center ``pxycenters[i]``.
+        """
+
+        def _affine_map(v, A, b):
+            return np.dot(A, v) + b
+
+        from pytential.qbx.utils import get_centers_on_side
+
+        knl = self.get_kernel()
+        _, (centers_dev, radii_dev,) = knl(queue,
+            sources=self.source.density_discr.nodes(),
+            center_int=get_centers_on_side(self.source, -1),
+            center_ext=get_centers_on_side(self.source, +1),
+            expansion_radii=self.source._expansion_radii("nsources"),
+            srcindices=indices.indices,
+            srcranges=indices.ranges, **kwargs)
+        centers = centers_dev.get()
+        radii = radii_dev.get()
+
+        proxies = np.empty(indices.nblocks, dtype=np.object)
+        for i in range(indices.nblocks):
+            proxies[i] = _affine_map(self.ref_points,
+                    A=(radii[i] * np.eye(self.ambient_dim)),
+                    b=centers[:, i].reshape(-1, 1))
+
+        pxyranges = cl.array.arange(queue,
+                0,
+                proxies.shape[0] * proxies[0].shape[1] + 1,
+                proxies[0].shape[1],
+                dtype=indices.ranges.dtype)
+        proxies = make_obj_array([
+            cl.array.to_device(queue, np.hstack([p[idim] for p in proxies]))
+            for idim in range(self.ambient_dim)])
+        centers = make_obj_array([
+            centers_dev[idim].with_queue(queue).copy()
+            for idim in range(self.ambient_dim)])
+
+        assert pxyranges[-1] == proxies[0].shape[0]
+        return proxies, pxyranges, centers, radii_dev
+
+
+def gather_block_neighbor_points(discr, indices, pxycenters, pxyradii,
+        max_nodes_in_box=None):
+    """Generate a set of neighboring points for each range of points in
+    *discr*. Neighboring points of a range :math:`i` are defined
+    as all the points inside the proxy ball :math:`i` that do not also
+    belong to the range itself.
+
+    :arg discr: a :class:`meshmode.discretization.Discretization`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+    :arg pxycenters: an array containing the center of each proxy ball.
+    :arg pxyradii: an array containing the radius of each proxy ball.
+
+    :return: a :class:`sumpy.tools.BlockIndexRanges`.
+    """
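A usage sketch for :class:`ProxyGenerator`, mirroring how :func:`gather_block_interaction_points` below drives it; ``qbx`` is a :class:`~pytential.qbx.QBXLayerPotentialSource` and ``indices`` a :class:`~sumpy.tools.BlockIndexRanges` from one of the partitioners above, and the keyword values repeat the defaults::

    from pytential.linalg.proxy import ProxyGenerator

    generator = ProxyGenerator(qbx, approx_nproxy=32, ratio=1.1)
    with cl.CommandQueue(qbx.cl_context) as queue:
        proxies, pxyranges, pxycenters, pxyradii = generator(queue, indices)
        # proxy points of range i in dimension idim:
        #     proxies[idim][pxyranges[i]:pxyranges[i + 1]]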
+
+    if max_nodes_in_box is None:
+        # FIXME: this is a fairly arbitrary value
+        max_nodes_in_box = 32
+
+    with cl.CommandQueue(discr.cl_context) as queue:
+        indices = indices.get(queue)
+
+        # NOTE: this is constructed for multiple reasons:
+        #   * TreeBuilder takes object arrays
+        #   * `srcindices` can be a small subset of nodes, so this will save
+        #   some work
+        #   * `srcindices` may reorder the array returned by nodes(), so this
+        #   makes sure that we have the same order in tree.user_source_ids
+        #   and friends
+        sources = discr.nodes().get(queue)
+        sources = make_obj_array([
+            cl.array.to_device(queue, sources[idim, indices.indices])
+            for idim in range(discr.ambient_dim)])
+
+        # construct tree
+        from boxtree import TreeBuilder
+        builder = TreeBuilder(discr.cl_context)
+        tree, _ = builder(queue, sources,
+                max_particles_in_box=max_nodes_in_box)
+
+        from boxtree.area_query import AreaQueryBuilder
+        builder = AreaQueryBuilder(discr.cl_context)
+        query, _ = builder(queue, tree, pxycenters, pxyradii)
+
+        # find nodes inside each proxy ball
+        tree = tree.get(queue)
+        query = query.get(queue)
+
+        if isinstance(pxycenters[0], cl.array.Array):
+            pxycenters = np.vstack([pxycenters[idim].get(queue)
+                for idim in range(discr.ambient_dim)])
+        if isinstance(pxyradii, cl.array.Array):
+            pxyradii = pxyradii.get(queue)
+
+        nbrindices = np.empty(indices.nblocks, dtype=np.object)
+        for iproxy in range(indices.nblocks):
+            # get list of boxes intersecting the current ball
+            istart = query.leaves_near_ball_starts[iproxy]
+            iend = query.leaves_near_ball_starts[iproxy + 1]
+            iboxes = query.leaves_near_ball_lists[istart:iend]
+
+            # get nodes inside the boxes
+            istart = tree.box_source_starts[iboxes]
+            iend = istart + tree.box_source_counts_cumul[iboxes]
+            isources = np.hstack([np.arange(s, e)
+                                  for s, e in zip(istart, iend)])
+            nodes = np.vstack([tree.sources[idim][isources]
+                               for idim in range(discr.ambient_dim)])
+            isources = tree.user_source_ids[isources]
+
+            # get nodes inside the ball but outside the current range
+            center = pxycenters[:, iproxy].reshape(-1, 1)
+            radius = pxyradii[iproxy]
+            mask = ((la.norm(nodes - center, axis=0) < radius)
+                    & ((isources < indices.ranges[iproxy])
+                       | (indices.ranges[iproxy + 1] <= isources)))
+
+            nbrindices[iproxy] = indices.indices[isources[mask]]
+
+        nbrranges = to_device(queue,
+                np.cumsum([0] + [n.shape[0] for n in nbrindices]))
+        nbrindices = to_device(queue, np.hstack(nbrindices))
+
+    return BlockIndexRanges(discr.cl_context,
+                            nbrindices.with_queue(None),
+                            nbrranges.with_queue(None))
+
+
+def gather_block_interaction_points(source, indices,
+        ratio=None,
+        approx_nproxy=None,
+        max_nodes_in_box=None):
+    """Generate sets of interaction points for each given range of indices
+    in the *source* discretization. For each input range of indices,
+    the corresponding output range of points consists of:
+
+    - a set of proxy points (or balls) around the range, which
+      model farfield interactions. These are constructed using
+      :class:`ProxyGenerator`.
+
+    - a set of neighboring points that are inside the proxy balls, but
+      do not belong to the given range, which model nearby interactions.
+      These are constructed with :func:`gather_block_neighbor_points`.
+
+    :arg source: a :class:`pytential.qbx.QBXLayerPotentialSource`.
+    :arg indices: a :class:`sumpy.tools.BlockIndexRanges`.
+
+    :return: a tuple ``(nodes, ranges)``, where each value is a
+        :class:`pyopencl.array.Array`. For a range :math:`i`, we can
+        get the slice using ``nodes[ranges[i]:ranges[i + 1]]``.
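A sketch of this top-level entry point, with ``qbx`` and ``indices`` as before and all keyword arguments left at their defaults::

    from pytential.linalg.proxy import gather_block_interaction_points

    nodes, ranges = gather_block_interaction_points(qbx, indices)
    # interaction points (proxies plus in-ball neighbors) of range i:
    #     nodes[:, ranges[i]:ranges[i + 1]]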
+ """ + + @memoize + def knl(): + loopy_knl = lp.make_kernel([ + "{[irange, idim]: 0 <= irange < nranges and \ + 0 <= idim < dim}", + "{[ipxy, ingb]: 0 <= ipxy < npxyblock and \ + 0 <= ingb < nngbblock}" + ], + """ + for irange + <> pxystart = pxyranges[irange] + <> pxyend = pxyranges[irange + 1] + <> npxyblock = pxyend - pxystart + + <> ngbstart = nbrranges[irange] + <> ngbend = nbrranges[irange + 1] + <> nngbblock = ngbend - ngbstart + + <> istart = pxyranges[irange] + nbrranges[irange] + nodes[idim, istart + ipxy] = \ + proxies[idim, pxystart + ipxy] \ + {id_prefix=write_pxy,nosync=write_ngb} + nodes[idim, istart + npxyblock + ingb] = \ + sources[idim, nbrindices[ngbstart + ingb]] \ + {id_prefix=write_ngb,nosync=write_pxy} + ranges[irange + 1] = ranges[irange] + npxyblock + nngbblock + end + """, + [ + lp.GlobalArg("sources", None, + shape=(source.ambient_dim, "nsources")), + lp.GlobalArg("proxies", None, + shape=(source.ambient_dim, "nproxies"), dim_tags="sep,C"), + lp.GlobalArg("nbrindices", None, + shape="nnbrindices"), + lp.GlobalArg("nodes", None, + shape=(source.ambient_dim, "nproxies + nnbrindices")), + lp.ValueArg("nsources", np.int), + lp.ValueArg("nproxies", np.int), + lp.ValueArg("nnbrindices", np.int), + "..." + ], + name="concat_proxy_and_neighbors", + default_offset=lp.auto, + silenced_warnings="write_race(write_*)", + fixed_parameters=dict(dim=source.ambient_dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + + return loopy_knl + + with cl.CommandQueue(source.cl_context) as queue: + generator = ProxyGenerator(source, + ratio=ratio, + approx_nproxy=approx_nproxy) + proxies, pxyranges, pxycenters, pxyradii = generator(queue, indices) + + neighbors = gather_block_neighbor_points(source.density_discr, + indices, pxycenters, pxyradii, + max_nodes_in_box=max_nodes_in_box) + + ranges = cl.array.zeros(queue, indices.nblocks + 1, dtype=np.int) + _, (nodes, ranges) = knl()(queue, + sources=source.density_discr.nodes(), + proxies=proxies, + pxyranges=pxyranges, + nbrindices=neighbors.indices, + nbrranges=neighbors.ranges, + ranges=ranges) + + return nodes.with_queue(None), ranges.with_queue(None) + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index c3ca7d95fc58877182743fb6760b1de04b65d94a..188660f72b642ca67d0882a9e92b25e211e717fa 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -78,10 +78,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _expansions_in_tree_have_extent=True, _expansion_stick_out_factor=0.5, _well_sep_is_n_away=2, - _max_leaf_refine_weight=32, + _max_leaf_refine_weight=None, _box_extent_norm=None, _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, + _tree_kind="adaptive", geometry_data_inspector=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): @@ -105,6 +106,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ argument processing + if fine_order is None: + raise ValueError("fine_order must be provided.") + + if qbx_order is None: + raise ValueError("qbx_order must be provided.") + if target_stick_out_factor is not _not_provided: from warnings import warn warn("target_stick_out_factor has been renamed to " @@ -131,6 +138,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): if _box_extent_norm is None: _box_extent_norm = "l2" + if _from_sep_smaller_crit is None: + # This seems to win no 
matter what the box extent norm is + # https://gitlab.tiker.net/papers/2017-qbx-fmm-3d/issues/10 + _from_sep_smaller_crit = "precise_linf" + if fmm_level_to_order is None: if fmm_order is False: fmm_level_to_order = False @@ -138,6 +150,18 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): def fmm_level_to_order(kernel, kernel_args, tree, level): return fmm_order + if _max_leaf_refine_weight is None: + if density_discr.ambient_dim == 2: + # FIXME: This should be verified now that l^2 is the default. + _max_leaf_refine_weight = 64 + elif density_discr.ambient_dim == 3: + # For static_linf/linf: https://gitlab.tiker.net/papers/2017-qbx-fmm-3d/issues/8#note_25009 # noqa + # For static_l2/l2: https://gitlab.tiker.net/papers/2017-qbx-fmm-3d/issues/12 # noqa + _max_leaf_refine_weight = 512 + else: + # Just guessing... + _max_leaf_refine_weight = 64 + if _from_sep_smaller_min_nsources_cumul is None: # See here for the comment thread that led to these defaults: # https://gitlab.tiker.net/inducer/boxtree/merge_requests/28#note_18661 @@ -177,6 +201,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_crit = _from_sep_smaller_crit self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul + self._tree_kind = _tree_kind self.geometry_data_inspector = geometry_data_inspector # /!\ *All* parameters set here must also be set by copy() below, @@ -189,12 +214,18 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): density_discr=None, fine_order=None, qbx_order=None, + fmm_order=_not_provided, fmm_level_to_order=_not_provided, to_refined_connection=None, target_association_tolerance=_not_provided, _expansions_in_tree_have_extent=_not_provided, _expansion_stick_out_factor=_not_provided, + _max_leaf_refine_weight=None, + _box_extent_norm=None, + _from_sep_smaller_crit=None, + _tree_kind=None, geometry_data_inspector=None, + fmm_backend=None, debug=_not_provided, _refined_for_global_qbx=_not_provided, @@ -224,6 +255,18 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # }}} + kwargs = {} + + if (fmm_order is not _not_provided + and fmm_level_to_order is not _not_provided): + raise TypeError("may not specify both fmm_order and fmm_level_to_order") + elif fmm_order is not _not_provided: + kwargs["fmm_order"] = fmm_order + elif fmm_level_to_order is not _not_provided: + kwargs["fmm_level_to_order"] = fmm_level_to_order + else: + kwargs["fmm_level_to_order"] = self.fmm_level_to_order + # FIXME Could/should share wrangler and geometry kernels # if no relevant changes have been made. 
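+        # Usage sketch (values hypothetical): lpot.copy(fmm_order=10) and
+        # lpot.copy(fmm_level_to_order=lambda knl, args, tree, lev: 3) each
+        # work on their own; passing both raises TypeError, and passing
+        # neither carries self.fmm_level_to_order over unchanged.
+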
        return QBXLayerPotentialSource(
@@ -231,11 +274,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                fine_order=(
                    fine_order if fine_order is not None else self.fine_order),
                qbx_order=qbx_order if qbx_order is not None else self.qbx_order,
-                fmm_level_to_order=(
-                    # False is a valid value here
-                    fmm_level_to_order
-                    if fmm_level_to_order is not _not_provided
-                    else self.fmm_level_to_order),

                target_association_tolerance=target_association_tolerance,

                to_refined_connection=(
@@ -260,15 +298,18 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                    if _expansion_stick_out_factor is not _not_provided
                    else self._expansion_stick_out_factor),
                _well_sep_is_n_away=self._well_sep_is_n_away,
-                _max_leaf_refine_weight=self._max_leaf_refine_weight,
-                _box_extent_norm=self._box_extent_norm,
-                _from_sep_smaller_crit=self._from_sep_smaller_crit,
+                _max_leaf_refine_weight=(
+                    _max_leaf_refine_weight or self._max_leaf_refine_weight),
+                _box_extent_norm=(_box_extent_norm or self._box_extent_norm),
+                _from_sep_smaller_crit=(
+                    _from_sep_smaller_crit or self._from_sep_smaller_crit),
                _from_sep_smaller_min_nsources_cumul=(
                    self._from_sep_smaller_min_nsources_cumul),
+                _tree_kind=_tree_kind or self._tree_kind,
                geometry_data_inspector=(
                    geometry_data_inspector or self.geometry_data_inspector),
-                fmm_backend=self.fmm_backend,
-                )
+                fmm_backend=fmm_backend or self.fmm_backend,
+                **kwargs)

    # }}}

@@ -340,6 +381,28 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):

        return conn

+    @property
+    @memoize_method
+    def direct_resampler(self):
+        """
+        .. warning::
+
+            This always returns a
+            :class:`~meshmode.discretization.connection.DirectDiscretizationConnection`.
+            In case the geometry has been refined multiple times, a direct
+            connection can have a large number of groups and/or
+            interpolation batches, making it scale significantly worse than
+            the one returned by :attr:`resampler`.
+        """
+        from meshmode.discretization.connection import \
+                flatten_chained_connection
+
+        conn = self.resampler
+        with cl.CommandQueue(self.cl_context) as queue:
+            conn = flatten_chained_connection(queue, conn)
+
+        return conn
+
    @property
    @memoize_method
    def tree_code_container(self):
@@ -361,8 +424,17 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):

    @memoize_method
    def with_refinement(self, target_order=None, kernel_length_scale=None,
-            maxiter=None, visualize=False, _expansion_disturbance_tolerance=None):
+            maxiter=None, visualize=False, refiner=None,
+            _expansion_disturbance_tolerance=None,
+            _force_stage2_uniform_refinement_rounds=None,
+            _scaled_max_curvature_threshold=None):
        """
+        :arg refiner: If the mesh underlying :attr:`density_discr`
+            is itself the result of refinement, then its
+            :class:`meshmode.mesh.refinement.Refiner` instance may need to
+            be reused for continued refinement. This argument
+            provides the opportunity to pass in an existing refiner
+            that should be used for continued refinement.
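+
+        A usage sketch for refiner reuse (names hypothetical)::
+
+            from meshmode.mesh.refinement import RefinerWithoutAdjacency
+            refiner = RefinerWithoutAdjacency(lpot.density_discr.mesh)
+            lpot, conn = lpot.with_refinement(refiner=refiner)
+            # a later, continued round reuses the same refiner
+            lpot, conn2 = lpot.with_refinement(refiner=refiner)
+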
        :returns: a tuple ``(lpot_src, cnx)``, where ``lpot_src`` is a
            :class:`QBXLayerPotentialSource` and ``cnx`` is a
            :class:`meshmode.discretization.connection.DiscretizationConnection`
@@ -383,7 +455,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                InterpolatoryQuadratureSimplexGroupFactory(target_order),
                kernel_length_scale=kernel_length_scale,
                maxiter=maxiter, visualize=visualize,
-                expansion_disturbance_tolerance=_expansion_disturbance_tolerance)
+                expansion_disturbance_tolerance=_expansion_disturbance_tolerance,
+                force_stage2_uniform_refinement_rounds=(
+                    _force_stage2_uniform_refinement_rounds),
+                scaled_max_curvature_threshold=(
+                    _scaled_max_curvature_threshold),
+                refiner=refiner)

        return lpot, connection

@@ -391,8 +468,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
    @memoize_method
    def h_max(self):
        with cl.CommandQueue(self.cl_context) as queue:
-            panel_sizes = self._panel_sizes("npanels").with_queue(queue)
-            return np.asscalar(cl.array.max(panel_sizes).get())
+            quad_res = self._coarsest_quad_resolution("npanels").with_queue(queue)
+            return np.asscalar(cl.array.max(quad_res).get())

    # {{{ internal API

    @memoize_method
@@ -402,44 +479,98 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
        return utils.element_centers_of_mass(self.density_discr)

    @memoize_method
-    def _fine_panel_centers_of_mass(self):
+    def _stage2_panel_centers_of_mass(self):
        import pytential.qbx.utils as utils
        return utils.element_centers_of_mass(self.stage2_density_discr)

+    def _dim_fudge_factor(self):
+        if self.density_discr.dim == 2:
+            return 0.5
+        else:
+            return 1
+
    @memoize_method
    def _expansion_radii(self, last_dim_length):
-        if last_dim_length == "npanels":
-            # FIXME: Make this an error
-
-            from warnings import warn
-            warn("Passing 'npanels' as last_dim_length to _expansion_radii is "
-                    "deprecated. Expansion radii should be allowed to vary "
-                    "within a panel.", stacklevel=3)
-
        with cl.CommandQueue(self.cl_context) as queue:
-            return (self._panel_sizes(last_dim_length).with_queue(queue) * 0.5
-                    ).with_queue(None)
+            return (self._coarsest_quad_resolution(last_dim_length)
+                    .with_queue(queue)
+                    * 0.5 * self._dim_fudge_factor()).with_queue(None)

    # _expansion_radii should not be needed for the fine discretization

+    @memoize_method
+    def _source_danger_zone_radii(self, last_dim_length="npanels"):
+        # This should be the expression of the expansion radii, but
+        #
+        # - in reference to the stage 2 discretization
+        # - multiplied by 0.75 because
+        #
+        #   - Setting this equal to the expansion radii ensures that *every*
+        #     stage 2 element will be refined, which is wasteful.
+        #     (so this needs to be smaller than that)
+        #
+
+        #   - Setting this equal to half the expansion radius will not provide
+        #     a refinement 'buffer layer' at a 2x coarsening fringe.
+
+        with cl.CommandQueue(self.cl_context) as queue:
+            return (
+                    (self._stage2_coarsest_quad_resolution(last_dim_length)
+                        .with_queue(queue))
+                    * 0.5 * 0.75 * self._dim_fudge_factor()).with_queue(None)
+
    @memoize_method
    def _close_target_tunnel_radius(self, last_dim_length):
        with cl.CommandQueue(self.cl_context) as queue:
-            return (self._panel_sizes(last_dim_length).with_queue(queue) * 0.5
+            return (
+                    self._expansion_radii(last_dim_length).with_queue(queue)
+                    * 0.5
                    ).with_queue(None)

    @memoize_method
-    def _panel_sizes(self, last_dim_length="npanels"):
+    def _coarsest_quad_resolution(self, last_dim_length="npanels"):
+        """This measures the quadrature resolution across the
+        mesh.
In a 1D uniform mesh of uniform 'parametrization speed', it + should be the same as the panel length. + """ import pytential.qbx.utils as utils - return utils.panel_sizes(self.density_discr, last_dim_length) + from pytential import sym, bind + with cl.CommandQueue(self.cl_context) as queue: + maxstretch = bind( + self, + sym._simplex_mapping_max_stretch_factor( + self.ambient_dim) + )(queue) + + maxstretch = utils.to_last_dim_length( + self.density_discr, maxstretch, last_dim_length) + maxstretch = maxstretch.with_queue(None) + + return maxstretch @memoize_method - def _fine_panel_sizes(self, last_dim_length="npanels"): + def _stage2_coarsest_quad_resolution(self, last_dim_length="npanels"): + """This measures the quadrature resolution across the + mesh. In a 1D uniform mesh of uniform 'parametrization speed', it + should be the same as the panel length. + """ if last_dim_length != "npanels": + # Not technically required below, but no need to loosen for now. raise NotImplementedError() import pytential.qbx.utils as utils - return utils.panel_sizes(self.stage2_density_discr, last_dim_length) + from pytential import sym, bind + with cl.CommandQueue(self.cl_context) as queue: + maxstretch = bind( + self, sym._simplex_mapping_max_stretch_factor( + self.ambient_dim, + where=sym.QBXSourceStage2(sym.DEFAULT_SOURCE)) + )(queue) + maxstretch = utils.to_last_dim_length( + self.stage2_density_discr, maxstretch, last_dim_length) + maxstretch = maxstretch.with_queue(None) + + return maxstretch @memoize_method def qbx_fmm_geometry_data(self, target_discrs_and_qbx_sides): @@ -457,6 +588,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): return QBXFMMGeometryData(self.qbx_fmm_code_getter, self, target_discrs_and_qbx_sides, target_association_tolerance=self.target_association_tolerance, + tree_kind=self._tree_kind, debug=self.debug) # }}} @@ -486,8 +618,13 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): from pytools.obj_array import with_object_array_or_scalar - from functools import partial - oversample = partial(self.resampler, queue) + + def oversample_nonscalars(vec): + from numbers import Number + if isinstance(vec, Number): + return vec + else: + return self.resampler(queue, vec) if not self._refined_for_global_qbx: from warnings import warn @@ -497,7 +634,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): def evaluate_wrapper(expr): value = evaluate(expr) - return with_object_array_or_scalar(oversample, value) + return with_object_array_or_scalar(oversample_nonscalars, value) if self.fmm_level_to_order is False: func = self.exec_compute_potential_insn_direct @@ -579,6 +716,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides) + # geo_data.plot() + # FIXME Exert more positive control over geo_data attribute lifetimes using # geo_data..clear_cache(geo_data). 
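A summary sketch of the radius bookkeeping in this file, with the 0.5 and
0.75 factors and the dimension fudge factor exactly as above; the function
names are hypothetical and ``quad_res`` stands in for the
``_coarsest_quad_resolution`` measure:

    def expansion_radii(quad_res, dim_fudge):
        # stage-1 expansion ball radius per entry
        return 0.5 * dim_fudge * quad_res

    def source_danger_zone_radii(stage2_quad_res, dim_fudge):
        # 0.75 leaves a refinement buffer layer without forcing
        # *every* stage-2 element to be refined
        return 0.75 * 0.5 * dim_fudge * stage2_quad_res

    def close_target_tunnel_radius(quad_res, dim_fudge):
        # half the expansion radius
        return 0.5 * expansion_radii(quad_res, dim_fudge)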
@@ -618,7 +757,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): if self.geometry_data_inspector is not None: perform_fmm = self.geometry_data_inspector(insn, bound_expr, geo_data) if not perform_fmm: - return [(o.name, 0) for o in insn.outputs], [] + return [(o.name, 0) for o in insn.outputs] # }}} @@ -641,7 +780,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): (o.name, all_potentials_on_every_tgt[o.kernel_index][tgt_slice])) - return result, [] + return result # }}} @@ -802,7 +941,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + return result # }}} @@ -812,8 +951,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): __all__ = ( - QBXLayerPotentialSource, - QBXTargetAssociationFailedException, + "QBXLayerPotentialSource", + "QBXTargetAssociationFailedException", ) # vim: fdm=marker diff --git a/pytential/qbx/direct.py b/pytential/qbx/direct.py index 6dc5cd9abbb7319d0cd7a4029a3a2b22b6a710e5..496259c921f7e6a6fdba59ceb42eda48a805bcd1 100644 --- a/pytential/qbx/direct.py +++ b/pytential/qbx/direct.py @@ -25,62 +25,97 @@ THE SOFTWARE. import loopy as lp import numpy as np -from sumpy.qbx import LayerPotential as LayerPotentialBase +from sumpy.qbx import LayerPotentialBase + +from pytential.version import PYTENTIAL_KERNEL_VERSION # {{{ qbx applier on a target/center subset class LayerPotentialOnTargetAndCenterSubset(LayerPotentialBase): - def get_compute_a_and_b_vecs(self): - return """ - <> icenter = qbx_center_numbers[itgt] - <> itgt_overall = qbx_tgt_numbers[itgt] - for idim - <> a[idim] = center[idim,icenter] - src[idim,isrc] {id=compute_a} - <> b[idim] = tgt[idim,itgt_overall] - center[idim,icenter] \ - {id=compute_b} - <> rscale = expansion_radii[icenter] - end - """ - - def get_src_tgt_arguments(self): - return [ + default_name = "qbx_tgt_ctr_subset" + + def get_cache_key(self): + return super(LayerPotentialOnTargetAndCenterSubset, self).get_cache_key() + ( + PYTENTIAL_KERNEL_VERSION,) + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + + from sumpy.tools import gather_loopy_source_arguments + arguments = ( + gather_loopy_source_arguments(self.kernels) + + [ lp.GlobalArg("src", None, shape=(self.dim, "nsources"), order="C"), lp.GlobalArg("tgt", None, shape=(self.dim, "ntargets_total"), order="C"), lp.GlobalArg("center", None, - shape=(self.dim, "ncenters_total"), order="C"), - lp.GlobalArg("expansion_radii", None, shape="ncenters_total"), - lp.GlobalArg("qbx_tgt_numbers", None, shape="ntargets"), - lp.GlobalArg("qbx_center_numbers", None, shape="ntargets"), + shape=(self.dim, "ncenters_total"), dim_tags="sep,C"), + lp.GlobalArg("expansion_radii", None, + shape="ncenters_total"), + lp.GlobalArg("qbx_tgt_numbers", None, + shape="ntargets"), + lp.GlobalArg("qbx_center_numbers", None, + shape="ntargets"), lp.ValueArg("nsources", np.int32), lp.ValueArg("ntargets", np.int32), lp.ValueArg("ntargets_total", np.int32), - lp.ValueArg("ncenters_total", np.int32), - ] - - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in range(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, None, shape="ntargets_total", - order="C") - for i in range(len(self.kernels)) - ] - - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[itgt_overall] = \ - 
knl_KNLIDX_scaling*simul_reduce(\ - sum, isrc, pair_result_KNLIDX) {inames=itgt} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + lp.ValueArg("ncenters_total", np.int32)] + + [lp.GlobalArg("strength_%d" % i, None, + shape="nsources", order="C") + for i in range(self.strength_count)] + + [lp.GlobalArg("result_%d" % i, self.value_dtypes[i], + shape="ntargets_total", order="C") + for i in range(len(self.kernels))]) -# }}} + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + [""" + <> icenter = qbx_center_numbers[itgt] + <> itgt_overall = qbx_tgt_numbers[itgt] + + <> a[idim] = center[idim, icenter] - src[idim, isrc] \ + {dup=idim} + <> b[idim] = tgt[idim, itgt_overall] - center[idim, icenter] \ + {dup=idim} + <> rscale = expansion_radii[icenter] + """] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt_overall] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + fixed_parameters=dict(dim=self.dim)) + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl + + def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, + **kwargs): + knl = self.get_cached_optimized_kernel() + + for i, dens in enumerate(strengths): + kwargs["strength_%d" % i] = dens + + return knl(queue, src=sources, tgt=targets, center=centers, + expansion_radii=expansion_radii, **kwargs) + +# }}} # vim: foldmethod=marker diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index a5292fdede8bab1f61e0df97cf75e19b7cd63d8f..badf630046b239303344e1b1a3664e706a12a0f4 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -28,11 +28,14 @@ import numpy as np # noqa import pyopencl as cl # noqa import pyopencl.array # noqa from sumpy.fmm import (SumpyExpansionWranglerCodeContainer, - SumpyExpansionWrangler, level_to_rscale) + SumpyExpansionWrangler, level_to_rscale, SumpyTimingFuture) from pytools import memoize_method from pytential.qbx.interactions import P2QBXLFromCSR, M2QBXL, L2QBXL, QBXL2P +from boxtree.fmm import TimingRecorder +from pytools import log_process, ProcessLogger + import logging logger = logging.getLogger(__name__) @@ -192,12 +195,14 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ qbx-related + @log_process(logger) def form_global_qbx_locals(self, src_weights): local_exps = self.qbx_local_expansion_zeros() + events = [] geo_data = self.geo_data if len(geo_data.global_qbx_centers()) == 0: - return local_exps + return (local_exps, SumpyTimingFuture(self.queue, events)) traversal = geo_data.traversal() @@ -223,17 +228,20 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), **kwargs) + events.append(evt) assert local_exps is result result.add_event(evt) - return result + return (result, SumpyTimingFuture(self.queue, events)) + @log_process(logger) def translate_box_multipoles_to_qbx_local(self, multipole_exps): qbx_expansions = self.qbx_local_expansion_zeros() + events = [] geo_data = self.geo_data if geo_data.ncenters == 0: - return qbx_expansions + return (qbx_expansions, SumpyTimingFuture(self.queue, events)) traversal = geo_data.traversal() @@ -248,7 +256,9 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), 
self.multipole_expansions_view(multipole_exps, isrc_level) evt, (qbx_expansions_res,) = m2qbxl(self.queue, - qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), + qbx_center_to_target_box_source_level=( + geo_data.qbx_center_to_target_box_source_level(isrc_level) + ), centers=self.tree.box_centers, qbx_centers=geo_data.centers(), @@ -267,19 +277,24 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), **self.kernel_extra_kwargs) + events.append(evt) wait_for = [evt] assert qbx_expansions_res is qbx_expansions qbx_expansions.add_event(evt) - return qbx_expansions + return (qbx_expansions, SumpyTimingFuture(self.queue, events)) + @log_process(logger) def translate_box_local_to_qbx_local(self, local_exps): qbx_expansions = self.qbx_local_expansion_zeros() geo_data = self.geo_data + events = [] + if geo_data.ncenters == 0: - return qbx_expansions + return (qbx_expansions, SumpyTimingFuture(self.queue, events)) + trav = geo_data.traversal() wait_for = local_exps.events @@ -311,19 +326,23 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), **self.kernel_extra_kwargs) + events.append(evt) wait_for = [evt] assert qbx_expansions_res is qbx_expansions qbx_expansions.add_event(evt) - return qbx_expansions + return (qbx_expansions, SumpyTimingFuture(self.queue, events)) + @log_process(logger) def eval_qbx_expansions(self, qbx_expansions): pot = self.full_output_zeros() geo_data = self.geo_data + events = [] + if len(geo_data.global_qbx_centers()) == 0: - return pot + return (pot, SumpyTimingFuture(self.queue, events)) ctt = geo_data.center_to_tree_targets() @@ -348,7 +367,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - return pot + return (pot, SumpyTimingFuture(self.queue, events)) # }}} @@ -357,7 +376,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level -def drive_fmm(expansion_wrangler, src_weights): +def drive_fmm(expansion_wrangler, src_weights, timing_data=None): """Top-level driver routine for the QBX fast multipole calculation. :arg geo_data: A :class:`QBXFMMGeometryData` instance. @@ -365,6 +384,8 @@ def drive_fmm(expansion_wrangler, src_weights): :class:`ExpansionWranglerInterface`. :arg src_weights: Source 'density/weights/charges'. Passed unmodified to *expansion_wrangler*. + :arg timing_data: Either *None* or a dictionary that collects + timing data. Returns the potentials computed by *expansion_wrangler*. @@ -375,73 +396,76 @@ def drive_fmm(expansion_wrangler, src_weights): geo_data = wrangler.geo_data traversal = geo_data.traversal() tree = traversal.tree + recorder = TimingRecorder() # Interface guidelines: Attributes of the tree are assumed to be known # to the expansion wrangler and should not be passed. 
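+    # Usage sketch for the timing interface added here (caller shown is
+    # hypothetical; `wrangler` is assumed to be constructed elsewhere):
+    #
+    #     timing_data = {}
+    #     pot = drive_fmm(wrangler, src_weights, timing_data=timing_data)
+    #     # timing_data now maps stage names such as "form_multipoles" or
+    #     # "eval_qbx_expansions" to the recorder's summarized results.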
- from time import time - start_time = time() - logger.info("start qbx fmm") + fmm_proc = ProcessLogger(logger, "qbx fmm") - logger.info("reorder source weights") src_weights = wrangler.reorder_sources(src_weights) # {{{ construct local multipoles - logger.info("construct local multipoles") - mpole_exps = wrangler.form_multipoles( + mpole_exps, timing_future = wrangler.form_multipoles( traversal.level_start_source_box_nrs, traversal.source_boxes, src_weights) + recorder.add("form_multipoles", timing_future) + # }}} # {{{ propagate multipoles upward - logger.info("propagate multipoles upward") - wrangler.coarsen_multipoles( + mpole_exps, timing_future = wrangler.coarsen_multipoles( traversal.level_start_source_parent_box_nrs, traversal.source_parent_boxes, mpole_exps) + recorder.add("coarsen_multipoles", timing_future) + # }}} # {{{ direct evaluation from neighbor source boxes ("list 1") - logger.info("direct evaluation from neighbor source boxes ('list 1')") - non_qbx_potentials = wrangler.eval_direct( + non_qbx_potentials, timing_future = wrangler.eval_direct( traversal.target_boxes, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, src_weights) + recorder.add("eval_direct", timing_future) + # }}} # {{{ translate separated siblings' ("list 2") mpoles to local - logger.info("translate separated siblings' ('list 2') mpoles to local") - local_exps = wrangler.multipole_to_local( + local_exps, timing_future = wrangler.multipole_to_local( traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_siblings_starts, traversal.from_sep_siblings_lists, mpole_exps) + recorder.add("multipole_to_local", timing_future) + # }}} # {{{ evaluate sep. smaller mpoles ("list 3") at particles - logger.info("evaluate sep. 
smaller mpoles at particles ('list 3 far')") - # (the point of aiming this stage at particles is specifically to keep its # contribution *out* of the downward-propagating local expansions) - non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( - traversal.level_start_target_box_nrs, - traversal.target_boxes, + mpole_result, timing_future = wrangler.eval_multipoles( + traversal.target_boxes_sep_smaller_by_source_level, traversal.from_sep_smaller_by_level, mpole_exps) + recorder.add("eval_multipoles", timing_future) + + non_qbx_potentials = non_qbx_potentials + mpole_result + # assert that list 3 close has been merged into list 1 assert traversal.from_sep_close_smaller_starts is None @@ -449,15 +473,17 @@ def drive_fmm(expansion_wrangler, src_weights): # {{{ form locals for separated bigger source boxes ("list 4") - logger.info("form locals for separated bigger source boxes ('list 4 far')") - - local_exps = local_exps + wrangler.form_locals( + local_result, timing_future = wrangler.form_locals( traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists, src_weights) + recorder.add("form_locals", timing_future) + + local_exps = local_exps + local_result + # assert that list 4 close has been merged into list 1 assert traversal.from_sep_close_bigger_starts is None @@ -465,48 +491,56 @@ def drive_fmm(expansion_wrangler, src_weights): # {{{ propagate local_exps downward - logger.info("propagate local_exps downward") - wrangler.refine_locals( + local_exps, timing_future = wrangler.refine_locals( traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, local_exps) + recorder.add("refine_locals", timing_future) + # }}} # {{{ evaluate locals - logger.info("evaluate locals") - non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( + local_result, timing_future = wrangler.eval_locals( traversal.level_start_target_box_nrs, traversal.target_boxes, local_exps) + recorder.add("eval_locals", timing_future) + + non_qbx_potentials = non_qbx_potentials + local_result + # }}} # {{{ wrangle qbx expansions - logger.info("form global qbx expansions from list 1") - qbx_expansions = wrangler.form_global_qbx_locals(src_weights) + qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights) + + recorder.add("form_global_qbx_locals", timing_future) + + local_result, timing_future = ( + wrangler.translate_box_multipoles_to_qbx_local(mpole_exps)) + + recorder.add("translate_box_multipoles_to_qbx_local", timing_future) - logger.info("translate from list 3 multipoles to qbx local expansions") - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) + qbx_expansions = qbx_expansions + local_result - logger.info("translate from box local expansions to contained " - "qbx local expansions") - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_local_to_qbx_local(local_exps) + local_result, timing_future = ( + wrangler.translate_box_local_to_qbx_local(local_exps)) - logger.info("evaluate qbx local expansions") - qbx_potentials = wrangler.eval_qbx_expansions( - qbx_expansions) + recorder.add("translate_box_local_to_qbx_local", timing_future) + + qbx_expansions = qbx_expansions + local_result + + qbx_potentials, timing_future = wrangler.eval_qbx_expansions(qbx_expansions) + + recorder.add("eval_qbx_expansions", timing_future) # }}} # {{{ reorder potentials - logger.info("reorder potentials") - nqbtl = 
geo_data.non_qbx_box_target_lists() all_potentials_in_tree_order = wrangler.full_output_zeros() @@ -527,7 +561,10 @@ def drive_fmm(expansion_wrangler, src_weights): # }}} - logger.info("qbx fmm complete in %.2f s" % (time() - start_time)) + fmm_proc.done() + + if timing_data is not None: + timing_data.update(recorder.summarize()) return result @@ -538,6 +575,7 @@ def drive_fmm(expansion_wrangler, src_weights): def assemble_performance_data(geo_data, uses_pde_expansions, translation_source_power=None, translation_target_power=None, + translation_max_power=None, summarize_parallel=None, merge_close_lists=True): """ :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM @@ -586,10 +624,13 @@ def assemble_performance_data(geo_data, uses_pde_expansions, if d == 2: default_translation_source_power = 1 default_translation_target_power = 1 + default_translation_max_power = 0 elif d == 3: - default_translation_source_power = 2 - default_translation_target_power = 1 + # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. + default_translation_source_power = 0 + default_translation_target_power = 0 + default_translation_max_power = 3 else: raise ValueError("Don't know how to estimate expansion complexities " @@ -605,11 +646,16 @@ def assemble_performance_data(geo_data, uses_pde_expansions, translation_source_power = default_translation_source_power if translation_target_power is None: translation_target_power = default_translation_target_power + if translation_max_power is None: + translation_max_power = default_translation_max_power def xlat_cost(p_source, p_target): + from pymbolic.primitives import Max return ( p_source ** translation_source_power - * p_target ** translation_target_power) + * p_target ** translation_target_power + * Max((p_source, p_target)) ** translation_max_power + ) result.update( nlevels=tree.nlevels, @@ -717,13 +763,15 @@ def assemble_performance_data(geo_data, uses_pde_expansions, assert tree.nlevels == len(traversal.from_sep_smaller_by_level) - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] - - for ilevel, sep_smaller_list in enumerate( - traversal.from_sep_smaller_by_level): + for ilevel, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): + for itgt_box, tgt_ibox in enumerate( + traversal.target_boxes_sep_smaller_by_source_level[ilevel]): + ntargets = box_target_counts_nonchild[tgt_ibox] start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] - nmp_eval[ilevel, itgt_box] += ntargets * (end-start) + nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = ( + ntargets * (end-start) + ) result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm) @@ -763,15 +811,27 @@ def assemble_performance_data(geo_data, uses_pde_expansions, # {{{ evaluate locals - result["eval_part"] = tree.ntargets * ncoeffs_fmm + non_qbx_box_targets = geo_data.non_qbx_box_target_lists() + result["eval_part"] = non_qbx_box_targets.nfiltered_targets * ncoeffs_fmm # }}} # {{{ form global qbx locals global_qbx_centers = geo_data.global_qbx_centers() + + # If merge_close_lists is False above, then this builds another traversal + # (which is OK). 
qbx_center_to_target_box = geo_data.qbx_center_to_target_box() center_to_targets_starts = geo_data.center_to_tree_targets().starts + qbx_center_to_target_box_source_level = np.empty( + (tree.nlevels,), dtype=object + ) + + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + geo_data.qbx_center_to_target_box_source_level(src_level) + ) with cl.CommandQueue(geo_data.cl_context) as queue: global_qbx_centers = global_qbx_centers.get( @@ -780,6 +840,10 @@ def assemble_performance_data(geo_data, uses_pde_expansions, queue=queue) center_to_targets_starts = center_to_targets_starts.get( queue=queue) + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + qbx_center_to_target_box_source_level[src_level].get(queue=queue) + ) def process_form_qbxl(): ncenters = geo_data.ncenters @@ -856,8 +920,13 @@ def assemble_performance_data(geo_data, uses_pde_expansions, assert tree.nlevels == len(traversal.from_sep_smaller_by_level) for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - icontaining_tgt_box = qbx_center_to_target_box[tgt_icenter] + icontaining_tgt_box = qbx_center_to_target_box_source_level[ + isrc_level][tgt_icenter] + + if icontaining_tgt_box == -1: + continue start, stop = ( ssn.starts[icontaining_tgt_box], diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 887b3049b568f0b0a98a2a998aeee81d6ebf4dfe..2d21a3d2ac6865d0d8d3c3d3f41c388b0fe160b0 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -30,6 +30,9 @@ from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler from sumpy.kernel import LaplaceKernel, HelmholtzKernel +from boxtree.tools import return_timing_data +from pytools import log_process + import logging logger = logging.getLogger(__name__) @@ -99,6 +102,11 @@ class ToHostTransferredGeoDataWrapper(object): def qbx_center_to_target_box(self): return self.geo_data.qbx_center_to_target_box().get(queue=self.queue) + @memoize_method + def qbx_center_to_target_box_source_level(self, source_level): + return self.geo_data.qbx_center_to_target_box_source_level( + source_level).get(queue=self.queue) + @memoize_method def non_qbx_box_target_lists(self): return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue) @@ -293,6 +301,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # {{{ p2qbxl + @log_process(logger) + @return_timing_data def form_global_qbx_locals(self, src_weights): geo_data = self.geo_data trav = geo_data.traversal() @@ -343,38 +353,48 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # {{{ m2qbxl + @log_process(logger) + @return_timing_data def translate_box_multipoles_to_qbx_local(self, multipole_exps): qbx_exps = self.qbx_local_expansion_zeros() geo_data = self.geo_data - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() qbx_centers = geo_data.centers() centers = self.tree.box_centers ngqbx_centers = len(geo_data.global_qbx_centers()) + traversal = geo_data.traversal() if ngqbx_centers == 0: return qbx_exps mploc = self.get_translation_routine("%ddmploc", vec_suffix="_imany") - for isrc_level, ssn in enumerate( - geo_data.traversal().from_sep_smaller_by_level): + for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): source_level_start_ibox, source_mpoles_view = \ self.multipole_expansions_view(multipole_exps, isrc_level) tgt_icenter_vec = geo_data.global_qbx_centers() - icontaining_tgt_box_vec = 
qbx_center_to_target_box[tgt_icenter_vec] + qbx_center_to_target_box_source_level = ( + geo_data.qbx_center_to_target_box_source_level(isrc_level) + ) + icontaining_tgt_box_vec = qbx_center_to_target_box_source_level[ + tgt_icenter_vec + ] rscale2 = geo_data.expansion_radii()[geo_data.global_qbx_centers()] kwargs = {} if self.dim == 3 and self.eqn_letter == "h": - kwargs["radius"] = (0.5 * - geo_data.expansion_radii()[geo_data.global_qbx_centers()]) - - nsrc_boxes_per_gqbx_center = ( - ssn.starts[icontaining_tgt_box_vec+1] - - ssn.starts[icontaining_tgt_box_vec]) + kwargs["radius"] = (0.5 + * geo_data.expansion_radii()[geo_data.global_qbx_centers()]) + + nsrc_boxes_per_gqbx_center = np.zeros(icontaining_tgt_box_vec.shape, + dtype=traversal.tree.box_id_dtype) + mask = (icontaining_tgt_box_vec != -1) + nsrc_boxes_per_gqbx_center[mask] = ( + ssn.starts[icontaining_tgt_box_vec[mask] + 1] + - ssn.starts[icontaining_tgt_box_vec[mask]] + ) nsrc_boxes = np.sum(nsrc_boxes_per_gqbx_center) src_boxes_starts = np.empty(ngqbx_centers+1, dtype=np.int32) @@ -387,7 +407,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): src_ibox = np.empty(nsrc_boxes, dtype=np.int32) for itgt_center, tgt_icenter in enumerate( geo_data.global_qbx_centers()): - icontaining_tgt_box = qbx_center_to_target_box[tgt_icenter] + icontaining_tgt_box = qbx_center_to_target_box_source_level[ + tgt_icenter + ] src_ibox[ src_boxes_starts[itgt_center]: src_boxes_starts[itgt_center+1]] = ( @@ -443,66 +465,100 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # }}} + @log_process(logger) + @return_timing_data def translate_box_local_to_qbx_local(self, local_exps): qbx_expansions = self.qbx_local_expansion_zeros() geo_data = self.geo_data - if geo_data.ncenters == 0: + global_qbx_centers = geo_data.global_qbx_centers() + + if global_qbx_centers.size == 0: return qbx_expansions + trav = geo_data.traversal() qbx_center_to_target_box = geo_data.qbx_center_to_target_box() qbx_centers = geo_data.centers() qbx_radii = geo_data.expansion_radii() - locloc = self.get_translation_routine("%ddlocloc") + is_global_qbx_center = np.zeros(geo_data.ncenters, dtype=int) + is_global_qbx_center[global_qbx_centers] = 1 + + locloc = self.get_translation_routine("%ddlocloc", vec_suffix="_qbx") + + nlevels = geo_data.tree().nlevels - for isrc_level in range(geo_data.tree().nlevels): + box_to_rscale = np.empty(geo_data.tree().nboxes, dtype=np.float) + for isrc_level in range(nlevels): lev_box_start, lev_box_stop = self.tree.level_start_box_nrs[ isrc_level:isrc_level+2] - locals_level_start_ibox, locals_view = \ - self.local_expansions_view(local_exps, isrc_level) - assert locals_level_start_ibox == lev_box_start + box_to_rscale[lev_box_start:lev_box_stop] = ( + self.level_to_rscale(isrc_level)) - kwargs = {} - kwargs.update(self.kernel_kwargs) + box_centers = self._get_single_box_centers_array() - for tgt_icenter in range(geo_data.ncenters): - if self.dim == 3 and self.eqn_letter == "h": - # Yuck: This keeps overwriting 'radius' in the dict. - kwargs["radius"] = 0.5 * ( - geo_data.expansion_radii()[tgt_icenter]) + # This translates from target box to global box numbers. 
+ qbx_center_to_box = trav.target_boxes[qbx_center_to_target_box] - isrc_box = qbx_center_to_target_box[tgt_icenter] + kwargs = {} + kwargs.update(self.kernel_kwargs) - tgt_center = qbx_centers[:, tgt_icenter] + for isrc_level in range(nlevels): + lev_box_start, lev_box_stop = self.tree.level_start_box_nrs[ + isrc_level:isrc_level+2] - # The box's expansions which we're translating here - # (our source) is, globally speaking, a target box. + locals_level_start_ibox, locals_view = \ + self.local_expansions_view(local_exps, isrc_level) - src_ibox = trav.target_boxes[isrc_box] + assert locals_level_start_ibox == lev_box_start - # Is the box number on the level currently under - # consideration? - in_range = (lev_box_start <= src_ibox and src_ibox < lev_box_stop) + # Find used QBX centers that are on this level. (This is not ideal, + # but we're supplied a mapping of QBX centers to boxes and we have + # to invert that in some way.) + curr_level_qbx_centers = np.flatnonzero( + is_global_qbx_center + & (lev_box_start <= qbx_center_to_box) + & (qbx_center_to_box < lev_box_stop)) - if in_range: - src_center = self.tree.box_centers[:, src_ibox] - tmp_loc_exp = locloc( - rscale1=self.level_to_rscale(isrc_level), - center1=src_center, - expn1=locals_view[ - src_ibox - locals_level_start_ibox].T, + if curr_level_qbx_centers.size == 0: + continue - rscale2=qbx_radii[tgt_icenter], - center2=tgt_center, - nterms2=self.qbx_order, + icurr_level_qbx_center_to_box = ( + qbx_center_to_box[curr_level_qbx_centers]) - **kwargs)[..., 0].T + if self.dim == 3 and self.eqn_letter == "h": + kwargs["radius"] = 0.5 * ( + geo_data.expansion_radii()[curr_level_qbx_centers]) + + # This returns either the expansion or a tuple (ier, expn). + rvals = locloc( + rscale1=box_to_rscale, + rscale1_offsets=icurr_level_qbx_center_to_box, + center1=box_centers, + center1_offsets=icurr_level_qbx_center_to_box, + expn1=locals_view.T, + expn1_offsets=icurr_level_qbx_center_to_box - lev_box_start, + nterms1=self.level_nterms[isrc_level], + nterms2=self.qbx_order, + rscale2=qbx_radii, + rscale2_offsets=curr_level_qbx_centers, + center2=qbx_centers, + center2_offsets=curr_level_qbx_centers, + **kwargs) + + if isinstance(rvals, tuple): + ier, expn2 = rvals + if ier.any(): + raise RuntimeError("locloc failed") + else: + expn2 = rvals - qbx_expansions[tgt_icenter] += tmp_loc_exp + qbx_expansions[curr_level_qbx_centers] += expn2.T return qbx_expansions + @log_process(logger) + @return_timing_data def eval_qbx_expansions(self, qbx_expansions): output = self.full_output_zeros() diff --git a/pytential/qbx/geometry.py b/pytential/qbx/geometry.py index 71160948543fcdc225be382d2ed2ffcf5ef7aec3..ee4421848fb7ba0388e76682be256727d47313ac 100644 --- a/pytential/qbx/geometry.py +++ b/pytential/qbx/geometry.py @@ -30,11 +30,13 @@ import pyopencl.array # noqa from pytools import memoize_method from boxtree.tools import DeviceDataRecord import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from cgen import Enum from pytential.qbx.utils import TreeCodeContainerMixin +from pytools import log_process import logging logger = logging.getLogger(__name__) @@ -125,7 +127,8 @@ class QBXFMMGeometryCodeGetter(TreeCodeContainerMixin): """ targets[dim, i] = points[dim, i] """, - default_offset=lp.auto, name="copy_targets") + default_offset=lp.auto, name="copy_targets", + lang_version=MOST_RECENT_LANGUAGE_VERSION) knl = lp.fix_parameters(knl, ndims=self.ambient_dim) @@ -182,7 +185,8 @@ class QBXFMMGeometryCodeGetter(TreeCodeContainerMixin): "..." 
                ],
            name="qbx_center_to_target_box_lookup",
-            silenced_warnings="write_race(tgt_write)")
+            silenced_warnings="write_race(tgt_write)",
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        knl = lp.split_iname(knl, "ibox", 128, inner_tag="l.0", outer_tag="g.0")
@@ -244,7 +248,8 @@ class QBXFMMGeometryCodeGetter(TreeCodeContainerMixin):
                lp.ValueArg("ntargets", np.int32),
                ],
            name="pick_used_centers",
-            silenced_warnings="write_race(center_is_used_write)")
+            silenced_warnings="write_race(center_is_used_write)",
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        knl = lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
        return knl

@@ -349,13 +354,16 @@ class QBXFMMGeometryData(object):

    def __init__(self, code_getter, lpot_source,
            target_discrs_and_qbx_sides,
-            target_association_tolerance, debug):
+            target_association_tolerance,
+            tree_kind, debug):
        """
        .. rubric:: Constructor arguments

        See the attributes of the same name for the meaning of most
        of the constructor arguments.

+        :arg tree_kind: The tree kind to pass to the tree builder.
+
        :arg debug: a :class:`bool` flag for whether to enable
            potentially costly self-checks
        """
@@ -365,6 +373,7 @@ class QBXFMMGeometryData(object):
        self.target_discrs_and_qbx_sides = \
                target_discrs_and_qbx_sides
        self.target_association_tolerance = target_association_tolerance
+        self.tree_kind = tree_kind
        self.debug = debug

    @property
@@ -494,11 +503,17 @@ class QBXFMMGeometryData(object):
                    self.coord_dtype)
            target_radii[:self.ncenters] = self.expansion_radii()

-            # FIXME: https://gitlab.tiker.net/inducer/pytential/issues/72
-            # refine_weights = cl.array.zeros(queue, nparticles, dtype=np.int32)
-            # refine_weights[:nsources] = 1
            refine_weights = cl.array.empty(queue, nparticles, dtype=np.int32)
-            refine_weights.fill(1)
+
+            # Assign a weight of 1 to all sources, QBX centers, and conventional
+            # (non-QBX) targets. Assign a weight of 0 to all targets that need
+            # QBX centers. The potential at the latter targets is mediated
+            # entirely by the QBX center, so as a matter of evaluation cost,
+            # their location in the tree is irrelevant.
+            refine_weights[:-target_info.ntargets] = 1
+            user_ttc = self.user_target_to_center().with_queue(queue)
+            refine_weights[-target_info.ntargets:] = (
+                    user_ttc == target_state.NO_QBX_NEEDED).astype(np.int32)

            refine_weights.finish()

@@ -511,7 +526,7 @@ class QBXFMMGeometryData(object):
                    debug=self.debug,
                    stick_out_factor=lpot_src._expansion_stick_out_factor,
                    extent_norm=lpot_src._box_extent_norm,
-                    kind="adaptive")
+                    kind=self.tree_kind)

            if self.debug:
                tgt_count_2 = cl.array.sum(
@@ -604,6 +619,36 @@ class QBXFMMGeometryData(object):

        return qbx_center_to_target_box.with_queue(None)

+    @memoize_method
+    def qbx_center_to_target_box_source_level(self, source_level):
+        """Return an array for mapping qbx centers to indices into
+        interaction lists as found in
+        ``traversal.from_sep_smaller_by_level[source_level]``.
+        Returns -1 if no such interaction list exists on *source_level*.
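+
+        Equivalently, in NumPy terms (a sketch; ``nonempty_indices`` and
+        ``num_nonempty_lists`` are attributes of the compressed
+        "from_sep_smaller" interaction list used below)::
+
+            tbox_to_level_box = np.full(ntarget_boxes, -1, dtype=np.intp)
+            tbox_to_level_box[sep_smaller.nonempty_indices] = (
+                np.arange(sep_smaller.num_nonempty_lists))
+            result = tbox_to_level_box[qbx_center_to_target_box]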
+ """ + traversal = self.traversal() + sep_smaller = traversal.from_sep_smaller_by_level[source_level] + qbx_center_to_target_box = self.qbx_center_to_target_box() + + with cl.CommandQueue(self.cl_context) as queue: + target_box_to_target_box_source_level = cl.array.empty( + queue, len(traversal.target_boxes), + dtype=traversal.tree.box_id_dtype + ) + target_box_to_target_box_source_level.fill(-1) + target_box_to_target_box_source_level[sep_smaller.nonempty_indices] = ( + cl.array.arange(queue, sep_smaller.num_nonempty_lists, + dtype=traversal.tree.box_id_dtype) + ) + + qbx_center_to_target_box_source_level = ( + target_box_to_target_box_source_level[ + qbx_center_to_target_box + ] + ) + + return qbx_center_to_target_box_source_level.with_queue(None) + @memoize_method def global_qbx_flags(self): """Return an array of :class:`numpy.int8` of length @@ -625,6 +670,7 @@ class QBXFMMGeometryData(object): return result.with_queue(None) @memoize_method + @log_process(logger) def global_qbx_centers(self): """Build a list of indices of QBX centers that use global QBX. This indexes into the global list of targets, (see :meth:`target_info`) of @@ -637,8 +683,6 @@ class QBXFMMGeometryData(object): user_target_to_center = self.user_target_to_center() with cl.CommandQueue(self.cl_context) as queue: - logger.info("find global qbx centers: start") - tgt_assoc_result = ( user_target_to_center.with_queue(queue)[self.ncenters:]) @@ -663,8 +707,6 @@ class QBXFMMGeometryData(object): ], queue=queue) - logger.info("find global qbx centers: done") - if self.debug: logger.debug( "find global qbx centers: using %d/%d centers" @@ -689,7 +731,7 @@ class QBXFMMGeometryData(object): with cl.CommandQueue(self.cl_context) as queue: target_side_prefs = (self - .target_side_preferences()[self.ncenters:].get(queue=queue)) + .target_side_preferences()[self.ncenters:].get(queue=queue)) target_discrs_and_qbx_sides = [( PointsTarget(tgt_info.targets[:, self.ncenters:]), @@ -706,15 +748,15 @@ class QBXFMMGeometryData(object): target_association_tolerance=( self.target_association_tolerance)) - tree = self.tree() - - result = cl.array.empty(queue, tgt_info.ntargets, tree.particle_id_dtype) + result = cl.array.empty(queue, tgt_info.ntargets, + tgt_assoc_result.target_to_center.dtype) result[:self.ncenters].fill(target_state.NO_QBX_NEEDED) result[self.ncenters:] = tgt_assoc_result.target_to_center return result.with_queue(None) @memoize_method + @log_process(logger) def center_to_tree_targets(self): """Return a :class:`CenterToTargetList`. See :meth:`target_to_center` for the reverse look-up table with targets in user order. @@ -725,8 +767,6 @@ class QBXFMMGeometryData(object): user_ttc = self.user_target_to_center() with cl.CommandQueue(self.cl_context) as queue: - logger.info("build center -> targets lookup table: start") - tree_ttc = cl.array.empty_like(user_ttc).with_queue(queue) tree_ttc[self.tree().sorted_target_ids] = user_ttc @@ -749,8 +789,6 @@ class QBXFMMGeometryData(object): filtered_tree_ttc, filtered_target_ids, self.ncenters, tree_ttc.dtype) - logger.info("build center -> targets lookup table: done") - result = CenterToTargetList( starts=center_target_starts, lists=targets_sorted_by_center).with_queue(None) @@ -758,6 +796,7 @@ class QBXFMMGeometryData(object): return result @memoize_method + @log_process(logger) def non_qbx_box_target_lists(self): """Build a list of targets per box that don't need to bother with QBX. Returns a :class:`boxtree.tree.FilteredTargetListsInTreeOrder`. 
@@ -768,8 +807,6 @@ class QBXFMMGeometryData(object): """ with cl.CommandQueue(self.cl_context) as queue: - logger.info("find non-qbx box target lists: start") - flags = (self.user_target_to_center().with_queue(queue) == target_state.NO_QBX_NEEDED) @@ -781,10 +818,9 @@ class QBXFMMGeometryData(object): nqbx_centers = self.ncenters flags[:nqbx_centers] = 0 - from boxtree.tree import filter_target_lists_in_tree_order - result = filter_target_lists_in_tree_order(queue, self.tree(), flags) - - logger.info("find non-qbx box target lists: done") + tree = self.tree() + plfilt = self.code_getter.particle_list_filter() + result = plfilt.filter_target_lists_in_tree_order(queue, tree, flags) return result.with_queue(None) @@ -803,12 +839,14 @@ class QBXFMMGeometryData(object): This only works for two-dimensional geometries. """ + import matplotlib.pyplot as pt + pt.clf() + dims = self.tree().targets.shape[0] if dims != 2: raise ValueError("only 2-dimensional geometry info can be plotted") with cl.CommandQueue(self.cl_context) as queue: - import matplotlib.pyplot as pt from meshmode.discretization.visualization import draw_curve draw_curve(self.lpot_source.quad_stage2_density_discr) @@ -893,8 +931,10 @@ class QBXFMMGeometryData(object): # }}} pt.gca().set_aspect("equal") - pt.legend() - pt.show() + #pt.legend() + pt.savefig( + "geodata-stage2-nelem%d.pdf" + % self.lpot_source.stage2_density_discr.mesh.nelements) # }}} diff --git a/pytential/qbx/interactions.py b/pytential/qbx/interactions.py index 6105472db63a1aad41256798880ecef91e852c49..dad4db1f2988f566fcdd9c2d07ead3c8d455e525 100644 --- a/pytential/qbx/interactions.py +++ b/pytential/qbx/interactions.py @@ -24,6 +24,7 @@ THE SOFTWARE. import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pytools import memoize_method from six.moves import range @@ -31,8 +32,7 @@ from sumpy.p2e import P2EBase from sumpy.e2e import E2EBase from sumpy.e2p import E2PBase - -PYTENTIAL_KERNEL_VERSION = 5 +from pytential.version import PYTENTIAL_KERNEL_VERSION # {{{ form qbx expansions from points @@ -106,14 +106,16 @@ class P2QBXLFromCSR(P2EBase): qbx_expansions[tgt_icenter, {i}] = \ simul_reduce(sum, (isrc_box, isrc), strength*coeff{i}) \ {{id_prefix=write_expn}} - """.format(i=i) for i in range(ncoeffs)] + [""" + """.format(i=i) + for i in range(ncoeffs)] + [""" end """], arguments, name=self.name, assumptions="ntgt_centers>=1", silenced_warnings="write_race(write_expn*)", - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -158,37 +160,42 @@ class M2QBXL(E2EBase): ], [""" for icenter - <> icontaining_tgt_box = qbx_center_to_target_box[icenter] + <> icontaining_tgt_box = \ + qbx_center_to_target_box_source_level[icenter] - <> tgt_center[idim] = qbx_centers[idim, icenter] \ - {id=fetch_tgt_center} - <> tgt_rscale = qbx_expansion_radii[icenter] + if icontaining_tgt_box != -1 + <> tgt_center[idim] = qbx_centers[idim, icenter] \ + {id=fetch_tgt_center} + <> tgt_rscale = qbx_expansion_radii[icenter] - <> isrc_start = src_box_starts[icontaining_tgt_box] - <> isrc_stop = src_box_starts[icontaining_tgt_box+1] + <> isrc_start = src_box_starts[icontaining_tgt_box] + <> isrc_stop = src_box_starts[icontaining_tgt_box+1] - for isrc_box - <> src_ibox = src_box_lists[isrc_box] \ - {id=read_src_ibox} - <> src_center[idim] = centers[idim, src_ibox] {dup=idim} - 
<> d[idim] = tgt_center[idim] - src_center[idim] {dup=idim} - """] + [""" + for isrc_box + <> src_ibox = src_box_lists[isrc_box] \ + {id=read_src_ibox} + <> src_center[idim] = centers[idim, src_ibox] {dup=idim} + <> d[idim] = tgt_center[idim] - src_center[idim] \ + {dup=idim} + """] + [""" - <> src_coeff{i} = \ - src_expansions[src_ibox - src_base_ibox, {i}] \ - {{dep=read_src_ibox}} + <> src_coeff{i} = \ + src_expansions[src_ibox - src_base_ibox, {i}] \ + {{dep=read_src_ibox}} - """.format(i=i) for i in range(ncoeff_src)] + [ + """.format(i=i) for i in range(ncoeff_src)] + [ - ] + self.get_translation_loopy_insns() + [""" + ] + self.get_translation_loopy_insns() + [""" + end + """] + [""" + qbx_expansions[icenter, {i}] = \ + qbx_expansions[icenter, {i}] + \ + simul_reduce(sum, isrc_box, coeff{i}) \ + {{id_prefix=write_expn}} + """.format(i=i) + for i in range(ncoeff_tgt)] + [""" end - """] + [""" - qbx_expansions[icenter, {i}] = qbx_expansions[icenter, {i}] + \ - simul_reduce(sum, isrc_box, coeff{i}) \ - {{id_prefix=write_expn}} - """.format(i=i) for i in range(ncoeff_tgt)] + [""" - end """], [ @@ -209,7 +216,8 @@ class M2QBXL(E2EBase): ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), name=self.name, assumptions="ncenters>=1", silenced_warnings="write_race(write_expn*)", - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) for expn in [self.src_expansion, self.tgt_expansion]: loopy_knl = expn.prepare_loopy_kernel(loopy_knl) @@ -289,7 +297,8 @@ class L2QBXL(E2EBase): qbx_expansions[icenter, {i}] = \ qbx_expansions[icenter, {i}] + coeff{i} \ {{id_prefix=write_expn}} - """.format(i=i) for i in range(ncoeff_tgt)] + [""" + """.format(i=i) + for i in range(ncoeff_tgt)] + [""" end end """], @@ -309,7 +318,8 @@ class L2QBXL(E2EBase): name=self.name, assumptions="ncenters>=1", silenced_warnings="write_race(write_expn*)", - fixed_parameters=dict(dim=self.dim, nchildren=2**self.dim)) + fixed_parameters=dict(dim=self.dim, nchildren=2**self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) for expn in [self.src_expansion, self.tgt_expansion]: loopy_knl = expn.prepare_loopy_kernel(loopy_knl) @@ -384,7 +394,8 @@ class QBXL2P(E2PBase): result[{i},center_itgt] = kernel_scaling * result_{i}_p \ {{id_prefix=write_result}} - """.format(i=i) for i in range(len(result_names))] + [""" + """.format(i=i) + for i in range(len(result_names))] + [""" end end """], @@ -405,7 +416,8 @@ class QBXL2P(E2PBase): name=self.name, assumptions="nglobal_qbx_centers>=1", silenced_warnings="write_race(write_result*)", - fixed_parameters=dict(dim=self.dim, nresults=len(result_names))) + fixed_parameters=dict(dim=self.dim, nresults=len(result_names)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) diff --git a/pytential/qbx/refinement.py b/pytential/qbx/refinement.py index 5d90a320015ecd2a249c00634ffcf221cac261ef..1911a48d1df66c1ea12c7fbb1e8e930ff3a18028 100644 --- a/pytential/qbx/refinement.py +++ b/pytential/qbx/refinement.py @@ -28,6 +28,7 @@ THE SOFTWARE. 
 import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 import numpy as np
 import pyopencl as cl
@@ -38,6 +39,8 @@ from pytential.qbx.utils import (
     QBX_TREE_C_PREAMBLE, QBX_TREE_MAKO_DEFS,
     TreeWranglerBase, TreeCodeContainerMixin)

+from pytools import ProcessLogger, log_process
+
 import logging
 logger = logging.getLogger(__name__)

@@ -241,22 +244,24 @@ class RefinerCodeContainer(TreeCodeContainerMixin):
             extra_type_aliases=(("particle_id_t", particle_id_dtype),))

     @memoize_method
-    def kernel_length_scale_to_panel_size_ratio_checker(self):
+    def element_prop_threshold_checker(self):
         knl = lp.make_kernel(
-            "{[panel]: 0<=panel<npanels}",
+            "{[ielement]: 0<=ielement<nelements}",
             """
-            for panel
-                <> oversize = panel_sizes[panel] > kernel_length_scale
-                if oversize
-                    refine_flags[panel] = 1
+            for ielement
+                <> over_threshold = element_property[ielement] > threshold
+                if over_threshold
+                    refine_flags[ielement] = 1
                     refine_flags_updated = 1 {id=write_refine_flags_updated}
                 end
             end
             """,
             options="return_dict",
             silenced_warnings="write_race(write_refine_flags_updated)",
-            name="refine_kernel_length_scale_to_panel_size_ratio")
-        knl = lp.split_iname(knl, "panel", 128, inner_tag="l.0", outer_tag="g.0")
+            name="refine_kernel_length_scale_to_quad_resolution_ratio",
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        knl = lp.split_iname(knl, "ielement", 128, inner_tag="l.0", outer_tag="g.0")
         return knl

     def get_wrangler(self, queue):
@@ -278,6 +283,7 @@ class RefinerWrangler(TreeWranglerBase):

     # {{{ check subroutines for conditions 1-3

+    @log_process(logger)
     def check_expansion_disks_undisturbed_by_sources(self,
             lpot_source, tree, peer_lists,
             expansion_disturbance_tolerance,
@@ -296,9 +302,6 @@ class RefinerWrangler(TreeWranglerBase):
                 tree.particle_id_dtype,
                 max_levels)

-        logger.info("refiner: checking that expansion disk is "
-                "undisturbed by sources")
-
         if debug:
             npanels_to_refine_prev = cl.array.sum(refine_flags).get()

@@ -336,10 +339,9 @@ class RefinerWrangler(TreeWranglerBase):
             logger.debug("refiner: found {} panel(s) to refine".format(
                 npanels_to_refine - npanels_to_refine_prev))

-        logger.info("refiner: done checking center is closest to orig panel")
-
         return found_panel_to_refine.get()[0] == 1

+    @log_process(logger)
     def check_sufficient_source_quadrature_resolution(
             self, lpot_source, tree, peer_lists, refine_flags, debug,
             wait_for=None):
@@ -358,14 +360,11 @@ class RefinerWrangler(TreeWranglerBase):
         if debug:
             npanels_to_refine_prev = cl.array.sum(refine_flags).get()

-        logger.info("refiner: checking sufficient quadrature resolution")
-
         found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
         found_panel_to_refine.finish()

-        source_danger_zone_radii_by_panel = (
-                lpot_source._fine_panel_sizes("npanels")
-                .with_queue(self.queue) / 4)
+        source_danger_zone_radii_by_panel = \
+                lpot_source._source_danger_zone_radii("npanels")

         unwrap_args = AreaQueryElementwiseTemplate.unwrap_args
         evt = knl(
@@ -394,24 +393,21 @@ class RefinerWrangler(TreeWranglerBase):
             logger.debug("refiner: found {} panel(s) to refine".format(
                 npanels_to_refine - npanels_to_refine_prev))

-        logger.info("refiner: done checking sufficient quadrature resolution")
-
         return found_panel_to_refine.get()[0] == 1

-    def check_kernel_length_scale_to_panel_size_ratio(self, lpot_source,
-            kernel_length_scale, refine_flags, debug, wait_for=None):
-        knl = self.code_container.kernel_length_scale_to_panel_size_ratio_checker()
-
-        logger.info("refiner: checking kernel length scale to panel size ratio")
+    def check_element_prop_threshold(self, element_property, threshold, refine_flags,
+            debug,
wait_for=None): + knl = self.code_container.element_prop_threshold_checker() if debug: npanels_to_refine_prev = cl.array.sum(refine_flags).get() evt, out = knl(self.queue, - panel_sizes=lpot_source._panel_sizes("npanels"), + element_property=element_property, + # lpot_source._coarsest_quad_resolution("npanels")), refine_flags=refine_flags, refine_flags_updated=np.array(0), - kernel_length_scale=np.array(kernel_length_scale), + threshold=np.array(threshold), wait_for=wait_for) cl.wait_for_events([evt]) @@ -422,8 +418,6 @@ class RefinerWrangler(TreeWranglerBase): logger.debug("refiner: found {} panel(s) to refine".format( npanels_to_refine - npanels_to_refine_prev)) - logger.info("refiner: done checking kernel length scale to panel size ratio") - return (out["refine_flags_updated"].get() == 1).all() # }}} @@ -436,13 +430,10 @@ class RefinerWrangler(TreeWranglerBase): refine_flags = refine_flags.get(self.queue) refine_flags = refine_flags.astype(np.bool) - logger.info("refiner: calling meshmode") - - refiner.refine(refine_flags) - from meshmode.discretization.connection import make_refinement_connection - conn = make_refinement_connection(refiner, density_discr, factory) - - logger.info("refiner: done calling meshmode") + with ProcessLogger(logger, "refine mesh"): + refiner.refine(refine_flags) + from meshmode.discretization.connection import make_refinement_connection + conn = make_refinement_connection(refiner, density_discr, factory) return conn @@ -474,8 +465,11 @@ def make_empty_refine_flags(queue, lpot_source, use_stage2_discr=False): def refine_for_global_qbx(lpot_source, wrangler, group_factory, kernel_length_scale=None, - refine_flags=None, debug=None, maxiter=None, - visualize=None, expansion_disturbance_tolerance=None): + force_stage2_uniform_refinement_rounds=None, + scaled_max_curvature_threshold=None, + debug=None, maxiter=None, + visualize=None, expansion_disturbance_tolerance=None, + refiner=None): """ Entry point for calling the refiner. @@ -490,11 +484,6 @@ def refine_for_global_qbx(lpot_source, wrangler, :arg kernel_length_scale: The kernel length scale, or *None* if not applicable. All panels are refined to below this size. - :arg refine_flags: A :class:`pyopencl.array.Array` indicating which - panels should get refined initially, or `None` if no initial - refinement should be done. Should have size equal to the number of - panels. See also :func:`make_empty_refine_flags()`. - :arg maxiter: The maximum number of refiner iterations. :returns: A tuple ``(lpot_source, *conn*)`` where ``lpot_source`` is the @@ -513,35 +502,48 @@ def refine_for_global_qbx(lpot_source, wrangler, if expansion_disturbance_tolerance is None: expansion_disturbance_tolerance = 0.025 + if force_stage2_uniform_refinement_rounds is None: + force_stage2_uniform_refinement_rounds = 0 + # TODO: Stop doing redundant checks by avoiding panels which no longer need # refinement. - from meshmode.mesh.refinement import Refiner + from meshmode.mesh.refinement import RefinerWithoutAdjacency from meshmode.discretization.connection import ( ChainedDiscretizationConnection, make_same_mesh_connection) - refiner = Refiner(lpot_source.density_discr.mesh) - connections = [] + if refiner is not None: + assert refiner.get_current_mesh() == lpot_source.density_discr.mesh + else: + # We may be handed a mesh that's already non-conforming, we don't rely + # on adjacency, and the no-adjacency refiner is faster. + refiner = RefinerWithoutAdjacency(lpot_source.density_discr.mesh) - # Do initial refinement. 
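The new ``refiner`` keyword above lets a caller keep refining the same mesh across calls instead of starting from scratch each time. A minimal sketch of that use (names follow this function's signature; the call itself is illustrative, not from this patch):

    from meshmode.mesh.refinement import RefinerWithoutAdjacency

    refiner = RefinerWithoutAdjacency(lpot_source.density_discr.mesh)
    lpot_source, conn = refine_for_global_qbx(
            lpot_source, wrangler, group_factory, refiner=refiner)
    # "refiner" now carries the refined mesh state for a later call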
- if refine_flags is not None: - conn = wrangler.refine( - lpot_source.density_discr, refiner, refine_flags, group_factory, - debug) - connections.append(conn) - lpot_source = lpot_source.copy(density_discr=conn.to_discr) + connections = [] # {{{ first stage refinement - def visualize_refinement(niter, stage, flags): + def visualize_refinement(niter, stage_nr, stage_name, flags): if not visualize: return - discr = lpot_source.density_discr + if stage_nr == 1: + discr = lpot_source.density_discr + elif stage_nr == 2: + discr = lpot_source.stage2_density_discr + else: + raise ValueError("unexpected stage number") + + flags = flags.get() + logger.info("for stage %s: splitting %d/%d stage-%d elements", + stage_name, np.sum(flags), discr.mesh.nelements, stage_nr) + from meshmode.discretization.visualization import make_visualizer - vis = make_visualizer(wrangler.queue, discr, 10) + vis = make_visualizer(wrangler.queue, discr, 3) - flags = flags.get().astype(np.bool) + assert len(flags) == discr.mesh.nelements + + flags = flags.astype(np.bool) nodes_flags = np.zeros(discr.nnodes) for grp in discr.groups: meg = grp.mesh_el_group @@ -559,7 +561,7 @@ def refine_for_global_qbx(lpot_source, wrangler, wrangler.queue).as_vector(dtype=object) vis_data.append(("bdry_normals", bdry_normals),) - vis.write_vtk_file("refinement-%03d-%s.vtu" % (niter, stage), vis_data) + vis.write_vtk_file("refinement-%s-%03d.vtu" % (stage_name, niter), vis_data) def warn_max_iterations(): from warnings import warn @@ -596,32 +598,68 @@ def refine_for_global_qbx(lpot_source, wrangler, warn_max_iterations() break - # Build tree and auxiliary data. - # FIXME: The tree should not have to be rebuilt at each iteration. - tree = wrangler.build_tree(lpot_source) - peer_lists = wrangler.find_peer_lists(tree) refine_flags = make_empty_refine_flags(wrangler.queue, lpot_source) - # Check condition 1. - has_disturbed_expansions = \ - wrangler.check_expansion_disks_undisturbed_by_sources( - lpot_source, tree, peer_lists, - expansion_disturbance_tolerance, - refine_flags, debug) - if has_disturbed_expansions: - iter_violated_criteria.append("disturbed expansions") - visualize_refinement(niter, "disturbed-expansions", refine_flags) - - # Check condition 3. 
if kernel_length_scale is not None: - - violates_kernel_length_scale = \ - wrangler.check_kernel_length_scale_to_panel_size_ratio( - lpot_source, kernel_length_scale, refine_flags, debug) - - if violates_kernel_length_scale: - iter_violated_criteria.append("kernel length scale") - visualize_refinement(niter, "kernel-length-scale", refine_flags) + with ProcessLogger(logger, + "checking kernel length scale to panel size ratio"): + + violates_kernel_length_scale = \ + wrangler.check_element_prop_threshold( + element_property=( + lpot_source._coarsest_quad_resolution( + "npanels")), + threshold=kernel_length_scale, + refine_flags=refine_flags, debug=debug) + + if violates_kernel_length_scale: + iter_violated_criteria.append("kernel length scale") + visualize_refinement( + niter, 1, "kernel-length-scale", refine_flags) + + if scaled_max_curvature_threshold is not None: + with ProcessLogger(logger, + "checking scaled max curvature threshold"): + from pytential.qbx.utils import to_last_dim_length + from pytential import sym, bind + scaled_max_curv = to_last_dim_length( + lpot_source.density_discr, + bind(lpot_source, + sym.ElementwiseMax( + sym._scaled_max_curvature( + lpot_source.density_discr.ambient_dim))) + (wrangler.queue), "npanels") + + violates_scaled_max_curv = \ + wrangler.check_element_prop_threshold( + element_property=scaled_max_curv, + threshold=scaled_max_curvature_threshold, + refine_flags=refine_flags, debug=debug) + + if violates_scaled_max_curv: + iter_violated_criteria.append("curvature") + visualize_refinement(niter, 1, "curvature", refine_flags) + + if not iter_violated_criteria: + # Only start building trees once the simple length-based criteria + # are happy. + + # Build tree and auxiliary data. + # FIXME: The tree should not have to be rebuilt at each iteration. 
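Both first-stage criteria added above reduce to thresholding a per-element scalar (coarsest quadrature resolution, scaled maximum curvature), so only the expansion-disturbance check below needs the tree. A hedged driver sketch (threshold values are illustrative only):

    lpot_source, conn = refine_for_global_qbx(
            lpot_source, wrangler, group_factory,
            kernel_length_scale=kernel_length_scale,
            scaled_max_curvature_threshold=0.8,
            maxiter=10, visualize=True)

With ``visualize=True``, each round writes ``refinement-<stage_name>-<niter>.vtu`` files as set up in ``visualize_refinement`` above.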
+ tree = wrangler.build_tree(lpot_source) + peer_lists = wrangler.find_peer_lists(tree) + + has_disturbed_expansions = \ + wrangler.check_expansion_disks_undisturbed_by_sources( + lpot_source, tree, peer_lists, + expansion_disturbance_tolerance, + refine_flags, debug) + if has_disturbed_expansions: + iter_violated_criteria.append("disturbed expansions") + visualize_refinement(niter, 1, "disturbed-expansions", refine_flags) + + del tree + del peer_lists if iter_violated_criteria: violated_criteria.append(" and ".join(iter_violated_criteria)) @@ -632,9 +670,7 @@ def refine_for_global_qbx(lpot_source, wrangler, connections.append(conn) lpot_source = lpot_source.copy(density_discr=conn.to_discr) - del tree del refine_flags - del peer_lists # }}} @@ -666,7 +702,7 @@ def refine_for_global_qbx(lpot_source, wrangler, lpot_source, tree, peer_lists, refine_flags, debug) if has_insufficient_quad_res: iter_violated_criteria.append("insufficient quadrature resolution") - visualize_refinement(niter, "quad-resolution", refine_flags) + visualize_refinement(niter, 2, "quad-resolution", refine_flags) if iter_violated_criteria: violated_criteria.append(" and ".join(iter_violated_criteria)) @@ -684,6 +720,18 @@ def refine_for_global_qbx(lpot_source, wrangler, del refine_flags del peer_lists + for round in range(force_stage2_uniform_refinement_rounds): + conn = wrangler.refine( + stage2_density_discr, + refiner, + np.ones(stage2_density_discr.mesh.nelements, dtype=np.bool), + group_factory, debug) + stage2_density_discr = conn.to_discr + fine_connections.append(conn) + lpot_source = lpot_source.copy( + to_refined_connection=ChainedDiscretizationConnection( + fine_connections)) + # }}} lpot_source = lpot_source.copy(debug=debug, _refined_for_global_qbx=True) diff --git a/pytential/qbx/target_assoc.py b/pytential/qbx/target_assoc.py index 7b9736ce4b6d34f70dcb411bfcba387ea2ec7889..11b70c4b2f1cee8db85fd2c00a159a28682d7fa1 100644 --- a/pytential/qbx/target_assoc.py +++ b/pytential/qbx/target_assoc.py @@ -43,6 +43,7 @@ from pytential.qbx.utils import ( unwrap_args = AreaQueryElementwiseTemplate.unwrap_args +from pytools import log_process import logging logger = logging.getLogger(__name__) @@ -90,8 +91,6 @@ Return values .. autoclass:: QBXTargetAssociation -.. 
autoclass:: QBXTargetAssociationFailedException - Target association driver ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -438,6 +437,7 @@ class TargetAssociationWrangler(TreeWranglerBase): self.code_container = code_container self.queue = queue + @log_process(logger) def mark_targets(self, tree, peer_lists, lpot_source, target_status, debug, wait_for=None): # Round up level count--this gets included in the kernel as @@ -497,8 +497,6 @@ class TargetAssociationWrangler(TreeWranglerBase): wait_for=wait_for) wait_for = [evt] - logger.info("target association: marking targets close to panels") - tunnel_radius_by_source = lpot_source._close_target_tunnel_radius("nsources") evt = knl( @@ -527,10 +525,9 @@ class TargetAssociationWrangler(TreeWranglerBase): cl.wait_for_events([evt]) - logger.info("target association: done marking targets close to panels") - return (found_target_close_to_panel == 1).all().get() + @log_process(logger) def try_find_centers(self, tree, peer_lists, lpot_source, target_status, target_flags, target_assoc, target_association_tolerance, debug, wait_for=None): @@ -582,8 +579,6 @@ class TargetAssociationWrangler(TreeWranglerBase): wait_for.extend(min_dist_to_center.events) - logger.info("target association: finding centers for targets") - evt = knl( *unwrap_args( tree, peer_lists, @@ -613,9 +608,8 @@ class TargetAssociationWrangler(TreeWranglerBase): .format(ntargets_associated)) cl.wait_for_events([evt]) - logger.info("target association: done finding centers for targets") - return + @log_process(logger) def mark_panels_for_refinement(self, tree, peer_lists, lpot_source, target_status, refine_flags, debug, wait_for=None): @@ -653,8 +647,6 @@ class TargetAssociationWrangler(TreeWranglerBase): wait_for=wait_for) wait_for = [evt] - logger.info("target association: marking panels for refinement") - evt = knl( *unwrap_args( tree, peer_lists, @@ -684,8 +676,6 @@ class TargetAssociationWrangler(TreeWranglerBase): cl.wait_for_events([evt]) - logger.info("target association: done marking panels for refinement") - return (found_panel_to_refine == 1).all().get() def make_target_flags(self, target_discrs_and_qbx_sides): @@ -735,7 +725,7 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler, The side request can take on the values in :ref:`qbx-side-request-table`. - :raises QBXTargetAssociationFailedException: + :raises pytential.qbx.QBXTargetAssociationFailedException: when target association failed to find a center for a target. The returned exception object contains suggested refine flags. diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index d03e8365fa188d55a04a78c41b56279af883a318..b0ffc066035c0c8f1aea690ecc5dcb2618367275 100644 --- a/pytential/qbx/utils.py +++ b/pytential/qbx/utils.py @@ -32,10 +32,13 @@ from boxtree.tree import Tree import pyopencl as cl import pyopencl.array # noqa from pytools import memoize, memoize_method +from loopy.version import MOST_RECENT_LANGUAGE_VERSION import logging logger = logging.getLogger(__name__) +from pytools import log_process + # {{{ c and mako snippets @@ -84,7 +87,8 @@ def get_interleaver_kernel(dtype): lp.GlobalArg("dst", shape=(var("dstlen"),), dtype=dtype), "..." 
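            # ("..." asks loopy to infer the remaining arguments. "dst" holds
            # the two source arrays interleaved, which is what the
            # "2*srclen = dstlen" assumption below encodes.)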
            ],
-        assumptions="2*srclen = dstlen")
+        assumptions="2*srclen = dstlen",
+        lang_version=MOST_RECENT_LANGUAGE_VERSION)
     knl = lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
     return knl

@@ -154,6 +158,11 @@ class TreeCodeContainer(object):
         from boxtree.area_query import PeerListFinder
         return PeerListFinder(self.cl_context)

+    @memoize_method
+    def particle_list_filter(self):
+        from boxtree.tree import ParticleListFilter
+        return ParticleListFilter(self.cl_context)
+
 # }}}

@@ -170,6 +179,9 @@ class TreeCodeContainerMixin(object):
     def peer_list_finder(self):
         return self.tree_code_container.peer_list_finder()

+    def particle_list_filter(self):
+        return self.tree_code_container.particle_list_filter()
+
 # }}}

@@ -180,9 +192,10 @@ class TreeWranglerBase(object):

     def build_tree(self, lpot_source, targets_list=(),
             use_stage2_discr=False):
         tb = self.code_container.build_tree()
+        plfilt = self.code_container.particle_list_filter()
         from pytential.qbx.utils import build_tree_with_qbx_metadata
         return build_tree_with_qbx_metadata(
-                self.queue, tb, lpot_source, targets_list=targets_list,
+                self.queue, tb, plfilt, lpot_source, targets_list=targets_list,
                 use_stage2_discr=use_stage2_discr)

     def find_peer_lists(self, tree):
@@ -194,66 +207,51 @@

 # }}}

-# {{{ panel sizes
+# {{{ to_last_dim_length

-def panel_sizes(discr, last_dim_length):
-    if last_dim_length not in ("nsources", "ncenters", "npanels"):
-        raise ValueError(
-                "invalid value of last_dim_length: %s" % last_dim_length)
+def to_last_dim_length(discr, vec, last_dim_length, queue=None):
+    """Takes a :class:`pyopencl.array.Array` with a last axis that has the same
+    length as the number of discretization nodes in the discretization *discr*
+    and converts it so that the last axis has a length as specified by
+    *last_dim_length*.
+    """

-    # To get the panel size this does the equivalent of (∫ 1 ds)**(1/dim).
-    # FIXME: Kernel optimizations
+    queue = queue or vec.queue

-    if last_dim_length == "nsources" or last_dim_length == "ncenters":
-        knl = lp.make_kernel(
-            "{[i,j,k]: 0<=i

diff --git a/pytential/solve.py b/pytential/solve.py
--- a/pytential/solve.py
+++ b/pytential/solve.py

-            if norm_r > last_resid_norm/no_progress_factor:
-                stall_count += 1
+            if (stall_iterations
+                    and len(residual_norms) > stall_iterations
+                    and norm_r > (
+                        residual_norms[-stall_iterations]
+                        / no_progress_factor)):

-                if stall_count >= stall_iterations:
-                    state = "stalled"
-                    if hard_failure:
-                        raise GMRESError(state)
-                    else:
-                        return GMRESResult(solution=x,
-                                residual_norms=residual_norms,
-                                iteration_count=iteration, success=False,
-                                state=state)
-            else:
-                stall_count = 0
+                state = "stalled"
+                if hard_failure:
+                    raise GMRESError(state)
+                else:
+                    return GMRESResult(solution=x,
+                            residual_norms=residual_norms,
+                            iteration_count=iteration, success=False,
+                            state=state)

         last_resid_norm = norm_r

@@ -322,7 +328,7 @@ def gmres(op, rhs, restart=None, tol=None, x0=None,
         inner_product=None,
         maxiter=None, hard_failure=None,
         no_progress_factor=None, stall_iterations=None,
-        callback=None, progress=False):
+        callback=None, progress=False, cl_queue=None):
     """Solve a linear system Ax=b by means of GMRES with restarts.

@@ -338,11 +344,17 @@ def gmres(op, rhs, restart=None, tol=None, x0=None,
     :arg stall_iterations: Number of iterations with residual decrease
         below *no_progress_factor* indicates stall. Set to 0 to disable
         stall detection.
+ :arg cl_queue: a :class:`pyopencl.CommandQueue` instance, to support + automatic vector splitting/assembly for systems of equations :return: a :class:`GMRESResult` """ amod = get_array_module(rhs) - chopper = VectorChopper(rhs,op.queue) + + if cl_queue is None: + cl_queue = getattr(op, "queue", None) + chopper = VectorChopper(rhs, cl_queue) + stacked_rhs = chopper.stack(rhs) if inner_product is None: diff --git a/pytential/source.py b/pytential/source.py index 16eac21c8913e729ebfb5433e68b5a139b77a3fa..9334dff7e1785cf7d1b36c991effcd6382d2a52e 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -38,7 +38,7 @@ __doc__ = """ class PotentialSource(object): """ - .. method:: preprocess_optemplate(name, expr) + .. automethod:: preprocess_optemplate .. method:: op_group_features(expr) @@ -49,29 +49,43 @@ class PotentialSource(object): :class:`pytential.symbolic.primitives.IntG`. """ + def preprocess_optemplate(self, name, discretizations, expr): + return expr + # {{{ point potential source class PointPotentialSource(PotentialSource): """ - .. attribute:: points + .. attribute:: nodes - An :class:`pyopencl.array.Array` of shape ``[ambient_dim, npoints]``. + An :class:`pyopencl.array.Array` of shape ``[ambient_dim, nnodes]``. .. attribute:: nnodes """ - def __init__(self, cl_context, points): + def __init__(self, cl_context, nodes): self.cl_context = cl_context - self.points = points + self._nodes = nodes + + @property + def points(self): + from warnings import warn + warn("'points' has been renamed to nodes().", + DeprecationWarning, stacklevel=2) + + return self._nodes + + def nodes(self): + return self._nodes @property def real_dtype(self): - return self.points.dtype + return self._nodes.dtype @property def nnodes(self): - return self.points.shape[-1] + return self._nodes.shape[-1] @property def complex_dtype(self): @@ -82,7 +96,7 @@ class PointPotentialSource(PotentialSource): @property def ambient_dim(self): - return self.points.shape[0] + return self._nodes.shape[0] def op_group_features(self, expr): from sumpy.kernel import AxisTargetDerivativeRemover @@ -129,17 +143,17 @@ class PointPotentialSource(PotentialSource): p2p = self.get_p2p(insn.kernels) evt, output_for_each_kernel = p2p(queue, - target_discr.nodes(), self.points, + target_discr.nodes(), self._nodes, [strengths], **kernel_args) result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + return result @memoize_method def weights_and_area_elements(self): with cl.CommandQueue(self.cl_context) as queue: - result = cl.array.empty(queue, self.points.shape[-1], + result = cl.array.empty(queue, self._nodes.shape[-1], dtype=self.real_dtype) result.fill(1) @@ -173,7 +187,7 @@ class LayerPotentialSourceBase(PotentialSource): .. rubric:: Execution - .. automethod:: weights_and_area_elements + .. method:: weights_and_area_elements .. 
method:: exec_compute_potential_insn """ @@ -267,5 +281,6 @@ class LayerPotentialSourceBase(PotentialSource): # }}} - # }}} + +# vim: foldmethod=marker diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index 68b39d2d45ae80948962cb3c036fe417c1b5f7ad..456044e75a3903b08ffdfa6537f1fac0cd1f6e09 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -315,7 +315,8 @@ class Code(object): return "\n".join(lines) - # {{{ dynamic scheduler (generates static schedules by self-observation) + # {{{ dynamic scheduler + class NoInstructionAvailable(Exception): pass @@ -371,38 +372,35 @@ class Code(object): done_insns = set() while True: - insn = None discardable_vars = [] + insn = None - # pick the next insn - if insn is None: - try: - insn, discardable_vars = self.get_next_step( - frozenset(list(context.keys())), - frozenset(done_insns)) + try: + insn, discardable_vars = self.get_next_step( + frozenset(list(context.keys())), + frozenset(done_insns)) - except self.NoInstructionAvailable: - # no available instructions: we're done - break - else: - for name in discardable_vars: - del context[name] + except self.NoInstructionAvailable: + # no available instructions: we're done + break + else: + for name in discardable_vars: + del context[name] - done_insns.add(insn) - assignments, new_futures = ( - insn.get_exec_function(exec_mapper) - (exec_mapper.queue, insn, exec_mapper.bound_expr, - exec_mapper)) + done_insns.add(insn) + assignments = ( + insn.get_exec_function(exec_mapper) + (exec_mapper.queue, insn, exec_mapper.bound_expr, + exec_mapper)) - if insn is not None: + assignees = insn.get_assignees() for target, value in assignments: if pre_assign_check is not None: pre_assign_check(target, value) + assert target in assignees context[target] = value - assert not new_futures - if len(done_insns) < len(self.instructions): print("Unreachable instructions:") for insn in set(self.instructions) - done_insns: @@ -579,9 +577,7 @@ class OperatorCompiler(IdentityMapper): group = self.group_to_operators[self.op_group_features(expr)] names = [self.get_var_name() for op in group] - kernels = sorted( - set(op.kernel for op in group), - key=lambda kernel: repr(kernel)) + kernels = sorted(set(op.kernel for op in group), key=repr) kernel_to_index = dict( (kernel, i) for i, kernel in enumerate(kernels)) diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 1722db15f78eca2fed3bd8351ef79aeb692b010f..703a40d7a232b5b0e5778fef6516d6e3dafce57b 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -1,6 +1,9 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2013 Andreas Kloeckner" +__copyright__ = """ +Copyright (C) 2013 Andreas Kloeckner +Copyright (C) 2018 Alexandru Fikl +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -33,6 +36,13 @@ import pyopencl as cl import pyopencl.array # noqa import pyopencl.clmath # noqa +from loopy.version import MOST_RECENT_LANGUAGE_VERSION + +from pytools import memoize_in +from pytential.symbolic.primitives import DEFAULT_SOURCE, DEFAULT_TARGET +from pytential.symbolic.primitives import ( + QBXSourceStage1, QBXSourceStage2, QBXSourceQuadStage2) + # FIXME caches: fix up queues @@ -49,11 +59,76 @@ class EvaluationMapper(EvaluationMapperBase): # {{{ map_XXX + def _map_minmax(self, func, inherited, expr): + ev_children = [self.rec(ch) for ch in expr.children] + from functools import reduce, partial + 
if any(isinstance(ch, cl.array.Array) for ch in ev_children):
+            return reduce(partial(func, queue=self.queue), ev_children)
+        else:
+            return inherited(expr)
+
+    def map_max(self, expr):
+        return self._map_minmax(
+                cl.array.maximum,
+                super(EvaluationMapper, self).map_max,
+                expr)
+
+    def map_min(self, expr):
+        return self._map_minmax(
+                cl.array.minimum,
+                super(EvaluationMapper, self).map_min,
+                expr)
+
     def map_node_sum(self, expr):
-        return cl.array.sum(self.rec(expr.operand)).get()[()]
+        expr_val = self.rec(expr.operand)
+        from numbers import Number
+        if isinstance(expr_val, Number) and expr_val == 0:
+            return expr_val
+
+        return cl.array.sum(expr_val).get()[()]

     def map_node_max(self, expr):
-        return cl.array.max(self.rec(expr.operand)).get()[()]
+        expr_val = self.rec(expr.operand)
+        from numbers import Number
+        if isinstance(expr_val, Number) and expr_val == 0:
+            return expr_val
+
+        return cl.array.max(expr_val).get()[()]
+
+    def _map_elementwise_reduction(self, reduction_name, expr):
+        @memoize_in(self.bound_expr, "elementwise_"+reduction_name)
+        def knl():
+            import loopy as lp
+            knl = lp.make_kernel(
+                "{[el, idof, jdof]: 0<=el

diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py
--- a/pytential/symbolic/matrix.py
+++ b/pytential/symbolic/matrix.py

+    if len(x.shape) >= 2:
+        raise RuntimeError("matrix variables in kernel arguments")
+
+    def resample(y):
+        return source.resampler(queue, cl.array.to_device(queue, y)).get(queue)
+
+    from pytools.obj_array import with_object_array_or_scalar
+    return with_object_array_or_scalar(resample, x)
+
+
+def _get_layer_potential_args(mapper, expr, source):
+    """
+    :arg mapper: a :class:`pytential.symbolic.matrix.MatrixBuilderBase`.
+    :arg expr: symbolic layer potential expression.
+    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
+
+    :return: a mapping of kernel arguments evaluated by the *mapper*.
+    """
+
+    # skip resampling if source and target are the same
+    from pytential.symbolic.primitives import DEFAULT_SOURCE, DEFAULT_TARGET
+    if ((expr.source is not DEFAULT_SOURCE)
+            and (expr.target is not DEFAULT_TARGET)
+            and (isinstance(expr.source, type(expr.target)))):
+        source = None
+
+    kernel_args = {}
+    for arg_name, arg_expr in six.iteritems(expr.kernel_arguments):
+        rec_arg = mapper.rec(arg_expr)
+        kernel_args[arg_name] = _resample_arg(mapper.queue, source, rec_arg)
+
+    return kernel_args
+
+
+def _get_kernel_args(mapper, kernel, expr, source):
+    """
+    :arg mapper: a :class:`pytential.symbolic.matrix.MatrixBuilderBase`.
+    :arg kernel: a :class:`sumpy.kernel.Kernel`.
+    :arg expr: symbolic layer potential expression.
+    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
+
+    :return: a mapping of kernel arguments evaluated by the *mapper*.
+    """
+
+    # NOTE: copied from pytential.symbolic.primitives.IntG
+    inner_kernel_args = kernel.get_args() + kernel.get_source_args()
+    inner_kernel_args = set(arg.loopy_arg.name for arg in inner_kernel_args)
+
+    kernel_args = {}
+    for arg_name, arg_expr in six.iteritems(expr.kernel_arguments):
+        if arg_name not in inner_kernel_args:
+            continue
+
+        rec_arg = mapper.rec(arg_expr)
+        kernel_args[arg_name] = _resample_arg(mapper.queue, source, rec_arg)
+
+    return kernel_args
+
+
+def _get_weights_and_area_elements(queue, source, source_discr):
+    """
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
+    :arg source_discr: a :class:`meshmode.discretization.Discretization`.
+
+    :return: quadrature weights for each node in *source_discr*.
+    """
+
+    if source.quad_stage2_density_discr is source_discr:
+        waa = source.weights_and_area_elements().with_queue(queue)
+    else:
+        # NOTE: copied from `weights_and_area_elements`, but using the
+        # discretization given by `where` and no interpolation
+        area = bind(source_discr,
+                sym.area_element(source.ambient_dim, source.dim))(queue)
+        qweight = bind(source_discr, sym.QWeight())(queue)
+        waa = area * qweight
+
+    return waa
+
+
+def _get_centers_and_expansion_radii(queue, source, target_discr, qbx_forced_limit):
+    """
+    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg source: a :class:`pytential.source.LayerPotentialSourceBase`.
+    :arg target_discr: a :class:`meshmode.discretization.Discretization`.
+    :arg qbx_forced_limit: an integer (*+1* or *-1*).
+
+    :return: a tuple of `(centers, radii)` for each node in *target_discr*.
+    """
+
+    if source.density_discr is target_discr:
+        # NOTE: skip expensive target association
+        from pytential.qbx.utils import get_centers_on_side
+        centers = get_centers_on_side(source, qbx_forced_limit)
+        radii = source._expansion_radii('nsources')
+    else:
+        from pytential.qbx.utils import get_interleaved_centers
+        centers = get_interleaved_centers(queue, source)
+        radii = source._expansion_radii('ncenters')
+
+        # NOTE: use a fairly large tolerance to make sure all the stage2
+        # targets are associated to a center. We can't use the user-provided
+        # source.target_association_tolerance here because it will likely be
+        # way too small.
+        target_association_tolerance = 1.0e-1
+
+        from pytential.qbx.target_assoc import associate_targets_to_qbx_centers
+        code_container = source.target_association_code_container
+        assoc = associate_targets_to_qbx_centers(
+                source,
+                code_container.get_wrangler(queue),
+                [(target_discr, qbx_forced_limit)],
+                target_association_tolerance=target_association_tolerance)
+
+        centers = [cl.array.take(c, assoc.target_to_center, queue=queue)
+                   for c in centers]
+        radii = cl.array.take(radii, assoc.target_to_center, queue=queue)
+
+    return centers, radii
+
+# }}}
+
+
+# {{{ base class for matrix builders
+
+class MatrixBuilderBase(EvaluationMapperBase):
+    def __init__(self, queue, dep_expr, other_dep_exprs,
+            dep_source, dep_discr, places, context):
+        """
+        :arg queue: a :class:`pyopencl.CommandQueue`.
+        :arg dep_expr: symbolic expression for the input block column
+            that the builder is evaluating.
+        :arg other_dep_exprs: symbolic expressions for the remaining input
+            block columns.
+        :arg dep_source: a :class:`pytential.source.LayerPotentialSourceBase`
+            for the given *dep_expr*.
+        :arg dep_discr: a concrete :class:`meshmode.discretization.Discretization`
+            for the given *dep_expr*.
+        :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`
+            for all the sources and targets the builder is expected to
+            encounter.
+ """ + super(MatrixBuilderBase, self).__init__(context=context) -class MatrixBuilder(EvaluationMapperBase): - def __init__(self, queue, dep_expr, other_dep_exprs, dep_source, places, - context): self.queue = queue self.dep_expr = dep_expr self.other_dep_exprs = other_dep_exprs self.dep_source = dep_source - self.dep_discr = dep_source.density_discr + self.dep_discr = dep_discr self.places = places - self.context = context + + self.dep_nnodes = dep_discr.nnodes + + # {{{ + + def get_dep_variable(self): + return np.eye(self.dep_nnodes, dtype=np.float64) + + def is_kind_vector(self, x): + return len(x.shape) == 1 + + def is_kind_matrix(self, x): + return len(x.shape) == 2 + + # }}} + + # {{{ map_xxx implementation def map_variable(self, expr): if expr == self.dep_expr: - return np.eye(self.dep_discr.nnodes, dtype=np.float64) + return self.get_dep_variable() elif expr in self.other_dep_exprs: return 0 else: - return super(MatrixBuilder, self).map_variable(expr) + return super(MatrixBuilderBase, self).map_variable(expr) def map_subscript(self, expr): if expr == self.dep_expr: - return np.eye(self.dep_discr.nnodes, dtype=np.float64) + return self.get_dep_variable() elif expr in self.other_dep_exprs: return 0 else: - return super(MatrixBuilder, self).map_subscript(expr) + return super(MatrixBuilderBase, self).map_subscript(expr) def map_sum(self, expr): sum_kind = None @@ -83,13 +261,12 @@ class MatrixBuilder(EvaluationMapperBase): continue if isinstance(rec_child, np.ndarray): - if len(rec_child.shape) == 2: + if self.is_kind_matrix(rec_child): term_kind = term_kind_matrix - elif len(rec_child.shape) == 1: + elif self.is_kind_vector(rec_child): term_kind = term_kind_vector else: raise RuntimeError("unexpected array rank") - else: term_kind = term_kind_scalar @@ -108,71 +285,141 @@ class MatrixBuilder(EvaluationMapperBase): mat_result = None vecs_and_scalars = 1 - for term in expr.children: - rec_term = self.rec(term) + for child in expr.children: + rec_child = self.rec(child) - if is_zero(rec_term): + if is_zero(rec_child): return 0 - if isinstance(rec_term, (np.number, int, float, complex)): - vecs_and_scalars = vecs_and_scalars * rec_term - elif isinstance(rec_term, np.ndarray): - if len(rec_term.shape) == 2: + if isinstance(rec_child, (np.number, int, float, complex)): + vecs_and_scalars = vecs_and_scalars * rec_child + elif isinstance(rec_child, np.ndarray): + if self.is_kind_matrix(rec_child): if mat_result is not None: raise RuntimeError("expression is nonlinear in %s" % self.dep_expr) else: - mat_result = rec_term + mat_result = rec_child else: - vecs_and_scalars = vecs_and_scalars * rec_term + vecs_and_scalars = vecs_and_scalars * rec_child if mat_result is not None: - if ( - isinstance(vecs_and_scalars, np.ndarray) - and len(vecs_and_scalars.shape) == 1): + if (isinstance(vecs_and_scalars, np.ndarray) + and self.is_kind_vector(vecs_and_scalars)): vecs_and_scalars = vecs_and_scalars[:, np.newaxis] return mat_result * vecs_and_scalars else: return vecs_and_scalars + def map_num_reference_derivative(self, expr): + rec_operand = self.rec(expr.operand) + + assert isinstance(rec_operand, np.ndarray) + if self.is_kind_matrix(rec_operand): + raise NotImplementedError("derivatives") + + where_discr = self.places[expr.where] + op = sym.NumReferenceDerivative(expr.ref_axes, sym.var("u")) + return bind(where_discr, op)( + self.queue, u=cl.array.to_device(self.queue, rec_operand)).get() + + def map_node_coordinate_component(self, expr): + where_discr = self.places[expr.where] + op = 
sym.NodeCoordinateComponent(expr.ambient_axis) + return bind(where_discr, op)(self.queue).get() + + def map_call(self, expr): + arg, = expr.parameters + rec_arg = self.rec(arg) + + if isinstance(rec_arg, np.ndarray) and self.is_kind_matrix(rec_arg): + raise RuntimeError("expression is nonlinear in variable") + + if isinstance(rec_arg, np.ndarray): + rec_arg = cl.array.to_device(self.queue, rec_arg) + + op = expr.function(sym.var("u")) + result = bind(self.dep_source, op)(self.queue, u=rec_arg) + + if isinstance(result, cl.array.Array): + result = result.get() + + return result + + # }}} + + +class MatrixBlockBuilderBase(MatrixBuilderBase): + """Evaluate individual blocks of a matrix operator. + + Unlike, e.g. :class:`MatrixBuilder`, matrix block builders are + significantly reduced in scope. They are basically just meant + to evaluate linear combinations of layer potential operators. + For example, they do not support composition of operators because we + assume that each operator acts directly on the density. + """ + + def __init__(self, queue, dep_expr, other_dep_exprs, + dep_source, dep_discr, places, index_set, context): + """ + :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` class + describing which blocks are going to be evaluated. + """ + + super(MatrixBlockBuilderBase, self).__init__(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, context) + + self.index_set = index_set + self.dep_nnodes = index_set.col.indices.size + + def get_dep_variable(self): + return 1.0 + + def is_kind_vector(self, x): + # NOTE: since matrices are flattened, the only way to differentiate + # them from a vector is by size + return x.size == self.index_set.row.indices.size + + def is_kind_matrix(self, x): + # NOTE: since matrices are flattened, we recognize them by checking + # if they have the right size + return x.size == self.index_set.linear_row_indices.size + +# }}} + + +# {{{ QBX layer potential matrix builder + +# FIXME: PyOpenCL doesn't do all the required matrix math yet. +# We'll cheat and build the matrix on the host. 
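For orientation, a hedged sketch of how the dense builder defined just below is typically driven. It assumes the ``build_matrix`` entry point in ``pytential.symbolic.execution`` and illustrative names ``queue``, ``qbx`` (a QBX layer potential source), and ``rhs``; none of this is part of the patch itself:

    import numpy.linalg as la
    from sumpy.kernel import LaplaceKernel
    from pytential import sym
    from pytential.symbolic.execution import build_matrix

    sigma_sym = sym.var("sigma")
    op = 0.5*sigma_sym + sym.D(LaplaceKernel(2), sigma_sym,
            qbx_forced_limit="avg")

    mat = build_matrix(queue, qbx, op, sigma_sym).get()
    sigma = la.solve(mat, rhs)  # dense solve on the host, as the FIXME notes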
+ +class MatrixBuilder(MatrixBuilderBase): + def __init__(self, queue, dep_expr, other_dep_exprs, + dep_source, dep_discr, places, context): + super(MatrixBuilder, self).__init__(queue, dep_expr, other_dep_exprs, + dep_source, dep_discr, places, context) + def map_int_g(self, expr): - source = self.places[expr.source] - target_discr = self.places[expr.target] + where_source = expr.source + if where_source is sym.DEFAULT_SOURCE: + where_source = sym.QBXSourceQuadStage2(expr.source) - if source.density_discr is not target_discr: - raise NotImplementedError() + source = self.places[expr.source] + source_discr = self.places.get_discretization(where_source) + target_discr = self.places.get_discretization(expr.target) rec_density = self.rec(expr.density) if is_zero(rec_density): return 0 assert isinstance(rec_density, np.ndarray) - if len(rec_density.shape) != 2: + if not self.is_kind_matrix(rec_density): raise NotImplementedError("layer potentials on non-variables") kernel = expr.kernel - - kernel_args = {} - for arg_name, arg_expr in six.iteritems(expr.kernel_arguments): - rec_arg = self.rec(arg_expr) - - if isinstance(rec_arg, np.ndarray): - if len(rec_arg.shape) == 2: - raise RuntimeError("matrix variables in kernel arguments") - if len(rec_arg.shape) == 1: - from pytools.obj_array import with_object_array_or_scalar - - def resample(x): - return ( - source.resampler( - self.queue, - cl.array.to_device(self.queue, x)) - .get(queue=self.queue)) - - rec_arg = with_object_array_or_scalar(resample, rec_arg) - - kernel_args[arg_name] = rec_arg + kernel_args = _get_layer_potential_args(self, expr, source) from sumpy.expansion.local import LineTaylorLocalExpansion local_expn = LineTaylorLocalExpansion(kernel, source.qbx_order) @@ -181,65 +428,210 @@ class MatrixBuilder(EvaluationMapperBase): mat_gen = LayerPotentialMatrixGenerator( self.queue.context, (local_expn,)) - assert target_discr is source.density_discr + assert abs(expr.qbx_forced_limit) > 0 + centers, radii = _get_centers_and_expansion_radii(self.queue, + source, target_discr, expr.qbx_forced_limit) - from pytential.qbx.utils import get_centers_on_side + _, (mat,) = mat_gen(self.queue, + targets=target_discr.nodes(), + sources=source_discr.nodes(), + centers=centers, + expansion_radii=radii, + **kernel_args) + mat = mat.get() + + waa = _get_weights_and_area_elements(self.queue, source, source_discr) + mat[:, :] *= waa.get(self.queue) + + if target_discr.nnodes != source_discr.nnodes: + # NOTE: we only resample sources + assert target_discr.nnodes < source_discr.nnodes + + resampler = source.direct_resampler + resample_mat = resampler.full_resample_matrix(self.queue).get(self.queue) + mat = mat.dot(resample_mat) + + mat = mat.dot(rec_density) + + return mat + +# }}} + + +# {{{ p2p matrix builder + +class P2PMatrixBuilder(MatrixBuilderBase): + def __init__(self, queue, dep_expr, other_dep_exprs, + dep_source, dep_discr, places, context, exclude_self=True): + super(P2PMatrixBuilder, self).__init__(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, context) + + self.exclude_self = exclude_self + + def map_int_g(self, expr): + source = self.places[expr.source] + source_discr = self.places.get_discretization(expr.source) + target_discr = self.places.get_discretization(expr.target) + + rec_density = self.rec(expr.density) + if is_zero(rec_density): + return 0 + + assert isinstance(rec_density, np.ndarray) + if not self.is_kind_matrix(rec_density): + raise NotImplementedError("layer potentials on non-variables") + + kernel = 
expr.kernel.get_base_kernel() + kernel_args = _get_kernel_args(self, kernel, expr, source) + if self.exclude_self: + kernel_args["target_to_source"] = \ + cl.array.arange(self.queue, 0, target_discr.nnodes, dtype=np.int) + + from sumpy.p2p import P2PMatrixGenerator + mat_gen = P2PMatrixGenerator( + self.queue.context, (kernel,), exclude_self=self.exclude_self) - assert abs(expr.qbx_forced_limit) > 0 _, (mat,) = mat_gen(self.queue, - target_discr.nodes(), - source.quad_stage2_density_discr.nodes(), - get_centers_on_side(source, expr.qbx_forced_limit), - expansion_radii=self.dep_source._expansion_radii("nsources"), + targets=target_discr.nodes(), + sources=source_discr.nodes(), **kernel_args) mat = mat.get() + mat = mat.dot(rec_density) - waa = source.weights_and_area_elements().get(queue=self.queue) - mat[:, :] *= waa + return mat +# }}} - resample_mat = ( - source.resampler.full_resample_matrix(self.queue).get(self.queue)) - mat = mat.dot(resample_mat) - mat = mat.dot(rec_density) + +# {{{ block matrix builders + +class NearFieldBlockBuilder(MatrixBlockBuilderBase): + def __init__(self, queue, dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context): + super(NearFieldBlockBuilder, self).__init__(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context) + + # NOTE: we need additional mappers to redirect some operations: + # * mat_mapper is used to compute any kernel arguments that need to + # be computed on the full discretization, ignoring our index_set, + # e.g the normal in a double layer potential + # * blk_mapper is used to recursively compute the density to + # a layer potential operator to ensure there is no composition + self.mat_mapper = MatrixBuilderBase(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, context) + self.blk_mapper = MatrixBlockBuilderBase(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context) + + def get_dep_variable(self): + tgtindices = self.index_set.linear_row_indices.get(self.queue) + srcindices = self.index_set.linear_col_indices.get(self.queue) + + return np.equal(tgtindices, srcindices).astype(np.float64) + + def map_int_g(self, expr): + source = self.places[expr.source] + source_discr = self.places.get_discretization(expr.source) + target_discr = self.places.get_discretization(expr.target) + + if source_discr is not target_discr: + raise NotImplementedError() + + rec_density = self.blk_mapper.rec(expr.density) + if is_zero(rec_density): + return 0 + + if not np.isscalar(rec_density): + raise NotImplementedError() + + kernel = expr.kernel + kernel_args = _get_layer_potential_args(self.mat_mapper, expr, source) + + from sumpy.expansion.local import LineTaylorLocalExpansion + local_expn = LineTaylorLocalExpansion(kernel, source.qbx_order) + + from sumpy.qbx import LayerPotentialMatrixBlockGenerator + mat_gen = LayerPotentialMatrixBlockGenerator( + self.queue.context, (local_expn,)) + + assert abs(expr.qbx_forced_limit) > 0 + centers, radii = _get_centers_and_expansion_radii(self.queue, + source, target_discr, expr.qbx_forced_limit) + + _, (mat,) = mat_gen(self.queue, + targets=target_discr.nodes(), + sources=source_discr.nodes(), + centers=centers, + expansion_radii=radii, + index_set=self.index_set, + **kernel_args) + + waa = _get_weights_and_area_elements(self.queue, source, source_discr) + mat *= waa[self.index_set.linear_col_indices] + mat = rec_density * mat.get(self.queue) return mat - # IntGdSource should have been removed by a preprocessor - def 
map_num_reference_derivative(self, expr): - rec_operand = self.rec(expr.operand) +class FarFieldBlockBuilder(MatrixBlockBuilderBase): + def __init__(self, queue, dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context, exclude_self=False): + super(FarFieldBlockBuilder, self).__init__(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context) - assert isinstance(rec_operand, np.ndarray) - if len(rec_operand.shape) == 2: - raise NotImplementedError("derivatives") + # NOTE: same mapper issues as in the NearFieldBlockBuilder + self.exclude_self = exclude_self + self.mat_mapper = MatrixBuilderBase(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, context) + self.blk_mapper = MatrixBlockBuilderBase(queue, + dep_expr, other_dep_exprs, dep_source, dep_discr, + places, index_set, context) - where_discr = self.places[expr.where] - op = sym.NumReferenceDerivative(expr.ref_axes, sym.var("u")) - return bind(where_discr, op)( - self.queue, u=cl.array.to_device(self.queue, rec_operand)).get() + def get_dep_variable(self): + tgtindices = self.index_set.linear_row_indices.get(self.queue) + srcindices = self.index_set.linear_col_indices.get(self.queue) - def map_node_coordinate_component(self, expr): - where_discr = self.places[expr.where] - op = sym.NodeCoordinateComponent(expr.ambient_axis) - return bind(where_discr, op)(self.queue).get() + return np.equal(tgtindices, srcindices).astype(np.float64) - def map_call(self, expr): - arg, = expr.parameters - rec_arg = self.rec(arg) + def map_int_g(self, expr): + source = self.places[expr.source] + source_discr = self.places.get_discretization(expr.source) + target_discr = self.places.get_discretization(expr.target) - if ( - isinstance(rec_arg, np.ndarray) - and len(rec_arg.shape) == 2): - raise RuntimeError("expression is nonlinear in variable") + if source_discr is not target_discr: + raise NotImplementedError() - if isinstance(rec_arg, np.ndarray): - rec_arg = cl.array.to_device(self.queue, rec_arg) + rec_density = self.blk_mapper.rec(expr.density) + if is_zero(rec_density): + return 0 - op = expr.function(sym.var("u")) - result = bind(self.dep_source, op)(self.queue, u=rec_arg) + if not np.isscalar(rec_density): + raise NotImplementedError() - if isinstance(result, cl.array.Array): - result = result.get() + kernel = expr.kernel.get_base_kernel() + kernel_args = _get_kernel_args(self.mat_mapper, kernel, expr, source) + if self.exclude_self: + kernel_args["target_to_source"] = \ + cl.array.arange(self.queue, 0, target_discr.nnodes, dtype=np.int) - return result + from sumpy.p2p import P2PMatrixBlockGenerator + mat_gen = P2PMatrixBlockGenerator( + self.queue.context, (kernel,), exclude_self=self.exclude_self) + + _, (mat,) = mat_gen(self.queue, + targets=target_discr.nodes(), + sources=source_discr.nodes(), + index_set=self.index_set, + **kernel_args) + mat = rec_density * mat.get(self.queue) + + return mat + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/symbolic/pde/cahn_hilliard.py b/pytential/symbolic/pde/cahn_hilliard.py index fd6ec4824cc107fb5d4ea4de8520997f59557ca5..3263f18e1b9c144a743335107339770a12425c30 100644 --- a/pytential/symbolic/pde/cahn_hilliard.py +++ b/pytential/symbolic/pde/cahn_hilliard.py @@ -54,12 +54,10 @@ class CahnHilliardOperator(L2WeightedPDEOperator): return ( # FIXME: Verify scaling -1/(2*np.pi*(lam1**2-lam2**2)) / hhk_scaling - * - ( + * ( op_map(sym.S(hhk, density, k=1j*lam1, qbx_forced_limit=qbx_forced_limit)) - - - op_map(sym.S(hhk, density, 
k=1j*lam2, + - op_map(sym.S(hhk, density, k=1j*lam2, qbx_forced_limit=qbx_forced_limit)))) else: return ( diff --git a/pytential/symbolic/pde/maxwell/__init__.py b/pytential/symbolic/pde/maxwell/__init__.py index 78219b568053480f6ef08bcfb913efdb9757b5a5..419043d4ecc06e764463f8871a56d4c4f01c4a3a 100644 --- a/pytential/symbolic/pde/maxwell/__init__.py +++ b/pytential/symbolic/pde/maxwell/__init__.py @@ -117,7 +117,8 @@ def get_sym_maxwell_plane_wave(amplitude_vec, v, omega, epsilon=1, mu=1, where=N # }}} -# {{{ point source for vector potential based on Lorenz gauge + +# {{{ Maxwell sources for scalar/vector potentials def get_sym_maxwell_point_source_potentials(kernel, jxyz, k): r"""Return a symbolic expression that, when bound to a @@ -136,50 +137,67 @@ def get_sym_maxwell_point_source_potentials(kernel, jxyz, k): field[:3]/(1j*k) # vector potential ) -# }}} def get_sym_maxwell_planewave_gradphi(u, Ep, k, where=None): - r""" - Return symbolic expression that can be bound to a :class:`pytential.source.PointPotentialSource` - and yield the gradient of a scalar potential field satisfying Maxwell's equations. + r""" Return symbolic expression that can be bound to a + :class:`pytential.source.PointPotentialSource` and yield the gradient of a + scalar potential field satisfying Maxwell's equations. + + Represents the following: - Should be representing the following: .. math:: + \nabla \phi(x) = - e^{i k x^T u} E_p^T \left( 1 + i k x^T u\right) """ x = sym.nodes(3, where).as_vector() - grad_phi = -sym.exp(1j*k*np.dot(x,u)) * (Ep.T + 1j*k*np.dot(Ep,x)*u.T) + grad_phi = -sym.exp(1j*k*np.dot(x, u)) * (Ep.T + 1j*k*np.dot(Ep, x)*u.T) return grad_phi + def get_sym_maxwell_planewave_divA(u, Ep, k, epsilon=1, mu=1, where=None): - r""" - Return symbolic expression that can be bound to a :class:`pytential.source.PointPotentialSource` - and yield the divergence of a vector potential field satisfying Maxwell's equations. + r"""Return symbolic expression that can be bound to a + :class:`pytential.source.PointPotentialSource` and yield the divergence of + a vector potential field satisfying Maxwell's equations. + + Represents the following: - Should be representing the following: .. math:: - \nabla \cdot \boldsymbol{A} = -\sqrt{\mu \epsilon} e^{i k x^T u} E_p^T \left( u + i k x\right) + + \nabla \cdot \boldsymbol{A} = -\sqrt{\mu \epsilon} + e^{i k x^T u} E_p^T \left( u + i k x\right) """ x = sym.nodes(3, where).as_vector() - divA = sym.join_fields(-sym.sqrt(epsilon*mu) * sym.exp(1j*k*np.dot(x,u)) * np.dot(Ep,u + 1j*k*x)) + divA = sym.join_fields( + -sym.sqrt(epsilon*mu) * sym.exp(1j*k*np.dot(x, u)) + * np.dot(Ep, u + 1j*k*x)) + return divA + def get_sym_maxwell_planewave_potentials(u, Ep, k, epsilon=1, mu=1, where=None): - r""" - Return a 2-tuple of symbolic expressions that can be bound to a :class:`pytential.source.PointPotentialSource` - and yield the scalar and vector potential fields satisfying Maxwell's equations that represent - a plane wave. + r"""Return a 2-tuple of symbolic expressions that can be bound to a + :class:`pytential.source.PointPotentialSource` and yield the scalar and + vector potential fields satisfying Maxwell's equations that represent a + plane wave. + + Represents the following: - Should be representing the following: .. math:: - \boldsymbol{A} = -u \left(x \cdot E_p \right)\sqrt{\mu \epsilon} e^{i k x^T u} + + \boldsymbol{A} = -u \left(x \cdot E_p \right) + \sqrt{\mu \epsilon} e^{i k x^T u} + .. 
math:: + \phi = - \left(x \cdot E_p\right) e^{i k x^T u} """ x = sym.nodes(3, where).as_vector() - A = -u * np.dot(x,Ep) * sym.sqrt(epsilon*mu) * sym.exp(1j*k*np.dot(x,u)) - phi = sym.join_fields(-np.dot(x,Ep) * sym.exp(1j*k*np.dot(x,u))) - return (phi, A) + A = -u * np.dot(x, Ep) * sym.sqrt(epsilon*mu) * sym.exp(1j*k*np.dot(x, u)) + phi = sym.join_fields(-np.dot(x, Ep) * sym.exp(1j*k*np.dot(x, u))) + return phi, A + +# }}} + # {{{ Charge-Current MFIE @@ -234,7 +252,7 @@ class PECChargeCurrentMFIEOperator: def scattered_volume_field(self, Jt, rho, qbx_forced_limit=None): """ - This will return an object of six entries, the first three of which + This will return an object array of six entries, the first three of which represent the electric, and the second three of which represent the magnetic field. This satisfies the time-domain Maxwell's equations as verified by :func:`sumpy.point_calculus.frequency_domain_maxwell`. @@ -299,14 +317,14 @@ class MuellerAugmentedMFIEOperator(object): grad = partial(sym.grad, 3) - E0 = sym.cse(1j*omega*mu0*eps0*S(Jxyz, k=k0) + - mu0*curl_S(Mxyz, k=k0) - grad(S(u.rho_e, k=k0)), "E0") - H0 = sym.cse(-1j*omega*mu0*eps0*S(Mxyz, k=k0) + - eps0*curl_S(Jxyz, k=k0) + grad(S(u.rho_m, k=k0)), "H0") - E1 = sym.cse(1j*omega*mu1*eps1*S(Jxyz, k=k1) + - mu1*curl_S(Mxyz, k=k1) - grad(S(u.rho_e, k=k1)), "E1") - H1 = sym.cse(-1j*omega*mu1*eps1*S(Mxyz, k=k1) + - eps1*curl_S(Jxyz, k=k1) + grad(S(u.rho_m, k=k1)), "H1") + E0 = sym.cse(1j*omega*mu0*eps0*S(Jxyz, k=k0) + + mu0*curl_S(Mxyz, k=k0) - grad(S(u.rho_e, k=k0)), "E0") + H0 = sym.cse(-1j*omega*mu0*eps0*S(Mxyz, k=k0) + + eps0*curl_S(Jxyz, k=k0) + grad(S(u.rho_m, k=k0)), "H0") + E1 = sym.cse(1j*omega*mu1*eps1*S(Jxyz, k=k1) + + mu1*curl_S(Mxyz, k=k1) - grad(S(u.rho_e, k=k1)), "E1") + H1 = sym.cse(-1j*omega*mu1*eps1*S(Mxyz, k=k1) + + eps1*curl_S(Jxyz, k=k1) + grad(S(u.rho_m, k=k1)), "H1") F1 = (xyz_to_tangential(sym.n_cross(H1-H0) + 0.5*(eps0+eps1)*Jxyz)) F2 = (sym.n_dot(eps1*E1-eps0*E0) + 0.5*(eps1+eps0)*u.rho_e) diff --git a/pytential/symbolic/pde/maxwell/dpie.py b/pytential/symbolic/pde/maxwell/dpie.py index 8b0b39dd0ef8e53002d38a38897d130aab1e70b7..b722d2981363c325ea570c87b09538e4849dd063 100644 --- a/pytential/symbolic/pde/maxwell/dpie.py +++ b/pytential/symbolic/pde/maxwell/dpie.py @@ -22,38 +22,32 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -# import useful tools/libs -import numpy as np # noqa -from pytential import bind, sym -from collections import namedtuple -from functools import partial - -# define a few functions based on existing functions -tangential_to_xyz = sym.tangential_to_xyz -xyz_to_tangential = sym.xyz_to_tangential -cse = sym.cse +import numpy as np # noqa +from pytential import sym __doc__ = """ .. autoclass:: DPIEOperator .. autoclass:: DPIEOperatorEvanescent """ +cse = sym.cse # {{{ Decoupled Potential Integral Equation Operator - based on Arxiv paper -class DPIEOperator: + +class DPIEOperator(object): r""" Decoupled Potential Integral Equation operator with PEC boundary conditions, defaults as scaled DPIE. - See https://arxiv.org/abs/1404.0749 for derivation. + See `the arxiv paper `_ for derivation. 
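For reference, the Lorenz-gauge relations that tie the potentials back to the fields, under the :math:`\exp(-i \omega t)` time convention used in the docstring below (standard background, not spelled out in the patch itself):

.. math::

    E = i \omega \boldsymbol{A} - \nabla \phi, \qquad
    H = \frac{1}{\mu} \nabla \times \boldsymbol{A}, \qquad
    \nabla \cdot \boldsymbol{A} = i \omega \mu \epsilon \phi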
-    Uses :math:`E(x,t) = Re \lbrace E(x) \exp(-i \omega t) \rbrace` and
-    :math:`H(x,t) = Re \lbrace H(x) \exp(-i \omega t) \rbrace` and solves for
+    Uses :math:`E(x, t) = Re \lbrace E(x) \exp(-i \omega t) \rbrace` and
+    :math:`H(x, t) = Re \lbrace H(x) \exp(-i \omega t) \rbrace` and solves for
     the :math:`E(x)`, :math:`H(x)` fields using vector and scalar potentials via
-    the Lorenz Gauage. The DPIE formulates the problem purely in terms of the
-    vector and scalar potentials, :math:`\boldsymbol{A}` and :math:`\phi`,
-    and then backs out :math:`E(x)` and :math:`H(x)` via relationships to
+    the Lorenz gauge. The DPIE formulates the problem purely in terms of the
+    vector and scalar potentials, :math:`\boldsymbol{A}` and :math:`\phi`,
+    and then backs out :math:`E(x)` and :math:`H(x)` via relationships to the
     vector and scalar potentials.
     """

@@ -61,15 +55,15 @@ class DPIEOperator:
         from sumpy.kernel import HelmholtzKernel

         # specify the frequency variable that will be tuned
-        self.k          = k
-        self.stype      = type(self.k)
+        self.k = k
+        self.stype = type(self.k)

-        # specify the 3-D Helmholtz kernel
-        self.kernel     = HelmholtzKernel(3)
+        self.kernel = HelmholtzKernel(3)

         # specify a list of strings representing geometry objects
-        self.geometry_list   = geometry_list
-        self.nobjs           = len(geometry_list)
+        self.geometry_list = geometry_list
+        self.nobjs = len(geometry_list)

     def num_distinct_objects(self):
         return self.nobjs

@@ -84,17 +78,16 @@ class DPIEOperator:
         return 2*len(self.geometry_list)

     def get_vector_domain_list(self):
-        """
-        Method to return domain list that will be used within the scipy_op method to
-        solve the system of discretized integral equations. What is returned should just
-        be a list with values that are strings or None.
+        """Method to return domain list that will be used within the scipy_op
+        method to solve the system of discretized integral equations. What is
+        returned should just be a list with values that are strings or None.
         """

         # initialize domain list
         domain_list = [None]*self.num_vector_potential_densities()

         # get strings for the actual densities
-        for n in range(0,self.nobjs):
+        for n in range(0, self.nobjs):

             # grab nth location identifier
             location = self.geometry_list[n] + "t"

@@ -110,17 +103,16 @@ class DPIEOperator:
         return domain_list

     def get_scalar_domain_list(self):
-        """
-        Method to return domain list that will be used within the scipy_op method to
-        solve the system of discretized integral equations. What is returned should just
-        be a list with values that are strings or None.
+        """Method to return domain list that will be used within the scipy_op
+        method to solve the system of discretized integral equations. What is
+        returned should just be a list with values that are strings or None.
         """

         # initialize domain list
         domain_list = [None]*self.num_scalar_potential_densities()

         # get strings for the actual densities
-        for n in range(0,self.nobjs):
+        for n in range(0, self.nobjs):

             # grab nth location identifier
             location = self.geometry_list[n] + "t"

@@ -132,17 +124,16 @@ class DPIEOperator:
     def get_subproblem_domain_list(self):
-        """
-        Method to return domain list that will be used within the scipy_op method to
-        solve the system of discretized integral equations. What is returned should just
-        be a list with values that are strings or None.
+        """Method to return domain list that will be used within the scipy_op
+        method to solve the system of discretized integral equations.
What is + returned should just be a list with values that are strings or None. """ # initialize domain list domain_list = [None]*self.nobjs # get strings for the actual densities - for n in range(0,self.nobjs): + for n in range(0, self.nobjs): # grab nth location identifier location = self.geometry_list[n] + "t" @@ -153,10 +144,10 @@ class DPIEOperator: # return the domain list return domain_list - - def _layerpot_op(self,layerpot_op,density_vec, target=None, qfl="avg", k=None, kernel=None, use_laplace=False): - """ - Generic layer potential operator method that works across all objects within the DPIE model + def _layerpot_op(self, layerpot_op, density_vec, target=None, qfl="avg", k=None, + kernel=None, use_laplace=False): + """Generic layer potential operator method that works across all + objects within the DPIE model """ if kernel is None: kernel = self.kernel @@ -168,9 +159,11 @@ class DPIEOperator: if not use_laplace: kargs['k'] = k - # define a convenient integral operator that functions across the multiple objects + # define a convenient integral operator that functions across the + # multiple objects def int_op(idx): - return layerpot_op(kernel, density_vec[:,idx],qbx_forced_limit=qfl,source=self.geometry_list[idx],target=target, **kargs) + return layerpot_op(kernel, density_vec[:, idx], qbx_forced_limit=qfl, + source=self.geometry_list[idx], target=target, **kargs) # get the shape of density_vec (ndim, nobj) = density_vec.shape @@ -180,7 +173,7 @@ class DPIEOperator: # compute individual double layer potential evaluations at the given # density across all the disjoint objects - for i in range(0,nobj): + for i in range(0, nobj): output = output + int_op(i) # return the output summation @@ -189,42 +182,56 @@ class DPIEOperator: else: return output - def D(self, density_vec, target=None, qfl="avg", k=None, kernel=None, use_laplace=False): + def D(self, density_vec, target=None, qfl="avg", k=None, kernel=None, + use_laplace=False): """ Double layer potential operator across multiple disjoint objects """ - return self._layerpot_op(layerpot_op=sym.D, density_vec=density_vec, target=target, qfl=qfl, k=k, kernel=kernel, use_laplace=use_laplace) + return self._layerpot_op(layerpot_op=sym.D, density_vec=density_vec, + target=target, qfl=qfl, k=k, kernel=kernel, + use_laplace=use_laplace) - def S(self, density_vec, target=None, qfl="avg", k=None, kernel=None, use_laplace=False): + def S(self, density_vec, target=None, qfl="avg", k=None, kernel=None, + use_laplace=False): """ Single layer potential operator across multiple disjoint objects """ - return self._layerpot_op(layerpot_op=sym.S, density_vec=density_vec, target=target, qfl=qfl, k=k, kernel=kernel, use_laplace=use_laplace) - + return self._layerpot_op(layerpot_op=sym.S, density_vec=density_vec, + target=target, qfl=qfl, k=k, kernel=kernel, + use_laplace=use_laplace) - def Dp(self, density_vec, target=None, qfl="avg", k=None, kernel=None, use_laplace=False): + def Dp(self, density_vec, target=None, qfl="avg", k=None, kernel=None, + use_laplace=False): """ D' layer potential operator across multiple disjoint objects """ - return self._layerpot_op(layerpot_op=sym.Dp, density_vec=density_vec, target=target, qfl=qfl, k=k, kernel=kernel, use_laplace=use_laplace) + return self._layerpot_op(layerpot_op=sym.Dp, density_vec=density_vec, + target=target, qfl=qfl, k=k, kernel=kernel, + use_laplace=use_laplace) - def Sp(self, density_vec, target=None, qfl="avg", k=None, kernel=None, use_laplace=False): + def Sp(self, density_vec, target=None, 
qfl="avg", k=None, kernel=None, + use_laplace=False): """ S' layer potential operator across multiple disjoint objects """ - return self._layerpot_op(layerpot_op=sym.Sp, density_vec=density_vec, target=target, qfl=qfl, k=k, kernel=kernel, use_laplace=use_laplace) + return self._layerpot_op(layerpot_op=sym.Sp, density_vec=density_vec, + target=target, qfl=qfl, k=k, kernel=kernel, + use_laplace=use_laplace) def n_cross(self, density_vec): - r""" - This method is such that an cross(n,a) can operate across vectors - a and n that are local to a set of disjoint source surfaces. Essentially, - imagine that :math:`\bar{a} = [a_1, a_2, \cdots, a_m]`, where :math:`a_k` represents a vector density - defined on the :math:`k^{th}` disjoint object. Also imagine that :math:`bar{n} = [n_1, n_2, \cdots, n_m]`, - where :math:`n_k` represents a normal that exists on the :math:`k^{th}` disjoint object. The goal, then, - is to have an operator that does element-wise cross products.. ie: + r"""This method is such that ``cross(n, a)`` can operate across vectors + a and n that are local to a set of disjoint source surfaces. + Essentially, imagine that :math:`\bar{a} = [a_1, a_2, \cdots, a_m]`, + where :math:`a_k` represents a vector density defined on the + :math:`k^{th}` disjoint object. Also imagine that :math:`bar{n} = [n_1, + n_2, \cdots, n_m]`, where :math:`n_k` represents a normal that exists + on the :math:`k^{th}` disjoint object. The goal, then, is to have an + operator that does element-wise cross products.. ie: .. math:: - \bar{n} \times \bar{a}) = [ \left(n_1 \times a_1\right), ..., \left(n_m \times a_m \right)] + + \bar{n} \times \bar{a}) = [ \left(n_1 \times a_1\right), \dots, + \left(n_m \times a_m \right)] """ # specify the sources to be evaluated at @@ -241,23 +248,28 @@ class DPIEOperator: # loop through the density and sources to construct the appropriate # element-wise cross product operation - for k in range(0,nobj): - output[:,k] = sym.n_cross(density_vec[:,k],where=sources[k]) + for k in range(0, nobj): + output[:, k] = sym.n_cross(density_vec[:, k], where=sources[k]) # return result from element-wise cross product return output def n_times(self, density_vec): - r""" - This method is such that an :math:`\boldsymbol{n} \rho`, for some normal :math:`\boldsymbol{n}` and - some scalar :math:`\rho` can be done across normals and scalars that exist on multiple surfaces. Essentially, - imagine that :math:`\bar{\rho} = [\rho_1, \cdots, \rho_m]`, where :math:`\rho_k` represents a scalar density - defined on the :math:`k^{th}` disjoint object. Also imagine that :math:`bar{n} = [\boldsymbol{n}_1, \cdots, \boldsymbol{n}_m]`, - where :math:`n_k` represents a normal that exists on the :math:`k^{th}` disjoint object. The goal, then, - is to have an operator that does element-wise products.. ie: + r"""This method is such that an :math:`\boldsymbol{n} \rho`, for some + normal :math:`\boldsymbol{n}` and some scalar :math:`\rho` can be done + across normals and scalars that exist on multiple surfaces. + Essentially, imagine that :math:`\bar{\rho} = [\rho_1, \cdots, + \rho_m]`, where :math:`\rho_k` represents a scalar density defined on + the :math:`k^{th}` disjoint object. Also imagine that :math:`bar{n} = + [\boldsymbol{n}_1, \cdots, \boldsymbol{n}_m]`, where :math:`n_k` + represents a normal that exists on the :math:`k^{th}` disjoint object. + The goal, then, is to have an operator that does element-wise + products, i.e.: .. 
math:: - \bar{n}\bar{\rho} = [ \left(\boldsymbol{n}_1 \rho_1\right), ..., \left(\boldsymbol{n}_m \rho_m \right)] + + \bar{n}\bar{\rho} = [ \left(\boldsymbol{n}_1 \rho_1\right), \dots, + \left(\boldsymbol{n}_m \rho_m \right)] """ # specify the sources to be evaluated at @@ -270,69 +282,77 @@ class DPIEOperator: assert ndim == 1 # init output symbolic quantity with zeros - output = np.zeros((3,nobj), dtype=self.stype) + output = np.zeros((3, nobj), dtype=self.stype) # loop through the density and sources to construct the appropriate # element-wise cross product operation - for k in range(0,nobj): - output[:,k] = sym.normal(3,where=sources[k]).as_vector() * density_vec[0,k] + for k in range(0, nobj): + output[:, k] = \ + sym.normal(3, where=sources[k]).as_vector() * density_vec[0, k] # return result from element-wise cross product return output - def _extract_phi_densities(self,phi_densities): - return (phi_densities[:self.nobjs],phi_densities[:self.nobjs].reshape((1,self.nobjs)),phi_densities[self.nobjs:]) + def _extract_phi_densities(self, phi_densities): + return (phi_densities[:self.nobjs], + phi_densities[:self.nobjs].reshape((1, self.nobjs)), + phi_densities[self.nobjs:]) - def _extract_tau_densities(self,tau_densities): - return (tau_densities,tau_densities.reshape((1,self.nobjs))) + def _extract_tau_densities(self, tau_densities): + return (tau_densities, tau_densities.reshape((1, self.nobjs))) - def _extract_a_densities(self,A_densities): + def _extract_a_densities(self, A_densities): a0 = A_densities[:(2*self.nobjs)] - a = np.zeros((3,self.nobjs),dtype=self.stype) + a = np.zeros((3, self.nobjs), dtype=self.stype) rho0 = A_densities[(2*self.nobjs):(3*self.nobjs)] - rho = rho0.reshape((1,self.nobjs)) + rho = rho0.reshape((1, self.nobjs)) v = A_densities[(3*self.nobjs):] - for n in range(0,self.nobjs): - a[:,n] = cse(sym.tangential_to_xyz(a0[2*n:2*(n+1)],where=self.geometry_list[n]),"axyz_{0}".format(n)) + for n in range(0, self.nobjs): + a[:, n] = cse(sym.tangential_to_xyz(a0[2*n:2*(n+1)], + where=self.geometry_list[n]), "axyz_{0}".format(n)) return (a0, a, rho0, rho, v) def _L(self, a, rho, where): # define some useful common sub expressions - Sa = cse(self.S(a,where),"Sa_"+where) - Srho = cse(self.S(rho,where),"Srho_"+where) - Sn_times_rho = cse(self.S(self.n_times(rho),where),"Sn_times_rho_"+where) - Sn_cross_a = cse(self.S(self.n_cross(a),where),"Sn_cross_a_"+where) - Drho = cse(self.D(rho,where),"Drho_"+where) + # Sa = cse(self.S(a, where), "Sa_"+where) + # Srho = cse(self.S(rho, where), "Srho_"+where) + Sn_times_rho = cse(self.S(self.n_times(rho), where), "Sn_times_rho_"+where) + # Sn_cross_a = cse(self.S(self.n_cross(a), where), "Sn_cross_a_"+where) + Drho = cse(self.D(rho, where), "Drho_"+where) return sym.join_fields( - sym.n_cross(sym.curl(self.S(a,where)) - self.k * Sn_times_rho,where=where), + sym.n_cross( + sym.curl(self.S(a, where)) - self.k * Sn_times_rho, + where=where), Drho) def _R(self, a, rho, where): # define some useful common sub expressions - Sa = cse(self.S(a,where),"Sa_"+where) - Srho = cse(self.S(rho,where),"Srho_"+where) - Sn_times_rho = cse(self.S(self.n_times(rho),where),"Sn_times_rho_"+where) - Sn_cross_a = cse(self.S(self.n_cross(a),where),"Sn_cross_a_"+where) - Drho = cse(self.D(rho,where),"Drho_"+where) + # Sa = cse(self.S(a, where), "Sa_"+where) + Srho = cse(self.S(rho, where), "Srho_"+where) + # Sn_times_rho = cse(self.S(self.n_times(rho), where), "Sn_times_rho_"+where) + Sn_cross_a = cse(self.S(self.n_cross(a), where), "Sn_cross_a_"+where) + # Drho 
= cse(self.D(rho, where), "Drho_"+where) return sym.join_fields( - sym.n_cross( self.k * Sn_cross_a + sym.grad(ambient_dim=3,operand=self.S(rho,where)),where=where), - sym.div(self.S(self.n_cross(a),where)) - self.k * Srho + sym.n_cross(self.k * Sn_cross_a + sym.grad(ambient_dim=3, + operand=self.S(rho, where)), where=where), + sym.div(self.S(self.n_cross(a), where)) - self.k * Srho ) def _scaledDPIEs_integral(self, sigma, sigma_n, where): - qfl="avg" + qfl = "avg" return sym.integral( ambient_dim=3, dim=2, - operand=(self.Dp(sigma,target=where,qfl=qfl)/self.k + 1j*0.5*sigma_n - 1j*self.Sp(sigma,target=where,qfl=qfl)), + operand=(self.Dp(sigma, target=where, qfl=qfl)/self.k + + 1j*0.5*sigma_n - 1j*self.Sp(sigma, target=where, qfl=qfl)), where=where) def _scaledDPIEv_integral(self, **kwargs): - qfl="avg" + qfl = "avg" # grab densities and domain to integrate over a = kwargs['a'] @@ -341,49 +361,55 @@ class DPIEOperator: where = kwargs['where'] # define some useful common sub expressions - Sa = cse(self.S(a,where),"Sa_"+where) - Srho = cse(self.S(rho,where),"Srho_"+where) - Sn_times_rho = cse(self.S(self.n_times(rho),where),"Sn_times_rho_"+where) - Sn_cross_a = cse(self.S(self.n_cross(a),where),"Sn_cross_a_"+where) - Drho = cse(self.D(rho,where),"Drho_"+where) + # Sa = cse(self.S(a, where), "Sa_"+where) + # Srho = cse(self.S(rho, where), "Srho_"+where) + Sn_times_rho = cse(self.S(self.n_times(rho), where), "Sn_times_rho_"+where) + Sn_cross_a = cse(self.S(self.n_cross(a), where), "Sn_cross_a_"+where) + # Drho = cse(self.D(rho, where), "Drho_"+where) return sym.integral( ambient_dim=3, dim=2, operand=( - sym.n_dot( sym.curl(self.S(a,where)),where=where) - self.k*sym.n_dot(Sn_times_rho,where=where) \ - + 1j*(self.k*sym.n_dot(Sn_cross_a,where=where) - 0.5*rho_n + self.Sp(rho,target=where,qfl=qfl)) + sym.n_dot(sym.curl(self.S(a, where)), where=where) + - self.k*sym.n_dot(Sn_times_rho, where=where) + + 1j*(self.k*sym.n_dot(Sn_cross_a, where=where) - 0.5*rho_n + + self.Sp(rho, target=where, qfl=qfl)) ), where=where) - def phi_operator(self, phi_densities): """ Integral Equation operator for obtaining scalar potential, `phi` """ # extract the densities needed to solve the system of equations - (sigma0,sigma,V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) # init output matvec vector for the phi density IE output = np.zeros((2*self.nobjs,), dtype=self.stype) # produce integral equation system over each disjoint object - for n in range(0,self.nobjs): + for n in range(0, self.nobjs): - # get nth disjoint object + # get nth disjoint object obj_n = self.geometry_list[n] # setup IE for evaluation over the nth disjoint object's surface - output[n] = 0.5*sigma0[n] + self.D(sigma,obj_n) - 1j*self.k*self.S(sigma,obj_n) - V[n] + output[n] = ( + 0.5*sigma0[n] + + self.D(sigma, obj_n) + - 1j*self.k*self.S(sigma, obj_n) - V[n] + ) - # setup equation that integrates some integral operators over the nth surface - output[self.nobjs + n] = self._scaledDPIEs_integral(sigma,sigma[0,n],where=obj_n) + # set up equation that integrates some integral operators over the + # nth surface + output[self.nobjs + n] = self._scaledDPIEs_integral(sigma, sigma[0, + n], where=obj_n) # return the resulting system of IE return output - def phi_rhs(self, phi_inc, gradphi_inc): """ The Right-Hand-Side for the Integral Equation for `phi` @@ -391,18 +417,18 @@ class DPIEOperator: # get the scalar f expression for each object f = np.zeros((self.nobjs,), dtype=self.stype) - for 
i in range(0,self.nobjs): + for i in range(0, self.nobjs): f[i] = -phi_inc[i] # get the Q_{j} terms inside RHS expression Q = np.zeros((self.nobjs,), dtype=self.stype) - for i in range(0,self.nobjs): - Q[i] = -sym.integral(3,2,sym.n_dot(gradphi_inc,where=self.geometry_list[i]),where=self.geometry_list[i]) + for i in range(0, self.nobjs): + Q[i] = -sym.integral(3, 2, sym.n_dot(gradphi_inc, + where=self.geometry_list[i]), where=self.geometry_list[i]) # return the resulting field return sym.join_fields(f, Q/self.k) - def a_operator(self, A_densities): """ Integral Equation operator for obtaining vector potential, `A` @@ -415,7 +441,7 @@ class DPIEOperator: output = np.zeros((4*self.nobjs,), dtype=self.stype) # produce integral equation system over each disjoint object - for n in range(0,self.nobjs): + for n in range(0, self.nobjs): # get the nth target geometry to have IE solved across obj_n = self.geometry_list[n] @@ -426,14 +452,17 @@ class DPIEOperator: # generate the set of equations for the vector densities, a, coupled # across the various geometries involved - output[2*n:2*(n+1)] = xyz_to_tangential(0.5*a[:,n] + L[:3] + 1j*R[:3], where=obj_n) + output[2*n:2*(n+1)] = sym.xyz_to_tangential( + 0.5*a[:, n] + L[:3] + 1j*R[:3], + where=obj_n) # generate the set of equations for the scalar densities, rho, coupled # across the various geometries involved - output[(2*self.nobjs + n)] = 0.5*rho[0,n] + L[-1] + 1j*R[-1] - v[n] + output[(2*self.nobjs + n)] = 0.5*rho[0, n] + L[-1] + 1j*R[-1] - v[n] # add the equation that integrates everything out into some constant - output[3*self.nobjs + n] = self._scaledDPIEv_integral(a=a, rho=rho, rho_n=rho[0,n], where=obj_n) + output[3*self.nobjs + n] = self._scaledDPIEv_integral( + a=a, rho=rho, rho_n=rho[0, n], where=obj_n) # return output equations return output @@ -447,16 +476,18 @@ class DPIEOperator: q = np.zeros((self.nobjs,), dtype=self.stype) h = np.zeros((self.nobjs,), dtype=self.stype) f = np.zeros((2*self.nobjs,), dtype=self.stype) - for i in range(0,self.nobjs): + for i in range(0, self.nobjs): obj_n = self.geometry_list[i] - q[i] = -sym.integral(3,2,sym.n_dot(A_inc[3*i:3*(i+1)],where=obj_n),where=obj_n) + q[i] = -sym.integral(3, 2, sym.n_dot(A_inc[3*i:3*(i+1)], where=obj_n), + where=obj_n) h[i] = -divA_inc[i] - f[2*i:2*(i+1)] = xyz_to_tangential(-sym.n_cross(A_inc[3*i:3*(i+1)],where=obj_n),where=obj_n) + f[2*i:2*(i+1)] = sym.xyz_to_tangential( + -sym.n_cross(A_inc[3*i:3*(i+1)], where=obj_n), where=obj_n) # define RHS for `A` integral equation system - return sym.join_fields( f, h/self.k, q ) + return sym.join_fields(f, h/self.k, q) - def subproblem_operator(self, tau_densities, alpha = 1j): + def subproblem_operator(self, tau_densities, alpha=1j): """ Integral Equation operator for obtaining sub problem solution """ @@ -468,13 +499,13 @@ class DPIEOperator: output = np.zeros((self.nobjs,), dtype=self.stype) # produce integral equation system over each disjoint object - for n in range(0,self.nobjs): + for n in range(0, self.nobjs): - # get nth disjoint object + # get nth disjoint object obj_n = self.geometry_list[n] # setup IE for evaluation over the nth disjoint object's surface - output[n] = 0.5*tau0[n] + self.D(tau,obj_n) - alpha*self.S(tau,obj_n) + output[n] = 0.5*tau0[n] + self.D(tau, obj_n) - alpha*self.S(tau, obj_n) # return the resulting system of IE return output @@ -490,13 +521,13 @@ class DPIEOperator: output = np.zeros((self.nobjs,), dtype=self.stype) # produce integral equation system over each disjoint object - for n in 
range(0,self.nobjs): + for n in range(0, self.nobjs): - # get nth disjoint object + # get nth disjoint object obj_n = self.geometry_list[n] # setup IE for evaluation over the nth disjoint object's surface - output[n] = sym.div(self.S(a,target=obj_n,qfl="avg")) + output[n] = sym.div(self.S(a, target=obj_n, qfl="avg")) # return the resulting system of IE return output @@ -510,9 +541,9 @@ class DPIEOperator: output = np.zeros((self.nobjs,), dtype=self.stype) # produce integral equation system over each disjoint object - for n in range(0,self.nobjs): + for n in range(0, self.nobjs): - # get nth disjoint object + # get nth disjoint object obj_n = self.geometry_list[n] # setup IE for evaluation over the nth disjoint object's surface @@ -521,7 +552,6 @@ class DPIEOperator: # return the resulting system of IE return output - def scalar_potential_rep(self, phi_densities, target=None, qfl=None): """ This method is a representation of the scalar potential, phi, @@ -529,10 +559,13 @@ class DPIEOperator: """ # extract the densities needed to solve the system of equations - (sigma0,sigma,V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) # evaluate scalar potential representation - return self.D(sigma,target,qfl=qfl) - (1j*self.k)*self.S(sigma,target,qfl=qfl) + return ( + self.D(sigma, target, qfl=qfl) + - (1j*self.k)*self.S(sigma, target, qfl=qfl) + ) def scalar_potential_constants(self, phi_densities): """ @@ -541,7 +574,7 @@ class DPIEOperator: """ # extract the densities needed to solve the system of equations - (sigma0,sigma,V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) # evaluate scalar potential representation return V @@ -553,10 +586,13 @@ class DPIEOperator: """ # extract the densities needed to solve the system of equations - (sigma0,sigma,V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) # evaluate scalar potential representation - return sym.grad(3,self.D(sigma,target,qfl=qfl)) - (1j*self.k)*sym.grad(3,self.S(sigma,target,qfl=qfl)) + return ( + sym.grad(3, self.D(sigma, target, qfl=qfl)) + - (1j*self.k)*sym.grad(3, self.S(sigma, target, qfl=qfl)) + ) def vector_potential_rep(self, A_densities, target=None, qfl=None): """ @@ -568,8 +604,12 @@ class DPIEOperator: (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) # define the vector potential representation - return (sym.curl(self.S(a,target,qfl=qfl)) - self.k*self.S(self.n_times(rho),target,qfl=qfl)) \ - + 1j*(self.k*self.S(self.n_cross(a),target,qfl=qfl) + sym.grad(3,self.S(rho,target,qfl=qfl))) + return ( + (sym.curl(self.S(a, target, qfl=qfl)) + - self.k*self.S(self.n_times(rho), target, qfl=qfl)) + + 1j*(self.k*self.S(self.n_cross(a), target, qfl=qfl) + + sym.grad(3, self.S(rho, target, qfl=qfl))) + ) def div_vector_potential_rep(self, A_densities, target=None, qfl=None): """ @@ -581,10 +621,11 @@ class DPIEOperator: (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) # define the vector potential representation - return self.k*( self.D(rho,target,qfl=qfl) \ - + 1j*(sym.div(self.S(self.n_cross(a),target,qfl=qfl)) - self.k * self.S(rho,target,qfl=qfl))) + return self.k*(self.D(rho, target, qfl=qfl) + + 1j*(sym.div(self.S(self.n_cross(a), target, qfl=qfl)) + - self.k * self.S(rho, target, qfl=qfl))) - def subproblem_rep(self, tau_densities, target=None, alpha = 1j, qfl=None): + def subproblem_rep(self, tau_densities, 
target=None, alpha=1j, qfl=None): """ This method is a representation of the scalar potential, phi, based on the density `sigma`. @@ -594,13 +635,14 @@ class DPIEOperator: (tau0, tau) = self._extract_tau_densities(tau_densities) # evaluate scalar potential representation - return self.D(tau,target,qfl=qfl) - alpha*self.S(tau,target,qfl=qfl) + return self.D(tau, target, qfl=qfl) - alpha*self.S(tau, target, qfl=qfl) - def scattered_volume_field(self, phi_densities, A_densities, tau_densities, target=None, alpha=1j,qfl=None): + def scattered_volume_field(self, phi_densities, A_densities, tau_densities, + target=None, alpha=1j, qfl=None): """ This will return an object of six entries, the first three of which represent the electric, and the second three of which represent the - magnetic field. + magnetic field. This satisfies the time-domain Maxwell's equations as verified by :func:`sumpy.point_calculus.frequency_domain_maxwell`. @@ -608,20 +650,27 @@ class DPIEOperator: # extract the densities needed (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) - (sigma0,sigma, V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) (tau0, tau) = self._extract_tau_densities(tau_densities) # obtain expressions for scalar and vector potentials - A = self.vector_potential_rep(A_densities, target=target) - phi = self.scalar_potential_rep(phi_densities, target=target) + A = self.vector_potential_rep(A_densities, target=target) + # phi = self.scalar_potential_rep(phi_densities, target=target) # evaluate the potential form for the electric and magnetic fields - E_scat = 1j*self.k*A - sym.grad(3, self.D(sigma,target,qfl=qfl)) + 1j*self.k*sym.grad(3, self.S(sigma,target,qfl=qfl)) - H_scat = sym.grad(3,operand=(self.D(tau,target,qfl=qfl) - alpha*self.S(tau,target,qfl=qfl))) \ + (self.k**2) * self.S(a,target,qfl=qfl) \ - self.k * sym.curl(self.S(self.n_times(rho),target,qfl=qfl)) \ + 1j*self.k*sym.curl(self.S(self.n_cross(a),target,qfl=qfl)) - + E_scat = ( + 1j*self.k*A + - sym.grad(3, self.D(sigma, target, qfl=qfl)) + + 1j*self.k*sym.grad(3, self.S(sigma, target, qfl=qfl)) + ) + H_scat = ( + sym.grad(3, operand=( + self.D(tau, target, qfl=qfl) + - alpha*self.S(tau, target, qfl=qfl))) + + (self.k**2) * self.S(a, target, qfl=qfl) + - self.k * sym.curl(self.S(self.n_times(rho), target, qfl=qfl)) + + 1j*self.k*sym.curl(self.S(self.n_cross(a), target, qfl=qfl)) + ) # join the fields into a vector return sym.join_fields(E_scat, H_scat) @@ -629,21 +678,22 @@ class DPIEOperator: # }}} - # {{{ Decoupled Potential Integral Equation Operator - Based on Journal Paper + class DPIEOperatorEvanescent(DPIEOperator): r""" Decoupled Potential Integral Equation operator with PEC boundary conditions, defaults as scaled DPIE. - See https://onlinelibrary.wiley.com/doi/abs/10.1002/cpa.21585 for journal paper. + See `the journal paper + <https://onlinelibrary.wiley.com/doi/abs/10.1002/cpa.21585>`_ for details. - Uses :math:`E(x,t) = Re \lbrace E(x) \exp(-i \omega t) \rbrace` and - :math:`H(x,t) = Re \lbrace H(x) \exp(-i \omega t) \rbrace` and solves for + Uses :math:`E(x, t) = Re \lbrace E(x) \exp(-i \omega t) \rbrace` and + :math:`H(x, t) = Re \lbrace H(x) \exp(-i \omega t) \rbrace` and solves for the :math:`E(x)`, :math:`H(x)` fields using vector and scalar potentials via - the Lorenz Gauage. The DPIE formulates the problem purely in terms of the - vector and scalar potentials, :math:`\boldsymbol{A}` and :math:`\phi`, - and then backs out :math:`E(x)` and :math:`H(x)` via relationships to + the Lorenz gauge.
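# The docstring above fixes the phasor convention E(x, t) = Re{E(x)
# exp(-i omega t)}. A tiny self-contained numpy check of that convention
# (the frequency and field value here are made up for illustration):

import numpy as np

omega = 2*np.pi*1e9                    # angular frequency [rad/s]
E_phasor = 1.0 + 1.0j                  # one field component at a fixed point x
t = np.linspace(0, 2e-9, 5)
E_time = np.real(E_phasor*np.exp(-1j*omega*t))
# at t = 0 this reduces to Re{E(x)} = 1.0, as the convention requires
assert np.isclose(E_time[0], 1.0)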
The DPIE formulates the problem purely in terms of the + vector and scalar potentials, :math:`\boldsymbol{A}` and :math:`\phi`, + and then backs out :math:`E(x)` and :math:`H(x)` via relationships to the vector and scalar potentials. """ @@ -652,83 +702,102 @@ class DPIEOperatorEvanescent(DPIEOperator): from sumpy.kernel import LaplaceKernel # specify the frequency variable that will be tuned - self.k = k - self.ik = 1j*k - self.stype = type(self.k) + self.k = k + self.ik = 1j*k + self.stype = type(self.k) - # specify the 3-D Helmholtz kernel - self.kernel = HelmholtzKernel(3) - self.kernel_ik = HelmholtzKernel(3, allow_evanescent=True) + # specify the 3-D Helmholtz kernel + self.kernel = HelmholtzKernel(3) + self.kernel_ik = HelmholtzKernel(3, allow_evanescent=True) self.kernel_laplace = LaplaceKernel(3) # specify a list of strings representing geometry objects - self.geometry_list = geometry_list - self.nobjs = len(geometry_list) + self.geometry_list = geometry_list + self.nobjs = len(geometry_list) def _eval_all_objects(self, density_vec, int_op, qfl="avg", k=None, kernel=None): - """ - This private method is so some input integral operator and input density can be used to - evaluate the set of locations defined by the geometry list + """This private method is so some input integral operator and input + density can be used to evaluate the set of locations defined by the + geometry list """ output = np.zeros(density_vec.shape, dtype=self.stype) (ndim, nobj) = density_vec.shape - for i in range(0,nobj): - output[:,i] = int_op(density_vec=density_vec, target=self.geometry_list[i], qfl=qfl, k=k, kernel=kernel) + for i in range(0, nobj): + output[:, i] = int_op(density_vec=density_vec, + target=self.geometry_list[i], qfl=qfl, k=k, kernel=kernel) return output def _L(self, a, rho, where): # define some useful common sub expressions - Sn_times_rho = cse(self.S(self.n_times(rho),where),"Sn_times_rho_"+where) - Drho = cse(self.D(rho,where),"Drho_"+where) + Sn_times_rho = cse(self.S(self.n_times(rho), where), "Sn_times_rho_"+where) + Drho = cse(self.D(rho, where), "Drho_"+where) return sym.join_fields( - sym.n_cross(sym.curl(self.S(a,where)) - self.k * Sn_times_rho,where=where), + sym.n_cross( + sym.curl(self.S(a, where)) - self.k * Sn_times_rho, + where=where), Drho) def _R(self, a, rho, where): # define some useful common sub expressions - Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, kernel=self.kernel_ik), "Sa_ik_nest") - Srho_ik_nest = cse(self._eval_all_objects(rho,self.S, k=self.ik, kernel=self.kernel_ik),"Srho_ik_nest") - Srho = cse(self.S(Srho_ik_nest,where),"Srho_"+where) - Sn_cross_a = cse(self.S(self.n_cross(Sa_ik_nest),where),"Sn_cross_a_"+where) + Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, + kernel=self.kernel_ik), "Sa_ik_nest") + Srho_ik_nest = cse(self._eval_all_objects(rho, self.S, k=self.ik, + kernel=self.kernel_ik), "Srho_ik_nest") + Srho = cse(self.S(Srho_ik_nest, where), "Srho_"+where) + Sn_cross_a = cse(self.S(self.n_cross(Sa_ik_nest), where), + "Sn_cross_a_"+where) return self.k*sym.join_fields( - sym.n_cross( self.k * Sn_cross_a + sym.grad(ambient_dim=3,operand=self.S(Srho_ik_nest,where)),where=where), - sym.div(self.S(self.n_cross(Sa_ik_nest),where)) - self.k * Srho + sym.n_cross(self.k * Sn_cross_a + sym.grad(ambient_dim=3, + operand=self.S(Srho_ik_nest, where)), where=where), + sym.div(self.S(self.n_cross(Sa_ik_nest), where)) - self.k * Srho ) def _scaledDPIEs_integral(self, sigma, sigma_n, where): - qfl="avg" + qfl = "avg" return 
sym.integral( ambient_dim=3, dim=2, - operand=( (self.Dp(sigma,target=where,qfl=qfl) - self.Dp(sigma,target=where,qfl=qfl,kernel=self.kernel_laplace,use_laplace=True))/self.k + 1j*0.5*sigma_n - 1j*self.Sp(sigma,target=where,qfl=qfl)), + operand=( + (self.Dp(sigma, target=where, qfl=qfl) + - self.Dp(sigma, target=where, qfl=qfl, + kernel=self.kernel_laplace, use_laplace=True)) + / self.k + + 1j*0.5*sigma_n + - 1j*self.Sp(sigma, target=where, qfl=qfl)), where=where) def _scaledDPIEv_integral(self, **kwargs): - qfl="avg" + qfl = "avg" # grab densities and domain to integrate over a = kwargs['a'] rho = kwargs['rho'] - rho_n = kwargs['rho_n'] + # rho_n = kwargs['rho_n'] where = kwargs['where'] # define some useful common sub expressions - Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, kernel=self.kernel_ik), "Sa_ik_nest") - Srho_ik = cse(self.S(rho,where,k=self.ik,kernel=self.kernel_ik),"Srho_ik"+where) - Srho_ik_nest = cse(self._eval_all_objects(rho,self.S, k=self.ik, kernel=self.kernel_ik),"Srho_ik_nest") - Sn_cross_a = cse(self.S(self.n_cross(Sa_ik_nest),where),"Sn_cross_a_nest_"+where) - Sn_times_rho = cse(self.S(self.n_times(rho),where),"Sn_times_rho_"+where) + Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, + kernel=self.kernel_ik), "Sa_ik_nest") + Srho_ik = cse(self.S(rho, where, k=self.ik, kernel=self.kernel_ik), + "Srho_ik"+where) + Srho_ik_nest = cse(self._eval_all_objects(rho, self.S, k=self.ik, + kernel=self.kernel_ik), "Srho_ik_nest") + Sn_cross_a = cse(self.S(self.n_cross(Sa_ik_nest), where), + "Sn_cross_a_nest_"+where) + Sn_times_rho = cse(self.S(self.n_times(rho), where), "Sn_times_rho_"+where) return sym.integral( ambient_dim=3, dim=2, operand=( - -self.k*sym.n_dot(Sn_times_rho,where=where) \ - + 1j*self.k*(self.k*sym.n_dot(Sn_cross_a,where=where) - 0.5*Srho_ik + self.Sp(Srho_ik_nest,target=where,qfl=qfl)) + -self.k*sym.n_dot(Sn_times_rho, where=where) + + 1j*self.k*( + self.k*sym.n_dot(Sn_cross_a, where=where) + - 0.5*Srho_ik + self.Sp(Srho_ik_nest, target=where, qfl=qfl)) ), where=where) @@ -742,12 +811,18 @@ class DPIEOperatorEvanescent(DPIEOperator): (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) # define some useful quantities - Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, kernel=self.kernel_ik), "Sa_ik_nest") - Srho_ik_nest = cse(self._eval_all_objects(rho,self.S, k=self.ik, kernel=self.kernel_ik),"Srho_ik_nest") + Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, + kernel=self.kernel_ik), "Sa_ik_nest") + Srho_ik_nest = cse(self._eval_all_objects(rho, self.S, k=self.ik, + kernel=self.kernel_ik), "Srho_ik_nest") # define the vector potential representation - return (sym.curl(self.S(a,target,qfl=qfl)) - self.k*self.S(self.n_times(rho),target,qfl=qfl)) \ - + 1j*self.k*(self.k*self.S(self.n_cross(Sa_ik_nest),target,qfl=qfl) + sym.grad(3,self.S(Srho_ik_nest,target,qfl=qfl))) + return ( + (sym.curl(self.S(a, target, qfl=qfl)) + - self.k*self.S(self.n_times(rho), target, qfl=qfl)) + + 1j*self.k*(self.k*self.S(self.n_cross(Sa_ik_nest), target, qfl=qfl) + + sym.grad(3, self.S(Srho_ik_nest, target, qfl=qfl))) + ) def div_vector_potential_rep(self, A_densities, target=None, qfl=None): """ @@ -759,18 +834,22 @@ class DPIEOperatorEvanescent(DPIEOperator): (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) # define some useful quantities - Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, kernel=self.kernel_ik), "Sa_ik_nest") - Srho_ik_nest = cse(self._eval_all_objects(rho,self.S, 
k=self.ik, kernel=self.kernel_ik),"Srho_ik_nest") + Sa_ik_nest = cse(self._eval_all_objects(a, self.S, k=self.ik, + kernel=self.kernel_ik), "Sa_ik_nest") + Srho_ik_nest = cse(self._eval_all_objects(rho, self.S, k=self.ik, + kernel=self.kernel_ik), "Srho_ik_nest") # define the vector potential representation - return self.k*( self.D(rho,target,qfl=qfl) \ - + 1j*self.k*(sym.div(self.S(self.n_cross(Sa_ik_nest),target,qfl=qfl)) - self.k * self.S(Srho_ik_nest,target,qfl=qfl))) + return self.k*(self.D(rho, target, qfl=qfl) + + 1j*self.k*(sym.div(self.S(self.n_cross(Sa_ik_nest), target, + qfl=qfl)) - self.k * self.S(Srho_ik_nest, target, qfl=qfl))) - def scattered_volume_field(self, phi_densities, A_densities, tau_densities, target=None, alpha=1j,qfl=None): + def scattered_volume_field(self, phi_densities, A_densities, tau_densities, + target=None, alpha=1j, qfl=None): """ This will return an object of six entries, the first three of which represent the electric, and the second three of which represent the - magnetic field. + magnetic field. This satisfies the time-domain Maxwell's equations as verified by :func:`sumpy.point_calculus.frequency_domain_maxwell`. @@ -778,23 +857,33 @@ class DPIEOperatorEvanescent(DPIEOperator): # extract the densities needed (a0, a, rho0, rho, v) = self._extract_a_densities(A_densities) - (sigma0,sigma, V) = self._extract_phi_densities(phi_densities) + (sigma0, sigma, V) = self._extract_phi_densities(phi_densities) (tau0, tau) = self._extract_tau_densities(tau_densities) # obtain expressions for scalar and vector potentials - Sa_ik_nest = self._eval_all_objects(a, self.S, k=self.ik, kernel=self.kernel_ik) - A = self.vector_potential_rep(A_densities, target=target) - phi = self.scalar_potential_rep(phi_densities, target=target) + Sa_ik_nest = self._eval_all_objects(a, self.S, k=self.ik, + kernel=self.kernel_ik) + A = self.vector_potential_rep(A_densities, target=target) + # phi = self.scalar_potential_rep(phi_densities, target=target) # evaluate the potential form for the electric and magnetic fields - E_scat = 1j*self.k*A - sym.grad(3, self.D(sigma,target,qfl=qfl)) + 1j*self.k*sym.grad(3, self.S(sigma,target,qfl=qfl)) - H_scat = sym.grad(3,operand=(self.D(tau,target,qfl=qfl) - alpha*self.S(tau,target,qfl=qfl))) \ - + (self.k**2) * self.S(a,target,qfl=qfl) \ - - self.k * sym.curl(self.S(self.n_times(rho),target,qfl=qfl)) \ - + 1j*(self.k**2)*sym.curl(self.S(self.n_cross(Sa_ik_nest),target,qfl=qfl)) - + E_scat = ( + 1j*self.k*A + - sym.grad(3, self.D(sigma, target, qfl=qfl)) + + 1j*self.k*sym.grad(3, self.S(sigma, target, qfl=qfl))) + H_scat = ( + sym.grad(3, operand=( + self.D(tau, target, qfl=qfl) + - alpha*self.S(tau, target, qfl=qfl))) + + (self.k**2) * self.S(a, target, qfl=qfl) + - self.k * sym.curl(self.S(self.n_times(rho), target, qfl=qfl)) + + 1j*(self.k**2)*sym.curl(self.S(self.n_cross(Sa_ik_nest), + target, qfl=qfl)) + ) # join the fields into a vector return sym.join_fields(E_scat, H_scat) # }}} + +# vim: foldmethod=marker diff --git a/pytential/symbolic/pde/maxwell/waveguide.py b/pytential/symbolic/pde/maxwell/waveguide.py index 6a303570d853cd0c9203fa66c930b5000efd5bfd..4049cf17eb5555b048aaa5a9490d4c8c3e85b6fc 100644 --- a/pytential/symbolic/pde/maxwell/waveguide.py +++ b/pytential/symbolic/pde/maxwell/waveguide.py @@ -561,8 +561,7 @@ class Dielectric2DBoundaryOperatorBase(L2WeightedPDEOperator): for term in bc) is_necessary = ( (self.ez_enabled and any_significant_e) - or - (self.hz_enabled and any_significant_h)) + or (self.hz_enabled and 
any_significant_h)) # Only keep tangential modes for TEM. Otherwise, # no jump in H already implies jump condition on @@ -588,8 +587,7 @@ class Dielectric2DBoundaryOperatorBase(L2WeightedPDEOperator): def is_field_present(self, field_kind): return ( (field_kind == self.field_kind_e and self.ez_enabled) - or - (field_kind == self.field_kind_h and self.hz_enabled)) + or (field_kind == self.field_kind_h and self.hz_enabled)) def make_unknown(self, name): num_densities = ( diff --git a/pytential/symbolic/primitives.py b/pytential/symbolic/primitives.py index 2556fb0351d8179d4b6fd70ad87c529a5368ff73..f59d0342dde344b2cdfc2edceb9ad0bd3c5514f4 100644 --- a/pytential/symbolic/primitives.py +++ b/pytential/symbolic/primitives.py @@ -78,10 +78,10 @@ visible only once evaluated.) Placeholders ^^^^^^^^^^^^ -.. autoclass:: Variable -.. autoclass:: make_sym_vector -.. autoclass:: make_sym_mv -.. autoclass:: make_sym_surface_mv +.. autoclass:: var +.. autofunction:: make_sym_vector +.. autofunction:: make_sym_mv +.. autofunction:: make_sym_surface_mv Functions ^^^^^^^^^ @@ -124,6 +124,10 @@ Discretization properties .. autofunction:: area_element .. autofunction:: sqrt_jac_q_weight .. autofunction:: normal +.. autofunction:: mean_curvature +.. autofunction:: first_fundamental_form +.. autofunction:: second_fundamental_form +.. autofunction:: shape_operator Elementary numerics ^^^^^^^^^^^^^^^^^^^ @@ -131,6 +135,8 @@ Elementary numerics .. autoclass:: NumReferenceDerivative .. autoclass:: NodeSum .. autoclass:: NodeMax +.. autoclass:: ElementwiseSum +.. autoclass:: ElementwiseMax .. autofunction:: integral .. autoclass:: Ones .. autofunction:: ones_vec @@ -138,14 +144,18 @@ Elementary numerics .. autofunction:: mean .. autoclass:: IterativeInverse -Calculus (based on Geometric Algebra) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Geometric Calculus (based on Geometric/Clifford Algebra) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: Derivative + +Conventional Calculus +^^^^^^^^^^^^^^^^^^^^^ + .. autofunction:: dd_axis -.. autofunction:: d_dx -.. autofunction:: d_dy -.. autofunction:: d_dz +.. function:: d_dx +.. function:: d_dy +.. function:: d_dz .. autofunction:: grad_mv .. autofunction:: grad .. autofunction:: laplace @@ -184,6 +194,8 @@ Pretty-printing expressions """ +# {{{ 'where' specifiers + class DEFAULT_SOURCE: # noqa pass @@ -192,6 +204,58 @@ class DEFAULT_TARGET: # noqa pass +class _QBXSource(object): + """A symbolic 'where' specifier for a density of a + :attr:`pytential.qbx.QBXLayerPotentialSource` + layer potential source identified by :attr:`where`. + + .. attribute:: where + + An identifier of a layer potential source, as used in + :func:`pytential.bind`. + + .. note:: + + This is not documented functionality and is intended only for + internal use. + """ + + def __init__(self, where): + self.where = where + + def __hash__(self): + return hash((type(self), self.where)) + + def __eq__(self, other): + return type(self) is type(other) and self.where == other.where + + def __ne__(self, other): + return not self.__eq__(other) + + +class QBXSourceStage1(_QBXSource): + """An explicit symbolic 'where' specifier for the + :attr:`pytential.qbx.QBXLayerPotentialSource.density_discr` + of the layer potential source identified by :attr:`where`. + """ + + +class QBXSourceStage2(_QBXSource): + """A symbolic 'where' specifier for the + :attr:`pytential.qbx.QBXLayerPotentialSource.stage2_density_discr` + of the layer potential source identified by :attr:`where`.
+ """ + + +class QBXSourceQuadStage2(_QBXSource): + """A symbolic 'where' specifier for the + :attr:`pytential.qbx.QBXLayerPotentialSource.quad_stage2_density_discr` + of the layer potential source identified by :attr:`where`. + """ + +# }}} + + class cse_scope(cse_scope_base): # noqa DISCRETIZATION = "pytential_discretization" @@ -226,8 +290,7 @@ def make_sym_surface_mv(name, ambient_dim, dim, where=None): return sum( var("%s%d" % (name, i)) - * - cse(MultiVector(vec), "tangent%d" % i, cse_scope.DISCRETIZATION) + * cse(MultiVector(vec), "tangent%d" % i, cse_scope.DISCRETIZATION) for i, vec in enumerate(par_grad.T)) @@ -288,6 +351,8 @@ class DiscretizationProperty(Expression): further arguments. """ + init_arg_names = ("where",) + def __init__(self, where=None): """ :arg where: |where-blurb| @@ -308,6 +373,9 @@ class QWeight(DiscretizationProperty): class NodeCoordinateComponent(DiscretizationProperty): + + init_arg_names = ("ambient_axis", "where") + def __init__(self, ambient_axis, where=None): """ :arg where: |where-blurb| @@ -338,18 +406,48 @@ class NumReferenceDerivative(DiscretizationProperty): reference coordinates. """ + init_arg_names = ("ref_axes", "operand", "where") + + def __new__(cls, ref_axes=None, operand=None, where=None): + # If the constructor is handed a multivector object, return an + # object array of the operator applied to each of the + # coefficients in the multivector. + + if isinstance(operand, np.ndarray): + def make_op(operand_i): + return cls(ref_axes, operand_i, where=where) + + return componentwise(make_op, operand) + else: + return DiscretizationProperty.__new__(cls) + def __init__(self, ref_axes, operand, where=None): """ - :arg ref_axes: a :class:`frozenset` of indices of - reference coordinates along which derivatives - will be taken. + :arg ref_axes: a :class:`tuple` of tuples indicating indices of + coordinate axes of the reference element to the number of derivatives + which will be taken. For example, the value ``((0, 2), (1, 1))`` + indicates that Each axis must occur at most once. The tuple must be + sorted by the axis index. + + May also be a singile integer *i*, which is viewed as equivalent + to ``((i, 1),)``. :arg where: |where-blurb| """ - if not isinstance(ref_axes, frozenset): - raise ValueError("ref_axes must be a frozenset") + if isinstance(ref_axes, int): + ref_axes = ((ref_axes, 1),) + + if not isinstance(ref_axes, tuple): + raise ValueError("ref_axes must be a tuple") + + if tuple(sorted(ref_axes)) != ref_axes: + raise ValueError("ref_axes must be sorted") + + if len(dict(ref_axes)) != len(ref_axes): + raise ValueError("ref_axes must not contain an axis more than once") self.ref_axes = ref_axes + self.operand = operand DiscretizationProperty.__init__(self, where) @@ -368,10 +466,7 @@ def reference_jacobian(func, output_dim, dim, where=None): for i in range(output_dim): func_component = func[i] for j in range(dim): - jac[i, j] = NumReferenceDerivative( - frozenset([j]), - func_component, - where) + jac[i, j] = NumReferenceDerivative(j, func_component, where) return jac @@ -381,9 +476,11 @@ def parametrization_derivative_matrix(ambient_dim, dim, where=None): reference-to-global parametrization. 
""" - return reference_jacobian( - [NodeCoordinateComponent(i, where) for i in range(ambient_dim)], - ambient_dim, dim, where) + return cse( + reference_jacobian( + [NodeCoordinateComponent(i, where) for i in range(ambient_dim)], + ambient_dim, dim, where=where), + "pd_matrix", cse_scope.DISCRETIZATION) def parametrization_derivative(ambient_dim, dim, where=None): @@ -450,9 +547,7 @@ def mean_curvature(ambient_dim, dim=None, where=None): raise NotImplementedError( "only know how to calculate curvature for a curve in 2D") - xp, yp = cse( - parametrization_derivative_matrix(ambient_dim, dim, where), - "pd_matrix", cse_scope.DISCRETIZATION) + xp, yp = parametrization_derivative_matrix(ambient_dim, dim, where) xpp, ypp = cse( reference_jacobian([xp[0], yp[0]], ambient_dim, dim, where), @@ -460,24 +555,235 @@ def mean_curvature(ambient_dim, dim=None, where=None): return (xp[0]*ypp[0] - yp[0]*xpp[0]) / (xp[0]**2 + yp[0]**2)**(3/2) -# FIXME: make sense of this in the context of GA -# def xyz_to_local_matrix(dim, where=None): -# """First two rows are tangents.""" -# result = np.zeros((dim, dim), dtype=np.object) -# -# for i in range(dim-1): -# result[i] = make_tangent(i, dim, where) -# result[-1] = make_normal(dim, where) -# -# return result + +def first_fundamental_form(ambient_dim, dim=None, where=None): + if dim is None: + dim = ambient_dim - 1 + + if ambient_dim != 3 and dim != 2: + raise NotImplementedError("only available for surfaces in 3D") + + pd_mat = parametrization_derivative_matrix(ambient_dim, dim, where) + + return cse( + np.dot(pd_mat.T, pd_mat), + "fundform1") + + +def second_fundamental_form(ambient_dim, dim=None, where=None): + """Compute the second fundamental form of a surface. This is in reference + to the reference-to-global mapping in use for each element. + + .. note:: + + Some references assume that the second fundamental form is computed + with respect to an orthonormal basis, which this is not. + """ + if dim is None: + dim = ambient_dim - 1 + + if ambient_dim != 3 and dim != 2: + raise NotImplementedError("only available for surfaces in 3D") + + r = nodes(ambient_dim, where=where).as_vector() + + # https://en.wikipedia.org/w/index.php?title=Second_fundamental_form&oldid=821047433#Classical_notation + + from functools import partial + d = partial(NumReferenceDerivative, where=where) + ruu = d(((0, 2),), r) + ruv = d(((0, 1), (1, 1)), r) + rvv = d(((1, 2),), r) + + nrml = normal(ambient_dim, dim, where).as_vector() + + ff2_l = cse(np.dot(ruu, nrml), "fundform2_L") + ff2_m = cse(np.dot(ruv, nrml), "fundform2_M") + ff2_n = cse(np.dot(rvv, nrml), "fundform2_N") + + result = np.zeros((2, 2), dtype=object) + result[0, 0] = ff2_l + result[0, 1] = result[1, 0] = ff2_m + result[1, 1] = ff2_n + + return result + + +def shape_operator(ambient_dim, dim=None, where=None): + if dim is None: + dim = ambient_dim - 1 + + if ambient_dim != 3 and dim != 2: + raise NotImplementedError("only available for surfaces in 3D") + + # https://en.wikipedia.org/w/index.php?title=Differential_geometry_of_surfaces&oldid=833587563 + (E, F), (F, G) = first_fundamental_form(ambient_dim, dim, where) + (e, f), (f, g) = second_fundamental_form(ambient_dim, dim, where) + + result = np.zeros((2, 2), dtype=object) + result[0, 0] = e*G-f*F + result[0, 1] = f*G-g*F + result[1, 0] = f*E-e*F + result[1, 1] = g*E-f*F + + return cse( + 1/(E*G-F*F)*result, + "shape_operator") + + +def _panel_size(ambient_dim, dim=None, where=None): + # A broken quasi-1D approximation of 1D element size. Do not use. 
+ + if dim is None: + dim = ambient_dim - 1 + + return ElementwiseSum( + area_element(ambient_dim=ambient_dim, dim=dim) + * QWeight())**(1/dim) + + +def _small_mat_inverse(mat): + m, n = mat.shape + if m != n: + raise ValueError("inverses only make sense for square matrices") + + if m == 1: + return make_obj_array([1/mat[0, 0]]) + elif m == 2: + (a, b), (c, d) = mat + return 1/(a*d-b*c) * make_obj_array([ + [d, -b], + [-c, a], + ]) + else: + raise NotImplementedError( + "inverse formula for %dx%d matrices" % (m, n)) + + +def _small_mat_eigenvalues(mat): + m, n = mat.shape + if m != n: + raise ValueError("eigenvalues only make sense for square matrices") + + if m == 1: + return make_obj_array([mat[0, 0]]) + elif m == 2: + (a, b), (c, d) = mat + return make_obj_array([ + -(sqrt(d**2-2*a*d+4*b*c+a**2)-d-a)/2, + (sqrt(d**2-2*a*d+4*b*c+a**2)+d+a)/2 + ]) + else: + raise NotImplementedError( + "eigenvalue formula for %dx%d matrices" % (m, n)) + + +def _equilateral_parametrization_derivative_matrix(ambient_dim, dim=None, + where=None): + if dim is None: + dim = ambient_dim - 1 + + pder_mat = parametrization_derivative_matrix(ambient_dim, dim, where) + + # The above procedure works well only when the 'reference' end of the + # mapping is in equilateral coordinates. + from modepy.tools import EQUILATERAL_TO_UNIT_MAP + equi_to_unit = EQUILATERAL_TO_UNIT_MAP[dim].a + + # This is the Jacobian of the (equilateral reference element) -> (global) map. + return cse( + np.dot(pder_mat, equi_to_unit), + "equilateral_pder_mat") + + +def _simplex_mapping_max_stretch_factor(ambient_dim, dim=None, where=None, + with_elementwise_max=True): + """Return the largest factor by which the reference-to-global + mapping stretches the bi-unit (i.e. :math:`[-1,1]`) reference + element along any axis. + + Returns a DOF vector that is elementwise constant. + """ + + if dim is None: + dim = ambient_dim - 1 + + # The 'technique' here is ad-hoc, but I'm fairly confident it's better than + # what we had. The idea is that singular values of the mapping Jacobian + # yield "stretch factors" of the mapping. Why? Because it maps a right + # singular vector $`v_1`$ (of unit length) to $`\sigma_1 u_1`$, where + # $`u_1`$ is the corresponding left singular vector (also of unit length). + # And so the biggest one tells us about the direction with the 'biggest' + # stretching, where 'stretching' (*2 to remove bi-unit reference element) + # reflects available quadrature resolution in that direction. + + equi_pder_mat = _equilateral_parametrization_derivative_matrix( + ambient_dim, dim, where) + + # Compute eigenvalues of J^T J to compute SVD. + equi_pder_mat_jtj = cse( + np.dot(equi_pder_mat.T, equi_pder_mat), + "pd_mat_jtj") + + stretch_factors = [ + cse(sqrt(s), "mapping_singval_%d" % i) + for i, s in enumerate( + _small_mat_eigenvalues( + # Multiply by 4 to compensate for equilateral reference + # elements of side length 2. (J^T J contains two factors of + # two.) + 4 * equi_pder_mat_jtj))] + + from pymbolic.primitives import Max + result = Max(tuple(stretch_factors)) + + if with_elementwise_max: + result = ElementwiseMax(result, where=where) + + return cse(result, "mapping_max_stretch", cse_scope.DISCRETIZATION) + + +def _max_curvature(ambient_dim, dim=None, where=None): + # An attempt at a 'max curvature' criterion.
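# A quick numerical check of the closed-form 2x2 eigenvalues used in
# _small_mat_eigenvalues above, evaluated against numpy.linalg.eigvals on a
# symmetric matrix (the J^T J use case, where the discriminant is
# guaranteed non-negative):

import numpy as np

a, b, c, d = 3.0, 1.5, 1.5, 2.0     # arbitrary symmetric 2x2 entries
disc = np.sqrt(d**2 - 2*a*d + 4*b*c + a**2)
lam = np.array([-(disc - d - a)/2, (disc + d + a)/2])
ref = np.linalg.eigvals(np.array([[a, b], [c, d]]))
assert np.allclose(np.sort(lam), np.sort(ref))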
+ + if dim is None: + dim = ambient_dim - 1 + + if ambient_dim == 2: + return abs(mean_curvature(ambient_dim, dim, where=where)) + elif ambient_dim == 3: + shape_op = shape_operator(ambient_dim, dim, where=where) + + abs_principal_curvatures = [ + abs(x) for x in _small_mat_eigenvalues(shape_op)] + from pymbolic.primitives import Max + return cse(Max(tuple(abs_principal_curvatures))) + else: + raise NotImplementedError("curvature criterion not implemented in %d " + "dimensions" % ambient_dim) + + +def _scaled_max_curvature(ambient_dim, dim=None, where=None): + """An attempt at a unit-less, scale-invariant quantity that characterizes + 'how much curviness there is on an element'. Values seem to hover around 1 + on typical meshes. Empirical evidence suggests that elements exceeding + a threshold of about 0.8-1 will have high QBX truncation error. + """ + + return _max_curvature(ambient_dim, dim, where=where) * \ + _simplex_mapping_max_stretch_factor(ambient_dim, dim, where=where, + with_elementwise_max=False) # }}} # {{{ operators -class NodalOperation(Expression): - def __new__(cls, operand): +class SingleScalarOperandExpression(Expression): + + init_arg_names = ("operand",) + + def __new__(cls, operand=None): # If the constructor is handed a multivector object, return an # object array of the operator applied to each of the # coefficients in the multivector. @@ -497,13 +803,13 @@ return (self.operand,) -class NodeSum(NodalOperation): +class NodeSum(SingleScalarOperandExpression): """Implements a global sum over all discretization nodes.""" mapper_method = "map_node_sum" -class NodeMax(NodalOperation): +class NodeMax(SingleScalarOperandExpression): """Implements a global maximum over all discretization nodes.""" mapper_method = "map_node_max" @@ -515,9 +821,10 @@ def integral(ambient_dim, dim, operand, where=None): `ambient_dim` is the number of dimensions used to represent space while `dim` is the dimensionality of the surface being integrated over. - Example| - We wish to integrate over the 2-D surface of a sphere that resides in - in 3-dimensions, so `ambient_dim` = 3 and `dim` = 2. + .. note:: + + If, for example, we wish to integrate over the 2-D surface of a sphere + that resides in 3 dimensions, then *ambient_dim* = 3 and *dim* = 2. """ return NodeSum( @@ -526,11 +833,62 @@ * operand) +class SingleScalarOperandExpressionWithWhere(Expression): + + init_arg_names = ("operand", "where") + + def __new__(cls, operand=None, where=None): + # If the constructor is handed a multivector object, return an + # object array of the operator applied to each of the + # coefficients in the multivector. + + if isinstance(operand, (np.ndarray, MultiVector)): + def make_op(operand_i): + return cls(operand_i, where) + + return componentwise(make_op, operand) + else: + return Expression.__new__(cls) + + def __init__(self, operand, where=None): + self.operand = operand + self.where = where + + def __getinitargs__(self): + return (self.operand, self.where) + + +class ElementwiseSum(SingleScalarOperandExpressionWithWhere): + """Returns a vector of DOFs with all entries on each element set + to the sum of DOFs on that element. + """ + + mapper_method = "map_elementwise_sum" + + +class ElementwiseMin(SingleScalarOperandExpressionWithWhere): + """Returns a vector of DOFs with all entries on each element set + to the minimum of DOFs on that element.
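# A numpy model of the elementwise reductions introduced in this hunk:
# every DOF within an element is replaced by a per-element reduction, so
# the result is elementwise constant. A fixed number of DOFs per element
# is assumed here purely for the sake of a compact sketch:

import numpy as np

dofs = np.array([1.0, 4.0, 2.0,   # element 0
                 0.5, 3.0, 6.0])  # element 1
dofs_per_el = 3
by_el = dofs.reshape(-1, dofs_per_el)
elwise_sum = np.repeat(by_el.sum(axis=1), dofs_per_el)  # [7, 7, 7, 9.5, 9.5, 9.5]
elwise_max = np.repeat(by_el.max(axis=1), dofs_per_el)  # [4, 4, 4, 6, 6, 6]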
+ """ + + mapper_method = "map_elementwise_min" + + +class ElementwiseMax(SingleScalarOperandExpressionWithWhere): + """Returns a vector of DOFs with all entries on each element set + to the maximum of DOFs on that element. + """ + + mapper_method = "map_elementwise_max" + + class Ones(Expression): """A DOF-vector that is constant *one* on the whole discretization. """ + init_arg_names = ("where",) + def __init__(self, where=None): self.where = where @@ -554,11 +912,13 @@ def area(ambient_dim, dim, where=None): def mean(ambient_dim, dim, operand, where=None): return ( integral(ambient_dim, dim, operand, where) - / - area(ambient_dim, dim, where)) + / area(ambient_dim, dim, where)) class IterativeInverse(Expression): + + init_arg_names = ("expression", "rhs", "variable_name", "extra_vars", "where") + def __init__(self, expression, rhs, variable_name, extra_vars={}, where=None): self.expression = expression @@ -598,6 +958,13 @@ def dd_axis(axis, ambient_dim, operand): """Return the derivative along (XYZ) axis *axis* (in *ambient_dim*-dimensional space) of *operand*. """ + from pytools.obj_array import is_obj_array, with_object_array_or_scalar + if is_obj_array(operand): + def dd_axis_comp(operand_i): + return dd_axis(axis, ambient_dim, operand_i) + + return with_object_array_or_scalar(dd_axis_comp, operand) + d = Derivative() unit_vector = np.zeros(ambient_dim) @@ -664,7 +1031,10 @@ class IntG(Expression): where :math:`\sigma` is *density*. """ - def __new__(cls, kernel, density, *args, **kwargs): + init_arg_names = ("kernel", "density", "qbx_forced_limit", "source", "target", + "kernel_arguments") + + def __new__(cls, kernel=None, density=None, *args, **kwargs): # If the constructor is handed a multivector object, return an # object array of the operator applied to each of the # coefficients in the multivector. @@ -706,8 +1076,8 @@ class IntG(Expression): :arg kernel_arguments: A dictionary mapping named :class:`sumpy.kernel.Kernel` arguments (see :meth:`sumpy.kernel.Kernel.get_args` - and :meth:`sumpy.kernel.Kernel.get_source_args` - to expressions that determine them) + and :meth:`sumpy.kernel.Kernel.get_source_args`) + to expressions that determine them :arg source: The symbolic name of the source discretization. 
This name is bound to a concrete :class:`pytential.source.LayerPotentialSourceBase` @@ -745,8 +1115,7 @@ class IntG(Expression): karg.loopy_arg.name for karg in ( kernel.get_args() - + - kernel.get_source_args())) + + kernel.get_source_args())) kernel_arguments = kernel_arguments.copy() if kwargs: @@ -796,6 +1165,11 @@ class IntG(Expression): self.source, self.target, hashable_kernel_args(self.kernel_arguments)) + def __setstate__(self, state): + # Overwrite pymbolic.Expression.__setstate__ + assert len(self.init_arg_names) == len(state), type(self) + self.__init__(*state) + mapper_method = intern("map_int_g") @@ -812,7 +1186,7 @@ def _insert_source_derivative_into_kernel(kernel): kernel, dir_vec_name=_DIR_VEC_NAME) else: return kernel.replace_inner_kernel( - _insert_source_derivative_into_kernel(kernel.kernel)) + _insert_source_derivative_into_kernel(kernel.inner_kernel)) def _get_dir_vec(dsource, ambient_dim): @@ -937,14 +1311,15 @@ def normal_derivative(ambient_dim, operand, dim=None, where=None): def make_op(operand_i): d = Derivative() return d.resolve( - (normal(ambient_dim, dim, where).scalar_product(d.dnabla(ambient_dim))) + (normal(ambient_dim, dim, where) + .scalar_product(d.dnabla(ambient_dim))) * d(operand_i)) return componentwise(make_op, operand) else: d = Derivative() return d.resolve( - (normal(ambient_dim, dim, where).scalar_product(d.dnabla(ambient_dim))) + (normal(ambient_dim, dim, where).scalar_product(d.dnabla(ambient_dim))) * d(operand)) @@ -1039,11 +1414,14 @@ def Dp(kernel, *args, **kwargs): # noqa # {{{ conventional vector calculus def tangential_onb(ambient_dim, dim=None, where=None): + """Return a matrix of shape ``(ambient_dim, dim)`` with orthogonal columns + spanning the tangential space of the surface of *where*. + """ + if dim is None: dim = ambient_dim - 1 - pd_mat = cse(parametrization_derivative_matrix(ambient_dim, dim, where), - "pd_matrix", cse_scope.DISCRETIZATION) + pd_mat = parametrization_derivative_matrix(ambient_dim, dim, where) # {{{ Gram-Schmidt @@ -1108,7 +1486,10 @@ def n_cross(vec, where=None): def div(vec): ambient_dim = len(vec) - return sum(dd_axis(iaxis, ambient_dim, vec[iaxis]) for iaxis in range(ambient_dim)) + return sum( + dd_axis(iaxis, ambient_dim, vec[iaxis]) + for iaxis in range(ambient_dim)) + def curl(vec): from pytools import levi_civita diff --git a/pytential/symbolic/stokes.py b/pytential/symbolic/stokes.py index adfc23d5eaa5e50144b242e23ada9d6b8e23024d..7e47bdb222a106faaeb97d36b525c056286156bf 100644 --- a/pytential/symbolic/stokes.py +++ b/pytential/symbolic/stokes.py @@ -476,8 +476,8 @@ class StressletWrapper(object): for j in range(self.dim): sym_expr[comp] = sym_expr[comp] + ( dir_vec_sym[j] * mu_sym * ( - sym_grad_matrix[comp][j] + - sym_grad_matrix[j][comp]) + sym_grad_matrix[comp][j] + + sym_grad_matrix[j][comp]) ) return sym_expr diff --git a/pytential/unregularized.py b/pytential/unregularized.py index 18e8b65c3efec2e25bc81c6b87a457f238963816..e7fe1b3e73bfe17e14ab9aa6d57a86f6c9c92375 100644 --- a/pytential/unregularized.py +++ b/pytential/unregularized.py @@ -31,6 +31,7 @@ import numpy as np import loopy as lp from boxtree.tools import DeviceDataRecord +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pytential.source import LayerPotentialSourceBase from pytools import memoize_method @@ -186,7 +187,7 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + return result # {{{ fmm-based execution @@ -287,7 
+288,7 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): # }}} - return result, [] + return result # }}} @@ -312,7 +313,8 @@ class _FMMGeometryCodeContainer(object): """ targets[dim, i] = points[dim, i] """, - default_offset=lp.auto, name="copy_targets") + default_offset=lp.auto, name="copy_targets", + lang_version=MOST_RECENT_LANGUAGE_VERSION) knl = lp.fix_parameters(knl, ndims=self.ambient_dim) @@ -445,7 +447,7 @@ class _FMMGeometryData(object): __all__ = ( - UnregularizedLayerPotentialSource, + "UnregularizedLayerPotentialSource", ) # vim: fdm=marker diff --git a/pytential/version.py b/pytential/version.py index d26fbc2f9341a880b1119e7e6079bd51e59e11b9..5cb9cc61161073b0cd9e9a5fad471dcc6620763d 100644 --- a/pytential/version.py +++ b/pytential/version.py @@ -1,2 +1,49 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +# {{{ find install- or run-time git revision + +import os +if os.environ.get("AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY") is not None: + # We're just being exec'd by setup.py. We can't import anything. + _git_rev = None + +else: + import pytential._git_rev as _git_rev_mod + _git_rev = _git_rev_mod.GIT_REVISION + + # If we're running from a dev tree, the last install (and hence the most + # recent update of the above git rev) could have taken place very long ago. 
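+    # (Editorial sketch, hedged: find_module_git_revision is assumed to come
+    # from pytools, walk n_levels_up directories up from the given file, and
+    # run "git rev-parse HEAD" there, roughly:
+    #
+    #     from subprocess import check_output
+    #     tree_root = os.path.dirname(os.path.dirname(__file__))
+    #     rev = check_output(["git", "rev-parse", "HEAD"], cwd=tree_root)
+    #     runtime_rev = rev.decode().rstrip()
+    #
+    # and to return None when no enclosing git checkout exists.)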
+ from pytools import find_module_git_revision + _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1) + if _runtime_git_rev is not None: + _git_rev = _runtime_git_rev + +# }}} + + VERSION = (2016, 1) VERSION_TEXT = ".".join(str(i) for i in VERSION) + +PYTENTIAL_KERNEL_VERSION = (VERSION, _git_rev, 0) diff --git a/requirements.txt b/requirements.txt index 8925c34ed4f7ccd7426bf4b153b8864bc3e77106..dd15a69ebf6dade1ca2fb2df0bdc3644b6b8c6d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy git+git://github.com/inducer/pymbolic -sympy==1.0 +sympy==1.1.1 git+https://github.com/inducer/modepy git+https://github.com/inducer/pyopencl git+https://github.com/inducer/islpy @@ -8,4 +8,4 @@ git+https://github.com/inducer/loopy git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy -git+https://github.com/inducer/pyfmmlib +git+https://gitlab.tiker.net/inducer/pyfmmlib diff --git a/setup.cfg b/setup.cfg index 42291e82433c3c7522f8e267e41321c4d19db099..a353f3f7242d42f689d5721b8f4a104aaf3e4e6b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,4 @@ + [flake8] ignore = E126,E127,E128,E123,E226,E241,E242,E265,E402,W503,N803,N806,N802,D102,D103 max-line-length=85 @@ -5,3 +6,6 @@ exclude= pytential/symbolic/old_diffop_primitives.py, pytential/symbolic/pde/maxwell/generalized_debye.py, +[tool:pytest] +markers= + slowtest: mark a test as slow diff --git a/setup.py b/setup.py index d8d49a9cb1d30b0ac134f308ce983cd8dee0e7b1..e2a1ae90b75867829508b789fbe340e15b29c426 100644 --- a/setup.py +++ b/setup.py @@ -1,63 +1,111 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import os +from setuptools import setup, find_packages -def main(): - from setuptools import setup, find_packages - - version_dict = {} - init_filename = "pytential/version.py" - exec(compile(open(init_filename, "r").read(), init_filename, "exec"), - version_dict) - - setup(name="pytential", - version=version_dict["VERSION_TEXT"], - description="Evaluate layer and volume potentials accurately. " - "Solve integral equations.", - long_description=open("README.rst", "rt").read(), - author="Andreas Kloeckner", - author_email="inform@tiker.net", - license="MIT", - url="http://wiki.tiker.net/Pytential", - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python', - - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - # 3.x has not yet been tested. 
- 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Visualization', - 'Topic :: Software Development :: Libraries', - 'Topic :: Utilities', - ], - - packages=find_packages(), - - install_requires=[ - "pytest>=2.3", - # FIXME leave out for now - # https://code.google.com/p/sympy/issues/detail?id=3874 - #"sympy>=0.7.2", - - "modepy>=2013.3", - "pyopencl>=2013.1", - "boxtree>=2013.1", - "pymbolic>=2013.2", - "loo.py>=2017.2", - "sumpy>=2013.1", - "cgen>=2013.1.2", - - "six", - ]) - - -if __name__ == '__main__': - main() + +# {{{ capture git revision at install time + +# authoritative version in pytools/__init__.py +def find_git_revision(tree_root): + # Keep this routine self-contained so that it can be copy-pasted into + # setup.py. + + from os.path import join, exists, abspath + tree_root = abspath(tree_root) + + if not exists(join(tree_root, ".git")): + return None + + from subprocess import Popen, PIPE, STDOUT + p = Popen(["git", "rev-parse", "HEAD"], shell=False, + stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True, + cwd=tree_root) + (git_rev, _) = p.communicate() + + import sys + if sys.version_info >= (3,): + git_rev = git_rev.decode() + + git_rev = git_rev.rstrip() + + retcode = p.returncode + assert retcode is not None + if retcode != 0: + from warnings import warn + warn("unable to find git revision") + return None + + return git_rev + + +def write_git_revision(package_name): + from os.path import dirname, join + dn = dirname(__file__) + git_rev = find_git_revision(dn) + + with open(join(dn, package_name, "_git_rev.py"), "w") as outf: + outf.write("GIT_REVISION = %s\n" % repr(git_rev)) + + +write_git_revision("pytential") + +# }}} + + +version_dict = {} +init_filename = "pytential/version.py" +os.environ["AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY"] = "1" +exec(compile(open(init_filename, "r").read(), init_filename, "exec"), + version_dict) + +setup(name="pytential", + version=version_dict["VERSION_TEXT"], + description="Evaluate layer and volume potentials accurately. " + "Solve integral equations.", + long_description=open("README.rst", "rt").read(), + author="Andreas Kloeckner", + author_email="inform@tiker.net", + license="MIT", + url="http://wiki.tiker.net/Pytential", + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Other Audience', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Programming Language :: Python', + + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + # 3.x has not yet been tested. 
+ 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Visualization', + 'Topic :: Software Development :: Libraries', + 'Topic :: Utilities', + ], + + packages=find_packages(), + + install_requires=[ + "pytest>=2.3", + # FIXME leave out for now + # https://code.google.com/p/sympy/issues/detail?id=3874 + #"sympy>=0.7.2", + + "pytools>=2018.2", + "modepy>=2013.3", + "pyopencl>=2013.1", + "boxtree>=2018.2", + "pymbolic>=2013.2", + "loo.py>=2017.2", + "sumpy>=2013.1", + "cgen>=2013.1.2", + "pyfmmlib>=2018.1", + + "six", + ]) diff --git a/test/extra_curve_data.py b/test/extra_curve_data.py index c6679953f523ad5aac6bb56ed4307c41fec97bf8..4d2dacca6bbae937119f59757cdd738b4f0f95e3 100644 --- a/test/extra_curve_data.py +++ b/test/extra_curve_data.py @@ -84,8 +84,8 @@ class Segment(Curve): def __call__(self, ts): return ( - self.start[:, np.newaxis] + - ts * (self.end - self.start)[:, np.newaxis]) + self.start[:, np.newaxis] + + ts * (self.end - self.start)[:, np.newaxis]) class Arc(Curve): @@ -134,12 +134,12 @@ class Arc(Curve): def __call__(self, t): if self.theta_increasing: thetas = ( - self.theta_range[0] + - t * (self.theta_range[1] - self.theta_range[0])) + self.theta_range[0] + + t * (self.theta_range[1] - self.theta_range[0])) else: thetas = ( - self.theta_range[1] - - t * (self.theta_range[1] - self.theta_range[0])) + self.theta_range[1] + - t * (self.theta_range[1] - self.theta_range[0])) val = (self.r * np.exp(1j * thetas)) + self.center return np.array([val.real, val.imag]) @@ -149,19 +149,19 @@ class Arc(Curve): # To avoid issues with crossing non-smooth regions, make sure the number of # panels given to this function (for make_curve_mesh) is a multiple of 8. 
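# (Editorial sketch, hedged: Curve.__add__ is assumed to concatenate
# parametrizations end-to-end, so closed shapes are chained from segments
# and arcs whose endpoints match, e.g.
#
#     triangle = (
#         Segment((0, 0), (1, 0))
#         + Segment((1, 0), (0, 1))
#         + Segment((0, 1), (0, 0)))
#
# The horseshoe below is built the same way.)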
horseshoe = ( - Segment((0, 0), (-5, 0)) + - Arc((-5, 0), (-5.5, -0.5), (-5, -1)) + - Segment((-5, -1), (0, -1)) + - Arc((0, -1), (1.5, 0.5), (0, 2)) + - Segment((0, 2), (-5, 2)) + - Arc((-5, 2), (-5.5, 1.5), (-5, 1)) + - Segment((-5, 1), (0, 1)) + - Arc((0, 1), (0.5, 0.5), (0, 0)) + Segment((0, 0), (-5, 0)) + + Arc((-5, 0), (-5.5, -0.5), (-5, -1)) + + Segment((-5, -1), (0, -1)) + + Arc((0, -1), (1.5, 0.5), (0, 2)) + + Segment((0, 2), (-5, 2)) + + Arc((-5, 2), (-5.5, 1.5), (-5, 1)) + + Segment((-5, 1), (0, 1)) + + Arc((0, 1), (0.5, 0.5), (0, 0)) ) # unit square unit_square = ( - Segment((1, -1), (1, 1)) + - Segment((1, 1), (-1, 1)) + - Segment((-1, 1), (-1, -1)) + - Segment((-1, -1), (1, -1))) + Segment((1, -1), (1, 1)) + + Segment((1, 1), (-1, 1)) + + Segment((-1, 1), (-1, -1)) + + Segment((-1, -1), (1, -1))) diff --git a/test/test_global_qbx.py b/test/test_global_qbx.py index cc0f26f0ebbce0cee1ab5c423fb92f6ed94b9c66..7f700fa104b2bc10e99416d594aecd090eca69eb 100644 --- a/test/test_global_qbx.py +++ b/test/test_global_qbx.py @@ -96,7 +96,9 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): from pytential.qbx.utils import TreeCodeContainer - lpot_source = QBXLayerPotentialSource(discr, order) + lpot_source = QBXLayerPotentialSource(discr, + qbx_order=order, # not used in refinement + fine_order=order) del discr expansion_disturbance_tolerance = 0.025 @@ -121,9 +123,10 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): int_centers = np.array([axis.get(queue) for axis in int_centers]) ext_centers = get_centers_on_side(lpot_source, +1) ext_centers = np.array([axis.get(queue) for axis in ext_centers]) - expansion_radii = lpot_source._expansion_radii("npanels").get(queue) - panel_sizes = lpot_source._panel_sizes("npanels").get(queue) - fine_panel_sizes = lpot_source._fine_panel_sizes("npanels").get(queue) + expansion_radii = lpot_source._expansion_radii("nsources").get(queue) + quad_res = lpot_source._coarsest_quad_resolution("npanels").get(queue) + source_danger_zone_radii = \ + lpot_source._source_danger_zone_radii("npanels").get(queue) # {{{ check if satisfying criteria @@ -141,8 +144,8 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): # =distance(centers of panel 1, panel 2) dist = ( la.norm(( - all_centers[..., np.newaxis] - - nodes[:, np.newaxis, ...]).T, + all_centers[..., np.newaxis] + - nodes[:, np.newaxis, ...]).T, axis=-1) .min()) @@ -150,12 +153,12 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): # A center cannot be closer to another panel than to its originating # panel. 
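        # (Editorial note, hedged: the assertion below encodes
        # dist >= (1 - eps) * r, with r the per-source expansion radius and
        # eps = expansion_disturbance_tolerance; refinement may push a center
        # toward a neighboring panel by at most a relative eps.)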
- rad = expansion_radii[centers_panel.element_nr] - assert dist >= rad * (1-expansion_disturbance_tolerance), \ + rad = expansion_radii[centers_panel.discr_slice] + assert (dist >= rad * (1-expansion_disturbance_tolerance)).all(), \ (dist, rad, centers_panel.element_nr, sources_panel.element_nr) def check_sufficient_quadrature_resolution(centers_panel, sources_panel): - h = fine_panel_sizes[sources_panel.element_nr] + dz_radius = source_danger_zone_radii[sources_panel.element_nr] my_int_centers = int_centers[:, centers_panel.discr_slice] my_ext_centers = ext_centers[:, centers_panel.discr_slice] @@ -166,20 +169,20 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): # =distance(centers of panel 1, panel 2) dist = ( la.norm(( - all_centers[..., np.newaxis] - - nodes[:, np.newaxis, ...]).T, + all_centers[..., np.newaxis] + - nodes[:, np.newaxis, ...]).T, axis=-1) .min()) # Criterion: # The quadrature contribution from each panel is as accurate # as from the center's own source panel. - assert dist >= h / 4, \ - (dist, h, centers_panel.element_nr, sources_panel.element_nr) + assert dist >= dz_radius, \ + (dist, dz_radius, centers_panel.element_nr, sources_panel.element_nr) - def check_panel_size_to_helmholtz_k_ratio(panel): + def check_quad_res_to_helmholtz_k_ratio(panel): # Check wavenumber to panel size ratio. - assert panel_sizes[panel.element_nr] * helmholtz_k <= 5 + assert quad_res[panel.element_nr] * helmholtz_k <= 5 for i, panel_1 in enumerate(iter_elements(lpot_source.density_discr)): for panel_2 in iter_elements(lpot_source.density_discr): @@ -187,7 +190,7 @@ def run_source_refinement_test(ctx_getter, mesh, order, helmholtz_k=None): for panel_2 in iter_elements(lpot_source.quad_stage2_density_discr): check_sufficient_quadrature_resolution(panel_1, panel_2) if helmholtz_k is not None: - check_panel_size_to_helmholtz_k_ratio(panel_1) + check_quad_res_to_helmholtz_k_ratio(panel_1) # }}} @@ -219,7 +222,8 @@ def test_source_refinement_3d(ctx_getter, surface_name, surface_f, order): ("20-to-1 ellipse", partial(ellipse, 20), 100), ("horseshoe", horseshoe, 64), ]) -def test_target_association(ctx_getter, curve_name, curve_f, nelements): +def test_target_association(ctx_getter, curve_name, curve_f, nelements, + visualize=False): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) @@ -237,7 +241,9 @@ def test_target_association(ctx_getter, curve_name, curve_f, nelements): discr = Discretization(cl_ctx, mesh, factory) - lpot_source, conn = QBXLayerPotentialSource(discr, order).with_refinement() + lpot_source, conn = QBXLayerPotentialSource(discr, + qbx_order=order, # not used in target association + fine_order=order).with_refinement() del discr from pytential.qbx.utils import get_interleaved_centers @@ -312,51 +318,79 @@ def test_target_association(ctx_getter, curve_name, curve_f, nelements): expansion_radii = lpot_source._expansion_radii("ncenters").get(queue) + surf_targets = np.array( + [axis.get(queue) for axis in lpot_source.density_discr.nodes()]) int_targets = np.array([axis.get(queue) for axis in int_targets.nodes()]) ext_targets = np.array([axis.get(queue) for axis in ext_targets.nodes()]) - # Checks that the sources match with their own centers. 
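    # (Editorial note: the on-surface special case below is superseded by
    # routing surface targets through check_close_targets as well. Centers
    # are interleaved two per source, so even-numbered centers sit on the
    # interior (-1) side and odd-numbered ones on the exterior (+1) side,
    # which is what target_to_center_side = 2 * (target_to_center % 2) - 1
    # computes further down.)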
- def check_on_surface_targets(nsources, true_side, target_to_center, - target_to_side_result): - assert (target_to_center >= 0).all() + def visualize_curve_and_assoc(): + import matplotlib.pyplot as plt + from meshmode.mesh.visualization import draw_curve - sources = np.arange(0, nsources) + draw_curve(lpot_source.density_discr.mesh) - # Centers are on alternating sides of the geometry. Dividing by - # two yields the number of the source that spawned the center. - assert (target_to_center//2 == sources).all() + targets = int_targets + tgt_slice = surf_int_slice - assert (target_to_side_result == true_side).all() + plt.plot(centers[0], centers[1], "+", color="orange") + ax = plt.gca() + + for tx, ty, tcenter in zip( + targets[0, tgt_slice], + targets[1, tgt_slice], + target_assoc.target_to_center[tgt_slice]): + if tcenter >= 0: + ax.add_artist( + plt.Line2D( + (tx, centers[0, tcenter]), + (ty, centers[1, tcenter]), + )) + + ax.set_aspect("equal") + plt.show() + + if visualize: + visualize_curve_and_assoc() # Checks that the targets match with centers on the appropriate side and # within the allowable distance. def check_close_targets(centers, targets, true_side, target_to_center, target_to_side_result, tgt_slice): - assert (target_to_center >= 0).all() + targets_have_centers = (target_to_center >= 0).all() + assert targets_have_centers + assert (target_to_side_result == true_side).all() + + TOL = 1e-3 dists = la.norm((targets.T - centers.T[target_to_center]), axis=1) - assert (dists <= expansion_radii[target_to_center]).all() + assert (dists <= (1 + TOL) * expansion_radii[target_to_center]).all() # Center side order = -1, 1, -1, 1, ... target_to_center_side = 2 * (target_assoc.target_to_center % 2) - 1 - check_on_surface_targets( - nsources, -1, + # interior surface + check_close_targets( + centers, surf_targets, -1, target_assoc.target_to_center[surf_int_slice], - target_to_center_side[surf_int_slice]) + target_to_center_side[surf_int_slice], + surf_int_slice) - check_on_surface_targets( - nsources, +1, + # exterior surface + check_close_targets( + centers, surf_targets, +1, target_assoc.target_to_center[surf_ext_slice], - target_to_center_side[surf_ext_slice]) + target_to_center_side[surf_ext_slice], + surf_ext_slice) + # interior volume check_close_targets( centers, int_targets, -1, target_assoc.target_to_center[vol_int_slice], target_to_center_side[vol_int_slice], vol_int_slice) + # exterior volume check_close_targets( centers, ext_targets, +1, target_assoc.target_to_center[vol_ext_slice], @@ -387,7 +421,9 @@ def test_target_association_failure(ctx_getter): InterpolatoryQuadratureSimplexGroupFactory factory = InterpolatoryQuadratureSimplexGroupFactory(order) discr = Discretization(cl_ctx, mesh, factory) - lpot_source = QBXLayerPotentialSource(discr, order) + lpot_source = QBXLayerPotentialSource(discr, + qbx_order=order, # not used in target association + fine_order=order) # }}} @@ -431,7 +467,7 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_layer_pot.py b/test/test_layer_pot.py index cb8669e4eae4127510b45712b111a3d9fdd67b12..4b5d70b675871031023d3f5cf40c049c008d0f5b 100644 --- a/test/test_layer_pot.py +++ b/test/test_layer_pot.py @@ -92,7 +92,7 @@ def test_off_surface_eval(ctx_getter, use_fmm, do_plot=False): nelements = 30 target_order = 8 qbx_order = 3 - if use_fmm is True: + if use_fmm: fmm_order = qbx_order else: fmm_order = False @@ 
-139,8 +139,7 @@ def test_off_surface_eval(ctx_getter, use_fmm, do_plot=False): pt.colorbar() pt.show() - # FIXME: Why does the FMM only meet this sloppy tolerance? - assert linf_err < 1e-2 + assert linf_err < 1e-3 # }}} @@ -389,22 +388,24 @@ def test_perf_data_gathering(ctx_getter, n_arms=5): # {{{ test 3D jump relations -@pytest.mark.parametrize("relation", ["sp", "nxcurls"]) +@pytest.mark.parametrize("relation", ["sp", "nxcurls", "div_s"]) def test_3d_jump_relations(ctx_factory, relation, visualize=False): - #logging.basicConfig(level=logging.INFO) - - pytest.importorskip("pyfmmlib") + # logging.basicConfig(level=logging.INFO) cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) - target_order = 4 + if relation == "div_s": + target_order = 3 + else: + target_order = 4 + qbx_order = target_order from pytools.convergence import EOCRecorder eoc_rec = EOCRecorder() - for nel_factor in [6, 8, 12]: + for nel_factor in [6, 10, 14]: from meshmode.mesh.generation import generate_torus mesh = generate_torus( 5, 2, order=target_order, @@ -469,6 +470,16 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False): - (sym.Sp(knl, density_sym, qbx_forced_limit="avg") - 0.5*density_sym)) + elif relation == "div_s": + + density = m.cos(2*x) * m.cos(2*y) * m.cos(z) + density_sym = sym.var("density") + + jump_identity_sym = ( + sym.div(sym.S(knl, sym.normal(3).as_vector()*density_sym, + qbx_forced_limit="avg")) + + sym.D(knl, density_sym, qbx_forced_limit="avg")) + else: raise ValueError("unexpected value of 'relation': %s" % relation) @@ -477,8 +488,7 @@ def test_3d_jump_relations(ctx_factory, relation, visualize=False): err = ( norm(qbx, queue, jump_identity, np.inf) - / - norm(qbx, queue, density, np.inf)) + / norm(qbx, queue, density, np.inf)) print("ERROR", qbx.h_max, err) eoc_rec.add_data_point(qbx.h_max, err) @@ -540,7 +550,7 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_layer_pot_eigenvalues.py b/test/test_layer_pot_eigenvalues.py index 2da95fcbd473220475719c476e4d4c50bac055ac..b4e986d4af6e16cc149effac6a08403d11668b4c 100644 --- a/test/test_layer_pot_eigenvalues.py +++ b/test/test_layer_pot_eigenvalues.py @@ -100,7 +100,7 @@ def test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, np.linspace(0, 1, nelements+1), target_order) - fmm_order = 10 + fmm_order = 12 if force_direct: fmm_order = False @@ -126,7 +126,7 @@ def test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, centers_h = [centers[0].get(), centers[1].get()] pt.plot(nodes_h[0], nodes_h[1], "x-") pt.plot(centers_h[0], centers_h[1], "o") - normal = bind(qbx, sym.normal())(queue).as_vector(np.object) + normal = bind(qbx, sym.normal(ambient_dim=2))(queue).as_vector(np.object) pt.quiver(nodes_h[0], nodes_h[1], normal[0].get(), normal[1].get()) pt.gca().set_aspect("equal") @@ -166,8 +166,7 @@ def test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, s_err = ( norm(density_discr, queue, s_sigma - s_sigma_ref) - / - norm(density_discr, queue, s_sigma_ref)) + / norm(density_discr, queue, s_sigma_ref)) s_eoc_rec.add_data_point(qbx.h_max, s_err) # }}} @@ -198,8 +197,7 @@ def test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, d_err = ( norm(density_discr, queue, d_sigma - d_sigma_ref) - / - d_ref_norm) + / d_ref_norm) d_eoc_rec.add_data_point(qbx.h_max, d_err) # }}} @@ -218,8 +216,7 @@ def 
test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, sp_err = ( norm(density_discr, queue, sp_sigma - sp_sigma_ref) - / - norm(density_discr, queue, sigma)) + / norm(density_discr, queue, sigma)) sp_eoc_rec.add_data_point(qbx.h_max, sp_err) # }}} @@ -252,15 +249,12 @@ def test_ellipse_eigenvalues(ctx_getter, ellipse_aspect, mode_nr, qbx_order, "sumpy", "fmmlib", ]) -def no_test_sphere_eigenvalues(ctx_getter, mode_m, mode_n, qbx_order, +def test_sphere_eigenvalues(ctx_getter, mode_m, mode_n, qbx_order, fmm_backend): logging.basicConfig(level=logging.INFO) special = pytest.importorskip("scipy.special") - if fmm_backend == "fmmlib": - pytest.importorskip("pyfmmlib") - cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) @@ -280,8 +274,7 @@ def no_test_sphere_eigenvalues(ctx_getter, mode_m, mode_n, qbx_order, def rel_err(comp, ref): return ( norm(density_discr, queue, comp - ref) - / - norm(density_discr, queue, ref)) + / norm(density_discr, queue, ref)) for nrefinements in [0, 1]: from meshmode.mesh.generation import generate_icosphere @@ -382,7 +375,7 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py index fe001342de665dc58cfdee0d8f9047f74f7c7c3c..c376fdf3e5072aba8604147c70a4f5d918c15f33 100644 --- a/test/test_layer_pot_identity.py +++ b/test/test_layer_pot_identity.py @@ -82,7 +82,7 @@ class StarfishGeometry(object): dim = 2 - resolutions = [30, 50, 70] + resolutions = [30, 50, 70, 90] def get_mesh(self, nelements, target_order): return make_curve_mesh( @@ -240,9 +240,6 @@ class DynamicTestCase(object): and self.expr.zero_op_name == "green_grad"): pytest.skip("does not achieve sufficient precision") - if self.fmm_backend == "fmmlib": - pytest.importorskip("pyfmmlib") - # {{{ integral identity tester @@ -410,7 +407,7 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a063ca3b94f515e20ee9b130893f0f23243dc3 --- /dev/null +++ b/test/test_linalg_proxy.py @@ -0,0 +1,436 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import os +import time + +import numpy as np +import numpy.linalg as la + +import pyopencl as cl +from pyopencl.array import to_device + +from sumpy.tools import BlockIndexRanges +from meshmode.mesh.generation import ( # noqa + ellipse, NArmedStarfish, generate_torus, make_curve_mesh) + +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + + +def _build_qbx_discr(queue, + ndim=2, + nelements=30, + target_order=7, + qbx_order=4, + curve_f=None): + + if curve_f is None: + curve_f = NArmedStarfish(5, 0.25) + + if ndim == 2: + mesh = make_curve_mesh(curve_f, + np.linspace(0, 1, nelements + 1), + target_order) + elif ndim == 3: + mesh = generate_torus(10.0, 2.0, order=target_order) + else: + raise ValueError("unsupported ambient dimension") + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + from pytential.qbx import QBXLayerPotentialSource + density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + qbx, _ = QBXLayerPotentialSource(density_discr, + fine_order=4 * target_order, + qbx_order=qbx_order, + fmm_order=False).with_refinement() + + return qbx + + +def _build_block_index(discr, + nblks=10, + factor=1.0, + method='elements', + use_tree=True): + + from pytential.linalg.proxy import ( + partition_by_nodes, partition_by_elements) + + if method == 'elements': + factor = 1.0 + + if method == 'nodes': + nnodes = discr.nnodes + else: + nnodes = discr.mesh.nelements + max_particles_in_box = nnodes // nblks + + # create index ranges + if method == 'nodes': + indices = partition_by_nodes(discr, + use_tree=use_tree, + max_nodes_in_box=max_particles_in_box) + elif method == 'elements': + indices = partition_by_elements(discr, + use_tree=use_tree, + max_elements_in_box=max_particles_in_box) + else: + raise ValueError('unknown method: {}'.format(method)) + + # randomly pick a subset of points + if abs(factor - 1.0) > 1.0e-14: + with cl.CommandQueue(discr.cl_context) as queue: + indices = indices.get(queue) + + indices_ = np.empty(indices.nblocks, dtype=np.object) + for i in range(indices.nblocks): + iidx = indices.block_indices(i) + isize = int(factor * len(iidx)) + isize = max(1, min(isize, len(iidx))) + + indices_[i] = np.sort( + np.random.choice(iidx, size=isize, replace=False)) + + ranges_ = to_device(queue, + np.cumsum([0] + [r.shape[0] for r in indices_])) + indices_ = to_device(queue, np.hstack(indices_)) + + indices = BlockIndexRanges(discr.cl_context, + indices_.with_queue(None), + ranges_.with_queue(None)) + + return indices + + +def _plot_partition_indices(queue, discr, indices, **kwargs): + import matplotlib.pyplot as pt + indices = indices.get(queue) + + args = [ + kwargs.get("method", "unknown"), + "tree" if kwargs.get("use_tree", False) else "linear", + kwargs.get("pid", "stage1"), + discr.ambient_dim + ] + + pt.figure(figsize=(10, 8), dpi=300) + pt.plot(np.diff(indices.ranges)) + pt.savefig("test_partition_{0}_{1}_{3}d_ranges_{2}.png".format(*args)) + pt.clf() + + if discr.ambient_dim == 2: + sources = discr.nodes().get(queue) + + pt.figure(figsize=(10, 8), dpi=300) + + if 
indices.indices.shape[0] != discr.nnodes: + pt.plot(sources[0], sources[1], 'ko', alpha=0.5) + for i in range(indices.nblocks): + isrc = indices.block_indices(i) + pt.plot(sources[0][isrc], sources[1][isrc], 'o') + + pt.xlim([-1.5, 1.5]) + pt.ylim([-1.5, 1.5]) + pt.savefig("test_partition_{0}_{1}_{3}d_{2}.png".format(*args)) + pt.clf() + elif discr.ambient_dim == 3: + from meshmode.discretization import NoninterpolatoryElementGroupError + try: + discr.groups[0].basis() + except NoninterpolatoryElementGroupError: + return + + from meshmode.discretization.visualization import make_visualizer + marker = -42.0 * np.ones(discr.nnodes) + + for i in range(indices.nblocks): + isrc = indices.block_indices(i) + marker[isrc] = 10.0 * (i + 1.0) + + vis = make_visualizer(queue, discr, 10) + + filename = "test_partition_{0}_{1}_{3}d_{2}.png".format(*args) + if os.path.isfile(filename): + os.remove(filename) + + vis.write_vtk_file(filename, [ + ("marker", cl.array.to_device(queue, marker)) + ]) + + +@pytest.mark.parametrize("method", ["nodes", "elements"]) +@pytest.mark.parametrize("use_tree", [True, False]) +@pytest.mark.parametrize("ndim", [2, 3]) +def test_partition_points(ctx_factory, method, use_tree, ndim, visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + qbx = _build_qbx_discr(queue, ndim=ndim) + _build_block_index(qbx.density_discr, + method=method, + use_tree=use_tree, + factor=0.6) + + +@pytest.mark.parametrize("use_tree", [True, False]) +@pytest.mark.parametrize("ndim", [2, 3]) +def test_partition_coarse(ctx_factory, use_tree, ndim, visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + qbx = _build_qbx_discr(queue, ndim=ndim) + srcindices = _build_block_index(qbx.density_discr, + method="elements", use_tree=use_tree) + + if visualize: + discr = qbx.resampler.from_discr + _plot_partition_indices(queue, discr, srcindices, + method="elements", use_tree=use_tree, pid="stage1") + + from pytential.linalg.proxy import partition_from_coarse + resampler = qbx.direct_resampler + + t_start = time.time() + srcindices_ = partition_from_coarse(resampler, srcindices) + t_end = time.time() + if visualize: + print('Time: {:.5f}s'.format(t_end - t_start)) + + srcindices = srcindices.get(queue) + srcindices_ = srcindices_.get(queue) + + sources = resampler.from_discr.nodes().get(queue) + sources_ = resampler.to_discr.nodes().get(queue) + + for i in range(srcindices.nblocks): + isrc = srcindices.block_indices(i) + isrc_ = srcindices_.block_indices(i) + + for j in range(ndim): + assert np.min(sources_[j][isrc_]) <= np.min(sources[j][isrc]) + assert np.max(sources_[j][isrc_]) >= np.max(sources[j][isrc]) + + if visualize: + discr = resampler.to_discr + _plot_partition_indices(queue, discr, srcindices_, + method="elements", use_tree=use_tree, pid="stage2") + + +@pytest.mark.parametrize("ndim", [2, 3]) +@pytest.mark.parametrize("factor", [1.0, 0.6]) +def test_proxy_generator(ctx_factory, ndim, factor, visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + qbx = _build_qbx_discr(queue, ndim=ndim) + srcindices = _build_block_index(qbx.density_discr, + method='nodes', factor=factor) + + from pytential.linalg.proxy import ProxyGenerator + generator = ProxyGenerator(qbx, ratio=1.1) + proxies, pxyranges, pxycenters, pxyradii = generator(queue, srcindices) + + proxies = np.vstack([p.get() for p in proxies]) + pxyranges = pxyranges.get() + pxycenters = np.vstack([c.get() for c in pxycenters]) + pxyradii = pxyradii.get() + + for i in range(srcindices.nblocks): + 
ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]] + + r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0) + assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14) + + srcindices = srcindices.get(queue) + if visualize: + if qbx.ambient_dim == 2: + import matplotlib.pyplot as pt + from pytential.qbx.utils import get_centers_on_side + + density_nodes = qbx.density_discr.nodes().get(queue) + ci = get_centers_on_side(qbx, -1) + ci = np.vstack([c.get(queue) for c in ci]) + ce = get_centers_on_side(qbx, +1) + ce = np.vstack([c.get(queue) for c in ce]) + r = qbx._expansion_radii("nsources").get(queue) + + for i in range(srcindices.nblocks): + isrc = srcindices.block_indices(i) + ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]] + + pt.figure(figsize=(10, 8)) + axis = pt.gca() + for j in isrc: + c = pt.Circle(ci[:, j], r[j], color='k', alpha=0.1) + axis.add_artist(c) + c = pt.Circle(ce[:, j], r[j], color='k', alpha=0.1) + axis.add_artist(c) + + pt.plot(density_nodes[0], density_nodes[1], + 'ko', ms=2.0, alpha=0.5) + pt.plot(density_nodes[0, srcindices.indices], + density_nodes[1, srcindices.indices], + 'o', ms=2.0) + pt.plot(density_nodes[0, isrc], density_nodes[1, isrc], + 'o', ms=2.0) + pt.plot(proxies[0, ipxy], proxies[1, ipxy], + 'o', ms=2.0) + pt.xlim([-1.5, 1.5]) + pt.ylim([-1.5, 1.5]) + + filename = "test_proxy_generator_{}d_{:04}.png".format(ndim, i) + pt.savefig(filename, dpi=300) + pt.clf() + else: + from meshmode.discretization.visualization import make_visualizer + from meshmode.mesh.processing import ( # noqa + affine_map, merge_disjoint_meshes) + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + + from meshmode.mesh.generation import generate_icosphere + ref_mesh = generate_icosphere(1, generator.nproxy) + + # NOTE: this does not plot the actual proxy points + for i in range(srcindices.nblocks): + mesh = affine_map(ref_mesh, + A=(pxyradii[i] * np.eye(ndim)), + b=pxycenters[:, i].reshape(-1)) + + mesh = merge_disjoint_meshes([mesh, qbx.density_discr.mesh]) + discr = Discretization(ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(10)) + + vis = make_visualizer(queue, discr, 10) + filename = "test_proxy_generator_{}d_{:04}.vtu".format(ndim, i) + if os.path.isfile(filename): + os.remove(filename) + vis.write_vtk_file(filename, []) + + +@pytest.mark.parametrize("ndim", [2, 3]) +@pytest.mark.parametrize("factor", [1.0, 0.6]) +def test_interaction_points(ctx_factory, ndim, factor, visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + qbx = _build_qbx_discr(queue, ndim=ndim) + srcindices = _build_block_index(qbx.density_discr, + method='nodes', factor=factor) + + # generate proxy points + from pytential.linalg.proxy import ProxyGenerator + generator = ProxyGenerator(qbx) + _, _, pxycenters, pxyradii = generator(queue, srcindices) + + from pytential.linalg.proxy import ( # noqa + gather_block_neighbor_points, + gather_block_interaction_points) + nbrindices = gather_block_neighbor_points(qbx.density_discr, + srcindices, pxycenters, pxyradii) + nodes, ranges = gather_block_interaction_points(qbx, srcindices) + + srcindices = srcindices.get(queue) + nbrindices = nbrindices.get(queue) + + for i in range(srcindices.nblocks): + isrc = srcindices.block_indices(i) + inbr = nbrindices.block_indices(i) + + assert not np.any(np.isin(inbr, isrc)) + + if visualize: + if ndim == 2: + import matplotlib.pyplot as pt + density_nodes = qbx.density_discr.nodes().get(queue) + 
nodes = nodes.get(queue) + ranges = ranges.get(queue) + + for i in range(srcindices.nblocks): + isrc = srcindices.block_indices(i) + inbr = nbrindices.block_indices(i) + iall = np.s_[ranges[i]:ranges[i + 1]] + + pt.figure(figsize=(10, 8)) + pt.plot(density_nodes[0], density_nodes[1], + 'ko', ms=2.0, alpha=0.5) + pt.plot(density_nodes[0, srcindices.indices], + density_nodes[1, srcindices.indices], + 'o', ms=2.0) + pt.plot(density_nodes[0, isrc], density_nodes[1, isrc], + 'o', ms=2.0) + pt.plot(density_nodes[0, inbr], density_nodes[1, inbr], + 'o', ms=2.0) + pt.plot(nodes[0, iall], nodes[1, iall], + 'x', ms=2.0) + pt.xlim([-1.5, 1.5]) + pt.ylim([-1.5, 1.5]) + + filename = "test_area_query_{}d_{:04}.png".format(ndim, i) + pt.savefig(filename, dpi=300) + pt.clf() + elif ndim == 3: + from meshmode.discretization.visualization import make_visualizer + marker = np.empty(qbx.density_discr.nnodes) + + for i in range(srcindices.nblocks): + isrc = srcindices.block_indices(i) + inbr = nbrindices.block_indices(i) + + # TODO: some way to turn off some of the interpolations + # would help visualize this better. + marker.fill(0.0) + marker[srcindices.indices] = 0.0 + marker[isrc] = -42.0 + marker[inbr] = +42.0 + marker_dev = cl.array.to_device(queue, marker) + + vis = make_visualizer(queue, qbx.density_discr, 10) + filename = "test_area_query_{}d_{:04}.vtu".format(ndim, i) + if os.path.isfile(filename): + os.remove(filename) + + vis.write_vtk_file(filename, [ + ("marker", marker_dev), + ]) + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker diff --git a/test/test_matrix.py b/test/test_matrix.py index d1558bcc03fcd42c64c4cde780557721a40e3131..d912b2e75f1e932c15e697b130ebfcada5808197 100644 --- a/test/test_matrix.py +++ b/test/test_matrix.py @@ -1,6 +1,9 @@ from __future__ import division, absolute_import, print_function -__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" +__copyright__ = """ +Copyright (C) 2015 Andreas Kloeckner +Copyright (C) 2018 Alexandru Fikl +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,123 +25,215 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from functools import partial + import numpy as np import numpy.linalg as la + import pyopencl as cl -import pytest +import pyopencl.array # noqa + +from pytools.obj_array import make_obj_array, is_obj_array + +from sumpy.symbolic import USE_SYMENGINE from meshmode.mesh.generation import ( # noqa - ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, - make_curve_mesh) + ellipse, NArmedStarfish, make_curve_mesh, generate_torus) + from pytential import bind, sym -from functools import partial -from sumpy.symbolic import USE_SYMENGINE +from pytential.symbolic.primitives import DEFAULT_SOURCE, DEFAULT_TARGET +from pytential.symbolic.primitives import QBXSourceStage1, QBXSourceQuadStage2 +import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) -@pytest.mark.skipif(USE_SYMENGINE, - reason="https://gitlab.tiker.net/inducer/sumpy/issues/25") -def test_matrix_build(ctx_factory): - cl_ctx = ctx_factory() - queue = cl.CommandQueue(cl_ctx) +def _build_qbx_discr(queue, + ndim=2, + nelements=30, + target_order=7, + qbx_order=4, + curve_f=None): - # prevent cache 'splosion - from sympy.core.cache import clear_cache - clear_cache() + if curve_f is None: + curve_f = NArmedStarfish(5, 0.25) + + if ndim == 2: + mesh = make_curve_mesh(curve_f, + np.linspace(0, 1, nelements + 1), + target_order) + elif ndim == 3: + mesh = generate_torus(10.0, 2.0, order=target_order) + else: + raise ValueError("unsupported ambient dimension") + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + from pytential.qbx import QBXLayerPotentialSource + density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + qbx, _ = QBXLayerPotentialSource(density_discr, + fine_order=4 * target_order, + qbx_order=qbx_order, + fmm_order=False).with_refinement() + + return qbx - target_order = 7 - qbx_order = 4 - nelements = 30 - curve_f = partial(ellipse, 3) - k = 1 +def _build_block_index(discr, nblks=10, factor=1.0): + nnodes = discr.nnodes + max_particles_in_box = nnodes // nblks + + from pytential.linalg.proxy import partition_by_nodes + indices = partition_by_nodes(discr, use_tree=True, + max_nodes_in_box=max_particles_in_box) + + # randomly pick a subset of points + from sumpy.tools import MatrixBlockIndexRanges, BlockIndexRanges + if abs(factor - 1.0) > 1.0e-14: + with cl.CommandQueue(discr.cl_context) as queue: + indices = indices.get(queue) + + indices_ = np.empty(indices.nblocks, dtype=np.object) + for i in range(indices.nblocks): + iidx = indices.block_indices(i) + isize = int(factor * len(iidx)) + isize = max(1, min(isize, len(iidx))) + + indices_[i] = np.sort( + np.random.choice(iidx, size=isize, replace=False)) + + ranges_ = cl.array.to_device(queue, + np.cumsum([0] + [r.shape[0] for r in indices_])) + indices_ = cl.array.to_device(queue, np.hstack(indices_)) + + indices = BlockIndexRanges(discr.cl_context, + indices_.with_queue(None), + ranges_.with_queue(None)) + + indices = MatrixBlockIndexRanges(indices.cl_context, + indices, indices) + + return indices + + +def _build_op(lpot_id, + k=0, + ndim=2, + qbx_forced_limit="avg"): from sumpy.kernel import LaplaceKernel, HelmholtzKernel if k: - knl = HelmholtzKernel(2) + knl = HelmholtzKernel(ndim) knl_kwargs = {"k": k} else: - knl = LaplaceKernel(2) + knl = LaplaceKernel(ndim) knl_kwargs = {} - from pytools.obj_array import make_obj_array, is_obj_array - - if 1: + 
lpot_kwargs = {"qbx_forced_limit": qbx_forced_limit} + lpot_kwargs.update(knl_kwargs) + if lpot_id == 1: + # scalar single-layer potential + u_sym = sym.var("u") + op = sym.S(knl, u_sym, **lpot_kwargs) + elif lpot_id == 2: + # scalar combination of layer potentials + u_sym = sym.var("u") + op = sym.S(knl, 0.3 * u_sym, **lpot_kwargs) \ + + sym.D(knl, 0.5 * u_sym, **lpot_kwargs) + elif lpot_id == 3: + # vector potential u_sym = sym.make_sym_vector("u", 2) u0_sym, u1_sym = u_sym op = make_obj_array([ - sym.Sp(knl, u0_sym, **knl_kwargs) - + sym.D(knl, u1_sym, **knl_kwargs), - - sym.S(knl, 0.4*u0_sym, **knl_kwargs) - + 0.3*sym.D(knl, u0_sym, **knl_kwargs) + sym.Sp(knl, u0_sym, **lpot_kwargs) + + sym.D(knl, u1_sym, **lpot_kwargs), + sym.S(knl, 0.4 * u0_sym, **lpot_kwargs) + + 0.3 * sym.D(knl, u0_sym, **lpot_kwargs) ]) - elif 0: - u_sym = sym.var("u") - op = sym.Sp(knl, u_sym, **knl_kwargs) else: - k0 = 3 - k1 = 2.9 - beta = 2.5 - - from pytential.symbolic.pde.scalar import ( # noqa - DielectricSRep2DBoundaryOperator as SRep, - DielectricSDRep2DBoundaryOperator as SDRep) - pde_op = SDRep( - mode="tem", - k_vacuum=1, - interfaces=((0, 1, sym.DEFAULT_SOURCE),), - domain_k_exprs=(k0, k1), - beta=beta, - use_l2_weighting=False) - - u_sym = pde_op.make_unknown("u") - op = pde_op.operator(u_sym) + raise ValueError("Unknown lpot_id: {}".format(lpot_id)) + + op = 0.5 * u_sym + op + + return op, u_sym, knl_kwargs + + +def _max_block_error(mat, blk, index_set): + error = -np.inf + for i in range(index_set.nblocks): + mat_i = index_set.take(mat, i) + blk_i = index_set.block_take(blk, i) + + error = max(error, la.norm(mat_i - blk_i) / la.norm(mat_i)) + + return error + +@pytest.mark.skipif(USE_SYMENGINE, + reason="https://gitlab.tiker.net/inducer/sumpy/issues/25") +@pytest.mark.parametrize("k", [0, 42]) +@pytest.mark.parametrize("curve_f", [ + partial(ellipse, 3), + NArmedStarfish(5, 0.25)]) +@pytest.mark.parametrize("lpot_id", [2, 3]) +def test_matrix_build(ctx_factory, k, curve_f, lpot_id, visualize=False): + cl_ctx = ctx_factory() + queue = cl.CommandQueue(cl_ctx) + + # prevent cache 'splosion + from sympy.core.cache import clear_cache + clear_cache() + + target_order = 7 + qbx_order = 4 + nelements = 30 mesh = make_curve_mesh(curve_f, - np.linspace(0, 1, nelements+1), + np.linspace(0, 1, nelements + 1), target_order) from meshmode.discretization import Discretization from meshmode.discretization.poly_element import \ InterpolatoryQuadratureSimplexGroupFactory - from pytential.qbx import QBXLayerPotentialSource pre_density_discr = Discretization( cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) - qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4*target_order, + from pytential.qbx import QBXLayerPotentialSource + qbx, _ = QBXLayerPotentialSource(pre_density_discr, 4 * target_order, qbx_order, # Don't use FMM for now fmm_order=False).with_refinement() - density_discr = qbx.density_discr + op, u_sym, knl_kwargs = _build_op(lpot_id, k=k) bound_op = bind(qbx, op) from pytential.symbolic.execution import build_matrix mat = build_matrix(queue, qbx, op, u_sym).get() - if 0: + if visualize: from sumpy.tools import build_matrix as build_matrix_via_matvec - mat2 = build_matrix_via_matvec(bound_op.scipy_op(queue, "u")) + mat2 = bound_op.scipy_op(queue, "u", dtype=mat.dtype, **knl_kwargs) + mat2 = build_matrix_via_matvec(mat2) + print(la.norm((mat - mat2).real, "fro") / la.norm(mat2.real, "fro"), + la.norm((mat - mat2).imag, "fro") / la.norm(mat2.imag, "fro")) - print( - 
la.norm((mat-mat2).real, "fro")/la.norm(mat2.real, "fro"), - la.norm((mat-mat2).imag, "fro")/la.norm(mat2.imag, "fro")) import matplotlib.pyplot as pt pt.subplot(121) - pt.imshow(np.log10(np.abs(1e-20+(mat-mat2).real))) + pt.imshow(np.log10(np.abs(1.0e-20 + (mat - mat2).real))) pt.colorbar() pt.subplot(122) - pt.imshow(np.log10(np.abs(1e-20+(mat-mat2).imag))) + pt.imshow(np.log10(np.abs(1.0e-20 + (mat - mat2).imag))) pt.colorbar() pt.show() - if 0: + if visualize: import matplotlib.pyplot as pt pt.subplot(121) pt.imshow(mat.real) @@ -154,7 +249,7 @@ def test_matrix_build(ctx_factory): if is_obj_array(u_sym): u = make_obj_array([ np.random.randn(density_discr.nnodes) - for i in range(len(u_sym)) + for _ in range(len(u_sym)) ]) else: u = np.random.randn(density_discr.nnodes) @@ -169,16 +264,236 @@ def test_matrix_build(ctx_factory): abs_err = la.norm(res_mat - res_matvec, np.inf) rel_err = abs_err / la.norm(res_matvec, np.inf) - print(abs_err, rel_err) + print("AbsErr {:.5e} RelErr {:.5e}".format(abs_err, rel_err)) assert rel_err < 1e-13 +@pytest.mark.parametrize("ndim", [2, 3]) +@pytest.mark.parametrize("factor", [1.0, 0.6]) +@pytest.mark.parametrize("lpot_id", [1, 2]) +def test_p2p_block_builder(ctx_factory, factor, ndim, lpot_id, + visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # prevent cache explosion + from sympy.core.cache import clear_cache + clear_cache() + + target_order = 2 if ndim == 3 else 7 + qbx = _build_qbx_discr(queue, target_order=target_order, ndim=ndim) + op, u_sym, _ = _build_op(lpot_id, ndim=ndim) + index_set = _build_block_index(qbx.density_discr, factor=factor) + + from pytential.symbolic.execution import GeometryCollection + from pytential.symbolic.execution import _prepare_expr, _prepare_domains + places = GeometryCollection(qbx) + expr = _prepare_expr(places, op) + domains = _prepare_domains(1, places, None, DEFAULT_SOURCE) + + from pytential.symbolic.matrix import P2PMatrixBuilder + mbuilder = P2PMatrixBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[domains[0]], + dep_discr=places.get_discretization(domains[0]), + places=places, + context={}, + exclude_self=True) + mat = mbuilder(expr) + + from pytential.symbolic.matrix import FarFieldBlockBuilder + mbuilder = FarFieldBlockBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[domains[0]], + dep_discr=places.get_discretization(domains[0]), + places=places, + index_set=index_set, + context={}, + exclude_self=True) + blk = mbuilder(expr) + + index_set = index_set.get(queue) + if visualize and ndim == 2: + blk_full = np.zeros_like(mat) + mat_full = np.zeros_like(mat) + + for i in range(index_set.nblocks): + itgt, isrc = index_set.block_indices(i) + + blk_full[np.ix_(itgt, isrc)] = index_set.block_take(blk, i) + mat_full[np.ix_(itgt, isrc)] = index_set.take(mat, i) + + import matplotlib.pyplot as mp + _, (ax1, ax2) = mp.subplots(1, 2, + figsize=(10, 8), dpi=300, constrained_layout=True) + ax1.imshow(blk_full) + ax1.set_title('FarFieldBlockBuilder') + ax2.imshow(mat_full) + ax2.set_title('P2PMatrixBuilder') + mp.savefig("test_p2p_block_{}d_{:.1f}.png".format(ndim, factor)) + + assert _max_block_error(mat, blk, index_set) < 1.0e-14 + + +@pytest.mark.parametrize("factor", [1.0, 0.6]) +@pytest.mark.parametrize("ndim", [2, 3]) +@pytest.mark.parametrize("lpot_id", [1, 2]) +def test_qbx_block_builder(ctx_factory, factor, ndim, lpot_id, + visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # prevent cache explosion + from 
sympy.core.cache import clear_cache + clear_cache() + + target_order = 2 if ndim == 3 else 7 + qbx = _build_qbx_discr(queue, target_order=target_order, ndim=ndim) + op, u_sym, _ = _build_op(lpot_id, ndim=ndim) + + # NOTE: NearFieldBlockBuilder only does stage1/stage1 or stage2/stage2, + # so we need to hardcode the discr for MatrixBuilder too, since the + # defaults are different + where = (QBXSourceStage1(DEFAULT_SOURCE), QBXSourceStage1(DEFAULT_TARGET)) + + from pytential.symbolic.execution import GeometryCollection, _prepare_expr + places = GeometryCollection(qbx, auto_where=where) + expr = _prepare_expr(places, op) + density_discr = places.get_discretization(where[0]) + index_set = _build_block_index(density_discr, factor=factor) + + from pytential.symbolic.matrix import NearFieldBlockBuilder + mbuilder = NearFieldBlockBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[where[0]], + dep_discr=places.get_discretization(where[0]), + places=places, + index_set=index_set, + context={}) + blk = mbuilder(expr) + + from pytential.symbolic.matrix import MatrixBuilder + mbuilder = MatrixBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[where[0]], + dep_discr=places.get_discretization(where[0]), + places=places, + context={}) + mat = mbuilder(expr) + + index_set = index_set.get(queue) + if visualize: + blk_full = np.zeros_like(mat) + mat_full = np.zeros_like(mat) + + for i in range(index_set.nblocks): + itgt, isrc = index_set.block_indices(i) + + blk_full[np.ix_(itgt, isrc)] = index_set.block_take(blk, i) + mat_full[np.ix_(itgt, isrc)] = index_set.take(mat, i) + + import matplotlib.pyplot as mp + _, (ax1, ax2) = mp.subplots(1, 2, + figsize=(10, 8), constrained_layout=True) + ax1.imshow(mat_full) + ax1.set_title('MatrixBuilder') + ax2.imshow(blk_full) + ax2.set_title('NearFieldBlockBuilder') + mp.savefig("test_qbx_block_builder.png", dpi=300) + + assert _max_block_error(mat, blk, index_set) < 1.0e-14 + + +@pytest.mark.parametrize('place_id', + [(DEFAULT_SOURCE, DEFAULT_TARGET), + (QBXSourceStage1(DEFAULT_SOURCE), + QBXSourceStage1(DEFAULT_TARGET)), + (QBXSourceQuadStage2(DEFAULT_SOURCE), + QBXSourceQuadStage2(DEFAULT_TARGET))]) +def test_build_matrix_places(ctx_factory, place_id, visualize=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # prevent cache explosion + from sympy.core.cache import clear_cache + clear_cache() + + qbx = _build_qbx_discr(queue, nelements=8, target_order=2, ndim=2, + curve_f=partial(ellipse, 1.0)) + + qbx_forced_limit = -1 + op, u_sym, _ = _build_op(lpot_id=1, ndim=2, + qbx_forced_limit=qbx_forced_limit) + + from pytential.symbolic.execution import GeometryCollection + places = GeometryCollection(qbx, auto_where=place_id) + source_discr = places.get_discretization(place_id[0]) + target_discr = places.get_discretization(place_id[1]) + + index_set = _build_block_index(source_discr, factor=0.6) + + # build full QBX matrix + from pytential.symbolic.execution import build_matrix + qbx_mat = build_matrix(queue, qbx, op, u_sym, + auto_where=place_id, domains=place_id[0]) + qbx_mat = qbx_mat.get(queue) + + assert qbx_mat.shape == (target_discr.nnodes, source_discr.nnodes) + + # build full p2p matrix + from pytential.symbolic.execution import _prepare_expr + op = _prepare_expr(places, op) + + from pytential.symbolic.matrix import P2PMatrixBuilder + mbuilder = P2PMatrixBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[place_id[0]], + dep_discr=places.get_discretization(place_id[0]), + 
places=places, + context={}) + p2p_mat = mbuilder(op) + + assert p2p_mat.shape == (target_discr.nnodes, source_discr.nnodes) + + # build block qbx and p2p matrices + from pytential.symbolic.matrix import NearFieldBlockBuilder + mbuilder = NearFieldBlockBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[place_id[0]], + dep_discr=places.get_discretization(place_id[0]), + places=places, + index_set=index_set, + context={}) + mat = mbuilder(op) + if place_id[0] is not DEFAULT_SOURCE: + assert _max_block_error(qbx_mat, mat, index_set.get(queue)) < 1.0e-14 + + from pytential.symbolic.matrix import FarFieldBlockBuilder + mbuilder = FarFieldBlockBuilder(queue, + dep_expr=u_sym, + other_dep_exprs=[], + dep_source=places[place_id[0]], + dep_discr=places.get_discretization(place_id[0]), + places=places, + index_set=index_set, + context={}, + exclude_self=True) + mat = mbuilder(op) + assert _max_block_error(p2p_mat, mat, index_set.get(queue)) < 1.0e-14 + + if __name__ == "__main__": import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_maxwell.py b/test/test_maxwell.py index a3ceca6ef2c6dc18b65596e615d4d8aecdfc2ae9..52617e0bd709ce9cc46869fe58b40383bc33e0f5 100644 --- a/test/test_maxwell.py +++ b/test/test_maxwell.py @@ -213,6 +213,7 @@ class EHField(object): # {{{ driver +@pytest.mark.slowtest @pytest.mark.parametrize("case", [ #tc_int, tc_ext, @@ -227,8 +228,6 @@ def test_pec_mfie_extinction(ctx_getter, case, visualize=False): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - pytest.importorskip("pyfmmlib") - np.random.seed(12) knl_kwargs = {"k": case.k} @@ -267,7 +266,7 @@ def test_pec_mfie_extinction(ctx_getter, case, visualize=False): # point source return bind( (test_source, tgt), - get_sym_maxwell_point_source(mfie.kernel, j_sym, mfie.k) + get_sym_maxwell_point_source_em(mfie.kernel, j_sym, mfie.k) )(queue, j=src_j, k=case.k) pde_test_inc = EHField( @@ -393,7 +392,7 @@ def test_pec_mfie_extinction(ctx_getter, case, visualize=False): return norm(qbx, queue, f, p=np.inf) e_bc_residual = scat_norm(eh_bc_values[:3]) / scat_norm(inc_field_scat.e) - h_bc_residual = scat_norm(eh_bc_values[3]) / scat_norm(inc_field_scat.h) + h_bc_residual = scat_norm(eh_bc_values[3:]) / scat_norm(inc_field_scat.h) print("E/H PEC BC residuals:", h_max, e_bc_residual, h_bc_residual) @@ -417,7 +416,7 @@ def test_pec_mfie_extinction(ctx_getter, case, visualize=False): ("Hinc", inc_field_scat.h), ("bdry_normals", bdry_normals), ("e_bc_residual", eh_bc_values[:3]), - ("h_bc_residual", eh_bc_values[3]), + ("h_bc_residual", eh_bc_values[3:]), ]) fplot = make_field_plotter_from_bbox( @@ -506,7 +505,7 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) # vim: fdm=marker diff --git a/test/test_maxwell_dpie.py b/test/test_maxwell_dpie.py index 27d0a9d0d4cc8806e9e5f57bdb54bdee6fa074ba..987f574537b8afdcce8434c4400699983674429b 100644 --- a/test/test_maxwell_dpie.py +++ b/test/test_maxwell_dpie.py @@ -39,6 +39,17 @@ from sumpy.point_calculus import CalculusPatch, frequency_domain_maxwell from sumpy.tools import vector_from_device from pytential.target import PointsTarget from meshmode.mesh.processing import find_bounding_box +from pytools.convergence import EOCRecorder +from pytential.solve import gmres + +import pytential.symbolic.pde.maxwell as mw +import 
pytential.symbolic.pde.maxwell.dpie as mw_dpie +from pytential.qbx import QBXLayerPotentialSource +from meshmode.discretization import Discretization +from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory +from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder + import logging logger = logging.getLogger(__name__) @@ -112,7 +123,7 @@ class RoundedCubeTestCase(MaxwellTestCase): mesh = affine_map(mesh, b=np.array([-0.5, -0.5, -0.5])) mesh = affine_map(mesh, A=np.eye(3)*2) - # now centered at origin and extends to -1,1 + # now centered at origin and extends to -1, 1 # Flip elements--gmsh generates inside-out geometry. return perform_flips(mesh, np.ones(mesh.nelements)) @@ -158,7 +169,7 @@ class ElliptiPlaneTestCase(MaxwellTestCase): "-string", "Mesh.CharacteristicLengthMax = %g;" % resolution]) - # now centered at origin and extends to -1,1 + # now centered at origin and extends to -1, 1 # Flip elements--gmsh generates inside-out geometry. from meshmode.mesh.processing import perform_flips @@ -192,8 +203,8 @@ class ElliptiPlaneTestCase(MaxwellTestCase): tc_int = SphereTestCase(k=1.2, is_interior=True, resolutions=[0, 1], qbx_order=3, fmm_tolerance=1e-4) -tc_ext = SphereTestCase(k=1.2, is_interior=False, resolutions=[0], - qbx_order=7, fmm_tolerance=1e-4) +tc_ext = SphereTestCase(k=1.2, is_interior=False, resolutions=[0, 1], + qbx_order=3, fmm_tolerance=1e-4) tc_rc_ext = RoundedCubeTestCase(k=6.4, is_interior=False, resolutions=[0.1], qbx_order=3, fmm_tolerance=1e-4) @@ -216,51 +227,182 @@ class EHField(object): return self.field[3:] -# {{{ driver +# {{ test_dpie_auxiliary @pytest.mark.parametrize("case", [ #tc_int, tc_ext, ]) -def test_pec_dpie_extinction(ctx_getter, case, visualize=False): - """For (say) is_interior=False (the 'exterior' MFIE), this test verifies +def test_dpie_auxiliary(ctx_factory, case): + logging.basicConfig(level=logging.INFO) + + cl_ctx = ctx_factory() + queue = cl.CommandQueue(cl_ctx) + + pytest.importorskip("pyfmmlib") + + np.random.seed(12) + + knl_kwargs = {"k": case.k, "ik": 1j*case.k} + + geom_list = ["obj0"] + + dpie = mw_dpie.DPIEOperatorEvanescent(geometry_list=geom_list) + tau_densities = sym.make_sym_vector("tau_densities", dpie.num_distinct_objects()) + + calc_patch = CalculusPatch(np.array([-3, 0, 0]), h=0.01) + + # {{{ test the auxiliary problem + + # test that the aux problem is capable of computing the desired derivatives + # of an appropriate input field + + # define method to get locations to evaluate representation + def epsilon_off_boundary(where=None, epsilon=1e-4): + x = sym.nodes(3, where).as_vector() + return x + sym.normal(3, 2, where).as_vector()*epsilon + + # loop through the case's resolutions and compute the scattered field + # solution + + eoc_rec = EOCRecorder() + + for resolution in case.resolutions: + + # get the scattered and observation mesh + scat_mesh = case.get_mesh(resolution, case.target_order) + # observation_mesh = case.get_observation_mesh(case.target_order) + + # define the pre-scattered discretization + pre_scat_discr = Discretization( + cl_ctx, scat_mesh, + InterpolatoryQuadratureSimplexGroupFactory(case.target_order)) + + # use OpenCL random number generator to create a set of random + # source locations for various variables being solved for + dpie0 = mw_dpie.DPIEOperator(geometry_list=geom_list) + qbx0, _ = QBXLayerPotentialSource( + pre_scat_discr, fine_order=4*case.target_order, + #fmm_order=False, + qbx_order=case.qbx_order, + 
fmm_level_to_order=SimpleExpansionOrderFinder( + case.fmm_tolerance), + fmm_backend=case.fmm_backend + ).with_refinement(_expansion_disturbance_tolerance=0.05) + + # define the geometry dictionary + geom_map = { + "obj0": qbx0, + "obj0t": qbx0.density_discr, + "scat": qbx0.density_discr} + + # define points to evaluate the gradient at + tgt_n = PointsTarget(bind(geom_map, + epsilon_off_boundary(where='obj0', epsilon=1.0))(queue)) + geom_map['tgt'] = tgt_n + + # define the quantity that will have a derivative taken of it and + # its associated derivative + def getTestFunction(where=None): + z = sym.nodes(3, where).as_vector() + z2 = sym.cse(np.dot(z, z), "z_mag_squared") + g = sym.exp(1j*dpie0.k*sym.sqrt(z2))/(4.0*np.pi*sym.sqrt(z2)) + return g + + def getTestGradient(where=None): + z = sym.nodes(3, where).as_vector() + z2 = sym.cse(np.dot(z, z), "z_mag_squared") + grad_g = z*sym.exp(1j*dpie0.k*sym.sqrt(z2))*(1j*dpie.k + - 1.0/sym.sqrt(z2))/(4*np.pi*z2) + return grad_g + + # compute output gradient evaluated at the desired object + tgrad = bind(geom_map, getTestGradient(where="tgt"))(queue, **knl_kwargs) + test_func_d = vector_from_device(queue, tgrad) + + # define the problem that will be solved + test_tau_op = bind(geom_map, + dpie0.subproblem_operator(tau_densities=tau_densities)) + test_tau_rhs = bind(geom_map, + dpie0.subproblem_rhs_func(function=getTestFunction))(queue, + **knl_kwargs) + + # set GMRES settings for solving + gmres_settings = dict( + tol=case.gmres_tol, + progress=True, + hard_failure=True, + stall_iterations=50, no_progress_factor=1.05) + + subprob_result = gmres( + test_tau_op.scipy_op(queue, "tau_densities", np.complex128, + domains=dpie0.get_subproblem_domain_list(), + **knl_kwargs), + test_tau_rhs, **gmres_settings) + dummy_tau = subprob_result.solution + + # compute the error between the associated derivative quantities + tgrad = bind(geom_map, sym.grad(3, + dpie0.subproblem_rep(tau_densities=tau_densities, + target='tgt')))(queue, tau_densities=dummy_tau, + **knl_kwargs) + approx_d = vector_from_device(queue, tgrad) + err = ( + calc_patch.norm(test_func_d - approx_d, np.inf) + / calc_patch.norm(approx_d, np.inf)) + + # append error to the error list + eoc_rec.add_data_point(qbx0.h_max, err) + + print(eoc_rec) + + assert eoc_rec.order_estimate() >= case.qbx_order - 0.5 + + # }}} + +# }}} + + +@pytest.mark.parametrize("case", [ + #tc_int, + tc_ext, + ]) +def test_pec_dpie_extinction( + ctx_factory, case, + visualize=False, + test_representations=False, + test_operators=False, + ): + + """For (say) is_interior=False (the 'exterior' BVP), this test verifies extinction of the combined (incoming + scattered) field on the interior of the scatterer. 
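The convergence bookkeeping in test_dpie_auxiliary above follows the EOCRecorder protocol from pytools: record one (h_max, error) pair per resolution, then assert on the estimated convergence order. A minimal, self-contained sketch of that pattern, with illustrative (h, error) data standing in for the measured values (the real test records (qbx0.h_max, err) and compares against case.qbx_order - 0.5):

    from pytools.convergence import EOCRecorder

    eoc_rec = EOCRecorder()

    # illustrative data: error shrinking ~8x as h halves, i.e. third order
    for h, err in [(0.2, 1.0e-2), (0.1, 1.3e-3), (0.05, 1.6e-4)]:
        eoc_rec.add_data_point(h, err)

    print(eoc_rec)  # prints a convergence table, as the test above does
    assert eoc_rec.order_estimate() >= 3 - 0.5  # mirrors qbx_order - 0.5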
""" - # setup the basic config for logging logging.basicConfig(level=logging.INFO) - # setup the OpenCL context and queue - cl_ctx = ctx_getter() + cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) - # import or skip pyfmmlib pytest.importorskip("pyfmmlib") - # initialize the random seed np.random.seed(12) - # specify a dictionary with some useful arguments knl_kwargs = {"k": case.k, "ik": 1j*case.k} - # specify the list of geometry objects being used geom_list = ["obj0"] # {{{ come up with a solution to Maxwell's equations - # import some functionality from maxwell into this - # local scope environment - import pytential.symbolic.pde.maxwell as mw - import pytential.symbolic.pde.maxwell.dpie as mw_dpie - # initialize the DPIE operator based on the geometry list dpie = mw_dpie.DPIEOperatorEvanescent(geometry_list=geom_list) # specify some symbolic variables that will be used # in the process to solve integral equations for the DPIE - phi_densities = sym.make_sym_vector("phi_densities", dpie.num_scalar_potential_densities()) - A_densities = sym.make_sym_vector("A_densities", dpie.num_vector_potential_densities()) + phi_densities = sym.make_sym_vector("phi_densities", + dpie.num_scalar_potential_densities()) + A_densities = sym.make_sym_vector("A_densities", + dpie.num_vector_potential_densities()) tau_densities = sym.make_sym_vector("tau_densities", dpie.num_distinct_objects()) # get test source locations from the passed in case's queue @@ -270,45 +412,51 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): calc_patch = CalculusPatch(np.array([-3, 0, 0]), h=0.01) calc_patch_tgt = PointsTarget(cl.array.to_device(queue, calc_patch.points)) - # define a random number generator based on OpenCL - rng = cl.clrandom.PhiloxGenerator(cl_ctx, seed=12) - # define some parameters for the incident wave # direction for the wave - u_dir = np.array([1, 0, 0],dtype=np.complex128) + u_dir = np.array([1, 0, 0], dtype=np.complex128) # polarization vector - Ep = np.array([0, 1, 1],dtype=np.complex128) + Ep = np.array([0, 1, 1], dtype=np.complex128) # define symbolic vectors for use uvar = sym.make_sym_vector("u", 3) - Evar = sym.make_sym_vector("Ep",3) + Evar = sym.make_sym_vector("Ep", 3) - # define functions that can be used to generate incident fields for an input discretization + # define functions that can be used to generate incident fields for an + # input discretization # define potentials based on incident plane wave def get_incident_plane_wave_EHField(tgt): - return bind((test_source,tgt),mw.get_sym_maxwell_plane_wave(amplitude_vec=Evar, v=uvar, omega=dpie.k))(queue,u=u_dir,Ep=Ep,**knl_kwargs) + return bind((test_source, tgt), + mw.get_sym_maxwell_plane_wave(amplitude_vec=Evar, v=uvar, + omega=dpie.k))(queue, u=u_dir, Ep=Ep, **knl_kwargs) # get the gradphi_inc field evaluated at some source locations def get_incident_gradphi(objects, target=None): - return bind(objects,mw.get_sym_maxwell_planewave_gradphi(u=uvar, Ep=Evar, k=dpie.k,where=target))(queue,u=u_dir,Ep=Ep,**knl_kwargs) + return bind(objects, mw.get_sym_maxwell_planewave_gradphi(u=uvar, + Ep=Evar, k=dpie.k, where=target))(queue, u=u_dir, + Ep=Ep, **knl_kwargs) # get the incident plane wave div(A) def get_incident_divA(objects, target=None): - return bind(objects,mw.get_sym_maxwell_planewave_divA(u=uvar, Ep=Evar, k=dpie.k,where=target))(queue,u=u_dir,Ep=Ep,**knl_kwargs) + return bind(objects, mw.get_sym_maxwell_planewave_divA(u=uvar, Ep=Evar, + k=dpie.k, where=target))(queue, u=u_dir, Ep=Ep, **knl_kwargs) - # method to get 
vector potential and scalar potential for incident + # method to get vector potential and scalar potential for incident # E-M fields def get_incident_potentials(objects, target=None): - return bind(objects,mw.get_sym_maxwell_planewave_potentials(u=uvar, Ep=Evar, k=dpie.k,where=target))(queue,u=u_dir,Ep=Ep,**knl_kwargs) + return bind(objects, mw.get_sym_maxwell_planewave_potentials(u=uvar, + Ep=Evar, k=dpie.k, where=target))(queue, u=u_dir, + Ep=Ep, **knl_kwargs) # define a smooth function to represent the density - def dummy_density(omega = 1.0, where=None): + def dummy_density(omega=1.0, where=None): x = sym.nodes(3, where).as_vector() - return sym.sin(omega*sym.n_dot(x,where)) + return sym.sin(omega*sym.n_dot(x, where)) # get the Electromagnetic field evaluated at the target calculus patch - pde_test_inc = EHField(vector_from_device(queue, get_incident_plane_wave_EHField(calc_patch_tgt))) + pde_test_inc = EHField(vector_from_device(queue, + get_incident_plane_wave_EHField(calc_patch_tgt))) # compute residuals of incident field at source points source_maxwell_resids = [ @@ -323,128 +471,25 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): # }}} - - # {{{ Test the auxiliary problem is capable of computing the desired derivatives of an appropriate input field - test_auxiliary = False - - if test_auxiliary: - # import a bunch of stuff that will be useful - from pytools.convergence import EOCRecorder - from pytential.qbx import QBXLayerPotentialSource - from meshmode.discretization import Discretization - from meshmode.discretization.poly_element import \ - InterpolatoryQuadratureSimplexGroupFactory - from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder - - - # define method to get locations to evaluate representation - def epsilon_off_boundary(where=None, epsilon=1e-4): - x = sym.nodes(3, where).as_vector() - return x + sym.normal(3,2,where).as_vector()*epsilon - - # # loop through the case's resolutions and compute the scattered field solution - deriv_error = [] - for resolution in case.resolutions: - - # get the scattered and observation mesh - scat_mesh = case.get_mesh(resolution, case.target_order) - observation_mesh = case.get_observation_mesh(case.target_order) - - # define the pre-scattered discretization - pre_scat_discr = Discretization( - cl_ctx, scat_mesh, - InterpolatoryQuadratureSimplexGroupFactory(case.target_order)) - - # use OpenCL random number generator to create a set of random - # source locations for various variables being solved for - dpie0 = mw_dpie.DPIEOperator(geometry_list=geom_list) - qbx0, _ = QBXLayerPotentialSource( - pre_scat_discr, fine_order=4*case.target_order, - #fmm_order=False, - qbx_order=case.qbx_order, - fmm_level_to_order=SimpleExpansionOrderFinder(case.fmm_tolerance), - fmm_backend=case.fmm_backend - ).with_refinement(_expansion_disturbance_tolerance=0.05) - - # define the geometry dictionary - geom_map = {"obj0":qbx0, "obj0t":qbx0.density_discr, "scat":qbx0.density_discr} - - # define points to evaluate the gradient at - tgt_n = PointsTarget(bind(geom_map, epsilon_off_boundary(where='obj0',epsilon=1.0))(queue)) - geom_map['tgt'] = tgt_n - - # define the quantity that will have a derivative taken of it and its associated derivative - def getTestFunction(where=None): - z = sym.nodes(3, where).as_vector() - z2 = sym.cse(np.dot(z, z), "z_mag_squared") - g = sym.exp(1j*dpie0.k*sym.sqrt(z2))/(4.0*np.pi*sym.sqrt(z2)) - return g - - def getTestGradient(where=None): - z = sym.nodes(3, where).as_vector() - z2 = 
sym.cse(np.dot(z, z), "z_mag_squared") - grad_g = z*sym.exp(1j*dpie0.k*sym.sqrt(z2))*(1j*dpie.k - 1.0/sym.sqrt(z2))/(4*np.pi*z2) - return grad_g - - # compute output gradient evaluated at the desired object - tgrad = bind(geom_map,getTestGradient(where="tgt"))(queue,**knl_kwargs) - test_func_d = vector_from_device(queue,tgrad) - - # define the problem that will be solved - test_tau_op= bind(geom_map,dpie0.subproblem_operator(tau_densities=tau_densities)) - test_tau_rhs= bind(geom_map,dpie0.subproblem_rhs_func(function=getTestFunction))(queue,**knl_kwargs) - - # set GMRES settings for solving - gmres_settings = dict( - tol=case.gmres_tol, - progress=True, - hard_failure=True, - stall_iterations=50, no_progress_factor=1.05) - - # get the GMRES functionality - from pytential.solve import gmres - - subprob_result = gmres( - test_tau_op.scipy_op(queue, "tau_densities", np.complex128, domains=dpie0.get_subproblem_domain_list(), **knl_kwargs), - test_tau_rhs, **gmres_settings) - dummy_tau = subprob_result.solution - - # compute the error between the associated derivative quantities - tgrad = bind(geom_map,sym.grad(3,dpie0.subproblem_rep(tau_densities=tau_densities,target='tgt')))(queue,tau_densities=dummy_tau,**knl_kwargs) - approx_d = vector_from_device(queue,tgrad) - err = calc_patch.norm(test_func_d - approx_d, np.inf) - - # append error to the error list - deriv_error.append(err) - - print("Auxiliary Error Results:") - for n in range(0,len(deriv_error)): - print("Case {0}: {1}".format(n+1,deriv_error[n])) - - # }}} - - - # # {{{ Test the representations - test_representations = False + # {{{ test the representations if test_representations: # import a bunch of stuff that will be useful - from pytools.convergence import EOCRecorder from pytential.qbx import QBXLayerPotentialSource from meshmode.discretization import Discretization from meshmode.discretization.poly_element import \ InterpolatoryQuadratureSimplexGroupFactory from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder - - # # loop through the case's resolutions and compute the scattered field solution + # loop through the case's resolutions and compute the scattered field + # solution rep_error = [] for resolution in case.resolutions: # get the scattered and observation mesh - scat_mesh = case.get_mesh(resolution, case.target_order) - observation_mesh = case.get_observation_mesh(case.target_order) + scat_mesh = case.get_mesh(resolution, case.target_order) + # observation_mesh = case.get_observation_mesh(case.target_order) # define the pre-scattered discretization pre_scat_discr = Discretization( @@ -458,24 +503,36 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): pre_scat_discr, fine_order=4*case.target_order, #fmm_order=False, qbx_order=case.qbx_order, - fmm_level_to_order=SimpleExpansionOrderFinder(case.fmm_tolerance), + fmm_level_to_order=SimpleExpansionOrderFinder( + case.fmm_tolerance), fmm_backend=case.fmm_backend ).with_refinement(_expansion_disturbance_tolerance=0.05) # define the geometry dictionary - geom_map = {"obj0":qbx0, "obj0t":qbx0.density_discr, "scat":qbx0.density_discr} - dummy_phi = np.array([None]*dpie0.num_scalar_potential_densities(),dtype=dpie0.stype) - dummy_A = np.array([None]*dpie0.num_vector_potential_densities(),dtype=dpie0.stype) - v = rng.normal(queue, (qbx0.density_discr.nnodes,), dtype=np.float64) - s = 0*rng.normal(queue, (), dtype=np.float64) + geom_map = { + "obj0": qbx0, + "obj0t": qbx0.density_discr, + "scat": qbx0.density_discr + } + + dummy_phi = 
np.array([None]*dpie0.num_scalar_potential_densities(), + dtype=dpie0.stype) + dummy_A = np.array([None]*dpie0.num_vector_potential_densities(), + dtype=dpie0.stype) + + # v = rng.normal(queue, (qbx0.density_discr.nnodes,), dtype=np.float64) + # s = 0*rng.normal(queue, (), dtype=np.float64) n1 = len(dummy_phi) n2 = len(dummy_A) - for i in range(0,n1): - dummy_phi[i] = bind(geom_map,dummy_density(where='obj0'))(queue) - for i in range(0,n2): - dummy_A[i] = bind(geom_map,dummy_density(where='obj0'))(queue) - test_tau_op= bind(geom_map,dpie0.subproblem_operator(tau_densities=tau_densities)) - test_tau_rhs= bind(geom_map,dpie0.subproblem_rhs(A_densities=A_densities))(queue,A_densities=dummy_A,**knl_kwargs) + for i in range(0, n1): + dummy_phi[i] = bind(geom_map, dummy_density(where='obj0'))(queue) + for i in range(0, n2): + dummy_A[i] = bind(geom_map, dummy_density(where='obj0'))(queue) + test_tau_op = bind(geom_map, + dpie0.subproblem_operator(tau_densities=tau_densities)) + test_tau_rhs = bind(geom_map, + dpie0.subproblem_rhs(A_densities=A_densities))(queue, + A_densities=dummy_A, **knl_kwargs) # set GMRES settings for solving gmres_settings = dict( @@ -484,38 +541,42 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): hard_failure=True, stall_iterations=50, no_progress_factor=1.05) - # get the GMRES functionality - from pytential.solve import gmres - subprob_result = gmres( - test_tau_op.scipy_op(queue, "tau_densities", np.complex128, domains=dpie0.get_subproblem_domain_list(), **knl_kwargs), + test_tau_op.scipy_op(queue, "tau_densities", np.complex128, + domains=dpie0.get_subproblem_domain_list(), + **knl_kwargs), test_tau_rhs, **gmres_settings) - dummy_tau = subprob_result.solution + dummy_tau = subprob_result.solution - sym_repr0 = dpie.scattered_volume_field(phi_densities,A_densities,tau_densities,target='tgt') + sym_repr0 = dpie.scattered_volume_field(phi_densities, A_densities, + tau_densities, target='tgt') def eval_test_repr_at(tgt): map = geom_map map['tgt'] = tgt - return bind(map, sym_repr0)(queue, phi_densities=dummy_phi, A_densities=dummy_A, tau_densities=dummy_tau, **knl_kwargs) + return bind(map, sym_repr0)(queue, phi_densities=dummy_phi, + A_densities=dummy_A, tau_densities=dummy_tau, + **knl_kwargs) - pde_test_repr = EHField(vector_from_device(queue, eval_test_repr_at(calc_patch_tgt))) + pde_test_repr = EHField(vector_from_device(queue, + eval_test_repr_at(calc_patch_tgt))) maxwell_residuals = [ - calc_patch.norm(x, np.inf) / calc_patch.norm(pde_test_repr.e, np.inf) - for x in frequency_domain_maxwell(calc_patch, pde_test_repr.e, pde_test_repr.h, case.k)] + calc_patch.norm(x, np.inf) + / calc_patch.norm(pde_test_repr.e, np.inf) + for x in frequency_domain_maxwell(calc_patch, + pde_test_repr.e, pde_test_repr.h, case.k)] print("Maxwell residuals:", maxwell_residuals) rep_error.append(maxwell_residuals) print("Representation Error Results:") - for n in range(0,len(rep_error)): - print("Case {0}: {1}".format(n+1,rep_error[n])) + for n in range(0, len(rep_error)): + print("Case {0}: {1}".format(n+1, rep_error[n])) - # #}}} + # }}} + # {{{ test the operators - # # {{{ Test the operators - test_operators = False if test_operators: # define error array @@ -524,23 +585,22 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): # define method to get locations to evaluate representation def epsilon_off_boundary(where=None, epsilon=1e-4): x = sym.nodes(3, where).as_vector() - return x + sym.normal(3,2,where).as_vector()*epsilon + return x + sym.normal(3, 2, 
where).as_vector()*epsilon # import a bunch of stuff that will be useful - from pytools.convergence import EOCRecorder from pytential.qbx import QBXLayerPotentialSource from meshmode.discretization import Discretization from meshmode.discretization.poly_element import \ InterpolatoryQuadratureSimplexGroupFactory from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder - - # loop through the case's resolutions and compute the scattered field solution + # loop through the case's resolutions and compute the scattered field + # solution for resolution in case.resolutions: # get the scattered and observation mesh - scat_mesh = case.get_mesh(resolution, case.target_order) - observation_mesh = case.get_observation_mesh(case.target_order) + scat_mesh = case.get_mesh(resolution, case.target_order) + # observation_mesh = case.get_observation_mesh(case.target_order) # define the pre-scattered discretization pre_scat_discr = Discretization( @@ -554,18 +614,27 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): pre_scat_discr, fine_order=4*case.target_order, #fmm_order=False, qbx_order=case.qbx_order, - fmm_level_to_order=SimpleExpansionOrderFinder(case.fmm_tolerance), + fmm_level_to_order=SimpleExpansionOrderFinder( + case.fmm_tolerance), fmm_backend=case.fmm_backend ).with_refinement(_expansion_disturbance_tolerance=0.05) # define the geometry dictionary - geom_map = {"obj0":qbx0, "obj0t":qbx0.density_discr, "scat":qbx0.density_discr} - - # compute off-boundary locations that the representation will need to be evaluated at - tgt_n = PointsTarget(bind(geom_map, epsilon_off_boundary(where='obj0',epsilon=1e-4))(queue)) + geom_map = { + "obj0": qbx0, + "obj0t": qbx0.density_discr, + "scat": qbx0.density_discr + } + + # compute off-boundary locations that the representation will need + # to be evaluated at + tgt_n = PointsTarget( + bind(geom_map, + epsilon_off_boundary(where='obj0', epsilon=1e-4))(queue)) geom_map['tgt'] = tgt_n - # define a dummy density, specifically to be used for the vector potential A densities + # define a dummy density, specifically to be used for the vector + # potential A densities x, y, z = qbx0.density_discr.nodes().with_queue(queue) m = cl.clmath @@ -586,66 +655,99 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): ])) # redefine a_densities - #A_densities = sym.make_sym_vector("A_densities", dpie.num_vector_potential_densities2()) + #A_densities = sym.make_sym_vector("A_densities", dpie.num_vector_potential_densities2()) # noqa # init random dummy densities for the vector and scalar potentials - dummy_phi = np.array([None]*dpie0.num_scalar_potential_densities(),dtype=dpie0.stype) - dummy_A = np.array([None]*dpie0.num_vector_potential_densities(),dtype=dpie0.stype) - dummy_tau = np.array([None]*dpie0.num_distinct_objects(),dtype=dpie0.stype) - - # Compute zero scalar for use in extra constants that are usually solved for in operators + dummy_phi = np.array([None]*dpie0.num_scalar_potential_densities(), + dtype=dpie0.stype) + dummy_A = np.array([None]*dpie0.num_vector_potential_densities(), + dtype=dpie0.stype) + dummy_tau = np.array([None]*dpie0.num_distinct_objects(), + dtype=dpie0.stype) + + # compute zero scalar for use in extra constants that are usually + # solved for in operators n1 = len(dummy_phi) n2 = len(dummy_A) n3 = len(dummy_tau) - for i in range(0,n1): + for i in range(0, n1): if i < (n1-1): - dummy_phi[i] = bind(geom_map,dummy_density(where='obj0'))(queue) + dummy_phi[i] = bind(geom_map, 
dummy_density(where='obj0'))(queue) else: dummy_phi[i] = 0.0 - for i in range(0,n2): + for i in range(0, n2): if i < 2: dummy_A[i] = density[i] elif i < (n2-1): - dummy_A[i] = bind(geom_map,dummy_density(where='obj0'))(queue) + dummy_A[i] = bind(geom_map, dummy_density(where='obj0'))(queue) else: dummy_A[i] = 0.0 - for i in range(0,n3): - dummy_tau[i] = bind(geom_map,dummy_density(where='obj0'))(queue) + for i in range(0, n3): + dummy_tau[i] = bind(geom_map, dummy_density(where='obj0'))(queue) # check that the scalar density operator and representation are similar def vector_op_transform(vec_op_out): a = sym.tangential_to_xyz(vec_op_out[:2], where='obj0') - return sym.join_fields(a,vec_op_out[2:]) + return sym.join_fields(a, vec_op_out[2:]) scalar_op = dpie0.phi_operator(phi_densities=phi_densities) - #vector_op = vector_op_transform(dpie0.a_operator0(A_densities=A_densities)[:-1]) - vector_op = vector_op_transform(dpie0.a_operator(A_densities=A_densities)) + #vector_op = vector_op_transform(dpie0.a_operator0(A_densities=A_densities)[:-1]) # noqa + vector_op = vector_op_transform( + dpie0.a_operator(A_densities=A_densities)) #vector_op = dpie0.a_operator2(A_densities=A_densities)[:-1] tau_op = dpie0.subproblem_operator(tau_densities=tau_densities) # evaluate operators at the dummy densities - scalar_op_eval = vector_from_device(queue,bind(geom_map, scalar_op)(queue, phi_densities=dummy_phi, **knl_kwargs)) - vector_op_eval = vector_from_device(queue,bind(geom_map, vector_op)(queue, A_densities=dummy_A, **knl_kwargs)) - tau_op_eval = vector_from_device(queue,bind(geom_map, tau_op)(queue, tau_densities=dummy_tau, **knl_kwargs)) + scalar_op_eval = vector_from_device(queue, + bind(geom_map, scalar_op)( + queue, phi_densities=dummy_phi, **knl_kwargs)) + vector_op_eval = vector_from_device(queue, + bind(geom_map, vector_op)( + queue, A_densities=dummy_A, **knl_kwargs)) + tau_op_eval = vector_from_device(queue, + bind(geom_map, tau_op)( + queue, tau_densities=dummy_tau, **knl_kwargs)) # define the vector operator equivalent representations #def vec_op_repr(A_densities, target): - # return sym.join_fields(sym.n_cross(dpie0.vector_potential_rep0(A_densities=A_densities, target=target),where='obj0'), - # dpie0.div_vector_potential_rep0(A_densities=A_densities, target=target)/dpie0.k) + # return sym.join_fields(sym.n_cross(dpie0.vector_potential_rep0(A_densities=A_densities, target=target), where='obj0'), # noqa: E501 + # dpie0.div_vector_potential_rep0(A_densities=A_densities, target=target)/dpie0.k) # noqa: E501 def vec_op_repr(A_densities, target): - return sym.join_fields(sym.n_cross(dpie0.vector_potential_rep(A_densities=A_densities, target=target),where='obj0'), - dpie0.div_vector_potential_rep(A_densities=A_densities, target=target)/dpie0.k) - - scalar_rep_eval = vector_from_device(queue,bind(geom_map, dpie0.scalar_potential_rep(phi_densities=phi_densities, target='tgt'))(queue, phi_densities=dummy_phi, **knl_kwargs)) - vector_rep_eval = vector_from_device(queue,bind(geom_map, vec_op_repr(A_densities=A_densities,target='tgt'))(queue, A_densities=dummy_A, **knl_kwargs)) - tau_rep_eval = vector_from_device(queue,bind(geom_map, dpie0.subproblem_rep(tau_densities=tau_densities,target='tgt'))(queue, tau_densities=dummy_tau, **knl_kwargs)) - + return sym.join_fields( + sym.n_cross( + dpie0.vector_potential_rep( + A_densities=A_densities, + target=target), + where='obj0'), + dpie0.div_vector_potential_rep( + A_densities=A_densities, + target=target)/dpie0.k) + + scalar_rep_eval = 
vector_from_device(queue, bind(geom_map, + dpie0.scalar_potential_rep(phi_densities=phi_densities, + target='tgt'))(queue, phi_densities=dummy_phi, + **knl_kwargs)) + vector_rep_eval = vector_from_device(queue, bind(geom_map, + vec_op_repr(A_densities=A_densities, target='tgt'))(queue, + A_densities=dummy_A, **knl_kwargs)) + tau_rep_eval = vector_from_device(queue, bind(geom_map, + dpie0.subproblem_rep(tau_densities=tau_densities, + target='tgt'))(queue, tau_densities=dummy_tau, + **knl_kwargs)) + + axyz = sym.tangential_to_xyz(density_sym, where='obj0') - axyz = sym.tangential_to_xyz(density_sym,where='obj0') def nxcurlS0(qbx_forced_limit): - return sym.n_cross(sym.curl(dpie0.S(axyz.reshape(3,1),target='obj0t',qfl=qbx_forced_limit)),where='obj0') - test_op_err = vector_from_device(queue,bind(geom_map, 0.5*axyz + nxcurlS0("avg") - nxcurlS0(+1))(queue,density=density,**knl_kwargs)) + return sym.n_cross(sym.curl(dpie0.S(axyz.reshape(3, 1), + target='obj0t', qfl=qbx_forced_limit)), where='obj0') + + test_op_err = vector_from_device(queue, + bind( + geom_map, + 0.5*axyz + + nxcurlS0("avg") - nxcurlS0(+1) + )(queue, density=density, **knl_kwargs)) from sumpy.kernel import LaplaceKernel knl = LaplaceKernel(3) @@ -655,15 +757,21 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): def nxcurlS(qbx_forced_limit): - return sym.n_cross(sym.curl(sym.S( - knl, - sym.cse(sym.tangential_to_xyz(density_sym, where='obj0'), "jxyz"), - k=dpie0.k, - qbx_forced_limit=qbx_forced_limit,source='obj0', target='obj0t')),where='obj0') + return sym.n_cross( + sym.curl(sym.S( + knl, + sym.cse( + sym.tangential_to_xyz(density_sym, where='obj0'), + "jxyz"), + k=dpie0.k, + qbx_forced_limit=qbx_forced_limit, + source='obj0', target='obj0t')), + where='obj0') jump_identity_sym = ( nxcurlS(+1) - - (nxcurlS("avg") + 0.5*sym.tangential_to_xyz(density_sym,where='obj0'))) + - (nxcurlS("avg") + + 0.5*sym.tangential_to_xyz(density_sym, where='obj0'))) bound_jump_identity = bind(geom_map, jump_identity_sym) jump_identity = bound_jump_identity(queue, density=density, **knl_kwargs) @@ -671,35 +779,36 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): err = (norm(qbx0, queue, jump_identity, np.inf)) print("ERROR", qbx0.h_max, err) - # compute the error between the operator values and the representation values - def error_diff(u,v): - return np.linalg.norm(u-v,np.inf) - error_v = [error_diff(scalar_op_eval[0],scalar_rep_eval), - error_diff(vector_op_eval[0],vector_rep_eval[0]), - error_diff(vector_op_eval[1],vector_rep_eval[1]), - error_diff(vector_op_eval[2],vector_rep_eval[2]), - error_diff(vector_op_eval[3],vector_rep_eval[3]), - error_diff(tau_op_eval[0],tau_rep_eval), - np.linalg.norm(test_op_err[0],np.inf), - np.linalg.norm(test_op_err[1],np.inf), - np.linalg.norm(test_op_err[2],np.inf)] + # compute the error between the operator values and the + # representation values + + def error_diff(u, v): + return np.linalg.norm(u-v, np.inf) + error_v = [error_diff(scalar_op_eval[0], scalar_rep_eval), + error_diff(vector_op_eval[0], vector_rep_eval[0]), + error_diff(vector_op_eval[1], vector_rep_eval[1]), + error_diff(vector_op_eval[2], vector_rep_eval[2]), + error_diff(vector_op_eval[3], vector_rep_eval[3]), + error_diff(tau_op_eval[0], tau_rep_eval), + np.linalg.norm(test_op_err[0], np.inf), + np.linalg.norm(test_op_err[1], np.inf), + np.linalg.norm(test_op_err[2], np.inf)] op_error.append(error_v) # print the resulting error results print("Operator Error Results:") - for n in range(0,len(op_error)): - 
print("Case {0}: {1}".format(n+1,op_error[n])) + for n in range(0, len(op_error)): + print("Case {0}: {1}".format(n+1, op_error[n])) - # #}}} + # }}} + # {{{ solve for the scattered field - # {{{ Solve for the scattered field solve_scattered_field = True if solve_scattered_field: loc_sign = -1 if case.is_interior else +1 # import a bunch of stuff that will be useful - from pytools.convergence import EOCRecorder from pytential.qbx import QBXLayerPotentialSource from meshmode.discretization import Discretization from meshmode.discretization.poly_element import \ @@ -707,10 +816,8 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder # setup an EOC Recorder - eoc_rec_repr_maxwell = EOCRecorder() - eoc_pec_bc = EOCRecorder() - eoc_rec_e = EOCRecorder() - eoc_rec_h = EOCRecorder() + eoc_rec_repr_maxwell = EOCRecorder() + eoc_pec_bc = EOCRecorder() def frequency_domain_gauge_condition(cpatch, A, phi, k): # define constants used for the computation @@ -726,17 +833,18 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): # return the residual for the gauge condition return resid_gauge_cond - def gauge_check(divA,phi,k): + def gauge_check(divA, phi, k): return divA - 1j*k*phi - # loop through the case's resolutions and compute the scattered field solution + # loop through the case's resolutions and compute the scattered field + # solution gauge_err = [] maxwell_err = [] for resolution in case.resolutions: # get the scattered and observation mesh - scat_mesh = case.get_mesh(resolution, case.target_order) - observation_mesh = case.get_observation_mesh(case.target_order) + scat_mesh = case.get_mesh(resolution, case.target_order) + # observation_mesh = case.get_observation_mesh(case.target_order) # define the pre-scattered discretization pre_scat_discr = Discretization( @@ -748,21 +856,27 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): pre_scat_discr, fine_order=4*case.target_order, #fmm_order=False, qbx_order=case.qbx_order, - fmm_level_to_order=SimpleExpansionOrderFinder(case.fmm_tolerance), + fmm_level_to_order=SimpleExpansionOrderFinder( + case.fmm_tolerance), fmm_backend=case.fmm_backend ).with_refinement(_expansion_disturbance_tolerance=0.05) # define the geometry dictionary #geom_map = {"g0": qbx} - geom_map = {"obj0":qbx, "obj0t":qbx.density_discr, "scat":qbx.density_discr} + geom_map = { + "obj0": qbx, + "obj0t": qbx.density_discr, + "scat": qbx.density_discr + } # get the maximum mesh element edge length h_max = qbx.h_max # define the scattered and observation discretization - scat_discr = qbx.density_discr - obs_discr = Discretization(cl_ctx, observation_mesh, - InterpolatoryQuadratureSimplexGroupFactory(case.target_order)) + scat_discr = qbx.density_discr + # obs_discr = Discretization( + # cl_ctx, observation_mesh, + # InterpolatoryQuadratureSimplexGroupFactory(case.target_order)) # get the incident field at the scatter and observation locations #inc_EM_field_scat = EHField(eval_inc_field_at(scat_discr)) @@ -772,25 +886,34 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): # {{{ solve the system of integral equations inc_A = sym.make_sym_vector("inc_A", 3) - inc_phi = sym.make_sym_vector("inc_phi",1) - inc_divA = sym.make_sym_vector("inc_divA",1) + inc_phi = sym.make_sym_vector("inc_phi", 1) + inc_divA = sym.make_sym_vector("inc_divA", 1) inc_gradPhi = sym.make_sym_vector("inc_gradPhi", 3) # get the incident fields used for boundary conditions - (phi_inc, 
A_inc) = get_incident_potentials(geom_map,'scat') - inc_divA_scat = get_incident_divA(geom_map,'scat') - inc_gradPhi_scat = get_incident_gradphi(geom_map,'scat') + (phi_inc, A_inc) = get_incident_potentials(geom_map, 'scat') + inc_divA_scat = get_incident_divA(geom_map, 'scat') + inc_gradPhi_scat = get_incident_gradphi(geom_map, 'scat') # check that the boundary conditions satisfy gauge condition - resid = bind(geom_map,gauge_check(inc_divA, inc_phi, dpie.k))(queue,inc_divA=inc_divA_scat,inc_phi=phi_inc,**knl_kwargs) + # resid = bind(geom_map, gauge_check(inc_divA, inc_phi, + # dpie.k))(queue, inc_divA=inc_divA_scat, inc_phi=phi_inc, + # **knl_kwargs) # setup operators that will be solved - phi_op = bind(geom_map,dpie.phi_operator(phi_densities=phi_densities)) - A_op = bind(geom_map,dpie.a_operator(A_densities=A_densities)) + phi_op = bind(geom_map, dpie.phi_operator(phi_densities=phi_densities)) + A_op = bind(geom_map, dpie.a_operator(A_densities=A_densities)) - # setup the RHS with provided data so we can solve for density values across the domain - phi_rhs = bind(geom_map,dpie.phi_rhs(phi_inc=inc_phi,gradphi_inc=inc_gradPhi))(queue,inc_phi=phi_inc,inc_gradPhi=inc_gradPhi_scat,**knl_kwargs) - A_rhs = bind(geom_map,dpie.a_rhs(A_inc=inc_A,divA_inc=inc_divA))(queue,inc_A=A_inc,inc_divA=inc_divA_scat,**knl_kwargs) + # setup the RHS with provided data so we can solve for density + # values across the domain + + phi_rhs = bind(geom_map, dpie.phi_rhs(phi_inc=inc_phi, + gradphi_inc=inc_gradPhi))(queue, inc_phi=phi_inc, + inc_gradPhi=inc_gradPhi_scat, **knl_kwargs) + A_rhs = bind(geom_map, dpie.a_rhs(A_inc=inc_A, + divA_inc=inc_divA))( + queue, inc_A=A_inc, inc_divA=inc_divA_scat, + **knl_kwargs) # set GMRES settings for solving gmres_settings = dict( @@ -799,26 +922,32 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): hard_failure=True, stall_iterations=50, no_progress_factor=1.05) - # get the GMRES functionality - from pytential.solve import gmres - # solve for the scalar potential densities gmres_result = gmres( - phi_op.scipy_op(queue, "phi_densities", np.complex128, domains=dpie.get_scalar_domain_list(),**knl_kwargs), - phi_rhs, **gmres_settings) + phi_op.scipy_op(queue, "phi_densities", + np.complex128, + domains=dpie.get_scalar_domain_list(), + **knl_kwargs), + phi_rhs, **gmres_settings) phi_dens = gmres_result.solution # solve for the vector potential densities gmres_result = gmres( - A_op.scipy_op(queue, "A_densities", np.complex128, domains=dpie.get_vector_domain_list(), **knl_kwargs), + A_op.scipy_op(queue, "A_densities", np.complex128, + domains=dpie.get_vector_domain_list(), **knl_kwargs), A_rhs, **gmres_settings) A_dens = gmres_result.solution # solve sub problem for sigma densities - tau_op= bind(geom_map,dpie.subproblem_operator(tau_densities=tau_densities)) - tau_rhs= bind(geom_map,dpie.subproblem_rhs(A_densities=A_densities))(queue,A_densities=A_dens,**knl_kwargs) + tau_op = bind(geom_map, + dpie.subproblem_operator(tau_densities=tau_densities)) + tau_rhs = bind(geom_map, + dpie.subproblem_rhs(A_densities=A_densities))(queue, + A_densities=A_dens, **knl_kwargs) gmres_result = gmres( - tau_op.scipy_op(queue, "tau_densities", np.complex128, domains=dpie.get_subproblem_domain_list(), **knl_kwargs), + tau_op.scipy_op(queue, "tau_densities", np.complex128, + domains=dpie.get_subproblem_domain_list(), + **knl_kwargs), tau_rhs, **gmres_settings) tau_dens = gmres_result.solution @@ -826,32 +955,46 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): def 
eval_potentials(tgt): tmap = geom_map tmap['tgt'] = tgt - phi = vector_from_device(queue,bind(tmap, dpie.scalar_potential_rep(phi_densities=phi_densities,target='tgt'))(queue, phi_densities=phi_dens, **knl_kwargs)) - Axyz = vector_from_device(queue,bind(tmap, dpie.vector_potential_rep(A_densities=A_densities,target='tgt'))(queue, A_densities=A_dens, **knl_kwargs)) - return (phi,Axyz) - - (phi,A) = eval_potentials(calc_patch_tgt) - gauge_residual = frequency_domain_gauge_condition(calc_patch, A, phi, case.k) - err = calc_patch.norm(gauge_residual,np.inf) + phi = vector_from_device(queue, bind(tmap, + dpie.scalar_potential_rep(phi_densities=phi_densities, + target='tgt'))(queue, phi_densities=phi_dens, + **knl_kwargs)) + Axyz = vector_from_device(queue, bind(tmap, + dpie.vector_potential_rep(A_densities=A_densities, + target='tgt'))(queue, A_densities=A_dens, + **knl_kwargs)) + return (phi, Axyz) + + (phi, A) = eval_potentials(calc_patch_tgt) + gauge_residual = frequency_domain_gauge_condition( + calc_patch, A, phi, case.k) + err = calc_patch.norm(gauge_residual, np.inf) gauge_err.append(err) - # }}} # {{{ volume eval - sym_repr = dpie.scattered_volume_field(phi_densities,A_densities,tau_densities,target='tgt') + sym_repr = dpie.scattered_volume_field( + phi_densities, A_densities, tau_densities, target='tgt') def eval_repr_at(tgt): map = geom_map map['tgt'] = tgt - return bind(map, sym_repr)(queue, phi_densities=phi_dens, A_densities=A_dens, tau_densities=tau_dens, **knl_kwargs) + return bind(map, sym_repr)(queue, phi_densities=phi_dens, + A_densities=A_dens, tau_densities=tau_dens, + **knl_kwargs) - pde_test_repr = EHField(vector_from_device(queue, eval_repr_at(calc_patch_tgt))) + pde_test_repr = EHField(vector_from_device(queue, + eval_repr_at(calc_patch_tgt))) maxwell_residuals = [ - calc_patch.norm(x, np.inf) / calc_patch.norm(pde_test_repr.e, np.inf) - for x in frequency_domain_maxwell(calc_patch, pde_test_repr.e, pde_test_repr.h, case.k)] + calc_patch.norm(x, np.inf) + / calc_patch.norm(pde_test_repr.e, np.inf) + for x in frequency_domain_maxwell( + calc_patch, pde_test_repr.e, + pde_test_repr.h, case.k)] + print("Maxwell residuals:", maxwell_residuals) maxwell_err.append(maxwell_residuals) @@ -863,33 +1006,44 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): def scalar_pot_PEC_residual(phi, inc_phi, where=None): V = dpie.scalar_potential_constants(phi_densities=phi) - return dpie.scalar_potential_rep(phi_densities=phi,target=where, qfl=loc_sign) + inc_phi - V[0] + return dpie.scalar_potential_rep( + phi_densities=phi, target=where, qfl=loc_sign + ) + inc_phi - V[0] def vector_pot_PEC_residual(a_densities, inc_a, where=None): - return sym.n_cross( dpie.vector_potential_rep(A_densities=a_densities, target=where, qfl=loc_sign) + inc_a, where=where) + return sym.n_cross( + dpie.vector_potential_rep( + A_densities=a_densities, target=where, qfl=loc_sign) + + inc_a, where=where) - phi_pec_bc_resid = scalar_pot_PEC_residual(phi_densities, inc_phi, where="obj0") - A_pec_bc_resid = vector_pot_PEC_residual(A_densities, inc_A, where="obj0") + phi_pec_bc_resid = scalar_pot_PEC_residual( + phi_densities, inc_phi, where="obj0") + A_pec_bc_resid = vector_pot_PEC_residual( + A_densities, inc_A, where="obj0") - scalar_bc_values = bind(geom_map, phi_pec_bc_resid)(queue, phi_densities=phi_dens, inc_phi=phi_inc,**knl_kwargs) - vector_bc_values = bind(geom_map, A_pec_bc_resid)(queue, A_densities=A_dens, inc_A=A_inc,**knl_kwargs) + scalar_bc_values = bind(geom_map, phi_pec_bc_resid)( + 
queue, phi_densities=phi_dens, inc_phi=phi_inc, **knl_kwargs) + vector_bc_values = bind(geom_map, A_pec_bc_resid)( + queue, A_densities=A_dens, inc_A=A_inc, **knl_kwargs) def scat_norm(f): - return norm(qbx, queue, f, p=np.inf) + return norm(qbx, queue, f, p=np.inf) - scalar_bc_residual = scat_norm(scalar_bc_values) #/ scat_norm(phi_inc) - vector_bc_residual = scat_norm(vector_bc_values) #/ scat_norm(A_inc) + scalar_bc_residual = scat_norm(scalar_bc_values) # / scat_norm(phi_inc) + vector_bc_residual = scat_norm(vector_bc_values) # / scat_norm(A_inc) - print("Potential PEC BC residuals:", h_max, scalar_bc_residual, vector_bc_residual) + print( + "Potential PEC BC residuals:", + h_max, scalar_bc_residual, vector_bc_residual) - eoc_pec_bc.add_data_point(h_max, max(scalar_bc_residual, vector_bc_residual)) + eoc_pec_bc.add_data_point( + h_max, max(scalar_bc_residual, vector_bc_residual)) # }}} - # {{{ check if DPIE helmholtz BCs are satisfied - + # {{{ check if dpie helmholtz bcs are satisfied - #}}} + # }}} # {{{ visualization @@ -902,12 +1056,12 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [ ("phi", phi), - ("Axyz", Axyz), - ("Einc", inc_EM_field_scat.e), - ("Hinc", inc_EM_field_scat.h), + # ("Axyz", Axyz), + # ("Einc", inc_EM_field_scat.e), + # ("Hinc", inc_EM_field_scat.h), ("bdry_normals", bdry_normals), - ("e_bc_residual", eh_bc_values[:3]), - ("h_bc_residual", eh_bc_values[3]), + # ("e_bc_residual", eh_bc_values[:3]), + # ("h_bc_residual", eh_bc_values[3]), ]) fplot = make_field_plotter_from_bbox( @@ -931,16 +1085,16 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): fplot_repr = EHField(vector_from_device(queue, fplot_repr)) - fplot_inc = EHField( - vector_from_device(queue, eval_inc_field_at(fplot_tgt))) + # fplot_inc = EHField( + # vector_from_device(queue, eval_inc_field_at(fplot_tgt))) fplot.write_vtk_file( "potential-%s.vts" % resolution, [ ("E", fplot_repr.e), ("H", fplot_repr.h), - ("Einc", fplot_inc.e), - ("Hinc", fplot_inc.h), + # ("Einc", fplot_inc.e), + # ("Hinc", fplot_inc.h), ] ) @@ -991,6 +1145,8 @@ def test_pec_dpie_extinction(ctx_getter, case, visualize=False): assert good + # }}} + # }}} diff --git a/test/test_muller.py b/test/test_muller.py index fb23e911d32c1c720a3284004014cb83088d75d7..1516bb3c51ae9f6b7fba9e048dc15c330f828dcd 100644 --- a/test/test_muller.py +++ b/test/test_muller.py @@ -82,5 +82,5 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff --git a/test/test_scalar_int_eq.py b/test/test_scalar_int_eq.py index 2fd80990f33878d16accad59d180515b1749b936..72d08fcf3dcee46ebbdbd03f960c40eb1839992f 100644 --- a/test/test_scalar_int_eq.py +++ b/test/test_scalar_int_eq.py @@ -36,9 +36,10 @@ from functools import partial from meshmode.mesh.generation import ( # noqa ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, make_curve_mesh) -from sumpy.visualization import FieldPlotter +from meshmode.discretization.visualization import make_visualizer from sumpy.symbolic import USE_SYMENGINE from pytential import bind, sym +from pytential.qbx import QBXTargetAssociationFailedException import logging logger = logging.getLogger(__name__) @@ -63,19 +64,26 @@ def make_circular_point_group(ambient_dim, npoints, radius, # {{{ test cases class IntEqTestCase: - def __init__(self, helmholtz_k, bc_type, loc_sign): + def __init__(self, helmholtz_k, bc_type, prob_side): + 
""" + :arg prob_side: may be -1, +1, or ``'scat'`` for a scattering problem + """ + + if helmholtz_k is None: + helmholtz_k = self.default_helmholtz_k + self.helmholtz_k = helmholtz_k self.bc_type = bc_type - self.loc_sign = loc_sign + self.prob_side = prob_side @property def k(self): return self.helmholtz_k def __str__(self): - return ("name: %s, bc_type: %s, loc_sign: %s, " + return ("name: %s, bc_type: %s, prob_side: %s, " "helmholtz_k: %s, qbx_order: %d, target_order: %d" - % (self.name, self.bc_type, self.loc_sign, self.helmholtz_k, + % (self.name, self.bc_type, self.prob_side, self.helmholtz_k, self.qbx_order, self.target_order)) fmm_backend = "sumpy" @@ -83,7 +91,7 @@ class IntEqTestCase: class CurveIntEqTestCase(IntEqTestCase): - resolutions = [30, 40, 50] + resolutions = [40, 50, 60] def get_mesh(self, resolution, target_order): return make_curve_mesh( @@ -140,6 +148,7 @@ class EllipsoidIntEqTestCase(Helmholtz3DIntEqTestCase): # Flip elements--gmsh generates inside-out geometry. return perform_flips(mesh, np.ones(mesh.nelements)) + qbx_order = 5 fmm_order = 13 inner_radius = 0.4 @@ -248,7 +257,7 @@ class ManyEllipsoidIntEqTestCase(Helmholtz3DIntEqTestCase): class ElliptiplaneIntEqTestCase(IntEqTestCase): name = "elliptiplane" - resolutions = [0.2] + resolutions = [0.1] fmm_backend = "fmmlib" use_refinement = True @@ -263,6 +272,10 @@ class ElliptiplaneIntEqTestCase(IntEqTestCase): # kidding? gmres_tol = 1e-5 + # to match the scheme given in the GIGAQBX3D paper + box_extent_norm = "l2" + from_sep_smaller_crit = "static_l2" + def get_mesh(self, resolution, target_order): from pytools import download_from_web_if_not_present @@ -284,7 +297,111 @@ class ElliptiplaneIntEqTestCase(IntEqTestCase): return perform_flips(mesh, np.ones(mesh.nelements)) inner_radius = 0.2 - outer_radius = 12 + outer_radius = 12 # was '-13' in some large-scale run (?) + + +class BetterplaneIntEqTestCase(IntEqTestCase): + name = "betterplane" + + default_helmholtz_k = 20 + resolutions = [0.2] + # refine_on_helmholtz_k = False + + fmm_backend = "fmmlib" + use_refinement = True + + qbx_order = 3 + fmm_tol = 1e-4 + target_order = 6 + check_gradient = False + check_tangential_deriv = False + + visualize_geometry = True + + #scaled_max_curvature_threshold = 1 + expansion_disturbance_tolerance = 0.3 + + # We're only expecting three digits based on FMM settings. Who are we + # kidding? 
+ gmres_tol = 1e-5 + + vis_grid_spacing = (0.025, 0.2, 0.025) + vis_extend_factor = 0.2 + + def get_mesh(self, resolution, target_order): + from pytools import download_from_web_if_not_present + + download_from_web_if_not_present( + "https://raw.githubusercontent.com/inducer/geometries/a869fc3/" + "surface-3d/betterplane.brep") + + from meshmode.mesh.io import generate_gmsh, ScriptWithFilesSource + mesh = generate_gmsh( + ScriptWithFilesSource(""" + Merge "betterplane.brep"; + + Mesh.CharacteristicLengthMax = %(lcmax)f; + Mesh.ElementOrder = 2; + Mesh.CharacteristicLengthExtendFromBoundary = 0; + + // 2D mesh optimization + // Mesh.Lloyd = 1; + + l_superfine() = Unique(Abs(Boundary{ Surface{ + 27, 25, 17, 13, 18 }; })); + l_fine() = Unique(Abs(Boundary{ Surface{ 2, 6, 7}; })); + l_coarse() = Unique(Abs(Boundary{ Surface{ 14, 16 }; })); + + // p() = Unique(Abs(Boundary{ Line{l_fine()}; })); + // Characteristic Length{p()} = 0.05; + + Field[1] = Attractor; + Field[1].NNodesByEdge = 100; + Field[1].EdgesList = {l_superfine()}; + + Field[2] = Threshold; + Field[2].IField = 1; + Field[2].LcMin = 0.075; + Field[2].LcMax = %(lcmax)f; + Field[2].DistMin = 0.1; + Field[2].DistMax = 0.4; + + Field[3] = Attractor; + Field[3].NNodesByEdge = 100; + Field[3].EdgesList = {l_fine()}; + + Field[4] = Threshold; + Field[4].IField = 3; + Field[4].LcMin = 0.1; + Field[4].LcMax = %(lcmax)f; + Field[4].DistMin = 0.15; + Field[4].DistMax = 0.4; + + Field[5] = Attractor; + Field[5].NNodesByEdge = 100; + Field[5].EdgesList = {l_coarse()}; + + Field[6] = Threshold; + Field[6].IField = 5; + Field[6].LcMin = 0.15; + Field[6].LcMax = %(lcmax)f; + Field[6].DistMin = 0.2; + Field[6].DistMax = 0.4; + + Field[7] = Min; + Field[7].FieldsList = {2, 4, 6}; + + Background Field = 7; + """ % { + "lcmax": resolution, + }, ["betterplane.brep"]), 2) + + # Flip elements--gmsh generates inside-out geometry. 
+ from meshmode.mesh.processing import perform_flips + return perform_flips(mesh, np.ones(mesh.nelements)) + + inner_radius = 0.2 + outer_radius = 15 # }}} @@ -322,19 +439,53 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): qbx_lpot_kwargs["fmm_order"] = case.qbx_order + 5 qbx = QBXLayerPotentialSource( - pre_density_discr, fine_order=source_order, qbx_order=case.qbx_order, + pre_density_discr, + fine_order=source_order, + qbx_order=case.qbx_order, + + _box_extent_norm=getattr(case, "box_extent_norm", None), + _from_sep_smaller_crit=getattr(case, "from_sep_smaller_crit", None), + _from_sep_smaller_min_nsources_cumul=30, fmm_backend=case.fmm_backend, **qbx_lpot_kwargs) if case.use_refinement: - if case.k != 0: + if case.k != 0 and getattr(case, "refine_on_helmholtz_k", True): refiner_extra_kwargs["kernel_length_scale"] = 5/case.k + if hasattr(case, "scaled_max_curvature_threshold"): + refiner_extra_kwargs["_scaled_max_curvature_threshold"] = \ + case.scaled_max_curvature_threshold + + if hasattr(case, "expansion_disturbance_tolerance"): + refiner_extra_kwargs["_expansion_disturbance_tolerance"] = \ + case.expansion_disturbance_tolerance + + if hasattr(case, "refinement_maxiter"): + refiner_extra_kwargs["maxiter"] = case.refinement_maxiter + + #refiner_extra_kwargs["visualize"] = True + print("%d elements before refinement" % pre_density_discr.mesh.nelements) qbx, _ = qbx.with_refinement(**refiner_extra_kwargs) - print("%d elements after refinement" % qbx.density_discr.mesh.nelements) + print("%d stage-1 elements after refinement" + % qbx.density_discr.mesh.nelements) + print("%d stage-2 elements after refinement" + % qbx.stage2_density_discr.mesh.nelements) + print("quad stage-2 elements have %d nodes" + % qbx.quad_stage2_density_discr.groups[0].nunit_nodes) density_discr = qbx.density_discr + if hasattr(case, "visualize_geometry") and case.visualize_geometry: + bdry_normals = bind( + density_discr, sym.normal(mesh.ambient_dim) + )(queue).as_vector(dtype=object) + + bdry_vis = make_visualizer(queue, density_discr, case.target_order) + bdry_vis.write_vtk_file("geometry.vtu", [ + ("normals", bdry_normals) + ]) + # {{{ plot geometry if 0: @@ -349,7 +500,6 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): pt.show() elif mesh.ambient_dim == 3: - from meshmode.discretization.visualization import make_visualizer bdry_vis = make_visualizer(queue, density_discr, case.target_order+3) bdry_normals = bind(density_discr, sym.normal(3))(queue)\ @@ -385,11 +535,13 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): else: dtype = np.float64 + loc_sign = +1 if case.prob_side in [+1, "scat"] else -1 + if case.bc_type == "dirichlet": - op = DirichletOperator(knl, case.loc_sign, use_l2_weighting=False, + op = DirichletOperator(knl, loc_sign, use_l2_weighting=True, kernel_arguments=knl_kwargs) elif case.bc_type == "neumann": - op = NeumannOperator(knl, case.loc_sign, use_l2_weighting=False, + op = NeumannOperator(knl, loc_sign, use_l2_weighting=True, use_improved_operator=False, kernel_arguments=knl_kwargs) else: assert False @@ -400,12 +552,17 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): # {{{ set up test data - if case.loc_sign < 0: + if case.prob_side == -1: test_src_geo_radius = case.outer_radius test_tgt_geo_radius = case.inner_radius - else: + elif case.prob_side == +1: test_src_geo_radius = case.inner_radius test_tgt_geo_radius = case.outer_radius + elif case.prob_side == "scat": + test_src_geo_radius = case.outer_radius + 
test_tgt_geo_radius = case.outer_radius + else: + raise ValueError("unknown problem_side") point_sources = make_circular_point_group( mesh.ambient_dim, 10, test_src_geo_radius, @@ -456,16 +613,25 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): rhs = bind(density_discr, op.prepare_rhs(sym.var("bc")))(queue, bc=bc) - from pytential.solve import gmres - gmres_result = gmres( - bound_op.scipy_op(queue, "u", dtype, **concrete_knl_kwargs), - rhs, - tol=case.gmres_tol, - progress=True, - hard_failure=True) + try: + from pytential.solve import gmres + gmres_result = gmres( + bound_op.scipy_op(queue, "u", dtype, **concrete_knl_kwargs), + rhs, + tol=case.gmres_tol, + progress=True, + hard_failure=True, + stall_iterations=50, no_progress_factor=1.05) + except QBXTargetAssociationFailedException as e: + bdry_vis = make_visualizer(queue, density_discr, case.target_order+3) + + bdry_vis.write_vtk_file("failed-targets-%s.vtu" % resolution, [ + ("failed_targets", e.failed_target_flags), + ]) + raise print("gmres state:", gmres_result.state) - u = gmres_result.solution + weighted_u = gmres_result.solution # }}} @@ -486,41 +652,46 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): # }}} - # {{{ error check + if case.prob_side != "scat": + # {{{ error check - points_target = PointsTarget(test_targets) - bound_tgt_op = bind((qbx, points_target), - op.representation(sym.var("u"))) + points_target = PointsTarget(test_targets) + bound_tgt_op = bind((qbx, points_target), + op.representation(sym.var("u"))) - test_via_bdry = bound_tgt_op(queue, u=u, k=case.k) + test_via_bdry = bound_tgt_op(queue, u=weighted_u, k=case.k) - err = test_direct-test_via_bdry + err = test_via_bdry - test_direct - err = err.get() - test_direct = test_direct.get() - test_via_bdry = test_via_bdry.get() + err = err.get() + test_direct = test_direct.get() + test_via_bdry = test_via_bdry.get() - # {{{ remove effect of net source charge + # {{{ remove effect of net source charge - if case.k == 0 and case.bc_type == "neumann" and case.loc_sign == -1: + if case.k == 0 and case.bc_type == "neumann" and loc_sign == -1: - # remove constant offset in interior Laplace Neumann error - tgt_ones = np.ones_like(test_direct) - tgt_ones = tgt_ones/la.norm(tgt_ones) - err = err - np.vdot(tgt_ones, err)*tgt_ones + # remove constant offset in interior Laplace Neumann error + tgt_ones = np.ones_like(test_direct) + tgt_ones = tgt_ones/la.norm(tgt_ones) + err = err - np.vdot(tgt_ones, err)*tgt_ones - # }}} + # }}} - rel_err_2 = la.norm(err)/la.norm(test_direct) - rel_err_inf = la.norm(err, np.inf)/la.norm(test_direct, np.inf) + rel_err_2 = la.norm(err)/la.norm(test_direct) + rel_err_inf = la.norm(err, np.inf)/la.norm(test_direct, np.inf) - # }}} + # }}} - print("rel_err_2: %g rel_err_inf: %g" % (rel_err_2, rel_err_inf)) + print("rel_err_2: %g rel_err_inf: %g" % (rel_err_2, rel_err_inf)) + + else: + rel_err_2 = None + rel_err_inf = None # {{{ test gradient - if case.check_gradient: + if case.check_gradient and case.prob_side != "scat": bound_grad_op = bind((qbx, points_target), op.representation( sym.var("u"), @@ -530,7 +701,7 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): #print(bound_t_deriv_op.code) grad_from_src = bound_grad_op( - queue, u=u, **concrete_knl_kwargs) + queue, u=weighted_u, **concrete_knl_kwargs) grad_ref = (bind( (point_source, points_target), @@ -542,25 +713,25 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): rel_grad_err_inf = ( la.norm(grad_err[0].get(), np.inf) - / 
-                la.norm(grad_ref[0].get(), np.inf))
+                / la.norm(grad_ref[0].get(), np.inf))

         print("rel_grad_err_inf: %g" % rel_grad_err_inf)

     # }}}

+
     # {{{ test tangential derivative

-    if case.check_tangential_deriv:
+    if case.check_tangential_deriv and case.prob_side != "scat":
         bound_t_deriv_op = bind(qbx,
                 op.representation(
                     sym.var("u"),
                     map_potentials=lambda pot: sym.tangential_derivative(2, pot),
-                    qbx_forced_limit=case.loc_sign))
+                    qbx_forced_limit=loc_sign))

         #print(bound_t_deriv_op.code)

         tang_deriv_from_src = bound_t_deriv_op(
-                queue, u=u, **concrete_knl_kwargs).as_scalar().get()
+                queue, u=weighted_u, **concrete_knl_kwargs).as_scalar().get()

         tang_deriv_ref = (bind(
                 (point_source, density_discr),
@@ -584,36 +755,46 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize):

     # }}}

-    # {{{ 3D plotting
+    # {{{ any-D file plotting

-    if visualize and qbx.ambient_dim == 3:
-        from meshmode.discretization.visualization import make_visualizer
+    if visualize:
         bdry_vis = make_visualizer(queue, density_discr, case.target_order+3)

         bdry_normals = bind(density_discr, sym.normal(qbx.ambient_dim))(queue)\
                 .as_vector(dtype=object)

+        sym_sqrt_j = sym.sqrt_jac_q_weight(density_discr.ambient_dim)
+        u = bind(density_discr, sym.var("u")/sym_sqrt_j)(queue, u=weighted_u)
+
         bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [
             ("u", u),
             ("bc", bc),
-            ("bdry_normals", bdry_normals),
+            #("bdry_normals", bdry_normals),
             ])

         from sumpy.visualization import make_field_plotter_from_bbox  # noqa
         from meshmode.mesh.processing import find_bounding_box

+        vis_grid_spacing = (0.1, 0.1, 0.1)[:qbx.ambient_dim]
+        if hasattr(case, "vis_grid_spacing"):
+            vis_grid_spacing = case.vis_grid_spacing
+        vis_extend_factor = 0.2
+        if hasattr(case, "vis_extend_factor"):
+            vis_extend_factor = case.vis_extend_factor
+
         fplot = make_field_plotter_from_bbox(
-                find_bounding_box(mesh), h=(0.025, 0.025, 0.15)[:qbx.ambient_dim])
+                find_bounding_box(mesh),
+                h=vis_grid_spacing,
+                extend_factor=vis_extend_factor)

         qbx_tgt_tol = qbx.copy(target_association_tolerance=0.15)

         from pytential.target import PointsTarget
-        from pytential.qbx import QBXTargetAssociationFailedException

         try:
             solved_pot = bind(
                     (qbx_tgt_tol, PointsTarget(fplot.points)),
                     op.representation(sym.var("u"))
-                    )(queue, u=u, k=case.k)
+                    )(queue, u=weighted_u, k=case.k)
         except QBXTargetAssociationFailedException as e:
             fplot.write_vtk_file(
                     "failed-targets.vts",
@@ -622,119 +803,43 @@
                     ])
             raise

+        from sumpy.kernel import LaplaceKernel
+        ones_density = density_discr.zeros(queue)
+        ones_density.fill(1)
+        indicator = bind(
+                (qbx_tgt_tol, PointsTarget(fplot.points)),
+                -sym.D(LaplaceKernel(density_discr.ambient_dim),
+                    sym.var("sigma"),
+                    qbx_forced_limit=None))(
+                queue, sigma=ones_density).get()
+
         solved_pot = solved_pot.get()
         true_pot = bind((point_source, PointsTarget(fplot.points)), pot_src)(
                 queue, charges=source_charges_dev, **concrete_knl_kwargs).get()

         #fplot.show_scalar_in_mayavi(solved_pot.real, max_val=5)
-        fplot.write_vtk_file(
-                "potential-%s.vts" % resolution,
-                [
-                    ("solved_pot", solved_pot),
-                    ("true_pot", true_pot),
-                    ("pot_diff", solved_pot-true_pot),
-                    ]
-                )
+        if case.prob_side == "scat":
+            fplot.write_vtk_file(
+                    "potential-%s.vts" % resolution,
+                    [
+                        ("pot_scattered", solved_pot),
+                        ("pot_incoming", -true_pot),
+                        ("indicator", indicator),
+                        ]
+                    )
+        else:
+            fplot.write_vtk_file(
+                    "potential-%s.vts" % resolution,
+                    [
+                        ("solved_pot", solved_pot),
+                        ("true_pot", true_pot),
+                        ("indicator", indicator),
+                        ]
+                    )

     # }}}

-
# {{{ 2D plotting - - if 0: - fplot = FieldPlotter(np.zeros(2), - extent=1.25*2*max(test_src_geo_radius, test_tgt_geo_radius), - npoints=200) - - #pt.plot(u) - #pt.show() - - fld_from_src = bind((point_source, PointsTarget(fplot.points)), - pot_src)(queue, charges=source_charges_dev, **concrete_knl_kwargs) - fld_from_bdry = bind( - (qbx, PointsTarget(fplot.points)), - op.representation(sym.var("u")) - )(queue, u=u, k=case.k) - fld_from_src = fld_from_src.get() - fld_from_bdry = fld_from_bdry.get() - - nodes = density_discr.nodes().get(queue=queue) - - def prep(): - pt.plot(point_sources[0], point_sources[1], "o", - label="Monopole 'Point Charges'") - pt.plot(test_targets[0], test_targets[1], "v", - label="Observation Points") - pt.plot(nodes[0], nodes[1], "k-", - label=r"$\Gamma$") - - from matplotlib.cm import get_cmap - cmap = get_cmap() - cmap._init() - if 0: - cmap._lut[(cmap.N*99)//100:, -1] = 0 # make last percent transparent? - - prep() - if 1: - pt.subplot(131) - pt.title("Field error (loc_sign=%s)" % case.loc_sign) - log_err = np.log10(1e-20+np.abs(fld_from_src-fld_from_bdry)) - log_err = np.minimum(-3, log_err) - fplot.show_scalar_in_matplotlib(log_err, cmap=cmap) - - #from matplotlib.colors import Normalize - #im.set_norm(Normalize(vmin=-6, vmax=1)) - - cb = pt.colorbar(shrink=0.9) - cb.set_label(r"$\log_{10}(\mathdefault{Error})$") - - if 1: - pt.subplot(132) - prep() - pt.title("Source Field") - fplot.show_scalar_in_matplotlib( - fld_from_src.real, max_val=3) - - pt.colorbar(shrink=0.9) - if 1: - pt.subplot(133) - prep() - pt.title("Solved Field") - fplot.show_scalar_in_matplotlib( - fld_from_bdry.real, max_val=3) - - pt.colorbar(shrink=0.9) - - # total field - #fplot.show_scalar_in_matplotlib( - #fld_from_src.real+fld_from_bdry.real, max_val=0.1) - - #pt.colorbar() - - pt.legend(loc="best", prop=dict(size=15)) - from matplotlib.ticker import NullFormatter - pt.gca().xaxis.set_major_formatter(NullFormatter()) - pt.gca().yaxis.set_major_formatter(NullFormatter()) - - pt.gca().set_aspect("equal") - - if 0: - border_factor_top = 0.9 - border_factor = 0.3 - - xl, xh = pt.xlim() - xhsize = 0.5*(xh-xl) - pt.xlim(xl-border_factor*xhsize, xh+border_factor*xhsize) - - yl, yh = pt.ylim() - yhsize = 0.5*(yh-yl) - pt.ylim(yl-border_factor_top*yhsize, yh+border_factor*yhsize) - - #pt.savefig("helmholtz.pdf", dpi=600) - pt.show() - - # }}} - class Result(Record): pass @@ -752,10 +857,10 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): @pytest.mark.parametrize("case", [ EllipseIntEqTestCase(helmholtz_k=helmholtz_k, bc_type=bc_type, - loc_sign=loc_sign) + prob_side=prob_side) for helmholtz_k in [0, 1.2] for bc_type in ["dirichlet", "neumann"] - for loc_sign in [-1, +1] + for prob_side in [-1, +1] ]) # Sample test run: # 'test_integral_equation(cl._csc, EllipseIntEqTestCase(0, "dirichlet", +1), visualize=True)' # noqa: E501 @@ -765,9 +870,6 @@ def test_integral_equation(ctx_getter, case, visualize=False): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - if case.fmm_backend == "fmmlib": - pytest.importorskip("pyfmmlib") - if USE_SYMENGINE and case.fmm_backend is None: pytest.skip("https://gitlab.tiker.net/inducer/sumpy/issues/25") @@ -781,11 +883,14 @@ def test_integral_equation(ctx_getter, case, visualize=False): eoc_rec_target = EOCRecorder() eoc_rec_td = EOCRecorder() + have_error_data = False for resolution in case.resolutions: result = run_int_eq_test(cl_ctx, queue, case, resolution, visualize=visualize) - eoc_rec_target.add_data_point(result.h_max, 
+        if result.rel_err_2 is not None:
+            have_error_data = True
+            eoc_rec_target.add_data_point(result.h_max, result.rel_err_2)
 
         if result.rel_td_err_inf is not None:
             eoc_rec_td.add_data_point(result.h_max, result.rel_td_err_inf)
@@ -797,14 +902,15 @@ def test_integral_equation(ctx_getter, case, visualize=False):
     else:
         assert False
 
-    print("TARGET ERROR:")
-    print(eoc_rec_target)
-    assert eoc_rec_target.order_estimate() > tgt_order - 1.3
+    if have_error_data:
+        print("TARGET ERROR:")
+        print(eoc_rec_target)
+        assert eoc_rec_target.order_estimate() > tgt_order - 1.3
 
-    if case.check_tangential_deriv:
-        print("TANGENTIAL DERIVATIVE ERROR:")
-        print(eoc_rec_td)
-        assert eoc_rec_td.order_estimate() > tgt_order - 2.3
+        if case.check_tangential_deriv:
+            print("TANGENTIAL DERIVATIVE ERROR:")
+            print(eoc_rec_td)
+            assert eoc_rec_td.order_estimate() > tgt_order - 2.3
 
 # }}}
@@ -817,7 +923,7 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        from py.test.cmdline import main
+        from pytest import main
         main([__file__])
 
 # vim: fdm=marker
diff --git a/test/test_stokes.py b/test/test_stokes.py
index 1b85080fb5343c0364d4851f28e92ca11ad715a0..d8d5c821b775ccc3e8057d7f399cfc79452e5bd9 100644
--- a/test/test_stokes.py
+++ b/test/test_stokes.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 import numpy as np
 import pyopencl as cl
 import pyopencl.clmath  # noqa
+import pytest
 
 from meshmode.discretization import Discretization
 from meshmode.discretization.poly_element import \
@@ -270,6 +271,7 @@ def run_exterior_stokes_2d(ctx_factory, nelements,
     return qbx.h_max, l2_err
 
 
+@pytest.mark.slowtest
 def test_exterior_stokes_2d(ctx_factory, qbx_order=3):
     from pytools.convergence import EOCRecorder
     eoc_rec = EOCRecorder()
@@ -290,7 +292,7 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        from py.test.cmdline import main
+        from pytest import main
         main([__file__])
 
 # vim: fdm=marker
diff --git a/test/test_symbolic.py b/test/test_symbolic.py
index 6894e15e1a5be78324fcae96edabc7f4d32f8420..2f5633d34025210c9e37e761db1596a0201470ce 100644
--- a/test/test_symbolic.py
+++ b/test/test_symbolic.py
@@ -169,15 +169,45 @@ def test_tangential_onb(ctx_factory):
 # }}}
 
 
+# {{{ test_expr_pickling
+
+def test_expr_pickling():
+    from sumpy.kernel import LaplaceKernel, AxisTargetDerivative
+    import pickle
+    import pytential
+
+    ops_for_testing = [
+        pytential.sym.d_dx(
+            2,
+            pytential.sym.D(
+                LaplaceKernel(2), pytential.sym.var("sigma"), qbx_forced_limit=-2
+            )
+        ),
+        pytential.sym.D(
+            AxisTargetDerivative(0, LaplaceKernel(2)),
+            pytential.sym.var("sigma"),
+            qbx_forced_limit=-2
+        )
+    ]
+
+    for op in ops_for_testing:
+        pickled_op = pickle.dumps(op)
+        after_pickle_op = pickle.loads(pickled_op)
+
+        assert op == after_pickle_op
+
+# }}}
+
+
 # You can test individual routines by typing
-# $ python test_tools.py 'test_routine()'
+# $ python test_symbolic.py 'test_routine()'
 
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        from py.test.cmdline import main
+        from pytest import main
         main([__file__])
 
 # vim: fdm=marker
diff --git a/test/test_tools.py b/test/test_tools.py
index a0f2ea8e01804684065223fc0fd7a943dd5d7d01..af7740953a36088b77011aeef70e174d4f36cebf 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -101,7 +101,7 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        from py.test.cmdline import main
+        from pytest import main
         main([__file__])
 
 # vim: fdm=marker
diff --git a/test/too_slow_test_helmholtz.py b/test/too_slow_test_helmholtz.py
index 9add5d0522b35a05852b3fb1fe9c983286f1a932..25eeb075ee2091110a6de0199938835d68bf2b2f 100644
--- a/test/too_slow_test_helmholtz.py
+++ b/test/too_slow_test_helmholtz.py
@@ -421,7 +421,7 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        from py.test.cmdline import main
+        from pytest import main
         main([__file__])
 
 # vim: fdm=marker
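
A note on the "indicator" field that the patch adds to the potential-*.vts output: it is the negative Laplace double-layer potential of the constant density one, which by Gauss's identity equals 1 at plot points inside the source geometry and 0 outside, so it marks which targets lie in the interior. The sketch below checks that identity with plain numpy on the unit circle, independently of pytential; the node count and test points are arbitrary choices for illustration.

import numpy as np

# Trapezoidal rule on the unit circle: spectrally accurate here, since the
# integrand is smooth and periodic as long as x stays off the curve.
n = 200
t = np.linspace(0, 2*np.pi, n, endpoint=False)
nodes = np.stack([np.cos(t), np.sin(t)])   # quadrature nodes y on the curve
normals = nodes                            # outward unit normal of the unit circle
ds = 2*np.pi/n                             # arc length weight per node

def indicator(x):
    # -D[1](x) = 1/(2*pi) * integral over the curve of n(y).(y-x)/|y-x|^2 ds(y)
    d = nodes - x[:, None]
    return ds/(2*np.pi)*np.sum((normals*d).sum(axis=0)/(d**2).sum(axis=0))

print(indicator(np.array([0.25, -0.1])))   # approx. 1: point inside the curve
print(indicator(np.array([1.7, 0.4])))     # approx. 0: point outside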
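
The convergence assertions in test_integral_equation rely on EOCRecorder.order_estimate() from pytools. Conceptually, the estimate is the slope of log(error) against log(h); the toy stand-in below is hypothetical (pytools' actual fit may differ in detail) but shows why a fourth-order error history passes a check like order_estimate() > tgt_order - 1.3.

import numpy as np

def order_estimate(h, err):
    # least-squares slope of log(err) against log(h)
    return np.polyfit(np.log(h), np.log(err), 1)[0]

h = np.array([0.2, 0.1, 0.05])      # made-up mesh sizes
err = 2.5*h**4                      # manufactured fourth-order error history
print(order_estimate(h, err))       # approx. 4.0, comfortably above 4 - 1.3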