diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dd008ea38b3a28e208f5204f6aeb43de0e1bb89b..2f33414ed2ceeca856f0faec7bec8b827dc3b671 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,8 +1,15 @@
+# Environment variables
+#
+# * PYTEST_ADDOPTS is used to filter test runs. The default value is "-k-slowtest",
+#   which skips the slow running tests.
+# * SKIP_EXAMPLES, if non-empty, can be used to skip the examples job.
+
 Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 scipy numpy mako"
+  - export PYOPENCL_TEST=portable:pthread
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -18,8 +25,9 @@ Python 2.7 POCL:
 Python 3 POCL:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy scipy mako"
+  - export PYOPENCL_TEST=portable:pthread
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -37,7 +45,8 @@ Python 3 Intel:
   - export PY_EXE=python3
   - source /opt/enable-intel-cl.sh
   - export PYOPENCL_TEST="intel(r):pu"
-  - export EXTRA_INSTALL="pybind11 numpy scipy mako"
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -54,8 +63,8 @@ Python 3 POCL Examples:
   script:
   - test -n "$SKIP_EXAMPLES" && exit
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib"
+  - export PYOPENCL_TEST=portable:pthread
+  - export EXTRA_INSTALL="Cython pybind11 numpy mako pyvisfile matplotlib"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
   - ". ./build-py-project-and-run-examples.sh"
   tags:
@@ -71,8 +80,9 @@ Python 3 POCL Examples:
 Python 3 Conda:
   script:
   - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine
-  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - export CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
@@ -89,9 +99,11 @@ Python 3 Conda Apple:
   script:
   - export LC_ALL=en_US.UTF-8
   - export LANG=en_US.UTF-8
-  - export PYTEST_ADDOPTS=-k-slowtest
-  - CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml
-  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - export CC=clang
+  - set -o xtrace
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
 
@@ -107,7 +119,7 @@ Python 3 Conda Apple:
 
 Documentation:
   script:
-  - EXTRA_INSTALL="pybind11 numpy mako"
+  - EXTRA_INSTALL="Cython pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh
   - ". ./build-docs.sh"
   tags:
@@ -120,7 +132,9 @@ Pylint:
   # Needed to avoid name shadowing issues when running from source directory.
   - PROJECT_INSTALL_FLAGS="--editable"
   - export PY_EXE=python3
-  - EXTRA_INSTALL="pybind11 numpy mako matplotlib"
+  # Pin to numpy 1.15
+  # See https://github.com/PyCQA/pylint/issues/2721
+  - EXTRA_INSTALL="Cython pybind11 numpy==1.15 mako matplotlib"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
   - ". ./prepare-and-run-pylint.sh pytential test/test_*.py"
   tags:
diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml
index fbd7df48f7bc893b651a778aed4f4fdd7a5fff71..c941e862b76be92273b919aee5119046e76841c4 100644
--- a/.pylintrc-local.yml
+++ b/.pylintrc-local.yml
@@ -3,3 +3,5 @@
     - old_diffop_primitives.py
     - generalized_debye.py
     - waveguide.py  # See issue #116 on gitlab
+- arg: extension-pkg-whitelist
+  val: pytential.qbx.target_specific
diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml
index b71555f6d6e474a6b0cef0c9d9ed03c168d88a79..eea9ddd7e59168d635c08f4a0f29ed04dc19ede5 100644
--- a/.test-conda-env-py3-macos.yml
+++ b/.test-conda-env-py3-macos.yml
@@ -10,10 +10,13 @@ dependencies:
 - pocl
 - islpy
 - pyopencl
-- python=3.6
+- python>=3.6
 - symengine=0.3.0
 - python-symengine=0.3.0
 - pyfmmlib
 # for OpenMP support in pyfmmlib
 - libgfortran>=3.0.1
+- clangdev
+- openmp
+- cython
 # things not in here: loopy boxtree pymbolic meshmode sumpy
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index a0803d620f2a0ff994aca677746813dafaaf27fc..8023391b5f8598a9c94677f2d5dcefd0c6537f5d 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -10,8 +10,9 @@ dependencies:
 - pocl
 - islpy
 - pyopencl
-- python=3.6
+- python>=3.6
 - symengine=0.3.0
 - python-symengine=0.3.0
 - pyfmmlib
+- cython
 # things not in here: loopy boxtree pymbolic meshmode sumpy
diff --git a/MANIFEST.in b/MANIFEST.in
index 6afc650e95ffcc703625770d13610d52031f774a..80a68f8867ebc255754b7b41d22905ee5dbde4ce 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,8 @@
 include test/*.py
+include test/*.step
+
 include examples/*.py
+include examples/geometries/*step
 
 include doc/*.rst
 include doc/Makefile
diff --git a/README.rst b/README.rst
index dfc5fe49b46f4a9ea13debdc136e9fbf3848c79d..ad3f79057516a8347148e9e9451aada8a73f553b 100644
--- a/README.rst
+++ b/README.rst
@@ -20,7 +20,7 @@ It relies on
 * `boxtree <http://pypi.python.org/pypi/boxtree>`_ for FMM tree building
 * `sumpy <http://pypi.python.org/pypi/sumpy>`_ for expansions and analytical routines
 * `modepy <http://pypi.python.org/pypi/modepy>`_ for modes and nodes on simplices
-* `meshmode <http://pypi.python.org/pypi/meshmode>`_ for modes and nodes on simplices
+* `meshmode <http://pypi.python.org/pypi/meshmode>`_ for high order discretizations
 * `loopy <http://pypi.python.org/pypi/loo.py>`_ for fast array operations
 * `pytest <http://pypi.python.org/pypi/pytest>`_ for automated testing
 
diff --git a/doc/misc.rst b/doc/misc.rst
index f551887fe53b96ed924be0019188770d820e4d33..d72d829c7ceefe35dd69a2d269e5655ea79a749a 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -11,7 +11,9 @@ This set of instructions is intended for 64-bit Linux and macOS computers.
     On Debian derivatives (Ubuntu and many more),
     installing ``build-essential`` should do the trick.
 
-    On macOS, run ``xcode-select --install`` to install build tools.
+    On macOS, run ``xcode-select --install`` to install build tools. On Mojave (10.14),
+    you may need to follow `these steps <https://stackoverflow.com/a/52530212>`_
+    as well.
 
     Everywhere else, just making sure you have the ``g++`` package should be
     enough.
@@ -31,12 +33,22 @@ This set of instructions is intended for 64-bit Linux and macOS computers.
 
 #.  ``conda config --add channels conda-forge``
 
-#.  ``conda install git pip pocl islpy pyopencl sympy pyfmmlib pytest``
+Then, on Linux:
+
+#.  ``conda install cython git pip pocl islpy pyopencl sympy pyfmmlib pytest``
 
 #.  Type the following command::
 
         hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode pytential; do python -m pip install git+https://github.com/inducer/$i; done
 
+And on macOS:
+
+#.  ``conda install openmp clangdev cython git pip pocl islpy pyopencl sympy pyfmmlib pytest``
+
+#.  Type the following command::
+
+        hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode pytential; do CC=clang python -m pip install git+https://github.com/inducer/$i; done
+
 Next time you want to use :mod:`pytential`, just run the following command::
 
     source /WHERE/YOU/INSTALLED/miniconda3/bin/activate inteq
diff --git a/doc/qbx.rst b/doc/qbx.rst
index b49698663b9de46e643f571ee46f3148f603c4d0..e3b9e95d0e4bbfa03b8db0414e865089e797911c 100644
--- a/doc/qbx.rst
+++ b/doc/qbx.rst
@@ -28,5 +28,10 @@ Fast multipole driver
 
 .. automodule:: pytential.qbx.fmm
 
+Cost model
+----------
+
+.. automodule:: pytential.qbx.cost
+
 
 .. vim: sw=4:tw=75
diff --git a/examples/cost.py b/examples/cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d733b4b35f90f22d5e646c4e3db413560f5f11
--- /dev/null
+++ b/examples/cost.py
@@ -0,0 +1,166 @@
+"""Calibrates a cost model and reports on the accuracy."""
+
+import pyopencl as cl
+import numpy as np
+
+from pytential import sym, bind
+from pytools import one
+
+
+# {{{ global params
+
+TARGET_ORDER = 8
+OVSMP_FACTOR = 5
+TCF = 0.9
+QBX_ORDER = 5
+FMM_ORDER = 10
+RUNS = 3
+
+DEFAULT_LPOT_KWARGS = {
+        "_box_extent_norm": "l2",
+        "_from_sep_smaller_crit": "static_l2",
+        }
+
+PANELS_PER_ARM = 30
+TRAINING_ARMS = (10, 15, 25)
+TESTING_ARMS = (20,)
+
+
+def starfish_lpot_source(queue, n_arms):
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import (
+            InterpolatoryQuadratureSimplexGroupFactory)
+
+    from meshmode.mesh.generation import make_curve_mesh, NArmedStarfish
+
+    mesh = make_curve_mesh(
+            NArmedStarfish(n_arms, 0.8),
+            np.linspace(0, 1, 1 + PANELS_PER_ARM * n_arms),
+            TARGET_ORDER)
+
+    pre_density_discr = Discretization(
+            queue.context, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(TARGET_ORDER))
+
+    lpot_kwargs = DEFAULT_LPOT_KWARGS.copy()
+    lpot_kwargs.update(
+            target_association_tolerance=0.025,
+            _expansion_stick_out_factor=TCF,
+            fmm_order=FMM_ORDER, qbx_order=QBX_ORDER,
+            fmm_backend="fmmlib"
+            )
+
+    from pytential.qbx import QBXLayerPotentialSource
+    lpot_source = QBXLayerPotentialSource(
+            pre_density_discr, OVSMP_FACTOR * TARGET_ORDER,
+            **lpot_kwargs)
+
+    lpot_source, _ = lpot_source.with_refinement()
+
+    return lpot_source
+
+# }}}
+
+
+def training_geometries(queue):
+    for n_arms in TRAINING_ARMS:
+        yield starfish_lpot_source(queue, n_arms)
+
+
+def test_geometries(queue):
+    for n_arms in TESTING_ARMS:
+        yield starfish_lpot_source(queue, n_arms)
+
+
+def get_bound_op(lpot_source):
+    from sumpy.kernel import LaplaceKernel
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    return bind(lpot_source, op)
+
+
+def get_test_density(queue, lpot_source):
+    density_discr = lpot_source.density_discr
+    nodes = density_discr.nodes().with_queue(queue)
+    sigma = cl.clmath.sin(10 * nodes[0])
+
+    return sigma
+
+
+def calibrate_cost_model(ctx):
+    queue = cl.CommandQueue(ctx)
+
+    from pytential.qbx.cost import CostModel, estimate_calibration_params
+    cost_model = CostModel()
+
+    model_results = []
+    timing_results = []
+
+    for lpot_source in training_geometries(queue):
+        lpot_source = lpot_source.copy(cost_model=cost_model)
+        bound_op = get_bound_op(lpot_source)
+        sigma = get_test_density(queue, lpot_source)
+
+        cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
+
+        # Warm-up run.
+        bound_op.eval(queue, {"sigma": sigma})
+
+        for _ in range(RUNS):
+            timing_data = {}
+            bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data)
+
+            model_results.append(one(cost_S.values()))
+            timing_results.append(one(timing_data.values()))
+
+    calibration_params = (
+            estimate_calibration_params(model_results, timing_results))
+
+    return cost_model.with_calibration_params(calibration_params)
+
+
+def test_cost_model(ctx, cost_model):
+    queue = cl.CommandQueue(ctx)
+
+    for lpot_source in test_geometries(queue):
+        lpot_source = lpot_source.copy(cost_model=cost_model)
+        bound_op = get_bound_op(lpot_source)
+        sigma = get_test_density(queue, lpot_source)
+
+        cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
+        model_result = (
+                one(cost_S.values())
+                .get_predicted_times(merge_close_lists=True))
+
+        # Warm-up run.
+        bound_op.eval(queue, {"sigma": sigma})
+
+        temp_timing_results = []
+        for _ in range(RUNS):
+            timing_data = {}
+            bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data)
+            temp_timing_results.append(one(timing_data.values()))
+
+        timing_result = {}
+        for param in model_result:
+            timing_result[param] = (
+                    sum(temp_timing_result[param]["process_elapsed"]
+                        for temp_timing_result in temp_timing_results)) / RUNS
+
+        print("=" * 20)
+        for stage in model_result:
+            print("stage: ", stage)
+            print("actual: ", timing_result[stage])
+            print("predicted: ", model_result[stage])
+        print("=" * 20)
+
+
+def predict_cost(ctx):
+    model = calibrate_cost_model(ctx)
+    test_cost_model(ctx, model)
+
+
+if __name__ == "__main__":
+    predict_cost(cl.create_some_context(0))
diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py
index c6762642ce8e4b37c056bd402592fe06b55ee83c..f31ec75646640d599316c9dbae17e6818557e327 100644
--- a/pytential/qbx/__init__.py
+++ b/pytential/qbx/__init__.py
@@ -78,7 +78,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             expansion_factory=None,
             target_association_tolerance=_not_provided,
 
-            # begin undocumented arguments
+            # begin experimental arguments
             # FIXME default debug=False once everything has matured
             debug=True,
             _refined_for_global_qbx=False,
@@ -90,7 +90,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             _from_sep_smaller_crit=None,
             _from_sep_smaller_min_nsources_cumul=None,
             _tree_kind="adaptive",
+            _use_target_specific_qbx=None,
             geometry_data_inspector=None,
+            cost_model=None,
             fmm_backend="sumpy",
             target_stick_out_factor=_not_provided):
         """
@@ -109,6 +111,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
              *kernel* is the :class:`sumpy.kernel.Kernel` being evaluated, and
              *kernel_args* is a set of *(key, value)* tuples with evaluated
              kernel arguments. May not be given if *fmm_order* is given.
+
+        Experimental arguments without a promise of forward compatibility:
+
+        :arg _use_target_specific_qbx: Whether to use target-specific
+            acceleration by default if possible. *None* means
+            "use if possible".
+        :arg cost_model: Either *None* or instance of
+             :class:`~pytential.qbx.cost.CostModel`, used for gathering modeled
+             costs (experimental)
         """
 
         # {{{ argument processing
@@ -210,8 +221,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         self._from_sep_smaller_min_nsources_cumul = \
                 _from_sep_smaller_min_nsources_cumul
         self._tree_kind = _tree_kind
+        self._use_target_specific_qbx = _use_target_specific_qbx
         self.geometry_data_inspector = geometry_data_inspector
 
+        if cost_model is None:
+            from pytential.qbx.cost import CostModel
+            cost_model = CostModel()
+
+        self.cost_model = cost_model
+
         # /!\ *All* parameters set here must also be set by copy() below,
         # otherwise they will be reset to their default values behind your
         # back if the layer potential source is ever copied. (such as
@@ -232,7 +250,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             _box_extent_norm=None,
             _from_sep_smaller_crit=None,
             _tree_kind=None,
+            _use_target_specific_qbx=_not_provided,
             geometry_data_inspector=None,
+            cost_model=_not_provided,
             fmm_backend=None,
 
             debug=_not_provided,
@@ -314,8 +334,16 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                 _from_sep_smaller_min_nsources_cumul=(
                     self._from_sep_smaller_min_nsources_cumul),
                 _tree_kind=_tree_kind or self._tree_kind,
+                _use_target_specific_qbx=(_use_target_specific_qbx
+                    if _use_target_specific_qbx is not _not_provided
+                    else self._use_target_specific_qbx),
                 geometry_data_inspector=(
                     geometry_data_inspector or self.geometry_data_inspector),
+                cost_model=(
+                    # None is a valid value here
+                    cost_model
+                    if cost_model is not _not_provided
+                    else self.cost_model),
                 fmm_backend=fmm_backend or self.fmm_backend,
                 **kwargs)
 
@@ -506,19 +534,67 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
     # {{{ internal functionality for execution
 
-    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
-        if not self._refined_for_global_qbx:
-            from warnings import warn
-            warn(
-                "Executing global QBX without refinement. "
-                "This is unlikely to work.")
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate,
+            return_timing_data):
+        extra_args = {}
 
         if self.fmm_level_to_order is False:
             func = self.exec_compute_potential_insn_direct
+            extra_args["return_timing_data"] = return_timing_data
+
         else:
             func = self.exec_compute_potential_insn_fmm
 
-        return func(queue, insn, bound_expr, evaluate)
+            def drive_fmm(wrangler, strengths, geo_data, kernel, kernel_arguments):
+                del geo_data, kernel, kernel_arguments
+                from pytential.qbx.fmm import drive_fmm
+                if return_timing_data:
+                    timing_data = {}
+                else:
+                    timing_data = None
+                return drive_fmm(wrangler, strengths, timing_data), timing_data
+
+            extra_args["fmm_driver"] = drive_fmm
+
+        return self._dispatch_compute_potential_insn(
+                queue, insn, bound_expr, evaluate, func, extra_args)
+
+    def cost_model_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+        """Using :attr:`cost_model`, evaluate the cost of executing *insn*.
+        Cost model results are gathered in
+        :attr:`pytential.symbolic.execution.BoundExpression.modeled_cost`
+        along the way.
+
+        :returns: whatever :meth:`exec_compute_potential_insn_fmm` returns.
+        """
+
+        if self.fmm_level_to_order is False:
+            raise NotImplementedError("perf modeling direct evaluations")
+
+        def drive_cost_model(
+                    wrangler, strengths, geo_data, kernel, kernel_arguments):
+            del strengths
+            cost_model_result = (
+                    self.cost_model(wrangler, geo_data, kernel, kernel_arguments))
+            return wrangler.full_output_zeros(), cost_model_result
+
+        return self._dispatch_compute_potential_insn(
+                queue, insn, bound_expr, evaluate,
+                self.exec_compute_potential_insn_fmm,
+                extra_args={"fmm_driver": drive_cost_model})
+
+    def _dispatch_compute_potential_insn(self, queue, insn, bound_expr,
+            evaluate, func, extra_args=None):
+        if not self._refined_for_global_qbx:
+            from warnings import warn
+            warn(
+                    "Executing global QBX without refinement. "
+                    "This is unlikely to work.")
+
+        if extra_args is None:
+            extra_args = {}
+
+        return func(queue, insn, bound_expr, evaluate, **extra_args)
 
     @property
     @memoize_method
@@ -562,18 +638,19 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         else:
             raise ValueError("invalid FMM backend: %s" % self.fmm_backend)
 
-    def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate):
-        # {{{ build list of unique target discretizations used
-
+    def get_target_discrs_and_qbx_sides(self, insn, bound_expr):
+        """Build the list of unique target discretizations used by the
+        provided instruction.
+        """
         # map (name, qbx_side) to number in list
-        tgt_name_and_side_to_number = {}
+        target_name_and_side_to_number = {}
         # list of tuples (discr, qbx_side)
         target_discrs_and_qbx_sides = []
 
         for o in insn.outputs:
             key = (o.target_name, o.qbx_forced_limit)
-            if key not in tgt_name_and_side_to_number:
-                tgt_name_and_side_to_number[key] = \
+            if key not in target_name_and_side_to_number:
+                target_name_and_side_to_number[key] = \
                         len(target_discrs_and_qbx_sides)
 
                 target_discr = bound_expr.places.get_geometry(o.target_name)
@@ -587,14 +664,24 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                 target_discrs_and_qbx_sides.append(
                         (target_discr, qbx_forced_limit))
 
-        target_discrs_and_qbx_sides = tuple(target_discrs_and_qbx_sides)
+        return target_name_and_side_to_number, tuple(target_discrs_and_qbx_sides)
 
-        # }}}
+    def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate,
+            fmm_driver):
+        """
+        :arg fmm_driver: A function that accepts four arguments:
+            *wrangler*, *strength*, *geo_data*, *kernel*, *kernel_arguments*
+        :returns: a tuple ``(assignments, extra_outputs)``, where *assignments*
+            is a list of tuples containing pairs ``(name, value)`` representing
+            assignments to be performed in the evaluation context.
+            *extra_outputs* is data that *fmm_driver* may return
+            (such as timing data), passed through unmodified.
+        """
+        target_name_and_side_to_number, target_discrs_and_qbx_sides = (
+                self.get_target_discrs_and_qbx_sides(insn, bound_expr))
 
         geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides)
 
-        # geo_data.plot()
-
         # FIXME Exert more positive control over geo_data attribute lifetimes using
         # geo_data.<method>.clear_cache(geo_data).
 
@@ -606,7 +693,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
         strengths = (evaluate(insn.density).with_queue(queue)
                 * self.weights_and_area_elements())
-
         out_kernels = tuple(knl for knl in insn.kernels)
         fmm_kernel = self.get_fmm_kernel(out_kernels)
         output_and_expansion_dtype = (
@@ -622,14 +708,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                         self.qbx_order,
                         self.fmm_level_to_order,
                         source_extra_kwargs=source_extra_kwargs,
-                        kernel_extra_kwargs=kernel_extra_kwargs)
+                        kernel_extra_kwargs=kernel_extra_kwargs,
+                        _use_target_specific_qbx=self._use_target_specific_qbx)
 
         from pytential.qbx.geometry import target_state
         if (geo_data.user_target_to_center().with_queue(queue)
                 == target_state.FAILED).any().get():
             raise RuntimeError("geometry has failed targets")
 
-        # {{{ performance data hook
+        # {{{ geometry data inspection hook
 
         if self.geometry_data_inspector is not None:
             perform_fmm = self.geometry_data_inspector(insn, bound_expr, geo_data)
@@ -638,26 +725,23 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
         # }}}
 
-        # {{{ execute global QBX
-
-        from pytential.qbx.fmm import drive_fmm
-        all_potentials_on_every_tgt = drive_fmm(wrangler, strengths)
-
-        # }}}
+        # Execute global QBX.
+        all_potentials_on_every_target, extra_outputs = (
+                fmm_driver(
+                    wrangler, strengths, geo_data, fmm_kernel, kernel_extra_kwargs))
 
         result = []
 
         for o in insn.outputs:
-            tgt_side_number = tgt_name_and_side_to_number[
+            target_side_number = target_name_and_side_to_number[
                     o.target_name, o.qbx_forced_limit]
-            tgt_slice = slice(*geo_data.target_info().target_discr_starts[
-                    tgt_side_number:tgt_side_number+2])
+            target_slice = slice(*geo_data.target_info().target_discr_starts[
+                    target_side_number:target_side_number+2])
 
-            result.append(
-                    (o.name,
-                        all_potentials_on_every_tgt[o.kernel_index][tgt_slice]))
+            result.append((o.name,
+                    all_potentials_on_every_target[o.kernel_index][target_slice]))
 
-        return result
+        return result, extra_outputs
 
     # }}}
 
@@ -715,7 +799,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                         *count = item;
                     """)
 
-    def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate):
+    def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate,
+            return_timing_data):
+        if return_timing_data:
+            from pytential.source import UnableToCollectTimingData
+            from warnings import warn
+            warn(
+                    "Timing data collection not supported.",
+                    category=UnableToCollectTimingData)
+
         lpot_applier = self.get_lpot_applier(insn.kernels)
         p2p = None
         lpot_applier_on_tgt_subset = None
@@ -826,7 +918,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
 
                 result.append((o.name, output_for_each_kernel[o.kernel_index]))
 
-        return result
+        timing_data = {}
+        return result, timing_data
 
     # }}}
 
diff --git a/pytential/qbx/cost.py b/pytential/qbx/cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..afa01b239e89f15a6962831987b7912bcbfb3cda
--- /dev/null
+++ b/pytential/qbx/cost.py
@@ -0,0 +1,937 @@
+from __future__ import division, absolute_import
+
+__copyright__ = """
+Copyright (C) 2013 Andreas Kloeckner
+Copyright (C) 2018 Matt Wala
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from collections import OrderedDict
+
+import logging
+
+import numpy as np
+import pyopencl as cl
+from six.moves import range
+import sympy as sp
+
+from pytools import log_process
+from pymbolic import var
+
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+.. note::
+
+   This module is experimental. Its interface is subject to change until this
+   notice is removed.
+
+.. autoclass:: CostModel
+.. autoclass:: ParametrizedCosts
+
+.. autoclass:: TranslationCostModel
+
+.. autofunction:: pde_aware_translation_cost_model
+.. autofunction:: taylor_translation_cost_model
+
+.. autofunction:: estimate_calibration_params
+"""
+
+
+# {{{ translation cost model
+
+class TranslationCostModel(object):
+    """Provides modeled costs for individual translations or evaluations."""
+
+    def __init__(self, ncoeffs_qbx, ncoeffs_fmm_by_level, uses_point_and_shoot):
+        self.ncoeffs_qbx = ncoeffs_qbx
+        self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level
+        self.uses_point_and_shoot = uses_point_and_shoot
+
+    @staticmethod
+    def direct():
+        return var("c_p2p")
+
+    def p2qbxl(self):
+        return var("c_p2qbxl") * self.ncoeffs_qbx
+
+    def p2p_tsqbx(self):
+        # This term should be linear in the QBX order, which is the
+        # square root of the number of QBX coefficients.
+        return var("c_p2p_tsqbx") * self.ncoeffs_qbx ** (1/2)
+
+    def qbxl2p(self):
+        return var("c_qbxl2p") * self.ncoeffs_qbx
+
+    def p2l(self, level):
+        return var("c_p2l") * self.ncoeffs_fmm_by_level[level]
+
+    def l2p(self, level):
+        return var("c_l2p") * self.ncoeffs_fmm_by_level[level]
+
+    def p2m(self, level):
+        return var("c_p2m") * self.ncoeffs_fmm_by_level[level]
+
+    def m2p(self, level):
+        return var("c_m2p") * self.ncoeffs_fmm_by_level[level]
+
+    def m2m(self, src_level, tgt_level):
+        return var("c_m2m") * self.e2e_cost(
+                self.ncoeffs_fmm_by_level[src_level],
+                self.ncoeffs_fmm_by_level[tgt_level])
+
+    def l2l(self, src_level, tgt_level):
+        return var("c_l2l") * self.e2e_cost(
+                self.ncoeffs_fmm_by_level[src_level],
+                self.ncoeffs_fmm_by_level[tgt_level])
+
+    def m2l(self, src_level, tgt_level):
+        return var("c_m2l") * self.e2e_cost(
+                self.ncoeffs_fmm_by_level[src_level],
+                self.ncoeffs_fmm_by_level[tgt_level])
+
+    def m2qbxl(self, level):
+        return var("c_m2qbxl") * self.e2e_cost(
+                self.ncoeffs_fmm_by_level[level],
+                self.ncoeffs_qbx)
+
+    def l2qbxl(self, level):
+        return var("c_l2qbxl") * self.e2e_cost(
+                self.ncoeffs_fmm_by_level[level],
+                self.ncoeffs_qbx)
+
+    def e2e_cost(self, nsource_coeffs, ntarget_coeffs):
+        if self.uses_point_and_shoot:
+            return (
+                    # Rotate the coordinate system to be z axis aligned.
+                    nsource_coeffs ** (3 / 2)
+                    # Translate the expansion along the z axis.
+                    + nsource_coeffs ** (1 / 2) * ntarget_coeffs
+                    # Rotate the coordinate system back.
+                    + ntarget_coeffs ** (3 / 2))
+
+        return nsource_coeffs * ntarget_coeffs
+
+# }}}
+
+
+# {{{ translation cost model factories
+
+def pde_aware_translation_cost_model(dim, nlevels):
+    """Create a cost model for FMM translation operators that make use of the
+    knowledge that the potential satisfies a PDE.
+    """
+    p_qbx = var("p_qbx")
+    p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)])
+
+    uses_point_and_shoot = False
+
+    ncoeffs_fmm = (p_fmm + 1) ** (dim - 1)
+    ncoeffs_qbx = (p_qbx + 1) ** (dim - 1)
+
+    if dim == 3:
+        uses_point_and_shoot = True
+
+    return TranslationCostModel(
+            ncoeffs_qbx=ncoeffs_qbx,
+            ncoeffs_fmm_by_level=ncoeffs_fmm,
+            uses_point_and_shoot=uses_point_and_shoot)
+
+
+def taylor_translation_cost_model(dim, nlevels):
+    """Create a cost model for FMM translation based on Taylor expansions
+    in Cartesian coordinates.
+    """
+    p_qbx = var("p_qbx")
+    p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)])
+
+    ncoeffs_fmm = (p_fmm + 1) ** dim
+    ncoeffs_qbx = (p_qbx + 1) ** dim
+
+    return TranslationCostModel(
+            ncoeffs_qbx=ncoeffs_qbx,
+            ncoeffs_fmm_by_level=ncoeffs_fmm,
+            uses_point_and_shoot=False)
+
+# }}}
+
+
+# {{{ parameterized costs returned by cost model
+
+class ParametrizedCosts(object):
+    """A container for data returned by the cost model.
+
+    This holds both symbolic costs as well as parameter values. To obtain a
+    prediction of the running time, use :meth:`get_predicted_times`.
+
+    .. attribute:: raw_costs
+
+        A dictionary mapping algorithmic stage names to symbolic costs.
+
+    .. attribute:: params
+
+        A dictionary mapping names of symbolic parameters to values.  Parameters
+        appear in *raw_costs* and may include values such as QBX or FMM order
+        as well as calibration constants.
+
+    .. automethod:: copy
+    .. automethod:: with_params
+    .. automethod:: get_predicted_times
+    """
+
+    def __init__(self, raw_costs, params):
+        self.raw_costs = OrderedDict(raw_costs)
+        self.params = params
+
+    def with_params(self, params):
+        """Return a copy of *self* with parameters updated to include *params*."""
+        new_params = self.params.copy()
+        new_params.update(params)
+        return type(self)(
+                raw_costs=self.raw_costs.copy(),
+                params=new_params)
+
+    def copy(self):
+        return self.with_params({})
+
+    def __str__(self):
+        return "".join([
+                type(self).__name__,
+                "(raw_costs=",
+                str(self.raw_costs),
+                ", params=",
+                str(self.params),
+                ")"])
+
+    def __repr__(self):
+        return "".join([
+                type(self).__name__,
+                "(raw_costs=",
+                repr(self.raw_costs),
+                ", params=",
+                repr(self.params),
+                ")"])
+
+    def get_predicted_times(self, merge_close_lists=False):
+        """Return a dictionary mapping stage names to predicted time in seconds.
+
+        :arg merge_close_lists: If *True*, the returned estimate combines
+            the cost of "close" lists (Lists 1, 3 close, and 4 close). If
+            *False*, the time of each "close" list is reported separately.
+        """
+        from pymbolic import evaluate
+        from functools import partial
+
+        get_time = partial(evaluate, context=self.params)
+
+        result = OrderedDict()
+
+        for name, val in self.raw_costs.items():
+            if merge_close_lists:
+                for suffix in ("_list1", "_list3", "_list4"):
+                    if name.endswith(suffix):
+                        name = name[:-len(suffix)]
+                        break
+
+            result[name] = get_time(val) + result.get(name, 0)
+
+        return result
+
+# }}}
+
+
+# {{{ cost model
+
+class CostModel(object):
+    """
+    .. automethod:: with_calibration_params
+    .. automethod:: __call__
+
+    The cost model relies on a translation cost model. See
+    :class:`TranslationCostModel` for the translation cost model interface.
+    """
+
+    def __init__(self,
+            translation_cost_model_factory=pde_aware_translation_cost_model,
+            calibration_params=None):
+        """
+        :arg translation_cost_model_factory: A callable which, given arguments
+            (*dim*, *nlevels*), returns a :class:`TranslationCostModel`.
+        """
+        self.translation_cost_model_factory = translation_cost_model_factory
+        if calibration_params is None:
+            calibration_params = dict()
+        self.calibration_params = calibration_params
+
+    def with_calibration_params(self, calibration_params):
+        """Return a copy of *self* with a new set of calibration parameters."""
+        return type(self)(
+                translation_cost_model_factory=self.translation_cost_model_factory,
+                calibration_params=calibration_params)
+
+    # {{{ form multipoles
+
+    def process_form_multipoles(self, xlat_cost, traversal, tree):
+        result = 0
+
+        for level in range(tree.nlevels):
+            src_count = 0
+            start, stop = traversal.level_start_source_box_nrs[level:level + 2]
+            for src_ibox in traversal.source_boxes[start:stop]:
+                nsrcs = tree.box_source_counts_nonchild[src_ibox]
+                src_count += nsrcs
+            result += src_count * xlat_cost.p2m(level)
+
+        return dict(form_multipoles=result)
+
+    # }}}
+
+    # {{{ propagate multipoles upward
+
+    def process_coarsen_multipoles(self, xlat_cost, traversal, tree):
+        result = 0
+
+        # nlevels-1 is the last valid level index
+        # nlevels-2 is the last valid level that could have children
+        #
+        # 3 is the last relevant source_level.
+        # 2 is the last relevant target_level.
+        # (because no level 1 box will be well-separated from another)
+        for source_level in range(tree.nlevels-1, 2, -1):
+            target_level = source_level - 1
+            cost = xlat_cost.m2m(source_level, target_level)
+
+            nmultipoles = 0
+            start, stop = traversal.level_start_source_parent_box_nrs[
+                            target_level:target_level+2]
+            for ibox in traversal.source_parent_boxes[start:stop]:
+                for child in tree.box_child_ids[:, ibox]:
+                    if child:
+                        nmultipoles += 1
+
+            result += cost * nmultipoles
+
+        return dict(coarsen_multipoles=result)
+
+    # }}}
+
+    # {{{ collect direct interaction data
+
+    @staticmethod
+    def _collect_direction_interaction_data(traversal, tree):
+        ntarget_boxes = len(traversal.target_boxes)
+
+        # target box index -> nsources
+        nlist1_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp)
+        nlist3close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp)
+        nlist4close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp)
+
+        for itgt_box in range(ntarget_boxes):
+            nlist1_srcs = 0
+            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
+            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
+                nlist1_srcs += tree.box_source_counts_nonchild[src_ibox]
+
+            nlist1_srcs_by_itgt_box[itgt_box] = nlist1_srcs
+
+            nlist3close_srcs = 0
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_smaller_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
+                    nlist3close_srcs += tree.box_source_counts_nonchild[src_ibox]
+
+            nlist3close_srcs_by_itgt_box[itgt_box] = nlist3close_srcs
+
+            nlist4close_srcs = 0
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_bigger_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
+                    nlist4close_srcs += tree.box_source_counts_nonchild[src_ibox]
+
+            nlist4close_srcs_by_itgt_box[itgt_box] = nlist4close_srcs
+
+        result = {}
+        result["nlist1_srcs_by_itgt_box"] = nlist1_srcs_by_itgt_box
+        result["nlist3close_srcs_by_itgt_box"] = nlist3close_srcs_by_itgt_box
+        result["nlist4close_srcs_by_itgt_box"] = nlist4close_srcs_by_itgt_box
+
+        return result
+
+    # }}}
+
+    # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
+
+    def process_direct(self, xlat_cost, traversal, direct_interaction_data,
+            box_target_counts_nonchild):
+        nlist1_srcs_by_itgt_box = (
+                direct_interaction_data["nlist1_srcs_by_itgt_box"])
+        nlist3close_srcs_by_itgt_box = (
+                direct_interaction_data["nlist3close_srcs_by_itgt_box"])
+        nlist4close_srcs_by_itgt_box = (
+                direct_interaction_data["nlist4close_srcs_by_itgt_box"])
+
+        # list -> number of source-target interactions
+        npart_direct_list1 = 0
+        npart_direct_list3 = 0
+        npart_direct_list4 = 0
+
+        for itgt_box, tgt_ibox in enumerate(traversal.target_boxes):
+            ntargets = box_target_counts_nonchild[tgt_ibox]
+
+            npart_direct_list1 += ntargets * nlist1_srcs_by_itgt_box[itgt_box]
+            npart_direct_list3 += ntargets * nlist3close_srcs_by_itgt_box[itgt_box]
+            npart_direct_list4 += ntargets * nlist4close_srcs_by_itgt_box[itgt_box]
+
+        result = {}
+        result["eval_direct_list1"] = npart_direct_list1 * xlat_cost.direct()
+        result["eval_direct_list3"] = npart_direct_list3 * xlat_cost.direct()
+        result["eval_direct_list4"] = npart_direct_list4 * xlat_cost.direct()
+
+        return result
+
+    # }}}
+
+    # {{{ translate separated siblings' ("list 2") mpoles to local
+
+    def process_list2(self, xlat_cost, traversal, tree):
+        nm2l_by_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
+            start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
+
+            level = tree.box_levels[tgt_ibox]
+            nm2l_by_level[level] += end-start
+
+        result = sum(
+                cost * xlat_cost.m2l(ilevel, ilevel)
+                for ilevel, cost in enumerate(nm2l_by_level))
+
+        return dict(multipole_to_local=result)
+
+    # }}}
+
+    # {{{ evaluate sep. smaller mpoles ("list 3") at particles
+
+    def process_list3(self, xlat_cost, traversal, tree, box_target_counts_nonchild):
+        nmp_eval_by_source_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
+
+        for ilevel, sep_smaller_list in enumerate(
+                traversal.from_sep_smaller_by_level):
+            for itgt_box, tgt_ibox in enumerate(
+                        traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
+                ntargets = box_target_counts_nonchild[tgt_ibox]
+                start, end = sep_smaller_list.starts[itgt_box:itgt_box+2]
+                nmp_eval_by_source_level[ilevel] += ntargets * (end-start)
+
+        result = sum(
+                cost * xlat_cost.m2p(ilevel)
+                for ilevel, cost in enumerate(nmp_eval_by_source_level))
+
+        return dict(eval_multipoles=result)
+
+    # }}}
+
+    # {{{ form locals for separated bigger source boxes ("list 4")
+
+    def process_list4(self, xlat_cost, traversal, tree):
+        nform_local_by_source_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        for itgt_box in range(len(traversal.target_or_target_parent_boxes)):
+            start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
+            for src_ibox in traversal.from_sep_bigger_lists[start:end]:
+                nsources = tree.box_source_counts_nonchild[src_ibox]
+                level = tree.box_levels[src_ibox]
+                nform_local_by_source_level[level] += nsources
+
+        result = sum(
+                cost * xlat_cost.p2l(ilevel)
+                for ilevel, cost in enumerate(nform_local_by_source_level))
+
+        return dict(form_locals=result)
+
+    # }}}
+
+    # {{{ propogate locals downward
+
+    def process_refine_locals(self, xlat_cost, traversal, tree):
+        result = 0
+
+        for target_lev in range(1, tree.nlevels):
+            start, stop = traversal.level_start_target_or_target_parent_box_nrs[
+                    target_lev:target_lev+2]
+            source_lev = target_lev - 1
+            result += (stop-start) * xlat_cost.l2l(source_lev, target_lev)
+
+        return dict(refine_locals=result)
+
+    # }}}
+
+    # {{{ evaluate local expansions at non-qbx targets
+
+    def process_eval_locals(self, xlat_cost, traversal, tree, nqbtl):
+        ntargets_by_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        for target_lev in range(tree.nlevels):
+            start, stop = traversal.level_start_target_box_nrs[
+                    target_lev:target_lev+2]
+            for tgt_ibox in traversal.target_boxes[start:stop]:
+                ntargets_by_level[target_lev] += (
+                        nqbtl.box_target_counts_nonchild[tgt_ibox])
+
+        result = sum(
+                cost * xlat_cost.l2p(ilevel)
+                for ilevel, cost in enumerate(ntargets_by_level))
+
+        return dict(eval_locals=result)
+
+    # }}}
+
+    # {{{ collect data about direct interactions with qbx centers
+
+    @staticmethod
+    def _collect_qbxl_direct_interaction_data(direct_interaction_data,
+            global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts):
+        nlist1_srcs_by_itgt_box = (
+                direct_interaction_data["nlist1_srcs_by_itgt_box"])
+        nlist3close_srcs_by_itgt_box = (
+                direct_interaction_data["nlist3close_srcs_by_itgt_box"])
+        nlist4close_srcs_by_itgt_box = (
+                direct_interaction_data["nlist4close_srcs_by_itgt_box"])
+
+        # center -> nsources
+        np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp)
+        np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp)
+        np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp)
+
+        # center -> number of associated targets
+        nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp)
+
+        for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
+            start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2]
+            nqbxl2p_by_center[itgt_center] = end - start
+
+            itgt_box = qbx_center_to_target_box[tgt_icenter]
+            np2qbxl_list1_by_center[itgt_center] = (
+                    nlist1_srcs_by_itgt_box[itgt_box])
+            np2qbxl_list3_by_center[itgt_center] = (
+                    nlist3close_srcs_by_itgt_box[itgt_box])
+            np2qbxl_list4_by_center[itgt_center] = (
+                    nlist4close_srcs_by_itgt_box[itgt_box])
+
+        result = {}
+        result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center
+        result["np2qbxl_list3_by_center"] = np2qbxl_list3_by_center
+        result["np2qbxl_list4_by_center"] = np2qbxl_list4_by_center
+        result["nqbxl2p_by_center"] = nqbxl2p_by_center
+
+        return result
+
+    # }}}
+
+    # {{{ eval target specific qbx expansions
+
+    def process_eval_target_specific_qbxl(self, xlat_cost, direct_interaction_data,
+            global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts):
+
+        counts = self._collect_qbxl_direct_interaction_data(
+                direct_interaction_data, global_qbx_centers,
+                qbx_center_to_target_box, center_to_targets_starts)
+
+        result = {}
+        result["eval_target_specific_qbx_locals_list1"] = (
+                sum(counts["np2qbxl_list1_by_center"] * counts["nqbxl2p_by_center"])
+                * xlat_cost.p2p_tsqbx())
+        result["eval_target_specific_qbx_locals_list3"] = (
+                sum(counts["np2qbxl_list3_by_center"] * counts["nqbxl2p_by_center"])
+                * xlat_cost.p2p_tsqbx())
+        result["eval_target_specific_qbx_locals_list4"] = (
+                sum(counts["np2qbxl_list4_by_center"] * counts["nqbxl2p_by_center"])
+                * xlat_cost.p2p_tsqbx())
+
+        return result
+
+    # }}}
+
+    # {{{ form global qbx locals
+
+    def process_form_qbxl(self, xlat_cost, direct_interaction_data,
+            global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts):
+
+        counts = self._collect_qbxl_direct_interaction_data(
+                direct_interaction_data, global_qbx_centers,
+                qbx_center_to_target_box, center_to_targets_starts)
+
+        result = {}
+        result["form_global_qbx_locals_list1"] = (
+                sum(counts["np2qbxl_list1_by_center"]) * xlat_cost.p2qbxl())
+        result["form_global_qbx_locals_list3"] = (
+                sum(counts["np2qbxl_list3_by_center"]) * xlat_cost.p2qbxl())
+        result["form_global_qbx_locals_list4"] = (
+                sum(counts["np2qbxl_list4_by_center"]) * xlat_cost.p2qbxl())
+
+        return result
+
+    # }}}
+
+    # {{{ translate from list 3 multipoles to qbx local expansions
+
+    def process_m2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers,
+            qbx_center_to_target_box_source_level):
+        nm2qbxl_by_source_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
+
+        for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level):
+            for tgt_icenter in global_qbx_centers:
+                icontaining_tgt_box = qbx_center_to_target_box_source_level[
+                        isrc_level][tgt_icenter]
+
+                if icontaining_tgt_box == -1:
+                    continue
+
+                start, stop = (
+                        ssn.starts[icontaining_tgt_box],
+                        ssn.starts[icontaining_tgt_box+1])
+
+                nm2qbxl_by_source_level[isrc_level] += stop-start
+
+        result = sum(
+                cost * xlat_cost.m2qbxl(ilevel)
+                for ilevel, cost in enumerate(nm2qbxl_by_source_level))
+
+        return dict(translate_box_multipoles_to_qbx_local=result)
+
+    # }}}
+
+    # {{{ translate from box locals to qbx local expansions
+
+    def process_l2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers,
+            qbx_center_to_target_box):
+        nl2qbxl_by_level = np.zeros(tree.nlevels, dtype=np.intp)
+
+        for tgt_icenter in global_qbx_centers:
+            itgt_box = qbx_center_to_target_box[tgt_icenter]
+            tgt_ibox = traversal.target_boxes[itgt_box]
+            level = tree.box_levels[tgt_ibox]
+            nl2qbxl_by_level[level] += 1
+
+        result = sum(
+                cost * xlat_cost.l2qbxl(ilevel)
+                for ilevel, cost in enumerate(nl2qbxl_by_level))
+
+        return dict(translate_box_local_to_qbx_local=result)
+
+    # }}}
+
+    # {{{ evaluate qbx local expansions
+
+    def process_eval_qbxl(self, xlat_cost, global_qbx_centers,
+            center_to_targets_starts):
+        result = 0
+
+        for src_icenter in global_qbx_centers:
+            start, end = center_to_targets_starts[src_icenter:src_icenter+2]
+            result += (end - start)
+
+        result *= xlat_cost.qbxl2p()
+
+        return dict(eval_qbx_expansions=result)
+
+    # }}}
+
+    @log_process(logger, "model cost")
+    def __call__(self, wrangler, geo_data, kernel, kernel_arguments):
+        """Analyze the given geometry and return cost data.
+
+        :returns: An instance of :class:`ParametrizedCosts`.
+        """
+        result = OrderedDict()
+
+        lpot_source = geo_data.lpot_source
+
+        with cl.CommandQueue(geo_data.cl_context) as queue:
+            tree = geo_data.tree().get(queue)
+            traversal = geo_data.traversal(merge_close_lists=False).get(queue)
+            nqbtl = geo_data.non_qbx_box_target_lists().get(queue)
+
+        box_target_counts_nonchild = nqbtl.box_target_counts_nonchild
+
+        params = dict(
+                nlevels=tree.nlevels,
+                nboxes=tree.nboxes,
+                nsources=tree.nsources,
+                ntargets=tree.ntargets,
+                ncenters=geo_data.ncenters,
+                p_qbx=lpot_source.qbx_order,
+                )
+
+        # FIXME: We can avoid using *kernel* and *kernel_arguments* if we talk
+        # to the wrangler to obtain the FMM order (see also
+        # https://gitlab.tiker.net/inducer/boxtree/issues/25)
+        for ilevel in range(tree.nlevels):
+            params["p_fmm_lev%d" % ilevel] = (
+                    lpot_source.fmm_level_to_order(
+                        kernel.get_base_kernel(), kernel_arguments, tree, ilevel))
+
+        params.update(self.calibration_params)
+
+        xlat_cost = (
+                self.translation_cost_model_factory(tree.dimensions, tree.nlevels))
+
+        # {{{ construct local multipoles
+
+        result.update(self.process_form_multipoles(xlat_cost, traversal, tree))
+
+        # }}}
+
+        # {{{ propagate multipoles upward
+
+        result.update(self.process_coarsen_multipoles(xlat_cost, traversal, tree))
+
+        # }}}
+
+        direct_interaction_data = (
+                self._collect_direction_interaction_data(traversal, tree))
+
+        # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
+
+        result.update(self.process_direct(
+                xlat_cost, traversal, direct_interaction_data,
+                box_target_counts_nonchild))
+
+        # }}}
+
+        # {{{ translate separated siblings' ("list 2") mpoles to local
+
+        result.update(self.process_list2(xlat_cost, traversal, tree))
+
+        # }}}
+
+        # {{{ evaluate sep. smaller mpoles ("list 3") at particles
+
+        result.update(self.process_list3(
+                xlat_cost, traversal, tree, box_target_counts_nonchild))
+
+        # }}}
+
+        # {{{ form locals for separated bigger source boxes ("list 4")
+
+        result.update(self.process_list4(xlat_cost, traversal, tree))
+
+        # }}}
+
+        # {{{ propagate local_exps downward
+
+        result.update(self.process_refine_locals(xlat_cost, traversal, tree))
+
+        # }}}
+
+        # {{{ evaluate locals
+
+        result.update(self.process_eval_locals(xlat_cost, traversal, tree, nqbtl))
+
+        # }}}
+
+        global_qbx_centers = geo_data.global_qbx_centers()
+
+        qbx_center_to_target_box = geo_data.qbx_center_to_target_box()
+        center_to_targets_starts = geo_data.center_to_tree_targets().starts
+        qbx_center_to_target_box_source_level = np.empty(
+                (tree.nlevels,), dtype=object)
+
+        for src_level in range(tree.nlevels):
+            qbx_center_to_target_box_source_level[src_level] = (
+                    geo_data.qbx_center_to_target_box_source_level(src_level))
+
+        with cl.CommandQueue(geo_data.cl_context) as queue:
+            global_qbx_centers = global_qbx_centers.get(
+                    queue=queue)
+            qbx_center_to_target_box = qbx_center_to_target_box.get(
+                    queue=queue)
+            center_to_targets_starts = center_to_targets_starts.get(
+                    queue=queue)
+            for src_level in range(tree.nlevels):
+                qbx_center_to_target_box_source_level[src_level] = (
+                        qbx_center_to_target_box_source_level[src_level]
+                        .get(queue=queue))
+
+        # {{{ form global qbx locals or evaluate target specific qbx expansions
+
+        if wrangler.using_tsqbx:
+            result.update(self.process_eval_target_specific_qbxl(
+                    xlat_cost, direct_interaction_data, global_qbx_centers,
+                    qbx_center_to_target_box, center_to_targets_starts))
+            result["form_global_qbx_locals_list1"] = 0
+            result["form_global_qbx_locals_list3"] = 0
+            result["form_global_qbx_locals_list4"] = 0
+
+        else:
+            result.update(self.process_form_qbxl(
+                    xlat_cost, direct_interaction_data, global_qbx_centers,
+                    qbx_center_to_target_box, center_to_targets_starts))
+            result["eval_target_specific_qbx_locals_list1"] = 0
+            result["eval_target_specific_qbx_locals_list3"] = 0
+            result["eval_target_specific_qbx_locals_list4"] = 0
+
+        # }}}
+
+        # {{{ translate from list 3 multipoles to qbx local expansions
+
+        result.update(self.process_m2qbxl(
+                xlat_cost, traversal, tree, global_qbx_centers,
+                qbx_center_to_target_box_source_level))
+
+        # }}}
+
+        # {{{ translate from box local expansions to qbx local expansions
+
+        result.update(self.process_l2qbxl(
+                xlat_cost, traversal, tree, global_qbx_centers,
+                qbx_center_to_target_box))
+
+        # }}}
+
+        # {{{ evaluate qbx local expansions
+
+        result.update(self.process_eval_qbxl(
+                xlat_cost, global_qbx_centers, center_to_targets_starts))
+
+        # }}}
+
+        return ParametrizedCosts(result, params)
+
+# }}}
+
+
+# {{{ calibrate cost model
+
+def _collect(expr, variables):
+    """Collect terms with respect to a list of variables.
+
+    This applies :func:`sympy.simplify.collect` to the a :mod:`pymbolic` expression
+    with respect to the iterable of names in *variables*.
+
+    Returns a dictionary mapping variable names to terms.
+    """
+    from pymbolic.interop.sympy import PymbolicToSympyMapper, SympyToPymbolicMapper
+    p2s = PymbolicToSympyMapper()
+    s2p = SympyToPymbolicMapper()
+
+    from sympy.simplify import collect
+    sympy_variables = [sp.var(v) for v in variables]
+    collect_result = collect(p2s(expr), sympy_variables, evaluate=False)
+
+    result = {}
+    for v in variables:
+        try:
+            result[v] = s2p(collect_result[sp.var(v)])
+        except KeyError:
+            continue
+
+    return result
+
+
+_FMM_STAGE_TO_CALIBRATION_PARAMETER = {
+        "form_multipoles": "c_p2m",
+        "coarsen_multipoles": "c_m2m",
+        "eval_direct": "c_p2p",
+        "multipole_to_local": "c_m2l",
+        "eval_multipoles": "c_m2p",
+        "form_locals": "c_p2l",
+        "refine_locals": "c_l2l",
+        "eval_locals": "c_l2p",
+        "form_global_qbx_locals": "c_p2qbxl",
+        "translate_box_multipoles_to_qbx_local": "c_m2qbxl",
+        "translate_box_local_to_qbx_local": "c_l2qbxl",
+        "eval_qbx_expansions": "c_qbxl2p",
+        "eval_target_specific_qbx_locals": "c_p2p_tsqbx",
+        }
+
+
+def estimate_calibration_params(model_results, timing_results):
+    """Given a set of model results and matching timing results, estimate the best
+    calibration parameters for the model.
+    """
+
+    params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values())
+
+    nresults = len(model_results)
+
+    if nresults != len(timing_results):
+        raise ValueError("must have same number of model and timing results")
+
+    uncalibrated_times = {}
+    actual_times = {}
+
+    for param in params:
+        uncalibrated_times[param] = np.zeros(nresults)
+        actual_times[param] = np.zeros(nresults)
+
+    from pymbolic import evaluate
+
+    for i, model_result in enumerate(model_results):
+        context = model_result.params.copy()
+        for param in params:
+            context[param] = var(param)
+
+        # Represents the total modeled cost, but leaves the calibration
+        # parameters symbolic.
+        total_modeled_cost = evaluate(
+                sum(model_result.raw_costs.values()),
+                context=context)
+
+        collected_times = _collect(total_modeled_cost, params)
+
+        for param, time in collected_times.items():
+            uncalibrated_times[param][i] = time
+
+    for i, timing_result in enumerate(timing_results):
+        for param, time in timing_result.items():
+            calibration_param = (
+                    _FMM_STAGE_TO_CALIBRATION_PARAMETER[param])
+            actual_times[calibration_param][i] = time["process_elapsed"]
+
+    result = {}
+
+    for param in params:
+        uncalibrated = uncalibrated_times[param]
+        actual = actual_times[param]
+
+        if np.allclose(uncalibrated, 0):
+            result[param] = float("NaN")
+            continue
+
+        result[param] = (
+                actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated))
+
+    return result
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py
index 3918914713e9f9b3605ce51ff74a9246cb1b7ca8..8b9ea4292868fc7c811f46d8fe8b966555b5784e 100644
--- a/pytential/qbx/fmm.py
+++ b/pytential/qbx/fmm.py
@@ -91,12 +91,14 @@ class QBXSumpyExpansionWranglerCodeContainer(SumpyExpansionWranglerCodeContainer
     def get_wrangler(self, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs={},
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            _use_target_specific_qbx=False):
         return QBXExpansionWrangler(self, queue, geo_data,
                 dtype,
                 qbx_order, fmm_level_to_order,
                 source_extra_kwargs,
-                kernel_extra_kwargs)
+                kernel_extra_kwargs,
+                _use_target_specific_qbx)
 
 
 class QBXExpansionWrangler(SumpyExpansionWrangler):
@@ -120,13 +122,18 @@ QBXFMMGeometryData.non_qbx_box_target_lists`),
 
     def __init__(self, code_container, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
-            source_extra_kwargs, kernel_extra_kwargs):
+            source_extra_kwargs, kernel_extra_kwargs,
+            _use_target_specific_qbx=None):
+        if _use_target_specific_qbx:
+            raise ValueError("TSQBX is not implemented in sumpy")
+
         SumpyExpansionWrangler.__init__(self,
                 code_container, queue, geo_data.tree(),
                 dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs)
 
         self.qbx_order = qbx_order
         self.geo_data = geo_data
+        self.using_tsqbx = False
 
     # {{{ data vector utilities
 
@@ -369,6 +376,10 @@ QBXFMMGeometryData.non_qbx_box_target_lists`),
 
         return (pot, SumpyTimingFuture(self.queue, events))
 
+    @log_process(logger)
+    def eval_target_specific_qbx_locals(self, src_weights):
+        return (self.full_output_zeros(), SumpyTimingFuture(self.queue, events=()))
+
     # }}}
 
 # }}}
@@ -376,7 +387,8 @@ QBXFMMGeometryData.non_qbx_box_target_lists`),
 
 # {{{ FMM top-level
 
-def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
+def drive_fmm(expansion_wrangler, src_weights, timing_data=None,
+        traversal=None):
     """Top-level driver routine for the QBX fast multipole calculation.
 
     :arg geo_data: A :class:`QBXFMMGeometryData` instance.
@@ -394,8 +406,12 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
     wrangler = expansion_wrangler
 
     geo_data = wrangler.geo_data
-    traversal = geo_data.traversal()
+
+    if traversal is None:
+        traversal = geo_data.traversal()
+
     tree = traversal.tree
+
     recorder = TimingRecorder()
 
     # Interface guidelines: Attributes of the tree are assumed to be known
@@ -515,6 +531,11 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
 
     # {{{ wrangle qbx expansions
 
+    # form_global_qbx_locals and eval_target_specific_qbx_locals are responsible
+    # for the same interactions (directly evaluated portion of the potentials
+    # via unified List 1).  Which one is used depends on the wrangler. If one of
+    # them is unused the corresponding output entries will be zero.
+
     qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights)
 
     recorder.add("form_global_qbx_locals", timing_future)
@@ -537,6 +558,12 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
 
     recorder.add("eval_qbx_expansions", timing_future)
 
+    ts_result, timing_future = wrangler.eval_target_specific_qbx_locals(src_weights)
+
+    qbx_potentials = qbx_potentials + ts_result
+
+    recorder.add("eval_target_specific_qbx_locals", timing_future)
+
     # }}}
 
     # {{{ reorder potentials
@@ -565,402 +592,6 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
 
     if timing_data is not None:
         timing_data.update(recorder.summarize())
-
-    return result
-
-# }}}
-
-
-# {{{ performance data
-
-def assemble_performance_data(geo_data, uses_pde_expansions,
-        translation_source_power=None, translation_target_power=None,
-        translation_max_power=None,
-        summarize_parallel=None, merge_close_lists=True):
-    """
-    :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM
-        uses translation operators that make use of the knowledge that the
-        potential satisfies a PDE.
-    :arg summarize_parallel: a function of two arguments
-        *(parallel_array, sym_multipliers)* used to process an array of
-        workloads of 'parallelizable units'. By default, all workloads are
-        summed into one number encompassing the total workload.
-    :arg merge_close_lists: A :class:`bool` indicating whether or not all
-        boxes requiring direct evaluation should be merged into a single
-        interaction list. If *False*, *part_direct* and *p2qbxl* will be
-        suffixed with the originating list as follows:
-
-        * *_neighbor* (List 1)
-        * *_sep_smaller* (List 3 close)
-        * *_sep_bigger* (List 4 close).
-    """
-
-    # FIXME: This should suport target filtering.
-
-    if summarize_parallel is None:
-        def summarize_parallel(parallel_array, sym_multipliers):  # noqa pylint:disable=function-redefined
-            return np.sum(parallel_array) * sym_multipliers
-
-    from collections import OrderedDict
-    result = OrderedDict()
-
-    from pymbolic import var
-    p_fmm = var("p_fmm")
-    p_qbx = var("p_qbx")
-
-    nqbtl = geo_data.non_qbx_box_target_lists()
-
-    with cl.CommandQueue(geo_data.cl_context) as queue:
-        tree = geo_data.tree().get(queue=queue)
-        traversal = geo_data.traversal(merge_close_lists).get(queue=queue)
-        box_target_counts_nonchild = (
-                nqbtl.box_target_counts_nonchild.get(queue=queue))
-
-    d = tree.dimensions
-    if uses_pde_expansions:
-        ncoeffs_fmm = p_fmm ** (d-1)
-        ncoeffs_qbx = p_qbx ** (d-1)
-
-        if d == 2:
-            default_translation_source_power = 1
-            default_translation_target_power = 1
-            default_translation_max_power = 0
-
-        elif d == 3:
-            # Based on a reading of FMMlib, i.e. a point-and-shoot FMM.
-            default_translation_source_power = 0
-            default_translation_target_power = 0
-            default_translation_max_power = 3
-
-        else:
-            raise ValueError("Don't know how to estimate expansion complexities "
-                    "for dimension %d" % d)
-
-    else:
-        ncoeffs_fmm = p_fmm ** d
-        ncoeffs_qbx = p_qbx ** d
-        default_translation_source_power = d
-        default_translation_target_power = d
-
-    if translation_source_power is None:
-        translation_source_power = default_translation_source_power
-    if translation_target_power is None:
-        translation_target_power = default_translation_target_power
-    if translation_max_power is None:
-        translation_max_power = default_translation_max_power
-
-    def xlat_cost(p_source, p_target):
-        from pymbolic.primitives import Max
-        return (
-                p_source ** translation_source_power
-                * p_target ** translation_target_power
-                * Max((p_source, p_target)) ** translation_max_power
-                )
-
-    result.update(
-            nlevels=tree.nlevels,
-            nboxes=tree.nboxes,
-            nsources=tree.nsources,
-            ntargets=tree.ntargets)
-
-    # {{{ construct local multipoles
-
-    result["form_mp"] = tree.nsources*ncoeffs_fmm
-
-    # }}}
-
-    # {{{ propagate multipoles upward
-
-    result["prop_upward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
-
-    # }}}
-
-    # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
-
-    def process_direct():
-        # box -> nsources * ntargets
-        npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-        npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-        npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_boxes):
-            ntargets = box_target_counts_nonchild[tgt_ibox]
-
-            npart_direct_list1_srcs = 0
-            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
-            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                npart_direct_list1_srcs += nsources
-
-            npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs
-
-            if merge_close_lists:
-                continue
-
-            npart_direct_list3_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_smaller_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    npart_direct_list3_srcs += nsources
-
-            npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs
-
-            npart_direct_list4_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_bigger_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    npart_direct_list4_srcs += nsources
-
-            npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs
-
-        if merge_close_lists:
-            result["part_direct"] = summarize_parallel(npart_direct_list1, 1)
-        else:
-            result["part_direct_neighbor"] = (
-                    summarize_parallel(npart_direct_list1, 1))
-            result["part_direct_sep_smaller"] = (
-                    summarize_parallel(npart_direct_list3, 1))
-            result["part_direct_sep_bigger"] = (
-                    summarize_parallel(npart_direct_list4, 1))
-
-    process_direct()
-
-    # }}}
-
-    # {{{ translate separated siblings' ("list 2") mpoles to local
-
-    def process_list2():
-        nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
-            start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
-
-            nm2l[itgt_box] += end-start
-
-        result["m2l"] = summarize_parallel(nm2l, xlat_cost(p_fmm, p_fmm))
-
-    process_list2()
-
-    # }}}
-
-    # {{{ evaluate sep. smaller mpoles ("list 3") at particles
-
-    def process_list3():
-        nmp_eval = np.zeros(
-                (tree.nlevels, len(traversal.target_boxes)),
-                dtype=np.intp)
-
-        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
-
-        for ilevel, sep_smaller_list in enumerate(
-                traversal.from_sep_smaller_by_level):
-            for itgt_box, tgt_ibox in enumerate(
-                        traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
-                ntargets = box_target_counts_nonchild[tgt_ibox]
-                start, end = sep_smaller_list.starts[itgt_box:itgt_box+2]
-                nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = (
-                        ntargets * (end-start)
-                )
-
-        result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm)
-
-    process_list3()
-
-    # }}}
-
-    # {{{ form locals for separated bigger source boxes ("list 4")
-
-    def process_list4():
-        nform_local = np.zeros(
-                len(traversal.target_or_target_parent_boxes),
-                dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
-            start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
-
-            nform_local_box = 0
-            for src_ibox in traversal.from_sep_bigger_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                nform_local_box += nsources
-
-            nform_local[itgt_box] = nform_local_box
-
-        result["form_local"] = summarize_parallel(nform_local, ncoeffs_fmm)
-
-    process_list4()
-
-    # }}}
-
-    # {{{ propagate local_exps downward
-
-    result["prop_downward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
-
-    # }}}
-
-    # {{{ evaluate locals
-
-    non_qbx_box_targets = geo_data.non_qbx_box_target_lists()
-    result["eval_part"] = non_qbx_box_targets.nfiltered_targets * ncoeffs_fmm
-
-    # }}}
-
-    # {{{ form global qbx locals
-
-    global_qbx_centers = geo_data.global_qbx_centers()
-
-    # If merge_close_lists is False above, then this builds another traversal
-    # (which is OK).
-    qbx_center_to_target_box = geo_data.qbx_center_to_target_box()
-    center_to_targets_starts = geo_data.center_to_tree_targets().starts
-    qbx_center_to_target_box_source_level = np.empty(
-        (tree.nlevels,), dtype=object
-    )
-
-    for src_level in range(tree.nlevels):
-        qbx_center_to_target_box_source_level[src_level] = (
-            geo_data.qbx_center_to_target_box_source_level(src_level)
-        )
-
-    with cl.CommandQueue(geo_data.cl_context) as queue:
-        global_qbx_centers = global_qbx_centers.get(
-                queue=queue)
-        qbx_center_to_target_box = qbx_center_to_target_box.get(
-                queue=queue)
-        center_to_targets_starts = center_to_targets_starts.get(
-                queue=queue)
-        for src_level in range(tree.nlevels):
-            qbx_center_to_target_box_source_level[src_level] = (
-                qbx_center_to_target_box_source_level[src_level].get(queue=queue)
-            )
-
-    def process_form_qbxl():
-        ncenters = geo_data.ncenters
-
-        result["ncenters"] = ncenters
-
-        # center -> nsources
-        np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-        np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-        np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-
-        for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
-            itgt_box = qbx_center_to_target_box[tgt_icenter]
-
-            np2qbxl_list1_srcs = 0
-            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
-            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                np2qbxl_list1_srcs += nsources
-
-            np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs
-
-            if merge_close_lists:
-                continue
-
-            np2qbxl_list3_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_smaller_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    np2qbxl_list3_srcs += nsources
-
-            np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs
-
-            np2qbxl_list4_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_bigger_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    np2qbxl_list4_srcs += nsources
-
-            np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs
-
-        if merge_close_lists:
-            result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx)
-        else:
-            result["p2qbxl_neighbor"] = (
-                    summarize_parallel(np2qbxl_list1, ncoeffs_qbx))
-            result["p2qbxl_sep_smaller"] = (
-                    summarize_parallel(np2qbxl_list3, ncoeffs_qbx))
-            result["p2qbxl_sep_bigger"] = (
-                    summarize_parallel(np2qbxl_list4, ncoeffs_qbx))
-
-    process_form_qbxl()
-
-    # }}}
-
-    # {{{ translate from list 3 multipoles to qbx local expansions
-
-    def process_m2qbxl():
-        nm2qbxl = np.zeros(
-                (tree.nlevels, len(global_qbx_centers)),
-                dtype=np.intp)
-
-        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
-
-        for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level):
-
-            for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
-                icontaining_tgt_box = qbx_center_to_target_box_source_level[
-                    isrc_level][tgt_icenter]
-
-                if icontaining_tgt_box == -1:
-                    continue
-
-                start, stop = (
-                        ssn.starts[icontaining_tgt_box],
-                        ssn.starts[icontaining_tgt_box+1])
-
-                nm2qbxl[isrc_level, itgt_center] += stop-start
-
-        result["m2qbxl"] = summarize_parallel(nm2qbxl, xlat_cost(p_fmm, p_qbx))
-
-    process_m2qbxl()
-
-    # }}}
-
-    # {{{ translate from box local expansions to qbx local expansions
-
-    result["l2qbxl"] = geo_data.ncenters * xlat_cost(p_fmm, p_qbx)
-
-    # }}}
-
-    # {{{ evaluate qbx local expansions
-
-    def process_eval_qbxl():
-        nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp)
-
-        for isrc_center, src_icenter in enumerate(global_qbx_centers):
-            start, end = center_to_targets_starts[src_icenter:src_icenter+2]
-            nqbx_eval[isrc_center] += end-start
-
-        result["qbxl2p"] = summarize_parallel(nqbx_eval, ncoeffs_qbx)
-
-    process_eval_qbxl()
-
-    # }}}
-
     return result
 
 # }}}
diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py
index 0cda4945db781af8582c88578177a840fb18b552..98df707e296fd8e0d128edb1dafcae7e7f6b7fa1 100644
--- a/pytential/qbx/fmmlib.py
+++ b/pytential/qbx/fmmlib.py
@@ -23,11 +23,14 @@ THE SOFTWARE.
 """
 
 import numpy as np
-from pytools import memoize_method, Record
+from pytools import Record, memoize_method
 import pyopencl as cl  # noqa
 import pyopencl.array  # noqa: F401
 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
-from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+from sumpy.kernel import (
+        LaplaceKernel, HelmholtzKernel, AxisTargetDerivative,
+        DirectionalSourceDerivative)
+import pytential.qbx.target_specific as ts
 
 
 from boxtree.tools import return_timing_data
@@ -55,80 +58,14 @@ class QBXFMMLibExpansionWranglerCodeContainer(object):
     def get_wrangler(self, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs={},
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            _use_target_specific_qbx=None):
 
         return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype,
                 qbx_order, fmm_level_to_order,
                 source_extra_kwargs,
-                kernel_extra_kwargs)
-
-# }}}
-
-
-# {{{ host geo data wrapper
-
-class ToHostTransferredGeoDataWrapper(object):
-    def __init__(self, queue, geo_data):
-        self.queue = queue
-        self.geo_data = geo_data
-
-    @memoize_method
-    def tree(self):
-        return self.traversal().tree
-
-    @memoize_method
-    def traversal(self):
-        return self.geo_data.traversal().get(queue=self.queue)
-
-    @property
-    def ncenters(self):
-        return self.geo_data.ncenters
-
-    @memoize_method
-    def centers(self):
-        return np.array([
-            ci.get(queue=self.queue)
-            for ci in self.geo_data.centers()])
-
-    @memoize_method
-    def expansion_radii(self):
-        return self.geo_data.expansion_radii().get(queue=self.queue)
-
-    @memoize_method
-    def global_qbx_centers(self):
-        return self.geo_data.global_qbx_centers().get(queue=self.queue)
-
-    @memoize_method
-    def qbx_center_to_target_box(self):
-        return self.geo_data.qbx_center_to_target_box().get(queue=self.queue)
-
-    @memoize_method
-    def qbx_center_to_target_box_source_level(self, source_level):
-        return self.geo_data.qbx_center_to_target_box_source_level(
-            source_level).get(queue=self.queue)
-
-    @memoize_method
-    def non_qbx_box_target_lists(self):
-        return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue)
-
-    @memoize_method
-    def center_to_tree_targets(self):
-        return self.geo_data.center_to_tree_targets().get(queue=self.queue)
-
-    @memoize_method
-    def all_targets(self):
-        """All (not just non-QBX) targets packaged into a single array."""
-        return np.array(list(self.tree().targets))
-
-    @memoize_method
-    def m2l_rotation_angles(self):
-        # Already on host
-        return self.geo_data.m2l_rotation_angles()
-
-    @memoize_method
-    def m2l_rotation_lists(self):
-        # Already on host
-        return self.geo_data.m2l_rotation_lists()
+                kernel_extra_kwargs,
+                _use_target_specific_qbx)
 
 # }}}
 
@@ -139,13 +76,15 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
     def __init__(self, code, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs,
-            kernel_extra_kwargs):
-
+            kernel_extra_kwargs,
+            _use_target_specific_qbx=None):
         self.code = code
         self.queue = queue
 
         # FMMLib is CPU-only. This wrapper gets the geometry out of
         # OpenCL-land.
+
+        from pytential.qbx.utils import ToHostTransferredGeoDataWrapper
         geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data)
 
         self.geo_data = geo_data
@@ -153,60 +92,70 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
         # {{{ digest out_kernels
 
-        from sumpy.kernel import AxisTargetDerivative, DirectionalSourceDerivative
-
-        k_names = []
+        ifgrad = False
+        outputs = []
         source_deriv_names = []
+        k_names = []
 
-        def is_supported_helmknl(knl):
-            if isinstance(knl, DirectionalSourceDerivative):
-                source_deriv_name = knl.dir_vec_name
-                knl = knl.inner_kernel
-            else:
-                source_deriv_name = None
-
-            if isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3]:
-                k_names.append(knl.helmholtz_k_name)
-                source_deriv_names.append(source_deriv_name)
-                return True
-            elif isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]:
-                k_names.append(None)
-                source_deriv_names.append(source_deriv_name)
-                return True
-
-            return False
+        using_tsqbx = (
+                _use_target_specific_qbx
+                # None means use by default if possible
+                or _use_target_specific_qbx is None)
 
-        ifgrad = False
-        outputs = []
         for out_knl in self.code.out_kernels:
-            if is_supported_helmknl(out_knl):
+            if not self.is_supported_helmknl_for_tsqbx(out_knl):
+                if _use_target_specific_qbx:
+                    raise ValueError("not all kernels passed support TSQBX")
+                using_tsqbx = False
+
+            if self.is_supported_helmknl(out_knl):
                 outputs.append(())
+                no_target_deriv_knl = out_knl
+
             elif (isinstance(out_knl, AxisTargetDerivative)
-                    and is_supported_helmknl(out_knl.inner_kernel)):
+                    and self.is_supported_helmknl(out_knl.inner_kernel)):
                 outputs.append((out_knl.axis,))
                 ifgrad = True
+                no_target_deriv_knl = out_knl.inner_kernel
+
             else:
-                raise NotImplementedError(
+                raise ValueError(
                         "only the 2/3D Laplace and Helmholtz kernel "
                         "and their derivatives are supported")
 
+            source_deriv_names.append(no_target_deriv_knl.dir_vec_name
+                    if isinstance(no_target_deriv_knl, DirectionalSourceDerivative)
+                    else None)
+
+            base_knl = out_knl.get_base_kernel()
+            k_names.append(base_knl.helmholtz_k_name
+                    if isinstance(base_knl, HelmholtzKernel)
+                    else None)
+
+        self.using_tsqbx = using_tsqbx
+        self.outputs = outputs
+
         from pytools import is_single_valued
+
         if not is_single_valued(source_deriv_names):
             raise ValueError("not all kernels passed are the same in "
                     "whether they represent a source derivative")
 
         source_deriv_name = source_deriv_names[0]
-        self.outputs = outputs
 
-        # }}}
+        if not is_single_valued(k_names):
+            raise ValueError("not all kernels passed have the same "
+                    "Helmholtz parameter")
+
+        k_name = k_names[0]
 
-        from pytools import single_valued
-        k_name = single_valued(k_names)
         if k_name is None:
             helmholtz_k = 0
         else:
             helmholtz_k = kernel_extra_kwargs[k_name]
 
+        # }}}
+
         dipole_vec = None
         if source_deriv_name is not None:
             dipole_vec = np.array([
@@ -215,7 +164,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
                     order="F")
 
         def inner_fmm_level_to_nterms(tree, level):
-            from sumpy.kernel import LaplaceKernel, HelmholtzKernel
             if helmholtz_k == 0:
                 return fmm_level_to_order(
                         LaplaceKernel(tree.dimensions),
@@ -237,6 +185,23 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
                 ifgrad=ifgrad)
 
+    @staticmethod
+    def is_supported_helmknl_for_tsqbx(knl):
+        # Supports at most one derivative.
+        if isinstance(knl, (DirectionalSourceDerivative, AxisTargetDerivative)):
+            knl = knl.inner_kernel
+
+        return (isinstance(knl, (LaplaceKernel, HelmholtzKernel))
+                and knl.dim == 3)
+
+    @staticmethod
+    def is_supported_helmknl(knl):
+        if isinstance(knl, DirectionalSourceDerivative):
+            knl = knl.inner_kernel
+
+        return (isinstance(knl, (LaplaceKernel, HelmholtzKernel))
+                and knl.dim in (2, 3))
+
     # {{{ data vector helpers
 
     def output_zeros(self):
@@ -288,6 +253,13 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
                 raise ValueError("element '%s' of outputs array not "
                         "understood" % out)
 
+    @memoize_method
+    def _get_single_centers_array(self):
+        return np.array([
+            self.geo_data.centers()[idim]
+            for idim in range(self.dim)
+            ], order="F")
+
     # }}}
 
     # {{{ override target lists to only hit non-QBX targets
@@ -316,6 +288,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
     @log_process(logger)
     @return_timing_data
     def form_global_qbx_locals(self, src_weights):
+        if self.using_tsqbx:
+            return self.qbx_local_expansion_zeros()
+
         geo_data = self.geo_data
         trav = geo_data.traversal()
 
@@ -584,7 +559,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
         taeval = self.get_expn_eval_routine("ta")
 
-        for isrc_center, src_icenter in enumerate(global_qbx_centers):
+        for src_icenter in global_qbx_centers:
             for icenter_tgt in range(
                     ctt.starts[src_icenter],
                     ctt.starts[src_icenter+1]):
@@ -604,6 +579,60 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
         return output
 
+    @log_process(logger)
+    @return_timing_data
+    def eval_target_specific_qbx_locals(self, src_weights):
+        if not self.using_tsqbx:
+            return self.full_output_zeros()
+
+        geo_data = self.geo_data
+        trav = geo_data.traversal()
+
+        ctt = geo_data.center_to_tree_targets()
+
+        src_weights = src_weights.astype(np.complex128)
+
+        ifcharge = self.dipole_vec is None
+        ifdipole = self.dipole_vec is not None
+
+        ifpot = any(not output for output in self.outputs)
+        ifgrad = self.ifgrad
+
+        # Create temporary output arrays for potential / gradient.
+        pot = np.zeros(self.tree.ntargets, np.complex) if ifpot else None
+        grad = (
+                np.zeros((self.dim, self.tree.ntargets), np.complex)
+                if ifgrad else None)
+
+        ts.eval_target_specific_qbx_locals(
+                ifpot=ifpot,
+                ifgrad=ifgrad,
+                ifcharge=ifcharge,
+                ifdipole=ifdipole,
+                order=self.qbx_order,
+                sources=self._get_single_sources_array(),
+                targets=geo_data.all_targets(),
+                centers=self._get_single_centers_array(),
+                qbx_centers=geo_data.global_qbx_centers(),
+                qbx_center_to_target_box=geo_data.qbx_center_to_target_box(),
+                center_to_target_starts=ctt.starts,
+                center_to_target_lists=ctt.lists,
+                source_box_starts=trav.neighbor_source_boxes_starts,
+                source_box_lists=trav.neighbor_source_boxes_lists,
+                box_source_starts=self.tree.box_source_starts,
+                box_source_counts_nonchild=self.tree.box_source_counts_nonchild,
+                helmholtz_k=self.kernel_kwargs.get("zk", 0),
+                charge=src_weights,
+                dipstr=src_weights,
+                dipvec=self.dipole_vec,
+                pot=pot,
+                grad=grad)
+
+        output = self.full_output_zeros()
+        self.add_potgrad_onto_output(output, slice(None), pot, grad)
+
+        return output
+
     def finalize_potentials(self, potential):
         potential = super(QBXFMMLibExpansionWrangler, self).finalize_potentials(
                 potential)
diff --git a/pytential/qbx/target_assoc.py b/pytential/qbx/target_assoc.py
index 01fd6eb653cbeda02d812589638f98ee4b0abfe5..06fe1472608113ebf63f306757606648b4a50e60 100644
--- a/pytential/qbx/target_assoc.py
+++ b/pytential/qbx/target_assoc.py
@@ -525,9 +525,9 @@ class TargetAssociationWrangler(TreeWranglerBase):
         return (found_target_close_to_panel == 1).all().get()
 
     @log_process(logger)
-    def try_find_centers(self, tree, peer_lists, lpot_source,
-                         target_status, target_flags, target_assoc,
-                         target_association_tolerance, debug, wait_for=None):
+    def find_centers(self, tree, peer_lists, lpot_source,
+                     target_status, target_flags, target_assoc,
+                     target_association_tolerance, debug, wait_for=None):
         # Round up level count--this gets included in the kernel as
         # a stack bound. Rounding avoids too many kernel versions.
         from pytools import div_ceil
@@ -752,7 +752,7 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler,
 
     target_flags = wrangler.make_target_flags(target_discrs_and_qbx_sides)
 
-    wrangler.try_find_centers(tree, peer_lists, lpot_source, target_status,
+    wrangler.find_centers(tree, peer_lists, lpot_source, target_status,
             target_flags, target_assoc, target_association_tolerance, debug)
 
     center_not_found = (
diff --git a/pytential/qbx/target_specific/__init__.py b/pytential/qbx/target_specific/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6f10bba8c64d193c69110a13b91cac15d4de6ab
--- /dev/null
+++ b/pytential/qbx/target_specific/__init__.py
@@ -0,0 +1,24 @@
+__copyright__ = "Copyright (C) 2018 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from .impl import *  # noqa
diff --git a/pytential/qbx/target_specific/helmholtz_utils.c b/pytential/qbx/target_specific/helmholtz_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c06b7fc22863a022edbe296a0830a4a8d7e2500
--- /dev/null
+++ b/pytential/qbx/target_specific/helmholtz_utils.c
@@ -0,0 +1,515 @@
+/* This file contains routines for evaluating spherical Bessel and Hankel
+   functions.
+
+   This is based on cdjseval3d.f and helmrouts3d.f from fmmlib3d, translated
+   with a hacked version of f2c and manually postprocessed. */
+
+/* Original copyright notice: */
+
+/* **********************************************************************
+
+Copyright (c) 2009-2012, Leslie Greengard, Zydrunas Gimbutas
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+********************************************************************** */
+
+#include <complex.h>
+
+/* declarations for f2c generated things */
+
+typedef int integer;
+typedef double doublereal;
+typedef double complex doublecomplex;
+
+static inline double z_abs(doublecomplex *z) {
+    return cabs(*z);
+}
+
+static inline void z_exp(doublecomplex *out, doublecomplex *z) {
+    *out = cexp(*z);
+}
+
+static inline void z_sin(doublecomplex *out, doublecomplex *z) {
+    *out = csin(*z);
+}
+
+static inline void z_cos(doublecomplex *out, doublecomplex *z) {
+    *out = ccos(*z);
+}
+
+/* Start of functions borrowed from cdjseval3d.f */
+
+/*     Computation of spherical Bessel functions via recurrence */
+
+/* ********************************************************************** */
+/* Subroutine */ int jfuns3d_(integer *ier, integer *nterms, doublecomplex *
+	z__, doublereal *scale, doublecomplex *fjs, integer *ifder,
+	doublecomplex *fjder, integer *lwfjs, integer *iscale, integer *ntop)
+{
+    /* Initialized data */
+
+    static doublereal upbound2 = 1e40;
+    static doublereal upbound2inv = 1e-40;
+    static doublereal tiny = 1e-200;
+    static doublereal done = 1.;
+    static doublereal zero = 0.;
+
+    /* System generated locals */
+    integer i__1;
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    integer i__;
+    doublereal d0, d1, dd, dc1, dc2;
+    doublecomplex fj0, fj1, zinv, ztmp;
+    doublereal dcoef;
+    doublereal sctot;
+    doublecomplex zscale;
+    doublereal scalinv;
+
+/* ********************************************************************** */
+
+/* PURPOSE: */
+
+/* 	This subroutine evaluates the first NTERMS spherical Bessel */
+/* 	functions and if required, their derivatives. */
+/* 	It incorporates a scaling parameter SCALE so that */
+
+/* 		fjs_n(z)=j_n(z)/SCALE^n */
+/* 		fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} */
+
+/* 	NOTE: The scaling parameter SCALE is meant to be used when */
+/*             abs(z) < 1, in which case we recommend setting */
+/* 	      SCALE = abs(z). This prevents the fjs_n from */
+/*             underflowing too rapidly. */
+/* 	      Otherwise, set SCALE=1. */
+/* 	      Do not set SCALE = abs(z) if z could take on the */
+/*             value zero. */
+/*             In an FMM, when forming an expansion from a collection of */
+/*             sources, set SCALE = min( abs(k*r), 1) */
+/*             where k is the Helmholtz parameter and r is the box dimension */
+/*             at the relevant level. */
+
+/* INPUT: */
+
+/*    nterms (integer): order of expansion of output array fjs */
+/*    z     (complex *16): argument of the spherical Bessel functions */
+/*    scale    (real *8) : scaling factor (discussed above) */
+/*    ifder  (integer): flag indicating whether to calculate "fjder" */
+/* 		          0	NO */
+/* 		          1	YES */
+/*    lwfjs  (integer): upper limit of input arrays */
+/*                         fjs(0:lwfjs) and iscale(0:lwfjs) */
+/*    iscale (integer): integer workspace used to keep track of */
+/*                         internal scaling */
+
+/* OUTPUT: */
+
+/*    ier    (integer): error return code */
+/*                         ier=0 normal return; */
+/*                         ier=8 insufficient array dimension lwfjs */
+/*    fjs   (complex *16): array of scaled Bessel functions. */
+/*    fjder (complex *16): array of derivs of scaled Bessel functions. */
+/*    ntop  (integer) : highest index in arrays fjs that is nonzero */
+
+/*       NOTE, that fjs and fjder arrays must be at least (nterms+2) */
+/*       complex *16 elements long. */
+
+
+
+
+/* ... Initializing ... */
+
+    *ier = 0;
+
+/*       set to asymptotic values if argument is sufficiently small */
+
+    if (z_abs(z__) < tiny) {
+	fjs[0] = done;
+	i__1 = *nterms;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    fjs[i__] = zero;
+	}
+
+	if (*ifder == 1) {
+	    i__1 = *nterms;
+	    for (i__ = 0; i__ <= i__1; ++i__) {
+		fjder[i__] = zero;
+	    }
+	    fjder[1] = done / (*scale * 3);
+	}
+
+	return 0;
+    }
+
+/* ... Step 1: recursion up to find ntop, starting from nterms */
+
+    *ntop = 0;
+    zinv = done / *z__;
+    fjs[*nterms] = done;
+    fjs[*nterms - 1] = zero;
+
+    i__1 = *lwfjs;
+    for (i__ = *nterms; i__ <= i__1; ++i__) {
+	dcoef = (i__ << 1) + done;
+	ztmp = dcoef * zinv * fjs[i__] - fjs[i__ - 1];
+	fjs[i__ + 1] = ztmp;
+
+/* Computing 2nd power */
+	d__1 = creal(ztmp);
+/* Computing 2nd power */
+	d__2 = cimag(ztmp);
+	dd = d__1 * d__1 + d__2 * d__2;
+	if (dd > upbound2) {
+	    *ntop = i__ + 1;
+	    break;
+	}
+    }
+    if (*ntop == 0) {
+	*ier = 8;
+	return 0;
+    }
+
+/* ... Step 2: Recursion back down to generate the unscaled jfuns: */
+/*             if magnitude exceeds UPBOUND2, rescale and continue the */
+/* 	      recursion (saving the order at which rescaling occurred */
+/* 	      in array iscale. */
+
+    i__1 = *ntop;
+    for (i__ = 0; i__ <= i__1; ++i__) {
+	iscale[i__] = 0;
+    }
+
+    fjs[*ntop] = zero;
+    fjs[*ntop - 1] = done;
+    for (i__ = *ntop - 1; i__ >= 1; --i__) {
+	dcoef = (i__ << 1) + done;
+	ztmp = dcoef * zinv * fjs[i__] - fjs[i__ + 1];
+	fjs[i__ - 1] = ztmp;
+
+/* Computing 2nd power */
+	d__1 = creal(ztmp);
+/* Computing 2nd power */
+	d__2 = cimag(ztmp);
+	dd = d__1 * d__1 + d__2 * d__2;
+	if (dd > upbound2) {
+	    fjs[i__] *= upbound2inv;
+	    fjs[i__ - 1] *= upbound2inv;
+	    iscale[i__] = 1;
+	}
+/* L2200: */
+    }
+
+/* ...  Step 3: go back up to the top and make sure that all */
+/*              Bessel functions are scaled by the same factor */
+/*              (i.e. the net total of times rescaling was invoked */
+/*              on the way down in the previous loop). */
+/*              At the same time, add scaling to fjs array. */
+
+    scalinv = done / *scale;
+    sctot = 1.;
+    i__1 = *ntop;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	sctot *= scalinv;
+	if (iscale[i__ - 1] == 1) {
+	    sctot *= upbound2inv;
+	}
+	fjs[i__] *= sctot;
+    }
+
+/* ... Determine the normalization parameter: */
+
+    z_sin(&z__1, z__);
+    fj0 = z__1 * zinv;
+    z_cos(&z__1, z__);
+    fj1 = fj0 * zinv - z__1 * zinv;
+
+    d0 = z_abs(&fj0);
+    d1 = z_abs(&fj1);
+    if (d1 > d0) {
+	zscale = fj1 / (fjs[1] * *scale);
+    } else {
+	zscale = fj0 / fjs[0];
+    }
+
+/* ... Scale the jfuns by zscale: */
+
+    ztmp = zscale;
+    i__1 = *nterms;
+    for (i__ = 0; i__ <= i__1; ++i__) {
+	fjs[i__] *= ztmp;
+    }
+
+/* ... Finally, calculate the derivatives if desired: */
+
+    if (*ifder == 1) {
+	fjs[*nterms + 1] *= ztmp;
+
+	fjder[0] = -fjs[1] * *scale;
+	i__1 = *nterms;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    dc1 = i__ / ((i__ << 1) + done);
+	    dc2 = done - dc1;
+	    dc1 *= scalinv;
+	    dc2 *= *scale;
+	    fjder[i__] = dc1 * fjs[i__ - 1] - dc2 * fjs[i__ + 1];
+	}
+    }
+    return 0;
+} /* jfuns3d_ */
+
+/* Start of functions borrowed from helmrouts3d.f */
+
+/*      This file contains the basic subroutines for */
+/*      forming and evaluating multipole (partial wave) expansions. */
+
+/*      Documentation is incomplete and ongoing... */
+
+
+/*      Remarks on scaling conventions. */
+
+/*      1)  Hankel and Bessel functions are consistently scaled as */
+/*       	hvec(n)= h_n(z)*scale^(n) */
+/*       	jvec(n)= j_n(z)/scale^(n) */
+
+/*          In some earlier FMM implementations, the convention */
+/*       	hvec(n)= h_n(z)*scale^(n+1) */
+/*          was sometimes used, leading to various obscure rescaling */
+/*          steps. */
+
+/*          scale should be of the order of |z| if |z| < 1. Otherwise, */
+/*          scale should be set to 1. */
+
+
+/*      2) There are many definitions of the spherical harmonics, */
+/*         which differ in terms of normalization constants. We */
+/*         adopt the following convention: */
+
+/*         For m>0, we define Y_n^m according to */
+
+/*         Y_n^m = \sqrt{2n+1} \sqrt{\frac{ (n-m)!}{(n+m)!}} \cdot */
+/*                 P_n^m(\cos \theta)  e^{i m phi} */
+/*         and */
+
+/*         Y_n^-m = dconjg( Y_n^m ) */
+
+/*         We omit the Condon-Shortley phase factor (-1)^m in the */
+/*         definition of Y_n^m for m<0. (This is standard in several */
+/*         communities.) */
+
+/*         We also omit the factor \sqrt{\frac{1}{4 \pi}}, so that */
+/*         the Y_n^m are orthogonal on the unit sphere but not */
+/*         orthonormal.  (This is also standard in several communities.) */
+/*         More precisely, */
+
+/*                 \int_S Y_n^m Y_n^m d\Omega = 4 \pi. */
+
+/*         Using our standard definition, the addition theorem takes */
+/*         the simple form */
+
+/*         e^( i k r}/(ikr) = */
+/*         \sum_n \sum_m  j_n(k|S|) Ylm*(S) h_n(k|T|) Ylm(T) */
+
+
+/* ----------------------------------------------------------------------- */
+/*      h3d01: computes h0, h1 (first two spherical Hankel fns.) */
+/*      h3dall: computes Hankel functions of all orders and scales them */
+/* ********************************************************************** */
+/* Subroutine */ static int h3d01_(doublecomplex *z__, doublecomplex *h0,
+	doublecomplex *h1)
+{
+    /* Initialized data */
+
+    static doublecomplex eye = I;
+    static doublereal thresh = 1e-15;
+    static doublereal done = 1.;
+
+    /* System generated locals */
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublecomplex cd;
+
+/* ********************************************************************** */
+
+/*     Compute spherical Hankel functions of order 0 and 1 */
+
+/*     h0(z)  =   exp(i*z)/(i*z), */
+/*     h1(z)  =   - h0' = -h0*(i-1/z) = h0*(1/z-i) */
+
+/* ----------------------------------------------------------------------- */
+/*     INPUT: */
+
+/* 	z   :  argument of Hankel functions */
+/*              if abs(z)<1.0d-15, returns zero. */
+
+/* ----------------------------------------------------------------------- */
+/*     OUTPUT: */
+
+/* 	h0  :  h0(z)    (spherical Hankel function of order 0). */
+/* 	h1  :  -h0'(z)  (spherical Hankel function of order 1). */
+
+/* ----------------------------------------------------------------------- */
+
+    if (z_abs(z__) < thresh) {
+	*h0 = 0.;
+	*h1 = 0.;
+	return 0;
+    }
+
+/*     Otherwise, use formula */
+
+    cd = eye * *z__;
+    z_exp(&z__1, &cd);
+    *h0 = z__1 / cd;
+    *h1 = *h0 * (done / *z__ - eye);
+
+    return 0;
+} /* h3d01_ */
+
+
+
+
+/* ********************************************************************** */
+/* Subroutine */ int h3dall_(integer *nterms, doublecomplex *z__, doublereal *
+	scale, doublecomplex *hvec, integer *ifder, doublecomplex *hder)
+{
+    /* Initialized data */
+    static doublereal thresh = 1e-15;
+    static doublereal done = 1.;
+
+    /* Builtin functions */
+    double z_abs(doublecomplex *);
+
+    /* Local variables */
+    integer i__;
+    integer i__1;
+    doublereal dtmp;
+    doublecomplex zinv, ztmp;
+    doublereal scal2;
+
+/* ********************************************************************** */
+
+/*     This subroutine computes scaled versions of the spherical Hankel */
+/*     functions h_n of orders 0 to nterms. */
+
+/*       	hvec(n)= h_n(z)*scale^(n) */
+
+/*     The parameter SCALE is useful when |z| < 1, in which case */
+/*     it damps out the rapid growth of h_n as n increases. In such */
+/*     cases, we recommend setting */
+
+/*               scale = |z| */
+
+/*     or something close. If |z| > 1, set scale = 1. */
+
+/*     If the flag IFDER is set to one, it also computes the */
+/*     derivatives of h_n. */
+
+/* 		hder(n)= h_n'(z)*scale^(n) */
+
+/*     NOTE: If |z| < 1.0d-15, the subroutine returns zero. */
+
+/* ----------------------------------------------------------------------- */
+/*     INPUT: */
+
+/*     nterms  : highest order of the Hankel functions to be computed. */
+/*     z       : argument of the Hankel functions. */
+/*     scale   : scaling parameter discussed above */
+/*     ifder   : flag indcating whether derivatives should be computed. */
+/* 		ifder = 1   ==> compute */
+/* 		ifder = 0   ==> do not compute */
+
+/* ----------------------------------------------------------------------- */
+/*     OUTPUT: */
+
+/*     hvec    : the vector of spherical Hankel functions */
+/*     hder    : the derivatives of the spherical Hankel functions */
+
+/* ----------------------------------------------------------------------- */
+
+
+/*     If |z| < thresh, return zeros. */
+
+    if (z_abs(z__) < thresh) {
+	i__1 = *nterms;
+	for (i__ = 0; i__ <= i__1; ++i__) {
+	    hvec[i__] = 0;
+	    hder[i__] = 0;
+	}
+	return 0;
+    }
+
+/*     Otherwise, get h_0 and h_1 analytically and the rest via */
+/*     recursion. */
+
+    h3d01_(z__, hvec, &hvec[1]);
+    hvec[0] = hvec[0];
+    hvec[1] *= *scale;
+
+/*     From Abramowitz and Stegun (10.1.19) */
+
+/*     h_{n+1}(z)=(2n+1)/z * h_n(z) - h_{n-1}(z) */
+
+/*     With scaling: */
+
+/*     hvec(n+1)=scale*(2n+1)/z * hvec(n) -(scale**2) hvec(n-1) */
+
+    scal2 = *scale * *scale;
+    zinv = *scale / *z__;
+    i__1 = *nterms - 1;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	dtmp = (i__ << 1) + done;
+	ztmp = zinv * dtmp;
+	hvec[i__ + 1] = ztmp * hvec[i__] - scal2 * hvec[i__ - 1];
+    }
+
+/*     From Abramowitz and Stegun (10.1.21) */
+
+/* 	h_{n}'(z)= h_{n-1}(z) - (n+1)/z * h_n(z) */
+
+/*     With scaling: */
+
+/*     hder(n)=scale* hvec(n-1) - (n+1)/z * hvec(n) */
+
+
+    if (*ifder == 1) {
+
+	hder[0] = -hvec[1] / *scale;
+	zinv = 1. / *z__;
+	i__1 = *nterms;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    dtmp = i__ + done;
+	    ztmp = zinv * dtmp;
+	    hder[i__] = *scale * hvec[i__ - 1] - ztmp * hvec[i__];
+	}
+    }
+
+    return 0;
+} /* h3dall_ */
diff --git a/pytential/qbx/target_specific/helmholtz_utils.h b/pytential/qbx/target_specific/helmholtz_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d36b3b1206830f43d31c67551aab9b00dbd80252
--- /dev/null
+++ b/pytential/qbx/target_specific/helmholtz_utils.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2018 Matt Wala
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef HELMHOLTZ_UTILS_H
+#define HELMHOLTZ_UTILS_H
+
+#include <complex.h>
+
+extern int jfuns3d_(int *ier, int *nterms, double complex *z,
+                    double *scale, double complex *fjs, int *ifder,
+                    double complex *fjder, int *lwfjs, int *iscale,
+                    int *ntop);
+
+extern int h3dall_(int *nterms, double complex *z, double *scale,
+		   double complex *hvec, int *ifder, double complex *hder);
+
+#endif  /* HELMHOLTZ_UTILS_H */
diff --git a/pytential/qbx/target_specific/impl.h b/pytential/qbx/target_specific/impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d7281b4d3e45cafdc74c2243bf2ca5dbca90abd
--- /dev/null
+++ b/pytential/qbx/target_specific/impl.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2018 Matt Wala
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef IMPL_H
+#define IMPL_H
+
+/* Temporary buffer size for holding e.g. Legendre polynomial values */
+#define BUFSIZE 64
+
+/* Padding for false sharing prevention */
+#define PADDING 65
+
+#endif  /* IMPL_H */
diff --git a/pytential/qbx/target_specific/impl.pyx b/pytential/qbx/target_specific/impl.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..37884f3f522af70946d82b5ea07974aa617ace28
--- /dev/null
+++ b/pytential/qbx/target_specific/impl.pyx
@@ -0,0 +1,786 @@
+# cython: warn.unused=True, warn.unused_arg=True, warn.unreachable=True
+# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
+# cython: embedsignature=True, language_level=3
+
+"""Copyright (C) 2018 Matt Wala
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import cython
+import cython.parallel
+
+from libc.math cimport sqrt
+from libc.stdio cimport printf, fprintf, stderr
+from libc.stdlib cimport abort
+
+cimport openmp
+
+
+cdef extern from "complex.h" nogil:
+    double cabs(double complex)
+
+
+cdef extern from "helmholtz_utils.h" nogil:
+    int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale,
+                 double complex *fjs, int *ifder, double complex *fjder,
+                 int *lwfjs, int *iscale, int *ntop);
+    int h3dall_(int *nterms, double complex *z, double *scale,
+		double complex *hvec, int *ifder, double complex *hder);
+
+
+cdef extern from "impl.h" nogil:
+    const int BUFSIZE
+    const int PADDING
+
+
+# {{{ (externally visible) wrappers for bessel / hankel functions
+
+def jfuns3d_wrapper(nterms, z, scale, fjs, fjder):
+    """Evaluate spherical Bessel functions.
+
+    Arguments:
+        nterms: Highest order to be computed
+        z: Argument
+        scale: Output scaling factor (recommended: min(abs(z), 1))
+        fjs: Output array of complex doubles
+        fjder: *None*, or output array of complex double derivatives
+    """
+    cdef:
+        double complex[BUFSIZE] fjstemp
+        double complex[BUFSIZE] fjdertmp
+        int[BUFSIZE] iscale
+        int ier, ifder, lwfjs, ntop, i, nterms_
+        double scale_
+        double complex z_
+
+    if nterms <= 0:
+        raise ValueError("nterms should be positive")
+
+    nterms_ = nterms
+    z_ = z
+    scale_ = scale
+    ifder = fjder is not None
+    lwfjs = BUFSIZE
+
+    jfuns3d_(&ier, &nterms_, &z_, &scale_, fjstemp, &ifder, fjdertmp, &lwfjs,
+             iscale, &ntop)
+
+    if ier:
+        raise ValueError("jfuns3d_ returned error code %d" % ier)
+
+    for i in range(1 + nterms):
+        fjs[i] = fjstemp[i]
+        if ifder:
+            fjder[i] = fjdertmp[i]
+
+
+def h3dall_wrapper(nterms, z, scale, hs, hders):
+    """Evaluate spherical Hankel functions.
+
+    Arguments:
+        nterms: Highest order to be computed
+        z: Argument
+        scale: Output scaling factor (recommended: min(abs(z), 1))
+        hs: Output array of complex doubles
+        hders: *None*, or output array of complex double derivatives
+    """
+    cdef:
+        int nterms_, ifder
+        double scale_
+        double complex z_
+        double complex[:] hvec = np.empty(1 + nterms, np.complex)
+        double complex[:] hdervec = np.empty(1 + nterms, np.complex)
+
+    ifder = hders is not None
+
+    if nterms <= 0:
+        raise ValueError("nterms should be positive")
+
+    z_ = z
+    scale_ = scale
+    nterms_ = nterms
+
+    h3dall_(&nterms_, &z_, &scale_, &hvec[0], &ifder, &hdervec[0])
+
+    hs[:1 + nterms] = hvec[:]
+    if ifder:
+        hders[:1 + nterms] = hdervec[:]
+
+# }}}
+
+
+# {{{ helpers
+
+cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil:
+    """Compute the values of the Legendre polynomial up to order n at x.
+    Optionally, if derivs is non-NULL, compute the values of the derivative too.
+
+    Borrowed from fmmlib.
+    """
+    cdef:
+        double pj, derj, pjm2, pjm1, derjm2, derjm1
+        int j
+
+    pjm2 = 1
+    pjm1 = x
+
+    vals[0] = 1
+    if derivs != NULL:
+        derivs[0] = 0
+        derjm2 = 0
+        derjm1 = 1
+
+    if n == 0:
+        return
+
+    vals[1] = x
+    if derivs != NULL:
+        derivs[1] = 1
+
+    if n == 1:
+        return
+
+    for j in range(2, n + 1):
+        pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j
+        vals[j] = pj
+
+        if derivs != NULL:
+            derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2
+            derj = derj / j
+            derivs[j] = derj
+            derjm2 = derjm1
+            derjm1 = derj
+
+        pjm2 = pjm1
+        pjm1 = pj
+
+
+cdef double dist(double[3] a, double[3] b) nogil:
+    """Calculate the Euclidean distance between a and b."""
+    return sqrt(
+            (a[0] - b[0]) * (a[0] - b[0]) +
+            (a[1] - b[1]) * (a[1] - b[1]) +
+            (a[2] - b[2]) * (a[2] - b[2]))
+
+
+cdef void ts_helmholtz_precompute(
+        double[3] center,
+        double[3] target,
+        int order,
+        int ifder,
+        double complex k,
+        double complex[] jvals,
+        double complex[] jderivs,
+        double *jscale) nogil:
+    """Evaluate the source-invariant Bessel terms of the Helmholtz target-specific
+    expansion."""
+
+    cdef:
+        double complex z
+        double tc_d
+        int ier, ntop, lwfjs
+        int[BUFSIZE] iscale
+
+    tc_d = dist(target, center)
+    jscale[0] = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1
+
+    # Evaluate the spherical Bessel terms.
+    z = k * tc_d
+    lwfjs = BUFSIZE
+    # jfuns3d_ only supports order > 0 (goes out of bounds if order = 0)
+    order = max(1, order)
+    jfuns3d_(&ier, &order, &z, jscale, jvals, &ifder, jderivs, &lwfjs, iscale,
+             &ntop)
+    if ier:
+        # This could in theory fail.
+        fprintf(stderr, "array passed to jfuns3d_ was too small\n")
+        abort()
+
+# }}}
+
+
+# {{{ Laplace S
+
+cdef double complex ts_laplace_s(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double complex charge,
+        int order) nogil:
+    """Evaluate the target-specific expansion of the Laplace single-layer kernel."""
+
+    cdef:
+        double j
+        double result, r, sc_d, tc_d, cos_angle
+        # Legendre recurrence values
+        double pj, pjm1, pjm2
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    cos_angle = ((
+            (target[0] - center[0]) * (source[0] - center[0]) +
+            (target[1] - center[1]) * (source[1] - center[1]) +
+            (target[2] - center[2]) * (source[2] - center[2]))
+            / (tc_d * sc_d))
+
+    if order == 0:
+        return charge / sc_d
+
+    pjm2 = 1
+    pjm1 = cos_angle
+
+    result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d)
+
+    r = (tc_d * tc_d) / (sc_d * sc_d * sc_d)
+
+    # Invariant: j matches loop counter. Using a double-precision version of the
+    # loop counter avoids an int-to-double conversion inside the loop.
+    j = 2.
+
+    for _ in range(2, order + 1):
+        pj = ( (2.*j-1.)*cos_angle*pjm1-(j-1.)*pjm2 ) / j
+        result += pj * r
+
+        r *= (tc_d / sc_d)
+        j += 1
+        pjm2 = pjm1
+        pjm1 = pj
+
+    return charge * result
+
+# }}}
+
+
+# {{{ Laplace grad(S)
+
+cdef void ts_laplace_sp(
+        double complex[3] grad,
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double complex charge,
+        int order) nogil:
+    """Evaluate the target-specific expansion of the gradient of the Laplace
+    single-layer kernel."""
+
+    cdef:
+        double[3] grad_tmp
+        double sc_d, tc_d, cos_angle, Rn
+        double[BUFSIZE] lvals, lderivs
+        double[3] smc, tmc
+        int n
+
+    for m in range(3):
+        smc[m] = source[m] - center[m]
+        tmc[m] = target[m] - center[m]
+        grad_tmp[m] = 0
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d)
+    legvals(cos_angle, order, lvals, lderivs)
+
+    # Invariant: Rn = tc_d ** (n - 1) / sc_d ** (n + 1)
+    Rn = 1 / (sc_d * sc_d)
+
+    for n in range(1, 1 + order):
+        for m in range(3):
+            grad_tmp[m] += Rn * (
+                n * (tmc[m] / tc_d) * lvals[n]
+                + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d) * lderivs[n])
+        Rn *= tc_d / sc_d
+
+    for m in range(3):
+        grad[m] += charge * grad_tmp[m]
+
+# }}}
+
+
+# {{{ Laplace D
+
+cdef double complex ts_laplace_d(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double[3] dipole,
+        double complex dipstr,
+        int order) nogil:
+    """Evaluate the target-specific expansion of the Laplace double-layer kernel."""
+
+    cdef:
+        int n, m
+        double sc_d, tc_d, cos_angle, Rn
+        double[BUFSIZE] lvals, lderivs
+        double[3] smc, tmc, grad
+
+    for m in range(3):
+        smc[m] = source[m] - center[m]
+        tmc[m] = target[m] - center[m]
+        grad[m] = 0
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d)
+    legvals(cos_angle, order, lvals, lderivs)
+
+    # Invariant: Rn = (tc_d ** n / sc_d ** (n + 2))
+    Rn = 1 / (sc_d * sc_d)
+
+    for n in range(0, order + 1):
+        for m in range(3):
+            grad[m] += Rn * (
+                    -(n + 1) * (smc[m] / sc_d) * lvals[n]
+                    + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d) * lderivs[n])
+        Rn *= (tc_d / sc_d)
+
+    return dipstr * (
+            dipole[0] * grad[0] + dipole[1] * grad[1] + dipole[2] * grad[2])
+
+# }}}
+
+
+# {{{ Helmholtz S
+
+cdef double complex ts_helmholtz_s(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double complex charge,
+        int order,
+        double complex k,
+        double complex[] jvals,
+        double jscale) nogil:
+    """Evaluate the target-specific expansion of the Helmholtz single-layer
+    kernel."""
+
+    cdef:
+        int n, ifder
+        double sc_d, tc_d, cos_angle
+        double[BUFSIZE] lvals
+        double complex[BUFSIZE] hvals
+        double hscale, unscale
+        double complex z, result
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    cos_angle = ((
+            (target[0] - center[0]) * (source[0] - center[0]) +
+            (target[1] - center[1]) * (source[1] - center[1]) +
+            (target[2] - center[2]) * (source[2] - center[2]))
+            / (tc_d * sc_d))
+
+    # Evaluate the Legendre terms.
+    legvals(cos_angle, order, lvals, NULL)
+
+    # Scaling magic for Hankel terms.
+    # These values are taken from the fmmlib documentation.
+    hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1
+    # unscale = (jscale / hscale) ** n
+    # Multiply against unscale to remove the scaling.
+    unscale = 1
+
+    # Evaluate the spherical Hankel terms.
+    z = k * sc_d
+    ifder = 0
+    h3dall_(&order, &z, &hscale, hvals, &ifder, NULL)
+
+    result = 0
+
+    for n in range(1 + order):
+        result += (2 * n + 1) * unscale * (jvals[n] * hvals[n] * lvals[n])
+        unscale *= jscale / hscale
+
+    return 1j * k * charge * result
+
+# }}}
+
+
+# {{{ Helmholtz grad(S)
+
+cdef void ts_helmholtz_sp(
+        double complex[3] grad,
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double complex charge,
+        int order,
+        double complex k,
+        double complex[] jvals,
+        double complex[] jderivs,
+        double jscale) nogil:
+    """Evaluate the target-specific expansion of the gradient of the Helmholtz
+    single-layer kernel."""
+
+    cdef:
+        int n, m
+        int ifder
+        double sc_d, tc_d, cos_angle
+        double[3] smc, tmc
+        double complex[3] grad_tmp
+        double[BUFSIZE] lvals, lderivs
+        double complex z
+        double complex [BUFSIZE] hvals
+        double hscale, unscale
+
+    for m in range(3):
+        smc[m] = source[m] - center[m]
+        tmc[m] = target[m] - center[m]
+        grad_tmp[m] = 0
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    # Evaluate the Legendre terms.
+    cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d)
+    legvals(cos_angle, order, lvals, lderivs)
+
+    # Scaling magic for Hankel terms.
+    # These values are taken from the fmmlib documentation.
+    hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1
+    # unscale = (jscale / hscale) ** n
+    # Multiply against unscale to remove the scaling.
+    unscale = 1
+
+    # Evaluate the spherical Hankel terms.
+    z = k * sc_d
+    ifder = 0
+    h3dall_(&order, &z, &hscale, hvals, &ifder, NULL)
+
+    #
+    # This is a mess, but amounts to the t-gradient of:
+    #
+    #      __ order
+    #  ik \         (2n  +  1) j (k |t - c|) h (k |s - c|) P (cos θ)
+    #     /__ n = 0             n             n             n
+    #
+    #
+    for n in range(0, order + 1):
+        for m in range(3):
+            grad_tmp[m] += (2 * n + 1) * unscale * hvals[n] / tc_d * (
+                    k * jderivs[n] * lvals[n] * tmc[m]
+                    + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d)
+                    * jvals[n] * lderivs[n])
+        unscale *= jscale / hscale
+
+    for m in range(3):
+        grad[m] += 1j * k * charge * grad_tmp[m]
+
+# }}}
+
+
+# {{{ Helmholtz D
+
+cdef double complex ts_helmholtz_d(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        double[3] dipole,
+        double complex dipstr,
+        int order,
+        double complex k,
+        double complex[] jvals,
+        double jscale) nogil:
+    """Evaluate the target-specific expansion of the Helmholtz double-layer
+    kernel."""
+
+    cdef:
+        int n, m
+        int ifder
+        double sc_d, tc_d, cos_angle
+        double[3] smc, tmc
+        double complex[3] grad
+        double[BUFSIZE] lvals, lderivs
+        double complex z
+        double complex [BUFSIZE] hvals, hderivs
+        double hscale, unscale
+
+    for m in range(3):
+        smc[m] = source[m] - center[m]
+        tmc[m] = target[m] - center[m]
+        grad[m] = 0
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+
+    cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d)
+
+    # Evaluate the Legendre terms.
+    legvals(cos_angle, order, lvals, lderivs)
+
+    # Scaling magic for Hankel terms.
+    # These values are taken from the fmmlib documentation.
+    hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1
+    # unscale = (jscale / hscale) ** n
+    # Multiply against unscale to remove the scaling.
+    unscale = 1
+
+    # Evaluate the spherical Hankel terms.
+    z = k * sc_d
+    ifder = 1
+    h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs)
+
+    #
+    # This is a mess, but amounts to the s-gradient of:
+    #
+    #      __ order
+    #  ik \         (2n  +  1) j (k |t - c|) h (k |s - c|) P (cos θ)
+    #     /__ n = 0             n             n             n
+    #
+    #
+    for n in range(0, order + 1):
+        for m in range(3):
+            grad[m] += (2 * n + 1) * unscale * jvals[n] / sc_d * (
+                    k * smc[m] * hderivs[n] * lvals[n]
+                    + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d)
+                    * hvals[n] * lderivs[n])
+        unscale *= jscale / hscale
+
+    return 1j * k * dipstr * (
+            grad[0] * dipole[0] + grad[1] * dipole[1] + grad[2] * dipole[2])
+
+# }}}
+
+
+def eval_target_specific_qbx_locals(
+        int ifpot,
+        int ifgrad,
+        int ifcharge,
+        int ifdipole,
+        int order,
+        double[:,:] sources,
+        double[:,:] targets,
+        double[:,:] centers,
+        int[:] qbx_centers,
+        int[:] qbx_center_to_target_box,
+        int[:] center_to_target_starts, int[:] center_to_target_lists,
+        int[:] source_box_starts, int[:] source_box_lists,
+        int[:] box_source_starts, int[:] box_source_counts_nonchild,
+        double complex helmholtz_k,
+        double complex[:] charge,
+        double complex[:] dipstr,
+        double[:,:] dipvec,
+        double complex[:] pot,
+        double complex[:,:] grad):
+    """TSQBX entry point.
+
+    Arguments:
+        ifpot: Flag indicating whether to evaluate the potential
+        ifgrad: Flag indicating whether to evaluate the gradient of the potential
+        ifcharge: Flag indicating whether to include monopole sources
+        ifdipole: Flag indicating whether to include dipole sources
+        order: Expansion order
+        sources: Array of sources of shape (3, *nsrcs*)
+        targets: Array of targets of shape (3, *ntgts*)
+        centers: Array of centers of shape (3, *nctrs*)
+        qbx_centers: Array of subset of indices into *centers* which are QBX centers
+        qbx_center_to_target_box: Array mapping centers to target box numbers
+        center_to_target_starts: "Start" indices for center-to-target CSR list
+        center_to_target_lists: Center-to-target CSR list
+        source_box_starts: "Start" indices for target-box-to-source-box CSR list
+        source_box_lists: Target-box-to-source-box CSR list
+        box_source_starts: "Start" indices for sources for each box
+        box_source_counts_nonchild: Number of sources per box
+        helmholtz_k: Helmholtz parameter (Pass 0 for Laplace)
+        charge: (Complex) Source strengths, shape (*nsrcs*,), or *None*
+        dipstr: (Complex) Dipole source strengths, shape (*nsrcs*,) or *None*
+        dipvec: (Real) Dipole source orientations, shape (3, *nsrcs*), or *None*
+        pot: (Complex) Output potential, shape (*ngts*,), or *None*
+        grad: (Complex) Output gradient, shape (3, *ntgts*), or *None*
+    """
+
+    cdef:
+        int tgt, ictr, ctr
+        int itgt, itgt_start, itgt_end
+        int tgt_box, src_ibox
+        int isrc_box, isrc_box_start, isrc_box_end
+        int isrc, isrc_start, isrc_end
+        int tid, m
+        double jscale
+        double complex result
+        double[:,:] source, center, target, dipole
+        double complex[:,:] result_grad, jvals, jderivs
+        int laplace_s, helmholtz_s, laplace_sp, helmholtz_sp, laplace_d, helmholtz_d
+
+    # {{{ process arguments
+
+    if ifcharge:
+        if charge is None:
+            raise ValueError("Missing charge")
+
+    if ifdipole:
+        if dipstr is None:
+            raise ValueError("Missing dipstr")
+        if dipvec is None:
+            raise ValueError("Missing dipvec")
+
+    if ifdipole and ifgrad:
+        raise ValueError("Does not support computing gradient of dipole sources")
+
+    if helmholtz_k == 0:
+        helmholtz_s = helmholtz_sp = helmholtz_d = 0
+
+        if ifpot:
+            laplace_s = ifcharge
+            laplace_d = ifdipole
+
+        if ifgrad:
+            laplace_sp = ifcharge
+
+    else:
+        laplace_s = laplace_sp = laplace_d = 0
+
+        if ifpot:
+            helmholtz_s = ifcharge
+            helmholtz_d = ifdipole
+
+        if ifgrad:
+            helmholtz_sp = ifcharge
+
+    # }}}
+
+    if not any([
+            laplace_s, laplace_sp, laplace_d, helmholtz_s, helmholtz_sp,
+            helmholtz_d]):
+        return
+
+    if qbx_centers.shape[0] == 0:
+        return
+
+    # {{{ set up thread-local storage
+
+    # Hack to obtain thread-local storage
+    maxthreads = openmp.omp_get_max_threads()
+
+    # Prevent false sharing by padding the thread-local buffers
+    source = np.zeros((maxthreads, PADDING))
+    target = np.zeros((maxthreads, PADDING))
+    center = np.zeros((maxthreads, PADDING))
+    dipole = np.zeros((maxthreads, PADDING))
+    result_grad = np.zeros((maxthreads, PADDING), dtype=np.complex)
+    jvals = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex)
+    jderivs = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex)
+
+    # TODO: Check that the order is not too high, since temporary
+    # arrays in this module that are limited by BUFSIZE may overflow
+    # if that is the case
+
+    # }}}
+
+    for ictr in cython.parallel.prange(0, qbx_centers.shape[0],
+                                       nogil=True, schedule="static",
+                                       chunksize=128):
+        # Assign to jscale so Cython marks it as private
+        jscale = 0
+        ctr = qbx_centers[ictr]
+        itgt_start = center_to_target_starts[ctr]
+        itgt_end = center_to_target_starts[ctr + 1]
+        tgt_box = qbx_center_to_target_box[ctr]
+        tid = cython.parallel.threadid()
+
+        for m in range(3):
+            center[tid, m] = centers[m, ctr]
+
+        for itgt in range(itgt_start, itgt_end):
+            result = 0
+            tgt = center_to_target_lists[itgt]
+
+            for m in range(3):
+                target[tid, m] = targets[m, tgt]
+                if ifgrad:
+                    result_grad[tid, m] = 0
+
+            if helmholtz_s or helmholtz_sp or helmholtz_d:
+                # Precompute source-invariant Helmholtz terms.
+                ts_helmholtz_precompute(
+                        &center[tid, 0], &target[tid, 0],
+                        order, ifgrad, helmholtz_k, &jvals[tid, 0],
+                        &jderivs[tid, 0], &jscale)
+
+            isrc_box_start = source_box_starts[tgt_box]
+            isrc_box_end = source_box_starts[tgt_box + 1]
+
+            for isrc_box in range(isrc_box_start, isrc_box_end):
+                src_ibox = source_box_lists[isrc_box]
+                isrc_start = box_source_starts[src_ibox]
+                isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
+
+                for isrc in range(isrc_start, isrc_end):
+
+                    for m in range(3):
+                        source[tid, m] = sources[m, isrc]
+                        if ifdipole:
+                            dipole[tid, m] = dipvec[m, isrc]
+
+                    # NOTE: Don't use +=, since that makes Cython think we are
+                    # doing an OpenMP reduction.
+
+                    # {{{ evaluate potentials
+
+                    if laplace_s:
+                        result = result + (
+                                ts_laplace_s(
+                                    &source[tid, 0], &center[tid, 0], &target[tid, 0],
+                                    charge[isrc], order))
+
+                    if laplace_sp:
+                        ts_laplace_sp(
+                                &result_grad[tid, 0],
+                                &source[tid, 0], &center[tid, 0], &target[tid, 0],
+                                charge[isrc], order)
+
+                    if laplace_d:
+                        result = result + (
+                                ts_laplace_d(
+                                    &source[tid, 0], &center[tid, 0], &target[tid, 0],
+                                    &dipole[tid, 0], dipstr[isrc], order))
+
+                    if helmholtz_s:
+                        result = result + (
+                                ts_helmholtz_s(&source[tid, 0], &center[tid, 0],
+                                    &target[tid, 0], charge[isrc], order, helmholtz_k,
+                                    &jvals[tid, 0], jscale))
+
+                    if helmholtz_sp:
+                        ts_helmholtz_sp(
+                                &result_grad[tid, 0],
+                                &source[tid, 0], &center[tid, 0], &target[tid, 0],
+                                charge[isrc], order, helmholtz_k,
+                                &jvals[tid, 0], &jderivs[tid, 0], jscale)
+
+                    if helmholtz_d:
+                        result = result + (
+                                ts_helmholtz_d(
+                                    &source[tid, 0], &center[tid, 0], &target[tid, 0],
+                                    &dipole[tid, 0], dipstr[isrc], order, helmholtz_k,
+                                    &jvals[tid, 0], jscale))
+
+                    # }}}
+
+            if ifpot:
+                pot[tgt] = result
+
+            if ifgrad:
+                for m in range(3):
+                    grad[m, tgt] = result_grad[tid, m]
diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py
index 194c90fc7eb9ff18a2d3c6bfb1da6df33a3f1951..f5a74dc1605a38ae51415e07a41b6cc60e491c7f 100644
--- a/pytential/qbx/utils.py
+++ b/pytential/qbx/utils.py
@@ -31,6 +31,7 @@ from boxtree.tree import Tree
 import pyopencl as cl
 import pyopencl.array # noqa
 from pytools import memoize_method
+from boxtree.pyfmmlib_integration import FMMLibRotationDataInterface
 
 import logging
 logger = logging.getLogger(__name__)
@@ -402,4 +403,78 @@ def build_tree_with_qbx_metadata(
 
 # }}}
 
+
+# {{{ host geo data wrapper
+
+class ToHostTransferredGeoDataWrapper(FMMLibRotationDataInterface):
+    """Wraps an instance of :class:`pytential.qbx.geometry.QBXFMMGeometryData`,
+    automatically converting returned OpenCL arrays to host data.
+    """
+
+    def __init__(self, queue, geo_data):
+        self.queue = queue
+        self.geo_data = geo_data
+
+    @memoize_method
+    def tree(self):
+        return self.geo_data.tree().get(queue=self.queue)
+
+    @memoize_method
+    def traversal(self):
+        return self.geo_data.traversal().get(queue=self.queue)
+
+    @property
+    def lpot_source(self):
+        return self.geo_data.lpot_source
+
+    @property
+    def ncenters(self):
+        return self.geo_data.ncenters
+
+    @memoize_method
+    def centers(self):
+        return np.array([
+            ci.get(queue=self.queue)
+            for ci in self.geo_data.centers()])
+
+    @memoize_method
+    def expansion_radii(self):
+        return self.geo_data.expansion_radii().get(queue=self.queue)
+
+    @memoize_method
+    def global_qbx_centers(self):
+        return self.geo_data.global_qbx_centers().get(queue=self.queue)
+
+    @memoize_method
+    def qbx_center_to_target_box(self):
+        return self.geo_data.qbx_center_to_target_box().get(queue=self.queue)
+
+    @memoize_method
+    def qbx_center_to_target_box_source_level(self, source_level):
+        return self.geo_data.qbx_center_to_target_box_source_level(
+            source_level).get(queue=self.queue)
+
+    @memoize_method
+    def non_qbx_box_target_lists(self):
+        return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue)
+
+    @memoize_method
+    def center_to_tree_targets(self):
+        return self.geo_data.center_to_tree_targets().get(queue=self.queue)
+
+    @memoize_method
+    def all_targets(self):
+        """All (not just non-QBX) targets packaged into a single array."""
+        return np.array(list(self.tree().targets))
+
+    def m2l_rotation_lists(self):
+        # Already on host
+        return self.geo_data.m2l_rotation_lists()
+
+    def m2l_rotation_angles(self):
+        # Already on host
+        return self.geo_data.m2l_rotation_angles()
+
+# }}}
+
 # vim: foldmethod=marker:filetype=pyopencl
diff --git a/pytential/source.py b/pytential/source.py
index 1a55878ef41a4d9b507c2d54562428bd723ea2c4..7b3e456b6850f4fcde309172432cc055db189831 100644
--- a/pytential/source.py
+++ b/pytential/source.py
@@ -27,6 +27,7 @@ import numpy as np  # noqa: F401
 import pyopencl as cl  # noqa: F401
 import six
 from pytools import memoize_method
+from sumpy.fmm import UnableToCollectTimingData
 
 
 __doc__ = """
@@ -62,6 +63,9 @@ class PointPotentialSource(PotentialSource):
         An :class:`pyopencl.array.Array` of shape ``[ambient_dim, nnodes]``.
 
     .. attribute:: nnodes
+
+    .. automethod:: cost_model_compute_potential_insn
+    .. automethod:: exec_compute_potential_insn
     """
 
     def __init__(self, cl_context, nodes):
@@ -123,7 +127,18 @@ class PointPotentialSource(PotentialSource):
 
         return p2p
 
-    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+    def cost_model_compute_potential_insn(self, queue, insn, bound_expr,
+                                          evaluate, costs):
+        raise NotImplementedError
+
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate,
+            return_timing_data):
+        if return_timing_data:
+            from warnings import warn
+            warn(
+                   "Timing data collection not supported.",
+                   category=UnableToCollectTimingData)
+
         p2p = None
 
         kernel_args = {}
@@ -147,7 +162,8 @@ class PointPotentialSource(PotentialSource):
 
             result.append((o.name, output_for_each_kernel[o.kernel_index]))
 
-        return result
+        timing_data = {}
+        return result, timing_data
 
     @memoize_method
     def weights_and_area_elements(self):
@@ -183,8 +199,9 @@ class LayerPotentialSourceBase(PotentialSource):
 
     .. rubric:: Execution
 
-    .. method:: weights_and_area_elements
-    .. method:: exec_compute_potential_insn
+    .. automethod:: weights_and_area_elements
+    .. automethod:: cost_model_compute_potential_insn
+    .. automethod:: exec_compute_potential_insn
     """
 
     def __init__(self, density_discr):
diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py
index 8f7ca490a22ccccc3c4e45a185ab9c596d429d31..86b10d24ad0e163506fc2e2b4dd004a6b7bf7c73 100644
--- a/pytential/symbolic/compiler.py
+++ b/pytential/symbolic/compiler.py
@@ -47,9 +47,6 @@ class Instruction(Record):
     def __str__(self):
         raise NotImplementedError
 
-    def get_exec_function(self, exec_mapper):
-        raise NotImplementedError
-
 
 class Assign(Instruction):
     # attributes: names, exprs, do_not_return, priority
@@ -111,9 +108,6 @@ class Assign(Instruction):
             lines.append("}")
             return "\n".join(lines)
 
-    def get_exec_function(self, exec_mapper):
-        return exec_mapper.exec_assign
-
     def __hash__(self):
         return id(self)
 
@@ -223,10 +217,6 @@ class ComputePotentialInstruction(Instruction):
         return "{ /* Pot(%s) */\n  %s\n}" % (
                 ", ".join(args), "\n  ".join(lines))
 
-    def get_exec_function(self, exec_mapper):
-        source = exec_mapper.bound_expr.places.get_geometry(self.source)
-        return source.exec_compute_potential_insn
-
     def __hash__(self):
         return id(self)
 
@@ -362,6 +352,14 @@ class Code(object):
 
         return argmax2(available_insns), discardable_vars
 
+    @staticmethod
+    def get_exec_function(insn, exec_mapper):
+        if isinstance(insn, Assign):
+            return exec_mapper.exec_assign
+        if isinstance(insn, ComputePotentialInstruction):
+            return exec_mapper.exec_compute_potential_insn
+        raise ValueError("unknown instruction class: %s" % type(insn))
+
     def execute(self, exec_mapper, pre_assign_check=None):
         """Execute the instruction stream, make all scheduling decisions
         dynamically.
@@ -389,7 +387,7 @@ class Code(object):
 
                 done_insns.add(insn)
                 assignments = (
-                        insn.get_exec_function(exec_mapper)
+                        self.get_exec_function(insn, exec_mapper)
                         (exec_mapper.queue, insn, exec_mapper.bound_expr,
                             exec_mapper))
 
diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py
index 5a8f8b786bd41209700a46a4cefbfee3cdf9a0ad..ca9696da46b821025b9c2a80024d90d5e0c6884d 100644
--- a/pytential/symbolic/execution.py
+++ b/pytential/symbolic/execution.py
@@ -29,7 +29,7 @@ import six
 from six.moves import zip
 
 from pymbolic.mapper.evaluator import (
-        EvaluationMapper as EvaluationMapperBase)
+        EvaluationMapper as PymbolicEvaluationMapper)
 import numpy as np
 
 import pyopencl as cl
@@ -42,6 +42,11 @@ from pytools import memoize_in
 from pytential import sym
 
 
+__doc__ = """
+.. autoclass :: BoundExpression
+"""
+
+
 # FIXME caches: fix up queues
 
 # {{{ evaluation mapper
@@ -62,11 +67,13 @@ def mesh_el_view(mesh, group_nr, global_array):
             + (group.nelements,))
 
 
-class EvaluationMapper(EvaluationMapperBase):
-    def __init__(self, bound_expr, queue, context={},
+class EvaluationMapperBase(PymbolicEvaluationMapper):
+    def __init__(self, bound_expr, queue, context=None,
             target_geometry=None,
             target_points=None, target_normals=None, target_tangents=None):
-        EvaluationMapperBase.__init__(self, context)
+        if context is None:
+            context = {}
+        PymbolicEvaluationMapper.__init__(self, context)
 
         self.bound_expr = bound_expr
         self.places = bound_expr.places
@@ -199,7 +206,7 @@ class EvaluationMapper(EvaluationMapperBase):
                 .with_queue(self.queue)
 
     def map_inverse(self, expr):
-        bound_op_cache = self.bound_expr.get_cache("bound_op")
+        bound_op_cache = self.bound_expr.places.get_cache("bound_op")
 
         try:
             bound_op = bound_op_cache[expr]
@@ -239,6 +246,9 @@ class EvaluationMapper(EvaluationMapperBase):
         return [(name, evaluate(expr))
                 for name, expr in zip(insn.names, insn.exprs)]
 
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+        raise NotImplementedError
+
     # {{{ functions
 
     def apply_real(self, args):
@@ -285,6 +295,77 @@ class EvaluationMapper(EvaluationMapperBase):
 # }}}
 
 
+# {{{ evaluation mapper
+
+class EvaluationMapper(EvaluationMapperBase):
+
+    def __init__(self, bound_expr, queue, context=None,
+            timing_data=None):
+        EvaluationMapperBase.__init__(self, bound_expr, queue, context)
+        self.timing_data = timing_data
+
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+        source = bound_expr.places.get_geometry(insn.source)
+
+        return_timing_data = self.timing_data is not None
+
+        result, timing_data = (
+                source.exec_compute_potential_insn(
+                    queue, insn, bound_expr, evaluate, return_timing_data))
+
+        if return_timing_data:
+            # The compiler ensures this.
+            assert insn not in self.timing_data
+
+            self.timing_data[insn] = timing_data
+
+        return result
+
+# }}}
+
+
+# {{{ cost model mapper
+
+class CostModelMapper(EvaluationMapperBase):
+    """Mapper for evaluating cost models.
+
+    This executes everything *except* the layer potential operator. Instead of
+    executing the operator, the cost model gets run and the cost
+    data is collected.
+    """
+
+    def __init__(self, bound_expr, queue, context=None,
+            target_geometry=None,
+            target_points=None, target_normals=None, target_tangents=None):
+        if context is None:
+            context = {}
+        EvaluationMapperBase.__init__(
+                self, bound_expr, queue, context,
+                target_geometry,
+                target_points,
+                target_normals,
+                target_tangents)
+        self.modeled_cost = {}
+
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+        source = bound_expr.places.get_geometry(insn.source)
+
+        result, cost_model_result = (
+                source.cost_model_compute_potential_insn(
+                    queue, insn, bound_expr, evaluate))
+
+        # The compiler ensures this.
+        assert insn not in self.modeled_cost
+
+        self.modeled_cost[insn] = cost_model_result
+        return result
+
+    def get_modeled_cost(self):
+        return self.modeled_cost
+
+# }}}
+
+
 # {{{ scipy-like mat-vec op
 
 class MatVecOp:
@@ -553,6 +634,17 @@ class GeometryCollection(object):
 # {{{ bound expression
 
 class BoundExpression(object):
+    """An expression readied for evaluation by binding it to a
+    :class:`GeometryCollection`.
+
+    .. automethod :: get_modeled_cost
+    .. automethod :: scipy_pop
+    .. automethod :: eval
+    .. automethod :: __call__
+
+    Created by calling :func:`bind`.
+    """
+
     def __init__(self, places, sym_op_expr):
         self.places = places
         self.sym_op_expr = sym_op_expr
@@ -561,12 +653,14 @@ class BoundExpression(object):
         from pytential.symbolic.compiler import OperatorCompiler
         self.code = OperatorCompiler(self.places)(sym_op_expr)
 
-    def get_cache(self, name):
-        return self.places.get_cache(name)
-
     def get_discretization(self, where):
         return self.places.get_discretization(where)
 
+    def get_modeled_cost(self, queue, **args):
+        cost_model_mapper = CostModelMapper(self, queue, args)
+        self.code.execute(cost_model_mapper)
+        return cost_model_mapper.get_modeled_cost()
+
     def scipy_op(self, queue, arg_name, dtype, domains=None, **extra_args):
         """
         :arg domains: a list of discretization identifiers or
@@ -575,6 +669,9 @@ class BoundExpression(object):
             is a scalar.  If *domains* is *None*,
             :class:`~pytential.symbolic.primitives.DEFAULT_TARGET` is required
             to be a key in :attr:`places`.
+        :returns: An object that (mostly) satisfies the
+            :mod:`scipy.linalg.LinearOperator` protocol, except for accepting
+            and returning :clas:`pyopencl.array.Array` arrays.
         """
 
         from pytools.obj_array import is_obj_array
@@ -605,10 +702,34 @@ class BoundExpression(object):
         return MatVecOp(self, queue,
                 arg_name, dtype, total_dofs, starts_and_ends, extra_args)
 
-    def __call__(self, queue, **args):
-        exec_mapper = EvaluationMapper(self, queue, args)
+    def eval(self, queue, context=None, timing_data=None):
+        """Evaluate the expression in *self*, using the
+        :clas:`pyopencl.CommandQueue` *queue* and the
+        input variables given in the dictionary *context*.
+
+        :arg timing_data: A dictionary into which timing
+            data will be inserted during evaluation.
+            (experimental)
+        :returns: the value of the expression, as a scalar,
+            :class:`pyopencl.array.Array`, or an object array of these.
+        """
+
+        if context is None:
+            context = {}
+        exec_mapper = EvaluationMapper(
+                self, queue, context, timing_data=timing_data)
         return self.code.execute(exec_mapper)
 
+    def __call__(self, queue, **args):
+        """Evaluate the expression in *self*, using the
+        :clas:`pyopencl.CommandQueue` *queue* and the
+        input variables given in the dictionary *context*.
+
+        :returns: the value of the expression, as a scalar,
+            :class:`pyopencl.array.Array`, or an object array of these.
+        """
+        return self.eval(queue, args)
+
 
 def bind(places, expr, auto_where=None):
     """
@@ -617,6 +738,11 @@ def bind(places, expr, auto_where=None):
         constructor can also be used.
     :arg auto_where: for simple source-to-self or source-to-target
         evaluations, find 'where' attributes automatically.
+    :arg expr: one or multiple expressions consisting of primitives
+        form :mod:`pytential.symbolic.primitives` (aka :mod:`pytential.sym`).
+        Multiple expressions can be combined into one object to pass here
+        in the form of a :mod:`numpy` object array
+    :returns: a :class:`BoundExpression`
     """
 
     if not isinstance(places, GeometryCollection):
diff --git a/pytential/unregularized.py b/pytential/unregularized.py
index 2ff748d18d31303fb1f73998bdd72851ca66e47e..fc39217e21293d8cc94b2a089ee0c9847dad1cf9 100644
--- a/pytential/unregularized.py
+++ b/pytential/unregularized.py
@@ -108,7 +108,15 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
                 density_discr=density_discr or self.density_discr,
                 debug=debug if debug is not None else self.debug)
 
-    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate,
+            return_timing_data):
+        if return_timing_data:
+            from warnings import warn
+            from pytential.source import UnableToCollectTimingData
+            warn(
+                   "Timing data collection not supported.",
+                   category=UnableToCollectTimingData)
+
         from pytools.obj_array import with_object_array_or_scalar
 
         def evaluate_wrapper(expr):
@@ -164,7 +172,8 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
 
             result.append((o.name, output_for_each_kernel[o.kernel_index]))
 
-        return result
+        timing_data = {}
+        return result, timing_data
 
     # {{{ fmm-based execution
 
@@ -246,8 +255,9 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
         # }}}
 
         from boxtree.fmm import drive_fmm
+        timing_data = {}
         all_potentials_on_every_tgt = drive_fmm(
-                geo_data.traversal(), wrangler, strengths)
+                geo_data.traversal(), wrangler, strengths, timing_data)
 
         # {{{ postprocess fmm
 
@@ -264,7 +274,7 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
 
         # }}}
 
-        return result
+        return result, timing_data
 
     # }}}
 
diff --git a/setup.py b/setup.py
index 8e1a66c4a570c22aae64e381c56548f57fad164f..d3095519c12a6dadcee3fb2ac431becc69d5e9fb 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,8 @@
 
 import os
 from setuptools import setup, find_packages
+from setuptools.extension import Extension
+from Cython.Build import cythonize
 
 
 # {{{ capture git revision at install time
@@ -54,6 +56,21 @@ write_git_revision("pytential")
 # }}}
 
 
+ext_modules = [
+        Extension(
+            "pytential.qbx.target_specific.impl",
+            sources=[
+                "pytential/qbx/target_specific/impl.pyx",
+                "pytential/qbx/target_specific/helmholtz_utils.c"],
+            depends=[
+                "pytential/qbx/target_specific/impl.h",
+                "pytential/qbx/target_specific/helmholtz_utils.h"],
+            extra_compile_args=["-Wall", "-fopenmp", "-Ofast"],
+            extra_link_args=["-fopenmp"]
+        ),
+]
+
+
 version_dict = {}
 init_filename = "pytential/version.py"
 os.environ["AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY"] = "1"
@@ -91,6 +108,8 @@ setup(name="pytential",
 
       packages=find_packages(),
 
+      ext_modules=cythonize(ext_modules),
+
       install_requires=[
           "pytest>=2.3",
           # FIXME leave out for now
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..482262ec28582f40adfdde816cb9ec408417e388
--- /dev/null
+++ b/test/test_cost_model.py
@@ -0,0 +1,565 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2018 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la  # noqa
+
+from boxtree.tools import ConstantOneExpansionWrangler
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+from pytools import one
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+
+from pytential import bind, sym, norm  # noqa
+from pytential.qbx.cost import CostModel
+
+
+# {{{ global params
+
+TARGET_ORDER = 8
+OVSMP_FACTOR = 5
+TCF = 0.9
+QBX_ORDER = 5
+FMM_ORDER = 10
+
+DEFAULT_LPOT_KWARGS = {
+        "_box_extent_norm": "l2",
+        "_from_sep_smaller_crit": "static_l2",
+        }
+
+
+def get_lpot_source(queue, dim):
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import (
+            InterpolatoryQuadratureSimplexGroupFactory)
+
+    target_order = TARGET_ORDER
+
+    if dim == 2:
+        from meshmode.mesh.generation import starfish, make_curve_mesh
+        mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order)
+    elif dim == 3:
+        from meshmode.mesh.generation import generate_torus
+        mesh = generate_torus(2, 1, order=target_order)
+    else:
+        raise ValueError("unsupported dimension: %d" % dim)
+
+    pre_density_discr = Discretization(
+            queue.context, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    lpot_kwargs = DEFAULT_LPOT_KWARGS.copy()
+    lpot_kwargs.update(
+            _expansion_stick_out_factor=TCF,
+            fmm_order=FMM_ORDER,
+            qbx_order=QBX_ORDER,
+            fmm_backend="fmmlib",
+            )
+
+    from pytential.qbx import QBXLayerPotentialSource
+    lpot_source = QBXLayerPotentialSource(
+            pre_density_discr, OVSMP_FACTOR*target_order,
+            **lpot_kwargs)
+
+    lpot_source, _ = lpot_source.with_refinement()
+
+    return lpot_source
+
+
+def get_density(queue, lpot_source):
+    density_discr = lpot_source.density_discr
+    nodes = density_discr.nodes().with_queue(queue)
+    return cl.clmath.sin(10 * nodes[0])
+
+# }}}
+
+
+# {{{ test that timing data gathering can execute succesfully
+
+def test_timing_data_gathering(ctx_getter):
+    """Test that timing data gathering can execute succesfully."""
+
+    pytest.importorskip("pyfmmlib")
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    lpot_source = get_lpot_source(queue, 2)
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    op_S = bind(lpot_source, sym_op_S)
+
+    timing_data = {}
+    op_S.eval(queue, dict(sigma=sigma), timing_data=timing_data)
+    assert timing_data
+    print(timing_data)
+
+# }}}
+
+
+# {{{ test cost model
+
+@pytest.mark.parametrize("dim, use_target_specific_qbx", (
+    (2, False),
+    (3, False),
+    (3, True)))
+def test_cost_model(ctx_getter, dim, use_target_specific_qbx):
+    """Test that cost model gathering can execute successfully."""
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    lpot_source = (
+            get_lpot_source(queue, dim)
+            .copy(
+                _use_target_specific_qbx=use_target_specific_qbx,
+                cost_model=CostModel()))
+
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+    op_S = bind(lpot_source, sym_op_S)
+    cost_S = op_S.get_modeled_cost(queue, sigma=sigma)
+    assert len(cost_S) == 1
+
+    sym_op_S_plus_D = (
+            sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+            + sym.D(k_sym, sigma_sym))
+    op_S_plus_D = bind(lpot_source, sym_op_S_plus_D)
+    cost_S_plus_D = op_S_plus_D.get_modeled_cost(queue, sigma=sigma)
+    assert len(cost_S_plus_D) == 2
+
+# }}}
+
+
+# {{{ test cost model metadata gathering
+
+def test_cost_model_metadata_gathering(ctx_getter):
+    """Test that the cost model correctly gathers metadata."""
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
+
+    fmm_level_to_order = SimpleExpansionOrderFinder(tol=1e-5)
+
+    lpot_source = get_lpot_source(queue, 2).copy(
+            fmm_level_to_order=fmm_level_to_order)
+
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = HelmholtzKernel(2, "k")
+    k = 2
+
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1, k=sym.var("k"))
+    op_S = bind(lpot_source, sym_op_S)
+
+    cost_S = one(op_S.get_modeled_cost(queue, sigma=sigma, k=k).values())
+
+    geo_data = lpot_source.qbx_fmm_geometry_data(
+            target_discrs_and_qbx_sides=((lpot_source.density_discr, 1),))
+
+    tree = geo_data.tree()
+
+    assert cost_S.params["p_qbx"] == QBX_ORDER
+    assert cost_S.params["nlevels"] == tree.nlevels
+    assert cost_S.params["nsources"] == tree.nsources
+    assert cost_S.params["ntargets"] == tree.ntargets
+    assert cost_S.params["ncenters"] == geo_data.ncenters
+
+    for level in range(tree.nlevels):
+        assert (
+                cost_S.params["p_fmm_lev%d" % level]
+                == fmm_level_to_order(k_sym, {"k": 2}, tree, level))
+
+# }}}
+
+
+# {{{ constant one wrangler
+
+class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler):
+
+    def __init__(self, queue, geo_data, use_target_specific_qbx):
+        from pytential.qbx.utils import ToHostTransferredGeoDataWrapper
+        geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data)
+
+        self.geo_data = geo_data
+        self.trav = geo_data.traversal()
+        self.using_tsqbx = (
+                use_target_specific_qbx
+                # None means use by default if possible
+                or use_target_specific_qbx is None)
+
+        ConstantOneExpansionWrangler.__init__(self, geo_data.tree())
+
+    def _get_target_slice(self, ibox):
+        non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists()
+        pstart = non_qbx_box_target_lists.box_target_starts[ibox]
+        return slice(
+                pstart, pstart
+                + non_qbx_box_target_lists.box_target_counts_nonchild[ibox])
+
+    def output_zeros(self):
+        non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists()
+        return np.zeros(non_qbx_box_target_lists.nfiltered_targets)
+
+    def full_output_zeros(self):
+        from pytools.obj_array import make_obj_array
+        return make_obj_array([np.zeros(self.tree.ntargets)])
+
+    def qbx_local_expansion_zeros(self):
+        return np.zeros(self.geo_data.ncenters)
+
+    def reorder_potentials(self, potentials):
+        raise NotImplementedError("reorder_potentials should not "
+                "be called on a QBXExpansionWrangler")
+
+    def form_global_qbx_locals(self, src_weights):
+        local_exps = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        if self.using_tsqbx:
+            return local_exps, self.timing_future(ops)
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for tgt_icenter in global_qbx_centers:
+            itgt_box = qbx_center_to_target_box[tgt_icenter]
+
+            start, end = (
+                    self.trav.neighbor_source_boxes_starts[itgt_box:itgt_box + 2])
+
+            src_sum = 0
+            for src_ibox in self.trav.neighbor_source_boxes_lists[start:end]:
+                src_pslice = self._get_source_slice(src_ibox)
+                ops += src_pslice.stop - src_pslice.start
+                src_sum += np.sum(src_weights[src_pslice])
+
+            local_exps[tgt_icenter] = src_sum
+
+        return local_exps, self.timing_future(ops)
+
+    def translate_box_multipoles_to_qbx_local(self, multipole_exps):
+        local_exps = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+
+        for isrc_level, ssn in enumerate(self.trav.from_sep_smaller_by_level):
+            for tgt_icenter in global_qbx_centers:
+                icontaining_tgt_box = (
+                        self.geo_data
+                        .qbx_center_to_target_box_source_level(isrc_level)
+                        [tgt_icenter])
+
+                if icontaining_tgt_box == -1:
+                    continue
+
+                start, stop = (
+                        ssn.starts[icontaining_tgt_box],
+                        ssn.starts[icontaining_tgt_box+1])
+
+                for src_ibox in ssn.lists[start:stop]:
+                    local_exps[tgt_icenter] += multipole_exps[src_ibox]
+                    ops += 1
+
+        return local_exps, self.timing_future(ops)
+
+    def translate_box_local_to_qbx_local(self, local_exps):
+        qbx_expansions = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for tgt_icenter in global_qbx_centers:
+            isrc_box = qbx_center_to_target_box[tgt_icenter]
+            src_ibox = self.trav.target_boxes[isrc_box]
+            qbx_expansions[tgt_icenter] += local_exps[src_ibox]
+            ops += 1
+
+        return qbx_expansions, self.timing_future(ops)
+
+    def eval_qbx_expansions(self, qbx_expansions):
+        output = self.full_output_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        center_to_tree_targets = self.geo_data.center_to_tree_targets()
+
+        for src_icenter in global_qbx_centers:
+            start, end = (
+                    center_to_tree_targets.starts[src_icenter:src_icenter+2])
+            for icenter_tgt in range(start, end):
+                center_itgt = center_to_tree_targets.lists[icenter_tgt]
+                output[0][center_itgt] += qbx_expansions[src_icenter]
+                ops += 1
+
+        return output, self.timing_future(ops)
+
+    def eval_target_specific_qbx_locals(self, src_weights):
+        pot = self.full_output_zeros()
+        ops = 0
+
+        if not self.using_tsqbx:
+            return pot, self.timing_future(ops)
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        center_to_tree_targets = self.geo_data.center_to_tree_targets()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for ictr in global_qbx_centers:
+            tgt_ibox = qbx_center_to_target_box[ictr]
+
+            ictr_tgt_start, ictr_tgt_end = center_to_tree_targets.starts[ictr:ictr+2]
+
+            for ictr_tgt in range(ictr_tgt_start, ictr_tgt_end):
+                ctr_itgt = center_to_tree_targets.lists[ictr_tgt]
+
+                isrc_box_start, isrc_box_end = (
+                        self.trav.neighbor_source_boxes_starts[tgt_ibox:tgt_ibox+2])
+
+                for isrc_box in range(isrc_box_start, isrc_box_end):
+                    src_ibox = self.trav.neighbor_source_boxes_lists[isrc_box]
+
+                    isrc_start = self.tree.box_source_starts[src_ibox]
+                    isrc_end = (isrc_start
+                            + self.tree.box_source_counts_nonchild[src_ibox])
+
+                    pot[0][ctr_itgt] += sum(src_weights[isrc_start:isrc_end])
+                    ops += isrc_end - isrc_start
+
+        return pot, self.timing_future(ops)
+
+# }}}
+
+
+# {{{ verify cost model
+
+class OpCountingTranslationCostModel(object):
+    """A translation cost model which assigns at cost of 1 to each operation."""
+
+    def __init__(self, dim, nlevels):
+        pass
+
+    @staticmethod
+    def direct():
+        return 1
+
+    p2qbxl = direct
+    p2p_tsqbx = direct
+    qbxl2p = direct
+
+    @staticmethod
+    def p2l(level):
+        return 1
+
+    l2p = p2l
+    p2m = p2l
+    m2p = p2l
+    m2qbxl = p2l
+    l2qbxl = p2l
+
+    @staticmethod
+    def m2m(src_level, tgt_level):
+        return 1
+
+    l2l = m2m
+    m2l = m2m
+
+
+@pytest.mark.parametrize("dim, off_surface, use_target_specific_qbx", (
+        (2, False, False),
+        (2, True,  False),
+        (3, False, False),
+        (3, False, True),
+        (3, True,  False),
+        (3, True,  True)))
+def test_cost_model_correctness(ctx_getter, dim, off_surface,
+        use_target_specific_qbx):
+    """Check that computed cost matches that of a constant-one FMM."""
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    cost_model = (
+            CostModel(
+                translation_cost_model_factory=OpCountingTranslationCostModel))
+
+    lpot_source = get_lpot_source(queue, dim).copy(
+            cost_model=cost_model,
+            _use_target_specific_qbx=use_target_specific_qbx)
+
+    # Construct targets.
+    if off_surface:
+        from pytential.target import PointsTarget
+        from boxtree.tools import make_uniform_particle_array
+        ntargets = 10 ** 3
+        targets = PointsTarget(
+                make_uniform_particle_array(queue, ntargets, dim, np.float))
+        target_discrs_and_qbx_sides = ((targets, 0),)
+        qbx_forced_limit = None
+    else:
+        targets = lpot_source.density_discr
+        target_discrs_and_qbx_sides = ((targets, 1),)
+        qbx_forced_limit = 1
+
+    # Construct bound op, run cost model.
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=qbx_forced_limit)
+
+    op_S = bind((lpot_source, targets), sym_op_S)
+    sigma = get_density(queue, lpot_source)
+
+    from pytools import one
+    cost_S = one(op_S.get_modeled_cost(queue, sigma=sigma).values())
+
+    # Run FMM with ConstantOneWrangler. This can't be done with pytential's
+    # high-level interface, so call the FMM driver directly.
+    from pytential.qbx.fmm import drive_fmm
+    geo_data = lpot_source.qbx_fmm_geometry_data(
+            target_discrs_and_qbx_sides=target_discrs_and_qbx_sides)
+
+    wrangler = ConstantOneQBXExpansionWrangler(
+            queue, geo_data, use_target_specific_qbx)
+    nnodes = lpot_source.quad_stage2_density_discr.nnodes
+    src_weights = np.ones(nnodes)
+
+    timing_data = {}
+    potential = drive_fmm(wrangler, src_weights, timing_data,
+            traversal=wrangler.trav)[0][geo_data.ncenters:]
+
+    # Check constant one wrangler for correctness.
+    assert (potential == nnodes).all()
+
+    modeled_time = cost_S.get_predicted_times(merge_close_lists=True)
+
+    # Check that the cost model matches the timing data returned by the
+    # constant one wrangler.
+    mismatches = []
+    for stage in timing_data:
+        if timing_data[stage]["ops_elapsed"] != modeled_time[stage]:
+            mismatches.append(
+                    (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage]))
+
+    assert not mismatches, "\n".join(str(s) for s in mismatches)
+
+# }}}
+
+
+# {{{ test order varying by level
+
+CONSTANT_ONE_PARAMS = dict(
+        c_l2l=1,
+        c_l2p=1,
+        c_l2qbxl=1,
+        c_m2l=1,
+        c_m2m=1,
+        c_m2p=1,
+        c_m2qbxl=1,
+        c_p2l=1,
+        c_p2m=1,
+        c_p2p=1,
+        c_p2qbxl=1,
+        c_qbxl2p=1,
+        c_p2p_tsqbx=1,
+        )
+
+
+def test_cost_model_order_varying_by_level(ctx_getter):
+    """For FMM order varying by level, this checks to ensure that the costs are
+    different. The varying-level case should have larger cost.
+    """
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    # {{{ constant level to order
+
+    def level_to_order_constant(kernel, kernel_args, tree, level):
+        return 1
+
+    lpot_source = get_lpot_source(queue, 2).copy(
+            cost_model=CostModel(
+                calibration_params=CONSTANT_ONE_PARAMS),
+            fmm_level_to_order=level_to_order_constant)
+
+    sigma_sym = sym.var("sigma")
+
+    k_sym = LaplaceKernel(2)
+    sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    sigma = get_density(queue, lpot_source)
+
+    cost_constant = one(
+            bind(lpot_source, sym_op)
+            .get_modeled_cost(queue, sigma=sigma).values())
+
+    # }}}
+
+    # {{{ varying level to order
+
+    varying_order_params = cost_constant.params.copy()
+
+    nlevels = cost_constant.params["nlevels"]
+    for level in range(nlevels):
+        varying_order_params["p_fmm_lev%d" % level] = nlevels - level
+
+    cost_varying = cost_constant.with_params(varying_order_params)
+
+    # }}}
+
+    assert (
+            sum(cost_varying.get_predicted_times().values())
+            > sum(cost_constant.get_predicted_times().values()))
+
+# }}}
+
+
+# You can test individual routines by typing
+# $ python test_cost_model.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+
+# vim: foldmethod=marker
diff --git a/test/test_layer_pot.py b/test/test_layer_pot.py
index 2f2175aeb078fe2eb6a46cbfc8606e6a931da524..2e7aca67dbf1635a1b5169dc5c8dcec31db407e9 100644
--- a/test/test_layer_pot.py
+++ b/test/test_layer_pot.py
@@ -183,7 +183,7 @@ def test_off_surface_eval_vs_direct(ctx_factory,  do_plot=False):
             target_association_tolerance=0.05,
             ).with_refinement()
 
-    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000)
+    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
     from pytential.target import PointsTarget
     ptarget = PointsTarget(fplot.points)
     from sumpy.kernel import LaplaceKernel
@@ -315,77 +315,6 @@ def test_unregularized_off_surface_fmm_vs_direct(ctx_factory):
 # }}}
 
 
-# {{{ test performance data gathering
-
-def test_perf_data_gathering(ctx_factory, n_arms=5):
-    cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
-
-    # prevent cache 'splosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
-
-    target_order = 8
-
-    starfish_func = NArmedStarfish(n_arms, 0.8)
-    mesh = make_curve_mesh(
-            starfish_func,
-            np.linspace(0, 1, n_arms * 30),
-            target_order)
-
-    sigma_sym = sym.var("sigma")
-
-    # The kernel doesn't really matter here
-    from sumpy.kernel import LaplaceKernel
-    k_sym = LaplaceKernel(mesh.ambient_dim)
-
-    sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
-
-    from meshmode.discretization import Discretization
-    from meshmode.discretization.poly_element import (
-            InterpolatoryQuadratureSimplexGroupFactory)
-    pre_density_discr = Discretization(
-            queue.context, mesh,
-            InterpolatoryQuadratureSimplexGroupFactory(target_order))
-
-    results = []
-
-    def inspect_geo_data(insn, bound_expr, geo_data):
-        from pytential.qbx.fmm import assemble_performance_data
-        perf_data = assemble_performance_data(geo_data, uses_pde_expansions=True)
-        results.append(perf_data)
-
-        return False  # no need to do the actual FMM
-
-    from pytential.qbx import QBXLayerPotentialSource
-    lpot_source = QBXLayerPotentialSource(
-            pre_density_discr, 4*target_order,
-            # qbx order and fmm order don't really matter
-            10, fmm_order=10,
-            _expansions_in_tree_have_extent=True,
-            _expansion_stick_out_factor=0.5,
-            geometry_data_inspector=inspect_geo_data,
-            target_association_tolerance=1e-10,
-            )
-
-    lpot_source, _ = lpot_source.with_refinement()
-
-    density_discr = lpot_source.density_discr
-
-    if 0:
-        from meshmode.discretization.visualization import draw_curve
-        draw_curve(density_discr)
-        import matplotlib.pyplot as plt
-        plt.show()
-
-    nodes = density_discr.nodes().with_queue(queue)
-    sigma = cl.clmath.sin(10 * nodes[0])
-
-    bind(lpot_source, sym_op)(queue, sigma=sigma)
-
-# }}}
-
-
 # {{{ test 3D jump relations
 
 @pytest.mark.parametrize("relation", ["sp", "nxcurls", "div_s"])
diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py
index d6d426519e42f1177de0bcf85cf7b8553e8154b5..9151f510d7841f4a19f07893016e92332ae181e4 100644
--- a/test/test_layer_pot_identity.py
+++ b/test/test_layer_pot_identity.py
@@ -238,32 +238,38 @@ class DynamicTestCase(object):
         if (self.geometry.mesh_name == "sphere"
                 and self.k != 0
                 and self.fmm_backend == "sumpy"):
-            pytest.skip("both direct eval and generating the FMM kernels "
+            raise ValueError("both direct eval and generating the FMM kernels "
                     "are too slow")
 
         if (self.geometry.mesh_name == "sphere"
                 and self.expr.zero_op_name == "green_grad"):
-            pytest.skip("does not achieve sufficient precision")
+            raise ValueError("does not achieve sufficient precision")
 
 
 # {{{ integral identity tester
 
+
+@pytest.mark.slowtest
+@pytest.mark.parametrize("case", [
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 0),
+])
+def test_identity_convergence_slow(ctx_factory, case):
+    test_identity_convergence(ctx_factory, case)
+
+
 @pytest.mark.parametrize("case", [
-    tc
-    for geom in [
-        StarfishGeometry(),
-        SphereGeometry(),
-        ]
-    for tc in [
-        DynamicTestCase(geom, GreenExpr(), 0),
-        DynamicTestCase(geom, GreenExpr(), 1.2),
-        DynamicTestCase(geom, GradGreenExpr(), 0),
-        DynamicTestCase(geom, GradGreenExpr(), 1.2),
-        DynamicTestCase(geom, ZeroCalderonExpr(), 0),
-
-        DynamicTestCase(geom, GreenExpr(), 0, fmm_backend="fmmlib"),
-        DynamicTestCase(geom, GreenExpr(), 1.2, fmm_backend="fmmlib"),
-        ]])
+        # 2d
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2),
+        DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 1.2),
+        DynamicTestCase(StarfishGeometry(), ZeroCalderonExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib"),
+        # 3d
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib")
+])
 def test_identity_convergence(ctx_factory,  case, visualize=False):
     logging.basicConfig(level=logging.INFO)
 
diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py
new file mode 100644
index 0000000000000000000000000000000000000000..34091af9a91e26f526ea3f2b371515b0c8992029
--- /dev/null
+++ b/test/test_target_specific_qbx.py
@@ -0,0 +1,219 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2018 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+import pyopencl.clmath as clmath
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+from functools import partial  # noqa
+from meshmode.mesh.generation import (  # noqa
+        ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
+        NArmedStarfish,
+        make_curve_mesh)
+
+from pytential import bind, sym, norm  # noqa
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def test_spherical_bessel_functions():
+    import pytential.qbx.target_specific as ts
+
+    nterms = 9
+    z = 3j
+    scale = 1
+    j = np.zeros(1 + nterms, dtype=np.complex)
+    jder = np.zeros(1 + nterms, dtype=np.complex)
+    ts.jfuns3d_wrapper(nterms, z, scale, j, jder)
+
+    # Reference solution computed using scipy.special.spherical_jn
+
+    j_expected = np.array([
+            +3.33929164246994992e+00 + +0.00000000000000000e+00j,
+            +0.00000000000000000e+00 + +2.24279011776926884e+00j,
+            -1.09650152470070195e+00 + +0.00000000000000000e+00j,
+            +2.77555756156289135e-17 + -4.15287576601431119e-01j,
+            +1.27497179297362317e-01 + +0.00000000000000000e+00j,
+            -3.46944695195361419e-18 + +3.27960387093445271e-02j,
+            -7.24503736309898075e-03 + -4.33680868994201774e-19j,
+            +0.00000000000000000e+00 + -1.40087680258226812e-03j,
+            +2.40653350187633002e-04 + +0.00000000000000000e+00j,
+            +0.00000000000000000e+00 + +3.71744848523478122e-05j,
+    ])
+
+    assert np.allclose(j, j_expected)
+
+    jder_expected = np.array([
+            -0.00000000000000000e+00 + -2.24279011776926884e+00j,
+            +1.84409823062377076e+00 + +0.00000000000000000e+00j,
+            +0.00000000000000000e+00 + +1.14628859306856690e+00j,
+            -5.42784755898793825e-01 + +3.70074341541718826e-17j,
+            +2.77555756156289135e-17 + -2.02792277772493951e-01j,
+            +6.19051018786732632e-02 + -6.93889390390722838e-18j,
+            -2.45752492430047685e-18 + +1.58909515287802387e-02j,
+            -3.50936588954626604e-03 + -4.33680868994201774e-19j,
+            +0.00000000000000000e+00 + -6.78916752019369197e-04j,
+            +1.16738400679806980e-04 + +0.00000000000000000e+00j,
+    ])
+
+    assert np.allclose(jder, jder_expected)
+
+
+def test_spherical_hankel_functions():
+    import pytential.qbx.target_specific as ts
+
+    nterms = 9
+    z = 2 + 3j
+    scale = 1
+    h = np.zeros(1 + nterms, dtype=np.complex)
+    hder = np.zeros(1 + nterms, dtype=np.complex)
+    ts.h3dall_wrapper(nterms, z, scale, h, hder)
+
+    # Reference solution computed using
+    # scipy.special.spherical_jn + 1j * scipy.special.spherical_yn
+    h_expected = np.array([
+            +1.17460537937623677e-02 + -7.25971518952217565e-03j,
+            -7.12794888037171503e-03 + -1.55735608522498126e-02j,
+            -2.58175723285687941e-02 + +5.00665171335734627e-03j,
+            -6.95481631849959037e-03 + +4.92143379339500253e-02j,
+            +9.78278544942576822e-02 + +5.92281078069348405e-02j,
+            +2.65420992601874961e-01 + -1.70387117227806167e-01j,
+            -8.11750107462848453e-02 + -1.02133651818182791e+00j,
+            -3.49178056863992792e+00 + -1.62876088689699405e+00j,
+            -1.36147986022969878e+01 + +9.34959028601928743e+00j,
+            +4.56300765393887087e+00 + +7.94934376901125432e+01j,
+    ])
+
+    assert np.allclose(h, h_expected)
+
+    hder_expected = np.array([
+            +7.12794888037171503e-03 + +1.55735608522498126e-02j,
+            +2.11270661502996893e-02 + -5.75767287207851197e-03j,
+            +1.32171023895111261e-03 + -3.57580271012700734e-02j,
+            -6.69663049946767064e-02 + -3.16989251553807527e-02j,
+            -1.50547136475930293e-01 + +1.16532548652759055e-01j,
+            +8.87444851771816839e-02 + +5.84014513465967444e-01j,
+            +2.00269153354544205e+00 + +7.98384884993240895e-01j,
+            +7.22334424954346144e+00 + -5.46307186102847187e+00j,
+            -4.05890079026877615e+00 + -4.28512368415405192e+01j,
+            -2.04081205047078043e+02 + -1.02417988497371837e+02j,
+    ])
+
+    assert np.allclose(hder, hder_expected)
+
+
+@pytest.mark.parametrize("op", ["S", "D", "Sp"])
+@pytest.mark.parametrize("helmholtz_k", [0, 1.2, 12 + 1.2j])
+@pytest.mark.parametrize("qbx_order", [0, 1, 5])
+def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order):
+    logging.basicConfig(level=logging.INFO)
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    target_order = 4
+    fmm_tol = 1e-3
+
+    from meshmode.mesh.generation import generate_icosphere
+    mesh = generate_icosphere(1, target_order)
+
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+        InterpolatoryQuadratureSimplexGroupFactory
+    from pytential.qbx import QBXLayerPotentialSource
+    pre_density_discr = Discretization(
+            cl_ctx, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
+
+    refiner_extra_kwargs = {}
+
+    if helmholtz_k != 0:
+        refiner_extra_kwargs["kernel_length_scale"] = 5 / abs(helmholtz_k)
+
+    qbx, _ = QBXLayerPotentialSource(
+            pre_density_discr, 4*target_order,
+            qbx_order=qbx_order,
+            fmm_level_to_order=SimpleExpansionOrderFinder(fmm_tol),
+            fmm_backend="fmmlib",
+            _expansions_in_tree_have_extent=True,
+            _expansion_stick_out_factor=0.9,
+            _use_target_specific_qbx=False,
+            ).with_refinement(**refiner_extra_kwargs)
+
+    density_discr = qbx.density_discr
+
+    nodes = density_discr.nodes().with_queue(queue)
+    u_dev = clmath.sin(nodes[0])
+
+    if helmholtz_k == 0:
+        kernel = LaplaceKernel(3)
+        kernel_kwargs = {}
+    else:
+        kernel = HelmholtzKernel(3, allow_evanescent=True)
+        kernel_kwargs = {"k": sym.var("k")}
+
+    u_sym = sym.var("u")
+
+    if op == "S":
+        op = sym.S
+    elif op == "D":
+        op = sym.D
+    elif op == "Sp":
+        op = sym.Sp
+    else:
+        raise ValueError("unknown operator: '%s'" % op)
+
+    expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs)
+
+    bound_op = bind(qbx, expr)
+    pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k).get()
+
+    qbx = qbx.copy(_use_target_specific_qbx=True)
+    bound_op = bind(qbx, expr)
+    pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k).get()
+
+    assert np.allclose(pot_tsqbx, pot_ref, atol=1e-13, rtol=1e-13)
+
+
+# You can test individual routines by typing
+# $ python test_target_specific_qbx.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker