diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 750bf6f4cc5018fc14c9195ec688d45bea21d129..6b6cf7a7e311ae56d81c948859b6ad841fec0c01 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,12 +1,19 @@
-Python 3.5 POCL:
+# Environment variables
+#
+# * PYTEST_ADDOPTS is used to filter test runs. The default value is "-k-slowtest",
+#   which skips the slow running tests.
+# * SKIP_EXAMPLES, if non-empty, can be used to skip the examples job.
+
+Python 2.7 POCL:
   script:
-  - export PY_EXE=python3.5
+  - export PY_EXE=python2.7
   - export PYOPENCL_TEST=portable
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
   - export EXTRA_INSTALL="numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
-  - python3.5
+  - python2.7
   - pocl
   - large-node
   except:
@@ -16,6 +23,7 @@ Python 3.6 POCL:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
   - export EXTRA_INSTALL="numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -28,6 +36,7 @@ Python 3.6 POCL:
 
 Python 3.6 POCL Examples:
   script:
+  - test -n "$SKIP_EXAMPLES" && exit
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako pyvisfile matplotlib"
@@ -43,8 +52,9 @@ Python 3.6 POCL Examples:
 Python 3.5 Conda:
   script:
   - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine
-  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - export CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
@@ -53,27 +63,13 @@ Python 3.5 Conda:
   except:
   - tags
 
-Python 2.7 POCL:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - pocl
-  - large-node
-  except:
-  - tags
-
 Python 3.5 Conda Apple:
   script:
   - export LC_ALL=en_US.UTF-8
   - export LANG=en_US.UTF-8
-  - export PYTEST_ADDOPTS=-k-slowtest
-  - CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml
-  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml
+  - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest}
+  - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py
index 4be9e5ca093b5890024f92fb1307ebf42e4fb5f5..4cfd0c816f0e35542a3def17073eaa5f9850b08a 100644
--- a/pytential/qbx/fmm.py
+++ b/pytential/qbx/fmm.py
@@ -521,397 +521,4 @@ def drive_fmm(expansion_wrangler, src_weights):
 # }}}
 
 
-# {{{ performance data
-
-def assemble_performance_data(geo_data, uses_pde_expansions,
-        translation_source_power=None, translation_target_power=None,
-        translation_max_power=None,
-        summarize_parallel=None, merge_close_lists=True):
-    """
-    :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM
-        uses translation operators that make use of the knowledge that the
-        potential satisfies a PDE.
-    :arg summarize_parallel: a function of two arguments
-        *(parallel_array, sym_multipliers)* used to process an array of
-        workloads of 'parallelizable units'. By default, all workloads are
-        summed into one number encompassing the total workload.
-    :arg merge_close_lists: A :class:`bool` indicating whether or not all
-        boxes requiring direct evaluation should be merged into a single
-        interaction list. If *False*, *part_direct* and *p2qbxl* will be
-        suffixed with the originating list as follows:
-
-        * *_neighbor* (List 1)
-        * *_sep_smaller* (List 3 close)
-        * *_sep_bigger* (List 4 close).
-    """
-
-    # FIXME: This should suport target filtering.
-
-    if summarize_parallel is None:
-        def summarize_parallel(parallel_array, sym_multipliers):
-            return np.sum(parallel_array) * sym_multipliers
-
-    from collections import OrderedDict
-    result = OrderedDict()
-
-    from pymbolic import var
-    p_fmm = var("p_fmm")
-    p_qbx = var("p_qbx")
-
-    nqbtl = geo_data.non_qbx_box_target_lists()
-
-    with cl.CommandQueue(geo_data.cl_context) as queue:
-        tree = geo_data.tree().get(queue=queue)
-        traversal = geo_data.traversal(merge_close_lists).get(queue=queue)
-        box_target_counts_nonchild = (
-                nqbtl.box_target_counts_nonchild.get(queue=queue))
-
-    d = tree.dimensions
-    if uses_pde_expansions:
-        ncoeffs_fmm = p_fmm ** (d-1)
-        ncoeffs_qbx = p_qbx ** (d-1)
-
-        if d == 2:
-            default_translation_source_power = 1
-            default_translation_target_power = 1
-            default_translation_max_power = 0
-
-        elif d == 3:
-            # Based on a reading of FMMlib, i.e. a point-and-shoot FMM.
-            default_translation_source_power = 0
-            default_translation_target_power = 0
-            default_translation_max_power = 3
-
-        else:
-            raise ValueError("Don't know how to estimate expansion complexities "
-                    "for dimension %d" % d)
-
-    else:
-        ncoeffs_fmm = p_fmm ** d
-        ncoeffs_qbx = p_qbx ** d
-        default_translation_source_power = d
-        default_translation_target_power = d
-
-    if translation_source_power is None:
-        translation_source_power = default_translation_source_power
-    if translation_target_power is None:
-        translation_target_power = default_translation_target_power
-    if translation_max_power is None:
-        translation_max_power = default_translation_max_power
-
-    def xlat_cost(p_source, p_target):
-        from pymbolic.primitives import Max
-        return (
-                p_source ** translation_source_power
-                * p_target ** translation_target_power
-                * Max((p_source, p_target)) ** translation_max_power
-                )
-
-    result.update(
-            nlevels=tree.nlevels,
-            nboxes=tree.nboxes,
-            nsources=tree.nsources,
-            ntargets=tree.ntargets)
-
-    # {{{ construct local multipoles
-
-    result["form_mp"] = tree.nsources*ncoeffs_fmm
-
-    # }}}
-
-    # {{{ propagate multipoles upward
-
-    result["prop_upward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
-
-    # }}}
-
-    # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
-
-    def process_direct():
-        # box -> nsources * ntargets
-        npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-        npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-        npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_boxes):
-            ntargets = box_target_counts_nonchild[tgt_ibox]
-
-            npart_direct_list1_srcs = 0
-            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
-            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                npart_direct_list1_srcs += nsources
-
-            npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs
-
-            if merge_close_lists:
-                continue
-
-            npart_direct_list3_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_smaller_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    npart_direct_list3_srcs += nsources
-
-            npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs
-
-            npart_direct_list4_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_bigger_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    npart_direct_list4_srcs += nsources
-
-            npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs
-
-        if merge_close_lists:
-            result["part_direct"] = summarize_parallel(npart_direct_list1, 1)
-        else:
-            result["part_direct_neighbor"] = (
-                    summarize_parallel(npart_direct_list1, 1))
-            result["part_direct_sep_smaller"] = (
-                    summarize_parallel(npart_direct_list3, 1))
-            result["part_direct_sep_bigger"] = (
-                    summarize_parallel(npart_direct_list4, 1))
-
-    process_direct()
-
-    # }}}
-
-    # {{{ translate separated siblings' ("list 2") mpoles to local
-
-    def process_list2():
-        nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
-            start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
-
-            nm2l[itgt_box] += end-start
-
-        result["m2l"] = summarize_parallel(nm2l, xlat_cost(p_fmm, p_fmm))
-
-    process_list2()
-
-    # }}}
-
-    # {{{ evaluate sep. smaller mpoles ("list 3") at particles
-
-    def process_list3():
-        nmp_eval = np.zeros(
-                (tree.nlevels, len(traversal.target_boxes)),
-                dtype=np.intp)
-
-        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
-
-        for ilevel, sep_smaller_list in enumerate(
-                traversal.from_sep_smaller_by_level):
-            for itgt_box, tgt_ibox in enumerate(
-                        traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
-                ntargets = box_target_counts_nonchild[tgt_ibox]
-                start, end = sep_smaller_list.starts[itgt_box:itgt_box+2]
-                nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = (
-                        ntargets * (end-start)
-                )
-
-        result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm)
-
-    process_list3()
-
-    # }}}
-
-    # {{{ form locals for separated bigger source boxes ("list 4")
-
-    def process_list4():
-        nform_local = np.zeros(
-                len(traversal.target_or_target_parent_boxes),
-                dtype=np.intp)
-
-        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
-            start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
-
-            nform_local_box = 0
-            for src_ibox in traversal.from_sep_bigger_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                nform_local_box += nsources
-
-            nform_local[itgt_box] = nform_local_box
-
-        result["form_local"] = summarize_parallel(nform_local, ncoeffs_fmm)
-
-    process_list4()
-
-    # }}}
-
-    # {{{ propagate local_exps downward
-
-    result["prop_downward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
-
-    # }}}
-
-    # {{{ evaluate locals
-
-    result["eval_part"] = tree.ntargets * ncoeffs_fmm
-
-    # }}}
-
-    # {{{ form global qbx locals
-
-    global_qbx_centers = geo_data.global_qbx_centers()
-
-    # If merge_close_lists is False above, then this builds another traversal
-    # (which is OK).
-    qbx_center_to_target_box = geo_data.qbx_center_to_target_box()
-    center_to_targets_starts = geo_data.center_to_tree_targets().starts
-    qbx_center_to_target_box_source_level = np.empty(
-        (tree.nlevels,), dtype=object
-    )
-
-    for src_level in range(tree.nlevels):
-        qbx_center_to_target_box_source_level[src_level] = (
-            geo_data.qbx_center_to_target_box_source_level(src_level)
-        )
-
-    with cl.CommandQueue(geo_data.cl_context) as queue:
-        global_qbx_centers = global_qbx_centers.get(
-                queue=queue)
-        qbx_center_to_target_box = qbx_center_to_target_box.get(
-                queue=queue)
-        center_to_targets_starts = center_to_targets_starts.get(
-                queue=queue)
-        for src_level in range(tree.nlevels):
-            qbx_center_to_target_box_source_level[src_level] = (
-                qbx_center_to_target_box_source_level[src_level].get(queue=queue)
-            )
-
-    def process_form_qbxl():
-        ncenters = geo_data.ncenters
-
-        result["ncenters"] = ncenters
-
-        # center -> nsources
-        np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-        np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-        np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp)
-
-        for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
-            itgt_box = qbx_center_to_target_box[tgt_icenter]
-
-            np2qbxl_list1_srcs = 0
-            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
-            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
-                nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                np2qbxl_list1_srcs += nsources
-
-            np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs
-
-            if merge_close_lists:
-                continue
-
-            np2qbxl_list3_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_smaller_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    np2qbxl_list3_srcs += nsources
-
-            np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs
-
-            np2qbxl_list4_srcs = 0
-
-            # Could be None, if not using targets with extent.
-            if traversal.from_sep_close_bigger_starts is not None:
-                start, end = (
-                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
-                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
-                    nsources = tree.box_source_counts_nonchild[src_ibox]
-
-                    np2qbxl_list4_srcs += nsources
-
-            np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs
-
-        if merge_close_lists:
-            result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx)
-        else:
-            result["p2qbxl_neighbor"] = (
-                    summarize_parallel(np2qbxl_list1, ncoeffs_qbx))
-            result["p2qbxl_sep_smaller"] = (
-                    summarize_parallel(np2qbxl_list3, ncoeffs_qbx))
-            result["p2qbxl_sep_bigger"] = (
-                    summarize_parallel(np2qbxl_list4, ncoeffs_qbx))
-
-    process_form_qbxl()
-
-    # }}}
-
-    # {{{ translate from list 3 multipoles to qbx local expansions
-
-    def process_m2qbxl():
-        nm2qbxl = np.zeros(
-                (tree.nlevels, len(global_qbx_centers)),
-                dtype=np.intp)
-
-        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
-
-        for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level):
-
-            for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
-                icontaining_tgt_box = qbx_center_to_target_box_source_level[
-                    isrc_level][tgt_icenter]
-
-                if icontaining_tgt_box == -1:
-                    continue
-
-                start, stop = (
-                        ssn.starts[icontaining_tgt_box],
-                        ssn.starts[icontaining_tgt_box+1])
-
-                nm2qbxl[isrc_level, itgt_center] += stop-start
-
-        result["m2qbxl"] = summarize_parallel(nm2qbxl, xlat_cost(p_fmm, p_qbx))
-
-    process_m2qbxl()
-
-    # }}}
-
-    # {{{ translate from box local expansions to qbx local expansions
-
-    result["l2qbxl"] = geo_data.ncenters * xlat_cost(p_fmm, p_qbx)
-
-    # }}}
-
-    # {{{ evaluate qbx local expansions
-
-    def process_eval_qbxl():
-        nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp)
-
-        for isrc_center, src_icenter in enumerate(global_qbx_centers):
-            start, end = center_to_targets_starts[src_icenter:src_icenter+2]
-            nqbx_eval[isrc_center] += end-start
-
-        result["qbxl2p"] = summarize_parallel(nqbx_eval, ncoeffs_qbx)
-
-    process_eval_qbxl()
-
-    # }}}
-
-    return result
-
-# }}}
-
 # vim: foldmethod=marker
diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..6442a475cd477287077ae7b0aaa2732d801d03a2
--- /dev/null
+++ b/pytential/qbx/performance.py
@@ -0,0 +1,434 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from six.moves import range
+import numpy as np  # noqa
+import pyopencl as cl  # noqa
+import pyopencl.array  # noqa
+
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+.. autofunction:: assemble_performance_data
+"""
+
+
+# {{{ assemble_performance_data
+
+def assemble_performance_data(geo_data, uses_pde_expansions,
+        translation_source_power=None, translation_target_power=None,
+        translation_max_power=None,
+        summarize_parallel=None, merge_close_lists=True):
+    """
+    :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM
+        uses translation operators that make use of the knowledge that the
+        potential satisfies a PDE.
+    :arg summarize_parallel: a function of two arguments
+        *(parallel_array, sym_multipliers)* used to process an array of
+        workloads of 'parallelizable units'. By default, all workloads are
+        summed into one number encompassing the total workload.
+    :arg merge_close_lists: A :class:`bool` indicating whether or not all
+        boxes requiring direct evaluation should be merged into a single
+        interaction list. If *False*, *part_direct* and *p2qbxl* will be
+        suffixed with the originating list as follows:
+
+        * *_neighbor* (List 1)
+        * *_sep_smaller* (List 3 close)
+        * *_sep_bigger* (List 4 close).
+    """
+
+    # FIXME: This should suport target filtering.
+
+    if summarize_parallel is None:
+        def summarize_parallel(parallel_array, sym_multipliers):
+            return np.sum(parallel_array) * sym_multipliers
+
+    from collections import OrderedDict
+    result = OrderedDict()
+
+    from pymbolic import var
+    p_fmm = var("p_fmm")
+    p_qbx = var("p_qbx")
+
+    nqbtl = geo_data.non_qbx_box_target_lists()
+
+    with cl.CommandQueue(geo_data.cl_context) as queue:
+        tree = geo_data.tree().get(queue=queue)
+        traversal = geo_data.traversal(merge_close_lists).get(queue=queue)
+        box_target_counts_nonchild = (
+                nqbtl.box_target_counts_nonchild.get(queue=queue))
+
+    d = tree.dimensions
+    if uses_pde_expansions:
+        ncoeffs_fmm = p_fmm ** (d-1)
+        ncoeffs_qbx = p_qbx ** (d-1)
+
+        if d == 2:
+            default_translation_source_power = 1
+            default_translation_target_power = 1
+            default_translation_max_power = 0
+
+        elif d == 3:
+            # Based on a reading of FMMlib, i.e. a point-and-shoot FMM.
+            default_translation_source_power = 0
+            default_translation_target_power = 0
+            default_translation_max_power = 3
+
+        else:
+            raise ValueError("Don't know how to estimate expansion complexities "
+                    "for dimension %d" % d)
+
+    else:
+        ncoeffs_fmm = p_fmm ** d
+        ncoeffs_qbx = p_qbx ** d
+        default_translation_source_power = d
+        default_translation_target_power = d
+
+    if translation_source_power is None:
+        translation_source_power = default_translation_source_power
+    if translation_target_power is None:
+        translation_target_power = default_translation_target_power
+    if translation_max_power is None:
+        translation_max_power = default_translation_max_power
+
+    def xlat_cost(p_source, p_target):
+        from pymbolic.primitives import Max
+        return (
+                p_source ** translation_source_power
+                * p_target ** translation_target_power
+                * Max((p_source, p_target)) ** translation_max_power
+                )
+
+    result.update(
+            nlevels=tree.nlevels,
+            nboxes=tree.nboxes,
+            nsources=tree.nsources,
+            ntargets=tree.ntargets)
+
+    # {{{ construct local multipoles
+
+    result["form_mp"] = tree.nsources*ncoeffs_fmm
+
+    # }}}
+
+    # {{{ propagate multipoles upward
+
+    result["prop_upward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
+
+    # }}}
+
+    # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
+
+    def process_direct():
+        # box -> nsources * ntargets
+        npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
+        npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
+        npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp)
+
+        for itgt_box, tgt_ibox in enumerate(traversal.target_boxes):
+            ntargets = box_target_counts_nonchild[tgt_ibox]
+
+            npart_direct_list1_srcs = 0
+            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
+            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
+                nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                npart_direct_list1_srcs += nsources
+
+            npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs
+
+            if merge_close_lists:
+                continue
+
+            npart_direct_list3_srcs = 0
+
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_smaller_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
+                    nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                    npart_direct_list3_srcs += nsources
+
+            npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs
+
+            npart_direct_list4_srcs = 0
+
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_bigger_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
+                    nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                    npart_direct_list4_srcs += nsources
+
+            npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs
+
+        if merge_close_lists:
+            result["part_direct"] = summarize_parallel(npart_direct_list1, 1)
+        else:
+            result["part_direct_neighbor"] = (
+                    summarize_parallel(npart_direct_list1, 1))
+            result["part_direct_sep_smaller"] = (
+                    summarize_parallel(npart_direct_list3, 1))
+            result["part_direct_sep_bigger"] = (
+                    summarize_parallel(npart_direct_list4, 1))
+
+    process_direct()
+
+    # }}}
+
+    # {{{ translate separated siblings' ("list 2") mpoles to local
+
+    def process_list2():
+        nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp)
+
+        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
+            start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
+
+            nm2l[itgt_box] += end-start
+
+        result["m2l"] = summarize_parallel(nm2l, xlat_cost(p_fmm, p_fmm))
+
+    process_list2()
+
+    # }}}
+
+    # {{{ evaluate sep. smaller mpoles ("list 3") at particles
+
+    def process_list3():
+        nmp_eval = np.zeros(
+                (tree.nlevels, len(traversal.target_boxes)),
+                dtype=np.intp)
+
+        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
+
+        for ilevel, sep_smaller_list in enumerate(
+                traversal.from_sep_smaller_by_level):
+            for itgt_box, tgt_ibox in enumerate(
+                        traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
+                ntargets = box_target_counts_nonchild[tgt_ibox]
+                start, end = sep_smaller_list.starts[itgt_box:itgt_box+2]
+                nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = (
+                        ntargets * (end-start)
+                )
+
+        result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm)
+
+    process_list3()
+
+    # }}}
+
+    # {{{ form locals for separated bigger source boxes ("list 4")
+
+    def process_list4():
+        nform_local = np.zeros(
+                len(traversal.target_or_target_parent_boxes),
+                dtype=np.intp)
+
+        for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
+            start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
+
+            nform_local_box = 0
+            for src_ibox in traversal.from_sep_bigger_lists[start:end]:
+                nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                nform_local_box += nsources
+
+            nform_local[itgt_box] = nform_local_box
+
+        result["form_local"] = summarize_parallel(nform_local, ncoeffs_fmm)
+
+    process_list4()
+
+    # }}}
+
+    # {{{ propagate local_exps downward
+
+    result["prop_downward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm)
+
+    # }}}
+
+    # {{{ evaluate locals
+
+    result["eval_part"] = tree.ntargets * ncoeffs_fmm
+
+    # }}}
+
+    # {{{ form global qbx locals
+
+    global_qbx_centers = geo_data.global_qbx_centers()
+
+    # If merge_close_lists is False above, then this builds another traversal
+    # (which is OK).
+    qbx_center_to_target_box = geo_data.qbx_center_to_target_box()
+    center_to_targets_starts = geo_data.center_to_tree_targets().starts
+    qbx_center_to_target_box_source_level = np.empty(
+        (tree.nlevels,), dtype=object
+    )
+
+    for src_level in range(tree.nlevels):
+        qbx_center_to_target_box_source_level[src_level] = (
+            geo_data.qbx_center_to_target_box_source_level(src_level)
+        )
+
+    with cl.CommandQueue(geo_data.cl_context) as queue:
+        global_qbx_centers = global_qbx_centers.get(
+                queue=queue)
+        qbx_center_to_target_box = qbx_center_to_target_box.get(
+                queue=queue)
+        center_to_targets_starts = center_to_targets_starts.get(
+                queue=queue)
+        for src_level in range(tree.nlevels):
+            qbx_center_to_target_box_source_level[src_level] = (
+                qbx_center_to_target_box_source_level[src_level].get(queue=queue)
+            )
+
+    def process_form_qbxl():
+        ncenters = geo_data.ncenters
+
+        result["ncenters"] = ncenters
+
+        # center -> nsources
+        np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp)
+        np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp)
+        np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp)
+
+        for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
+            itgt_box = qbx_center_to_target_box[tgt_icenter]
+
+            np2qbxl_list1_srcs = 0
+            start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
+            for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
+                nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                np2qbxl_list1_srcs += nsources
+
+            np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs
+
+            if merge_close_lists:
+                continue
+
+            np2qbxl_list3_srcs = 0
+
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_smaller_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
+                    nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                    np2qbxl_list3_srcs += nsources
+
+            np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs
+
+            np2qbxl_list4_srcs = 0
+
+            # Could be None, if not using targets with extent.
+            if traversal.from_sep_close_bigger_starts is not None:
+                start, end = (
+                        traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2])
+                for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
+                    nsources = tree.box_source_counts_nonchild[src_ibox]
+
+                    np2qbxl_list4_srcs += nsources
+
+            np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs
+
+        if merge_close_lists:
+            result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx)
+        else:
+            result["p2qbxl_neighbor"] = (
+                    summarize_parallel(np2qbxl_list1, ncoeffs_qbx))
+            result["p2qbxl_sep_smaller"] = (
+                    summarize_parallel(np2qbxl_list3, ncoeffs_qbx))
+            result["p2qbxl_sep_bigger"] = (
+                    summarize_parallel(np2qbxl_list4, ncoeffs_qbx))
+
+    process_form_qbxl()
+
+    # }}}
+
+    # {{{ translate from list 3 multipoles to qbx local expansions
+
+    def process_m2qbxl():
+        nm2qbxl = np.zeros(
+                (tree.nlevels, len(global_qbx_centers)),
+                dtype=np.intp)
+
+        assert tree.nlevels == len(traversal.from_sep_smaller_by_level)
+
+        for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level):
+
+            for itgt_center, tgt_icenter in enumerate(global_qbx_centers):
+                icontaining_tgt_box = qbx_center_to_target_box_source_level[
+                    isrc_level][tgt_icenter]
+
+                if icontaining_tgt_box == -1:
+                    continue
+
+                start, stop = (
+                        ssn.starts[icontaining_tgt_box],
+                        ssn.starts[icontaining_tgt_box+1])
+
+                nm2qbxl[isrc_level, itgt_center] += stop-start
+
+        result["m2qbxl"] = summarize_parallel(nm2qbxl, xlat_cost(p_fmm, p_qbx))
+
+    process_m2qbxl()
+
+    # }}}
+
+    # {{{ translate from box local expansions to qbx local expansions
+
+    result["l2qbxl"] = geo_data.ncenters * xlat_cost(p_fmm, p_qbx)
+
+    # }}}
+
+    # {{{ evaluate qbx local expansions
+
+    def process_eval_qbxl():
+        nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp)
+
+        for isrc_center, src_icenter in enumerate(global_qbx_centers):
+            start, end = center_to_targets_starts[src_icenter:src_icenter+2]
+            nqbx_eval[isrc_center] += end-start
+
+        result["qbxl2p"] = summarize_parallel(nqbx_eval, ncoeffs_qbx)
+
+    process_eval_qbxl()
+
+    # }}}
+
+    return result
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py
index 942b6ed282ae1eb250c5d6bea645bc834dda84eb..e6ec4a9c88cf807c0effa5886f6e16608d1aaae6 100644
--- a/test/test_layer_pot_identity.py
+++ b/test/test_layer_pot_identity.py
@@ -233,12 +233,12 @@ class DynamicTestCase(object):
         if (self.geometry.mesh_name == "sphere"
                 and self.k != 0
                 and self.fmm_backend == "sumpy"):
-            pytest.skip("both direct eval and generating the FMM kernels "
+            raise ValueError("both direct eval and generating the FMM kernels "
                     "are too slow")
 
         if (self.geometry.mesh_name == "sphere"
                 and self.expr.zero_op_name == "green_grad"):
-            pytest.skip("does not achieve sufficient precision")
+            raise ValueError("does not achieve sufficient precision")
 
         if self.fmm_backend == "fmmlib":
             pytest.importorskip("pyfmmlib")
@@ -246,22 +246,28 @@ class DynamicTestCase(object):
 
 # {{{ integral identity tester
 
+
+@pytest.mark.slowtest
+@pytest.mark.parametrize("case", [
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 0),
+])
+def test_identity_convergence_slow(ctx_getter, case):
+    test_identity_convergence(case)
+
+
 @pytest.mark.parametrize("case", [
-    tc
-    for geom in [
-        StarfishGeometry(),
-        SphereGeometry(),
-        ]
-    for tc in [
-        DynamicTestCase(geom, GreenExpr(), 0),
-        DynamicTestCase(geom, GreenExpr(), 1.2),
-        DynamicTestCase(geom, GradGreenExpr(), 0),
-        DynamicTestCase(geom, GradGreenExpr(), 1.2),
-        DynamicTestCase(geom, ZeroCalderonExpr(), 0),
-
-        DynamicTestCase(geom, GreenExpr(), 0, fmm_backend="fmmlib"),
-        DynamicTestCase(geom, GreenExpr(), 1.2, fmm_backend="fmmlib"),
-        ]])
+        # 2d
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2),
+        DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 1.2),
+        DynamicTestCase(StarfishGeometry(), ZeroCalderonExpr(), 0),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+        DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib"),
+        # 3d
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+        DynamicTestCase(SphereGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib")
+])
 def test_identity_convergence(ctx_getter,  case, visualize=False):
     logging.basicConfig(level=logging.INFO)
 
diff --git a/test/test_performance_model.py b/test/test_performance_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6658857e15a09cd82d3b42fdf7ecbcd30fc2c04d
--- /dev/null
+++ b/test/test_performance_model.py
@@ -0,0 +1,131 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2018 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+from pytential import bind, sym, norm  # noqa
+
+
+# {{{ global params
+
+TARGET_ORDER = 8
+OVSMP_FACTOR = 5
+TCF = 0.9
+QBX_ORDER = 5
+FMM_ORDER = 10
+
+DEFAULT_LPOT_KWARGS = {
+        "_box_extent_norm": "l2",
+        "_from_sep_smaller_crit": "static_l2",
+        }
+
+# }}}
+
+
+@pytest.mark.parametrize("dim", (2, 3))
+def test_performance_model(ctx_getter, dim):
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    # {{{ get lpot source
+
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import (
+            InterpolatoryQuadratureSimplexGroupFactory)
+
+    target_order = TARGET_ORDER
+
+    if dim == 2:
+        from meshmode.mesh.generation import starfish, make_curve_mesh
+        mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order)
+    elif dim == 3:
+        from meshmode.mesh.generation import generate_icosphere
+        mesh = generate_icosphere(r=1, order=target_order)
+    else:
+        raise ValueError("unknown dimension: %d" % dim)
+
+    pre_density_discr = Discretization(
+            queue.context, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    lpot_kwargs = DEFAULT_LPOT_KWARGS.copy()
+    lpot_kwargs.update(
+            _expansion_stick_out_factor=TCF,
+            fmm_order=FMM_ORDER, qbx_order=QBX_ORDER,
+            )
+
+    from pytential.qbx import QBXLayerPotentialSource
+    lpot_source = QBXLayerPotentialSource(
+            pre_density_discr, OVSMP_FACTOR*target_order,
+            **lpot_kwargs,)
+
+    lpot_source, _ = lpot_source.with_refinement()
+
+    # }}}
+
+    # {{{ run performance model
+
+    costs = {}
+
+    def inspect_geo_data(insn, bound_expr, geo_data):
+        from pytential.qbx.performance import assemble_performance_data
+        costs["costs"] = assemble_performance_data(
+                geo_data, uses_pde_expansions=True, merge_close_lists=False)
+        return False
+
+    lpot_source = lpot_source.copy(geometry_data_inspector=inspect_geo_data)
+    density_discr = lpot_source.density_discr
+    nodes = density_discr.nodes().with_queue(queue)
+    sigma = cl.clmath.sin(10 * nodes[0])
+
+    from sumpy.kernel import LaplaceKernel
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    bound_op = bind(lpot_source, sym_op)
+    bound_op(queue, sigma=sigma)
+
+    # }}}
+
+
+# You can test individual routines by typing
+# $ python test_layer_pot.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+
+# vim: foldmethod=marker