5b046f33 · 5b046f33 · cc810e3a · cc810e3a · cc810e3a · cc810e3a
--- a/setup.cfg
+++ b/setup.cfg
-[flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,E402,W503
-max-line-length=85
--- a/setup.py
+++ b/setup.py
-#!/usr/bin/env python
-# -*- coding: latin1 -*-
-
-
-def main():
-    from setuptools import setup
-
-    version_dict = {}
-    init_filename = "boxtree/version.py"
-    exec(compile(open(init_filename, "r").read(), init_filename, "exec"),
-            version_dict)
-
-    setup(name="boxtree",
-          version=version_dict["VERSION_TEXT"],
-          description="Quadtree/octree building in Python and OpenCL",
-          long_description=open("README.rst", "rt").read(),
-          author="Andreas Kloeckner",
-          author_email="inform@tiker.net",
-          license="MIT",
-          url="http://wiki.tiker.net/BoxTree",
-          classifiers=[
-              'Development Status :: 3 - Alpha',
-              'Intended Audience :: Developers',
-              'Intended Audience :: Other Audience',
-              'Intended Audience :: Science/Research',
-              'License :: OSI Approved :: MIT License',
-              'Natural Language :: English',
-              'Programming Language :: Python',
-              # We use conditional expressions, so 2.5 is the bare minimum.
-              'Programming Language :: Python :: 2.5',
-              'Programming Language :: Python :: 2.6',
-              'Programming Language :: Python :: 2.7',
-              # 3.x has not yet been tested.
-              'Topic :: Scientific/Engineering',
-              'Topic :: Scientific/Engineering :: Information Analysis',
-              'Topic :: Scientific/Engineering :: Mathematics',
-              'Topic :: Scientific/Engineering :: Visualization',
-              'Topic :: Software Development :: Libraries',
-              'Topic :: Utilities',
-              ],
-
-          packages=["boxtree"],
-          install_requires=[
-              "pytools>=2013.1",
-              "pyopencl>=2013.1",
-              "Mako>=0.7.3",
-              "pytest>=2.3",
-              "cgen>=2013.1.2",
-              "six",
-              ])
-
-
-if __name__ == '__main__':
-    main()
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
+__copyright__ = """
+Copyright (C) 2013 Andreas Kloeckner
+Copyright (C) 2018 Matt Wala
+Copyright (C) 2018 Hao Gao
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import sys
+import time
+
+import numpy as np
+import pytest
+from arraycontext import pytest_generate_tests_for_array_contexts
+
+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,  # noqa: F401
+)
+from boxtree.cost import (
+    FMMCostModel,
+    _PythonFMMCostModel,
+    make_pde_aware_translation_cost_model,
+)
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+
+# {{{ test_compare_cl_and_py_cost_model
+
+@pytest.mark.opencl
+@pytest.mark.parametrize(
+    ("nsources", "ntargets", "dims", "dtype"), [
+        (50000, 50000, 3, np.float64)
+    ]
+)
+def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dtype):
+    actx = actx_factory()
+
+    # {{{ Generate sources, targets and target_radii
+
+    from boxtree.tools import make_normal_particle_array as p_normal
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+
+    rng = np.random.default_rng(22)
+    target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
+
+    # }}}
+
+    # {{{ Generate tree and traversal
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+    tree, _ = tb(
+        actx.queue, sources, targets=targets, target_radii=target_radii,
+        stick_out_factor=0.15, max_particles_in_box=30, debug=True
+    )
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+    trav_dev, _ = tg(actx.queue, tree, debug=True)
+    trav = trav_dev.get(queue=actx.queue)
+
+    # }}}
+
+    # {{{ Construct cost models
+
+    cl_cost_model = FMMCostModel(None)
+    python_cost_model = _PythonFMMCostModel(None)
+
+    constant_one_params = cl_cost_model.get_unit_calibration_params().copy()
+    for ilevel in range(trav.tree.nlevels):
+        constant_one_params[f"p_fmm_lev{ilevel}"] = 10
+
+    xlat_cost = make_pde_aware_translation_cost_model(dims, trav.tree.nlevels)
+
+    # }}}
+
+    # {{{ Test process_form_multipoles
+
+    from pymbolic import evaluate
+    nlevels = trav.tree.nlevels
+    p2m_cost = np.zeros(nlevels, dtype=np.float64)
+    for ilevel in range(nlevels):
+        p2m_cost[ilevel] = evaluate(
+            xlat_cost.p2m(ilevel),
+            context=constant_one_params
+        )
+    p2m_cost_dev = actx.from_numpy(p2m_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_form_multipoles = cl_cost_model.process_form_multipoles(
+        actx.queue, trav_dev, p2m_cost_dev
+    )
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_form_multipoles: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+
+    python_form_multipoles = python_cost_model.process_form_multipoles(
+        actx.queue, trav, p2m_cost
+    )
+
+    logger.info("Python time for process_form_multipoles: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_form_multipoles), python_form_multipoles)
+
+    # }}}
+
+    # {{{ Test process_coarsen_multipoles
+
+    m2m_cost = np.zeros(nlevels - 1, dtype=np.float64)
+    for target_level in range(nlevels - 1):
+        m2m_cost[target_level] = evaluate(
+            xlat_cost.m2m(target_level + 1, target_level),
+            context=constant_one_params
+        )
+    m2m_cost_dev = actx.from_numpy(m2m_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+    cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles(
+        actx.queue, trav_dev, m2m_cost_dev
+    )
+
+    actx.queue.finish()
+    logger.info("OpenCL time for coarsen_multipoles: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+
+    python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles(
+        actx.queue, trav, m2m_cost
+    )
+
+    logger.info("Python time for coarsen_multipoles: %gs",
+            time.time() - start_time)
+
+    assert cl_coarsen_multipoles == python_coarsen_multipoles
+
+    # }}}
+
+    # {{{ Test process_direct
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_ndirect_sources_per_target_box = \
+        cl_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav_dev)
+
+    cl_direct = cl_cost_model.process_direct(
+        actx.queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0
+    )
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_direct: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+
+    python_ndirect_sources_per_target_box = \
+        python_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav)
+
+    python_direct = python_cost_model.process_direct(
+        actx.queue, trav, python_ndirect_sources_per_target_box, 5.0
+    )
+
+    logger.info("Python time for process_direct: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_direct), python_direct)
+
+    # }}}
+
+    # {{{ Test aggregate_over_boxes
+
+    start_time = time.time()
+
+    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct)
+
+    actx.queue.finish()
+    logger.info("OpenCL time for aggregate_over_boxes: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+
+    python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct)
+
+    logger.info("Python time for aggregate_over_boxes: %gs",
+            time.time() - start_time)
+
+    assert cl_direct_aggregate == python_direct_aggregate
+
+    # }}}
+
+    # {{{ Test process_list2
+
+    nlevels = trav.tree.nlevels
+    m2l_cost = np.zeros(nlevels, dtype=np.float64)
+    for ilevel in range(nlevels):
+        m2l_cost[ilevel] = evaluate(
+            xlat_cost.m2l(ilevel, ilevel),
+            context=constant_one_params
+        )
+    m2l_cost_dev = actx.from_numpy(m2l_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_m2l_cost = cl_cost_model.process_list2(actx.queue, trav_dev, m2l_cost_dev)
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_list2: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+    python_m2l_cost = python_cost_model.process_list2(actx.queue, trav, m2l_cost)
+    logger.info("Python time for process_list2: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_m2l_cost), python_m2l_cost)
+
+    # }}}
+
+    # {{{ Test process_list 3
+
+    m2p_cost = np.zeros(nlevels, dtype=np.float64)
+    for ilevel in range(nlevels):
+        m2p_cost[ilevel] = evaluate(
+            xlat_cost.m2p(ilevel),
+            context=constant_one_params
+        )
+    m2p_cost_dev = actx.from_numpy(m2p_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_m2p_cost = cl_cost_model.process_list3(actx.queue, trav_dev, m2p_cost_dev)
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_list3: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+    python_m2p_cost = python_cost_model.process_list3(actx.queue, trav, m2p_cost)
+    logger.info("Python time for process_list3: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_m2p_cost), python_m2p_cost)
+
+    # }}}
+
+    # {{{ Test process_list4
+
+    p2l_cost = np.zeros(nlevels, dtype=np.float64)
+    for ilevel in range(nlevels):
+        p2l_cost[ilevel] = evaluate(
+            xlat_cost.p2l(ilevel),
+            context=constant_one_params
+        )
+    p2l_cost_dev = actx.from_numpy(p2l_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_p2l_cost = cl_cost_model.process_list4(actx.queue, trav_dev, p2l_cost_dev)
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_list4: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+    python_p2l_cost = python_cost_model.process_list4(actx.queue, trav, p2l_cost)
+    logger.info("Python time for process_list4: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_p2l_cost), python_p2l_cost)
+
+    # }}}
+
+    # {{{ Test process_refine_locals
+
+    l2l_cost = np.zeros(nlevels - 1, dtype=np.float64)
+    for ilevel in range(nlevels - 1):
+        l2l_cost[ilevel] = evaluate(
+            xlat_cost.l2l(ilevel, ilevel + 1),
+            context=constant_one_params
+        )
+    l2l_cost_dev = actx.from_numpy(l2l_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_refine_locals_cost = cl_cost_model.process_refine_locals(
+        actx.queue, trav_dev, l2l_cost_dev
+    )
+
+    actx.queue.finish()
+    logger.info("OpenCL time for refine_locals: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+    python_refine_locals_cost = python_cost_model.process_refine_locals(
+        actx.queue, trav, l2l_cost
+    )
+    logger.info("Python time for refine_locals: %gs",
+            time.time() - start_time)
+
+    assert cl_refine_locals_cost == python_refine_locals_cost
+
+    # }}}
+
+    # {{{ Test process_eval_locals
+
+    l2p_cost = np.zeros(nlevels, dtype=np.float64)
+    for ilevel in range(nlevels):
+        l2p_cost[ilevel] = evaluate(
+            xlat_cost.l2p(ilevel),
+            context=constant_one_params
+        )
+    l2p_cost_dev = actx.from_numpy(l2p_cost)
+
+    actx.queue.finish()
+    start_time = time.time()
+
+    cl_l2p_cost = cl_cost_model.process_eval_locals(
+            actx.queue, trav_dev, l2p_cost_dev)
+
+    actx.queue.finish()
+    logger.info("OpenCL time for process_eval_locals: %gs",
+            time.time() - start_time)
+
+    start_time = time.time()
+    python_l2p_cost = python_cost_model.process_eval_locals(
+            actx.queue, trav, l2p_cost)
+    logger.info("Python time for process_eval_locals: %gs",
+            time.time() - start_time)
+
+    assert np.array_equal(actx.to_numpy(cl_l2p_cost), python_l2p_cost)
+
+    # }}}
+
+# }}}
+
+
+# {{{ test_estimate_calibration_params
+
+@pytest.mark.opencl
+def test_estimate_calibration_params(actx_factory):
+    from boxtree.pyfmmlib_integration import (
+        FMMLibExpansionWrangler,
+        FMMLibTreeIndependentDataForWrangler,
+        Kernel,
+    )
+
+    rng = np.random.default_rng(seed=42)
+
+    nsources_list = [1000, 2000, 3000, 4000]
+    ntargets_list = [1000, 2000, 3000, 4000]
+    dims = 3
+    dtype = np.float64
+
+    actx = actx_factory()
+
+    traversals = []
+    traversals_dev = []
+    level_to_orders = []
+    timing_results = []
+
+    def fmm_level_to_order(tree, ilevel):
+        return 10
+
+    for nsources, ntargets in zip(nsources_list, ntargets_list, strict=True):
+        # {{{ Generate sources, targets and target_radii
+
+        from boxtree.tools import make_normal_particle_array as p_normal
+        sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+        targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+
+        rng = np.random.default_rng(22)
+        target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
+
+        # }}}
+
+        # {{{ Generate tree and traversal
+
+        from boxtree import TreeBuilder
+        tb = TreeBuilder(actx.context)
+        tree, _ = tb(
+            actx.queue, sources, targets=targets, target_radii=target_radii,
+            stick_out_factor=0.15, max_particles_in_box=30, debug=True
+        )
+
+        from boxtree.traversal import FMMTraversalBuilder
+        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+        trav_dev, _ = tg(actx.queue, tree, debug=True)
+        trav = trav_dev.get(queue=actx.queue)
+
+        traversals.append(trav)
+        traversals_dev.append(trav_dev)
+
+        # }}}
+
+        tree_indep = FMMLibTreeIndependentDataForWrangler(
+                trav.tree.dimensions, Kernel.LAPLACE)
+        wrangler = FMMLibExpansionWrangler(tree_indep, trav,
+                fmm_level_to_order=fmm_level_to_order)
+        level_to_orders.append(wrangler.level_orders)
+
+        timing_data = {}
+        from boxtree.fmm import drive_fmm
+        src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
+        drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+
+        timing_results.append(timing_data)
+
+    time_field_name = "process_elapsed"
+
+    def test_params_sanity(test_params):
+        param_names = ["c_p2m", "c_m2m", "c_p2p", "c_m2l", "c_m2p", "c_p2l", "c_l2l",
+                       "c_l2p"]
+        for name in param_names:
+            assert isinstance(test_params[name], np.float64)
+
+    def test_params_equal(test_params1, test_params2):
+        param_names = ["c_p2m", "c_m2m", "c_p2p", "c_m2l", "c_m2p", "c_p2l", "c_l2l",
+                       "c_l2p"]
+        for name in param_names:
+            assert test_params1[name] == test_params2[name]
+
+    python_cost_model = _PythonFMMCostModel(make_pde_aware_translation_cost_model)
+
+    python_model_results = []
+
+    for icase in range(len(traversals)-1):
+        traversal = traversals[icase]
+        level_to_order = level_to_orders[icase]
+
+        python_model_results.append(python_cost_model.cost_per_stage(
+            actx.queue, traversal, level_to_order,
+            _PythonFMMCostModel.get_unit_calibration_params(),
+        ))
+
+    python_params = python_cost_model.estimate_calibration_params(
+        python_model_results, timing_results[:-1], time_field_name=time_field_name
+    )
+
+    test_params_sanity(python_params)
+
+    cl_cost_model = FMMCostModel(make_pde_aware_translation_cost_model)
+
+    cl_model_results = []
+
+    for icase in range(len(traversals_dev)-1):
+        traversal = traversals_dev[icase]
+        level_to_order = level_to_orders[icase]
+
+        cl_model_results.append(cl_cost_model.cost_per_stage(
+            actx.queue, traversal, level_to_order,
+            FMMCostModel.get_unit_calibration_params(),
+        ))
+
+    cl_params = cl_cost_model.estimate_calibration_params(
+        cl_model_results, timing_results[:-1], time_field_name=time_field_name
+    )
+
+    test_params_sanity(cl_params)
+    test_params_equal(cl_params, python_params)
+
+# }}}
+
+
+# {{{ test_cost_model_op_counts_agree_with_constantone_wrangler
+
+class OpCountingTranslationCostModel:
+    """A translation cost model which assigns at cost of 1 to each operation."""
+
+    def __init__(self, dim, nlevels):
+        pass
+
+    @staticmethod
+    def direct():
+        return 1
+
+    @staticmethod
+    def p2l(level):
+        return 1
+
+    l2p = p2l
+    p2m = p2l
+    m2p = p2l
+
+    @staticmethod
+    def m2m(src_level, tgt_level):
+        return 1
+
+    l2l = m2m
+    m2l = m2m
+
+
+@pytest.mark.opencl
+@pytest.mark.parametrize(
+    ("nsources", "ntargets", "dims", "dtype"), [
+        (5000, 5000, 3, np.float64)
+    ]
+)
+def test_cost_model_op_counts_agree_with_constantone_wrangler(
+        actx_factory, nsources, ntargets, dims, dtype):
+    actx = actx_factory()
+
+    from boxtree.tools import make_normal_particle_array as p_normal
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=16)
+    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=19)
+
+    rng = np.random.default_rng(20)
+    target_radii = rng.uniform(0, 0.04, (ntargets,)).astype(dtype)
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+    tree, _ = tb(
+        actx.queue, sources, targets=targets, target_radii=target_radii,
+        stick_out_factor=0.15, max_particles_in_box=30, debug=True
+    )
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+    trav_dev, _ = tg(actx.queue, tree, debug=True)
+    trav = trav_dev.get(queue=actx.queue)
+
+    from boxtree.constant_one import (
+        ConstantOneExpansionWrangler,
+        ConstantOneTreeIndependentDataForWrangler,
+    )
+    tree_indep = ConstantOneTreeIndependentDataForWrangler()
+    wrangler = ConstantOneExpansionWrangler(tree_indep, trav)
+
+    timing_data = {}
+    from boxtree.fmm import drive_fmm
+    src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
+    drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+
+    cost_model = FMMCostModel(
+        translation_cost_model_factory=OpCountingTranslationCostModel
+    )
+
+    level_to_order = np.array([1 for _ in range(tree.nlevels)])
+
+    modeled_time = cost_model.cost_per_stage(
+        actx.queue, trav_dev, level_to_order,
+        FMMCostModel.get_unit_calibration_params(),
+    )
+
+    mismatches = []
+    for stage in timing_data:
+        if timing_data[stage]["ops_elapsed"] != modeled_time[stage]:
+            mismatches.append(
+                    (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage]))
+
+    assert not mismatches, "\n".join(str(s) for s in mismatches)
+
+    # {{{ Test per-box cost
+
+    total_cost = 0.0
+    for stage in timing_data:
+        total_cost += timing_data[stage]["ops_elapsed"]
+
+    per_box_cost = cost_model.cost_per_box(
+        actx.queue, trav_dev, level_to_order,
+        FMMCostModel.get_unit_calibration_params(),
+    )
+    total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost)
+
+    assert total_cost == (
+            total_aggregate_cost
+            + modeled_time["coarsen_multipoles"]
+            + modeled_time["refine_locals"]
+    )
+
+    # }}}
+
+# }}}
+
+
+# You can test individual routines by typing
+# $ python test_cost_model.py 'test_routine(_acf)'
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: foldmethod=marker
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
+__copyright__ = "Copyright (C) 2021 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import os
+import sys
+
+import numpy as np
+import numpy.linalg as la
+import pytest
+from arraycontext import pytest_generate_tests_for_array_contexts
+
+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,
+)
+from boxtree.constant_one import (
+    ConstantOneExpansionWrangler as ConstantOneExpansionWranglerBase,
+    ConstantOneTreeIndependentDataForWrangler,
+)
+from boxtree.pyfmmlib_integration import (
+    FMMLibExpansionWrangler,
+    FMMLibTreeIndependentDataForWrangler,
+    Kernel,
+)
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+# NOTE: Do not import mpi4py.MPI object at the module level, because OpenMPI
+# does not support recursive invocations.
+
+
+def _cachedir():
+    import tempfile
+    return tempfile.mkdtemp(prefix="boxtree-pytest-")
+
+
+# {{{ test_against_shared
+
+def _test_against_shared(
+        tmp_cache_basedir,
+        dims, nsources, ntargets, dtype, communicate_mpoles_via_allreduce=False):
+    from mpi4py import MPI
+
+    # Get the current rank
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    rank_cache_dir = os.path.join(tmp_cache_basedir, f"rank_{rank:03d}")
+
+    # Initialize arguments for worker processes
+    global_tree_host = None
+    sources_weights = np.empty(0, dtype=dtype)
+    helmholtz_k = 0
+
+    def fmm_level_to_order(tree, level):
+        return max(level, 3)
+
+    from unittest.mock import patch
+    with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}):
+        actx = _acf()
+
+        from boxtree.traversal import FMMTraversalBuilder
+        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+
+        tree_indep = FMMLibTreeIndependentDataForWrangler(
+            dims, Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
+
+        # Generate particles and run shared-memory parallelism on rank 0
+        if rank == 0:
+            # Generate random particles and source weights
+            from boxtree.tools import make_normal_particle_array as p_normal
+            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+            targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+
+            rng = np.random.default_rng(20)
+            sources_weights = rng.uniform(0.0, 1.0, (nsources,))
+            target_radii = rng.uniform(0.0, 0.05, (ntargets,))
+
+            # Build the tree and interaction lists
+            from boxtree import TreeBuilder
+            tb = TreeBuilder(actx.context)
+            global_tree_dev, _ = tb(
+                actx.queue, sources, targets=targets, target_radii=target_radii,
+                stick_out_factor=0.25, max_particles_in_box=30, debug=True)
+
+            d_trav, _ = tg(actx.queue, global_tree_dev, debug=True)
+            global_traversal_host = d_trav.get(queue=actx.queue)
+            global_tree_host = global_traversal_host.tree
+
+            # Get pyfmmlib expansion wrangler
+            wrangler = FMMLibExpansionWrangler(
+                    tree_indep, global_traversal_host,
+                    fmm_level_to_order=fmm_level_to_order)
+
+            # Compute FMM with one MPI rank
+            from boxtree.fmm import drive_fmm
+            pot_fmm = drive_fmm(wrangler, [sources_weights]) * 2 * np.pi
+
+        # Compute FMM using the distributed implementation
+
+        def wrangler_factory(local_traversal, global_traversal):
+            from boxtree.distributed.calculation import (
+                DistributedFMMLibExpansionWrangler,
+            )
+
+            return DistributedFMMLibExpansionWrangler(
+                actx.context, comm, tree_indep, local_traversal, global_traversal,
+                fmm_level_to_order=fmm_level_to_order,
+                communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
+
+        from boxtree.distributed import DistributedFMMRunner
+        distributed_fmm_info = DistributedFMMRunner(
+            actx.queue, global_tree_host, tg, wrangler_factory, comm=comm)
+
+        timing_data = {}
+        pot_dfmm = distributed_fmm_info.drive_dfmm(
+                    [sources_weights], timing_data=timing_data)
+        assert timing_data
+
+    # Uncomment the following section to print the time taken of each stage
+    """
+    if rank == 1:
+        from pytools import Table
+        table = Table()
+        table.add_row(["stage", "time (s)"])
+        for stage in timing_data:
+            table.add_row([stage, "%.2f" % timing_data[stage]["wall_elapsed"]])
+        print(table)
+    """
+
+    if rank == 0:
+        error = (la.norm(pot_fmm - pot_dfmm * 2 * np.pi, ord=np.inf)
+                 / la.norm(pot_fmm, ord=np.inf))
+        print(error)
+        assert error < 1e-14
+
+
+@pytest.mark.mpi
+@pytest.mark.parametrize(
+    "num_processes, dims, nsources, ntargets, communicate_mpoles_via_allreduce", [
+        (4, 3, 10000, 10000, True),
+        (4, 3, 10000, 10000, False)
+    ]
+)
+def test_against_shared(
+        tmp_path, num_processes, dims, nsources, ntargets,
+        communicate_mpoles_via_allreduce):
+    pytest.importorskip("mpi4py")
+
+    from boxtree.tools import run_mpi
+    run_mpi(__file__, num_processes, {
+        "OMP_NUM_THREADS": 1,
+        "_BOXTREE_TEST_NAME": "shared",
+        "_BOXTREE_TEST_DIMS": dims,
+        "_BOXTREE_TEST_NSOURCES": nsources,
+        "_BOXTREE_TEST_NTARGETS": ntargets,
+        "_BOXTREE_TEST_TMP_CACHE_BASEDIR": tmp_path / "boxtree_distributed_test",
+        "_BOXTREE_TEST_MPOLES_ALLREDUCE": communicate_mpoles_via_allreduce
+        })
+
+# }}}
+
+
+# {{{ test_constantone
+
+def _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype):
+    from boxtree.distributed.calculation import DistributedExpansionWrangler
+
+    class ConstantOneExpansionWrangler(
+            ConstantOneExpansionWranglerBase, DistributedExpansionWrangler):
+        def __init__(
+                self, queue, comm, tree_indep, local_traversal, global_traversal):
+            DistributedExpansionWrangler.__init__(
+                self, queue, comm, global_traversal, False,
+                communicate_mpoles_via_allreduce=True)
+            ConstantOneExpansionWranglerBase.__init__(
+                self, tree_indep, local_traversal)
+            self.level_orders = np.ones(local_traversal.tree.nlevels, dtype=np.int32)
+
+        def reorder_sources(self, source_array):
+            if self.comm.Get_rank() == 0:
+                return source_array[self.global_traversal.tree.user_source_ids]
+            else:
+                return None
+
+        def reorder_potentials(self, potentials):
+            if self.comm.Get_rank() == 0:
+                return potentials[self.global_traversal.tree.sorted_target_ids]
+            else:
+                return None
+
+    from mpi4py import MPI
+
+    # Get the current rank
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    rank_cache_dir = os.path.join(tmp_cache_basedir, f"rank_{rank:03d}")
+
+    # Initialization
+    tree = None
+    sources_weights = np.empty(0, dtype=dtype)
+
+    from unittest.mock import patch
+    with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}):
+        actx = _acf()
+
+        from boxtree.traversal import FMMTraversalBuilder
+        tg = FMMTraversalBuilder(actx.context)
+
+        if rank == 0:
+
+            # Generate random particles
+            from boxtree.tools import make_normal_particle_array as p_normal
+            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+            targets = (p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+                       + np.array([2, 0, 0])[:dims])
+
+            # Constant one source weights
+            sources_weights = np.ones((nsources,), dtype=dtype)
+
+            # Build the global tree
+            from boxtree import TreeBuilder
+            tb = TreeBuilder(actx.context)
+            tree, _ = tb(
+                    actx.queue, sources, targets=targets, max_particles_in_box=30,
+                    debug=True)
+            tree = tree.get(actx.queue)
+
+        tree_indep = ConstantOneTreeIndependentDataForWrangler()
+
+        def wrangler_factory(local_traversal, global_traversal):
+            return ConstantOneExpansionWrangler(
+                    actx.queue, comm, tree_indep, local_traversal, global_traversal)
+
+        from boxtree.distributed import DistributedFMMRunner
+        distributed_fmm_info = DistributedFMMRunner(
+            actx.queue, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD)
+
+        pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights])
+
+    if rank == 0:
+        assert (np.all(pot_dfmm == nsources))
+
+
+@pytest.mark.mpi
+@pytest.mark.parametrize("num_processes, dims, nsources, ntargets", [
+    (4, 3, 10000, 10000)
+])
+def test_constantone(tmp_path, num_processes, dims, nsources, ntargets):
+    pytest.importorskip("mpi4py")
+
+    from boxtree.tools import run_mpi
+    run_mpi(__file__, num_processes, {
+        "OMP_NUM_THREADS": 1,
+        "_BOXTREE_TEST_NAME": "constantone",
+        "_BOXTREE_TEST_DIMS": dims,
+        "_BOXTREE_TEST_NSOURCES": nsources,
+        "_BOXTREE_TEST_NTARGETS": ntargets,
+        "_BOXTREE_TEST_TMP_CACHE_BASEDIR": tmp_path / "boxtree_distributed_test",
+        "_BOXTREE_TEST_MPOLES_ALLREDUCE": False
+        })
+
+# }}}
+
+
+if __name__ == "__main__":
+    dtype = np.float64
+    tmp_cache_basedir = os.environ.get("_BOXTREE_TEST_TMP_CACHE_BASEDIR", _cachedir())
+
+    if "_BOXTREE_TEST_DIMS" in os.environ:
+        dims = int(os.environ["_BOXTREE_TEST_DIMS"])
+        nsources = int(os.environ["_BOXTREE_TEST_NSOURCES"])
+        ntargets = int(os.environ["_BOXTREE_TEST_NTARGETS"])
+        communicate_mpoles_via_allreduce = (
+                os.environ["_BOXTREE_TEST_MPOLES_ALLREDUCE"] == "True"
+                )
+
+        if os.environ["_BOXTREE_TEST_NAME"] == "shared":
+            _test_against_shared(
+                tmp_cache_basedir,
+                dims, nsources, ntargets, dtype,
+                communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
+        elif os.environ["_BOXTREE_TEST_NAME"] == "constantone":
+            _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype)
+    else:
+        if len(sys.argv) > 1:
+
+            # You can test individual routines by typing
+            # $ python test_distributed.py 'test_constantone(
+            #       tmp_cache_basedir, 4, 3, 10000, 10000)'
+            exec(sys.argv[1])
+
+        elif len(sys.argv) == 1:
+
+            # Run against_shared test case with default parameter
+            dims = 3
+            nsources = 10000
+            ntargets = 10000
+
+            _test_against_shared(tmp_cache_basedir, dims, nsources, ntargets, dtype)
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
-from __future__ import division, absolute_import, print_function
-
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

 __license__ = """
@@ -22,169 +20,87 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

-from six.moves import range
+import logging

 import numpy as np
 import numpy.linalg as la
-import pyopencl as cl
-
 import pytest
-from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+from arraycontext import pytest_generate_tests_for_array_contexts
+
+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,  # noqa: F401
+)
+from boxtree.constant_one import (
+    ConstantOneExpansionWrangler,
+    ConstantOneTreeIndependentDataForWrangler,
+)
+from boxtree.tools import (  # noqa: F401
+    make_normal_particle_array as p_normal,
+    make_surface_particle_array as p_surface,
+    make_uniform_particle_array as p_uniform,
+    particle_array_to_host,
+)

-from boxtree.tools import (
-        make_normal_particle_array as p_normal,
-        make_surface_particle_array as p_surface,
-        make_uniform_particle_array as p_uniform,
-        particle_array_to_host)

-import logging
 logger = logging.getLogger(__name__)

+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])

-# {{{ fmm interaction completeness test
-
-class ConstantOneExpansionWrangler(object):
-    """This implements the 'analytical routines' for a Green's function that is
-    constant 1 everywhere. For 'charges' of 'ones', this should get every particle
-    a copy of the particle count.
-    """
-
-    def __init__(self, tree):
-        self.tree = tree
-
-    def multipole_expansion_zeros(self):
-        return np.zeros(self.tree.nboxes, dtype=np.float64)
-
-    local_expansion_zeros = multipole_expansion_zeros
-
-    def potential_zeros(self):
-        return np.zeros(self.tree.ntargets, dtype=np.float64)
-
-    def _get_source_slice(self, ibox):
-        pstart = self.tree.box_source_starts[ibox]
-        return slice(
-                pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
-
-    def _get_target_slice(self, ibox):
-        pstart = self.tree.box_target_starts[ibox]
-        return slice(
-                pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
-
-    def reorder_sources(self, source_array):
-        return source_array[self.tree.user_source_ids]
-
-    def reorder_potentials(self, potentials):
-        return potentials[self.tree.sorted_target_ids]
-
-    def form_multipoles(self, source_boxes, src_weights):
-        mpoles = self.multipole_expansion_zeros()
-        for ibox in source_boxes:
-            pslice = self._get_source_slice(ibox)
-            mpoles[ibox] += np.sum(src_weights[pslice])
-
-        return mpoles
-
-    def coarsen_multipoles(self, parent_boxes, mpoles):
-        tree = self.tree
-
-        for ibox in parent_boxes:
-            for child in tree.box_child_ids[:, ibox]:
-                if child:
-                    mpoles[ibox] += mpoles[child]
-
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
-            neighbor_sources_lists, src_weights):
-        pot = self.potential_zeros()
-
-        for itgt_box, tgt_ibox in enumerate(target_boxes):
-            tgt_pslice = self._get_target_slice(tgt_ibox)
-
-            src_sum = 0
-            start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
-            #print "DIR: %s <- %s" % (tgt_ibox, neighbor_sources_lists[start:end])
-            for src_ibox in neighbor_sources_lists[start:end]:
-                src_pslice = self._get_source_slice(src_ibox)
-
-                src_sum += np.sum(src_weights[src_pslice])

-            pot[tgt_pslice] = src_sum
+# {{{ ref fmmlib pot computation

-        return pot
+def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host,
+        helmholtz_k, dipole_vec=None):
+    dims = sources_host.shape[0]
+    eqn_letter = "h" if helmholtz_k else "l"
+    use_dipoles = dipole_vec is not None

-    def multipole_to_local(self, target_or_target_parent_boxes,
-            starts, lists, mpole_exps):
-        local_exps = self.local_expansion_zeros()
+    import pyfmmlib

-        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
-            start, end = starts[itgt_box:itgt_box+2]
+    name = "fld" if dims == 3 else "grad"
+    dp = "_dp" if use_dipoles else ""
+    fmmlib_routine = getattr(
+            pyfmmlib,
+            f"{eqn_letter}pot{name}{dims}dall{dp}_vec")

-            contrib = 0
-            #print tgt_ibox, "<-", lists[start:end]
-            for src_ibox in lists[start:end]:
-                contrib += mpole_exps[src_ibox]
-
-            local_exps[tgt_ibox] += contrib
-
-        return local_exps
-
-    def eval_multipoles(self, target_boxes, sep_smaller_nonsiblings_starts,
-            sep_smaller_nonsiblings_lists, mpole_exps):
-        pot = self.potential_zeros()
-
-        for itgt_box, tgt_ibox in enumerate(target_boxes):
-            tgt_pslice = self._get_target_slice(tgt_ibox)
-
-            contrib = 0
-            start, end = sep_smaller_nonsiblings_starts[itgt_box:itgt_box+2]
-            for src_ibox in sep_smaller_nonsiblings_lists[start:end]:
-                contrib += mpole_exps[src_ibox]
-
-            pot[tgt_pslice] += contrib
-
-        return pot
-
-    def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights):
-        local_exps = self.local_expansion_zeros()
-
-        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
-            start, end = starts[itgt_box:itgt_box+2]
-
-            #print "LIST 4", tgt_ibox, "<-", lists[start:end]
-            contrib = 0
-            for src_ibox in lists[start:end]:
-                src_pslice = self._get_source_slice(src_ibox)
-
-                contrib += np.sum(src_weights[src_pslice])
-
-            local_exps[tgt_ibox] += contrib
-
-        return local_exps
-
-    def refine_locals(self, child_boxes, local_exps):
-        for ibox in child_boxes:
-            local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
+    kwargs = {}
+    if dims == 3:
+        kwargs["iffld"] = False
+    else:
+        kwargs["ifgrad"] = False
+        kwargs["ifhess"] = False

-        return local_exps
+    if use_dipoles:
+        if helmholtz_k == 0 and dims == 2:
+            kwargs["dipstr"] = -weights * (dipole_vec[0] + 1j * dipole_vec[1])
+        else:
+            kwargs["dipstr"] = weights
+            kwargs["dipvec"] = dipole_vec
+    else:
+        kwargs["charge"] = weights
+    if helmholtz_k:
+        kwargs["zk"] = helmholtz_k

-    def eval_locals(self, target_boxes, local_exps):
-        pot = self.potential_zeros()
+    return wrangler.finalize_potentials(
+            fmmlib_routine(
+                sources=sources_host, targets=targets_host,
+                **kwargs)[0],
+            template_ary=weights)

-        for ibox in target_boxes:
-            tgt_pslice = self._get_target_slice(ibox)
-            pot[tgt_pslice] += local_exps[ibox]
+# }}}

-        return pot

+# {{{ fmm interaction completeness test

 class ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
        ConstantOneExpansionWrangler):
-    def __init__(self, tree, filtered_targets):
-        ConstantOneExpansionWrangler.__init__(self, tree)
-        self.filtered_targets = filtered_targets

-    def potential_zeros(self):
-        return np.zeros(self.filtered_targets.nfiltered_targets, dtype=np.float64)
+    def __init__(self, tree_indep, traversal, filtered_targets):
+        super().__init__(tree_indep, traversal)
+        self.filtered_targets = filtered_targets

    def _get_target_slice(self, ibox):
        pstart = self.filtered_targets.box_target_starts[ibox]
@@ -192,6 +108,9 @@ class ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
                pstart, pstart
                + self.filtered_targets.box_target_counts_nonchild[ibox])

+    def output_zeros(self):
+        return np.zeros(self.filtered_targets.nfiltered_targets, dtype=np.float64)
+
    def reorder_potentials(self, potentials):
        tree_order_all_potentials = np.zeros(self.tree.ntargets, potentials.dtype)
        tree_order_all_potentials[
@@ -203,8 +122,8 @@ class ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(

 class ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
        ConstantOneExpansionWrangler):
-    def __init__(self, tree, filtered_targets):
-        ConstantOneExpansionWrangler.__init__(self, tree)
+    def __init__(self, tree_indep, traversal, filtered_targets):
+        super().__init__(tree_indep, traversal)
        self.filtered_targets = filtered_targets

    def _get_target_slice(self, ibox):
@@ -214,55 +133,52 @@ class ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
        return self.tree.sorted_target_ids[user_target_ids]


+@pytest.mark.parametrize("well_sep_is_n_away", [1, 2])
 @pytest.mark.parametrize(("dims", "nsources_req", "ntargets_req",
-        "who_has_extent", "source_gen", "target_gen", "filter_kind"),
+        "who_has_extent", "source_gen", "target_gen", "filter_kind",
+        "extent_norm", "from_sep_smaller_crit"),
        [
-            (2, 10**5, None, "", p_normal, p_normal, None),
-            (3, 5 * 10**4, 4*10**4, "", p_normal, p_normal, None),
-            (2, 5 * 10**5, 4*10**4, "s", p_normal, p_normal, None),
-            (2, 5 * 10**5, 4*10**4, "st", p_normal, p_normal, None),
-            (2, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None),
-            (2, 5 * 10**5, 4*10**4, "st", p_surface, p_uniform, None),
-
-            (3, 10**5, None, "", p_normal, p_normal, None),
-            (3, 5 * 10**4, 4*10**4, "", p_normal, p_normal, None),
-            (3, 5 * 10**5, 4*10**4, "s", p_normal, p_normal, None),
-            (3, 5 * 10**5, 4*10**4, "st", p_normal, p_normal, None),
-            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None),
-            (3, 5 * 10**5, 4*10**4, "st", p_surface, p_uniform, None),
-
-            (2, 10**5, None, "", p_normal, p_normal, "user"),
-            (3, 5 * 10**4, 4*10**4, "", p_normal, p_normal, "user"),
-            (2, 5 * 10**5, 4*10**4, "s", p_normal, p_normal, "user"),
-            (2, 5 * 10**5, 4*10**4, "st", p_normal, p_normal, "user"),
-            (2, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, "user"),
-            (2, 5 * 10**5, 4*10**4, "st", p_surface, p_uniform, "user"),
-
-            (2, 10**5, None, "", p_normal, p_normal, "tree"),
-            (3, 5 * 10**4, 4*10**4, "", p_normal, p_normal, "tree"),
-            (2, 5 * 10**5, 4*10**4, "s", p_normal, p_normal, "tree"),
-            (2, 5 * 10**5, 4*10**4, "st", p_normal, p_normal, "tree"),
-            (2, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, "tree"),
-            (2, 5 * 10**5, 4*10**4, "st", p_surface, p_uniform, "tree"),
+            (1, 10**5, None, "", p_normal, p_normal, None, "linf", "static_linf"),
+
+            (2, 10**5, None, "", p_normal, p_normal, None, "linf", "static_linf"),
+            (2, 5 * 10**4, 4*10**4, "", p_normal, p_normal, None, "linf", "static_linf"),  # noqa: E501
+            (2, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "linf", "static_linf"),  # noqa: E501
+
+            (3, 10**5, None, "", p_normal, p_normal, None, "linf", "static_linf"),
+            (3, 5 * 10**5, 4*10**4, "", p_normal, p_normal, None, "linf", "static_linf"),  # noqa: E501
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "linf", "static_linf"),  # noqa: E501
+
+            (2, 10**5, None, "", p_normal, p_normal, "user", "linf", "static_linf"),
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, "user", "linf", "static_linf"),  # noqa: E501
+            (2, 10**5, None, "", p_normal, p_normal, "tree", "linf", "static_linf"),
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, "tree", "linf", "static_linf"),  # noqa: E501
+
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "linf", "static_linf"),  # noqa: E501
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "linf", "precise_linf"),  # noqa: E501
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "l2", "precise_linf"),  # noqa: E501
+            (3, 5 * 10**5, 4*10**4, "t", p_normal, p_normal, None, "l2", "static_l2"),
+
            ])
-def test_fmm_completeness(ctx_getter, dims, nsources_req, ntargets_req,
-         who_has_extent, source_gen, target_gen, filter_kind):
+def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
+         who_has_extent, source_gen, target_gen, filter_kind, well_sep_is_n_away,
+         extent_norm, from_sep_smaller_crit):
    """Tests whether the built FMM traversal structures and driver completely
    capture all interactions.
    """
+    actx = actx_factory()
+
+    devname = actx.queue.device.name.lower()
+    if (dims == 1
+            and actx.queue.device.platform.name == "Portable Computing Language"
+            and ("nvidia" in devname or "tesla" in devname)):
+        pytest.xfail("1D FMM fails to build on POCL Nvidia")

    sources_have_extent = "s" in who_has_extent
    targets_have_extent = "t" in who_has_extent
-
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
-
    dtype = np.float64

    try:
-        sources = source_gen(queue, nsources_req, dims, dtype, seed=15)
+        sources = source_gen(actx.queue, nsources_req, dims, dtype, seed=15)
        nsources = len(sources[0])

        if ntargets_req is None:
@@ -270,143 +186,182 @@ def test_fmm_completeness(ctx_getter, dims, nsources_req, ntargets_req,
            targets = None
            ntargets = ntargets_req
        else:
-            targets = target_gen(queue, ntargets_req, dims, dtype, seed=16)
+            targets = target_gen(actx.queue, ntargets_req, dims, dtype, seed=16)
            ntargets = len(targets[0])
    except ImportError:
-        pytest.skip("loo.py not available, but needed for particle array "
+        pytest.skip("loopy not available, but needed for particle array "
                "generation")

-    from pyopencl.clrandom import RanluxGenerator
-    rng = RanluxGenerator(queue, seed=13)
+    rng = np.random.default_rng(12)
    if sources_have_extent:
-        source_radii = 2**rng.uniform(queue, nsources, dtype=dtype,
-                a=-10, b=0)
+        source_radii = actx.from_numpy(
+                2**rng.uniform(-10, 0, (nsources)).astype(dtype)
+                )
    else:
        source_radii = None

    if targets_have_extent:
-        target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype,
-                a=-10, b=0)
+        target_radii = actx.from_numpy(
+                2**rng.uniform(-10, 0, (ntargets,)).astype(dtype)
+                )
    else:
        target_radii = None

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    tree, _ = tb(queue, sources, targets=targets,
+    tree, _ = tb(actx.queue, sources, targets=targets,
            max_particles_in_box=30,
            source_radii=source_radii, target_radii=target_radii,
-            debug=True)
+            debug=True, stick_out_factor=0.25, extent_norm=extent_norm)
    if 0:
-        tree.get().plot()
+        tree = tree.get(queue=actx.queue)
+        tree.plot()
        import matplotlib.pyplot as pt
        pt.show()

    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(ctx)
-    trav, _ = tbuild(queue, tree, debug=True)
-    if trav.sep_close_smaller_starts is not None:
-        trav = trav.merge_close_lists(queue)
+    tbuild = FMMTraversalBuilder(actx.context,
+            well_sep_is_n_away=well_sep_is_n_away,
+            from_sep_smaller_crit=from_sep_smaller_crit)
+    trav, _ = tbuild(actx.queue, tree, debug=True)

-    weights = np.random.randn(nsources)
-    #weights = np.ones(nsources)
+    if who_has_extent:
+        pre_merge_trav = trav
+        trav = trav.merge_close_lists(actx.queue)
+
+    # weights = np.random.randn(nsources)
+    weights = np.ones(nsources)
    weights_sum = np.sum(weights)

-    host_trav = trav.get(queue=queue)
+    host_trav = trav.get(queue=actx.queue)
    host_tree = host_trav.tree

+    if who_has_extent:
+        pre_merge_host_trav = pre_merge_trav.get(queue=actx.queue)
+
+    from boxtree.tree import ParticleListFilter
+    plfilt = ParticleListFilter(actx.context)
+
+    tree_indep = ConstantOneTreeIndependentDataForWrangler()
+
    if filter_kind:
-        flags = rng.uniform(queue, ntargets or nsources, np.int32, a=0, b=2) \
-                .astype(np.int8)
+        flags = actx.from_numpy(
+                rng.integers(0, 2, ntargets or nsources, dtype=np.int8)
+                )
+
        if filter_kind == "user":
-            from boxtree.tree import filter_target_lists_in_user_order
-            filtered_targets = filter_target_lists_in_user_order(queue, tree, flags)
+            filtered_targets = plfilt.filter_target_lists_in_user_order(
+                    actx.queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
-                    host_tree, filtered_targets.get(queue=queue))
+                    tree_indep, host_trav,
+                    filtered_targets.get(queue=actx.queue))
        elif filter_kind == "tree":
-            from boxtree.tree import filter_target_lists_in_tree_order
-            filtered_targets = filter_target_lists_in_tree_order(queue, tree, flags)
+            filtered_targets = plfilt.filter_target_lists_in_tree_order(
+                    actx.queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
-                    host_tree, filtered_targets.get(queue=queue))
+                    tree_indep, host_trav,
+                    filtered_targets.get(queue=actx.queue))
        else:
            raise ValueError("unsupported value of 'filter_kind'")
    else:
-        wrangler = ConstantOneExpansionWrangler(host_tree)
+        wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
+        flags = 1 + actx.np.zeros(ntargets or nsources, dtype=np.int8)

    if ntargets is None and not filter_kind:
        # This check only works for targets == sources.
-        assert (wrangler.reorder_potentials(
-                wrangler.reorder_sources(weights)) == weights).all()
+        assert np.all(
+                wrangler.reorder_potentials(wrangler.reorder_sources(weights))
+                == weights)

    from boxtree.fmm import drive_fmm
-    pot = drive_fmm(host_trav, wrangler, weights)
+    pot = drive_fmm(wrangler, (weights,))

-    # {{{ build, evaluate matrix (and identify missing interactions)
+    if filter_kind:
+        pot = pot[actx.to_numpy(flags) > 0]

-    if 0:
+    rel_err = la.norm((pot - weights_sum) / nsources)
+    good = rel_err < 1e-8
+
+    # {{{ build, evaluate matrix (and identify incorrect interactions)
+
+    if 0 and not good:
        mat = np.zeros((ntargets, nsources), dtype)
        from pytools import ProgressBar

-        logging.getLogger().setLevel(logging.WARNING)
-
        pb = ProgressBar("matrix", nsources)
        for i in range(nsources):
            unit_vec = np.zeros(nsources, dtype=dtype)
            unit_vec[i] = 1
-            mat[:, i] = drive_fmm(host_trav, wrangler, unit_vec)
+            mat[:, i] = drive_fmm(host_trav, wrangler, (unit_vec,))
            pb.progress()
        pb.finished()

-        logging.getLogger().setLevel(logging.INFO)
-
        import matplotlib.pyplot as pt

-        if 1:
-            pt.spy(mat)
+        if 0:
+            pt.imshow(mat)
+            pt.colorbar()
            pt.show()

-        missing_tgts, missing_srcs = np.where(mat == 0)
+        incorrect_tgts, incorrect_srcs = np.where(mat != 1)

-        if 1 and len(missing_tgts):
+        if 1 and len(incorrect_tgts):
            from boxtree.visualization import TreePlotter
            plotter = TreePlotter(host_tree)
            plotter.draw_tree(fill=False, edgecolor="black")
            plotter.draw_box_numbers()
            plotter.set_bounding_box()

-            tree_order_missing_tgts = \
-                    host_tree.indices_to_tree_target_order(missing_tgts)
-            tree_order_missing_srcs = \
-                    host_tree.indices_to_tree_source_order(missing_srcs)
+            tree_order_incorrect_tgts = \
+                    host_tree.indices_to_tree_target_order(incorrect_tgts)
+            tree_order_incorrect_srcs = \
+                    host_tree.indices_to_tree_source_order(incorrect_srcs)

            src_boxes = [
                    host_tree.find_box_nr_for_source(i)
-                    for i in tree_order_missing_srcs]
+                    for i in tree_order_incorrect_srcs]
            tgt_boxes = [
                    host_tree.find_box_nr_for_target(i)
-                    for i in tree_order_missing_tgts]
+                    for i in tree_order_incorrect_tgts]
            print(src_boxes)
            print(tgt_boxes)

-            pt.plot(
-                    host_tree.targets[0][tree_order_missing_tgts],
-                    host_tree.targets[1][tree_order_missing_tgts],
-                    "rv")
-            pt.plot(
-                    host_tree.sources[0][tree_order_missing_srcs],
-                    host_tree.sources[1][tree_order_missing_srcs],
-                    "go")
+            # plot all sources/targets
+            if 0:
+                pt.plot(
+                        host_tree.targets[0],
+                        host_tree.targets[1],
+                        "v", alpha=0.9)
+                pt.plot(
+                        host_tree.sources[0],
+                        host_tree.sources[1],
+                        "gx", alpha=0.9)
+
+            # plot offending sources/targets
+            if 0:
+                pt.plot(
+                        host_tree.targets[0][tree_order_incorrect_tgts],
+                        host_tree.targets[1][tree_order_incorrect_tgts],
+                        "rv")
+                pt.plot(
+                        host_tree.sources[0][tree_order_incorrect_srcs],
+                        host_tree.sources[1][tree_order_incorrect_srcs],
+                        "go")
            pt.gca().set_aspect("equal")

+            from boxtree.visualization import draw_box_lists
+            draw_box_lists(
+                    plotter,
+                    pre_merge_host_trav if who_has_extent else host_trav,  # pylint: disable=possibly-used-before-assignment
+                    22)
+            # from boxtree.visualization import draw_same_level_non_well_sep_boxes
+            # draw_same_level_non_well_sep_boxes(plotter, host_trav, 2)
+
            pt.show()

    # }}}

-    if filter_kind:
-        pot = pot[flags.get() > 0]
-
-    rel_err = la.norm((pot - weights_sum) / nsources)
-    good = rel_err < 1e-8
    if 0 and not good:
        import matplotlib.pyplot as pt
        pt.plot(pot-weights_sum)
@@ -415,8 +370,8 @@ def test_fmm_completeness(ctx_getter, dims, nsources_req, ntargets_req,
    if 0 and not good:
        import matplotlib.pyplot as pt
        filt_targets = [
-                host_tree.targets[0][flags.get() > 0],
-                host_tree.targets[1][flags.get() > 0],
+                host_tree.targets[0][actx.to_numpy(flags > 0)],
+                host_tree.targets[1][actx.to_numpy(flags > 0)],
                ]
        host_tree.plot()
        bad = np.abs(pot - weights_sum) >= 1e-3
@@ -434,79 +389,432 @@ def test_fmm_completeness(ctx_getter, dims, nsources_req, ntargets_req,
 # }}}


-# {{{ test Helmholtz fmm with pyfmmlib
-
-def test_pyfmmlib_fmm(ctx_getter):
-    logging.basicConfig(level=logging.INFO)
+# {{{ test fmmlib integration

-    from pytest import importorskip
-    importorskip("pyfmmlib")
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+@pytest.mark.parametrize("dims", [2, 3])
+@pytest.mark.parametrize("use_dipoles", [True, False])
+@pytest.mark.parametrize("helmholtz_k", [0, 2])
+def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k):
+    pytest.importorskip("pyfmmlib")
+    actx = actx_factory()

    nsources = 3000
    ntargets = 1000
-    dims = 2
    dtype = np.float64

-    helmholtz_k = 2
-
-    sources = p_normal(queue, nsources, dims, dtype, seed=15)
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
    targets = (
-            p_normal(queue, ntargets, dims, dtype, seed=18)
-            + np.array([2, 0]))
+            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            + np.array([2, 0, 0])[:dims])

    sources_host = particle_array_to_host(sources)
    targets_host = particle_array_to_host(targets)

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    tree, _ = tb(queue, sources, targets=targets,
+    tree, _ = tb(actx.queue, sources, targets=targets,
            max_particles_in_box=30, debug=True)

    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(ctx)
-    trav, _ = tbuild(queue, tree, debug=True)
+    tbuild = FMMTraversalBuilder(actx.context)
+    trav, _ = tbuild(actx.queue, tree, debug=True)
+
+    trav = trav.get(queue=actx.queue)
+
+    rng = np.random.default_rng(20)
+    weights = rng.uniform(0.0, 1.0, (nsources,))
+
+    rng = np.random.default_rng(seed=42)
+    dipole_vec = rng.normal(size=(dims, nsources)) if use_dipoles else None
+
+    base_order = 20 if (dims == 2 and helmholtz_k == 0) else 10

-    trav = trav.get(queue=queue)
+    def fmm_level_to_order(tree, lev):
+        result = base_order
+
+        if lev < 3 and helmholtz_k:
+            # exercise order-varies-by-level capability
+            result += 5
+
+        if use_dipoles:
+            result += 1
+
+        return result
+
+    from boxtree.pyfmmlib_integration import (
+        FMMLibExpansionWrangler,
+        FMMLibTreeIndependentDataForWrangler,
+        Kernel,
+    )
+    tree_indep = FMMLibTreeIndependentDataForWrangler(
+            trav.tree.dimensions,
+            Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
+    wrangler = FMMLibExpansionWrangler(
+            tree_indep, trav,
+            helmholtz_k=helmholtz_k,
+            fmm_level_to_order=fmm_level_to_order,
+            dipole_vec=dipole_vec)
+
+    from boxtree.fmm import drive_fmm

-    from pyopencl.clrandom import RanluxGenerator
-    rng = RanluxGenerator(queue, seed=20)
+    timing_data = {}
+    pot = drive_fmm(wrangler, (weights,), timing_data=timing_data)
+    print(timing_data)
+    assert timing_data

-    weights = rng.uniform(queue, nsources, dtype=np.float64).get()
-    #weights = np.ones(nsources)
+    # {{{ ref fmmlib computation

    logger.info("computing direct (reference) result")

-    from pyfmmlib import hpotgrad2dall_vec
-    ref_pot, _, _ = hpotgrad2dall_vec(ifgrad=False, ifhess=False,
-            sources=sources_host.T, charge=weights,
-            targets=targets_host.T, zk=helmholtz_k)
+    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources_host.T,
+            targets_host.T, helmholtz_k, dipole_vec)
+
+    rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
+    logger.info("relative l2 error vs fmmlib direct: %g", rel_err)
+    assert rel_err < 1e-5, rel_err
+
+    # }}}
+
+    # {{{ check against sumpy
+
+    try:
+        import sumpy  # noqa
+    except ImportError:
+        have_sumpy = False
+        from warnings import warn
+        warn("sumpy unavailable: cannot compute independent reference "
+                "values for pyfmmlib", stacklevel=1)
+    else:
+        have_sumpy = True
+
+    if have_sumpy:
+        from sumpy.kernel import (  # pylint:disable=import-error
+            DirectionalSourceDerivative,
+            HelmholtzKernel,
+            LaplaceKernel,
+        )
+        from sumpy.p2p import P2P  # pylint:disable=import-error
+
+        sumpy_extra_kwargs = {}
+        if helmholtz_k:
+            knl = HelmholtzKernel(dims)
+            sumpy_extra_kwargs["k"] = helmholtz_k
+        else:
+            knl = LaplaceKernel(dims)
+
+        if use_dipoles:
+            knl = DirectionalSourceDerivative(knl)
+            sumpy_extra_kwargs["src_derivative_dir"] = dipole_vec
+
+        p2p = P2P(actx.context,
+                [knl],
+                exclude_self=False)
+
+        _evt, (sumpy_ref_pot,) = p2p(
+                actx.queue, targets, sources, (weights,),
+                out_host=True, **sumpy_extra_kwargs)
+
+        sumpy_rel_err = (
+                la.norm(pot - sumpy_ref_pot, np.inf)
+                / la.norm(sumpy_ref_pot, np.inf))

-    from boxtree.pyfmmlib_integration import Helmholtz2DExpansionWrangler
-    wrangler = Helmholtz2DExpansionWrangler(trav.tree, helmholtz_k, nterms=10)
+        logger.info("relative l2 error vs sumpy direct: %g", sumpy_rel_err)
+        assert sumpy_rel_err < 1e-5, sumpy_rel_err
+
+    # }}}
+
+# }}}
+
+
+# {{{ test fmmlib numerical stability
+
+@pytest.mark.parametrize("dims", [2, 3])
+@pytest.mark.parametrize("helmholtz_k", [0, 2])
+@pytest.mark.parametrize("order", [35])
+def test_pyfmmlib_numerical_stability(actx_factory, dims, helmholtz_k, order):
+    pytest.importorskip("pyfmmlib")
+    actx = actx_factory()
+
+    nsources = 30
+    dtype = np.float64
+
+    # The input particles are arranged with geometrically increasing/decreasing
+    # spacing along a line, to build a deep tree that stress-tests the
+    # translations.
+    particle_line = np.array([2**-i for i in range(nsources//2)], dtype=dtype)
+    particle_line = np.hstack([particle_line, 3 - particle_line])
+    zero = np.zeros(nsources, dtype=dtype)
+
+    sources = np.vstack([
+            particle_line,
+            zero,
+            zero])[:dims]
+
+    targets = sources * (1 + 1e-3)
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    tree, _ = tb(actx.queue, sources, targets=targets,
+            max_particles_in_box=2, debug=True)
+
+    assert tree.nlevels >= 15
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(actx.context)
+    trav, _ = tbuild(actx.queue, tree, debug=True)
+
+    trav = trav.get(queue=actx.queue)
+    weights = np.ones_like(sources[0])
+
+    from boxtree.pyfmmlib_integration import (
+        FMMLibExpansionWrangler,
+        FMMLibRotationData,
+        FMMLibTreeIndependentDataForWrangler,
+        Kernel,
+    )
+
+    def fmm_level_to_order(tree, lev):
+        return order
+
+    tree_indep = FMMLibTreeIndependentDataForWrangler(
+            trav.tree.dimensions,
+            Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
+    wrangler = FMMLibExpansionWrangler(
+            tree_indep, trav,
+            helmholtz_k=helmholtz_k,
+            fmm_level_to_order=fmm_level_to_order,
+            rotation_data=FMMLibRotationData(actx.queue, trav))

    from boxtree.fmm import drive_fmm
-    pot = drive_fmm(trav, wrangler, weights)
+    pot = drive_fmm(wrangler, (weights,))
+    assert not np.isnan(pot).any()
+
+    # {{{ ref fmmlib computation
+
+    logger.info("computing direct (reference) result")
+
+    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources, targets,
+            helmholtz_k)
+
+    rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
+    logger.info("relative l2 error vs fmmlib direct: %g", rel_err)
+
+    error_bound = (1/2) ** (1 + order) if dims == 2 else (3/4) ** (1 + order)
+    assert rel_err < error_bound, rel_err
+
+    # }}}
+
+# }}}
+
+
+# {{{ test particle count thresholding in traversal generation
+
+@pytest.mark.parametrize("enable_extents", [True, False])
+def test_interaction_list_particle_count_thresholding(actx_factory, enable_extents):
+    actx = actx_factory()
+
+    dims = 2
+    nsources = 1000
+    ntargets = 1000
+    dtype = np.float64
+
+    max_particles_in_box = 30
+    # Ensure that we have underfilled boxes.
+    from_sep_smaller_min_nsources_cumul = 1 + max_particles_in_box
+
+    from boxtree.fmm import drive_fmm
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+
+    rng = np.random.default_rng(22)
+    if enable_extents:
+        target_radii = actx.from_numpy(
+                2**rng.uniform(-10, 0, (ntargets,)).astype(dtype)
+                )
+    else:
+        target_radii = None
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    tree, _ = tb(actx.queue, sources, targets=targets,
+            max_particles_in_box=max_particles_in_box,
+            target_radii=target_radii,
+            debug=True, stick_out_factor=0.25)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(actx.context)
+    trav, _ = tbuild(actx.queue, tree, debug=True,
+            _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul)
+
+    weights = np.ones(nsources)
+    weights_sum = np.sum(weights)
+
+    host_trav = trav.get(queue=actx.queue)
+
+    tree_indep = ConstantOneTreeIndependentDataForWrangler()
+    wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
+
+    pot = drive_fmm(wrangler, (weights,))
+
+    assert np.all(pot == weights_sum)
+
+# }}}
+
+
+# {{{ test fmm with float32 dtype
+
+@pytest.mark.parametrize("enable_extents", [True, False])
+def test_fmm_float32(actx_factory, enable_extents):
+    actx = actx_factory()
+
+    from pyopencl.characterize import has_struct_arg_count_bug
+    if has_struct_arg_count_bug(actx.queue.device, actx.context):
+        pytest.xfail("won't work on devices with the struct arg count issue")
+
+    dims = 2
+    nsources = 1000
+    ntargets = 1000
+    dtype = np.float32
+
+    from boxtree.fmm import drive_fmm
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+
+    rng = np.random.default_rng(12)
+    if enable_extents:
+        target_radii = actx.from_numpy(
+                2**rng.uniform(-10, 0, (ntargets,)).astype(dtype)
+                )
+    else:
+        target_radii = None
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    tree, _ = tb(actx.queue, sources, targets=targets,
+            max_particles_in_box=30,
+            target_radii=target_radii,
+            debug=True, stick_out_factor=0.25)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(actx.context)
+    trav, _ = tbuild(actx.queue, tree, debug=True)
+
+    weights = np.ones(nsources)
+    weights_sum = np.sum(weights)
+
+    host_trav = trav.get(queue=actx.queue)
+
+    tree_indep = ConstantOneTreeIndependentDataForWrangler()
+    wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
+
+    pot = drive_fmm(wrangler, (weights,))
+
+    assert np.all(pot == weights_sum)
+
+# }}}
+
+
+# {{{ test with fmm optimized 3d m2l
+
+@pytest.mark.parametrize("well_sep_is_n_away", (1, 2))
+@pytest.mark.parametrize("helmholtz_k", (0, 2))
+@pytest.mark.parametrize("nsrcntgts", (20, 10000))
+def test_fmm_with_optimized_3d_m2l(actx_factory, nsrcntgts, helmholtz_k,
+                                   well_sep_is_n_away):
+    pytest.importorskip("pyfmmlib")
+    actx = actx_factory()
+
+    dims = 3
+    nsources = ntargets = nsrcntgts // 2
+    dtype = np.float64
+
+    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    targets = (
+            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            + np.array([2, 0, 0])[:dims])
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    tree, _ = tb(actx.queue, sources, targets=targets,
+            max_particles_in_box=30, debug=True)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(actx.context)
+    trav, _ = tbuild(actx.queue, tree, debug=True)
+    trav = trav.get(queue=actx.queue)
+
+    rng = np.random.default_rng(20)
+    weights = rng.uniform(0.0, 1.0, (nsources,))
+
+    base_order = 10
+
+    def fmm_level_to_order(tree, lev):
+        result = base_order
+
+        if lev < 3 and helmholtz_k:
+            # exercise order-varies-by-level capability
+            result += 5
+
+        return result
+
+    from boxtree.pyfmmlib_integration import (
+        FMMLibExpansionWrangler,
+        FMMLibRotationData,
+        FMMLibTreeIndependentDataForWrangler,
+        Kernel,
+    )
+
+    tree_indep = FMMLibTreeIndependentDataForWrangler(
+            trav.tree.dimensions,
+            Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
+    baseline_wrangler = FMMLibExpansionWrangler(
+            tree_indep, trav,
+            helmholtz_k=helmholtz_k,
+            fmm_level_to_order=fmm_level_to_order)
+
+    optimized_wrangler = FMMLibExpansionWrangler(
+            tree_indep, trav,
+            helmholtz_k=helmholtz_k,
+            fmm_level_to_order=fmm_level_to_order,
+            rotation_data=FMMLibRotationData(actx.queue, trav))
+
+    from boxtree.fmm import drive_fmm
+
+    baseline_timing_data = {}
+    baseline_pot = drive_fmm(
+            baseline_wrangler, (weights,), timing_data=baseline_timing_data)
+
+    optimized_timing_data = {}
+    optimized_pot = drive_fmm(
+            optimized_wrangler, (weights,), timing_data=optimized_timing_data)
+
+    baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"]
+    if baseline_time is not None:
+        print(f"Baseline M2L time : {baseline_time:#.4g} s")
+
+    opt_time = optimized_timing_data["multipole_to_local"]["process_elapsed"]
+    if opt_time is not None:
+        print(f"Optimized M2L time: {opt_time:#.4g} s")

-    rel_err = la.norm(pot - ref_pot) / la.norm(ref_pot)
-    logger.info("relative l2 error: %g" % rel_err)
-    assert rel_err < 1e-5
+    assert np.allclose(baseline_pot, optimized_pot, atol=1e-13, rtol=1e-13)

 # }}}


 # You can test individual routines by typing
-# $ python test_fmm.py 'test_routine(cl.create_some_context)'
+# $ python test_fmm.py 'test_routine(_acf)'

 if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
-        from py.test.cmdline import main
+        from pytest import main
        main([__file__])

 # vim: fdm=marker
--- a/test/test_tools.py
+++ b/test/test_tools.py
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner \
+                 Copyright (C) 2017 Matt Wala \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+import logging
+
+import numpy as np
+import pytest
+from arraycontext import pytest_generate_tests_for_array_contexts
+
+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,  # noqa: F401
+)
+from boxtree.tools import (
+    make_normal_particle_array as p_normal,
+    make_surface_particle_array as p_surface,
+    make_uniform_particle_array as p_uniform,
+)
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+
+# {{{ test_allreduce_comm_pattern
+
+@pytest.mark.parametrize("p", [1, 2, 3, 4, 5, 6, 7, 8, 16, 17])
+def test_allreduce_comm_pattern(p):
+    from boxtree.tools import AllReduceCommPattern
+
+    # This models the parallel allreduce communication pattern.
+    # processor -> communication pattern of the processor
+    patterns = [AllReduceCommPattern(i, p) for i in range(p)]
+    # processor -> list of data items on the processor
+    data = [[i] for i in range(p)]
+    from copy import deepcopy
+
+    while not all(pat.done() for pat in patterns):
+        new_data = deepcopy(data)
+
+        for i in range(p):
+            if patterns[i].done():
+                for pat in patterns:
+                    if not pat.done():
+                        assert i not in pat.sources() | pat.sinks()
+                continue
+
+            # Check sources / sinks match up
+            for s in patterns[i].sinks():
+                assert i in patterns[s].sources()
+
+            for s in patterns[i].sources():
+                assert i in patterns[s].sinks()
+
+            # Send / recv data
+            for s in patterns[i].sinks():
+                new_data[s].extend(data[i])
+
+        for pat in patterns:
+            if not pat.done():
+                pat.advance()
+        data = new_data
+
+    for item in data:
+        assert len(item) == p
+        assert set(item) == set(range(p))
+
+# }}}
+
+
+# {{{ test_masked_matrix_compression
+
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_masked_matrix_compression(actx_factory, order):
+    actx = actx_factory()
+
+    from boxtree.tools import MaskCompressorKernel
+    matcompr = MaskCompressorKernel(actx.context)
+
+    n = 40
+    m = 10
+
+    rng = np.random.default_rng(15)
+    arr = (rng.random((n, m)) > 0.5).astype(np.int8).copy(order=order)
+    d_arr = actx.from_numpy(arr)
+
+    arr_starts, arr_lists, _evt = matcompr(actx.queue, d_arr)
+    arr_starts = actx.to_numpy(arr_starts)
+    arr_lists = actx.to_numpy(arr_lists)
+
+    for i in range(n):
+        items = arr_lists[arr_starts[i]:arr_starts[i+1]]
+        assert set(items) == set(arr[i].nonzero()[0])
+
+# }}}
+
+
+# {{{ test_masked_list_compression
+
+def test_masked_list_compression(actx_factory):
+    actx = actx_factory()
+    rng = np.random.default_rng(seed=42)
+
+    from boxtree.tools import MaskCompressorKernel
+    listcompr = MaskCompressorKernel(actx.context)
+
+    n = 20
+
+    arr = (rng.random(n) > 0.5).astype(np.int8)
+    d_arr = actx.from_numpy(arr)
+
+    arr_list, _evt = listcompr(actx.queue, d_arr)
+    arr_list = actx.to_numpy(arr_list)
+
+    assert set(arr_list) == set(arr.nonzero()[0])
+
+# }}}
+
+
+# {{{ test_device_record
+
+def test_device_record(actx_factory):
+    actx = actx_factory()
+
+    from boxtree.tools import DeviceDataRecord
+    array = np.arange(60).reshape((3, 4, 5))
+
+    obj_array = np.empty((3,), dtype=object)
+    for i in range(3):
+        obj_array[i] = np.arange((i + 1) * 40).reshape(5, i + 1, 8)
+
+    record = DeviceDataRecord(
+        array=array,
+        obj_array=obj_array
+    )
+
+    record_dev = record.to_device(actx.queue)
+    record_host = record_dev.get(actx.queue)
+
+    assert np.array_equal(record_host.array, record.array)
+
+    for i in range(3):
+        assert np.array_equal(record_host.obj_array[i], record.obj_array[i])
+
+# }}}
+
+
+# {{{ test_particle_array
+
+@pytest.mark.parametrize("array_factory", (p_normal, p_surface, p_uniform))
+@pytest.mark.parametrize("dim", (2, 3))
+@pytest.mark.parametrize("dtype", (np.float32, np.float64))
+def test_particle_array(actx_factory, array_factory, dim, dtype):
+    actx = actx_factory()
+
+    particles = array_factory(actx.queue, 1000, dim, dtype)
+    assert len(particles) == dim
+    assert all(len(particles[0]) == len(axis) for axis in particles)
+
+# }}}
+
+
+# You can test individual routines by typing
+# $ python test_tools.py 'test_routine'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

 __license__ = """
@@ -24,20 +20,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import logging

 import numpy as np
 import numpy.linalg as la
-import pyopencl as cl
-
 import pytest
-from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+from arraycontext import pytest_generate_tests_for_array_contexts

+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,  # noqa: F401
+)
 from boxtree.tools import make_normal_particle_array

-import logging
+
 logger = logging.getLogger(__name__)

+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+

 # {{{ connectivity test

@@ -48,30 +50,26 @@ logger = logging.getLogger(__name__)
    (3, True),
    (3, False),
    ])
-def test_tree_connectivity(ctx_getter, dims, sources_are_targets):
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
-
+def test_tree_connectivity(actx_factory, dims, sources_are_targets):
+    actx = actx_factory()
    dtype = np.float64

-    sources = make_normal_particle_array(queue, 1 * 10**5, dims, dtype)
+    sources = make_normal_particle_array(actx.queue, 1 * 10**5, dims, dtype)
    if sources_are_targets:
        targets = None
    else:
-        targets = make_normal_particle_array(queue, 2 * 10**5, dims, dtype)
+        targets = make_normal_particle_array(actx.queue, 2 * 10**5, dims, dtype)

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
-    tree, _ = tb(queue, sources, max_particles_in_box=30,
+    tb = TreeBuilder(actx.context)
+    tree, _ = tb(actx.queue, sources, max_particles_in_box=30,
            targets=targets, debug=True)

    from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(ctx)
-    trav, _ = tg(queue, tree, debug=True)
-    tree = tree.get(queue=queue)
-    trav = trav.get(queue=queue)
+    tg = FMMTraversalBuilder(actx.context)
+    trav, _ = tg(actx.queue, tree, debug=True)
+    tree = tree.get(queue=actx.queue)
+    trav = trav.get(queue=actx.queue)

    levels = tree.box_levels
    parents = tree.box_parent_ids.T
@@ -91,6 +89,7 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):

    if 0:
        import matplotlib.pyplot as pt
+
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black")
@@ -108,7 +107,7 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):
            assert ibox in nbl

        for jbox in nbl:
-            assert (0 == children[jbox]).all(), (ibox, jbox, children[jbox])
+            assert np.all(children[jbox] == 0), (ibox, jbox, children[jbox])

    logger.info("list 1 consists of source boxes")

@@ -117,10 +116,10 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):
    # {{{ separated siblings (list 2) are actually separated

    for itgt_box, tgt_ibox in enumerate(trav.target_or_target_parent_boxes):
-        start, end = trav.sep_siblings_starts[itgt_box:itgt_box+2]
-        seps = trav.sep_siblings_lists[start:end]
+        start, end = trav.from_sep_siblings_starts[itgt_box:itgt_box+2]
+        seps = trav.from_sep_siblings_lists[start:end]

-        assert (levels[seps] == levels[tgt_ibox]).all()
+        assert np.all(levels[seps] == levels[tgt_ibox])

        # three-ish box radii (half of size)
        mindist = 2.5 * 0.5 * 2**-int(levels[tgt_ibox]) * tree.root_extent
@@ -135,46 +134,63 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):
    # }}}

    if sources_are_targets:
-        # {{{ sep_{smaller,bigger} are duals of each other
+        # {{{ from_sep_{smaller,bigger} are duals of each other

-        assert (trav.target_or_target_parent_boxes == np.arange(tree.nboxes)).all()
+        assert np.all(trav.target_or_target_parent_boxes == np.arange(tree.nboxes))

        # {{{ list 4 <= list 3
-        for itarget_box, ibox in enumerate(trav.target_boxes):
-            start, end = trav.sep_smaller_starts[itarget_box:itarget_box+2]

-            for jbox in trav.sep_smaller_lists[start:end]:
-                rstart, rend = trav.sep_bigger_starts[jbox:jbox+2]
+        for level, ssn in enumerate(trav.from_sep_smaller_by_level):
+            for itarget_box, ibox in \
+                    enumerate(trav.target_boxes_sep_smaller_by_source_level[level]):
+                start, end = ssn.starts[itarget_box:itarget_box+2]

-                assert ibox in trav.sep_bigger_lists[rstart:rend], (ibox, jbox)
+                for jbox in ssn.lists[start:end]:
+                    rstart, rend = trav.from_sep_bigger_starts[jbox:jbox+2]
+
+                    assert ibox in trav.from_sep_bigger_lists[rstart:rend], \
+                            (ibox, jbox)

        # }}}

        # {{{ list 4 <= list 3

-        box_to_target_box_index = np.empty(tree.nboxes, tree.box_id_dtype)
-        box_to_target_box_index.fill(-1)
-        box_to_target_box_index[trav.target_boxes] = np.arange(
-                len(trav.target_boxes), dtype=tree.box_id_dtype)
-
-        assert (trav.source_boxes == trav.target_boxes).all()
-        assert (trav.target_or_target_parent_boxes == np.arange(
-                tree.nboxes, dtype=tree.box_id_dtype)).all()
+        box_to_target_boxes_sep_smaller_by_source_level = []
+        for level in range(trav.tree.nlevels):
+            box_to_target_boxes_sep_smaller = np.empty(
+                tree.nboxes, tree.box_id_dtype)
+            box_to_target_boxes_sep_smaller.fill(-1)
+            box_to_target_boxes_sep_smaller[
+                trav.target_boxes_sep_smaller_by_source_level[level]
+            ] = np.arange(
+                len(trav.target_boxes_sep_smaller_by_source_level[level]),
+                dtype=tree.box_id_dtype
+            )
+            box_to_target_boxes_sep_smaller_by_source_level.append(
+                box_to_target_boxes_sep_smaller)
+
+        assert np.all(trav.source_boxes == trav.target_boxes)
+        assert np.all(trav.target_or_target_parent_boxes == np.arange(
+                tree.nboxes, dtype=tree.box_id_dtype))

        for ibox in range(tree.nboxes):
-            start, end = trav.sep_bigger_starts[ibox:ibox+2]
+            start, end = trav.from_sep_bigger_starts[ibox:ibox+2]

-            for jbox in trav.sep_bigger_lists[start:end]:
-                # In principle, entries of sep_bigger_lists are
+            for jbox in trav.from_sep_bigger_lists[start:end]:
+                # In principle, entries of from_sep_bigger_lists are
                # source boxes. In this special case, source and target boxes
                # are the same thing (i.e. leaves--see assertion above), so we
                # may treat them as targets anyhow.

-                jtgt_box = box_to_target_box_index[jbox]
-                assert jtgt_box != -1
+                good = False

-                rstart, rend = trav.sep_smaller_starts[jtgt_box:jtgt_box+2]
-                good = ibox in trav.sep_smaller_lists[rstart:rend]
+                for level, ssn in enumerate(trav.from_sep_smaller_by_level):
+                    jtgt_box = \
+                        box_to_target_boxes_sep_smaller_by_source_level[level][jbox]
+                    if jtgt_box == -1:
+                        continue
+                    rstart, rend = ssn.starts[jtgt_box:jtgt_box + 2]
+                    good = good or ibox in ssn.lists[rstart:rend]

                if not good:
                    from boxtree.visualization import TreePlotter
@@ -182,8 +198,8 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):
                    plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
                    plotter.set_bounding_box()

-                    plotter.draw_box(ibox, facecolor='green', alpha=0.5)
-                    plotter.draw_box(jbox, facecolor='red', alpha=0.5)
+                    plotter.draw_box(ibox, facecolor="green", alpha=0.5)
+                    plotter.draw_box(jbox, facecolor="red", alpha=0.5)

                    import matplotlib.pyplot as pt
                    pt.gca().set_aspect("equal")
@@ -199,130 +215,194 @@ def test_tree_connectivity(ctx_getter, dims, sources_are_targets):

        # }}}

-    # {{{ sep_smaller satisfies relative level assumption
+    # {{{ from_sep_smaller satisfies relative level assumption

-    for itarget_box, ibox in enumerate(trav.target_boxes):
-        start, end = trav.sep_smaller_starts[itarget_box:itarget_box+2]
+    # for itarget_box, ibox in enumerate(trav.target_boxes):
+    #    for ssn in trav.from_sep_smaller_by_level:
+    for level, ssn in enumerate(trav.from_sep_smaller_by_level):
+        for itarget_box, ibox in enumerate(
+                trav.target_boxes_sep_smaller_by_source_level[level]):
+            start, end = ssn.starts[itarget_box:itarget_box+2]

-        for jbox in trav.sep_smaller_lists[start:end]:
-            assert levels[ibox] < levels[jbox]
+            for jbox in ssn.lists[start:end]:
+                assert levels[ibox] < levels[jbox]

    logger.info("list 3 satisfies relative level assumption")

    # }}}

-    # {{{ sep_smaller satisfies relative level assumption
+    # {{{ from_sep_bigger satisfies relative level assumption

    for itgt_box, tgt_ibox in enumerate(trav.target_or_target_parent_boxes):
-        start, end = trav.sep_bigger_starts[itgt_box:itgt_box+2]
+        start, end = trav.from_sep_bigger_starts[itgt_box:itgt_box+2]

-        for jbox in trav.sep_bigger_lists[start:end]:
+        for jbox in trav.from_sep_bigger_lists[start:end]:
            assert levels[tgt_ibox] > levels[jbox]

    logger.info("list 4 satisfies relative level assumption")

    # }}}

+    # {{{ level_start_*_box_nrs lists make sense
+
+    for name, ref_array in [
+            ("level_start_source_box_nrs", trav.source_boxes),
+            ("level_start_source_parent_box_nrs", trav.source_parent_boxes),
+            ("level_start_target_box_nrs", trav.target_boxes),
+            ("level_start_target_or_target_parent_box_nrs",
+                trav.target_or_target_parent_boxes)
+            ]:
+        level_starts = getattr(trav, name)
+        for lev in range(tree.nlevels):
+            start, stop = level_starts[lev:lev+2]
+
+            box_nrs = ref_array[start:stop]
+
+            assert np.all(tree.box_levels[box_nrs] == lev), name
+
+    # }}}
+
 # }}}


-# {{{ visualization helper (not a test)
+# {{{ visualization helper
+
+# Set 'visualize' kwarg to True to actually plot. Otherwise, this
+# test simply ensures that interaction list plotting is still
+# working.

-def plot_traversal(ctx_getter, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_plot_traversal(actx_factory, well_sep_is_n_away=1, visualize=False):
+    pytest.importorskip("matplotlib")
+    actx = actx_factory()

-    #for dims in [2, 3]:
    for dims in [2]:
        nparticles = 10**4
        dtype = np.float64

-        from pyopencl.clrandom import RanluxGenerator
-        rng = RanluxGenerator(queue, seed=15)
-
        from pytools.obj_array import make_obj_array
+        rng = np.random.default_rng(15)
        particles = make_obj_array([
-            rng.normal(queue, nparticles, dtype=dtype)
+            actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype))
            for i in range(dims)])

-        #if do_plot:
-            #pt.plot(particles[0].get(), particles[1].get(), "x")
-
        from boxtree import TreeBuilder
-        tb = TreeBuilder(ctx)
+        tb = TreeBuilder(actx.context)

-        queue.finish()
-        tree = tb(queue, particles, max_particles_in_box=30, debug=True)
+        actx.queue.finish()
+        tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)

        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(ctx)
-        trav = tg(queue, tree).get()
+        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
+        trav, _ = tg(actx.queue, tree)
+
+        tree = tree.get(queue=actx.queue)
+        trav = trav.get(queue=actx.queue)

        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black")
-        #plotter.draw_box_numbers()
+        # plotter.draw_box_numbers()
        plotter.set_bounding_box()

-        from random import randrange, seed
-        seed(7)
+        from boxtree.visualization import draw_box_lists
+        if well_sep_is_n_away == 1:
+            draw_box_lists(plotter, trav, 380)
+        elif well_sep_is_n_away == 2:
+            draw_box_lists(plotter, trav, 320)

-        # {{{ generic box drawing helper
+        if visualize:
+            import matplotlib.pyplot as pt
+            pt.gca().set_xticks([])
+            pt.gca().set_yticks([])

-        def draw_some_box_lists(starts, lists, key_to_box=None,
-                count=5):
-            actual_count = 0
-            while actual_count < count:
-                if key_to_box is not None:
-                    key = randrange(len(key_to_box))
-                    ibox = key_to_box[key]
-                else:
-                    key = ibox = randrange(tree.nboxes)
+            pt.show()

-                start, end = starts[key:key+2]
-                if start == end:
-                    continue
+# }}}

-                #print ibox, start, end, lists[start:end]
-                for jbox in lists[start:end]:
-                    plotter.draw_box(jbox, facecolor='yellow')

-                plotter.draw_box(ibox, facecolor='red')
+# {{{ test_from_sep_siblings_translation_and_rotation_classes

-                actual_count += 1
+@pytest.mark.parametrize("well_sep_is_n_away", (1, 2))
+def test_from_sep_siblings_translation_and_rotation_classes(
+        actx_factory, well_sep_is_n_away):
+    actx = actx_factory()

-        # }}}
+    dims = 3
+    nparticles = 10**4
+    dtype = np.float64

-        if 0:
-            # colleagues
-            draw_some_box_lists(
-                    trav.colleagues_starts,
-                    trav.colleagues_lists)
-        elif 0:
-            # near neighbors ("list 1")
-            draw_some_box_lists(
-                    trav.neighbor_leaves_starts,
-                    trav.neighbor_leaves_lists,
-                    key_to_box=trav.source_boxes)
-        elif 0:
-            # well-separated siblings (list 2)
-            draw_some_box_lists(
-                    trav.sep_siblings_starts,
-                    trav.sep_siblings_lists)
-        elif 1:
-            # separated smaller (list 3)
-            draw_some_box_lists(
-                    trav.sep_smaller_starts,
-                    trav.sep_smaller_lists,
-                    key_to_box=trav.source_boxes)
-        elif 1:
-            # separated bigger (list 4)
-            draw_some_box_lists(
-                    trav.sep_bigger_starts,
-                    trav.sep_bigger_lists)
+    # {{{ build tree

-        import matplotlib.pyplot as pt
-        pt.show()
+    from pytools.obj_array import make_obj_array
+    rng = np.random.default_rng(15)
+    particles = make_obj_array([
+        actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype))
+        for i in range(dims)])
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+
+    # }}}
+
+    # {{{ build traversal
+
+    from boxtree.rotation_classes import RotationClassesBuilder
+    from boxtree.translation_classes import TranslationClassesBuilder
+    from boxtree.traversal import FMMTraversalBuilder
+
+    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
+    trav, _ = tg(actx.queue, tree)
+
+    rb = RotationClassesBuilder(actx.context)
+    result, _ = rb(actx.queue, trav, tree)
+
+    tb = TranslationClassesBuilder(actx.context)
+    result_tb, _ = tb(actx.queue, trav, tree)
+
+    rot_classes = actx.to_numpy(
+            result.from_sep_siblings_rotation_classes)
+    rot_angles = actx.to_numpy(
+            result.from_sep_siblings_rotation_class_to_angle)
+
+    translation_classes = actx.to_numpy(
+            result_tb.from_sep_siblings_translation_classes)
+    distance_vectors = actx.to_numpy(
+        result_tb.from_sep_siblings_translation_class_to_distance_vector)
+
+    tree = tree.get(queue=actx.queue)
+    trav = trav.get(queue=actx.queue)
+
+    centers = tree.box_centers.T
+
+    # }}}
+
+    # For each entry of from_sep_siblings, compute the source-target translation
+    # direction as a vector, and check that the from_sep_siblings rotation class
+    # in the traversal corresponds to the angle with the z-axis of the
+    # translation direction.
+
+    for itgt_box, tgt_ibox in enumerate(trav.target_or_target_parent_boxes):
+        start, end = trav.from_sep_siblings_starts[itgt_box:itgt_box+2]
+        seps = trav.from_sep_siblings_lists[start:end]
+        level_rot_classes = rot_classes[start:end]
+
+        level_translation_classes = translation_classes[start:end]
+        actual_translation_vecs = distance_vectors[:, level_translation_classes].T
+        expected_translation_vecs = centers[tgt_ibox] - centers[seps]
+
+        if len(expected_translation_vecs) > 0:
+            assert np.allclose(actual_translation_vecs, expected_translation_vecs,
+                               atol=1e-13, rtol=1e-13)
+
+        theta = np.arctan2(
+                la.norm(expected_translation_vecs[:, :dims - 1], axis=1),
+                expected_translation_vecs[:, dims - 1])
+        level_rot_angles = rot_angles[level_rot_classes]
+
+        assert np.allclose(theta, level_rot_angles, atol=1e-13, rtol=1e-13)

 # }}}

@@ -335,7 +415,7 @@ if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
-        from py.test.cmdline import main
+        from pytest import main
        main([__file__])

 # vim: fdm=marker
--- a/test/test_tree.py
+++ b/test/test_tree.py
-from __future__ import division
-from __future__ import absolute_import
-from __future__ import print_function
-import six
-from six.moves import range
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -26,65 +20,68 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import logging
+import sys

 import numpy as np
-import sys
 import pytest
-import logging
+from arraycontext import pytest_generate_tests_for_array_contexts

-import pyopencl as cl
-from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+from boxtree.array_context import (
+    PytestPyOpenCLArrayContextFactory,
+    _acf,  # noqa: F401
+)
 from boxtree.tools import make_normal_particle_array

+
 logger = logging.getLogger(__name__)

+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+

 # {{{ bounding box test

 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("dims", [2, 3])
 @pytest.mark.parametrize("nparticles", [9, 4096, 10**5])
-def test_bounding_box(ctx_getter, dtype, dims, nparticles):
-    logging.basicConfig(level=logging.INFO)
+def test_bounding_box(actx_factory, dtype, dims, nparticles):
+    actx = actx_factory()

-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
-
-    from boxtree.tools import AXIS_NAMES
    from boxtree.bounding_box import BoundingBoxFinder
-
-    bbf = BoundingBoxFinder(ctx)
+    from boxtree.tools import AXIS_NAMES
+    bbf = BoundingBoxFinder(actx.context)

    axis_names = AXIS_NAMES[:dims]
+    logger.info("%s - %s %s", dtype, dims, nparticles)

-    logger.info("%s - %s %s" % (dtype, dims, nparticles))
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)

-    particles = make_normal_particle_array(queue, nparticles, dims, dtype)
+    bbox_min = [np.min(actx.to_numpy(x)) for x in particles]
+    bbox_max = [np.max(actx.to_numpy(x)) for x in particles]

-    bbox_min = [np.min(x.get()) for x in particles]
-    bbox_max = [np.max(x.get()) for x in particles]
-
-    bbox_cl, evt = bbf(particles, radii=None)
-    bbox_cl = bbox_cl.get()
+    bbox_cl, _evt = bbf(particles, radii=None)
+    bbox_cl = actx.to_numpy(bbox_cl)

    bbox_min_cl = np.empty(dims, dtype)
    bbox_max_cl = np.empty(dims, dtype)

    for i, ax in enumerate(axis_names):
-        bbox_min_cl[i] = bbox_cl["min_"+ax]
-        bbox_max_cl[i] = bbox_cl["max_"+ax]
+        bbox_min_cl[i] = bbox_cl[f"min_{ax}"]
+        bbox_max_cl[i] = bbox_cl[f"max_{ax}"]

-    assert (bbox_min == bbox_min_cl).all()
-    assert (bbox_max == bbox_max_cl).all()
+    assert np.all(bbox_min == bbox_min_cl)
+    assert np.all(bbox_max == bbox_max_cl)

 # }}}


 # {{{ test basic (no source/target distinction) tree build

-def run_build_test(builder, queue, dims, dtype, nparticles, do_plot,
-        max_particles_in_box=30, **kwargs):
+def run_build_test(builder, actx, dims, dtype, nparticles, visualize,
+        max_particles_in_box=None, max_leaf_refine_weight=None,
+        refine_weights=None, **kwargs):
    dtype = np.dtype(dtype)

    if dtype == np.float32:
@@ -92,41 +89,50 @@ def run_build_test(builder, queue, dims, dtype, nparticles, do_plot,
    elif dtype == np.float64:
        tol = 1e-12
    else:
-        raise RuntimeError("unsupported dtype: %s" % dtype)
+        raise RuntimeError(f"unsupported dtype: {dtype}")

-    if (dtype == np.float32
-            and dims == 2
-            and queue.device.platform.name == "Portable Computing Language"):
-        pytest.xfail("2D float doesn't work on POCL")
+    logger.info(75 * "-")

-    logger.info(75*"-")
-    logger.info("%dD %s - %d particles - max %d per box - %s" % (
+    if max_particles_in_box is not None:
+        logger.info("%dD %s - %d particles - max %d per box - %s",
            dims, dtype.type.__name__, nparticles, max_particles_in_box,
-            " - ".join("%s: %s" % (k, v) for k, v in six.iteritems(kwargs))))
-    logger.info(75*"-")
+            " - ".join(f"{k}: {v}" for k, v in kwargs.items()))
+    else:
+        logger.info("%dD %s - %d particles - max leaf weight %d  - %s",
+            dims, dtype.type.__name__, nparticles, max_leaf_refine_weight,
+            " - ".join(f"{k}: {v}" for k, v in kwargs.items()))
+
+    logger.info(75 * "-")

-    particles = make_normal_particle_array(queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)

-    if do_plot:
+    if visualize:
        import matplotlib.pyplot as pt
-        pt.plot(particles[0].get(), particles[1].get(), "x")
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")

-    queue.finish()
+    actx.queue.finish()

-    tree, _ = builder(queue, particles,
-            max_particles_in_box=max_particles_in_box, debug=True,
-            **kwargs)
-    tree = tree.get(queue=queue)
+    tree, _ = builder(actx.queue, particles,
+                      max_particles_in_box=max_particles_in_box,
+                      refine_weights=refine_weights,
+                      max_leaf_refine_weight=max_leaf_refine_weight,
+                      debug=True, **kwargs)
+    tree = tree.get(queue=actx.queue)

    sorted_particles = np.array(list(tree.sources))

-    unsorted_particles = np.array([pi.get() for pi in particles])
-    assert (sorted_particles
-            == unsorted_particles[:, tree.user_source_ids]).all()
+    unsorted_particles = np.array([actx.to_numpy(pi) for pi in particles])
+    assert np.all(sorted_particles
+            == unsorted_particles[:, tree.user_source_ids])
+
+    if refine_weights is not None:
+        refine_weights_reordered = (
+                actx.to_numpy(refine_weights)[tree.user_source_ids])

    all_good_so_far = True

-    if do_plot:
+    if visualize:
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
@@ -136,19 +142,36 @@ def run_build_test(builder, queue, dims, dtype, nparticles, do_plot,

    scaled_tol = tol*tree.root_extent
    for ibox in range(tree.nboxes):
-
        # Empty boxes exist in non-pruned trees--which themselves are undocumented.
        # These boxes will fail these tests.
-        if not (tree.box_flags[ibox] & bfe.HAS_OWN_SRCNTGTS):
+        if not (tree.box_flags[ibox] & bfe.IS_SOURCE_OR_TARGET_BOX):
            continue

        extent_low, extent_high = tree.get_box_extent(ibox)

-        assert (extent_low >= tree.bounding_box[0] - scaled_tol).all(), (
+        assert np.all(extent_low >= tree.bounding_box[0] - scaled_tol), (
                ibox, extent_low, tree.bounding_box[0])
-        assert (extent_high <= tree.bounding_box[1] + scaled_tol).all(), (
+        assert np.all(extent_high <= tree.bounding_box[1] + scaled_tol), (
                ibox, extent_high, tree.bounding_box[1])

+        center = tree.box_centers[:, ibox]
+
+        for _, bbox_min, bbox_max in [
+                (
+                    "source",
+                    tree.box_source_bounding_box_min[:, ibox],
+                    tree.box_source_bounding_box_max[:, ibox]),
+                (
+                    "target",
+                    tree.box_target_bounding_box_min[:, ibox],
+                    tree.box_target_bounding_box_max[:, ibox]),
+                ]:
+            assert np.all(extent_low - scaled_tol <= bbox_min)
+            assert np.all(bbox_min - scaled_tol <= center)
+
+            assert np.all(bbox_max - scaled_tol <= extent_high)
+            assert np.all(center - scaled_tol <= bbox_max)
+
        start = tree.box_source_starts[ibox]

        box_children = tree.box_child_ids[:, ibox]
@@ -162,12 +185,10 @@ def run_build_test(builder, queue, dims, dtype, nparticles, do_plot,
                start:start+tree.box_source_counts_cumul[ibox]]
        good = (
                (box_particles < extent_high[:, np.newaxis] + scaled_tol)
-                &
-                (extent_low[:, np.newaxis] - scaled_tol <= box_particles)
-                )
+                & (extent_low[:, np.newaxis] - scaled_tol <= box_particles))

-        all_good_here = good.all()
-        if do_plot and not all_good_here and all_good_so_far:
+        all_good_here = np.all(good)
+        if visualize and not all_good_here and all_good_so_far:
            pt.plot(
                    box_particles[0, np.where(~good)[1]],
                    box_particles[1, np.where(~good)[1]], "ro")
@@ -177,9 +198,26 @@ def run_build_test(builder, queue, dims, dtype, nparticles, do_plot,
        if not all_good_here:
            print("BAD BOX", ibox)

+        if not (tree.box_flags[ibox] & bfe.HAS_SOURCE_OR_TARGET_CHILD_BOXES):
+            # Check that leaf particle density is as promised.
+            nparticles_in_box = tree.box_source_counts_cumul[ibox]
+            if max_particles_in_box is not None:
+                if nparticles_in_box > max_particles_in_box:
+                    print("too many particles "
+                          f"({nparticles_in_box} > {max_particles_in_box}); box {ibox}")
+                    all_good_here = False
+            else:
+                assert refine_weights is not None
+                box_weight = np.sum(
+                    refine_weights_reordered[start:start+nparticles_in_box])  # pylint: disable=possibly-used-before-assignment
+                if box_weight > max_leaf_refine_weight:
+                    print("refine weight exceeded "
+                          f"({box_weight} > {max_leaf_refine_weight}); box {ibox}")
+                    all_good_here = False
+
        all_good_so_far = all_good_so_far and all_good_here

-    if do_plot:
+    if visualize:
        pt.gca().set_aspect("equal", "datalim")
        pt.show()

@@ -191,97 +229,107 @@ def particle_tree_test_decorator(f):
    f = pytest.mark.parametrize("dtype", [np.float64, np.float32])(f)
    f = pytest.mark.parametrize("dims", [2, 3])(f)

-    def wrapper(*args, **kwargs):
-        logging.basicConfig(level=logging.INFO)
-        f(*args, **kwargs)
-
    return f


 @particle_tree_test_decorator
-def test_single_boxparticle_tree(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims,
-            dtype, 4, do_plot=do_plot)
+    run_build_test(builder, actx, dims,
+            dtype, 4, max_particles_in_box=30, visualize=visualize)


 @particle_tree_test_decorator
-def test_two_level_particle_tree(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims,
-            dtype, 50, do_plot=do_plot)
+    run_build_test(builder, actx, dims,
+            dtype, 50, max_particles_in_box=30, visualize=visualize)


 @particle_tree_test_decorator
-def test_unpruned_particle_tree(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_unpruned_particle_tree(actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

    # test unpruned tree build
-    run_build_test(builder, queue, dims, dtype, 10**5,
-            do_plot=do_plot, skip_prune=True)
+    run_build_test(builder, actx, dims, dtype, 10**5,
+            visualize=visualize, max_particles_in_box=30, skip_prune=True)


 @particle_tree_test_decorator
-def test_particle_tree_with_reallocations(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_particle_tree_with_reallocations(
+        actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims, dtype, 10**5,
-            do_plot=do_plot, nboxes_guess=5)
+    run_build_test(builder, actx, dims, dtype, 10**5,
+            max_particles_in_box=30, visualize=visualize, nboxes_guess=5)


 @particle_tree_test_decorator
 def test_particle_tree_with_many_empty_leaves(
-        ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+        actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()
+
+    from boxtree import TreeBuilder
+    builder = TreeBuilder(actx.context)
+
+    run_build_test(builder, actx, dims, dtype, 10**5,
+            max_particles_in_box=5, visualize=visualize)
+
+
+@particle_tree_test_decorator
+def test_vanilla_particle_tree(actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims, dtype, 10**5,
-            do_plot=do_plot, max_particles_in_box=5)
+    run_build_test(builder, actx, dims, dtype, 10**5,
+            max_particles_in_box=30, visualize=visualize)


 @particle_tree_test_decorator
-def test_vanilla_particle_tree(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_explicit_refine_weights_particle_tree(
+        actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims, dtype, 10**5,
-            do_plot=do_plot)
+    nparticles = 10**5
+
+    rng = np.random.default_rng(10)
+    refine_weights = actx.from_numpy(
+            rng.integers(1, 10, (nparticles,), dtype=np.int32)
+            )
+
+    run_build_test(builder, actx, dims, dtype, nparticles,
+            refine_weights=refine_weights, max_leaf_refine_weight=100,
+            visualize=visualize)


 @particle_tree_test_decorator
-def test_non_adaptive_particle_tree(ctx_getter, dtype, dims, do_plot=False):
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_non_adaptive_particle_tree(actx_factory, dtype, dims, visualize=False):
+    actx = actx_factory()

    from boxtree import TreeBuilder
-    builder = TreeBuilder(ctx)
+    builder = TreeBuilder(actx.context)

-    run_build_test(builder, queue, dims, dtype, 10**4,
-            do_plot=do_plot, non_adaptive=True)
+    run_build_test(builder, actx, dims, dtype, 10**4,
+            max_particles_in_box=30, visualize=visualize, kind="non-adaptive")

 # }}}

@@ -290,63 +338,63 @@ def test_non_adaptive_particle_tree(ctx_getter, dtype, dims, do_plot=False):

 @pytest.mark.opencl
 @pytest.mark.parametrize("dims", [2, 3])
-def test_source_target_tree(ctx_getter, dims, do_plot=False):
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_source_target_tree(actx_factory, dims, visualize=False):
+    actx = actx_factory()

    nsources = 2 * 10**5
    ntargets = 3 * 10**5
    dtype = np.float64

-    sources = make_normal_particle_array(queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
            seed=12)
-    targets = make_normal_particle_array(queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
            seed=19)

-    if do_plot:
+    if visualize:
        import matplotlib.pyplot as pt
-        pt.plot(sources[0].get(), sources[1].get(), "rx")
-        pt.plot(targets[0].get(), targets[1].get(), "g+")
+        np_sources, np_targets = actx.to_numpy(sources), actx.to_numpy(targets)
+        pt.plot(np_sources[0], np_sources[1], "rx")
+        pt.plot(np_targets[0], np_targets[1], "g+")
        pt.show()

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    queue.finish()
-    tree, _ = tb(queue, sources, targets=targets,
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, sources, targets=targets,
            max_particles_in_box=10, debug=True)
-    tree = tree.get(queue=queue)
+    tree = tree.get(queue=actx.queue)

    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))

-    unsorted_sources = np.array([pi.get() for pi in sources])
-    unsorted_targets = np.array([pi.get() for pi in targets])
-    assert (sorted_sources
-            == unsorted_sources[:, tree.user_source_ids]).all()
+    unsorted_sources = np.array([actx.to_numpy(pi) for pi in sources])
+    unsorted_targets = np.array([actx.to_numpy(pi) for pi in targets])
+    assert np.all(sorted_sources
+            == unsorted_sources[:, tree.user_source_ids])

    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets, dtype=np.intp)
-    assert (sorted_targets
-            == unsorted_targets[:, user_target_ids]).all()
+    assert np.all(sorted_targets
+            == unsorted_targets[:, user_target_ids])

    all_good_so_far = True

-    if do_plot:
+    if visualize:
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
        plotter.set_bounding_box()

+    tol = 1e-15
+
    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

-        assert (extent_low >=
-                tree.bounding_box[0] - 1e-12*tree.root_extent).all(), ibox
-        assert (extent_high <=
-                tree.bounding_box[1] + 1e-12*tree.root_extent).all(), ibox
+        assert np.all(extent_low
+                >= tree.bounding_box[0] - 1e-12*tree.root_extent), ibox
+        assert np.all(extent_high
+                <= tree.bounding_box[1] + 1e-12*tree.root_extent), ibox

        src_start = tree.box_source_starts[ibox]
        tgt_start = tree.box_target_starts[ibox]
@@ -367,14 +415,14 @@ def test_source_target_tree(ctx_getter, dims, do_plot=False):
                ("targets", sorted_targets[:,
                    tgt_start:tgt_start+tree.box_target_counts_cumul[ibox]]),
                ]:
-            good = (
-                    (particles < extent_high[:, np.newaxis])
-                    &
-                    (extent_low[:, np.newaxis] <= particles)
-                    ).all(axis=0)
-
-            all_good_here = good.all()
-            if do_plot and not all_good_here:
+            good = np.all(
+                    (particles < extent_high[:, np.newaxis] + tol)
+                    & (extent_low[:, np.newaxis] - tol <= particles),
+                    axis=0)
+
+            all_good_here = np.all(good)
+
+            if visualize and not all_good_here:
                pt.plot(
                        particles[0, np.where(~good)[0]],
                        particles[1, np.where(~good)[0]], "ro")
@@ -382,13 +430,14 @@ def test_source_target_tree(ctx_getter, dims, do_plot=False):
                plotter.draw_box(ibox, edgecolor="red")
                pt.show()

-        if not all_good_here:
-            print("BAD BOX %s %d" % (what, ibox))
+            if not all_good_here:
+                print(f"BAD BOX {what} {ibox}")
+
+            all_good_so_far = all_good_so_far and all_good_here

-        all_good_so_far = all_good_so_far and all_good_here
        assert all_good_so_far

-    if do_plot:
+    if visualize:
        pt.gca().set_aspect("equal", "datalim")
        pt.show()

@@ -399,54 +448,84 @@ def test_source_target_tree(ctx_getter, dims, do_plot=False):

 @pytest.mark.opencl
 @pytest.mark.parametrize("dims", [2, 3])
-def test_extent_tree(ctx_getter, dims, do_plot=False):
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+@pytest.mark.parametrize("extent_norm", ["linf", "l2"])
+def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
+    actx = actx_factory()

    nsources = 100000
    ntargets = 200000
    dtype = np.float64
    npoint_sources_per_source = 16

-    sources = make_normal_particle_array(queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
            seed=12)
-    targets = make_normal_particle_array(queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
            seed=19)

-    from pyopencl.clrandom import RanluxGenerator
-    rng = RanluxGenerator(queue, seed=13)
-    source_radii = 2**rng.uniform(queue, nsources, dtype=dtype,
-            a=-10, b=0)
-    target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype,
-            a=-10, b=0)
+    refine_weights = actx.np.zeros(nsources + ntargets, np.int32)
+    refine_weights[:nsources] = 1
+
+    rng = np.random.default_rng(13)
+    source_radii = actx.from_numpy(
+            2**rng.uniform(-10, 0, (nsources,)).astype(dtype)
+            )
+    target_radii = actx.from_numpy(
+            2**rng.uniform(-10, 0, (ntargets,)).astype(dtype)
+            )

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    queue.finish()
-    dev_tree, _ = tb(queue, sources, targets=targets,
-            source_radii=source_radii, target_radii=target_radii,
-            max_particles_in_box=10, debug=True)
+    actx.queue.finish()
+    dev_tree, _ = tb(actx.queue, sources, targets=targets,
+            source_radii=source_radii,
+            target_radii=target_radii,
+            extent_norm=extent_norm,
+
+            refine_weights=refine_weights,
+            max_leaf_refine_weight=20,
+
+            # max_particles_in_box=10,
+
+            # Set artificially small, to exercise the reallocation code.
+            nboxes_guess=10,
+
+            debug=True,
+            stick_out_factor=0)

    logger.info("transfer tree, check orderings")

-    tree = dev_tree.get(queue=queue)
+    tree = dev_tree.get(queue=actx.queue)
+
+    if visualize:
+        import matplotlib.pyplot as pt
+        np_sources, np_targets = actx.to_numpy(sources), actx.to_numpy(targets)
+        pt.plot(np_sources[0], np_sources[1], "rx")
+        pt.plot(np_targets[0], np_targets[1], "g+")
+
+        from boxtree.visualization import TreePlotter
+        plotter = TreePlotter(tree)
+        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
+        plotter.draw_box_numbers()
+        plotter.set_bounding_box()
+
+        pt.gca().set_aspect("equal", "datalim")
+        pt.show()

    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))
    sorted_source_radii = tree.source_radii
    sorted_target_radii = tree.target_radii

-    unsorted_sources = np.array([pi.get() for pi in sources])
-    unsorted_targets = np.array([pi.get() for pi in targets])
-    unsorted_source_radii = source_radii.get()
-    unsorted_target_radii = target_radii.get()
-    assert (sorted_sources
-            == unsorted_sources[:, tree.user_source_ids]).all()
-    assert (sorted_source_radii
-            == unsorted_source_radii[tree.user_source_ids]).all()
+    unsorted_sources = np.array([actx.to_numpy(pi) for pi in sources])
+    unsorted_targets = np.array([actx.to_numpy(pi) for pi in targets])
+    unsorted_source_radii = actx.to_numpy(source_radii)
+    unsorted_target_radii = actx.to_numpy(target_radii)
+
+    assert np.all(sorted_sources
+            == unsorted_sources[:, tree.user_source_ids])
+    assert np.all(sorted_source_radii
+            == unsorted_source_radii[tree.user_source_ids])

    # {{{ test box structure, stick-out criterion

@@ -455,25 +534,36 @@ def test_extent_tree(ctx_getter, dims, do_plot=False):
    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets, dtype=np.intp)
    if ntargets:
-        assert (sorted_targets
-                == unsorted_targets[:, user_target_ids]).all()
-        assert (sorted_target_radii
-                == unsorted_target_radii[user_target_ids]).all()
+        assert np.all(sorted_targets
+                == unsorted_targets[:, user_target_ids])
+        assert np.all(sorted_target_radii
+                == unsorted_target_radii[user_target_ids])

    all_good_so_far = True

    # {{{ check sources, targets

+    assert np.sum(tree.box_source_counts_nonchild) == nsources
+    assert np.sum(tree.box_target_counts_nonchild) == ntargets
+
    for ibox in range(tree.nboxes):
-        extent_low, extent_high = tree.get_box_extent(ibox)
+        kid_sum = sum(
+                    tree.box_target_counts_cumul[ichild_box]
+                    for ichild_box in tree.box_child_ids[:, ibox]
+                    if ichild_box != 0)
+        assert (
+                tree.box_target_counts_cumul[ibox]
+                == (
+                    tree.box_target_counts_nonchild[ibox]
+                    + kid_sum)), ibox

-        box_radius = np.max(extent_high-extent_low) * 0.5
-        stick_out_dist = tree.stick_out_factor * box_radius
+    for ibox in range(tree.nboxes):
+        extent_low, extent_high = tree.get_box_extent(ibox)

-        assert (extent_low >=
-                tree.bounding_box[0] - 1e-12*tree.root_extent).all(), ibox
-        assert (extent_high <=
-                tree.bounding_box[1] + 1e-12*tree.root_extent).all(), ibox
+        assert np.all(extent_low
+                >= tree.bounding_box[0] - 1e-12*tree.root_extent), ibox
+        assert np.all(extent_high
+                <= tree.bounding_box[1] + 1e-12*tree.root_extent), ibox

        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]
@@ -485,6 +575,19 @@ def test_extent_tree(ctx_getter, dims, do_plot=False):
                + np.sum(tree.box_target_counts_cumul[existing_children])
                == tree.box_target_counts_cumul[ibox])

+    del existing_children
+    del box_children
+
+    for ibox in range(tree.nboxes):
+        lev = int(tree.box_levels[ibox])
+        box_radius = 0.5 * tree.root_extent / (1 << lev)
+        box_center = tree.box_centers[:, ibox]
+        extent_low = box_center - box_radius
+        extent_high = box_center + box_radius
+
+        stick_out_dist = tree.stick_out_factor * box_radius
+        radius_with_stickout = (1 + tree.stick_out_factor) * box_radius
+
        for what, starts, counts, points, radii in [
                ("source", tree.box_source_starts, tree.box_source_counts_cumul,
                    sorted_sources, sorted_source_radii),
@@ -496,18 +599,32 @@ def test_extent_tree(ctx_getter, dims, do_plot=False):
            check_particles = points[:, bslice]
            check_radii = radii[bslice]

-            good = (
-                    (check_particles + check_radii
-                        < extent_high[:, np.newaxis] + stick_out_dist)
-                    &
-                    (extent_low[:, np.newaxis] - stick_out_dist
-                        <= check_particles - check_radii)
-                    ).all(axis=0)
+            if extent_norm == "linf":
+                good = np.all(
+                        (check_particles + check_radii
+                            < extent_high[:, np.newaxis] + stick_out_dist)
+                        &
+                        (extent_low[:, np.newaxis] - stick_out_dist
+                            <= check_particles - check_radii),
+                        axis=0)
+
+            elif extent_norm == "l2":
+                center_dists = np.sqrt(
+                        np.sum(
+                            (check_particles - box_center.reshape(-1, 1))**2,
+                            axis=0))

-            all_good_here = good.all()
+                good = (
+                        (center_dists + check_radii)**2
+                        < dims * radius_with_stickout**2)
+
+            else:
+                raise ValueError(f"unexpected value of extent_norm: '{extent_norm}'")
+
+            all_good_here = np.all(good)

            if not all_good_here:
-                print("BAD BOX %s %d level %d" % (what, ibox, tree.box_levels[ibox]))
+                print(f"BAD BOX {what} {ibox} level {tree.box_levels[ibox]}")

            all_good_so_far = all_good_so_far and all_good_here
            assert all_good_here
@@ -522,24 +639,25 @@ def test_extent_tree(ctx_getter, dims, do_plot=False):

    logger.info("creating point sources")

-    np.random.seed(20)
-
    from pytools.obj_array import make_obj_array
    point_sources = make_obj_array([
-            cl.array.to_device(queue,
+            actx.from_numpy(
                unsorted_sources[i][:, np.newaxis]
                + unsorted_source_radii[:, np.newaxis]
-                * np.random.uniform(
-                    -1, 1, size=(nsources, npoint_sources_per_source))
-                 )
+                * rng.uniform(-1, 1, size=(nsources, npoint_sources_per_source))
+                )
            for i in range(dims)])

-    point_source_starts = cl.array.arange(queue,
-            0, (nsources+1)*npoint_sources_per_source, npoint_sources_per_source,
-            dtype=tree.particle_id_dtype)
+    point_source_starts = actx.from_numpy(
+            np.arange(
+                0,
+                (nsources + 1) * npoint_sources_per_source,
+                npoint_sources_per_source,
+                dtype=tree.particle_id_dtype)
+            )

    from boxtree.tree import link_point_sources
-    dev_tree = link_point_sources(queue, dev_tree,
+    dev_tree = link_point_sources(actx.queue, dev_tree,
            point_source_starts, point_sources,
            debug=True)

@@ -548,52 +666,52 @@ def test_extent_tree(ctx_getter, dims, do_plot=False):
 # }}}


-# {{{ geometry query test
+# {{{ leaves to balls query test

 @pytest.mark.opencl
 @pytest.mark.geo_lookup
 @pytest.mark.parametrize("dims", [2, 3])
-def test_geometry_query(ctx_getter, dims, do_plot=False):
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+def test_leaves_to_balls_query(actx_factory, dims, visualize=False):
+    actx = actx_factory()

    nparticles = 10**5
    dtype = np.float64

-    particles = make_normal_particle_array(queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)

-    if do_plot:
+    if visualize:
        import matplotlib.pyplot as pt
-        pt.plot(particles[0].get(), particles[1].get(), "x")
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    queue.finish()
-    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)

    nballs = 10**4
-    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
-    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)
+    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_radii = 0.1 + actx.np.zeros(nballs, dtype)

-    from boxtree.geo_lookup import LeavesToBallsLookupBuilder
-    lblb = LeavesToBallsLookupBuilder(ctx)
+    from boxtree.area_query import LeavesToBallsLookupBuilder
+    lblb = LeavesToBallsLookupBuilder(actx.context)

-    lbl, _ = lblb(queue, tree, ball_centers, ball_radii)
+    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)

    # get data to host for test
-    tree = tree.get(queue=queue)
-    lbl = lbl.get(queue=queue)
-    ball_centers = np.array([x.get() for x in ball_centers]).T
-    ball_radii = ball_radii.get()
+    tree = tree.get(queue=actx.queue)
+    lbl = lbl.get(queue=actx.queue)
+    ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
+    ball_radii = actx.to_numpy(ball_radii)
+
+    assert len(lbl.balls_near_box_starts) == tree.nboxes + 1

    from boxtree import box_flags_enum

    for ibox in range(tree.nboxes):
        # We only want leaves here.
-        if tree.box_flags[ibox] & box_flags_enum.HAS_CHILDREN:
+        if tree.box_flags[ibox] & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES:
            continue

        box_center = tree.box_centers[:, ibox]
@@ -604,8 +722,6 @@ def test_geometry_query(ctx_getter, dims, do_plot=False):
        near_circles, = np.where(linf_circle_dists - ball_radii < box_rad)

        start, end = lbl.balls_near_box_starts[ibox:ibox+2]
-        #print sorted(lbl.balls_near_box_lists[start:end])
-        #print sorted(near_circles)
        assert sorted(lbl.balls_near_box_lists[start:end]) == sorted(near_circles)

 # }}}
@@ -613,70 +729,402 @@ def test_geometry_query(ctx_getter, dims, do_plot=False):

 # {{{ area query test

+def run_area_query_test(actx, tree, ball_centers, ball_radii):
+    """
+    Performs an area query and checks that the result is as expected.
+    """
+    from boxtree.area_query import AreaQueryBuilder
+    aqb = AreaQueryBuilder(actx.context)
+
+    area_query, _ = aqb(actx.queue, tree, ball_centers, ball_radii)
+
+    # Get data to host for test.
+    tree = tree.get(queue=actx.queue)
+    area_query = area_query.get(queue=actx.queue)
+    ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
+    ball_radii = actx.to_numpy(ball_radii)
+
+    from boxtree import box_flags_enum
+    leaf_boxes, = (
+            tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0
+            ).nonzero()
+
+    leaf_box_radii = np.empty(len(leaf_boxes))
+    dims = len(tree.sources)
+    leaf_box_centers = np.empty((len(leaf_boxes), dims))
+
+    for idx, leaf_box in enumerate(leaf_boxes):
+        box_center = tree.box_centers[:, leaf_box]
+        ext_l, ext_h = tree.get_box_extent(leaf_box)
+        leaf_box_radii[idx] = np.max(ext_h - ext_l) * 0.5
+        leaf_box_centers[idx] = box_center
+
+    for ball_nr, (ball_center, ball_radius) \
+            in enumerate(zip(ball_centers, ball_radii, strict=True)):
+        linf_box_dists = np.max(np.abs(ball_center - leaf_box_centers), axis=-1)
+        near_leaves_indices, \
+            = np.where(linf_box_dists < ball_radius + leaf_box_radii)
+        near_leaves = leaf_boxes[near_leaves_indices]
+
+        start, end = area_query.leaves_near_ball_starts[ball_nr:ball_nr+2]
+        found = area_query.leaves_near_ball_lists[start:end]
+        actual = near_leaves
+        assert set(found) == set(actual), (found, actual)
+
+
 @pytest.mark.opencl
 @pytest.mark.area_query
 @pytest.mark.parametrize("dims", [2, 3])
-def test_area_query(ctx_getter, dims, do_plot=False):
-    logging.basicConfig(level=logging.INFO)
+def test_area_query(actx_factory, dims, visualize=False):
+    actx = actx_factory()
+
+    nparticles = 10**5
+    dtype = np.float64
+
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)

-    ctx = ctx_getter()
-    queue = cl.CommandQueue(ctx)
+    if visualize:
+        import matplotlib.pyplot as pt
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+
+    nballs = 10**4
+    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
+
+    run_area_query_test(actx, tree, ball_centers, ball_radii)
+
+
+@pytest.mark.opencl
+@pytest.mark.area_query
+@pytest.mark.parametrize("dims", [2, 3])
+def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False):
+    """
+    The input to the area query includes balls whose centers are not within
+    the tree bounding box.
+    """
+    actx = actx_factory()
+
+    nparticles = 10**4
+    dtype = np.float64
+
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+
+    if visualize:
+        import matplotlib.pyplot as pt
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+
+    nballs = 10**4
+    bbox_min = tree.bounding_box[0].min()
+    bbox_max = tree.bounding_box[1].max()
+
+    from pytools.obj_array import make_obj_array
+    rng = np.random.default_rng(13)
+    ball_centers = make_obj_array([
+        actx.from_numpy(
+            rng.uniform(bbox_min - 1, bbox_max + 1, nballs).astype(dtype))
+        for i in range(dims)])
+    ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
+
+    run_area_query_test(actx, tree, ball_centers, ball_radii)
+
+
+@pytest.mark.opencl
+@pytest.mark.area_query
+@pytest.mark.parametrize("dims", [2, 3])
+def test_area_query_elwise(actx_factory, dims, visualize=False):
+    actx = actx_factory()

    nparticles = 10**5
    dtype = np.float64

-    particles = make_normal_particle_array(queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)

-    if do_plot:
+    if visualize:
        import matplotlib.pyplot as pt
-        pt.plot(particles[0].get(), particles[1].get(), "x")
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")

    from boxtree import TreeBuilder
-    tb = TreeBuilder(ctx)
+    tb = TreeBuilder(actx.context)

-    queue.finish()
-    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)

    nballs = 10**4
-    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
-    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)
+    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
+
+    from boxtree.area_query import AreaQueryElementwiseTemplate, PeerListFinder
+
+    template = AreaQueryElementwiseTemplate(
+        extra_args="""
+            coord_t *ball_radii,
+            %for ax in AXIS_NAMES[:dimensions]:
+                coord_t *ball_${ax},
+            %endfor
+        """,
+        ball_center_and_radius_expr="""
+            %for ax in AXIS_NAMES[:dimensions]:
+                ${ball_center}.${ax} = ball_${ax}[${i}];
+            %endfor
+            ${ball_radius} = ball_radii[${i}];
+        """,
+        leaf_found_op="")
+
+    peer_lists, evt = PeerListFinder(actx.context)(actx.queue, tree)
+
+    kernel = template.generate(
+        actx.context,
+        dims,
+        tree.coord_dtype,
+        tree.box_id_dtype,
+        peer_lists.peer_list_starts.dtype,
+        tree.nlevels)
+
+    evt = kernel(
+        *template.unwrap_args(
+            tree, peer_lists, ball_radii, *ball_centers),
+        queue=actx.queue,
+        wait_for=[evt],
+        range=slice(len(ball_radii)))

-    from boxtree.area_query import AreaQueryBuilder
-    aqb = AreaQueryBuilder(ctx)
+# }}}
+
+
+# {{{ level restriction test

-    area_query, _ = aqb(queue, tree, ball_centers, ball_radii)
+@pytest.mark.opencl
+@pytest.mark.parametrize("lookbehind", [0, 1])
+@pytest.mark.parametrize("skip_prune", [True, False])
+@pytest.mark.parametrize("dims", [2, 3])
+def test_level_restriction(
+        actx_factory, dims, skip_prune, lookbehind, visualize=False):
+    actx = actx_factory()
+
+    nparticles = 10**5
+    dtype = np.float64
+
+    from boxtree.tools import make_surface_particle_array
+    particles = make_surface_particle_array(
+            actx.queue, nparticles, dims, dtype, seed=15)
+
+    if visualize:
+        import matplotlib.pyplot as pt
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    actx.queue.finish()
+    tree_dev, _ = tb(actx.queue, particles,
+            kind="adaptive-level-restricted",
+            max_particles_in_box=30, debug=True,
+            skip_prune=skip_prune, lr_lookbehind=lookbehind,
+
+            # Artificially low to exercise reallocation code
+            nboxes_guess=10)
+
+    def find_neighbors(leaf_box_centers, leaf_box_radii):
+        # We use an area query with a ball that is slightly larger than
+        # the size of a leaf box to find the neighboring leaves.
+        #
+        # Note that since this comes from an area query, the self box will be
+        # included in the neighbor list.
+        from boxtree.area_query import AreaQueryBuilder
+        aqb = AreaQueryBuilder(actx.context)
+
+        ball_radii = actx.from_numpy(np.min(leaf_box_radii) / 2 + leaf_box_radii)
+        leaf_box_centers = [actx.from_numpy(axis) for axis in leaf_box_centers]
+
+        area_query, _ = aqb(actx.queue, tree_dev, leaf_box_centers, ball_radii)
+        area_query = area_query.get(queue=actx.queue)
+        return (area_query.leaves_near_ball_starts,
+                area_query.leaves_near_ball_lists)

    # Get data to host for test.
-    tree = tree.get(queue=queue)
-    area_query = area_query.get(queue=queue)
-    ball_centers = np.array([x.get() for x in ball_centers]).T
-    ball_radii = ball_radii.get()
+    tree = tree_dev.get(queue=actx.queue)

+    # Find leaf boxes.
    from boxtree import box_flags_enum
-
-    leaf_boxes = np.array([ibox for ibox in range(tree.nboxes)
-        if tree.box_flags[ibox] & ~box_flags_enum.HAS_CHILDREN])
+    leaf_boxes, = (
+            tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0
+            ).nonzero()

    leaf_box_radii = np.empty(len(leaf_boxes))
-    leaf_box_centers = np.empty((len(leaf_boxes), dims))
+    leaf_box_centers = np.empty((dims, len(leaf_boxes)))

    for idx, leaf_box in enumerate(leaf_boxes):
        box_center = tree.box_centers[:, leaf_box]
        ext_l, ext_h = tree.get_box_extent(leaf_box)
        leaf_box_radii[idx] = np.max(ext_h - ext_l) * 0.5
-        leaf_box_centers[idx] = box_center
+        leaf_box_centers[:, idx] = box_center

-    for ball_nr, (ball_center, ball_radius) \
-            in enumerate(zip(ball_centers, ball_radii)):
-        linf_box_dists = np.max(np.abs(ball_center - leaf_box_centers), axis=-1)
-        near_leaves_indices, \
-            = np.where(linf_box_dists < ball_radius + leaf_box_radii)
-        near_leaves = leaf_boxes[near_leaves_indices]
+    neighbor_starts, neighbor_and_self_lists = find_neighbors(
+        leaf_box_centers, leaf_box_radii)

-        start, end = area_query.leaves_near_ball_starts[ball_nr:ball_nr+2]
-        found = area_query.leaves_near_ball_lists[start:end]
-        actual = near_leaves
-        assert sorted(found) == sorted(actual)
+    # Check level restriction.
+    for leaf_idx, leaf in enumerate(leaf_boxes):
+        neighbors = neighbor_and_self_lists[
+            neighbor_starts[leaf_idx]:neighbor_starts[leaf_idx+1]]
+        neighbor_levels = np.array(tree.box_levels[neighbors], dtype=int)
+        leaf_level = int(tree.box_levels[leaf])
+        assert np.all(np.abs(neighbor_levels - leaf_level) <= 1), \
+                (neighbor_levels, leaf_level)
+
+# }}}
+
+
+# {{{ space invader query test
+
+@pytest.mark.opencl
+@pytest.mark.geo_lookup
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("dims", [2, 3])
+def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
+    actx = actx_factory()
+
+    dtype = np.dtype(dtype)
+    nparticles = 10**5
+
+    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+
+    if visualize:
+        import matplotlib.pyplot as pt
+        np_particles = actx.to_numpy(particles)
+        pt.plot(np_particles[0], np_particles[1], "x")
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    actx.queue.finish()
+    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+
+    nballs = 10**4
+    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
+
+    from boxtree.area_query import LeavesToBallsLookupBuilder, SpaceInvaderQueryBuilder
+
+    siqb = SpaceInvaderQueryBuilder(actx.context)
+    # We can use leaves-to-balls lookup to get the set of overlapping balls for
+    # each box, and from there to compute the outer space invader distance.
+    lblb = LeavesToBallsLookupBuilder(actx.context)
+
+    siq, _ = siqb(actx.queue, tree, ball_centers, ball_radii)
+    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)
+
+    # get data to host for test
+    tree = tree.get(queue=actx.queue)
+    siq = siq.get(queue=actx.queue)
+    lbl = lbl.get(queue=actx.queue)
+
+    ball_centers = np.array([actx.to_numpy(x) for x in ball_centers])
+    ball_radii = actx.to_numpy(ball_radii)
+
+    # Find leaf boxes.
+    from boxtree import box_flags_enum
+
+    outer_space_invader_dist = np.zeros(tree.nboxes)
+
+    for ibox in range(tree.nboxes):
+        # We only want leaves here.
+        if tree.box_flags[ibox] & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES:
+            continue
+
+        start, end = lbl.balls_near_box_starts[ibox:ibox + 2]
+        space_invaders = lbl.balls_near_box_lists[start:end]
+        if len(space_invaders) > 0:
+            outer_space_invader_dist[ibox] = np.max(np.abs(
+                    tree.box_centers[:, ibox].reshape((-1, 1))
+                    - ball_centers[:, space_invaders]))
+
+    assert np.allclose(siq, outer_space_invader_dist)
+
+# }}}
+
+
+# {{{ test_same_tree_with_zero_weight_particles
+
+@pytest.mark.opencl
+@pytest.mark.parametrize("dims", [2, 3])
+def test_same_tree_with_zero_weight_particles(actx_factory, dims):
+    actx = actx_factory()
+
+    ntargets_values = [300, 400, 500]
+    stick_out_factors = [0, 0.1, 0.3, 1]
+    nsources = 20
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    trees = []
+
+    rng = np.random.default_rng(10)
+    for stick_out_factor in stick_out_factors:
+        for ntargets in [40]:
+            sources = rng.random((dims, nsources))**2
+            sources[:, 0] = -0.1
+            sources[:, 1] = 1.1
+
+            targets = rng.random((dims, max(ntargets_values)))[:, :ntargets].copy()
+            target_radii = rng.random(max(ntargets_values))[:ntargets]
+
+            sources = actx.from_numpy(sources)
+            targets = actx.from_numpy(targets)
+
+            refine_weights = actx.np.zeros(nsources + ntargets, np.int32)
+            refine_weights[:nsources] = 1
+            refine_weights[nsources:] = 0
+
+            tree, _ = tb(actx.queue, sources, targets=targets,
+                    target_radii=target_radii,
+                    stick_out_factor=stick_out_factor,
+                    max_leaf_refine_weight=10,
+                    refine_weights=refine_weights,
+                    debug=True)
+            tree = tree.get(queue=actx.queue)
+            trees.append(tree)
+
+            print("TREE:", tree.nboxes)
+
+    if 0:
+        import matplotlib.pyplot as plt
+        for tree in trees:
+            plt.figure()
+            tree.plot()
+
+        plt.show()
+
+# }}}
+
+
+# {{{ test_max_levels_error
+
+def test_max_levels_error(actx_factory):
+    actx = actx_factory()
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(actx.context)
+
+    sources = [actx.np.zeros(11, np.float64) for i in range(2)]
+    from boxtree.tree_build import MaxLevelsExceeded
+    with pytest.raises(MaxLevelsExceeded):
+        _tree, _ = tb(actx.queue, sources, max_particles_in_box=10, debug=True)

 # }}}

@@ -688,7 +1136,7 @@ if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
-        from py.test.cmdline import main
+        from pytest import main
        main([__file__])

 # vim: fdm=marker
--- a/test/test_tree_of_boxes.py
+++ b/test/test_tree_of_boxes.py
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner, Xiaoyu Wei"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import sys
+
+import numpy as np
+import pytest
+from arraycontext import pytest_generate_tests_for_array_contexts
+
+# This means boxtree's tests have a hard dependency on meshmode. That's OK.
+from meshmode import _acf  # noqa: F401
+from meshmode.array_context import PytestPyOpenCLArrayContextFactory
+
+from boxtree import (
+    make_meshmode_mesh_from_leaves,
+    make_tree_of_boxes_root,
+    uniformly_refine_tree_of_boxes,
+)
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+
+# {{{ make_global_leaf_quadrature
+
+def make_global_leaf_quadrature(actx, tob, order):
+    from meshmode.discretization.poly_element import (
+        GaussLegendreTensorProductGroupFactory,
+    )
+    group_factory = GaussLegendreTensorProductGroupFactory(order=order)
+
+    mesh, _ = make_meshmode_mesh_from_leaves(tob)
+
+    if 0:
+        import matplotlib.pyplot as plt
+
+        from meshmode.mesh import visualization as mvis
+        mvis.draw_2d_mesh(mesh,
+                          set_bounding_box=True,
+                          draw_vertex_numbers=False,
+                          draw_element_numbers=False)
+        plt.plot(tob.box_centers[0][tob.leaf_boxes],
+                 tob.box_centers[1][tob.leaf_boxes], "rx")
+        plt.plot(mesh.vertices[0], mesh.vertices[1], "ro")
+        plt.show()
+
+    from meshmode.discretization import Discretization
+    discr = Discretization(actx, mesh, group_factory)
+
+    lflevels = tob.box_levels[tob.leaf_boxes]
+    lfmeasures = (tob.root_extent / (2**lflevels))**tob.dimensions
+
+    from arraycontext import flatten
+    weights = flatten(actx.thaw(discr.quad_weights()), actx)
+    jacobians = actx.from_numpy(
+        np.repeat(lfmeasures/(2**tob.dimensions), discr.groups[0].nunit_dofs)
+        )
+    q = weights * jacobians
+
+    from pytools.obj_array import make_obj_array
+    nodes = discr.nodes()
+    x = make_obj_array([flatten(actx.thaw(axis), actx) for axis in nodes])
+
+    return x, q
+
+# }}}
+
+
+# {{{ test_uniform_tree_of_boxes
+
+@pytest.mark.parametrize("dim", [1, 2, 3])
+@pytest.mark.parametrize("order", [1, 2, 3])
+@pytest.mark.parametrize("nlevels", [1, 4])
+def test_uniform_tree_of_boxes(actx_factory, dim, order, nlevels):
+    actx = actx_factory()
+
+    rng = np.random.default_rng(seed=42)
+    lower_bounds = rng.random(dim)
+    radius = rng.random() + 0.1
+    upper_bounds = lower_bounds + radius
+    tob = make_tree_of_boxes_root((lower_bounds, upper_bounds))
+
+    for _ in range(nlevels - 1):
+        tob = uniformly_refine_tree_of_boxes(tob)
+
+    _, q = make_global_leaf_quadrature(actx, tob, order)
+
+    # integrates 1 exactly
+    box_area = actx.np.sum(q)
+    assert np.isclose(actx.to_numpy(box_area), radius**dim)
+
+# }}}
+
+
+# {{{ test_uniform_tree_of_boxes_convergence
+
+@pytest.mark.parametrize("dim", [1, 2, 3])
+@pytest.mark.parametrize("order", [1, 2, 3])
+def test_uniform_tree_of_boxes_convergence(actx_factory, dim, order):
+    actx = actx_factory()
+
+    radius = np.pi
+    lower_bounds = np.zeros(dim) - radius / 2
+    upper_bounds = lower_bounds + radius
+    tob = make_tree_of_boxes_root((lower_bounds, upper_bounds))
+
+    min_level = 0
+    max_level = 1
+
+    for _ in range(min_level):
+        tob = uniformly_refine_tree_of_boxes(tob)
+
+    # integrate cos(0.1*x + 0.2*y + 0.3*z + e) over [-pi/2, pi/2]**dim
+    qexact_table = {
+        1: 20 * np.sin(np.pi/20) * np.cos(np.e),
+        2: 50 * (np.sqrt(5) - 1) * np.sin(np.pi/20) * np.cos(np.e),
+        3: 250/3 * (np.sqrt(10 - 2*np.sqrt(5)) - 2) * np.cos(np.e)
+    }
+    qexact = qexact_table[dim]
+
+    from pytools.convergence import EOCRecorder
+    eoc_rec = EOCRecorder()
+
+    for _ in range(min_level, max_level + 1):
+        x, q = make_global_leaf_quadrature(actx, tob, order)
+        x = np.array([actx.to_numpy(xx) for xx in x])
+        q = actx.to_numpy(q)
+
+        inner = np.ones_like(q) * np.e
+        for iaxis in range(dim):
+            inner += (iaxis + 1) * 0.1 * x[iaxis]
+        f = np.cos(inner)
+        qh = np.sum(f * q)
+        err = abs(qexact - qh)
+
+        if err < 1e-14:
+            break  # eoc will be off after hitting machine epsilon
+
+        # under uniform refinement, last box is always leaf
+        eoc_rec.add_data_point(tob.get_box_size(-1), err)
+        tob = uniformly_refine_tree_of_boxes(tob)
+
+    if len(eoc_rec.history) > 1:
+        # Gauss quadrature is exact up to degree 2q+1
+        eps = 0.05
+        assert eoc_rec.order_estimate() >= 2*order + 2 - eps
+    else:
+        print(err)
+        assert err < 1e-14
+
+# }}}
+
+
+# {{{ test_tree_plot
+
+def test_tree_plot():
+    radius = np.pi
+    dim = 2
+    nlevels = 3
+    lower_bounds = np.zeros(dim) - radius / 2
+    upper_bounds = lower_bounds + radius
+    tob = make_tree_of_boxes_root((lower_bounds, upper_bounds))
+
+    for _ in range(nlevels - 1):
+        tob = uniformly_refine_tree_of_boxes(tob)
+
+    # test TreePlotter compatibility
+    from boxtree.visualization import TreePlotter
+    tp = TreePlotter(tob)
+    tp.draw_tree()
+    tp.set_bounding_box()
+
+    # import matplotlib.pyplot as plt
+    # plt.show()
+
+# }}}
+
+
+# {{{ test_traversal_from_tob
+
+
+def test_traversal_from_tob(actx_factory):
+    actx = actx_factory()
+
+    radius = np.pi
+    dim = 2
+    nlevels = 3
+    lower_bounds = np.zeros(dim) - radius/2
+    upper_bounds = lower_bounds + radius
+    tob = make_tree_of_boxes_root((lower_bounds, upper_bounds))
+
+    for _ in range(nlevels):
+        tob = uniformly_refine_tree_of_boxes(tob)
+
+    from boxtree.tree_of_boxes import _sort_boxes_by_level
+    tob = _sort_boxes_by_level(tob)
+
+    from dataclasses import replace
+    tob = replace(
+        tob,
+        box_centers=actx.from_numpy(tob.box_centers),
+        root_extent=tob.root_extent,
+        box_parent_ids=actx.from_numpy(tob.box_parent_ids),
+        box_child_ids=actx.from_numpy(tob.box_child_ids),
+        box_levels=actx.from_numpy(tob.box_levels),
+        box_flags=actx.from_numpy(tob.box_flags),
+        )
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tg = FMMTraversalBuilder(actx.context)
+    _trav, _ = tg(actx.queue, tob)
+
+# }}}
+
+
+# You can test individual routines by typing
+# $ python test_tree.py 'test_routine(cl.create_some_context)'
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker