cc810e3a · cc810e3a · cc810e3a · 5b046f33 · cc810e3a · cc810e3a
--- a/boxtree/distributed/local_tree.py
+++ b/boxtree/distributed/local_tree.py
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from mako.template import Template
+
+import pyopencl as cl
+from pyopencl.tools import dtype_to_ctype
+from pytools import memoize_method
+
+from boxtree import Tree
+
+
+logger = logging.getLogger(__name__)
+
+
+# FIXME: The logic in this file has a lot in common with
+# the particle filtering functionality that already exists.
+# We should refactor this to make use of this commonality.
+# https://documen.tician.de/boxtree/tree.html#filtering-the-lists-of-targets
+
+
+class LocalTreeGeneratorCodeContainer:
+    """Objects of this type serve as a place to keep the code needed for
+    :func:`generate_local_tree`.
+
+    Every kernel-building method is decorated with
+    :func:`pytools.memoize_method`, so repeated calls on the same instance are
+    expected to hand back the kernel built by the first call.
+    """
+    def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype):
+        """
+        :arg cl_context: the OpenCL context in which the kernels are built.
+        :arg dimensions: number of coordinate axes; one coordinate array
+            argument per axis is generated in the particle fetching kernel.
+        :arg particle_id_dtype: dtype used for particle indices, particle
+            masks and their scan.
+        :arg coord_dtype: dtype of particle coordinates and radii.
+        """
+        self.cl_context = cl_context
+        self.dimensions = dimensions
+        self.particle_id_dtype = particle_id_dtype
+        self.coord_dtype = coord_dtype
+
+    @memoize_method
+    def particle_mask_kernel(self):
+        """Return an elementwise kernel over boxes that marks particles to keep.
+
+        For every box ``i`` with a nonzero entry in ``responsible_boxes``, the
+        kernel sets ``particle_mask[pid] = 1`` for all particles owned directly
+        by the box, i.e. ``pid`` in ``[box_particle_starts[i],
+        box_particle_starts[i] + box_particle_counts_nonchild[i])``. Entries of
+        ``particle_mask`` for other particles are not written.
+        """
+        return cl.elementwise.ElementwiseKernel(
+            self.cl_context,
+            arguments=Template("""
+                __global char *responsible_boxes,
+                __global ${particle_id_t} *box_particle_starts,
+                __global ${particle_id_t} *box_particle_counts_nonchild,
+                __global ${particle_id_t} *particle_mask
+            """, strict_undefined=True).render(
+                particle_id_t=dtype_to_ctype(self.particle_id_dtype)
+            ),
+            operation=Template("""
+                if(responsible_boxes[i]) {
+                    for(${particle_id_t} pid = box_particle_starts[i];
+                        pid < box_particle_starts[i]
+                              + box_particle_counts_nonchild[i];
+                        ++pid) {
+                        particle_mask[pid] = 1;
+                    }
+                }
+            """).render(particle_id_t=dtype_to_ctype(self.particle_id_dtype))
+        )
+
+    @memoize_method
+    def mask_scan_kernel(self):
+        """Return a scan kernel that sums ``ary`` into ``scan``, shifted by one.
+
+        The scan result for input index ``i`` is stored at ``scan[i + 1]``, so
+        ``scan`` needs one more entry than ``ary``. ``scan[0]`` is never written
+        by the kernel and has to be initialized by the caller.
+        """
+        from pyopencl.scan import GenericScanKernel
+        return GenericScanKernel(
+            self.cl_context, self.particle_id_dtype,
+            arguments=Template("""
+                __global ${mask_t} *ary,
+                __global ${mask_t} *scan
+                """, strict_undefined=True).render(
+                mask_t=dtype_to_ctype(self.particle_id_dtype)
+            ),
+            input_expr="ary[i]",
+            scan_expr="a+b", neutral="0",
+            output_statement="scan[i + 1] = item;"
+        )
+
+    # Argument list template of the particle fetching kernel: the mask and its
+    # scan, one global and one local coordinate array per dimension, and
+    # (only if particles_have_extent) the global and local radii arrays.
+    fetch_local_particles_arguments = Template("""
+        __global const ${mask_t} *particle_mask,
+        __global const ${mask_t} *particle_scan
+        % for dim in range(ndims):
+            , __global const ${coord_t} *particles_${dim}
+        % endfor
+        % for dim in range(ndims):
+            , __global ${coord_t} *local_particles_${dim}
+        % endfor
+        % if particles_have_extent:
+            , __global const ${coord_t} *particle_radii
+            , __global ${coord_t} *local_particle_radii
+        % endif
+    """, strict_undefined=True)
+
+    # Body template of the particle fetching kernel: a masked particle i is
+    # copied to slot particle_scan[i] of the local arrays.
+    fetch_local_particles_prg = Template("""
+        if(particle_mask[i]) {
+            ${particle_id_t} des = particle_scan[i];
+            % for dim in range(ndims):
+                local_particles_${dim}[des] = particles_${dim}[i];
+            % endfor
+            % if particles_have_extent:
+                local_particle_radii[des] = particle_radii[i];
+            % endif
+        }
+    """, strict_undefined=True)
+
+    @memoize_method
+    def fetch_local_particles_kernel(self, particles_have_extent):
+        """Return an elementwise kernel that gathers the masked particles.
+
+        :arg particles_have_extent: if true, the generated kernel takes two
+            extra arguments (global and local particle radii) and copies the
+            radii along with the coordinates.
+        """
+        return cl.elementwise.ElementwiseKernel(
+            self.cl_context,
+            self.fetch_local_particles_arguments.render(
+                mask_t=dtype_to_ctype(self.particle_id_dtype),
+                coord_t=dtype_to_ctype(self.coord_dtype),
+                ndims=self.dimensions,
+                particles_have_extent=particles_have_extent
+            ),
+            self.fetch_local_particles_prg.render(
+                particle_id_t=dtype_to_ctype(self.particle_id_dtype),
+                ndims=self.dimensions,
+                particles_have_extent=particles_have_extent
+            )
+        )
+
+    @memoize_method
+    def mask_compressor_kernel(self):
+        """Return a :class:`boxtree.tools.MaskCompressorKernel` for this
+        context.
+        """
+        from boxtree.tools import MaskCompressorKernel
+        return MaskCompressorKernel(self.cl_context)
+
+    @memoize_method
+    def modify_target_flags_kernel(self):
+        """Return an elementwise kernel over boxes that recomputes the target
+        related bits of ``box_flags`` from the given target counts.
+
+        ``BOX_IS_TARGET_BOX`` is set iff the box has a nonzero non-child target
+        count; ``BOX_HAS_TARGET_CHILD_BOXES`` is set iff its cumulative target
+        count exceeds its non-child target count. All other bits are kept.
+        """
+        from boxtree import box_flags_enum
+        box_flag_t = dtype_to_ctype(box_flags_enum.dtype)
+
+        return cl.elementwise.ElementwiseKernel(
+            self.cl_context,
+            Template("""
+                __global ${particle_id_t} *box_target_counts_nonchild,
+                __global ${particle_id_t} *box_target_counts_cumul,
+                __global ${box_flag_t} *box_flags
+            """).render(
+                particle_id_t=dtype_to_ctype(self.particle_id_dtype),
+                box_flag_t=box_flag_t
+            ),
+            r"""
+                // reset BOX_IS_TARGET_BOX and BOX_HAS_TARGET_CHILD_BOXES bits
+                // in the flag of each box
+                box_flags[i] &= (~BOX_IS_TARGET_BOX);
+                box_flags[i] &= (~BOX_HAS_TARGET_CHILD_BOXES);
+
+                // rebuild BOX_IS_TARGET_BOX and BOX_HAS_TARGET_CHILD_BOXES bits
+                if(box_target_counts_nonchild[i]) box_flags[i] |= BOX_IS_TARGET_BOX;
+                if(box_target_counts_nonchild[i] < box_target_counts_cumul[i])
+                    box_flags[i] |= BOX_HAS_TARGET_CHILD_BOXES;
+            """,
+            preamble=box_flags_enum.get_c_defines()
+        )
+
+
+@dataclass
+class LocalParticlesAndLists:
+    # Result record of construct_local_particles_and_lists(): the particles
+    # kept for the local tree together with the rebuilt per-box index lists.
+
+    # object array with one device array of coordinates per dimension
+    particles: np.ndarray
+    # radii of the kept particles, or None if the particles have no extent
+    particle_radii: cl.array.Array | None
+    # per box: index of the box's first particle in the local numbering
+    box_particle_starts: cl.array.Array
+    # per box: number of kept particles owned directly by the box
+    box_particle_counts_nonchild: cl.array.Array
+    # per box: number of kept particles in the box's cumulative particle range
+    box_particle_counts_cumul: cl.array.Array
+    # host array of the global indices of the kept particles, ascending
+    particle_idx: np.ndarray
+
+
+def construct_local_particles_and_lists(
+        queue, code, dimensions, num_boxes, num_global_particles,
+        particle_id_dtype, coord_dtype, particles_have_extent,
+        box_mask,
+        global_particles, global_particle_radii,
+        box_particle_starts, box_particle_counts_nonchild,
+        box_particle_counts_cumul):
+    """This helper function generates particles (either sources or targets) of the
+    local tree, and reconstructs list of lists indexing accordingly.
+
+    :arg queue: command queue used for allocations and host transfers.
+    :arg code: a :class:`LocalTreeGeneratorCodeContainer`.
+    :arg dimensions: number of coordinate arrays in *global_particles*.
+    :arg num_boxes: number of boxes, i.e. the length of the per-box arrays.
+    :arg num_global_particles: number of particles in the global tree.
+    :arg particle_id_dtype: dtype of the particle mask, its scan and the
+        per-box count arrays created here.
+    :arg coord_dtype: dtype of the allocated local coordinates and radii.
+    :arg particles_have_extent: whether radii are copied as well.
+    :arg box_mask: device array with a nonzero entry for each box whose
+        directly owned particles are kept.
+    :arg global_particles: object array of device coordinate arrays.
+    :arg global_particle_radii: device array of radii; only used if
+        *particles_have_extent* is true.
+    :arg box_particle_starts: per-box start indices into the global particles.
+    :arg box_particle_counts_nonchild: per-box number of directly owned
+        particles.
+    :arg box_particle_counts_cumul: per-box cumulative particle counts.
+    :return: a :class:`LocalParticlesAndLists`. Its arrays live on the device,
+        except for ``particle_idx``, which is a host array.
+    """
+    # {{{ calculate the particle mask
+
+    # particle_mask[pid] == 1 iff particle pid is directly owned by a masked box
+    particle_mask = cl.array.zeros(
+        queue, num_global_particles, dtype=particle_id_dtype)
+
+    code.particle_mask_kernel()(
+        box_mask, box_particle_starts, box_particle_counts_nonchild, particle_mask)
+
+    # }}}
+
+    # {{{ calculate the scan of the particle mask
+
+    # Entry k holds the number of kept particles among global particles
+    # 0..k-1, i.e. the local index of particle k if it is kept. The last
+    # entry is the total number of kept particles.
+    global_to_local_particle_index = cl.array.empty(
+        queue, num_global_particles + 1, dtype=particle_id_dtype)
+
+    # the scan kernel writes entries 1.. only, so entry 0 is set by hand
+    global_to_local_particle_index[0] = 0
+    code.mask_scan_kernel()(particle_mask, global_to_local_particle_index)
+
+    # }}}
+
+    # {{{ fetch the local particles
+
+    # transfers the total count to the host, needed to size the local arrays
+    num_local_particles = global_to_local_particle_index[-1].get(queue).item()
+
+    local_particles = [
+        cl.array.empty(queue, num_local_particles, dtype=coord_dtype)
+        for _ in range(dimensions)]
+
+    from pytools.obj_array import make_obj_array
+    local_particles = make_obj_array(local_particles)
+
+    local_particle_radii = None
+    if particles_have_extent:
+        local_particle_radii = cl.array.empty(
+            queue, num_local_particles, dtype=coord_dtype)
+
+        code.fetch_local_particles_kernel(True)(
+            particle_mask, global_to_local_particle_index,
+            *global_particles.tolist(),
+            *local_particles,
+            global_particle_radii,
+            local_particle_radii)
+    else:
+        code.fetch_local_particles_kernel(False)(
+            particle_mask, global_to_local_particle_index,
+            *global_particles.tolist(),
+            *local_particles)
+
+    # }}}
+
+    # {{{ construct the list of list indices
+
+    # a box's local start is the number of kept particles before its global
+    # start
+    local_box_particle_starts = global_to_local_particle_index[box_particle_starts]
+
+    box_counts_all_zeros = cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype)
+
+    # boxes outside the mask keep none of their directly owned particles
+    local_box_particle_counts_nonchild = cl.array.if_positive(
+        box_mask, box_particle_counts_nonchild, box_counts_all_zeros)
+
+    box_particle_ends_cumul = box_particle_starts + box_particle_counts_cumul
+
+    # number of kept particles inside each box's cumulative range, obtained as
+    # the difference of the scan at the range's end and start
+    local_box_particle_counts_cumul = (
+        global_to_local_particle_index[box_particle_ends_cumul]
+        - global_to_local_particle_index[box_particle_starts])
+
+    # }}}
+
+    # global indices of the kept particles, computed on the host
+    particle_mask = particle_mask.get(queue=queue).astype(bool)
+    particle_idx = np.arange(num_global_particles)[particle_mask]
+
+    return LocalParticlesAndLists(
+        local_particles,
+        local_particle_radii,
+        local_box_particle_starts,
+        local_box_particle_counts_nonchild,
+        local_box_particle_counts_cumul,
+        particle_idx)
+
+
+class LocalTree(Tree):
+    """
+    Inherits from :class:`boxtree.Tree`.
+
+    A local tree, as built by :func:`generate_local_tree`, additionally carries
+    the attributes below.
+
+    .. attribute:: box_to_user_rank_starts
+
+        ``box_id_t [nboxes + 1]``
+
+    .. attribute:: box_to_user_rank_lists
+
+        ``int32 [*]``
+
+        A :ref:`csr` array, together with :attr:`box_to_user_rank_starts`.
+        For each box, the list of ranks which own targets that *use* the
+        multipole expansion at this box, via either List 3 or (possibly downward
+        propagated from an ancestor) List 2.
+    """
+
+
+def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
+    """Generate the local tree for the current rank.
+
+    This is an MPI-collective routine on *comm*.
+
+    :arg queue: a :class:`pyopencl.CommandQueue` object.
+    :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object
+        on host memory.
+    :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the
+        responsible boxes of the current rank.
+    :arg comm: the MPI communicator. Rank 0 of *comm* gathers the multipole
+        usage masks of all ranks and broadcasts the resulting box-to-user-rank
+        lists.
+
+    :return: a tuple of ``(local_tree, src_idx, tgt_idx)``, where ``local_tree`` is
+        an object with class :class:`boxtree.distributed.local_tree.LocalTree` of the
+        generated local tree, ``src_idx`` is the indices of the local sources in the
+        global tree, and ``tgt_idx`` is the indices of the local targets in the
+        global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source
+        weights from root rank and assembling calculated potentials on the root rank.
+    """
+    global_tree = global_traversal.tree
+    code = LocalTreeGeneratorCodeContainer(
+            queue.context, global_tree.dimensions,
+            global_tree.particle_id_dtype, global_tree.coord_dtype)
+
+    mpi_rank = comm.Get_rank()
+    mpi_size = comm.Get_size()
+
+    start_time = time.time()
+
+    from boxtree.distributed.partition import get_box_masks
+    box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list)
+
+    global_tree_dev = global_tree.to_device(queue).with_queue(queue)
+
+    # Sources are kept for every box whose point sources this rank needs.
+    # The radii attribute is spelled ``source_radii`` (singular), matching the
+    # keyword used to build the local tree below and ``target_radii``.
+    local_sources_and_lists = construct_local_particles_and_lists(
+        queue, code, global_tree.dimensions, global_tree.nboxes,
+        global_tree.nsources,
+        global_tree.particle_id_dtype, global_tree.coord_dtype,
+        global_tree.sources_have_extent,
+        box_masks.point_src_boxes,
+        global_tree_dev.sources,
+        global_tree_dev.source_radii if global_tree.sources_have_extent else None,
+        global_tree_dev.box_source_starts,
+        global_tree_dev.box_source_counts_nonchild,
+        global_tree_dev.box_source_counts_cumul)
+
+    # Targets are only kept for the boxes this rank is responsible for.
+    local_targets_and_lists = construct_local_particles_and_lists(
+        queue, code, global_tree.dimensions, global_tree.nboxes,
+        global_tree.ntargets,
+        global_tree.particle_id_dtype, global_tree.coord_dtype,
+        global_tree.targets_have_extent,
+        box_masks.responsible_boxes,
+        global_tree_dev.targets,
+        global_tree_dev.target_radii if global_tree.targets_have_extent else None,
+        global_tree_dev.box_target_starts,
+        global_tree_dev.box_target_counts_nonchild,
+        global_tree_dev.box_target_counts_cumul)
+
+    # {{{ compute the users of multipole expansions of each box on the root rank
+
+    # The receive buffer is only significant on the root rank; row r holds
+    # the multipole usage mask of rank r.
+    multipole_src_boxes_all_ranks = None
+    if mpi_rank == 0:
+        multipole_src_boxes_all_ranks = np.empty(
+            (mpi_size, global_tree.nboxes),
+            dtype=box_masks.multipole_src_boxes.dtype)
+    comm.Gather(
+        box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0)
+
+    box_to_user_rank_starts = None
+    box_to_user_rank_lists = None
+
+    if mpi_rank == 0:
+        multipole_src_boxes_all_ranks = cl.array.to_device(
+            queue, multipole_src_boxes_all_ranks)
+
+        # Transposed, the mask is indexed (box, rank), so compressing it gives
+        # for each box the list of ranks using its multipole expansion.
+        (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \
+            code.mask_compressor_kernel()(
+                queue, multipole_src_boxes_all_ranks.transpose(),
+                list_dtype=np.int32)
+
+        cl.wait_for_events([evt])
+
+        box_to_user_rank_starts = box_to_user_rank_starts.get()
+        box_to_user_rank_lists = box_to_user_rank_lists.get()
+
+        logger.debug("computing box_to_user: done")
+
+    box_to_user_rank_starts = comm.bcast(box_to_user_rank_starts, root=0)
+    box_to_user_rank_lists = comm.bcast(box_to_user_rank_lists, root=0)
+
+    # }}}
+
+    # {{{ Reconstruct the target box flags
+
+    # Note: We do not change the source box flags despite the local tree may only
+    # contain a subset of sources. This is because evaluating target potentials in
+    # the responsible boxes of the current rank may depend on the multipole
+    # expansions formed by sources in other ranks. Modifying the source box flags
+    # could result in incomplete interaction lists.
+
+    local_box_flags = global_tree_dev.box_flags.copy(queue=queue)
+    code.modify_target_flags_kernel()(
+        local_targets_and_lists.box_particle_counts_nonchild,
+        local_targets_and_lists.box_particle_counts_cumul,
+        local_box_flags)
+
+    # }}}
+
+    # Bring the local particles back to the host for the tree constructor.
+    from pytools.obj_array import make_obj_array
+    local_sources = make_obj_array([
+        local_sources_idim.get(queue=queue)
+        for local_sources_idim in local_sources_and_lists.particles])
+    local_targets = make_obj_array([
+        local_target_idim.get(queue=queue)
+        for local_target_idim in local_targets_and_lists.particles])
+
+    # Box geometry and hierarchy are shared with the global tree; only
+    # particles, particle lists and target flags are specific to this rank.
+    local_tree = LocalTree(
+        sources_are_targets=global_tree.sources_are_targets,
+        sources_have_extent=global_tree.sources_have_extent,
+        targets_have_extent=global_tree.targets_have_extent,
+
+        particle_id_dtype=global_tree.particle_id_dtype,
+        box_id_dtype=global_tree.box_id_dtype,
+        coord_dtype=global_tree.coord_dtype,
+        box_level_dtype=global_tree.box_level_dtype,
+
+        root_extent=global_tree.root_extent,
+        stick_out_factor=global_tree.stick_out_factor,
+        extent_norm=global_tree.extent_norm,
+
+        bounding_box=global_tree.bounding_box,
+        level_start_box_nrs=global_tree.level_start_box_nrs,
+        level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev,
+
+        sources=local_sources,
+        targets=local_targets,
+        source_radii=(local_sources_and_lists.particle_radii.get(queue=queue)
+                if global_tree.sources_have_extent else None),
+        target_radii=(local_targets_and_lists.particle_radii.get(queue=queue)
+                if global_tree.targets_have_extent else None),
+
+        box_source_starts=(
+            local_sources_and_lists.box_particle_starts.get(queue=queue)),
+        box_source_counts_nonchild=(
+            local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+        box_source_counts_cumul=(
+            local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)),
+        box_target_starts=(
+            local_targets_and_lists.box_particle_starts.get(queue=queue)),
+        box_target_counts_nonchild=(
+            local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+        box_target_counts_cumul=(
+            local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)),
+
+        box_parent_ids=global_tree.box_parent_ids,
+        box_child_ids=global_tree.box_child_ids,
+        box_centers=global_tree.box_centers,
+        box_levels=global_tree.box_levels,
+        box_flags=local_box_flags.get(queue=queue),
+
+        user_source_ids=None,
+        sorted_target_ids=None,
+
+        box_source_bounding_box_min=global_tree.box_source_bounding_box_min,
+        box_source_bounding_box_max=global_tree.box_source_bounding_box_max,
+        box_target_bounding_box_min=global_tree.box_target_bounding_box_min,
+        box_target_bounding_box_max=global_tree.box_target_bounding_box_max,
+
+        _is_pruned=global_tree._is_pruned,
+
+        responsible_boxes_list=responsible_boxes_list,
+        responsible_boxes_mask=box_masks.responsible_boxes.get(),
+        ancestor_mask=box_masks.ancestor_boxes.get(),
+        box_to_user_rank_starts=box_to_user_rank_starts,
+        box_to_user_rank_lists=box_to_user_rank_lists
+    )
+
+    local_tree = local_tree.to_host_device_array(queue)
+    # NOTE(review): the return value is discarded here. If with_queue()
+    # returns a modified copy (it is used that way on the global tree above),
+    # this call has no effect on the returned tree -- confirm the intent.
+    local_tree.with_queue(None)
+
+    logger.info("Generate local tree on rank %d in %f sec.",
+            mpi_rank, time.time() - start_time)
+
+    return (
+        local_tree,
+        local_sources_and_lists.particle_idx,
+        local_targets_and_lists.particle_idx)
--- a/boxtree/distributed/partition.py
+++ b/boxtree/distributed/partition.py
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from dataclasses import dataclass
+
+import numpy as np
+from mako.template import Template
+
+import pyopencl as cl
+from pyopencl.tools import dtype_to_ctype
+from pytools import memoize_method
+
+
+def get_box_ids_dfs_order(tree):
+    """Return the box ids of *tree* in the order of a depth-first traversal.
+
+    The traversal starts at box 0 and uses an explicit stack instead of
+    recursion.
+
+    :arg tree: A :class:`boxtree.Tree` object in the host memory. See
+        :meth:`boxtree.Tree.get` for getting a tree object in host memory.
+    :return: A numpy array of length ``tree.nboxes`` and dtype
+        ``tree.box_id_dtype`` holding the box ids in depth-first order.
+    """
+    # FIXME: optimize the performance with OpenCL
+    nchildren = 2**tree.dimensions
+    ordering = np.empty((tree.nboxes,), dtype=tree.box_id_dtype)
+
+    pending = [0]
+    nvisited = 0
+    while pending:
+        current = pending.pop()
+        ordering[nvisited] = current
+        nvisited += 1
+
+        # a child id of 0 marks an absent child (box 0 is never a child)
+        pending.extend(
+            child
+            for child in (
+                tree.box_child_ids[ichild][current]
+                for ichild in range(nchildren))
+            if child > 0)
+
+    return ordering
+
+
+def partition_work(cost_per_box, traversal, comm):
+    """This function assigns responsible boxes for each rank.
+
+    If a rank is responsible for a box, it will calculate the multipole expansion
+    of the box and evaluate target potentials in the box.
+
+    The boxes are arranged in depth-first order and cut into one contiguous
+    segment per rank, such that the accumulated cost of the segments is roughly
+    balanced. The segments are computed on the root rank and scattered, so this
+    is an MPI-collective routine on *comm*.
+
+    :arg cost_per_box: The expected running time of each box. This argument is only
+        significant on the root rank.
+    :arg traversal: The global traversal object containing all particles. This
+        argument is significant on all ranks.
+    :arg comm: MPI communicator.
+    :return: A numpy array containing the responsible boxes of the current rank.
+        The array is empty if all boxes were already assigned to lower ranks.
+    :raises RuntimeError: if there are more ranks than boxes.
+    """
+    tree = traversal.tree
+    mpi_rank = comm.Get_rank()
+    mpi_size = comm.Get_size()
+
+    if mpi_size > tree.nboxes:
+        raise RuntimeError("Fail to partition work because the number of boxes is "
+                           "less than the number of processes.")
+
+    # transform tree from the level order to the morton dfs order
+    # dfs_order[i] stores the level-order box index of dfs index i
+    dfs_order = get_box_ids_dfs_order(tree)
+
+    # partition all boxes in dfs order evenly according to workload on the root rank
+
+    responsible_boxes_segments = None
+    # contains: [start_index, end_index)
+    responsible_boxes_current_rank = np.empty(2, dtype=tree.box_id_dtype)
+
+    # FIXME: Right now, the responsible boxes assigned to all ranks are computed
+    # centrally on the root rank to avoid inconsistency risks of floating point
+    # operations. We could improve the efficiency by letting each rank compute the
+    # costs of a subset of boxes, and use MPI_Scan to aggregate the results.
+    if mpi_rank == 0:
+        total_workload = np.sum(cost_per_box)
+
+        # second axis: [start_index, end_index)
+        #
+        # Every segment starts out as the empty range [nboxes, nboxes). The
+        # loop below can run out of boxes before all ranks got a segment (e.g.
+        # if a workload threshold is first exceeded at the very last box).
+        # Those ranks must receive a well-defined empty segment rather than
+        # uninitialized memory.
+        responsible_boxes_segments = np.full(
+            (mpi_size, 2), tree.nboxes, dtype=tree.box_id_dtype)
+        segment_idx = 0
+        start = 0
+        workload_count = 0
+        for box_idx_dfs_order in range(tree.nboxes):
+            if segment_idx + 1 == mpi_size:
+                # the last rank takes all remaining boxes
+                responsible_boxes_segments[segment_idx, :] = [start, tree.nboxes]
+                break
+
+            box_idx = dfs_order[box_idx_dfs_order]
+            workload_count += cost_per_box[box_idx]
+            if (workload_count > (segment_idx + 1) * total_workload / mpi_size
+                    or box_idx_dfs_order == tree.nboxes - 1):
+                # record "end of rank segment"
+                responsible_boxes_segments[segment_idx, :] = (
+                    [start, box_idx_dfs_order + 1])
+                start = box_idx_dfs_order + 1
+                segment_idx += 1
+
+    comm.Scatter(responsible_boxes_segments, responsible_boxes_current_rank, root=0)
+
+    return dfs_order[
+        responsible_boxes_current_rank[0]:responsible_boxes_current_rank[1]]
+
+
+class GetBoxMasksCodeContainer:
+    """Objects of this type hold the OpenCL kernels used to compute the box
+    masks in :func:`get_box_masks`.
+    """
+    def __init__(self, cl_context, box_id_dtype):
+        """
+        :arg cl_context: the OpenCL context in which the kernels are built.
+        :arg box_id_dtype: dtype of box ids, used for the box lists, the
+            interaction list arrays and the parent ids in the kernels.
+        """
+        self.cl_context = cl_context
+        self.box_id_dtype = box_id_dtype
+
+    @memoize_method
+    def add_interaction_list_boxes_kernel(self):
+        """Given a ``responsible_boxes_mask`` and an interaction list, mark source
+        boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask.
+
+        The kernel runs over the entries of ``box_list``. If ``box_list[i]`` is
+        set in ``responsible_boxes_mask``, every box in the ``i``-th list of
+        the CSR-style pair ``interaction_boxes_starts`` /
+        ``interaction_boxes_lists`` is set to 1 in ``src_boxes_mask``. Entries
+        of ``src_boxes_mask`` are never cleared.
+        """
+        return cl.elementwise.ElementwiseKernel(
+            self.cl_context,
+            Template("""
+                __global ${box_id_t} *box_list,
+                __global char *responsible_boxes_mask,
+                __global ${box_id_t} *interaction_boxes_starts,
+                __global ${box_id_t} *interaction_boxes_lists,
+                __global char *src_boxes_mask
+            """, strict_undefined=True).render(
+                box_id_t=dtype_to_ctype(self.box_id_dtype)
+            ),
+            Template(r"""
+                typedef ${box_id_t} box_id_t;
+                box_id_t current_box = box_list[i];
+                if(responsible_boxes_mask[current_box]) {
+                    for(box_id_t box_idx = interaction_boxes_starts[i];
+                        box_idx < interaction_boxes_starts[i + 1];
+                        ++box_idx)
+                        src_boxes_mask[interaction_boxes_lists[box_idx]] = 1;
+                }
+            """, strict_undefined=True).render(
+                box_id_t=dtype_to_ctype(self.box_id_dtype)
+            ),
+        )
+
+    @memoize_method
+    def add_parent_boxes_kernel(self):
+        """Return an elementwise kernel over boxes that sets ``parent`` to 1 at
+        the parent of every box set in ``current``.
+
+        Box 0 is skipped, so its parent id is never read.
+        """
+        return cl.elementwise.ElementwiseKernel(
+            self.cl_context,
+            "__global char *current, __global char *parent, "
+            f"__global {dtype_to_ctype(self.box_id_dtype)} *box_parent_ids",
+            "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1"
+        )
+
+
+def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
+    """Query the ancestors of responsible boxes.
+
+    The mask is built iteratively: each pass marks the parents of the boxes
+    found in the previous pass, starting from the responsible boxes.
+
+    :arg queue: command queue used to allocate the mask arrays.
+    :arg code: a :class:`GetBoxMasksCodeContainer`.
+    :arg traversal: the traversal whose tree provides ``nboxes`` and
+        ``box_parent_ids``.
+    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
+        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
+    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by
+        *responsible_boxes_mask*.
+    """
+    ancestor_boxes = cl.array.zeros(queue, (traversal.tree.nboxes,), dtype=np.int8)
+    # boxes whose parents still have to be marked
+    ancestor_boxes_last = responsible_boxes_mask.copy()
+
+    while ancestor_boxes_last.any():
+        ancestor_boxes_new = cl.array.zeros(
+            queue, (traversal.tree.nboxes,), dtype=np.int8)
+        code.add_parent_boxes_kernel()(
+            ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids)
+        # keep only the boxes not found in an earlier pass, so that the next
+        # pass does not process them again
+        ancestor_boxes_new = ancestor_boxes_new & (~ancestor_boxes)
+        ancestor_boxes = ancestor_boxes | ancestor_boxes_new
+        ancestor_boxes_last = ancestor_boxes_new
+
+    return ancestor_boxes
+
+
+def get_point_src_boxes_mask(
+        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+    """Query the boxes whose sources are needed in order to evaluate potentials
+    of boxes represented by *responsible_boxes_mask*.
+
+    The result contains the responsible boxes themselves plus the source boxes
+    found in the relevant interaction lists of *traversal*.
+
+    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
+        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
+        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
+        or an ancestor of the responsible boxes.
+    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if sources of box ``i`` are needed for evaluating the
+        potentials of targets in boxes represented by *responsible_boxes_mask*.
+    """
+
+    needed_boxes = responsible_boxes_mask.copy()
+    mark_list_boxes = code.add_interaction_list_boxes_kernel()
+
+    # Add list 1 of responsible boxes
+    mark_list_boxes(
+        traversal.target_boxes,
+        responsible_boxes_mask,
+        traversal.neighbor_source_boxes_starts,
+        traversal.neighbor_source_boxes_lists,
+        needed_boxes,
+        queue=queue)
+
+    # Add list 4 of responsible boxes or ancestor boxes
+    mark_list_boxes(
+        traversal.target_or_target_parent_boxes,
+        responsible_boxes_mask | ancestor_boxes_mask,
+        traversal.from_sep_bigger_starts,
+        traversal.from_sep_bigger_lists,
+        needed_boxes,
+        queue=queue)
+
+    # The "close" lists are only consulted if targets have extent.
+    if not traversal.tree.targets_have_extent:
+        return needed_boxes
+
+    # Add list 3 close of responsible boxes
+    if traversal.from_sep_close_smaller_starts is not None:
+        mark_list_boxes(
+            traversal.target_boxes,
+            responsible_boxes_mask,
+            traversal.from_sep_close_smaller_starts,
+            traversal.from_sep_close_smaller_lists,
+            needed_boxes,
+            queue=queue)
+
+    # Add list 4 close of responsible boxes
+    if traversal.from_sep_close_bigger_starts is not None:
+        mark_list_boxes(
+            traversal.target_boxes,
+            responsible_boxes_mask | ancestor_boxes_mask,
+            traversal.from_sep_close_bigger_starts,
+            traversal.from_sep_close_bigger_lists,
+            needed_boxes,
+            queue=queue)
+
+    return needed_boxes
+
+
+def get_multipole_src_boxes_mask(
+        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+    """Query the boxes whose multipoles are used in order to evaluate
+    potentials of targets in boxes represented by *responsible_boxes_mask*.
+
+    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
+        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
+        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
+        or an ancestor of the responsible boxes.
+    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)``
+        whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating
+        the potentials of targets in boxes represented by *responsible_boxes_mask*.
+    """
+
+    nboxes = traversal.tree.nboxes
+    used_mpoles = cl.array.zeros(queue, (nboxes,), dtype=np.int8)
+
+    # A mpole is used by process p if it is in the List 2 of either a box
+    # owned by p or one of its ancestors.
+    list2_kernel = code.add_interaction_list_boxes_kernel()
+    list2_kernel(
+        traversal.target_or_target_parent_boxes,
+        responsible_boxes_mask | ancestor_boxes_mask,
+        traversal.from_sep_siblings_starts,
+        traversal.from_sep_siblings_lists,
+        used_mpoles,
+        queue=queue)
+    used_mpoles.finish()
+
+    # A mpole is used by process p if it is in the List 3 of a box owned by p.
+    for ilevel in range(traversal.tree.nlevels):
+        list3_kernel = code.add_interaction_list_boxes_kernel()
+        list3_kernel(
+            traversal.target_boxes_sep_smaller_by_source_level[ilevel],
+            responsible_boxes_mask,
+            traversal.from_sep_smaller_by_level[ilevel].starts,
+            traversal.from_sep_smaller_by_level[ilevel].lists,
+            used_mpoles,
+            queue=queue)
+
+        used_mpoles.finish()
+
+    return used_mpoles
+
+
+@dataclass
+class BoxMasks:
+    """
+    Box masks needed for the distributed calculation. Each of these masks is a
+    PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
+    set.
+
+    .. attribute:: responsible_boxes
+
+        Current process will evaluate target potentials and multipole expansions in
+        these boxes. Sources and targets in these boxes are needed.
+
+    .. attribute:: ancestor_boxes
+
+        Ancestors of the responsible boxes.
+
+    .. attribute:: point_src_boxes
+
+        Current process needs sources but not targets in these boxes.
+
+    .. attribute:: multipole_src_boxes
+
+        Current process needs multipole expansions in these boxes.
+    """
+    responsible_boxes: cl.array.Array
+    ancestor_boxes: cl.array.Array
+    point_src_boxes: cl.array.Array
+    multipole_src_boxes: cl.array.Array
+
+
+def get_box_masks(queue, traversal, responsible_boxes_list):
+    """Compute all box masks a rank needs from its list of responsible boxes.
+
+    :arg queue: command queue on which the masks are computed.
+    :arg traversal: the global traversal; it is copied to the device here.
+    :arg responsible_boxes_list: A numpy array of responsible box indices.
+
+    :returns: A :class:`BoxMasks` object of the relevant masks.
+    """
+    code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype)
+
+    # FIXME: It is wasteful to copy the whole traversal object into device memory
+    # here because
+    # 1) Not all fields are needed.
+    # 2) For sumpy wrangler, a device traversal object is already available.
+    traversal_dev = traversal.to_device(queue)
+
+    # build the responsible box mask on the host, then move it to the device
+    host_mask = np.zeros((traversal_dev.tree.nboxes,), dtype=np.int8)
+    host_mask[responsible_boxes_list] = 1
+    responsible = cl.array.to_device(queue, host_mask)
+
+    ancestors = get_ancestor_boxes_mask(queue, code, traversal_dev, responsible)
+    point_sources = get_point_src_boxes_mask(
+        queue, code, traversal_dev, responsible, ancestors)
+    multipole_sources = get_multipole_src_boxes_mask(
+        queue, code, traversal_dev, responsible, ancestors)
+
+    return BoxMasks(
+        responsible_boxes=responsible,
+        ancestor_boxes=ancestors,
+        point_src_boxes=point_sources,
+        multipole_src_boxes=multipole_sources)
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
-from __future__ import division
+"""
+.. autofunction:: drive_fmm
+
+.. autoclass:: TreeIndependentDataForWrangler
+.. autoclass:: ExpansionWranglerInterface
+"""

 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

@@ -23,75 +28,391 @@ THE SOFTWARE.
 """

 import logging
+from abc import ABC, abstractmethod
+
+
 logger = logging.getLogger(__name__)
+from pytools import ProcessLogger
+
+from boxtree.traversal import FMMTraversalInfo
+from boxtree.tree import Tree
+
+
+# {{{ expansion wrangler interface
+
+class TreeIndependentDataForWrangler:
+    """An object that can be used to store information for efficient
+    wrangler execution that depends on the kernel but not the tree and/or
+    the traversal.
+
+    Examples of such data include generated code for carrying out
+    translations.
+
+    .. note::
+
+        Instances of this type should not hold a reference (and thereby be
+        specific to) a :class:`boxtree.Tree` instance. Their purpose is to
+        host caches for generated translation code that is reusable across
+        trees. It is OK for these instances to be specific to a given kernel
+        (or set of kernels).
+    """
+
+
+class ExpansionWranglerInterface(ABC):
+    """Abstract expansion handling interface for use with :func:`drive_fmm`.
+
+    See this
+    `test code <https://github.com/inducer/boxtree/blob/main/test/test_fmm.py>`__
+    for a very simple sample implementation.
+
+    .. note::
+
+        Wranglers may hold a reference (and thereby be specific to) a
+        :class:`boxtree.Tree` instance.
+        :class:`TreeIndependentDataForWrangler` exists to hold data that
+        is more broadly reusable.
+
+    Functions that support returning timing data return a value supporting the
+    :class:`~boxtree.timing.TimingFuture` interface.
+
+    .. versionchanged:: 2018.1
+
+        Changed (a subset of) functions to return timing data.
+
+    .. attribute:: tree_indep
+
+        An instance of (a typically wrangler-dependent subclass of)
+        :class:`TreeIndependentDataForWrangler`.
+
+    .. attribute:: traversal
+
+        An instance of :class:`~boxtree.traversal.FMMTraversalInfo`.
+
+    .. autoattribute:: tree
+
+    .. rubric:: Particle ordering
+
+    .. automethod:: reorder_sources
+    .. automethod:: reorder_potentials
+
+    .. rubric:: Views into arrays of expansions
+
+    .. automethod:: multipole_expansions_view
+    .. automethod:: local_expansions_view
+
+    .. rubric:: Translations
+
+    .. automethod:: form_multipoles
+    .. automethod:: coarsen_multipoles
+    .. automethod:: eval_direct
+    .. automethod:: multipole_to_local
+    .. automethod:: eval_multipoles
+    .. automethod:: form_locals
+    .. automethod:: refine_locals
+    .. automethod:: eval_locals
+    .. automethod:: finalize_potentials
+    """
+
+    def __init__(self, tree_indep: TreeIndependentDataForWrangler,
+            traversal: FMMTraversalInfo):
+        self.tree_indep = tree_indep
+        self.traversal = traversal
+
+    @property
+    def tree(self) -> Tree:
+        return self.traversal.tree
+
+    @abstractmethod
+    def reorder_sources(self, source_array):
+        """Return a copy of *source_array* in
+        :ref:`tree source order <particle-orderings>`.
+        *source_array* is in user source order.
+        """
+
+    @abstractmethod
+    def reorder_potentials(self, potentials):
+        """Return a copy of *potentials* in
+        :ref:`user target order <particle-orderings>`.
+        *potentials* is in tree target order.
+        """
+
+    # {{{ views into arrays of expansions
+
+    # Included here for the benefit of the distributed-memory FMM
+
+    @abstractmethod
+    def multipole_expansions_view(self, mpole_exps, level):
+        pass
+
+    @abstractmethod
+    def local_expansions_view(self, local_exps, level):
+        pass
+
+    # }}}
+
+    # {{{ translations
+
+    @abstractmethod
+    def form_multipoles(self,
+            level_start_source_box_nrs, source_boxes,
+            src_weight_vecs):
+        """Return an expansions array
+        containing multipole expansions in *source_boxes* due to sources
+        with *src_weight_vecs*.
+        All other expansions must be zero.
+
+        :return: A pair (*mpoles*, *timing_future*).
+        """
+
+    @abstractmethod
+    def coarsen_multipoles(self,
+            level_start_source_parent_box_nrs,
+            source_parent_boxes, mpoles):
+        """For each box in *source_parent_boxes*,
+        gather (and translate) the box's children's multipole expansions in
+        *mpoles* and add the resulting expansion into the box's multipole
+        expansion in *mpoles*.
+
+        :returns: A pair (*mpoles*, *timing_future*).
+        """
+
+    @abstractmethod
+    def eval_direct(self,
+            target_boxes, neighbor_sources_starts,
+            neighbor_sources_lists, src_weight_vecs):
+        """For each box in *target_boxes*, evaluate the influence of the
+        neighbor sources due to *src_weight_vecs*, which use :ref:`csr` and are
+        indexed like *target_boxes*.
+
+        :returns: A pair (*pot*, *timing_future*), where *pot* is
+            a new potential array.
+        """
+
+    @abstractmethod
+    def multipole_to_local(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes,
+            starts, lists, mpole_exps):
+        """For each box in *target_or_target_parent_boxes*, translate and add
+        the influence of the multipole expansion in *mpole_exps* into a new
+        array of local expansions.  *starts* and *lists* use :ref:`csr`, and
+        *starts* is indexed like *target_or_target_parent_boxes*.
+
+        :returns: A pair (*pot*, *timing_future*) where *pot* is
+            a new (local) expansion array.
+        """
+
+    @abstractmethod
+    def eval_multipoles(self,
+            target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps):
+        """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate
+        the multipole expansion in *mpole_exps* in the nearby boxes given in
+        *from_sep_smaller_by_level*, and return a new potential array.
+        *starts* and *lists* in *from_sep_smaller_by_level[i]* use :ref:`csr`
+        and *starts* is indexed like *target_boxes_by_source_level[i]*.
+
+        :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
+            array.
+        """
+
+    @abstractmethod
+    def form_locals(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+        """For each box in *target_or_target_parent_boxes*, form local
+        expansions due to the sources in the nearby boxes given in *starts* and
+        *lists*, and return a new local expansion array.  *starts* and *lists*
+        use :ref:`csr` and *starts* is indexed like
+        *target_or_target_parent_boxes*.
+
+        :returns: A pair (*pot*, *timing_future*) where *pot* is a new
+            local expansion array.
+        """
+
+    @abstractmethod
+    def refine_locals(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, local_exps):
+        """For each box in *target_or_target_parent_boxes*,
+        translate the box's parent's local expansion in *local_exps* and add
+        the resulting expansion into the box's local expansion in *local_exps*.
+
+        :returns: A pair (*local_exps*, *timing_future*).
+        """
+
+    @abstractmethod
+    def eval_locals(self,
+            level_start_target_box_nrs, target_boxes, local_exps):
+        """For each box in *target_boxes*, evaluate the local expansion in
+        *local_exps* and return a new potential array.
+
+        :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
+            array.
+        """
+
+    # }}}
+
+    @abstractmethod
+    def finalize_potentials(self, potentials, template_ary):
+        """
+        Postprocess the reordered potentials. This is where global scaling
+        factors could be applied. This is distinct from :meth:`reorder_potentials`
+        because some derived FMMs (notably the QBX FMM) do their own reordering.
+
+        :arg template_ary: If the array type used inside of the FMM
+            is different from the array type used by the user (e.g.
+            :class:`boxtree.pyfmmlib_integration.FMMLibExpansionWrangler`
+            uses :class:`numpy.ndarray` internally), this array can be used
+            to help convert the output back to the user's array
+            type (typically :class:`pyopencl.array.Array`).
+        """
+
+    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+        """Used by the distributed implementation for transferring needed source
+        weights from root rank to each worker rank in the communicator.
+
+        This method needs to be called collectively by all ranks in the communicator.
+
+        :arg src_weight_vecs: a sequence of :class:`numpy.ndarray`, each with length
+            ``nsources``, representing the weights of sources on the root rank.
+            *None* on worker ranks.
+        :arg src_idx_all_ranks: a :class:`list` of length ``nranks``, including the
+            root rank, where the i-th entry is a :class:`numpy.ndarray` of the
+            indices of the source weights to be sent from the root rank to rank *i*.
+            Each entry can be generated by :func:`.generate_local_tree`. *None* on
+            worker ranks.
+
+        :return: Received source weights of the current rank, including the root
+            rank.
+        """
+        return src_weight_vecs
+
+    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
+        """Used by the distributed implementation for gathering calculated potentials
+        from all worker ranks in the communicator to the root rank.
+
+        This method needs to be called collectively by all ranks in the communicator.
+
+        :arg potentials: Calculated potentials on each rank. This argument is
+            significant on all ranks, including the root rank.
+        :arg tgt_idx_all_ranks: a :class:`list` of length ``nranks``, where the
+            i-th entry is a :class:`numpy.ndarray` of the global potential indices
+            of potentials from rank *i*. This argument is only significant on the
+            root rank.
+
+        :return: Gathered potentials on the root rank. *None* on worker ranks.
+        """
+        return potentials
+
+    def communicate_mpoles(self, mpole_exps, return_stats=False):  # noqa: B027
+        """Used by the distributed implementation for forming the complete multipole
+        expansions from the partial multipole expansions.
+
+        This function accepts partial multipole expansions in the argument
+        *mpole_exps*, and modifies *mpole_exps* in place with the communicated and
+        reduced multipole expansions.

+        This function needs to be called collectively by all ranks in the
+        communicator.

-def drive_fmm(traversal, expansion_wrangler, src_weights):
+        :returns: Statistics of the communication if *return_stats* is True. *None*
+            otherwise.
+        """
+        pass
+
+# }}}
+
+
+def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
+              timing_data=None,
+              global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
-    `source code <https://github.com/inducer/boxtree/blob/master/boxtree/fmm.py>`_
+    `source code <https://github.com/inducer/boxtree/blob/main/boxtree/fmm.py>`__
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *expansion_wrangler* to this routine.

-    :arg traversal: A :class:`boxtree.traversal.FMMTraversalInfo` instance.
    :arg expansion_wrangler: An object exhibiting the
-        :class:`ExpansionWranglerInterface`.
-    :arg src_weights: Source 'density/weights/charges'.
-        Passed unmodified to *expansion_wrangler*.
-
-    Returns the potentials computed by *expansion_wrangler*.
+        :class:`ExpansionWranglerInterface`. For distributed implementation, this
+        wrangler should be a subclass of
+        :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`.
+    :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
+        Passed unmodified to *expansion_wrangler*. For distributed
+        implementation, this argument is only significant on the root rank, but
+        worker ranks still need to supply a dummy vector.
+    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
+        timing information for the stages of the algorithm (in the form of
+        :class:`~boxtree.timing.TimingResult`), if such information is available.
+    :arg global_src_idx_all_ranks: Only used in the distributed implementation. A
+        :class:`list` of length ``nranks``, where the i-th entry is a
+        :class:`numpy.ndarray` representing the global indices of sources in the
+        local tree on rank *i*. Each entry can be returned from
+        *generate_local_tree*. This argument is only significant on the root rank.
+    :arg global_tgt_idx_all_ranks: Only used in the distributed implementation. A
+        :class:`list` of length ``nranks``, where the i-th entry is a
+        :class:`numpy.ndarray` representing the global indices of targets in the
+        local tree on rank *i*. Each entry can be returned from
+        *generate_local_tree*. This argument is only significant on the root rank.
+
+    :return: the potentials computed by *expansion_wrangler*. For the distributed
+        implementation, the potentials are gathered and returned on the root rank;
+        this function returns *None* on the worker ranks.
    """
-    tree = traversal.tree

-    wrangler = expansion_wrangler
+    traversal = wrangler.traversal

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

-    logger.info("start fmm")
+    fmm_proc = ProcessLogger(logger, "fmm")
+    from boxtree.timing import TimingRecorder
+    recorder = TimingRecorder()

-    logger.debug("reorder source weights")
+    src_weight_vecs = [wrangler.reorder_sources(weight) for
+        weight in src_weight_vecs]

-    src_weights = wrangler.reorder_sources(src_weights)
+    src_weight_vecs = wrangler.distribute_source_weights(
+        src_weight_vecs, global_src_idx_all_ranks)

    # {{{ "Step 2.1:" Construct local multipoles

-    logger.debug("construct local multipoles")
-
-    mpole_exps = wrangler.form_multipoles(
+    mpole_exps, timing_future = wrangler.form_multipoles(
+            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
-            src_weights)
+            src_weight_vecs)
+
+    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

-    logger.debug("propagate multipoles upward")
+    mpole_exps, timing_future = wrangler.coarsen_multipoles(
+            traversal.level_start_source_parent_box_nrs,
+            traversal.source_parent_boxes,
+            mpole_exps)

-    for lev in range(tree.nlevels-1, -1, -1):
-        start_parent_box, end_parent_box = \
-                traversal.level_start_source_parent_box_nrs[lev:lev+2]
-        wrangler.coarsen_multipoles(
-                traversal.source_parent_boxes[start_parent_box:end_parent_box],
-                mpole_exps)
+    recorder.add("coarsen_multipoles", timing_future)

    # mpole_exps is called Phi in [1]

    # }}}

-    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")
+    wrangler.communicate_mpoles(mpole_exps)

-    logger.debug("direct evaluation from neighbor source boxes ('list 1')")
+    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

-    potentials = wrangler.eval_direct(
+    potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
-            src_weights)
+            src_weight_vecs)
+
+    recorder.add("eval_direct", timing_future)

    # these potentials are called alpha in [1]

@@ -99,217 +420,114 @@ def drive_fmm(traversal, expansion_wrangler, src_weights):

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

-    logger.debug("translate separated siblings' ('list 2') mpoles to local")
-
-    local_exps = wrangler.multipole_to_local(
+    local_exps, timing_future = wrangler.multipole_to_local(
+            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
-            traversal.sep_siblings_starts,
-            traversal.sep_siblings_lists,
+            traversal.from_sep_siblings_starts,
+            traversal.from_sep_siblings_lists,
            mpole_exps)

+    recorder.add("multipole_to_local", timing_future)
+
    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

-    logger.debug("evaluate sep. smaller mpoles at particles ('list 3 far')")
-
    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

-    potentials = potentials + wrangler.eval_multipoles(
-            traversal.target_boxes,
-            traversal.sep_smaller_starts,
-            traversal.sep_smaller_lists,
+    mpole_result, timing_future = wrangler.eval_multipoles(
+            traversal.target_boxes_sep_smaller_by_source_level,
+            traversal.from_sep_smaller_by_level,
            mpole_exps)

+    recorder.add("eval_multipoles", timing_future)
+
+    potentials = potentials + mpole_result
+
    # these potentials are called beta in [1]

-    if traversal.sep_close_smaller_starts is not None:
+    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")

-        potentials = potentials + wrangler.eval_direct(
+        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
-                traversal.sep_close_smaller_starts,
-                traversal.sep_close_smaller_lists,
-                src_weights)
+                traversal.from_sep_close_smaller_starts,
+                traversal.from_sep_close_smaller_lists,
+                src_weight_vecs)

-    # }}}
+        recorder.add("eval_direct", timing_future)

-    # {{{ "Stage 6:" form locals for separated bigger mpoles ("list 4")
+        potentials = potentials + direct_result

-    logger.debug("form locals for separated bigger mpoles ('list 4 far')")
+    # }}}

-    local_exps = local_exps + wrangler.form_locals(
+    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")
+
+    local_result, timing_future = wrangler.form_locals(
+            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
-            traversal.sep_bigger_starts,
-            traversal.sep_bigger_lists,
-            src_weights)
+            traversal.from_sep_bigger_starts,
+            traversal.from_sep_bigger_lists,
+            src_weight_vecs)
+
+    recorder.add("form_locals", timing_future)

-    if traversal.sep_close_bigger_starts is not None:
-        logger.debug("evaluate separated close bigger interactions directly "
-                "('list 4 close')")
+    local_exps = local_exps + local_result
+
+    if traversal.from_sep_close_bigger_starts is not None:
+        direct_result, timing_future = wrangler.eval_direct(
+                traversal.target_boxes,
+                traversal.from_sep_close_bigger_starts,
+                traversal.from_sep_close_bigger_lists,
+                src_weight_vecs)

-        potentials = potentials + wrangler.eval_direct(
-                traversal.target_or_target_parent_boxes,
-                traversal.sep_close_bigger_starts,
-                traversal.sep_close_bigger_lists,
-                src_weights)
+        recorder.add("eval_direct", timing_future)
+
+        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

-    logger.debug("propagate local_exps downward")
+    local_exps, timing_future = wrangler.refine_locals(
+            traversal.level_start_target_or_target_parent_box_nrs,
+            traversal.target_or_target_parent_boxes,
+            local_exps)

-    for lev in range(1, tree.nlevels):
-        start_box, end_box = \
-                traversal.level_start_target_or_target_parent_box_nrs[lev:lev+2]
-        wrangler.refine_locals(
-                traversal.target_or_target_parent_boxes[start_box:end_box],
-                local_exps)
+    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ "Stage 8:" evaluate locals

-    logger.debug("evaluate locals")
-
-    potentials = potentials + wrangler.eval_locals(
+    local_result, timing_future = wrangler.eval_locals(
+            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

-    # }}}
-
-    logger.debug("reorder potentials")
-    result = wrangler.reorder_potentials(potentials)
-
-    logger.info("fmm complete")
-
-    return result
-
-
-# {{{ expansion wrangler interface
-
-class ExpansionWranglerInterface:
-    """Abstract expansion handling interface for use with :func:`drive_fmm`.
-
-    See this
-    `test code <https://github.com/inducer/boxtree/blob/master/test/test_fmm.py>`_
-    for a very simple sample implementation.
-
-    Will usually hold a reference (and thereby be specific to) a
-    :class:`boxtree.Tree` instance.
-    """
-
-    def multipole_expansion_zeros(self):
-        """Return an expansions array (which must support addition)
-        capable of holding one multipole or local expansion for every
-        box in the tree.
-        """
-
-    def local_expansion_zeros(self):
-        """Return an expansions array (which must support addition)
-        capable of holding one multipole or local expansion for every
-        box in the tree.
-        """
-
-    def potential_zeros(self):
-        """Return a potentials array (which must support addition) capable of
-        holding a potential value for each target in the tree. Note that
-        :func:`drive_fmm` makes no assumptions about *potential* other than
-        that it supports addition--it may consist of potentials, gradients of
-        the potential, or arbitrary other per-target output data.
-        """
+    recorder.add("eval_locals", timing_future)

-    def reorder_sources(self, source_array):
-        """Return a copy of *source_array* in
-        :ref:`tree source order <particle-orderings>`.
-        *source_array* is in user source order.
-        """
+    potentials = potentials + local_result

-    def reorder_potentials(self, potentials):
-        """Return a copy of *potentials* in
-        :ref:`user target order <particle-orderings>`.
-        *source_weights* is in tree target order.
-        """
-
-    def form_multipoles(self, source_boxes, src_weights):
-        """Return an expansions array (compatible with
-        :meth:`multipole_expansion_zeros`)
-        containing multipole expansions in *source_boxes* due to sources
-        with *src_weights*.
-        All other expansions must be zero.
-        """
-
-    def coarsen_multipoles(self, parent_boxes, mpoles):
-        """For each box in *parent_boxes*,
-        gather (and translate) the box's children's multipole expansions in
-        *mpole* and add the resulting expansion into the box's multipole
-        expansion in *mpole*.
-
-        :returns: *mpoles*
-        """
-
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
-            neighbor_sources_lists, src_weights):
-        """For each box in *target_boxes*, evaluate the influence of the
-        neigbor sources due to *src_weights*, which use :ref:`csr` and are
-        indexed like *target_boxes*.
-
-        :returns: a new potential array, see :meth:`potential_zeros`.
-        """
-
-    def multipole_to_local(self, target_or_target_parent_boxes,
-            starts, lists, mpole_exps):
-        """For each box in *target_or_target_parent_boxes*, translate and add
-        the influence of the multipole expansion in *mpole_exps* into a new
-        array of local expansions.  *starts* and *lists* use :ref:`csr`, and
-        *starts* is indexed like *target_or_target_parent_boxes*.
-
-        :returns: a new (local) expansion array, see
-            :meth:`local_expansion_zeros`.
-        """
-
-    def eval_multipoles(self, target_boxes, starts, lists, mpole_exps):
-        """For each box in *target_boxes*, evaluate the multipole expansion in
-        *mpole_exps* in the nearby boxes given in *starts* and *lists*, and
-        return a new potential array.  *starts* and *lists* use :ref:`csr` and
-        *starts* is indexed like *target_boxes*.
-
-        :returns: a new potential array, see :meth:`potential_zeros`.
-        """
-
-    def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights):
-        """For each box in *target_or_target_parent_boxes*, form local
-        expansions due to the sources in the nearby boxes given in *starts* and
-        *lists*, and return a new local expansion array.  *starts* and *lists*
-        use :ref:`csr` and *starts* is indexed like
-        *target_or_target_parent_boxes*.
+    # }}}

-        :returns: a new local expansion array, see
-            :meth:`local_expansion_zeros`.
-        """
-        pass
+    potentials = wrangler.gather_potential_results(
+                    potentials, global_tgt_idx_all_ranks)

-    def refine_locals(self, child_boxes, local_exps):
-        """For each box in *child_boxes*,
-        translate the box's parent's local expansion in *local_exps* and add
-        the resulting expansion into the box's local expansion in *local_exps*.
+    result = wrangler.reorder_potentials(potentials)

-        :returns: *local_exps*
-        """
+    result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0])

-    def eval_locals(self, target_boxes, local_exps):
-        """For each box in *target_boxes*, evaluate the local expansion in
-        *local_exps* and return a new potential array.
+    fmm_proc.done()

-        :returns: a new potential array, see :meth:`potential_zeros`.
-        """
+    if timing_data is not None:
+        timing_data.update(recorder.summarize())

-# }}}
+    return result


 # vim: filetype=pyopencl:fdm=marker
--- a/boxtree/geo_lookup.py
+++ b/boxtree/geo_lookup.py
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-from pytools import memoize_method, Record
-import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
-from mako.template import Template
-from boxtree.tools import AXIS_NAMES, DeviceDataRecord
-
-import logging
-logger = logging.getLogger(__name__)
-
-__doc__ = """
-Leaves -> overlapping balls
---------------------------
-
-.. autoclass:: LeavesToBallsLookupBuilder
-
-.. autoclass:: LeavesToBallsLookup
-"""
-
-
-# {{{ output
-
-class LeavesToBallsLookup(DeviceDataRecord):
-    """
-    .. attribute:: tree
-
-        The :class:`boxtree.Tree` instance used to build this lookup.
-
-    .. attribute:: balls_near_box_starts
-
-        Indices into :attr:`balls_near_box_lists`.
-        ``balls_near_box_lists[balls_near_box_starts[ibox]:
-        balls_near_box_starts[ibox]+1]``
-        results in a list of balls that overlap leaf box *ibox*.
-
-        .. note:: Only leaf boxes have non-empty entries in this table. Nonetheless,
-            this list is indexed by the global box index.
-
-    .. attribute:: balls_near_box_lists
-
-    .. automethod:: get
-    """
-
-# }}}
-
-# {{{ kernel templates
-
-BALLS_TO_LEAVES_TEMPLATE = r"""//CL//
-typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
-
-void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
-{
-    coord_vec_t ball_center;
-    %for i in range(dimensions):
-        ball_center.${AXIS_NAMES[i]} = ball_${AXIS_NAMES[i]}[ball_nr];
-    %endfor
-
-    coord_t ball_radius = ball_radii[ball_nr];
-
-    // To find overlapping leaves, start at the top of the tree, descend
-    // into overlapping boxes.
-    ${walk_init(0)}
-
-    while (continue_walk)
-    {
-        box_id_t child_box_id = box_child_ids[
-            walk_morton_nr * aligned_nboxes + walk_box_id];
-        dbg_printf(("  walk box id: %d morton: %d child id: %d level: %d\n",
-            walk_box_id, walk_morton_nr, child_box_id, walk_level));
-
-        if (child_box_id)
-        {
-            bool is_overlapping;
-
-            ${check_l_infty_ball_overlap(
-                "is_overlapping", "child_box_id", "ball_radius", "ball_center")}
-
-            if (is_overlapping)
-            {
-                if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
-                {
-                    APPEND_ball_numbers(ball_nr);
-                    APPEND_overlapping_leaves(child_box_id);
-                }
-                else
-                {
-                    // We want to descend into this box. Put the current state
-                    // on the stack.
-
-                    ${walk_push("child_box_id")}
-                    continue;
-                }
-            }
-        }
-
-        ${walk_advance()}
-    }
-}
-"""
-
-
-class _KernelInfo(Record):
-    pass
-
-
-class LeavesToBallsLookupBuilder(object):
-    """Given a set of :math:`l^\infty` "balls", this class helps build a
-    look-up table from leaf boxes to balls that overlap with each leaf box.
-
-    .. automethod:: __call__
-
-    """
-    def __init__(self, context):
-        self.context = context
-
-        from pyopencl.algorithm import KeyValueSorter
-        self.key_value_sorter = KeyValueSorter(context)
-
-    @memoize_method
-    def get_balls_to_leaves_kernel(self, dimensions, coord_dtype, box_id_dtype,
-            ball_id_dtype, max_levels, stick_out_factor):
-        from pyopencl.tools import dtype_to_ctype
-        from boxtree import box_flags_enum
-        render_vars = dict(
-                dimensions=dimensions,
-                dtype_to_ctype=dtype_to_ctype,
-                box_id_dtype=box_id_dtype,
-                particle_id_dtype=None,
-                ball_id_dtype=ball_id_dtype,
-                coord_dtype=coord_dtype,
-                vec_types=cl.array.vec.types,
-                max_levels=max_levels,
-                AXIS_NAMES=AXIS_NAMES,
-                box_flags_enum=box_flags_enum,
-                debug=False,
-                stick_out_factor=stick_out_factor,
-                )
-
-        logger.info("start building leaves-to-balls lookup kernel")
-
-        from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
-
-        src = Template(
-                TRAVERSAL_PREAMBLE_TEMPLATE
-                + BALLS_TO_LEAVES_TEMPLATE,
-                strict_undefined=True).render(**render_vars)
-
-        from pyopencl.tools import VectorArg, ScalarArg
-        from pyopencl.algorithm import ListOfListsBuilder
-        result = ListOfListsBuilder(self.context,
-                [
-                    ("ball_numbers", ball_id_dtype),
-                    ("overlapping_leaves", box_id_dtype),
-                    ],
-                str(src),
-                arg_decls=[
-                    VectorArg(box_flags_enum.dtype, "box_flags"),
-                    VectorArg(coord_dtype, "box_centers"),
-                    VectorArg(box_id_dtype, "box_child_ids"),
-                    VectorArg(np.uint8, "box_levels"),
-                    ScalarArg(coord_dtype, "root_extent"),
-                    ScalarArg(box_id_dtype, "aligned_nboxes"),
-                    VectorArg(coord_dtype, "ball_radii"),
-                    ] + [
-                        VectorArg(coord_dtype, "ball_"+ax)
-                        for ax in AXIS_NAMES[:dimensions]],
-                name_prefix="circles_to_balls",
-                count_sharing={
-                    # /!\ This makes a promise that APPEND_ball_numbers will
-                    # always occur *before* APPEND_overlapping_leaves.
-                    "overlapping_leaves": "ball_numbers"
-                    },
-                complex_kernel=True)
-
-        logger.info("done building leaves-to-balls lookup kernel")
-
-        return result
-
-    def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
-        """
-        :arg queue: a :class:`pyopencl.CommandQueue`
-        :arg tree: a :class:`boxtree.Tree`.
-        :arg ball_centers: an object array of coordinate
-            :class:`pyopencl.array.Array` instances.
-            Their *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: a
-            :class:`pyopencl.array.Array`
-            of positive numbers.
-            Its *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            exeuction.
-        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
-            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
-            for dependency management.
-        """
-
-        from pytools import single_valued
-        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
-            raise TypeError("ball_centers dtype must match tree.coord_dtype")
-        if ball_radii.dtype != tree.coord_dtype:
-            raise TypeError("ball_radii dtype must match tree.coord_dtype")
-
-        ball_id_dtype = tree.particle_id_dtype  # ?
-
-        from pytools import div_ceil
-        # Avoid generating too many kernels.
-        max_levels = div_ceil(tree.nlevels, 10) * 10
-
-        b2l_knl = self.get_balls_to_leaves_kernel(
-                tree.dimensions, tree.coord_dtype,
-                tree.box_id_dtype, ball_id_dtype,
-                max_levels, tree.stick_out_factor)
-
-        logger.info("leaves-to-balls lookup: prepare ball list")
-
-        nballs = len(ball_radii)
-        result, evt = b2l_knl(
-                queue, nballs,
-                tree.box_flags.data, tree.box_centers.data,
-                tree.box_child_ids.data, tree.box_levels.data,
-                tree.root_extent, tree.aligned_nboxes,
-                ball_radii.data, *tuple(bc.data for bc in ball_centers),
-                wait_for=wait_for)
-        wait_for = [evt]
-
-        logger.info("leaves-to-balls lookup: key-value sort")
-
-        balls_near_box_starts, balls_near_box_lists, evt \
-                = self.key_value_sorter(
-                        queue,
-                        # keys
-                        result["overlapping_leaves"].lists,
-                        # values
-                        result["ball_numbers"].lists,
-                        tree.nboxes, starts_dtype=tree.box_id_dtype,
-                        wait_for=wait_for)
-
-        logger.info("leaves-to-balls lookup: built")
-
-        return LeavesToBallsLookup(
-                tree=tree,
-                balls_near_box_starts=balls_near_box_starts,
-                balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
-
-# }}}
-
-# vim: filetype=pyopencl:fdm=marker
--- a/boxtree/pyfmmlib_integration.py
+++ b/boxtree/pyfmmlib_integration.py
-from __future__ import division
+"""
+Integrates :mod:`boxtree` with
+`pyfmmlib <https://pypi.org/project/pyfmmlib>`__.
+
+.. autoclass:: FMMLibTreeIndependentDataForWrangler
+.. autoclass:: FMMLibExpansionWrangler
+
+Internal bits
+^^^^^^^^^^^^^

-"""Integration between boxtree and pyfmmlib."""
+.. autoclass:: FMMLibRotationDataInterface
+.. autoclass:: FMMLibRotationData
+.. autoclass:: FMMLibRotationDataNotSuppliedWarning
+"""

 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

@@ -25,31 +36,482 @@ THE SOFTWARE.
 """


+import logging
+
+
+logger = logging.getLogger(__name__)
+import enum
+
 import numpy as np

+from pytools import log_process, memoize_method

-__doc__ = """Integrates :mod:`boxtree` with
-`pyfmmlib <http://pypi.python.org/pypi/pyfmmlib>`_.
-"""
+from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
+from boxtree.timing import return_timing_data
+
+
+# {{{ rotation data interface
+
+class FMMLibRotationDataInterface:
+    """Abstract interface for additional, optional data for precomputation of
+    rotation matrices passed to the expansion wrangler.
+
+    .. automethod:: m2l_rotation_lists
+
+    .. automethod:: m2l_rotation_angles
+
+    """
+
+    def m2l_rotation_lists(self):
+        """Return a :mod:`numpy` array mapping entries of List 2 to rotation classes.
+        """
+        raise NotImplementedError
+
+    def m2l_rotation_angles(self):
+        """Return a :mod:`numpy` array mapping List 2 rotation classes to
+        rotation angles.
+        """
+        raise NotImplementedError
+
+
+class FMMLibRotationData(FMMLibRotationDataInterface):
+    """An implementation of the :class:`FMMLibRotationDataInterface`.
+
+    .. automethod:: __init__
+    """
+
+    def __init__(self, queue, trav):
+        self.queue = queue
+        self.trav = trav
+        self.tree = trav.tree
+
+    @property
+    @memoize_method
+    def rotation_classes_builder(self):
+        from boxtree.rotation_classes import RotationClassesBuilder
+        return RotationClassesBuilder(self.queue.context)
+
+    @memoize_method
+    def build_rotation_classes_lists(self):
+        trav = self.trav.to_device(self.queue)
+        tree = self.tree.to_device(self.queue)
+        return self.rotation_classes_builder(self.queue, trav, tree)[0]
+
+    @memoize_method
+    def m2l_rotation_lists(self):
+        return (self
+                .build_rotation_classes_lists()
+                .from_sep_siblings_rotation_classes
+                .get(self.queue))
+
+    @memoize_method
+    def m2l_rotation_angles(self):
+        return (self
+                .build_rotation_classes_lists()
+                .from_sep_siblings_rotation_class_to_angle
+                .get(self.queue))
+
+
+class FMMLibRotationDataNotSuppliedWarning(UserWarning):
+    pass
+
+# }}}
+
+
+@enum.unique
+class Kernel(enum.Enum):
+    LAPLACE = enum.auto()
+    HELMHOLTZ = enum.auto()
+
+
+# {{{ tree-independent data for wrangler
+
+class FMMLibTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
+    """
+    .. automethod:: __init__
+    """
+
+    def __init__(self, dim, kernel, ifgrad=False):
+        self.dim = dim
+        self.ifgrad = ifgrad
+        self.kernel = kernel
+
+        if kernel == Kernel.LAPLACE:
+            self.eqn_letter = "l"
+        elif kernel == Kernel.HELMHOLTZ:
+            self.eqn_letter = "h"
+        else:
+            raise ValueError(kernel)
+
+        self.dtype = np.complex128
+
+    # {{{ routine getters
+
+    def get_routine(self, name, suffix=""):
+        import pyfmmlib
+        return getattr(pyfmmlib, f"{self.eqn_letter}{name % self.dim}{suffix}")
+
+    def get_vec_routine(self, name):
+        return self.get_routine(name, "_vec")
+
+    def get_translation_routine(self, wrangler, name, vec_suffix="_vec"):
+        suffix = ""
+        if self.dim == 3:
+            suffix = "quadu"
+        suffix += vec_suffix
+
+        rout = self.get_routine(name, suffix)
+
+        if self.dim == 2:
+            def wrapper(*args, **kwargs):
+                # not used
+                kwargs.pop("level_for_projection", None)
+
+                return rout(*args, **kwargs)
+        else:
+
+            def wrapper(*args, **kwargs):
+                kwargs.pop("level_for_projection", None)
+                nterms2 = kwargs["nterms2"]
+                kwargs.update(wrangler.projection_quad_extra_kwargs(order=nterms2))
+
+                val, ier = rout(*args, **kwargs)
+                if (ier != 0).any():
+                    raise RuntimeError(f"{name} failed with nonzero ier")

+                return val

-class Helmholtz2DExpansionWrangler:
+        # Doesn't work in Py2
+        # from functools import update_wrapper
+        # update_wrapper(wrapper, rout)
+        return wrapper
+
+    def get_direct_eval_routine(self, use_dipoles):
+        if self.dim == 2:
+            rout = self.get_vec_routine(
+                    "potgrad%ddall" + ("_dp" if use_dipoles else ""))
+
+            def wrapper(*args, **kwargs):
+                kwargs["ifgrad"] = self.ifgrad
+                kwargs["ifhess"] = False
+                pot, grad, _hess = rout(*args, **kwargs)
+
+                if not self.ifgrad:
+                    grad = 0
+
+                return pot, grad
+
+            # Doesn't work in Py2
+            # from functools import update_wrapper
+            # update_wrapper(wrapper, rout)
+            return wrapper
+
+        elif self.dim == 3:
+            rout = self.get_vec_routine(
+                    "potfld%ddall" + ("_dp" if use_dipoles else ""))
+
+            def wrapper(*args, **kwargs):
+                kwargs["iffld"] = self.ifgrad
+                pot, fld = rout(*args, **kwargs)
+                grad = -fld if self.ifgrad else 0
+
+                return pot, grad
+
+            # Doesn't work in Py2
+            # from functools import update_wrapper
+            # update_wrapper(wrapper, rout)
+            return wrapper
+        else:
+            raise ValueError("unsupported dimensionality")
+
+    def get_expn_eval_routine(self, expn_kind):
+        name = f"%dd{expn_kind}eval"
+        rout = self.get_routine(name, "_vec")
+
+        if self.dim == 2:
+            def wrapper(*args, **kwargs):
+                kwargs["ifgrad"] = self.ifgrad
+                kwargs["ifhess"] = False
+
+                pot, grad, _hess = rout(*args, **kwargs)
+                if not self.ifgrad:
+                    grad = 0
+
+                return pot, grad
+
+            # Doesn't work in Py2
+            # from functools import update_wrapper
+            # update_wrapper(wrapper, rout)
+            return wrapper
+
+        elif self.dim == 3:
+            def wrapper(*args, **kwargs):
+                kwargs["iffld"] = self.ifgrad
+                pot, fld, ier = rout(*args, **kwargs)
+
+                if (ier != 0).any():
+                    raise RuntimeError(f"{name} failed with nonzero ier")
+
+                grad = -fld if self.ifgrad else 0
+                return pot, grad
+
+            # Doesn't work in Py2
+            # from functools import update_wrapper
+            # update_wrapper(wrapper, rout)
+            return wrapper
+        else:
+            raise ValueError("unsupported dimensionality")
+
+    # }}}
+
+# }}}
+
+
+# {{{ wrangler
+
+class FMMLibExpansionWrangler(ExpansionWranglerInterface):
    """Implements the :class:`boxtree.fmm.ExpansionWranglerInterface`
    by using pyfmmlib.
+
+    Timing results returned by this wrangler contain the values *wall_elapsed*
+    and (optionally, if supported) *process_elapsed*, which measure wall time
+    and process time in seconds, respectively.
    """

-    def __init__(self, tree, helmholtz_k, nterms):
-        self.tree = tree
+    # {{{ constructor
+
+    def __init__(self, tree_indep, traversal, *,
+            helmholtz_k=None, fmm_level_to_order=None,
+            dipole_vec=None, dipoles_already_reordered=False, order=None,
+            optimized_m2l_precomputation_memory_cutoff_bytes=10**8,
+            rotation_data=None):
+        """
+        :arg fmm_level_to_order: A callable that, upon being passed the tree
+            and the tree level as an integer, returns the order for the multipole and
+            local expansions on that level.
+        :arg rotation_data: Either *None* or an instance of the
+            :class:`FMMLibRotationDataInterface`. In three dimensions, passing
+            *rotation_data* enables optimized M2L (List 2) translations.
+            In two dimensions, this does nothing.
+        :arg optimized_m2l_precomputation_memory_cutoff_bytes: When using
+            optimized List 2 translations, an upper bound in bytes on the
+            amount of storage to use for a precomputed rotation matrix.
+        """
+
+        if order is not None and fmm_level_to_order is not None:
+            raise TypeError("may specify either fmm_level_to_order or order, "
+                    "but not both")
+
+        if order is not None:
+            from warnings import warn
+            warn("Passing order is deprecated. Pass fmm_level_to_order instead.",
+                    DeprecationWarning, stacklevel=2)
+
+            def fmm_level_to_order(tree, level):  # pylint:disable=function-redefined
+                return order
+
+        super().__init__(tree_indep, traversal)
+
+        if tree_indep.kernel == Kernel.LAPLACE:
+            self.kernel_kwargs = {}
+            self.rscale_factor = 1
+
+            if helmholtz_k:
+                raise ValueError(
+                        "helmholtz_k must be zero or unspecified for Laplace")
+
+            helmholtz_k = 0
+
+        elif tree_indep.kernel == Kernel.HELMHOLTZ:
+            self.kernel_kwargs = {"zk": helmholtz_k}
+
+            if not helmholtz_k:
+                raise ValueError(
+                        "helmholtz_k must be specified and nonzero")
+
+            self.rscale_factor = abs(helmholtz_k)
+
+        else:
+            raise ValueError(tree_indep.kernel)
+
        self.helmholtz_k = helmholtz_k
-        self.nterms = nterms

-    def multipole_expansion_zeros(self):
-        return np.zeros((self.tree.nboxes, 2*self.nterms+1), dtype=np.complex128)
+        tree = traversal.tree
+
+        if tree_indep.dim != tree.dimensions:
+            raise ValueError(f"Kernel dim ({tree_indep.dim}) "
+                    f"does not match tree dim ({tree.dimensions})")

-    local_expansion_zeros = multipole_expansion_zeros
+        self.level_orders = np.array([
+            fmm_level_to_order(tree, lev) for lev in range(tree.nlevels)
+            ], dtype=np.int32)

-    def potential_zeros(self):
-        return np.zeros(self.tree.ntargets, dtype=np.complex128)
+        if tree_indep.kernel == Kernel.HELMHOLTZ:
+            logger.info("expansion orders by level used in Helmholtz FMM: %s",
+                    self.level_orders)
+
+        self.rotation_data = rotation_data
+        self.rotmat_cutoff_bytes = optimized_m2l_precomputation_memory_cutoff_bytes
+
+        if self.dim == 3:
+            if rotation_data is None:
+                from warnings import warn
+                warn(
+                        "List 2 (multipole-to-local) translations will be "
+                        "unoptimized. Supply a rotation_data argument to "
+                        "FMMLibExpansionWrangler for optimized List 2.",
+                        FMMLibRotationDataNotSuppliedWarning,
+                        stacklevel=2)
+
+            self.supports_optimized_m2l = rotation_data is not None
+        else:
+            self.supports_optimized_m2l = False
+
+        # FIXME: dipole_vec shouldn't be stored here! Otherwise, we'll recompute
+        # bunches of tree-dependent stuff for every new dipole vector.
+
+        # It's not super bad because the dipole vectors are typically geometry
+        # normals and thus change about at the same time as the tree... but there's
+        # still no reason for them to be here.
+        self.use_dipoles = dipole_vec is not None
+        if self.use_dipoles:
+            assert dipole_vec.shape == (self.dim, self.tree.nsources)
+
+            if not dipoles_already_reordered:
+                dipole_vec = self.reorder_sources(dipole_vec)
+
+            self.dipole_vec = dipole_vec.copy(order="F")
+        else:
+            self.dipole_vec = None
+
+    # }}}
+
+    @property
+    def dim(self):
+        return self.tree.dimensions
+
+    def level_to_rscale(self, level):
+        result = self.tree.root_extent * 2 ** -level * self.rscale_factor
+        if abs(result) > 1:
+            result = 1
+        if self.dim == 3 and self.tree_indep.eqn_letter == "l":
+            # Laplace 3D uses the opposite convention compared to
+            # all other cases.
+            # https://gitlab.tiker.net/inducer/boxtree/merge_requests/81
+            result = 1 / result
+        return result
+
+    @memoize_method
+    def projection_quad_extra_kwargs(self, level=None, order=None):
+        if level is None and order is None:
+            raise TypeError("must pass exactly one of level or order")
+        if level is not None and order is not None:
+            raise TypeError("must pass exactly one of level or order")
+        if level is not None:
+            order = self.level_orders[level]
+
+        common_extra_kwargs = {}
+
+        if self.dim == 3 and self.tree_indep.eqn_letter == "h":
+            nquad = max(6, int(2.5*order))
+            from pyfmmlib import legewhts
+            xnodes, weights = legewhts(nquad, ifwhts=1)
+
+            common_extra_kwargs = {
+                    "xnodes": xnodes,
+                    "wts": weights,
+                    }
+
+        return common_extra_kwargs
+
+    # {{{ overridable target lists for the benefit of the QBX FMM
+
+    def box_target_starts(self):
+        return self.tree.box_target_starts
+
+    def box_target_counts_nonchild(self):
+        return self.tree.box_target_counts_nonchild
+
+    def targets(self):
+        return self.tree.targets
+
+    # }}}
+
+    # {{{ level starts
+
+    def _expansions_level_starts(self, order_to_size):
+        result = [0]
+        for lev in range(self.tree.nlevels):
+            lev_nboxes = (
+                    self.tree.level_start_box_nrs[lev+1]
+                    - self.tree.level_start_box_nrs[lev])
+
+            expn_size = order_to_size(self.level_orders[lev])
+            result.append(
+                    result[-1]
+                    + expn_size * lev_nboxes)
+
+        return result
+
+    @memoize_method
+    def multipole_expansions_level_starts(self):
+        from pytools import product
+        return self._expansions_level_starts(
+                lambda order: product(
+                    self.expansion_shape(order)))
+
+    @memoize_method
+    def local_expansions_level_starts(self):
+        from pytools import product
+        return self._expansions_level_starts(
+                lambda order: product(
+                    self.expansion_shape(order)))
+
+    # }}}
+
+    # {{{ views into arrays of expansions
+
+    def multipole_expansions_view(self, mpole_exps, level):
+        box_start, box_stop = self.tree.level_start_box_nrs[level:level+2]
+
+        expn_start, expn_stop = \
+                self.multipole_expansions_level_starts()[level:level+2]
+        return (box_start,
+                mpole_exps[expn_start:expn_stop].reshape(
+                    box_stop-box_start,
+                    *self.expansion_shape(self.level_orders[level])))
+
+    def local_expansions_view(self, local_exps, level):
+        box_start, box_stop = self.tree.level_start_box_nrs[level:level+2]
+
+        expn_start, expn_stop = \
+                self.local_expansions_level_starts()[level:level+2]
+        return (box_start,
+                local_exps[expn_start:expn_stop].reshape(
+                    box_stop-box_start,
+                    *self.expansion_shape(self.level_orders[level])))
+
+    # }}}
+
+    def get_source_kwargs(self, src_weights, pslice):
+        if self.dipole_vec is None:
+            return {
+                    "charge": src_weights[pslice],
+                    }
+        else:
+            if self.tree_indep.eqn_letter == "l" and self.dim == 2:
+                return {
+                        "dipstr": -src_weights[pslice] * (
+                            self.dipole_vec[0, pslice]
+                            + 1j * self.dipole_vec[1, pslice])
+                        }
+            else:
+                return {
+                        "dipstr": src_weights[pslice],
+                        "dipvec": self.dipole_vec[:, pslice],
+                        }
+
+    # {{{ source/target particle wrangling

    def _get_source_slice(self, ibox):
        pstart = self.tree.box_source_starts[ibox]
@@ -57,75 +519,268 @@ class Helmholtz2DExpansionWrangler:
                pstart, pstart + self.tree.box_source_counts_nonchild[ibox])

    def _get_target_slice(self, ibox):
-        pstart = self.tree.box_target_starts[ibox]
+        pstart = self.box_target_starts()[ibox]
        return slice(
-                pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
+                pstart, pstart + self.box_target_counts_nonchild()[ibox])
+
+    @memoize_method
+    def _get_single_sources_array(self):
+        return np.array([
+            self.tree.sources[idim]
+            for idim in range(self.dim)
+            ], order="F")

    def _get_sources(self, pslice):
-        # FIXME yuck!
+        return self._get_single_sources_array()[:, pslice]
+
+    @memoize_method
+    def _get_single_targets_array(self):
        return np.array([
-            self.tree.sources[idim][pslice]
-            for idim in range(self.tree.dimensions)
+            self.targets()[idim]
+            for idim in range(self.dim)
            ], order="F")

    def _get_targets(self, pslice):
-        # FIXME yuck!
+        return self._get_single_targets_array()[:, pslice]
+
+    @memoize_method
+    def _get_single_box_centers_array(self):
        return np.array([
-            self.tree.targets[idim][pslice]
-            for idim in range(self.tree.dimensions)
+            self.tree.box_centers[idim]
+            for idim in range(self.dim)
            ], order="F")

+    # }}}
+
+    # {{{ precompute rotation matrices for optimized m2l
+
+    @memoize_method
+    def m2l_rotation_matrices(self):
+        # Returns a tuple (rotmatf, rotmatb, rotmat_order), consisting of the
+        # forward rotation matrices, backward rotation matrices, and the
+        # translation order of the matrices. rotmat_order is -1 if not
+        # supported.
+
+        rotmatf = None
+        rotmatb = None
+        rotmat_order = -1
+
+        if not self.supports_optimized_m2l:
+            return (rotmatf, rotmatb, rotmat_order)
+
+        m2l_rotation_angles = self.rotation_data.m2l_rotation_angles()
+
+        if len(m2l_rotation_angles) == 0:
+            # The pyfmmlib wrapper may or may not complain if you give it a
+            # zero-length array.
+            return (rotmatf, rotmatb, rotmat_order)
+
+        def mem_estimate(order):
+            # Rotation matrix memory cost estimate.
+            return (8
+                    * (order + 1)**2
+                    * (2*order + 1)
+                    * len(m2l_rotation_angles))
+
+        # Find the largest order we can use. Because the memory cost of the
+        # matrices could be large, only precompute them if the cost estimate
+        # for the order does not exceed the cutoff.
+        for order in sorted(self.level_orders, reverse=True):
+            if mem_estimate(order) < self.rotmat_cutoff_bytes:
+                rotmat_order = order
+                break
+
+        if rotmat_order == -1:
+            return (rotmatf, rotmatb, rotmat_order)
+
+        # Compute the rotation matrices.
+        from pyfmmlib import rotviarecur3p_init_vec as rotmat_builder
+
+        ier, rotmatf = (
+                rotmat_builder(rotmat_order, m2l_rotation_angles))
+        assert (ier == 0).all()
+
+        ier, rotmatb = (
+                rotmat_builder(rotmat_order, -m2l_rotation_angles))
+        assert (ier == 0).all()
+
+        return (rotmatf, rotmatb, rotmat_order)
+
+    # }}}
+
+    # {{{ data vector utilities
+
+    def expansion_shape(self, order):
+        if self.dim == 2 and self.tree_indep.eqn_letter == "l":
+            return (order+1,)
+        elif self.dim == 2 and self.tree_indep.eqn_letter == "h":
+            return (2*order+1,)
+        elif self.dim == 3:
+            # This is the transpose of the Fortran format, to
+            # minimize mismatch between C and Fortran orders.
+            return (2*order+1, order+1,)
+        else:
+            raise ValueError("unsupported dimensionality")
+
+    def multipole_expansion_zeros(self):
+        """Return an expansions array (which must support addition)
+        capable of holding one multipole or local expansion for every
+        box in the tree.
+        """
+
+        return np.zeros(
+                self.multipole_expansions_level_starts()[-1],
+                dtype=self.tree_indep.dtype)
+
+    def local_expansion_zeros(self):
+        """Return an expansions array (which must support addition)
+        capable of holding one multipole or local expansion for every
+        box in the tree.
+        """
+        return np.zeros(
+                self.local_expansions_level_starts()[-1],
+                dtype=self.tree_indep.dtype)
+
+    def output_zeros(self):
+        """Return a potentials array (which must support addition) capable of
+        holding a potential value for each target in the tree. Note that
+        :func:`drive_fmm` makes no assumptions about *potential* other than
+        that it supports addition--it may consist of potentials, gradients of
+        the potential, or arbitrary other per-target output data.
+        """
+
+        if self.tree_indep.ifgrad:
+            from pytools.obj_array import make_obj_array
+            return make_obj_array([
+                    np.zeros(self.tree.ntargets, self.tree_indep.dtype)
+                    for i in range(1 + self.dim)])
+        else:
+            return np.zeros(self.tree.ntargets, self.tree_indep.dtype)
+
+    def add_potgrad_onto_output(self, output, output_slice, pot, grad):
+        if self.tree_indep.ifgrad:
+            output[0, output_slice] += pot
+            output[1:, output_slice] += grad
+        else:
+            output[output_slice] += pot
+
+    # }}}
+
+    @log_process(logger)
    def reorder_sources(self, source_array):
-        return source_array[self.tree.user_source_ids]
+        return source_array[..., self.tree.user_source_ids]

+    @log_process(logger)
    def reorder_potentials(self, potentials):
        return potentials[self.tree.sorted_target_ids]

-    def form_multipoles(self, source_boxes, src_weights):
-        rscale = 1  # FIXME
-
-        from pyfmmlib import h2dformmp
+    @log_process(logger)
+    @return_timing_data
+    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+            src_weight_vecs):
+        src_weights, = src_weight_vecs
+        formmp = self.tree_indep.get_routine(
+                "%ddformmp" + ("_dp" if self.use_dipoles else ""))

        mpoles = self.multipole_expansion_zeros()
-        for src_ibox in source_boxes:
-            pslice = self._get_source_slice(src_ibox)
-
-            if pslice.stop - pslice.start == 0:
+        for lev in range(self.tree.nlevels):
+            start, stop = level_start_source_box_nrs[lev:lev+2]
+            if start == stop:
                continue

-            ier, mpoles[src_ibox] = h2dformmp(
-                    self.helmholtz_k, rscale, self._get_sources(pslice),
-                    src_weights[pslice],
-                    self.tree.box_centers[:, src_ibox], self.nterms)
-            if ier:
-                raise RuntimeError("h2dformmp failed")
+            level_start_ibox, mpoles_view = self.multipole_expansions_view(
+                    mpoles, lev)
+
+            rscale = self.level_to_rscale(lev)
+
+            for src_ibox in source_boxes[start:stop]:
+                pslice = self._get_source_slice(src_ibox)
+
+                if pslice.stop - pslice.start == 0:
+                    continue
+
+                kwargs = {}
+                kwargs.update(self.kernel_kwargs)
+                kwargs.update(self.get_source_kwargs(src_weights, pslice))
+
+                ier, mpole = formmp(
+                        rscale=rscale,
+                        source=self._get_sources(pslice),
+                        center=self.tree.box_centers[:, src_ibox],
+                        nterms=self.level_orders[lev],
+                        **kwargs)
+
+                if ier:
+                    raise RuntimeError("formmp failed")
+
+                mpoles_view[src_ibox-level_start_ibox] = mpole.T

        return mpoles

-    def coarsen_multipoles(self, parent_boxes, mpoles):
+    @log_process(logger)
+    @return_timing_data
+    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
+            source_parent_boxes, mpoles):
        tree = self.tree
-        rscale = 1  # FIXME

-        from pyfmmlib import h2dmpmp_vec
+        mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp")
+
+        # nlevels-1 is the last valid level index
+        # nlevels-2 is the last valid level that could have children
+        #
+        # 3 is the last relevant source_level.
+        # 2 is the last relevant target_level.
+        # (because no level 1 box will be well-separated from another)
+        for source_level in range(tree.nlevels-1, 2, -1):
+            target_level = source_level - 1
+            start, stop = level_start_source_parent_box_nrs[
+                            target_level:target_level+2]
+
+            source_level_start_ibox, source_mpoles_view = \
+                    self.multipole_expansions_view(mpoles, source_level)
+            target_level_start_ibox, target_mpoles_view = \
+                    self.multipole_expansions_view(mpoles, target_level)
+
+            source_rscale = self.level_to_rscale(source_level)
+            target_rscale = self.level_to_rscale(target_level)

-        for ibox in parent_boxes:
-            parent_center = tree.box_centers[:, ibox]
-            for child in tree.box_child_ids[:, ibox]:
-                if child:
-                    child_center = tree.box_centers[:, child]
+            for ibox in source_parent_boxes[start:stop]:
+                parent_center = tree.box_centers[:, ibox]
+                for child in tree.box_child_ids[:, ibox]:
+                    if child:
+                        child_center = tree.box_centers[:, child]

-                    new_mp = h2dmpmp_vec(
-                            self.helmholtz_k,
-                            rscale, child_center, mpoles[child],
-                            rscale, parent_center, self.nterms)
+                        kwargs = {}
+                        if self.dim == 3 and self.tree_indep.eqn_letter == "h":
+                            kwargs["radius"] = tree.root_extent * 2**(-target_level)

-                    mpoles[ibox] += new_mp[:, 0]
+                        kwargs.update(self.kernel_kwargs)
+
+                        new_mp = mpmp(
+                                rscale1=source_rscale,
+                                center1=child_center,
+                                expn1=source_mpoles_view[
+                                    child - source_level_start_ibox].T,
+
+                                rscale2=target_rscale,
+                                center2=parent_center,
+                                nterms2=self.level_orders[target_level],
+
+                                **kwargs)
+
+                        target_mpoles_view[
+                                ibox - target_level_start_ibox] += new_mp[..., 0].T
+
+        return mpoles

+    @log_process(logger)
+    @return_timing_data
    def eval_direct(self, target_boxes, neighbor_sources_starts,
-            neighbor_sources_lists, src_weights):
-        pot = self.potential_zeros()
+            neighbor_sources_lists, src_weight_vecs):
+        src_weights, = src_weight_vecs
+        output = self.output_zeros()

-        from pyfmmlib import hpotgrad2dall_vec
+        ev = self.tree_indep.get_direct_eval_routine(self.use_dipoles)

        for itgt_box, tgt_ibox in enumerate(target_boxes):
            tgt_pslice = self._get_target_slice(tgt_ibox)
@@ -133,7 +788,11 @@ class Helmholtz2DExpansionWrangler:
            if tgt_pslice.stop - tgt_pslice.start == 0:
                continue

-            tgt_result = np.zeros(tgt_pslice.stop - tgt_pslice.start, np.complex128)
+            # tgt_result = np.zeros(
+            #         tgt_pslice.stop - tgt_pslice.start, self.tree_indep.dtype)
+            tgt_pot_result = 0
+            tgt_grad_result = 0
+
            start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
            for src_ibox in neighbor_sources_lists[start:end]:
                src_pslice = self._get_source_slice(src_ibox)
@@ -141,140 +800,363 @@ class Helmholtz2DExpansionWrangler:
                if src_pslice.stop - src_pslice.start == 0:
                    continue

-                tmp_pot, _, _ = hpotgrad2dall_vec(
-                        ifgrad=False, ifhess=False,
+                kwargs = {}
+                kwargs.update(self.kernel_kwargs)
+                kwargs.update(self.get_source_kwargs(src_weights, src_pslice))
+
+                tmp_pot, tmp_grad = ev(
                        sources=self._get_sources(src_pslice),
-                        charge=src_weights[src_pslice],
-                        targets=self._get_targets(tgt_pslice), zk=self.helmholtz_k)
+                        targets=self._get_targets(tgt_pslice),
+                        **kwargs)

-                tgt_result += tmp_pot
+                tgt_pot_result += tmp_pot
+                tgt_grad_result += tmp_grad

-            pot[tgt_pslice] = tgt_result
+            self.add_potgrad_onto_output(
+                    output, tgt_pslice, tgt_pot_result, tgt_grad_result)

-        return pot
+        return output

-    def multipole_to_local(self, target_or_target_parent_boxes,
+    @log_process(logger)
+    @return_timing_data
+    def multipole_to_local(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes,
            starts, lists, mpole_exps):
+        """Translate multipole expansions into local expansions, level by level.
+
+        For every level whose range in
+        *level_start_target_or_target_parent_box_nrs* is nonempty, a single
+        call to the ``_imany`` variant of the ``mploc`` routine handles all
+        target boxes of that level. *starts* and *lists* supply, in CSR form,
+        the source boxes belonging to each target box. If the expansion order
+        of the level does not exceed the order of the precomputed rotation
+        matrices, the ``2_trunc_imany`` variant is used with those matrices.
+
+        :returns: the array obtained from ``local_expansion_zeros()``, after
+            the translated expansions have been added through the per-level
+            views. (The ``return_timing_data`` decorator wraps what the
+            caller receives.)
+        """
        tree = self.tree
        local_exps = self.local_expansion_zeros()

-        rscale = 1
+        # Precomputed rotation matrices (matrices of larger order can be used
+        # for translations of smaller order)
+        rotmatf, rotmatb, rotmat_order = self.m2l_rotation_matrices()

-        from pyfmmlib import h2dmploc_vec
+        for lev in range(self.tree.nlevels):
+            lstart, lstop = level_start_target_or_target_parent_box_nrs[lev:lev+2]
+            if lstart == lstop:
+                continue

-        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
-            start, end = starts[itgt_box:itgt_box+2]
-            tgt_center = tree.box_centers[:, tgt_ibox]
+            # One more entry than there are target boxes on this level, so
+            # that each box has both a start and an end index.
+            starts_on_lvl = starts[lstart:lstop+1]

-            #print tgt_ibox, "<-", lists[start:end]
-            tgt_loc = 0
+            mploc = self.tree_indep.get_translation_routine(
+                    self, "%ddmploc", vec_suffix="_imany")

-            for src_ibox in lists[start:end]:
-                src_center = tree.box_centers[:, src_ibox]
+            kwargs = {}

-                tgt_loc = tgt_loc + h2dmploc_vec(
-                        self.helmholtz_k,
-                        rscale, src_center, mpole_exps[src_ibox],
-                        rscale, tgt_center, self.nterms)[:, 0]
+            # {{{ set up optimized m2l, if applicable

-            local_exps[tgt_ibox] += tgt_loc
+            if self.level_orders[lev] <= rotmat_order:
+                m2l_rotation_lists = self.rotation_data.m2l_rotation_lists()
+                assert len(m2l_rotation_lists) == len(lists)

-        return local_exps
+                mploc = self.tree_indep.get_translation_routine(
+                        self, "%ddmploc", vec_suffix="2_trunc_imany")

-    def eval_multipoles(self, target_boxes, sep_smaller_nonsiblings_starts,
-            sep_smaller_nonsiblings_lists, mpole_exps):
-        pot = self.potential_zeros()
+                kwargs["ldm"] = rotmat_order
+                kwargs["nterms"] = self.level_orders[lev]
+                kwargs["nterms1"] = self.level_orders[lev]

-        rscale = 1
+                kwargs["rotmatf"] = rotmatf
+                kwargs["rotmatf_offsets"] = m2l_rotation_lists
+                kwargs["rotmatf_starts"] = starts_on_lvl

-        from pyfmmlib import h2dmpeval_vec
-        for itgt_box, tgt_ibox in enumerate(target_boxes):
-            tgt_pslice = self._get_target_slice(tgt_ibox)
+                kwargs["rotmatb"] = rotmatb
+                kwargs["rotmatb_offsets"] = m2l_rotation_lists
+                kwargs["rotmatb_starts"] = starts_on_lvl

-            if tgt_pslice.stop - tgt_pslice.start == 0:
-                continue
+            # }}}

-            tgt_pot = 0
-            start, end = sep_smaller_nonsiblings_starts[itgt_box:itgt_box+2]
-            for src_ibox in sep_smaller_nonsiblings_lists[start:end]:
+            source_level_start_ibox, source_mpoles_view = \
+                    self.multipole_expansions_view(mpole_exps, lev)
+            target_level_start_ibox, target_local_exps_view = \
+                    self.local_expansions_view(local_exps, lev)

-                tmp_pot, _, _ = h2dmpeval_vec(self.helmholtz_k, rscale, self.
-                        tree.box_centers[:, src_ibox], mpole_exps[src_ibox],
-                        self._get_targets(tgt_pslice),
-                        ifgrad=False, ifhess=False)
+            ntgt_boxes = lstop-lstart
+            itgt_box_vec = np.arange(ntgt_boxes)
+            tgt_ibox_vec = target_or_target_parent_boxes[lstart:lstop]

-                tgt_pot = tgt_pot + tmp_pot
+            nsrc_boxes_per_tgt_box = (
+                    starts[lstart + itgt_box_vec+1] - starts[lstart + itgt_box_vec])

-            pot[tgt_pslice] += tgt_pot
+            nsrc_boxes = np.sum(nsrc_boxes_per_tgt_box)

-        return pot
+            # Level-local CSR starts: running total of the per-box source
+            # box counts, beginning at zero.
+            src_boxes_starts = np.empty(ntgt_boxes+1, dtype=np.int32)
+            src_boxes_starts[0] = 0
+            src_boxes_starts[1:] = np.cumsum(nsrc_boxes_per_tgt_box)

-    def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights):
-        rscale = 1  # FIXME
-        local_exps = self.local_expansion_zeros()
+            rscale = self.level_to_rscale(lev)

-        from pyfmmlib import h2dformta
+            rscale1 = np.ones(nsrc_boxes) * rscale
+            rscale1_offsets = np.arange(nsrc_boxes)

-        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
-            start, end = starts[itgt_box:itgt_box+2]
+            if self.dim == 3 and self.tree_indep.eqn_letter == "h":
+                kwargs["radius"] = (
+                        tree.root_extent * 2**(-lev)
+                        * np.ones(ntgt_boxes))

-            contrib = 0
+            rscale2 = np.ones(ntgt_boxes, np.float64) * rscale

-            for src_ibox in lists[start:end]:
-                src_pslice = self._get_source_slice(src_ibox)
-                tgt_center = self.tree.box_centers[:, tgt_ibox]
+            # These get max'd/added onto: pass initialized versions.
+            if self.dim == 3:
+                ier = np.zeros(ntgt_boxes, dtype=np.int32)
+                kwargs["ier"] = ier

-                if src_pslice.stop - src_pslice.start == 0:
-                    continue
+            expn2 = np.zeros(
+                    (ntgt_boxes, *self.expansion_shape(self.level_orders[lev])),
+                    dtype=self.tree_indep.dtype)

-                ier, mpole = h2dformta(
-                        self.helmholtz_k, rscale,
-                        self._get_sources(src_pslice), src_weights[src_pslice],
-                        tgt_center, self.nterms)
-                if ier:
-                    raise RuntimeError("h2dformta failed")
+            kwargs.update(self.kernel_kwargs)
+
+            expn2 = mploc(
+                    rscale1=rscale1,
+                    rscale1_offsets=rscale1_offsets,
+                    rscale1_starts=src_boxes_starts,
+
+                    center1=tree.box_centers,
+                    center1_offsets=lists,
+                    center1_starts=starts_on_lvl,
+
+                    expn1=source_mpoles_view.T,
+                    expn1_offsets=lists - source_level_start_ibox,
+                    expn1_starts=starts_on_lvl,

-                contrib = contrib + mpole
+                    rscale2=rscale2,
+                    # FIXME: wrong layout, will copy
+                    center2=tree.box_centers[:, tgt_ibox_vec],
+                    expn2=expn2.T,

-            local_exps[tgt_ibox] = contrib
+                    nterms2=self.level_orders[lev],
+
+                    **kwargs).T
+
+            target_local_exps_view[tgt_ibox_vec - target_level_start_ibox] += expn2

        return local_exps

-    def refine_locals(self, child_boxes, local_exps):
-        rscale = 1  # FIXME
+    @log_process(logger)
+    @return_timing_data
+    def eval_multipoles(self,
+            target_boxes_by_source_level, sep_smaller_nonsiblings_by_level,
+            mpole_exps):
+        """Evaluate multipole expansions at the targets of the given boxes.
+
+        For each source level *isrc_level*, the target boxes are taken from
+        ``target_boxes_by_source_level[isrc_level]``, and the source boxes of
+        each target box from the ``starts``/``lists`` CSR pair of
+        ``sep_smaller_nonsiblings_by_level[isrc_level]``. Target boxes that
+        hold no targets are skipped.
+
+        :returns: the array obtained from ``output_zeros()``, after the
+            accumulated potentials and gradients have been added via
+            ``add_potgrad_onto_output``. (The ``return_timing_data``
+            decorator wraps what the caller receives.)
+        """
+        output = self.output_zeros()
+
+        mpeval = self.tree_indep.get_expn_eval_routine("mp")
+
+        for isrc_level, ssn in enumerate(sep_smaller_nonsiblings_by_level):
+            source_level_start_ibox, source_mpoles_view = \
+                    self.multipole_expansions_view(mpole_exps, isrc_level)
+
+            rscale = self.level_to_rscale(isrc_level)
+
+            for itgt_box, tgt_ibox in \
+                    enumerate(target_boxes_by_source_level[isrc_level]):
+                tgt_pslice = self._get_target_slice(tgt_ibox)

-        from pyfmmlib import h2dlocloc_vec
+                if tgt_pslice.stop - tgt_pslice.start == 0:
+                    continue
+
+                # Contributions of all source boxes are summed up before
+                # being added to the output once per target box.
+                tgt_pot = 0
+                tgt_grad = 0
+                start, end = ssn.starts[itgt_box:itgt_box+2]
+                for src_ibox in ssn.lists[start:end]:
+
+                    tmp_pot, tmp_grad = mpeval(
+                            rscale=rscale,
+                            center=self.tree.box_centers[:, src_ibox],
+                            expn=source_mpoles_view[
+                                src_ibox - source_level_start_ibox].T,
+                            ztarg=self._get_targets(tgt_pslice),
+                            **self.kernel_kwargs)
+
+                    tgt_pot = tgt_pot + tmp_pot
+                    tgt_grad = tgt_grad + tmp_grad
+
+                self.add_potgrad_onto_output(
+                        output, tgt_pslice, tgt_pot, tgt_grad)
+
+        return output
+
+    @log_process(logger)
+    @return_timing_data
+    def form_locals(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+        """Form local expansions directly from sources, level by level.
+
+        For every level whose range in
+        *level_start_target_or_target_parent_box_nrs* is nonempty, the
+        ``_imany`` variant of ``formta`` (``formta_dp`` if dipoles are in
+        use) is called once for all boxes of that level. *starts* and *lists*
+        give, in CSR form, the source boxes of each box in
+        *target_or_target_parent_boxes*.
+
+        :arg src_weight_vecs: a sequence holding exactly one source weight
+            vector.
+        :returns: the array obtained from ``local_expansion_zeros()``, after
+            the formed expansions have been stored through the per-level
+            views. (The ``return_timing_data`` decorator wraps what the
+            caller receives.)
+        :raises RuntimeError: if the routine reports a nonzero error flag.
+        """
+        src_weights, = src_weight_vecs
+        local_exps = self.local_expansion_zeros()
+
+        formta = self.tree_indep.get_routine(
+                "%ddformta" + ("_dp" if self.use_dipoles else ""), suffix="_imany")
+
+        sources = self._get_single_sources_array()
+        # sources_starts / sources_lists is a CSR list mapping box centers to
+        # lists of starting indices into the sources array. To get the starting
+        # source indices we have to look at box_source_starts.
+        sources_offsets = self.tree.box_source_starts[lists]
+
+        # nsources_starts / nsources_lists is a CSR list mapping box centers to
+        # lists of indices into nsources, each of which represents a source
+        # count.
+        nsources = self.tree.box_source_counts_nonchild
+        nsources_offsets = lists

-        for tgt_ibox in child_boxes:
-            tgt_center = self.tree.box_centers[:, tgt_ibox]
-            src_ibox = self.tree.box_parent_ids[tgt_ibox]
-            src_center = self.tree.box_centers[:, src_ibox]
+        # centers is indexed into by values of centers_offsets, which is a list
+        # mapping box indices to box center indices.
+        centers = self._get_single_box_centers_array()

-            tmp_loc_exp = h2dlocloc_vec(
-                        self.helmholtz_k,
-                        rscale, src_center, local_exps[src_ibox],
-                        rscale, tgt_center, self.nterms)[:, 0]
+        # Source-dependent keyword arguments for all sources at once; the
+        # per-box selection happens through the CSR lists added below.
+        source_kwargs = self.get_source_kwargs(src_weights, slice(None))

-            local_exps[tgt_ibox] += tmp_loc_exp
+        for lev in range(self.tree.nlevels):
+            lev_start, lev_stop = \
+                    level_start_target_or_target_parent_box_nrs[lev:lev+2]
+
+            if lev_start == lev_stop:
+                continue
+
+            target_box_start, target_local_exps_view = \
+                    self.local_expansions_view(local_exps, lev)
+
+            centers_offsets = target_or_target_parent_boxes[lev_start:lev_stop]
+
+            rscale = self.level_to_rscale(lev)
+
+            sources_starts = starts[lev_start:1 + lev_stop]
+            nsources_starts = sources_starts
+
+            kwargs = {}
+            kwargs.update(self.kernel_kwargs)
+            for key, val in source_kwargs.items():
+                kwargs[key] = val
+                # Add CSR lists mapping box centers to lists of starting positions
+                # in the array of source strengths.
+                # Since the source strengths have the same order as the sources,
+                # these lists are the same as those for starting position in the
+                # sources array.
+                kwargs[key + "_starts"] = sources_starts
+                kwargs[key + "_offsets"] = sources_offsets
+
+            ier, expn = formta(
+                    rscale=rscale,
+                    sources=sources,
+                    sources_offsets=sources_offsets,
+                    sources_starts=sources_starts,
+                    nsources=nsources,
+                    nsources_starts=nsources_starts,
+                    nsources_offsets=nsources_offsets,
+                    centers=centers,
+                    centers_offsets=centers_offsets,
+                    nterms=self.level_orders[lev],
+                    **kwargs)
+
+            if ier.any():
+                raise RuntimeError("formta failed")
+
+            target_local_exps_view[
+                    target_or_target_parent_boxes[lev_start:lev_stop]
+                    - target_box_start] = expn.T

        return local_exps

-    def eval_locals(self, target_boxes, local_exps):
-        pot = self.potential_zeros()
-        rscale = 1  # FIXME
+    @log_process(logger)
+    @return_timing_data
+    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, local_exps):
+        """Translate local expansions from parent boxes onto their children.
+
+        For each level from 1 on, the boxes of that level are taken from
+        *target_or_target_parent_boxes* (delimited by
+        *level_start_target_or_target_parent_box_nrs*). The expansion of each
+        box's parent, which lives on the previous level, is translated with
+        ``locloc`` and added onto the box's own expansion.
+
+        :returns: *local_exps*, after the additions made through the
+            per-level views. (The ``return_timing_data`` decorator wraps what
+            the caller receives.)
+        """

-        from pyfmmlib import h2dlocloc_vec
+        locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc")

-        for tgt_ibox in target_boxes:
-            tgt_pslice = self._get_target_slice(tgt_ibox)
+        for target_lev in range(1, self.tree.nlevels):
+            start, stop = level_start_target_or_target_parent_box_nrs[
+                    target_lev:target_lev+2]

-            if tgt_pslice.stop - tgt_pslice.start == 0:
+            source_lev = target_lev - 1
+
+            source_level_start_ibox, source_local_exps_view = \
+                    self.local_expansions_view(local_exps, source_lev)
+            target_level_start_ibox, target_local_exps_view = \
+                    self.local_expansions_view(local_exps, target_lev)
+            source_rscale = self.level_to_rscale(source_lev)
+            target_rscale = self.level_to_rscale(target_lev)
+
+            for tgt_ibox in target_or_target_parent_boxes[start:stop]:
+                tgt_center = self.tree.box_centers[:, tgt_ibox]
+                src_ibox = self.tree.box_parent_ids[tgt_ibox]
+                src_center = self.tree.box_centers[:, src_ibox]
+
+                kwargs = {}
+                if self.dim == 3 and self.tree_indep.eqn_letter == "h":
+                    kwargs["radius"] = self.tree.root_extent * 2**(-target_lev)
+
+                kwargs.update(self.kernel_kwargs)
+                tmp_loc_exp = locloc(
+                            rscale1=source_rscale,
+                            center1=src_center,
+                            expn1=source_local_exps_view[
+                                src_ibox - source_level_start_ibox].T,
+
+                            rscale2=target_rscale,
+                            center2=tgt_center,
+                            nterms2=self.level_orders[target_lev],
+
+                            **kwargs)[..., 0]
+
+                target_local_exps_view[
+                        tgt_ibox - target_level_start_ibox] += tmp_loc_exp.T
+
+        return local_exps
+
+    @log_process(logger)
+    @return_timing_data
+    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+        """Evaluate local expansions at the targets of *target_boxes*.
+
+        *level_start_target_box_nrs* delimits, for each level, the range of
+        *target_boxes* that belongs to that level. Levels with an empty range
+        and boxes that hold no targets are skipped.
+
+        :returns: the array obtained from ``output_zeros()``, after the
+            potentials and gradients have been added via
+            ``add_potgrad_onto_output``. (The ``return_timing_data``
+            decorator wraps what the caller receives.)
+        """
+        output = self.output_zeros()
+        taeval = self.tree_indep.get_expn_eval_routine("ta")
+
+        for lev in range(self.tree.nlevels):
+            start, stop = level_start_target_box_nrs[lev:lev+2]
+            if start == stop:
                continue

-            tmp_pot, _, _ = h2dtaeval_vec(self.helmholtz_k, rscale,
-                    self.tree.box_centers[:, tgt_ibox], local_exps[tgt_ibox],
-                    self._get_targets(tgt_pslice), ifgrad=False, ifhess=False)
+            source_level_start_ibox, source_local_exps_view = \
+                    self.local_expansions_view(local_exps, lev)
+
+            rscale = self.level_to_rscale(lev)
+
+            for tgt_ibox in target_boxes[start:stop]:
+                tgt_pslice = self._get_target_slice(tgt_ibox)
+
+                if tgt_pslice.stop - tgt_pslice.start == 0:
+                    continue
+
+                tmp_pot, tmp_grad = taeval(
+                        rscale=rscale,
+                        center=self.tree.box_centers[:, tgt_ibox],
+                        expn=source_local_exps_view[
+                            tgt_ibox - source_level_start_ibox].T,
+                        ztarg=self._get_targets(tgt_pslice),
+
+                        **self.kernel_kwargs)
+
+                self.add_potgrad_onto_output(
+                        output, tgt_pslice, tmp_pot, tmp_grad)
+
+        return output
+
+    @log_process(logger)
+    def finalize_potentials(self, potential, template_ary):
+        """Return *potential* scaled by the kernel- and dimension-dependent
+        constant; for 2D Laplace, only the real part is kept.
+
+        *template_ary* is not used here.
+        """
+        letter, ndim = self.tree_indep.eqn_letter, self.dim
+        if letter == "l" and ndim == 2:
+            return potential.real * (-1/(2*np.pi))
+        if letter == "h" and ndim == 2:
+            return potential * 1
+        if letter in ["l", "h"] and ndim == 3:
+            return potential * (1/(4*np.pi))
+        raise NotImplementedError(
+                f"scale factor for pyfmmlib {self.tree_indep.eqn_letter} "
+                f"for {self.dim} dimensions")
+
+# }}}

-            pot[tgt_pslice] += tmp_pot

-        return pot
+# vim: foldmethod=marker
--- a/boxtree/rotation_classes.py
+++ b/boxtree/rotation_classes.py
+"""
+Rotation classes data structure
+-------------------------------
+
+.. autoclass:: RotationClassesInfo
+
+Build rotation classes
+----------------------
+
+.. autoclass:: RotationClassesBuilder
+"""
+
+__copyright__ = "Copyright (C) 2019 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+
+import numpy as np
+
+import pyopencl as cl
+import pyopencl.array
+
+from boxtree.tools import DeviceDataRecord
+from boxtree.translation_classes import TranslationClassesBuilder
+
+
+logger = logging.getLogger(__name__)
+
+from pytools import log_process
+
+
+# {{{ rotation classes builder
+
+class RotationClassesInfo(DeviceDataRecord):
+    r"""Rotation class data for box pairs, meant to help precompute the
+    matrices of rotation-based ("point and shoot") translations.
+
+    .. attribute:: nfrom_sep_siblings_rotation_classes
+
+        The number of distinct rotation classes, i.e. the length of
+        *from_sep_siblings_rotation_class_to_angle*.
+
+    .. attribute:: from_sep_siblings_rotation_classes
+
+        ``int32 [*]``
+
+        The rotation class of each box pair, as a list corresponding to
+        *from_sep_siblings_lists* of *trav*.
+
+    .. attribute:: from_sep_siblings_rotation_class_to_angle
+
+        ``coord_t [nfrom_sep_siblings_rotation_classes]``
+
+        The rotation angle of each rotation class occurring in
+        *from_sep_siblings_rotation_classes*, that is, the angle between
+        the translation vector of a box pair and the *z*-axis.
+
+    """
+
+    @property
+    def nfrom_sep_siblings_rotation_classes(self):
+        class_to_angle = self.from_sep_siblings_rotation_class_to_angle
+        return len(class_to_angle)
+
+
+class RotationClassesBuilder:
+    """Build rotation classes for List 2 translations.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, context):
+        """
+        :arg context: stored as *context* and used to create the
+            :class:`~boxtree.translation_classes.TranslationClassesBuilder`
+            that this builder relies on.
+        """
+        self.context = context
+        self.tcb = TranslationClassesBuilder(context)
+
+    @staticmethod
+    def vec_gcd(vec):
+        """Return the (nonnegative) GCD of a sequence of integers.
+
+        Returns 0 if *vec* is empty or consists only of zeros.
+        """
+        from functools import reduce
+        from math import gcd
+
+        # gcd() disregards signs and gcd(0, x) == abs(x), which makes 0 the
+        # neutral starting value.
+        return reduce(gcd, vec, 0)
+
+    def compute_rotation_classes(self,
+            well_sep_is_n_away, dimensions, used_translation_classes):
+        """Convert translation classes to a list of rotation classes and angles.
+
+        :arg used_translation_classes: an iterable of the translation class
+            indices to be assigned a rotation class.
+        :returns: a tuple ``(translation_class_to_rot_class, angles)``.
+            *translation_class_to_rot_class* is an ``int32`` array with one
+            entry per translation class of a level, holding the rotation
+            class of every used translation class and -1 for all others.
+            *angles* is a list holding the angle of each rotation class.
+        """
+        angle_to_rot_class = {}
+        angles = []
+
+        ntranslation_classes_per_level = (
+                self.tcb.ntranslation_classes_per_level(well_sep_is_n_away,
+                    dimensions))
+
+        # -1 marks translation classes that are not in use.
+        translation_class_to_rot_class = np.full(
+                ntranslation_classes_per_level, -1, dtype=np.int32)
+
+        for cls in used_translation_classes:
+            vec = self.tcb.translation_class_to_normalized_vector(
+                    well_sep_is_n_away, dimensions, cls)
+
+            # The zero vector has no direction (and a GCD of zero), so it is
+            # ruled out before dividing by the GCD.
+            gcd = self.vec_gcd(vec)
+            assert gcd != 0
+
+            # Normalize the translation vector (by dividing by its GCD).
+            #
+            # We need this before computing the cosine of the rotation angle,
+            # because generally in floating point arithmetic, if k is a
+            # positive scalar and v is a vector, we can't assume
+            #
+            #   kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2).
+            #
+            # Normalizing ensures vectors that are positive integer multiples of
+            # each other get classified into the same equivalence class of
+            # rotations.
+            vec //= gcd
+
+            # Compute the rotation angle for the vector, i.e. the angle
+            # between the vector and the last coordinate axis.
+            angle = np.arccos(vec[-1] / np.linalg.norm(vec))
+
+            # Find the rotation class, creating a new one for an angle that
+            # has not been encountered before.
+            if angle in angle_to_rot_class:
+                rot_class = angle_to_rot_class[angle]
+            else:
+                rot_class = len(angles)
+                angle_to_rot_class[angle] = rot_class
+                angles.append(angle)
+
+            translation_class_to_rot_class[cls] = rot_class
+
+        return translation_class_to_rot_class, angles
+
+    @log_process(logger, "build m2l rotation classes")
+    def __call__(self, queue, trav, tree, wait_for=None):
+        """Returns a pair *info*, *evt* where info is a
+        :class:`RotationClassesInfo`.
+
+        :arg queue: the queue on which the resulting device arrays are
+            created.
+        :arg trav: supplies *well_sep_is_n_away*; passed on, together with
+            *tree* and *wait_for*, to the computation of the translation
+            classes.
+        :arg tree: supplies *dimensions*.
+        """
+        evt, translation_class_is_used, translation_classes_lists = \
+            self.tcb.compute_translation_classes(queue, trav, tree, wait_for, False)
+
+        d = tree.dimensions
+        n = trav.well_sep_is_n_away
+
+        # convert translation classes to rotation classes
+
+        used_translation_classes = (
+                np.flatnonzero(translation_class_is_used.get()))
+
+        translation_class_to_rotation_class, rotation_angles = (
+                self.compute_rotation_classes(n, d, used_translation_classes))
+
+        # There should be no more than 2^(d-1) * (2n+1)^d distinct rotation
+        # classes, since that is an upper bound on the number of distinct
+        # positions for list 2 boxes.
+        assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d
+
+        # Look up the rotation class of each entry of the translation class
+        # lists on the device.
+        rotation_classes_lists = (
+                cl.array.take(
+                    cl.array.to_device(queue, translation_class_to_rotation_class),
+                    translation_classes_lists))
+
+        rotation_angles = cl.array.to_device(queue, np.array(rotation_angles))
+
+        return RotationClassesInfo(
+                from_sep_siblings_rotation_classes=rotation_classes_lists,
+                from_sep_siblings_rotation_class_to_angle=rotation_angles,
+                ).with_queue(None), evt
+
+# }}}
+
+# vim: filetype=pyopencl:fdm=marker
--- a/boxtree/timing.py
+++ b/boxtree/timing.py
+"""
+.. autoclass:: TimingResult
+
+.. autoclass:: TimingFuture
+"""
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from collections.abc import Mapping
+
+
+# {{{ timing result
+
+class TimingResult(Mapping):
+    """Read-only mapping holding the timing data of an operation.
+
+    Entries are accessed like those of any other mapping, and two results
+    can be combined via :meth:`merge`.
+
+    .. automethod:: merge
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Accepts the same arguments as the constructor of :class:`dict`."""
+        self._mapping = dict(*args, **kwargs)
+
+    def __len__(self):
+        return len(self._mapping)
+
+    def __iter__(self):
+        return iter(self._mapping)
+
+    def __getitem__(self, key):
+        return self._mapping[key]
+
+    def merge(self, other):
+        """Return a new result that sums the fields shared with *other*.
+
+        Fields that are missing from (or *None* in) either operand are
+        left out of the returned result.
+        """
+        summed = {}
+
+        for key, mine in self.items():
+            theirs = other.get(key)
+            if mine is not None and theirs is not None:
+                summed[key] = mine + theirs
+
+        return type(self)(summed)
+
+# }}}
+
+
+# {{{ timing future
+
+class TimingFuture:
+    """Interface for obtaining the timing data of an operation that is
+    potentially asynchronous.
+
+    .. automethod:: result
+    .. automethod:: done
+    """
+
+    def done(self):
+        """Return *True* once the operation has completed."""
+        raise NotImplementedError()
+
+    def result(self):
+        """Return the :class:`TimingResult` of the operation. May block."""
+        raise NotImplementedError()
+
+# }}}
+
+
+# {{{ timing recorder
+
+class TimingRecorder:
+    """Collects timing futures by description and merges their results."""
+
+    def __init__(self):
+        from collections import defaultdict
+        self.futures = defaultdict(list)
+
+    def add(self, description, future):
+        """Record *future* under *description*."""
+        self.futures[description].append(future)
+
+    def summarize(self):
+        """Return a :class:`dict` mapping each description to the merged
+        result of all futures recorded under it.
+        """
+        summary = {}
+
+        for description, recorded in self.futures.items():
+            remaining = iter(recorded)
+
+            # The first result is the starting point onto which all
+            # subsequent results are merged.
+            merged = next(remaining).result()
+            for future in remaining:
+                merged = merged.merge(future.result())
+
+            summary[description] = merged
+
+        return summary
+
+# }}}
+
+
+# {{{ time recording tool
+
+class DummyTimingFuture(TimingFuture):
+    """A :class:`TimingFuture` whose result is known upon creation."""
+
+    def __init__(self, *args, **kwargs):
+        """Arguments are passed on to :class:`TimingResult`."""
+        self._result = TimingResult(*args, **kwargs)
+
+    @classmethod
+    def from_timer(cls, timer):
+        """Create a future from the wall and process times of *timer*."""
+        return cls(
+                wall_elapsed=timer.wall_elapsed,
+                process_elapsed=timer.process_elapsed)
+
+    @classmethod
+    def from_op_count(cls, op_count):
+        """Create a future holding *op_count* as ``ops_elapsed``."""
+        return cls(ops_elapsed=op_count)
+
+    def done(self):
+        return True
+
+    def result(self):
+        return self._result
+
+
+def return_timing_data(wrapped):
+    """Decorate *wrapped* so that each call also reports its timing data.
+
+    Calling the decorated function yields a tuple (*retval*, *timing_future*):
+    *retval* is the value returned by *wrapped*, and *timing_future*
+    supports the timing data future interface in :mod:`boxtree.fmm`.
+    """
+    from functools import update_wrapper
+
+    from pytools import ProcessTimer
+
+    def timed_call(*args, **kwargs):
+        timer = ProcessTimer()
+        retval = wrapped(*args, **kwargs)
+        timer.done()
+
+        return retval, DummyTimingFuture.from_timer(timer)
+
+    # Makes the returned function carry the name and docstring of *wrapped*.
+    return update_wrapper(timed_call, wrapped)
+
+# }}}
+
+
+# vim: foldmethod=marker
--- a/boxtree/tools.py
+++ b/boxtree/tools.py
-from __future__ import division
-
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

 __license__ = """
@@ -22,37 +20,47 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import sys
+from functools import partial
+from typing import Any

 import numpy as np
-from pytools import Record, memoize_method
-import pyopencl as cl
-import pyopencl.array  # noqa
-from pyopencl.tools import first_arg_dependent_memoize_nested
 from mako.template import Template
+
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.cltypes as cltypes
+from pyopencl.tools import ScalarArg, VectorArg as _VectorArg, dtype_to_c_struct
+from pytools import Record, memoize_method
 from pytools.obj_array import make_obj_array


+# Use offsets in VectorArg by default.
+VectorArg = partial(_VectorArg, with_offset=True)
+
 AXIS_NAMES = ("x", "y", "z", "w")


-def padded_bin(i, l):
-    """Format *i* as binary number, pad it to length *l*."""
+def padded_bin(i, nbits):
+    """Format *i* as binary number, pad it with zeros to length *nbits*.
+
+    A negative *i* is rendered with a leading minus sign, which counts
+    toward *nbits*. Numbers needing more than *nbits* characters are
+    returned unshortened.
+    """
+    # Unlike slicing the output of bin(), format() does not leave a stray
+    # "b" behind for negative numbers; zfill() pads after the sign.
+    return format(i, "b").zfill(nbits)

-    s = bin(i)[2:]
-    while len(s) < l:
-        s = '0' + s
-    return s

+# NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__()
+def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=None):
+    """Allocate an array of shape *new_shape* and copy the data of *ary* into it.
+
+    The new array has the dtype of *ary* and is obtained from *allocator*.
+    It is created zero-initialized if *zero_fill* is true and uninitialized
+    otherwise. Afterwards, ``ary.nbytes`` bytes are copied from *ary* into
+    the new array; the copy waits for the events in *wait_for* as well as
+    for the events of the new array.
+
+    NOTE(review): the number of bytes copied does not depend on *new_shape*,
+    so the new array is presumably expected to be at least as large as
+    *ary* -- confirm against callers.
+
+    :returns: a tuple ``(new_ary, evt)``, where *evt* is the event of the
+        copy.
+    """
+    if wait_for is None:
+        wait_for = []
+
+    if zero_fill:  # noqa: SIM108
+        array_maker = cl.array.zeros
+    else:
+        array_maker = cl.array.empty

-def realloc_array(ary, new_shape, zero_fill, queue, wait_for):
-    new_ary = cl.array.empty(queue, shape=new_shape, dtype=ary.dtype,
-            allocator=ary.allocator)
-    if zero_fill:
-        new_ary.fill(0, wait_for=wait_for)
-        wait_for = new_ary.events
+    new_ary = array_maker(queue, shape=new_shape, dtype=ary.dtype,
+                          allocator=allocator)

    evt = cl.enqueue_copy(queue, new_ary.data, ary.data, byte_count=ary.nbytes,
-            wait_for=wait_for)
+                          wait_for=wait_for + new_ary.events)

    return new_ary, evt

@@ -91,8 +99,8 @@ def reverse_index_array(indices, target_size=None, result_fill_value=None,
 # {{{ particle distribution generators

 def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15):
-    from pyopencl.clrandom import RanluxGenerator
-    rng = RanluxGenerator(queue, seed=seed)
+    from pyopencl.clrandom import PhiloxGenerator
+    rng = PhiloxGenerator(queue.context, seed=seed)

    return make_obj_array([
        rng.normal(queue, nparticles, dtype=dtype)
@@ -103,25 +111,29 @@ def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15):
    import loopy as lp

    if dims == 2:
-        @first_arg_dependent_memoize_nested
-        def get_2d_knl(context, dtype):
+        def get_2d_knl(dtype):
            knl = lp.make_kernel(
                "{[i]: 0<=i<n}",
                """
-                    <> phi = 2*M_PI/n * i
-                    x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
-                    y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
-                    """,
+                    for i
+                        <> phi = 2*M_PI/n * i
+                        x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
+                        y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
+                    end
+                """,
                [
                    lp.GlobalArg("x,y", dtype, shape=lp.auto),
                    lp.ValueArg("n", np.int32),
-                    ])
+                ],
+                name="make_surface_particles_2d",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )

            knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

-            return lp.CompiledKernel(context, knl)
+            return knl.executor(queue.context)

-        evt, result = get_2d_knl(queue.context, dtype)(queue, n=nparticles)
+        _evt, result = get_2d_knl(dtype)(queue, n=nparticles)

        result = [x.ravel() for x in result]

@@ -129,28 +141,33 @@ def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15):
    elif dims == 3:
        n = int(nparticles**0.5)

-        @first_arg_dependent_memoize_nested
-        def get_3d_knl(context, dtype):
+        def get_3d_knl(dtype):
            knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<n}",
                """
-                    <> phi = 2*M_PI/n * i
-                    <> theta = 2*M_PI/n * j
-                    x[i,j] = 5*cos(phi) * (3 + cos(theta))
-                    y[i,j] = 5*sin(phi) * (3 + cos(theta))
-                    z[i,j] = 5*sin(theta)
-                    """,
+                    for i,j
+                        <> phi = 2*M_PI/n * i
+                        <> theta = 2*M_PI/n * j
+                        x[i,j] = 5*cos(phi) * (3 + cos(theta))
+                        y[i,j] = 5*sin(phi) * (3 + cos(theta))
+                        z[i,j] = 5*sin(theta)
+                    end
+                """,
                [
                    lp.GlobalArg("x,y,z,", dtype, shape=lp.auto),
                    lp.ValueArg("n", np.int32),
-                    ])
+                ],
+                assumptions="n>0",
+                name="make_surface_particles_3d",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )

            knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")

-            return lp.CompiledKernel(context, knl)
+            return knl.executor(queue.context)

-        evt, result = get_3d_knl(queue.context, dtype)(queue, n=n)
+        _evt, result = get_3d_knl(dtype)(queue, n=n)

        result = [x.ravel() for x in result]

@@ -165,30 +182,35 @@ def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
    if dims == 2:
        n = int(nparticles**0.5)

-        @first_arg_dependent_memoize_nested
-        def get_2d_knl(context, dtype):
+        def get_2d_knl(dtype):
            knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<n}",
                """
-                    <> xx = 4*i/(n-1)
-                    <> yy = 4*j/(n-1)
-                    <float64> angle = 0.3
-                    <> s = sin(angle)
-                    <> c = cos(angle)
-                    x[i,j] = c*xx + s*yy - 2
-                    y[i,j] = -s*xx + c*yy - 2
-                    """,
+                    for i,j
+                        <> xx = 4*i/(n-1)
+                        <> yy = 4*j/(n-1)
+                        <float64> angle = 0.3
+                        <> s = sin(angle)
+                        <> c = cos(angle)
+                        x[i,j] = c*xx + s*yy - 2
+                        y[i,j] = -s*xx + c*yy - 2
+                    end
+                """,
                [
                    lp.GlobalArg("x,y", dtype, shape=lp.auto),
                    lp.ValueArg("n", np.int32),
-                    ], assumptions="n>0")
+                ],
+                assumptions="n>0",
+                name="make_uniform_particles_2d",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )

            knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")

-            return lp.CompiledKernel(context, knl)
+            return knl.executor(queue.context)

-        evt, result = get_2d_knl(queue.context, dtype)(queue, n=n)
+        _evt, result = get_2d_knl(dtype)(queue, n=n)

        result = [x.ravel() for x in result]

@@ -196,42 +218,47 @@ def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
    elif dims == 3:
        n = int(nparticles**(1/3))

-        @first_arg_dependent_memoize_nested
-        def get_3d_knl(context, dtype):
+        def get_3d_knl(dtype):
            knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<n}",
                """
-                    <> xx = i/(n-1)
-                    <> yy = j/(n-1)
-                    <> zz = k/(n-1)
-
-                    <float64> phi = 0.3
-                    <> s1 = sin(phi)
-                    <> c1 = cos(phi)
-
-                    <> xxx = c1*xx + s1*yy
-                    <> yyy = -s1*xx + c1*yy
-                    <> zzz = zz
-
-                    <float64> theta = 0.7
-                    <> s2 = sin(theta)
-                    <> c2 = cos(theta)
-
-                    x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
-                    y[i,j,k] = 4 * yyy - 2
-                    z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
-                    """,
+                    for i,j,k
+                        <> xx = i/(n-1)
+                        <> yy = j/(n-1)
+                        <> zz = k/(n-1)
+
+                        <float64> phi = 0.3
+                        <> s1 = sin(phi)
+                        <> c1 = cos(phi)
+
+                        <> xxx = c1*xx + s1*yy
+                        <> yyy = -s1*xx + c1*yy
+                        <> zzz = zz
+
+                        <float64> theta = 0.7
+                        <> s2 = sin(theta)
+                        <> c2 = cos(theta)
+
+                        x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
+                        y[i,j,k] = 4 * yyy - 2
+                        z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
+                    end
+                """,
                [
                    lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
                    lp.ValueArg("n", np.int32),
-                    ], assumptions="n>0")
+                ],
+                assumptions="n>0",
+                name="make_uniform_particles_3d",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )

            knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")

-            return lp.CompiledKernel(context, knl)
+            return knl.executor(queue.context)

-        evt, result = get_3d_knl(queue.context, dtype)(queue, n=n)
+        _evt, result = get_3d_knl(dtype)(queue, n=n)

        result = [x.ravel() for x in result]

@@ -259,35 +286,53 @@ class DeviceDataRecord(Record):
    instances on the host.
    """

-    def _transform_arrays(self, f):
+    def _transform_arrays(self, f, exclude_fields=frozenset()):
        result = {}
+
+        def transform_val(val):
+            from pyopencl.algorithm import BuiltList
+            if isinstance(val, np.ndarray) and val.dtype == object:
+                from pytools.obj_array import obj_array_vectorize
+                return obj_array_vectorize(f, val)
+            elif isinstance(val, list):
+                return [transform_val(i) for i in val]
+            elif isinstance(val, BuiltList):
+                transformed_list = {}
+                for field in val.__dict__:
+                    if field != "count" and not field.startswith("_"):
+                        transformed_list[field] = f(getattr(val, field))
+                return BuiltList(count=val.count, **transformed_list)
+            else:
+                return f(val)
+
        for field_name in self.__class__.fields:
+            if field_name in exclude_fields:
+                continue
+
            try:
                attr = getattr(self, field_name)
            except AttributeError:
                pass
            else:
-                if isinstance(attr, np.ndarray) and attr.dtype == object:
-                    from pytools.obj_array import with_object_array_or_scalar
-                    result[field_name] = with_object_array_or_scalar(f, attr)
-                else:
-                    result[field_name] = f(attr)
+                result[field_name] = transform_val(attr)

        return self.copy(**result)

-    def get(self, **kwargs):
+    def get(self, queue, **kwargs):
        """Return a copy of `self` in which all data lives on the host, i.e.
-        all :class:`pyopencl.array.Array` objects are replaced by corresponding
-        :class:`numpy.ndarray` instances on the host.
+        all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` objects are
+        replaced by corresponding :class:`numpy.ndarray` instances on the host.
        """
-
        def try_get(attr):
+            if isinstance(attr, ImmutableHostDeviceArray):
+                return attr.host
+
            try:
                get_meth = attr.get
            except AttributeError:
                return attr

-            return get_meth(**kwargs)
+            return get_meth(queue=queue, **kwargs)

        return self._transform_arrays(try_get)

@@ -311,13 +356,52 @@ class DeviceDataRecord(Record):

        return self._transform_arrays(try_with_queue)

+    def to_device(self, queue, exclude_fields=frozenset()):
+        """Return a copy of `self` in which all :class:`numpy.ndarray` arrays are
+        transferred to device memory as :class:`pyopencl.array.Array` objects.
+
+        :arg exclude_fields: a :class:`frozenset` of field names to exclude from
+            the transfer to device memory.
+        """
+
+        def _to_device(attr):
+            if isinstance(attr, np.ndarray):
+                return cl.array.to_device(queue, attr).with_queue(None)
+            elif isinstance(attr, ImmutableHostDeviceArray):
+                return attr.device
+            elif isinstance(attr, DeviceDataRecord):
+                return attr.to_device(queue)
+            else:
+                return attr
+
+        return self._transform_arrays(_to_device, exclude_fields=exclude_fields)
+
+    def to_host_device_array(self, queue, exclude_fields=frozenset()):
+        """Return a copy of `self` where all device and host arrays are transformed
+        to `ImmutableHostDeviceArray` objects.
+
+        :arg exclude_fields: a :class:`frozenset` of field names to exclude from
+            the transformation to `ImmutableHostDeviceArray`.
+        """
+        def _to_host_device_array(attr):
+            if isinstance(attr, np.ndarray | cl.array.Array):
+                return ImmutableHostDeviceArray(queue, attr)
+            elif isinstance(attr, DeviceDataRecord):
+                return attr.to_host_device_array(queue)
+            else:
+                return attr
+
+        return self._transform_arrays(
+            _to_host_device_array, exclude_fields=exclude_fields
+        )
+
 # }}}


 # {{{ type mangling

 def get_type_moniker(dtype):
-    return "%s%d" % (dtype.kind, dtype.itemsize)
+    return f"{dtype.kind}{dtype.itemsize}"

 # }}}

@@ -328,14 +412,22 @@ GAPPY_COPY_TPL = Template(r"""//CL//

    typedef ${dtype_to_ctype(dtype)} value_t;

-    value_t val = input_ary[from_indices[i]];
+    %if from_indices:
+        value_t val = input_ary[from_indices[i]];
+    %else:
+        value_t val = input_ary[i];
+    %endif

    // Optionally, noodle values through a lookup table.
    %if map_values:
        val = value_map[val];
    %endif

-    output_ary[i] = val;
+    %if to_indices:
+        output_ary[to_indices[i]] = val;
+    %else:
+        output_ary[i] = val;
+    %endif

 """, strict_undefined=True)

@@ -345,51 +437,514 @@ class GappyCopyAndMapKernel:
        self.context = context

    @memoize_method
-    def _get_kernel(self, dtype, src_index_dtype, map_values=False):
-        from pyopencl.tools import VectorArg
+    def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
+                    have_src_indices, have_dst_indices, map_values):
+        from boxtree.tools import VectorArg

        args = [
-                VectorArg(dtype, "input_ary", with_offset=True),
-                VectorArg(dtype, "output_ary", with_offset=True),
-                VectorArg(src_index_dtype, "from_indices", with_offset=True)
-                ]
+                VectorArg(dtype, "input_ary"),
+                VectorArg(dtype, "output_ary"),
+               ]
+
+        if have_src_indices:
+            args.append(VectorArg(src_index_dtype, "from_indices"))
+
+        if have_dst_indices:
+            args.append(VectorArg(dst_index_dtype, "to_indices"))

        if map_values:
-            args.append(VectorArg(dtype, "value_map", with_offset=True))
+            args.append(VectorArg(dtype, "value_map"))

        from pyopencl.tools import dtype_to_ctype
        src = GAPPY_COPY_TPL.render(
                dtype=dtype,
                dtype_to_ctype=dtype_to_ctype,
+                from_dtype=src_index_dtype,
+                to_dtype=dst_index_dtype,
+                from_indices=have_src_indices,
+                to_indices=have_dst_indices,
                map_values=map_values)

        from pyopencl.elementwise import ElementwiseKernel
        return ElementwiseKernel(self.context,
-                args, str(src), name="gappy_copy_and_map")
-
-    def __call__(self, queue, allocator, new_size,
-            src_indices, ary, map_values=None, wait_for=None):
+                args, str(src),
+                preamble=dtype_to_c_struct(self.context.devices[0], dtype),
+                name="gappy_copy_and_map")
+
+    # NOTE: Order of positional args should match realloc_array()
+    def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
+                 dst_indices=None, map_values=None, zero_fill=False,
+                 wait_for=None, range=None, debug=False):
        """Compresses box info arrays after empty leaf pruning and, optionally,
        maps old box IDs to new box IDs (if the array being operated on contains
        box IDs).
        """

-        assert len(ary) >= new_size
+        have_src_indices = src_indices is not None
+        have_dst_indices = dst_indices is not None
+        have_map_values = map_values is not None
+
+        if not (have_src_indices or have_dst_indices):
+            raise ValueError("must specify at least one of src or dest indices")
+
+        if range is None:
+            if have_src_indices and have_dst_indices:
+                raise ValueError(
+                    "must supply range when passing both src and dest indices")
+            elif have_src_indices:
+                range = slice(src_indices.shape[0])
+                if debug:
+                    assert int(cl.array.max(src_indices).get()) < len(ary)
+            elif have_dst_indices:
+                range = slice(dst_indices.shape[0])
+                if debug:
+                    assert int(cl.array.max(dst_indices).get()) < new_shape
+
+        if zero_fill:  # noqa: SIM108
+            array_maker = cl.array.zeros
+        else:
+            array_maker = cl.array.empty
+
+        result = array_maker(queue, new_shape, ary.dtype, allocator=allocator)
+
+        kernel = self._get_kernel(ary.dtype,
+                                  src_indices.dtype if have_src_indices else None,
+                                  dst_indices.dtype if have_dst_indices else None,
+                                  have_src_indices,
+                                  have_dst_indices,
+                                  have_map_values)
+
+        args = (ary, result)
+        args += (src_indices,) if have_src_indices else ()
+        args += (dst_indices,) if have_dst_indices else ()
+        args += (map_values,) if have_map_values else ()
+
+        evt = kernel(*args, queue=queue, range=range, wait_for=wait_for)

-        result = cl.array.empty(queue, new_size, ary.dtype, allocator=allocator)
+        return result, evt

-        kernel = self._get_kernel(ary.dtype, src_indices.dtype,
-                # map_values:
-                map_values is not None)
+# }}}

-        args = (ary, result, src_indices)
-        if map_values is not None:
-            args += (map_values,)

-        evt = kernel(*args, queue=queue, range=slice(new_size), wait_for=wait_for)
+# {{{ map values through table

-        return result, evt
+from pyopencl.elementwise import ElementwiseTemplate
+
+
+MAP_VALUES_TPL = ElementwiseTemplate(
+    arguments="""//CL//
+        dst_value_t *dst,
+        src_value_t *src,
+        dst_value_t *map_values
+        """,
+    operation=r"""//CL//
+        dst[i] = map_values[src[i]];
+        """,
+    name="map_values")
+
+
+class MapValuesKernel:
+
+    def __init__(self, context):
+        self.context = context
+
+    @memoize_method
+    def _get_kernel(self, dst_dtype, src_dtype):
+        type_aliases = (
+            ("src_value_t", src_dtype),
+            ("dst_value_t", dst_dtype)
+            )
+
+        return MAP_VALUES_TPL.build(self.context, type_aliases)
+
+    def __call__(self, map_values, src, dst=None):
+        """
+        Map the entries of the array `src` through the table `map_values`.
+        """
+        if dst is None:
+            dst = src
+
+        kernel = self._get_kernel(dst.dtype, src.dtype)
+        evt = kernel(dst, src, map_values)
+
+        return dst, evt
+
+# }}}
+
+
+# {{{ binary search
+
+from mako.template import Template
+
+
+BINARY_SEARCH_TEMPLATE = Template("""
+/*
+ * Returns the largest i such that arr[i] <= val in the ascending-sorted array
+ * arr, or (size_t) -1 if val is less than all values or arr is empty (len == 0).
+ */
+inline size_t bsearch(
+    __global const ${elem_t} *arr,
+    size_t len,
+    const ${elem_t} val)
+{
+    if (len == 0 || val < arr[0])
+    {
+        return -1;
+    }
+
+    size_t l = 0, r = len, i;
+
+    while (1)
+    {
+        i = l + (r - l) / 2;
+
+        if (arr[i] <= val && (i == len - 1 || val < arr[i + 1]))
+        {
+            return i;
+        }
+
+        if (arr[i] <= val)
+        {
+            l = i;
+        }
+        else
+        {
+            r = i;
+        }
+    }
+}
+""")
+
+
+class InlineBinarySearch:
+
+    def __init__(self, elem_type_name):
+        self.render_vars = {"elem_t": elem_type_name}
+
+    @memoize_method
+    def __str__(self):
+        return BINARY_SEARCH_TEMPLATE.render(**self.render_vars)

 # }}}

-# vim: foldmethod=marker:filetype=pyopencl
+
+# {{{ compress a masked array into a list / list of lists
+
+
+MASK_LIST_COMPRESSOR_BODY = r"""
+void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
+{
+    if (mask[i])
+    {
+        APPEND_output(i);
+    }
+}
+"""
+
+
+MASK_MATRIX_COMPRESSOR_BODY = r"""
+void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
+{
+    for (int j = 0; j < ncols; ++j)
+    {
+        if (mask[outer_stride * i + j * inner_stride])
+        {
+            APPEND_output(j);
+        }
+    }
+}
+"""
+
+
+class MaskCompressorKernel:
+    """
+    .. automethod:: __call__
+    """
+    def __init__(self, context):
+        self.context = context
+
+    @memoize_method
+    def get_list_compressor_kernel(self, mask_dtype, list_dtype):
+        from pyopencl.algorithm import ListOfListsBuilder
+
+        return ListOfListsBuilder(
+                self.context,
+                [("output", list_dtype)],
+                MASK_LIST_COMPRESSOR_BODY,
+                [
+                    _VectorArg(mask_dtype, "mask"),
+                ],
+                name_prefix="compress_list")
+
+    @memoize_method
+    def get_matrix_compressor_kernel(self, mask_dtype, list_dtype):
+        from pyopencl.algorithm import ListOfListsBuilder
+
+        return ListOfListsBuilder(
+                self.context,
+                [("output", list_dtype)],
+                MASK_MATRIX_COMPRESSOR_BODY,
+                [
+                    ScalarArg(np.int32, "ncols"),
+                    ScalarArg(np.int32, "outer_stride"),
+                    ScalarArg(np.int32, "inner_stride"),
+                    _VectorArg(mask_dtype, "mask"),
+                ],
+                name_prefix="compress_matrix")
+
+    def __call__(self, queue, mask, list_dtype=None):
+        """Convert a mask to a list in :ref:`csr` format.
+
+        :arg mask: Either a 1D or 2D array.
+            * If *mask* is 1D, it should represent a masked list, where
+              *mask[i]* is true if and only if *i* is in the list.
+            * If *mask* is 2D, it should represent a list of masked lists,
+              so that *mask[i,j]* is true if and only if *j* is in list *i*.
+
+        :arg list_dtype: The dtype for the output list(s). Defaults to the mask
+            dtype.
+
+        :returns: The return value depends on the type of the input.
+            * If *mask* is 1D, returns a tuple *(list, evt)*.
+            * If *mask* is 2D, returns a tuple *(starts, lists, event)*, as a
+              :ref:`csr` list.
+        """
+        if list_dtype is None:
+            list_dtype = mask.dtype
+
+        if len(mask.shape) == 1:
+            knl = self.get_list_compressor_kernel(mask.dtype, list_dtype)
+            result, evt = knl(queue, mask.shape[0], mask.data)
+            return (result["output"].lists, evt)
+        elif len(mask.shape) == 2:
+            # FIXME: This is efficient for small column sizes but may not be
+            # for larger ones since the work is partitioned by row.
+            knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype)
+            size = mask.dtype.itemsize
+            assert size > 0
+            result, evt = knl(queue, mask.shape[0], mask.shape[1],
+                              mask.strides[0] // size, mask.strides[1] // size,
+                              mask.data)
+            return (result["output"].starts, result["output"].lists, evt)
+        else:
+            raise ValueError("unsupported dimensionality")
+
+# }}}
+
+
+# {{{ Communication pattern for partial multipole expansions
+
+class AllReduceCommPattern:
+    """Describes a tree-like communication pattern for exchanging and reducing
+    multipole expansions. Supports an arbitrary number of processes.
+
+    A user must instantiate a version of this with identical *size* and varying
+    *rank* on each rank. During each stage, each rank sends its contribution to
+    the reduction results on ranks returned by :meth:`sinks` and listens for
+    contributions from :meth:`sources`. :meth:`messages` can be used for determining
+    array indices whose partial results need to be sent during the current stage.
+    Then, all ranks call :meth:`advance` and use :meth:`done` to check whether the
+    communication is complete. In the use case of multipole communication, the
+    reduction result is a vector of multipole expansions to which all ranks add
+    contribution. These contributions are communicated sparsely via arrays of box
+    indices and expansions.
+
+    .. automethod:: __init__
+    .. automethod:: sources
+    .. automethod:: sinks
+    .. automethod:: messages
+    .. automethod:: advance
+    .. automethod:: done
+    """
+
+    def __init__(self, rank, size):
+        """
+        :arg rank: Current rank.
+        :arg size: Total number of ranks.
+        """
+        assert 0 <= rank < size
+        self.rank = rank
+        self.left = 0
+        self.right = size
+        self.midpoint = size // 2
+
+    def sources(self):
+        """Return the set of source nodes at the current communication stage. The
+        current rank receives messages from these ranks.
+        """
+        if self.rank < self.midpoint:
+            partner = self.midpoint + (self.rank - self.left)
+            if self.rank == self.midpoint - 1 and partner == self.right:
+                partners = set()
+            elif self.rank == self.midpoint - 1 and partner == self.right - 2:
+                partners = {partner, partner + 1}
+            else:
+                partners = {partner}
+        else:
+            partner = self.left + (self.rank - self.midpoint)
+            if self.rank == self.right - 1 and partner == self.midpoint:
+                partners = set()
+            elif self.rank == self.right - 1 and partner == self.midpoint - 2:
+                partners = {partner, partner + 1}
+            else:
+                partners = {partner}
+
+        return partners
+
+    def sinks(self):
+        """Return the set of sink nodes at this communication stage. The current rank
+        sends a message to these ranks.
+        """
+        if self.rank < self.midpoint:
+            partner = self.midpoint + (self.rank - self.left)
+            if partner == self.right:
+                partner -= 1
+        else:
+            partner = self.left + (self.rank - self.midpoint)
+            if partner == self.midpoint:
+                partner -= 1
+
+        return {partner}
+
+    def messages(self):
+        """Return a range of ranks, such that the partial results of array indices
+        used by these ranks are sent to the sinks.  This is returned as a
+        [start, end) pair. By design, it is a consecutive range.
+        """
+        if self.rank < self.midpoint:
+            return (self.midpoint, self.right)
+        else:
+            return (self.left, self.midpoint)
+
+    def advance(self):
+        """Advance to the next stage in the communication pattern.
+        """
+        if self.done():
+            raise RuntimeError("finished communicating")
+
+        if self.rank < self.midpoint:
+            self.right = self.midpoint
+            self.midpoint = (self.midpoint + self.left) // 2
+        else:
+            self.left = self.midpoint
+            self.midpoint = (self.midpoint + self.right) // 2
+
+    def done(self):
+        """Return whether the current rank is finished communicating.
+        """
+        return self.left + 1 == self.right
+
+# }}}
+
+
+# {{{ MPI launcher
+
+def run_mpi(script: str, num_processes: int, env: dict[str, Any]) -> None:
+    """Launch MPI processes.
+
+    This function forks another process and uses ``mpiexec`` to launch
+    *num_processes* MPI processes running *script*.
+
+    :arg script: the Python script to run.
+    :arg num_processes: the number of MPI processes to launch.
+    :arg env: a :class:`dict` of environment variables.
+    """
+    import os
+    env = {key: str(value) for key, value in env.items()}
+    env = {**os.environ, **env}
+
+    import subprocess
+
+    from mpi4py import MPI
+
+    # Using "-m mpi4py" is necessary for avoiding deadlocks on exception cleanup
+    # See https://mpi4py.readthedocs.io/en/stable/mpi4py.run.html for details.
+
+    mpi_library_name = MPI.Get_library_version()
+    if mpi_library_name.startswith("Open MPI"):
+        command = ["mpiexec", "-np", str(num_processes), "--oversubscribe"]
+        for env_variable_name in env:
+            command.extend(["-x", env_variable_name])
+        command.extend([sys.executable, "-m", "mpi4py", script])
+    else:
+        command = [
+            "mpiexec", "-np", str(num_processes), sys.executable,
+            "-m", "mpi4py", script
+            ]
+
+    subprocess.run(command, env=env, check=True)
+
+# }}}
+
+
+# {{{ HostDeviceArray
+
+class ImmutableHostDeviceArray:
+    """Interface for arrays on both host and device.
+
+    .. note:: This interface assumes the array is immutable. The behavior of
+        modifying the content of either the host array or the device array is
+        undefined.
+
+    @TODO: Once available, replace this with PyOpenCL's in-house implementation.
+    """
+    def __init__(self, queue, array):
+        if not isinstance(array, (np.ndarray, cl.array.Array)):
+            raise TypeError(f"unsupported array type: {type(array).__name__}")
+
+        self.queue = queue
+        self.shape = array.shape
+        is_host = isinstance(array, np.ndarray)
+        self.host_array = array if is_host else None
+        self.device_array = None if is_host else array
+
+    def with_queue(self, queue):
+        """Set the queue used for transfers and return *self* for chaining."""
+        self.queue = queue
+        return self
+
+    @property
+    def svm_capable(self):
+        caps = self.queue.device.get_info(cl.device_info.SVM_CAPABILITIES)
+        return caps & cl.device_svm_capabilities.FINE_GRAIN_BUFFER != 0
+
+    @property
+    def host(self):
+        if self.host_array is None:
+            self.host_array = self.device_array.get(self.queue)
+        return self.host_array
+
+    @property
+    def device(self):
+        if self.device_array is None:
+            # @TODO: Use SVM
+            self.device_array = cl.array.to_device(self.queue, self.host_array)
+
+        # Array.with_queue() returns a new array; it does not rebind in place.
+        return self.device_array.with_queue(self.queue)
+
+# }}}
+
+
+# {{{ coord_vec tools
+
+def get_coord_vec_dtype(
+        coord_dtype: np.dtype, dimensions: int) -> np.dtype:
+    if dimensions == 1:
+        return coord_dtype
+    else:
+        return cltypes.vec_types[coord_dtype, dimensions]
+
+
+def coord_vec_subscript_code(dimensions: int, vec_name: str, iaxis: int) -> str:
+    assert 0 <= iaxis < dimensions
+    if dimensions == 1:
+        # a coord_vec_t is just a scalar
+        return vec_name
+    else:
+        return f"{vec_name}.s{iaxis}"
+
+# }}}
+
+
+# vim: foldmethod=marker
--- a/boxtree/translation_classes.py
+++ b/boxtree/translation_classes.py
+"""
+Translation classes data structure
+----------------------------------
+
+.. autoclass:: TranslationClassesInfo
+
+Build translation classes
+-------------------------
+
+.. autoclass:: TranslationClassesBuilder
+"""
+
+__copyright__ = "Copyright (C) 2019 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+from functools import partial
+
+import numpy as np
+from mako.template import Template
+
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.cltypes
+from pyopencl.elementwise import ElementwiseTemplate
+from pytools import Record, memoize_method
+
+from boxtree.tools import (
+    DeviceDataRecord,
+    InlineBinarySearch,
+    coord_vec_subscript_code,
+    get_coord_vec_dtype,
+)
+from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
+
+
+logger = logging.getLogger(__name__)
+
+from pytools import log_process
+
+
+# {{{ translation classes builder
+
+# OpenCL preamble (a Mako template) for the translation class finder kernel.
+# It defines LEVEL_TO_RAD, get_normalized_translation_vector() and
+# get_translation_class(), followed by the code generated by
+# InlineBinarySearch("box_id_t"). It is rendered in
+# TranslationClassesBuilder.get_kernel_info with *dimensions* and *cvec_sub*.
+TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE = Template(r"""//CL:mako//
+    #define LEVEL_TO_RAD(level) \
+        (root_extent * 1 / (coord_t) (1 << (level + 1)))
+
+    // Return an integer vector indicating the a translation direction
+    // as a multiple of the box diameter.
+    inline int_coord_vec_t get_normalized_translation_vector(
+        coord_t root_extent,
+        int level,
+        coord_vec_t source_center,
+        coord_vec_t target_center)
+    {
+        int_coord_vec_t result = (int_coord_vec_t) 0;
+        coord_t diam = 2 * LEVEL_TO_RAD(level);
+        %for i in range(dimensions):
+            ${cvec_sub("result", i)} = rint(
+                (${cvec_sub("target_center", i)} - ${cvec_sub("source_center", i)})
+                / diam);
+        %endfor
+        return result;
+    }
+
+    // Compute the translation class for the given translation vector.  The
+    // translation class maps a translation vector (a_1, a_2, ..., a_d) into
+    // a dense range of integers [0, ..., (4*n+3)^d - 1], where
+    // d is the dimension and n is well_sep_is_n_away.
+    //
+    // The translation vector should be normalized for a box diameter of 1.
+    //
+    // This relies on the fact that the entries of the vector will
+    // always be in the range [-2n-1,...,2n+1].
+    //
+    // The mapping from vector to class is:
+    //
+    //                         \~~   d                 k-1
+    //     cls(a ,a ,...,a ) =  >      (2n+1+a ) (4n+3)
+    //          1  2      d    /__ k=1        k
+    //
+    // Returns -1 on error.
+    inline int get_translation_class(int_coord_vec_t vec, int well_sep_is_n_away)
+    {
+        int dim_bound = 2 * well_sep_is_n_away + 1;
+        %for i in range(dimensions):
+            if (!(-dim_bound <= ${cvec_sub("vec", i)}
+                && ${cvec_sub("vec", i)} <= dim_bound))
+            {
+                return -1;
+            }
+        %endfor
+
+        int result = 0;
+        int base = 4 * well_sep_is_n_away + 3;
+        int mult = 1;
+        %for i in range(dimensions):
+            result += (2 * well_sep_is_n_away + 1 + ${cvec_sub("vec", i)}) * mult;
+            mult *= base;
+        %endfor
+        return result;
+    }
+    """ + str(InlineBinarySearch("box_id_t")),
+    strict_undefined=True)
+
+
+# Elementwise kernel template; work item *i* indexes into
+# from_sep_siblings_lists. For each entry it locates the owning target box by
+# binary search in from_sep_siblings_starts, checks that the source and target
+# boxes are on the same level, and stores the pair's translation class in
+# translation_classes[i] while marking it in translation_class_is_used.
+# A level mismatch or an out-of-range translation vector sets error_flag and
+# skips the entry instead.
+TRANSLATION_CLASS_FINDER_TEMPLATE = ElementwiseTemplate(
+    arguments=r"""//CL:mako//
+    /* input: */
+    box_id_t *from_sep_siblings_lists,
+    box_id_t *from_sep_siblings_starts,
+    box_id_t *target_or_target_parent_boxes,
+    int ntarget_or_target_parent_boxes,
+    coord_t *box_centers,
+    int aligned_nboxes,
+    coord_t root_extent,
+    box_level_t *box_levels,
+    int well_sep_is_n_away,
+
+    /* output: */
+    int *translation_classes,
+    int *translation_class_is_used,
+    int *error_flag,
+    """,
+
+    operation=TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
+    // Find the target box for this source box.
+    box_id_t source_box_id = from_sep_siblings_lists[i];
+
+    size_t itarget_box = bsearch(
+        from_sep_siblings_starts, 1 + ntarget_or_target_parent_boxes, i);
+
+    box_id_t target_box_id = target_or_target_parent_boxes[itarget_box];
+
+    // Ensure levels are the same.
+    if (box_levels[source_box_id] != box_levels[target_box_id])
+    {
+        atomic_or(error_flag, 1);
+        PYOPENCL_ELWISE_CONTINUE;
+    }
+
+    // Compute the translation vector and translation class.
+    ${load_center("source_center", "source_box_id")}
+    ${load_center("target_center", "target_box_id")}
+
+    int_coord_vec_t vec = get_normalized_translation_vector(
+        root_extent, box_levels[source_box_id], source_center, target_center);
+
+    int translation_class = get_translation_class(vec, well_sep_is_n_away);
+
+    // Ensure valid translation class.
+    if (translation_class == -1)
+    {
+        atomic_or(error_flag, 1);
+        PYOPENCL_ELWISE_CONTINUE;
+    }
+
+    % if translation_class_per_level:
+        translation_class += box_levels[source_box_id] * \
+                                ${ntranslation_classes_per_level};
+    % endif
+
+    translation_classes[i] = translation_class;
+    atomic_or(&translation_class_is_used[translation_class], 1);
+    """)
+
+
+class _KernelInfo(Record):
+    """Record holding the kernels returned by
+    :meth:`TranslationClassesBuilder.get_kernel_info`.
+    """
+
+
+class TranslationClassesInfo(DeviceDataRecord):
+    r"""Translation classes of the *from_sep_siblings* interaction lists,
+    for use by translations that benefit from precomputing
+    distance-related values.
+
+    .. attribute:: nfrom_sep_siblings_translation_classes
+
+       The number of distinct translation classes.
+
+    .. attribute:: from_sep_siblings_translation_classes
+
+        ``int32 [*]``
+
+        A list, corresponding to *from_sep_siblings_lists* of :attr:`traversal`,
+        of the translation classes of each box pair.
+
+    .. attribute:: from_sep_siblings_translation_class_to_distance_vector
+
+        ``coord_vec_t [nfrom_sep_siblings_translation_classes]``
+
+        Maps translation classes in *from_sep_siblings_translation_classes*
+        to distance (translation) vectors from source box center to
+        target box center.
+
+    .. attribute:: from_sep_siblings_translation_classes_level_starts
+
+        ``int32 [nlevels + 1]``
+
+        A list with an entry for each level giving the starting translation
+        class id for that level. Translation classes are numbered contiguously
+        by level.
+
+    .. attribute:: traversal
+
+        A :class:`boxtree.traversal.FMMTraversalInfo` object corresponding to the
+        traversal that these translation classes refer to.
+    """
+
+    def __init__(self, traversal, **kwargs):
+        # All remaining keyword arguments become record fields; the traversal
+        # is kept as a plain attribute next to them.
+        super().__init__(**kwargs)
+        self.traversal = traversal
+
+    def copy(self, **kwargs):
+        """Return a copy of this record with the fields named in *kwargs*
+        replaced. The traversal is carried over unless a ``traversal``
+        keyword is given.
+        """
+        new_traversal = kwargs.pop("traversal", self.traversal)
+        return type(self)(
+                traversal=new_traversal, **self.get_copy_kwargs(**kwargs))
+
+    @property
+    def nfrom_sep_siblings_translation_classes(self):
+        # NOTE(review): TranslationClassesBuilder.__call__ stores this
+        # attribute as a 2D array of shape (dimensions, n); len() of such an
+        # array is presumably the size of its first axis -- confirm that
+        # this is the intended count.
+        distance_vectors = (
+                self.from_sep_siblings_translation_class_to_distance_vector)
+        return len(distance_vectors)
+
+
+class TranslationClassesBuilder:
+    """Build translation classes for List 2 translations.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, context):
+        """
+        :arg context: the OpenCL context that kernels are built for.
+        """
+        self.context = context
+
+    @memoize_method
+    def get_kernel_info(self, dimensions, well_sep_is_n_away,
+            box_id_dtype, box_level_dtype, coord_dtype, translation_class_per_level):
+        """Build the translation class finder kernel for the given
+        configuration. Results are cached per argument tuple by
+        ``memoize_method``.
+
+        :returns: a :class:`_KernelInfo` whose *translation_class_finder*
+            attribute is the built kernel.
+        :raises ValueError: if the number of translation classes of a single
+            level does not fit into a 32-bit integer.
+        """
+        coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
+        int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions)
+
+        num_translation_classes = \
+            self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)
+
+        # Make sure translation classes can fit inside a 32 bit integer.
+        # Class ids are zero-based, so the largest id is the count minus one.
+        if num_translation_classes > 1 + np.iinfo(np.int32).max:
+            raise ValueError("would overflow")
+
+        cvec_sub = partial(coord_vec_subscript_code, dimensions)
+
+        preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render(
+                dimensions=dimensions,
+                cvec_sub=cvec_sub)
+
+        translation_class_finder = (
+                TRANSLATION_CLASS_FINDER_TEMPLATE.build(
+                    self.context,
+                    type_aliases=(
+                        ("int_coord_vec_t", int_coord_vec_dtype),
+                        ("coord_vec_t", coord_vec_dtype),
+                        ("coord_t", coord_dtype),
+                        ("box_id_t", box_id_dtype),
+                        ("box_level_t", box_level_dtype),
+                    ),
+                    var_values=(
+                        ("dimensions", dimensions),
+                        ("ntranslation_classes_per_level", num_translation_classes),
+                        ("translation_class_per_level", translation_class_per_level),
+                        ("cvec_sub", cvec_sub),
+                    ),
+                    more_preamble=preamble))
+
+        return _KernelInfo(translation_class_finder=translation_class_finder)
+
+    @staticmethod
+    def ntranslation_classes_per_level(well_sep_is_n_away, dimensions):
+        """Return the number of possible translation classes on one level,
+        ``(4 * well_sep_is_n_away + 3) ** dimensions``.
+        """
+        return (4 * well_sep_is_n_away + 3) ** dimensions
+
+    def translation_class_to_normalized_vector(self, well_sep_is_n_away,
+            dimensions, cls):
+        """Return the integer translation vector (an ``int32`` array of
+        length *dimensions*) that the level-independent translation class
+        *cls* stands for.
+        """
+        # This computes the vector for the translation class, using the inverse
+        # of the formula found in get_translation_class() defined in
+        # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.
+        assert 0 <= cls < self.ntranslation_classes_per_level(well_sep_is_n_away,
+                                                              dimensions)
+        result = np.zeros(dimensions, dtype=np.int32)
+        shift = 2 * well_sep_is_n_away + 1
+        base = 4 * well_sep_is_n_away + 3
+        for i in range(dimensions):
+            # Peel off one base-(4n+3) digit per axis, lowest axis first.
+            cls, digit = divmod(cls, base)
+            result[i] = digit - shift
+        return result
+
+    def compute_translation_classes(self, queue, trav, tree, wait_for,
+            is_translation_per_level):
+        """
+        Returns a tuple *evt*,  *translation_class_is_used* and
+        *translation_classes_lists*.
+
+        :raises ValueError: if the total number of translation classes does
+            not fit into a 32-bit integer, or if the kernel flags an error
+            (a source/target level mismatch or an invalid translation class).
+        """
+
+        # {{{ compute translation classes for list 2
+
+        well_sep_is_n_away = trav.well_sep_is_n_away
+        dimensions = tree.dimensions
+        coord_dtype = tree.coord_dtype
+
+        knl_info = self.get_kernel_info(
+                dimensions, well_sep_is_n_away, tree.box_id_dtype,
+                tree.box_level_dtype, coord_dtype, is_translation_per_level)
+
+        ntranslation_classes = (
+                self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
+
+        if is_translation_per_level:
+            ntranslation_classes = ntranslation_classes * int(tree.nlevels)
+
+            # The kernel offsets class ids by level using 32-bit arithmetic,
+            # so the total over all levels has to fit as well, not just the
+            # per-level count checked in get_kernel_info.
+            if ntranslation_classes > 1 + np.iinfo(np.int32).max:
+                raise ValueError("would overflow")
+
+        translation_classes_lists = cl.array.empty(
+                queue, len(trav.from_sep_siblings_lists), dtype=np.int32)
+
+        translation_class_is_used = cl.array.zeros(
+                queue, ntranslation_classes, dtype=np.int32)
+
+        error_flag = cl.array.zeros(queue, 1, dtype=np.int32)
+
+        evt = knl_info.translation_class_finder(
+                trav.from_sep_siblings_lists,
+                trav.from_sep_siblings_starts,
+                trav.target_or_target_parent_boxes,
+                trav.ntarget_or_target_parent_boxes,
+                tree.box_centers,
+                tree.aligned_nboxes,
+                tree.root_extent,
+                tree.box_levels,
+                well_sep_is_n_away,
+                translation_classes_lists,
+                translation_class_is_used,
+                error_flag,
+                queue=queue, wait_for=wait_for)
+
+        # The kernel sets the flag on a level mismatch or an invalid class.
+        if error_flag.get():
+            raise ValueError("could not compute translation classes")
+
+        return (evt, translation_class_is_used, translation_classes_lists)
+
+        # }}}
+
+    @log_process(logger, "build m2l translation classes")
+    def __call__(self, queue, trav, tree, wait_for=None,
+                 is_translation_per_level=True):
+        """Returns a pair *info*, *evt* where info is a
+        :class:`TranslationClassesInfo`.
+
+        :arg is_translation_per_level: if true, every level gets its own
+            range of translation classes; otherwise all levels share the
+            same classes and distance vectors are computed as for level 0.
+        """
+        evt, translation_class_is_used, translation_classes_lists = \
+            self.compute_translation_classes(queue, trav, tree, wait_for,
+                                             is_translation_per_level)
+
+        well_sep_is_n_away = trav.well_sep_is_n_away
+        dimensions = tree.dimensions
+
+        # Host copy of the usage flags, transferred once for the loop below.
+        is_used_host = translation_class_is_used.get()
+
+        # Maps a raw class id to its compacted id; -1 marks unused classes.
+        used_translation_classes_map = np.full(
+                len(is_used_host), -1, dtype=np.int32)
+
+        # Zero-initialized because only the first *count* columns are written
+        # below; the remaining columns must not carry uninitialized memory
+        # to the device.
+        distances = np.zeros((dimensions, len(is_used_host)),
+                             dtype=tree.coord_dtype)
+        num_translation_classes = \
+            self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)
+
+        nlevels = tree.nlevels
+        count = 0
+        prev_level = -1
+        from_sep_siblings_translation_classes_level_starts = \
+            np.empty(nlevels+1, dtype=np.int32)
+        for i, used in enumerate(is_used_host):
+            level, cls_without_level = divmod(i, num_translation_classes)
+            if prev_level != level:
+                from_sep_siblings_translation_classes_level_starts[level] = count
+                prev_level = level
+
+            if not used:
+                continue
+
+            used_translation_classes_map[i] = count
+            unit_vector = self.translation_class_to_normalized_vector(
+                            well_sep_is_n_away, dimensions, cls_without_level)
+            # root_extent / 2**level is the box diameter on that level.
+            distances[:, count] = unit_vector * tree.root_extent / (1 << level)
+            count = count + 1
+
+        # Levels the loop never reached own no classes (that is every level
+        # but 0 when is_translation_per_level is false), so their start, like
+        # the end sentinel at index nlevels, is the total count. This also
+        # makes sure no entry of the np.empty array is left uninitialized.
+        from_sep_siblings_translation_classes_level_starts[prev_level + 1:] = count
+
+        # Renumber the per-pair classes to the compacted (used-only) ids.
+        translation_classes_lists = (
+                cl.array.take(
+                    cl.array.to_device(queue, used_translation_classes_map),
+                    translation_classes_lists))
+
+        distances = cl.array.to_device(queue, distances)
+        from_sep_siblings_translation_classes_level_starts = cl.array.to_device(
+            queue, from_sep_siblings_translation_classes_level_starts)
+
+        info = TranslationClassesInfo(
+                traversal=trav,
+                from_sep_siblings_translation_classes=translation_classes_lists,
+                from_sep_siblings_translation_class_to_distance_vector=distances,
+                from_sep_siblings_translation_classes_level_starts=(
+                    from_sep_siblings_translation_classes_level_starts),
+                ).with_queue(None)
+
+        return info, evt
+
+# }}}
+
+# vim: fdm=marker
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
-from __future__ import division
+"""
+Traversal data structure
+------------------------
+
+.. autoclass:: FMMTraversalInfo
+
+Build Entrypoint
+----------------
+
+.. autoclass:: FMMTraversalBuilder
+
+    .. automethod:: __call__
+"""

 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

@@ -22,66 +34,50 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import logging
+from functools import partial
+
 import numpy as np
-from pytools import Record, memoize_method, memoize_in
-import pyopencl as cl
-import pyopencl.array  # noqa
-from pyopencl.elementwise import ElementwiseTemplate
 from mako.template import Template
-from boxtree.tools import AXIS_NAMES, DeviceDataRecord
-
-import logging
-logger = logging.getLogger(__name__)

+import pyopencl as cl
+import pyopencl.array
+import pyopencl.cltypes
+from pyopencl.elementwise import ElementwiseTemplate
+from pytools import Record, memoize_method

-# {{{ preamble
+from boxtree.tools import (
+    AXIS_NAMES,
+    DeviceDataRecord,
+    coord_vec_subscript_code,
+    get_coord_vec_dtype,
+)

-TRAVERSAL_PREAMBLE_TEMPLATE = r"""//CL//
-${box_flags_enum.get_c_defines()}
-${box_flags_enum.get_c_typedef()}

-typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
-%if particle_id_dtype is not None:
-    typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
-%endif
-typedef ${dtype_to_ctype(coord_dtype)} coord_t;
-typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
+logger = logging.getLogger(__name__)

-#define NLEVELS ${max_levels}
-#define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor})
+from pytools import ProcessLogger, log_process

-<%def name="load_center(name, box_id)">
-    coord_vec_t ${name};
-    %for i in range(dimensions):
-        ${name}.${AXIS_NAMES[i]} = box_centers[aligned_nboxes * ${i} + ${box_id}];
-    %endfor
-</%def>

-#define LEVEL_TO_RAD(level) \
-        (root_extent * 1 / (coord_t) (1 << (level + 1)))
+# {{{ preamble

-%if 0:
-    #define dbg_printf(ARGS) printf ARGS
-%else:
-    #define dbg_printf(ARGS) /* */
-%endif
+# This 'walk' mechanism walks over 'child' boxes in the tree.

+TRAVERSAL_PREAMBLE_MAKO_DEFS = r"""//CL:mako//
 <%def name="walk_init(start_box_id)">
-    box_id_t box_stack[NLEVELS];
-    int morton_nr_stack[NLEVELS];
+    box_id_t walk_box_stack[NLEVELS];
+    int walk_morton_nr_stack[NLEVELS];

    // start at root
-    int walk_level = 0;
-    box_id_t walk_box_id = ${start_box_id};
+    int walk_stack_size = 0;
+    box_id_t walk_parent_box_id = ${start_box_id};
    int walk_morton_nr = 0;
    bool continue_walk = true;
 </%def>

-<%def name="walk_reset(start_box_id)">
-    walk_level = 0;
-    walk_box_id = ${start_box_id};
-    walk_morton_nr = 0;
-    continue_walk = true;
+<%def name="walk_get_box_id()">
+    box_id_t walk_box_id = box_child_ids[
+        walk_morton_nr * aligned_nboxes + walk_parent_box_id];
 </%def>

 <%def name="walk_advance()">
@@ -94,13 +90,17 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
        // Ran out of children, pull the next guy off the stack
        // and advance him.

-        continue_walk = walk_level > 0;
+        continue_walk = (
+            // Stack empty? Abort.
+            walk_stack_size > 0
+            );
+
        if (continue_walk)
        {
-            --walk_level;
+            --walk_stack_size;
            dbg_printf(("    ascend\n"));
-            walk_box_id = box_stack[walk_level];
-            walk_morton_nr = morton_nr_stack[walk_level];
+            walk_parent_box_id = walk_box_stack[walk_stack_size];
+            walk_morton_nr = walk_morton_nr_stack[walk_stack_size];
        }
        else
        {
@@ -111,87 +111,178 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
 </%def>

 <%def name="walk_push(new_box)">
-    box_stack[walk_level] = walk_box_id;
-    morton_nr_stack[walk_level] = walk_morton_nr;
-    ++walk_level;
+    walk_box_stack[walk_stack_size] = walk_parent_box_id;
+    walk_morton_nr_stack[walk_stack_size] = walk_morton_nr;
+    ++walk_stack_size;

    %if debug:
-    if (walk_level >= NLEVELS)
+    if (walk_stack_size >= NLEVELS)
    {
        dbg_printf(("  ** ERROR: overran levels stack\n"));
        return;
    }
    %endif

-    walk_box_id = ${new_box};
+    walk_parent_box_id = ${new_box};
    walk_morton_nr = 0;
 </%def>

+<%def name="load_center(name, box_id, declare=True)">
+    %if declare:
+        coord_vec_t ${name} = (coord_vec_t)(
+    %else:
+        ${name} = (coord_vec_t)(
+    %endif
+        %for i in range(dimensions):
+            box_centers[aligned_nboxes * ${i} + ${box_id}]
+            %if i + 1 < dimensions:
+                ,
+            %endif
+        %endfor
+        );
+</%def>
+
+<%def name="load_true_box_extent(name, box_id, kind, declare=True)">
+    %if declare:
+        coord_vec_t ${name}_ext_center, ${name}_radii_vec;
+    %endif
+
+    {
+        %for bound in ["min", "max"]:
+                coord_vec_t ${name}_${bound} = (coord_vec_t)(
+                %for iaxis in range(dimensions):
+                    box_${kind}_bounding_box_${bound}[
+                        ${iaxis} * aligned_nboxes + ${box_id}]
+                    %if iaxis + 1 < dimensions:
+                        ,
+                    %endif
+                %endfor
+                );
+        %endfor
+
+        ${name}_ext_center = ((coord_vec_t) 0.5) * (${name}_min + ${name}_max);
+        ${name}_radii_vec = ((coord_vec_t) 0.5) * (${name}_max - ${name}_min);
+    }
+</%def>
+
 <%def name="check_l_infty_ball_overlap(
        is_overlapping, box_id, ball_radius, ball_center)">
    {
        ${load_center("box_center", box_id)}
        int box_level = box_levels[${box_id}];
-
        coord_t size_sum = LEVEL_TO_RAD(box_level) + ${ball_radius};
-
        coord_t max_dist = 0;
        %for i in range(dimensions):
            max_dist = fmax(max_dist,
-                fabs(${ball_center}.s${i} - box_center.s${i}));
+                fabs(${cvec_sub(ball_center, i)}
+                    - ${cvec_sub("box_center", i)}));
        %endfor
-
        ${is_overlapping} = max_dist <= size_sum;
    }
 </%def>
+"""
+
+
+TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES = r"""//CL//
+${box_flags_enum.get_c_defines()}
+${box_flags_enum.get_c_typedef()}
+
+typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
+%if particle_id_dtype is not None:
+    typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
+%endif
+## Convert to dict first, as this may be passed as a tuple-of-tuples.
+typedef ${dtype_to_ctype(coord_dtype)} coord_t;
+typedef ${dtype_to_ctype(get_coord_vec_dtype(coord_dtype, dimensions))} coord_vec_t;
+
+#define COORD_T_MACH_EPS ((coord_t) ${ repr(float(np.finfo(coord_dtype).eps)) })
+
+#define NLEVELS ${max_levels}
+
+#define LEVEL_TO_RAD(level) \
+        (root_extent * 1 / (coord_t) (1 << (level + 1)))
+
+%if 0:
+    #define dbg_printf(ARGS) printf ARGS
+%else:
+    #define dbg_printf(ARGS) /* */
+%endif

+#define square(x) ((x)*(x))
 """

+
+TRAVERSAL_PREAMBLE_TEMPLATE = (
+    TRAVERSAL_PREAMBLE_MAKO_DEFS
+    + TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES)
+
 # }}}

 # {{{ adjacency test

 HELPER_FUNCTION_TEMPLATE = r"""//CL//

-inline bool is_adjacent_or_overlapping(
+/*
+These adjacency tests check the l^\infty distance between centers to check whether
+two boxes are adjacent or overlapping.
+
+Rather than a 'small floating point number', these adjacency test routines use the
+smaller of the source/target box radii as the floating point tolerance, which
+calls the following configuration 'adjacent' even though it actually is not:
+
+    +---------+     +---------+
+    |         |     |         |
+    |         |     |         |
+    |    o    |     |    o<--->
+    |         |  r  |       r |
+    |         |<--->|         |
+    +---------+     +---------+
+
+This is generically OK since one would expect the distance between the edge of
+a large box and the edge of a smaller box to be an integer multiple of the
+smaller box's diameter (which is twice its radius, our tolerance).
+*/
+
+
+inline bool is_adjacent_or_overlapping_with_neighborhood(
    coord_t root_extent,
-    // target and source order only matter if include_stick_out is true.
    coord_vec_t target_center, int target_level,
-    coord_vec_t source_center, int source_level,
-    // this is expected to be constant so that the inliner will kill the if.
-    const bool include_stick_out
-    )
+    coord_t target_box_neighborhood_size,
+    coord_vec_t source_center, int source_level)
 {
-    // This checks if the two boxes overlap
-    // with an amount of 'slack' corresponding to half the
-    // width of the smaller of the two boxes.
-    // (Without the 'slack', there wouldn't be any
-    // overlap.)
+    // This checks if the source box overlaps the target box
+    // including a neighborhood of target_box_neighborhood_size boxes
+    // of the same size as the target box.

    coord_t target_rad = LEVEL_TO_RAD(target_level);
    coord_t source_rad = LEVEL_TO_RAD(source_level);
-    coord_t rad_sum = target_rad + source_rad;
+    coord_t rad_sum = (
+        (2*(target_box_neighborhood_size-1) + 1) * target_rad
+        + source_rad);
    coord_t slack = rad_sum + fmin(target_rad, source_rad);

-    if (include_stick_out)
-    {
-        slack += STICK_OUT_FACTOR * (
-            0
-            %if targets_have_extent:
-                + target_rad
-            %endif
-            %if sources_have_extent:
-                + source_rad
-            %endif
-            );
-    }
-
-    coord_t max_dist = 0;
+    coord_t l_inf_dist = 0;
    %for i in range(dimensions):
-        max_dist = fmax(max_dist, fabs(target_center.s${i} - source_center.s${i}));
+        l_inf_dist = fmax(
+            l_inf_dist,
+            fabs(${cvec_sub("target_center", i)}
+                - ${cvec_sub("source_center", i)}));
    %endfor

-    return max_dist <= slack;
+    return l_inf_dist <= slack;
+}
+
+inline bool is_adjacent_or_overlapping(
+    coord_t root_extent,
+    // note: order does not matter
+    coord_vec_t target_center, int target_level,
+    coord_vec_t source_center, int source_level)
+{
+    return is_adjacent_or_overlapping_with_neighborhood(
+        root_extent,
+        target_center, target_level,
+        1,
+        source_center, source_level);
 }

 """
@@ -206,17 +297,27 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
 {
    box_flags_t flags = box_flags[box_id];

-    if (flags & BOX_HAS_OWN_SOURCES)
-    { APPEND_source_boxes(box_id); }
+    %if source_boxes_has_mask:
+        if (flags & BOX_IS_SOURCE_BOX && source_boxes_mask[box_id])
+        { APPEND_source_boxes(box_id); }
+    %else:
+        if (flags & BOX_IS_SOURCE_BOX)
+        { APPEND_source_boxes(box_id); }
+    %endif

-    if (flags & BOX_HAS_CHILD_SOURCES)
-    { APPEND_source_parent_boxes(box_id); }
+    %if source_parent_boxes_has_mask:
+        if (flags & BOX_HAS_SOURCE_CHILD_BOXES && source_parent_boxes_mask[box_id])
+        { APPEND_source_parent_boxes(box_id); }
+    %else:
+        if (flags & BOX_HAS_SOURCE_CHILD_BOXES)
+        { APPEND_source_parent_boxes(box_id); }
+    %endif

    %if not sources_are_targets:
-        if (flags & BOX_HAS_OWN_TARGETS)
+        if (flags & BOX_IS_TARGET_BOX)
        { APPEND_target_boxes(box_id); }
    %endif
-    if (flags & (BOX_HAS_CHILD_TARGETS | BOX_HAS_OWN_TARGETS))
+    if (flags & (BOX_HAS_TARGET_CHILD_BOXES | BOX_IS_TARGET_BOX))
    { APPEND_target_or_target_parent_boxes(box_id); }
 }
 """
@@ -238,21 +339,31 @@ LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE = ElementwiseTemplate(
        // assert(i > 0);

        box_id_t my_box_id = box_list[i];
-        box_id_t prev_box_id = box_list[i-1];
-
        int my_level = box_levels[my_box_id];
-        box_id_t my_level_start = level_start_box_nrs[my_level];

-        if (prev_box_id < my_level_start && my_level_start <= my_box_id)
+        bool is_level_leading_box;
+        if (i == 0)
+            is_level_leading_box = true;
+        else
+        {
+            box_id_t prev_box_id = box_list[i-1];
+            box_id_t my_level_start = level_start_box_nrs[my_level];
+
+            is_level_leading_box = (
+                    prev_box_id < my_level_start
+                    && my_level_start <= my_box_id);
+        }
+
+        if (is_level_leading_box)
            list_level_start_box_nrs[my_level] = i;
    """,
    name="extract_level_start_box_nrs")

 # }}}

-# {{{ colleagues
+# {{{ same-level non-well-separated boxes (generalization of "colleagues")

-COLLEAGUES_TEMPLATE = r"""//CL//
+SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE = r"""//CL//

 void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
 {
@@ -260,7 +371,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)

    if (box_id == 0)
    {
-        // The root has no colleagues.
+        // The root has no boxes on the same level, nws or not.
        return;
    }

@@ -268,31 +379,34 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)

    dbg_printf(("box id: %d level: %d\n", box_id, level));

-    // To find this box's colleagues, start at the top of the tree, descend
+    // To find this box's same-level nws boxes, start at the top of the tree, descend
    // into adjacent (or overlapping) parents.
    ${walk_init(0)}

    while (continue_walk)
    {
-        box_id_t child_box_id = box_child_ids[
-                walk_morton_nr * aligned_nboxes + walk_box_id];
-        dbg_printf(("  level: %d walk box id: %d morton: %d child id: %d\n",
-            walk_level, walk_box_id, walk_morton_nr, child_box_id));
+        ${walk_get_box_id()}
+
+        dbg_printf(("  level: %d walk parent box id: %d morton: %d child id: %d\n",
+            walk_stack_size, walk_parent_box_id, walk_morton_nr, walk_box_id));

-        if (child_box_id)
+        if (walk_box_id)
        {
-            ${load_center("child_center", "child_box_id")}
+            ${load_center("walk_center", "walk_box_id")}

-            bool a_or_o = is_adjacent_or_overlapping(root_extent,
-                center, level, child_center, box_levels[child_box_id], false);
+            bool a_or_o = is_adjacent_or_overlapping_with_neighborhood(
+                    root_extent,
+                    center, level,
+                    ${well_sep_is_n_away},
+                    walk_center, box_levels[walk_box_id]);

            if (a_or_o)
            {
-                // child_box_id lives on walk_level+1.
-                if (walk_level+1 == level  && child_box_id != box_id)
+                // walk_box_id lives on level walk_stack_size+1.
+                if (walk_stack_size+1 == level && walk_box_id != box_id)
                {
-                    dbg_printf(("    colleague\n"));
-                    APPEND_colleagues(child_box_id);
+                    dbg_printf(("    found same-lev nws\n"));
+                    APPEND_same_level_non_well_sep_boxes(walk_box_id);
                }
                else
                {
@@ -300,7 +414,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
                    // on the stack.

                    dbg_printf(("    descend\n"));
-                    ${walk_push("child_box_id")}
+                    ${walk_push("walk_box_id")}

                    continue;
                }
@@ -321,7 +435,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)

 # {{{ neighbor source boxes ("list 1")

-NEIGBHOR_SOURCE_BOXES_TEMPLATE = r"""//CL//
+NEIGHBOR_SOURCE_BOXES_TEMPLATE = r"""//CL//

 void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
 {
@@ -342,50 +456,51 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)

    {
        box_flags_t root_flags = box_flags[0];
-        if (root_flags & BOX_HAS_OWN_SOURCES)
+        if (root_flags & BOX_IS_SOURCE_BOX)
        {
            APPEND_neighbor_source_boxes(0);
        }
    }

-    // To find this box's colleagues, start at the top of the tree, descend
+    // To find this box's adjacent boxes, start at the top of the tree, descend
    // into adjacent (or overlapping) parents.
    ${walk_init(0)}

    while (continue_walk)
    {
-        box_id_t child_box_id = box_child_ids[
-                walk_morton_nr * aligned_nboxes + walk_box_id];
+        ${walk_get_box_id()}

-        dbg_printf(("  walk box id: %d morton: %d child id: %d level: %d\n",
-            walk_box_id, walk_morton_nr, child_box_id, walk_level));
+        dbg_printf(("  walk parent box id: %d morton: %d child id: %d level: %d\n",
+            walk_parent_box_id, walk_morton_nr, walk_box_id, walk_stack_size));

-        if (child_box_id)
+        if (walk_box_id)
        {
-            ${load_center("child_center", "child_box_id")}
+            ${load_center("walk_center", "walk_box_id")}

-            bool a_or_o = is_adjacent_or_overlapping(root_extent,
-                center, level, child_center, box_levels[child_box_id], false);
+            bool a_or_o = is_adjacent_or_overlapping(
+                root_extent,
+                center, level,
+                walk_center, box_levels[walk_box_id]);

            if (a_or_o)
            {
-                box_flags_t flags = box_flags[child_box_id];
-                /* child_box_id == box_id is ok */
-                if (flags & BOX_HAS_OWN_SOURCES)
+                box_flags_t flags = box_flags[walk_box_id];
+                /* walk_box_id == box_id is ok */
+                if (flags & BOX_IS_SOURCE_BOX)
                {
                    dbg_printf(("    neighbor source box\n"));

-                    APPEND_neighbor_source_boxes(child_box_id);
+                    APPEND_neighbor_source_boxes(walk_box_id);
                }

-                if (flags & BOX_HAS_CHILD_SOURCES)
+                if (flags & BOX_HAS_SOURCE_CHILD_BOXES)
                {
                    // We want to descend into this box. Put the current state
                    // on the stack.

                    dbg_printf(("    descend\n"));

-                    ${walk_push("child_box_id")}
+                    ${walk_push("walk_box_id")}

                    continue;
                }
@@ -404,9 +519,9 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)

 # }}}

-# {{{ well-separated siblings ("list 2")
+# {{{ from well-separated siblings ("list 2")

-SEP_SIBLINGS_TEMPLATE = r"""//CL//
+FROM_SEP_SIBLINGS_TEMPLATE = r"""//CL//

 void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
 {
@@ -420,27 +535,33 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
    if (parent == box_id)
        return;

-    box_id_t parent_coll_start = colleagues_starts[parent];
-    box_id_t parent_coll_stop = colleagues_starts[parent+1];
+    box_id_t parent_slnf_start = same_level_non_well_sep_boxes_starts[parent];
+    box_id_t parent_slnf_stop = same_level_non_well_sep_boxes_starts[parent+1];

-    // /!\ i is not a box_id, it's an index into colleagues_list.
-    for (box_id_t i = parent_coll_start; i < parent_coll_stop; ++i)
+    // /!\ i is not a box_id, it's an index into same_level_non_well_sep_boxes_lists.
+    for (box_id_t i = parent_slnf_start; i < parent_slnf_stop; ++i)
    {
-        box_id_t parent_colleague = colleagues_list[i];
+        box_id_t parent_nf = same_level_non_well_sep_boxes_lists[i];

        for (int morton_nr = 0; morton_nr < ${2**dimensions}; ++morton_nr)
        {
            box_id_t sib_box_id = box_child_ids[
-                    morton_nr * aligned_nboxes + parent_colleague];
+                    morton_nr * aligned_nboxes + parent_nf];
+
+            if (sib_box_id == 0)
+                continue;

            ${load_center("sib_center", "sib_box_id")}

-            bool sep = !is_adjacent_or_overlapping(root_extent,
-                center, level, sib_center, box_levels[sib_box_id], false);
+            bool sep = !is_adjacent_or_overlapping_with_neighborhood(
+                root_extent,
+                center, level,
+                ${well_sep_is_n_away},
+                sib_center, box_levels[sib_box_id]);

            if (sep)
            {
-                APPEND_sep_siblings(sib_box_id);
+                APPEND_from_sep_siblings(sib_box_id);
            }
        }
    }
@@ -449,117 +570,272 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)

 # }}}

-# {{{ separated smaller ("list 3")
+# {{{ from separated smaller ("list 3")

-SEP_SMALLER_TEMPLATE = r"""//CL//
+FROM_SEP_SMALLER_TEMPLATE = r"""//CL//

 void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
 {
    // /!\ target_box_number is *not* a box_id, despite the type.
    // It's the number of the target box we're currently processing.

-    box_id_t box_id = target_boxes[target_box_number];
+    box_id_t tgt_box_id = target_boxes[target_box_number];

-    ${load_center("center", "box_id")}
+    ${load_center("tgt_center", "tgt_box_id")}

-    int level = box_levels[box_id];
+    int tgt_level = box_levels[tgt_box_id];
+
+    %if targets_have_extent:
+        %if from_sep_smaller_crit in ["static_linf", "static_l2"]:
+            coord_t tgt_stickout_l_inf_rad =
+                (1 + stick_out_factor) * LEVEL_TO_RAD(tgt_level);

-    box_id_t coll_start = colleagues_starts[box_id];
-    box_id_t coll_stop = colleagues_starts[box_id+1];
+        %elif from_sep_smaller_crit == "precise_linf":
+            ${load_true_box_extent("tgt", "tgt_box_id", "target")}
+            // defines tgt_ext_center, tgt_radii_vec

-    // /!\ i is not a box_id, it's an index into colleagues_list.
-    for (box_id_t i = coll_start; i < coll_stop; ++i)
+        %endif
+    %endif
+
+    box_id_t slnws_start = same_level_non_well_sep_boxes_starts[tgt_box_id];
+    box_id_t slnws_stop = same_level_non_well_sep_boxes_starts[tgt_box_id+1];
+
+    // /!\ i is not a box_id, it's an index into same_level_non_well_sep_boxes_lists.
+    for (box_id_t i = slnws_start; i < slnws_stop; ++i)
    {
-        box_id_t colleague = colleagues_list[i];
+        box_id_t same_lev_nws_box = same_level_non_well_sep_boxes_lists[i];
+
+        if (same_lev_nws_box == tgt_box_id)
+            continue;

-        ${walk_init("colleague")}
+        // Colleagues (same-level NWS boxes) for 1-away are always adjacent, so
+        // we always want to descend into them. For 2-away, we may already
+        // satisfy the criteria for being in list 3 and therefore may never
+        // need to descend. Hence include the start box in the search here
+        // if we're in the two-or-more-away case.
+        ${walk_init("same_lev_nws_box")}

        while (continue_walk)
        {
-            // Loop invariant: walk_box_id is, at first, always adjacent to box_id.
-            // This is true at the first level because colleagues are by adjacent
+            // Loop invariant:
+            // walk_parent_box_id is, at first, always adjacent to tgt_box_id.
+            //
+            // This is true at the first level because colleagues are adjacent
            // by definition, and is kept true throughout the walk by only descending
            // into adjacent boxes.
            //
            // As we descend, we may find a child of an adjacent box that is
-            // non-adjacent to box_id.
+            // non-adjacent to tgt_box_id.
            //
            // If neither sources nor targets have extent, then that
-            // nonadjacent child box is added to box_id's sep_smaller ("list 3
-            // far") and that's it.
+            // nonadjacent child box is added to tgt_box_id's from_sep_smaller
+            // ("list 3 far") and that's it.
            //
            // If they have extent, then while they may be separated, the
-            // intersection of box_id's and the child box's stick-out region
+            // intersection of tgt_box_id's and the child box's stick-out region
            // may be non-empty, and we thus need to add that child to
-            // sep_close_smaller ("list 3 close") for the interaction to be
+            // from_sep_close_smaller ("list 3 close") for the interaction to be
            // done by direct evaluation. We also need to descend into that
            // child.

-            box_id_t child_box_id = box_child_ids[
-                    walk_morton_nr * aligned_nboxes + walk_box_id];
+            ${walk_get_box_id()}

-            dbg_printf(("  walk box id: %d morton: %d child id: %d\n",
-                walk_box_id, walk_morton_nr, child_box_id));
+            dbg_printf(("  walk parent box id: %d morton: %d child id: %d\n",
+                walk_parent_box_id, walk_morton_nr, walk_box_id));

-            box_flags_t child_box_flags = box_flags[child_box_id];
+            box_flags_t child_box_flags = box_flags[walk_box_id];

-            if (child_box_id &&
+            if (walk_box_id &&
                    (child_box_flags &
-                            (BOX_HAS_OWN_SOURCES | BOX_HAS_CHILD_SOURCES)))
+                            (BOX_IS_SOURCE_BOX | BOX_HAS_SOURCE_CHILD_BOXES)))
            {
-                ${load_center("child_center", "child_box_id")}
+                ${load_center("walk_center", "walk_box_id")}
+
+                int walk_level = box_levels[walk_box_id];

-                bool a_or_o = is_adjacent_or_overlapping(root_extent,
-                    center, level, child_center, box_levels[child_box_id], false);
+                bool in_list_1 = is_adjacent_or_overlapping(root_extent,
+                    tgt_center, tgt_level, walk_center, walk_level);

-                if (a_or_o)
+                if (in_list_1)
                {
-                    if (child_box_flags & BOX_HAS_CHILD_SOURCES)
+                    if (child_box_flags & BOX_HAS_SOURCE_CHILD_BOXES)
                    {
                        // We want to descend into this box. Put the current state
                        // on the stack.

-                        ${walk_push("child_box_id")}
-                        continue;
+                        if (walk_level <= from_sep_smaller_source_level
+                                || from_sep_smaller_source_level == -1)
+                        {
+                            ${walk_push("walk_box_id")}
+                            continue;
+                        }
+                        // otherwise there's no point to descending further.
                    }
                }
                else
                {
-                    %if sources_have_extent or targets_have_extent:
-                        const bool a_or_o_with_stick_out =
-                            is_adjacent_or_overlapping(root_extent,
-                                center, level, child_center,
-                                box_levels[child_box_id], true);
+                    bool meets_sep_crit;
+
+                    <% assert not sources_have_extent %>
+
+                    %if not targets_have_extent:
+                        meets_sep_crit = true;
+
+                    %elif from_sep_smaller_crit == "static_linf":
+                        {
+                            coord_t source_rad = LEVEL_TO_RAD(walk_level);
+
+                            // l^infty distance between source box and target box.
+                            // Negative indicates overlap.
+                            coord_t l_inf_dist = 0;
+                            %for i in range(dimensions):
+                                l_inf_dist = fmax(
+                                    l_inf_dist,
+                                    fabs(${cvec_sub("tgt_center", i)}
+                                     - ${cvec_sub("walk_center", i)})
+                                    - tgt_stickout_l_inf_rad
+                                    - source_rad);
+                            %endfor
+
+                            meets_sep_crit = l_inf_dist >=
+                                (2 - 8 * COORD_T_MACH_EPS) * source_rad;
+                        }
+
+                    %elif from_sep_smaller_crit == "precise_linf":
+                        {
+                            coord_t source_rad = LEVEL_TO_RAD(walk_level);
+
+                            // l^infty distance between source box and target box.
+                            // Negative indicates overlap.
+                            coord_t l_inf_dist = 0;
+                            %for i in range(dimensions):
+                                l_inf_dist = fmax(
+                                    l_inf_dist,
+                                    fabs(
+                                        ${cvec_sub("tgt_ext_center", i)}
+                                        - ${cvec_sub("walk_center", i)}
+                                        )
+                                    - ${cvec_sub("tgt_radii_vec", i)}
+                                    - source_rad);
+                            %endfor
+
+                            meets_sep_crit = l_inf_dist >=
+                                (2 - 8 * COORD_T_MACH_EPS) * source_rad;
+                        }
+
+                    %elif from_sep_smaller_crit == "static_l2":
+                        {
+                            coord_t source_l_inf_rad = LEVEL_TO_RAD(walk_level);
+
+                            // Squared l^2 distance between src and tgt box centers.
+                            coord_t l_2_squared_center_dist =
+                                0
+                                %for i in range(dimensions):
+                                    + square(
+                                        ${cvec_sub("tgt_center", i)}
+                                        - ${cvec_sub("walk_center", i)})
+                                %endfor
+                                ;
+
+                            <% assert not sources_have_extent %>
+
+                            // We're considering convergence of a multipole
+                            // in the (square) source box at all locations
+                            // in the (round) target box. We need
+
+                            // src_box_l2_radius
+                            //    / d_2(src_box_center, tgt_box) <= sqrt(d)/3
+
+                            // <=>
+
+                            // src_box_linf_radius * sqrt(d)
+                            //    / d_2(src_box_center, tgt_box) <= sqrt(d)/3
+
+                            // <=>
+
+                            // 3 * src_box_linf_radius
+                            //    <= d_2(src_box_center, tgt_box)
+
+                            // <=>
+
+                            // 3 * src_box_linf_radius
+                            //    <= d_2(src_box_center, tgt_box_center)
+                            //    - sqrt(d) * tgt_stickout_l_inf_rad
+
+                            // <=> (because why not)
+
+                            // 2 * src_box_linf_radius
+                            //    <= d_2(src_box_center, tgt_box_center)
+                            //    - sqrt(d) * tgt_stickout_l_inf_rad
+                            //    - src_box_linf_radius
+
+                            coord_t rhs =
+                                sqrt(l_2_squared_center_dist)
+                                - sqrt((coord_t) (${dimensions}))
+                                    * tgt_stickout_l_inf_rad
+                                - source_l_inf_rad;
+
+                            meets_sep_crit = (
+                                (2 - 8 * COORD_T_MACH_EPS) * source_l_inf_rad
+                                <= rhs);
+                        }
+
                    %else:
-                        const bool a_or_o_with_stick_out = false;
+                        <% raise ValueError(
+                            "unknown value of from_sep_smaller_crit: %s"
+                            % from_sep_smaller_crit) %>
                    %endif

                    // We're no longer *immediately* adjacent to our target
                    // box, but our stick-out regions might still have a
                    // non-empty intersection.

-                    if (!a_or_o_with_stick_out)
+                    // If the number of particles in this box is below the
+                    // source count threshold, it can be moved to a "close" list.
+                    // This is a performance optimization.
+
+                    <% close_lists_exist  = \
+                        sources_have_extent or targets_have_extent %>
+                    bool close_lists_exist = ${ str(close_lists_exist).lower() };
+
+                    bool force_close_list_for_low_interaction_count =
+                    %if close_lists_exist:
+                        close_lists_exist &&
+                        (box_source_counts_cumul[walk_box_id]
+                            < from_sep_smaller_min_nsources_cumul);
+                    %else:
+                        false;
+                    %endif
+
+                    if (meets_sep_crit &&
+                        !force_close_list_for_low_interaction_count)
                    {
-                        APPEND_sep_smaller(child_box_id);
+                        if (from_sep_smaller_source_level == walk_level)
+                            APPEND_from_sep_smaller(walk_box_id);
                    }
                    else
                    {
                    %if sources_have_extent or targets_have_extent:
-                        if (child_box_flags & BOX_HAS_OWN_SOURCES)
-                        {
-                            APPEND_sep_close_smaller(child_box_id);
-                        }
-
-                        if (child_box_flags & BOX_HAS_CHILD_SOURCES)
+                        // from_sep_smaller_source_level == -1 means "only build
+                        // list 3 close", with sources on any level.
+                        // This kernel will be run once per source level to
+                        // generate per-level list 3, and once
+                        // (not per level) to generate list 3 close.
+
+                        if (
+                               (child_box_flags & BOX_IS_SOURCE_BOX)
+                               && (from_sep_smaller_source_level == -1))
+                            APPEND_from_sep_close_smaller(walk_box_id);
+
+                        if (child_box_flags & BOX_HAS_SOURCE_CHILD_BOXES)
                        {
-                            ${walk_push("child_box_id")}
+                            ${walk_push("walk_box_id")}
                            continue;
                        }
                    %endif
                    }
                }
            }
-
            ${walk_advance()}
        }
    }
@@ -568,167 +844,266 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)

 # }}}

-# {{{ separated bigger ("list 4")
+# {{{ from separated bigger ("list 4")

-# "Normal" case: Sources/targets without extent
-# ---------------------------------------------
-#
-# List 4 interactions for box "B" are about a parent P's colleague A not
-# adjacent to B.
-#
-# -------|----------|----------|
-# Case   |    1     |    2     |
-#        | adj to A | adj to A |
-# -------|----------|----------|
-#        |          |          |
-# A---P  |    X !   |    X !   |
-#     |  |          |          |
-#     o  |    X     |    X     |
-#     |  |          |          |
-#     o  |    X     |    X     |
-#     |  |          |          |
-#     o  |    X     |    O     |
-#     |  |          |          |
-#     B  |    O !   |    O !   |
+# List 4 consists of source boxes that 'missed the boat' on entering the downward
+# propagation through list 2. That is, they are non-well-separated from the
+# target box itself or a box in its chain of parents. In addition, they are
+# not adjacent to the target box and have the same size or are bigger.
 #
-# Note that once a parent is no longer adjacent, its children won't be either.
+# To be in list 4, a box must have its own sources. In the no-extents case,
+# this will happen only if that box is a leaf, but for the with-extents case,
+# any box can have sources.
 #
-# (X: yes, O:no, exclamation marks denote that this *must* be the case. Entries
-# without exclamation mark are choices for this case)
+# (Yes, you read that right--same-level non-well-separated boxes *can* be in
+# list 4, although only for 2+-away. They *could* also use list 3, but that
+# would be less efficient because it would not make use of the downward
+# propagation.)
 #
-# Case 1: A->B interaction enters the downward propagation at B, i.e. A is in
-#    B's "sep_bigger". (list 4)
-#
-# Case 2: A->B interaction entered the downward propagation at B's parent, i.e.
-#    A is not in B's "sep_bigger". (list 4)
-
-# Sources/targets with extent
-# ---------------------------
+# For a box not well-separated from the target box or one of its parents, we
+# check whether the box is adjacent to our target box (in its list 1).  If so,
+# we don't need to consider it (because the interaction to this box will be
+# mediated by list 1).
 #
-# List 4 interactions for box "B" are about a parent P's colleague A not
-# adjacent to B.
+# Case I: Neither sources nor targets have extent
 #
-# -------|----------|----------|----------|
-# Case   |    1     |    2     |    3     |
-#        | so   adj | so   adj | so   adj |
-# -------|----------|----------|----------|
-#        |          |          |          |
-# A---P  | X!    X! | X!    X! | X!    X! |
-#     |  |          |          |          |
-#     o  | X     ?  | X     ?  | X     ?  |
-#     |  |          |          |          |
-#     o  | X     ?  | X     ?  | X     ?  |
-#     |  |          |          |          |
-#     o  | X     ?  | X     ?  | O     O  |
-#     |  |          |          |          |
-#     B  | X     O! | O     O! | O     O! |
+# In this case and once non-membership in list 1 has been verified, list 4
+# membership is simply a matter of deciding whether the source box's
+# contribution should enter the downward propagation at this target box or
+# whether it has already entered it at a parent of the target box.
 #
-# "so": adjacent or overlapping when stick-out is taken into account (to A)
-# "adj": adjacent to A without stick-out
+# It suffices to check this for the immediate parent because the check has to
+# be monotone: Child boxes are subsets of parent boxes, and therefore any
+# minimum distance requirement satisfied by the parent will also be satisfied
+# by the child. Thus, if the source box is in the target box's parent's list 4,
+# then it entered downward propagation with it or another ancestor.
 #
-# Note that once a parent is no longer "adj" or "so", its children won't be
-# either.  Also note that "adj" => "so". (And there by "not so" => "not adj".)
+# Case II: Sources or targets have extent
 #
-# (X: yes, O:no, ?: doesn't matter, exclamation marks denote that this *must*
-# be the case. Entries without exclamation mark are choices for this case)
+# The with-extents case is conceptually similar to the no-extents case, however
+# there is an extra 'separation requirement' based on the extents that, if not
+# satisfied, may prevent a source box from entering the downward propagation
+# at a given box. If we once again assume monotonicity of this 'separation
+# requirement' check, then simply verifying whether or not the interaction from
+# the source box would be *allowed* to enter the downward propagation at the
+# parent suffices to determine whether the target box may be responsible for
+# entering the source interaction into the downward propagation.
 #
-# Case 1: A->B interaction must be processed by direct eval because of "so",
-#    i.e. it is in B's "sep_close_bigger".
-#
-# Case 2: A->B interaction enters downward the propagation at B,
-#    i.e. it is in B's "sep_bigger".
-#
-# Case 3: A->B interaction enters downward the propagation at B's parent,
-#    i.e. A is not in B's "sep*bigger"
+# In cases where the source box is not yet part of the downward propagation
+# received from the parent and also not eligible for entering downward
+# propagation at this box (noting that this can only happen in the with-extents
+# case), the interaction is added to the (non-downward-propagating) 'list 4
+# close' (from_sep_close_bigger).
+
+
+FROM_SEP_BIGGER_TEMPLATE = r"""//CL//
+
+inline bool meets_sep_bigger_criterion(
+    coord_t root_extent,
+    coord_vec_t target_center, int target_level,
+    coord_vec_t source_center, int source_level,
+    coord_t stick_out_factor)
+{
+    <%
+        assert not sources_have_extent
+    %>
+
+    // What we are interested in ensuring is that
+
+    // (*)
+    // d_2(src_box, tgt_center)
+    //     >= 3 * (radius of tgt box potentially
+    //                   including stick-out)
+
+    // (because convergence factors are in l^2,
+    // irrespective of how we measure)
+
+    // Since d_2(a, b) >= d_inf(a, b), ensuring that
+    // (*) holds with d_inf implies that it also holds
+    // with d_2.
+
+    coord_t target_rad = LEVEL_TO_RAD(target_level);
+    coord_t source_rad = LEVEL_TO_RAD(source_level);
+    coord_t max_allowed_center_l_inf_dist = (
+        3 * (1 + stick_out_factor) * target_rad
+        +  source_rad);
+
+    coord_t l_inf_dist = 0;
+    %for i in range(dimensions):
+        l_inf_dist = fmax(
+            l_inf_dist,
+            fabs(${cvec_sub("target_center", i)}
+                - ${cvec_sub("source_center", i)}));
+    %endfor
+
+    return l_inf_dist >= max_allowed_center_l_inf_dist * (1 - 8 * COORD_T_MACH_EPS);
+}

-SEP_BIGGER_TEMPLATE = r"""//CL//

 void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
 {
    box_id_t tgt_ibox = target_or_target_parent_boxes[itarget_or_target_parent_box];
-    ${load_center("center", "tgt_ibox")}
+    ${load_center("tgt_box_center", "tgt_ibox")}

-    int box_level = box_levels[tgt_ibox];
+    int tgt_box_level = box_levels[tgt_ibox];
    // The root box has no parents, so no list 4.
-    if (box_level == 0)
+    if (tgt_box_level == 0)
        return;

-    box_id_t parent_box_id = box_parent_ids[tgt_ibox];
-    ${load_center("parent_center", "parent_box_id")}
-
-    box_id_t current_parent_box_id = parent_box_id;
-    int walk_level = box_level - 1;
+    box_id_t tgt_parent_box_id = box_parent_ids[tgt_ibox];
+    const int tgt_parent_level = tgt_box_level - 1;
+    ${load_center("parent_center", "tgt_parent_box_id")}

    box_flags_t tgt_box_flags = box_flags[tgt_ibox];

-    // Look for colleagues of parents that are non-adjacent to tgt_ibox.
-    // Walk up the tree from tgt_ibox.
+    %if well_sep_is_n_away == 1:
+        // In a 1-away FMM, tgt_ibox's colleagues are by default uninteresting
+        // (i.e. not in list 4) because they're adjacent. So in this case, we
+        // may directly jump to the parent level.
+
+        int walk_level = tgt_box_level - 1;
+        box_id_t current_tgt_parent_box_id = tgt_parent_box_id;
+    %else:
+        // In a 2+-away FMM, tgt_ibox's same-level non-well-separated boxes *may*
+        // be sufficiently separated from tgt_ibox to be in its list 4.
+
+        int walk_level = tgt_box_level;
+        box_id_t current_tgt_parent_box_id = tgt_ibox;
+    %endif
+
+    /*
+    Look for same-level non-well-separated boxes of parents that are
+    non-adjacent to tgt_ibox.
+    Walk up the tree from tgt_ibox.

-    // Box 0 (== level 0) doesn't have any colleagues, so we can stop the
-    // search for such colleagues there.
-    for (int walk_level = box_level - 1; walk_level != 0;
+    Box 0 (== level 0) doesn't have any slnws boxes, so we can stop the
+    search for such slnws boxes there.
+    */
+    for (; walk_level != 0;
            // {{{ advance
            --walk_level,
-            current_parent_box_id = box_parent_ids[current_parent_box_id]
+            current_tgt_parent_box_id = box_parent_ids[current_tgt_parent_box_id]
            // }}}
            )
    {
-        box_id_t coll_start = colleagues_starts[current_parent_box_id];
-        box_id_t coll_stop = colleagues_starts[current_parent_box_id+1];
-
-        // /!\ i is not a box id, it's an index into colleagues_list.
-        for (box_id_t i = coll_start; i < coll_stop; ++i)
+        box_id_t slnws_start =
+            same_level_non_well_sep_boxes_starts[current_tgt_parent_box_id];
+        box_id_t slnws_stop =
+            same_level_non_well_sep_boxes_starts[current_tgt_parent_box_id+1];
+
+        // /!\ i is not a box id, it's an index into
+        // same_level_non_well_sep_boxes_lists.
+        for (box_id_t i = slnws_start; i < slnws_stop; ++i)
        {
-            box_id_t colleague_box_id = colleagues_list[i];
+            box_id_t slnws_box_id = same_level_non_well_sep_boxes_lists[i];

-            if (box_flags[colleague_box_id] & BOX_HAS_OWN_SOURCES)
+            if (box_flags[slnws_box_id] & BOX_IS_SOURCE_BOX)
            {
-                ${load_center("colleague_center", "colleague_box_id")}
-                bool a_or_o = is_adjacent_or_overlapping(root_extent,
-                    center, box_level, colleague_center, walk_level, false);
+                ${load_center("slnws_center", "slnws_box_id")}

-                if (!a_or_o)
-                {
-                    // Found one.
+                bool in_list_1 = is_adjacent_or_overlapping(root_extent,
+                    tgt_box_center, tgt_box_level,
+                    slnws_center, walk_level);

+                if (!in_list_1)
+                {
                    %if sources_have_extent or targets_have_extent:
-                        const bool a_or_o_with_stick_out =
-                            is_adjacent_or_overlapping(root_extent,
-                                center, box_level, colleague_center,
-                                walk_level, true);
-
-                    if (a_or_o_with_stick_out)
+                        /*
+                        With-extent list 4 separation criterion.
+                        Needs to be monotone (see main comment narrative
+                        above for what that means). If you change this, also
+                        change the equivalent check for the parent, below.
+                        */
+                        const bool tgt_meets_with_ext_sep_criterion =
+                            meets_sep_bigger_criterion(root_extent,
+                                tgt_box_center, tgt_box_level,
+                                slnws_center, walk_level,
+                                stick_out_factor);
+
+                    if (!tgt_meets_with_ext_sep_criterion)
                    {
-                        // "Case 1" above: colleague_box_id is too close and
-                        // overlaps our stick_out region. We're obliged to do
-                        // the interaction directly.
+                        /*
+                        slnws_box_id failed the separation criterion (i.e. it
+                        is too close to the target box) for list 4 proper.
+                        Stick it in list 4 close.
+                        */

-                        if (tgt_box_flags & BOX_HAS_OWN_TARGETS)
+                        if (tgt_box_flags & BOX_IS_TARGET_BOX)
                        {
-                            APPEND_sep_close_bigger(colleague_box_id);
+                            APPEND_from_sep_close_bigger(slnws_box_id);
                        }
                    }
                    else
                    %endif
                    {
-                        bool parent_a_or_o_with_stick_out =
+                        bool in_parent_list_1 =
                            is_adjacent_or_overlapping(root_extent,
-                                parent_center, box_level-1, colleague_center,
-                                walk_level, true);
-
-                        if (parent_a_or_o_with_stick_out)
+                                parent_center, tgt_parent_level,
+                                slnws_center, walk_level);
+
+                        bool would_be_in_parent_list_4_not_considering_stickout = (
+                                !in_parent_list_1
+                                %if well_sep_is_n_away > 1:
+                                    /*
+                                    From-sep-bigger boxes can only be in the
+                                    parent's from-sep-bigger list if they're
+                                    actually bigger than (or equal in size to)
+                                    the parent box.
+
+                                    For 1-away, that's guaranteed at this
+                                    point, because we only start ascending the
+                                    tree at the parent's level, so any box we
+                                    find here is naturally big enough. For
+                                    2-away, we start looking at the target
+                                    box's level, so slnws_box_id may actually
+                                    be too small (at too deep a level) to be in
+                                    the parent's from-sep-bigger list.
+                                    */
+
+                                    && walk_level < tgt_box_level
+                                %endif
+                                );
+
+                        if (would_be_in_parent_list_4_not_considering_stickout)
                        {
-                            // "Case 2" above: We're the first box down the chain
-                            // to be far enough away to let the interaction into
-                            // our local downward subtree.
-                            APPEND_sep_bigger(colleague_box_id);
+                            /*
+                            Our immediate parent box was already far enough
+                            away to (hypothetically) let the interaction into
+                            its downward propagation--so this happened either
+                            there or at a more distant ancestor. We'll get the
+                            interaction that way. Nothing to do, unless the box
+                            was too close to the parent and ended up in the
+                            parent's from_sep_close_bigger. If that's the case,
+                            we'll simply let it enter the downward propagation
+                            here.
+
+                            With-extent list 4 separation criterion.
+                            Needs to be monotone (see main comment narrative
+                            above for what that means). If you change this, also
+                            change the equivalent check for the target box, above.
+                            */
+
+                            %if sources_have_extent or targets_have_extent:
+                                const bool parent_meets_with_ext_sep_criterion =
+                                    meets_sep_bigger_criterion(root_extent,
+                                        parent_center, tgt_parent_level,
+                                        slnws_center, walk_level,
+                                        stick_out_factor);
+
+                                if (!parent_meets_with_ext_sep_criterion)
+                                {
+                                    APPEND_from_sep_bigger(slnws_box_id);
+                                }
+                            %endif
                        }
                        else
                        {
-                            // "Case 2" above: A parent box was already far
-                            // enough away to let the interaction into its
-                            // local downward subtree. We'll get the interaction
-                            // that way. Nothing to do.
+                            /*
+                            We're the first box down the chain to be far enough
+                            away to let the interaction into our local downward
+                            propagation.
+                            */
+                            APPEND_from_sep_bigger(slnws_box_id);
                        }
                    }
                }
@@ -741,18 +1116,201 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
 # }}}


+# {{{ list merger
+
+LIST_MERGER_TEMPLATE = ElementwiseTemplate(
+    arguments=r"""//CL:mako//
+    /* input: */
+
+    box_id_t *output_to_input_box,
+
+    %for ilist in range(nlists):
+        box_id_t *list${ilist}_starts,
+    %endfor
+
+    %if not write_counts:
+    %for ilist in range(nlists):
+        const box_id_t *list${ilist}_lists,
+    %endfor
+        const box_id_t *new_starts,
+    %endif
+
+    /* output: */
+
+    %if not write_counts:
+        box_id_t *new_lists,
+    %else:
+        box_id_t *new_counts,
+    %endif
+    """,  # kernel signature depends on nlists and write_counts
+
+    operation=r"""//CL:mako//
+        /* Compute output and input indices. */
+        const box_id_t ioutput_box = i;
+        const box_id_t ibox = output_to_input_box[ioutput_box];
+
+        /* Count the size of the input at the current index. */
+        %for ilist in range(nlists):
+            const box_id_t list${ilist}_start = list${ilist}_starts[ibox];
+            const box_id_t list${ilist}_count =
+                list${ilist}_starts[ibox + 1] - list${ilist}_start;
+        %endfor
+
+        /* Update the counts or copy the elements. */
+        %if write_counts:
+            if (ioutput_box == 0)
+                new_counts[0] = 0;
+
+            new_counts[ioutput_box + 1] =
+            %for ilist in range(nlists):
+                + list${ilist}_count
+            %endfor
+                ;
+        %else:
+            box_id_t cur_idx = new_starts[ioutput_box];
+
+            %for ilist in range(nlists):
+            for (box_id_t j = 0; j < list${ilist}_count; ++j)
+            {
+                new_lists[cur_idx++] =
+                    list${ilist}_lists[list${ilist}_start + j];
+            }
+            %endfor
+        %endif
+    """,  # write_counts selects counting vs. copying of list entries
+
+    name="merge_lists")
+
+
+class _IndexStyle:
+    TARGET_BOXES = 0  # list is indexed like target_boxes
+    TARGET_OR_TARGET_PARENT_BOXES = 1  # like target_or_target_parent_boxes
+
+
+class _ListMerger:
+    """Utility class for merging box lists, optionally changing indexing style."""
+
+    def __init__(self, context, box_id_dtype):
+        self.context = context
+        self.box_id_dtype = box_id_dtype
+
+    @memoize_method
+    def get_list_merger_kernel(self, nlists, write_counts):
+        """
+        :arg nlists: Number of input lists
+        :arg write_counts: A :class:`bool`. If *True*, build the kernel that
+            writes per-box counts; otherwise the one that fills the lists
+        """
+        assert nlists >= 1
+
+        return LIST_MERGER_TEMPLATE.build(
+                self.context,
+                type_aliases=(
+                    ("box_id_t", self.box_id_dtype),
+                ),
+                var_values=(
+                    ("nlists", nlists),
+                    ("write_counts", write_counts),
+                ))
+
+    def __call__(self, queue, input_starts, input_lists, input_index_style,
+            output_index_style, target_boxes, target_or_target_parent_boxes,
+            nboxes, debug=False, wait_for=None):
+        """
+        :arg input_starts: Sequence of *starts* arrays, one per input list
+        :arg input_lists: Sequence of *lists* arrays matching *input_starts*
+        :arg input_index_style: A :class:`_IndexStyle` (of all inputs)
+        :arg output_index_style: A :class:`_IndexStyle` (of the result)
+        :returns: A pair *results_dict, event*, where *results_dict*
+            contains entries *starts* and *lists*
+        """
+        if wait_for is None:
+            wait_for = []
+
+        if (
+                output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
+                and input_index_style == _IndexStyle.TARGET_BOXES):
+            raise ValueError(
+                    "unsupported: merging a list indexed by target boxes "
+                    "into a list indexed by target or target parent boxes")
+
+        ntarget_boxes = len(target_boxes)
+        ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes)
+
+        noutput_boxes = (ntarget_boxes
+                if output_index_style == _IndexStyle.TARGET_BOXES
+                else ntarget_or_ntarget_parent_boxes)
+
+        if (
+                input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
+                and output_index_style == _IndexStyle.TARGET_BOXES):
+            from boxtree.tools import reverse_index_array
+            target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
+                    target_or_target_parent_boxes, target_size=nboxes,
+                    queue=queue)
+            target_or_target_parent_boxes_from_target_boxes = cl.array.take(
+                    target_or_target_parent_boxes_from_all_boxes,
+                    target_boxes, queue=queue)
+
+            output_to_input_box = target_or_target_parent_boxes_from_target_boxes
+        else:
+            output_to_input_box = cl.array.arange(
+                    queue, noutput_boxes, dtype=self.box_id_dtype)
+
+        new_counts = cl.array.empty(queue, noutput_boxes+1, self.box_id_dtype)
+
+        assert len(input_starts) == len(input_lists)
+        nlists = len(input_starts)
+
+        evt = self.get_list_merger_kernel(nlists, True)(  # pass 1: counts
+                    # input:
+                    output_to_input_box,
+                    *input_starts,
+                    # output:
+                    new_counts,
+                    range=slice(noutput_boxes),
+                    queue=queue,
+                    wait_for=wait_for)
+
+        new_starts = cl.array.cumsum(new_counts)  # prefix sum: counts -> starts
+        del new_counts
+
+        new_lists = cl.array.empty(
+                queue,
+                int(new_starts[-1].get()),
+                self.box_id_dtype)
+
+        new_lists.fill(999999999)  # NOTE(review): looks like a debug sentinel
+
+        evt = self.get_list_merger_kernel(nlists, False)(  # pass 2: fill lists
+                    # input:
+                    output_to_input_box,
+                    *input_starts,
+                    *input_lists,
+                    new_starts,
+                    # output:
+                    new_lists,
+                    range=slice(noutput_boxes),
+                    queue=queue,
+                    wait_for=[evt])
+
+        return {"starts": new_starts, "lists": new_lists}, evt
+
+# }}}
+
+
 # {{{ traversal info (output)

 class FMMTraversalInfo(DeviceDataRecord):
-    """Interaction lists needed for a fast-multipole-like linear-time gather of
+    r"""Interaction lists needed for a fast-multipole-like linear-time gather of
    particle interactions.

-    Terminology follows this article:
+    Terminology (largely) follows this article:

        Carrier, J., Greengard, L. and Rokhlin, V. "A Fast
        Adaptive Multipole Algorithm for Particle Simulations." SIAM Journal on
        Scientific and Statistical Computing 9, no. 4 (July 1988): 669-686.
-        `DOI: 10.1137/0909044 <http://dx.doi.org/10.1137/0909044>`_.
+        `DOI: 10.1137/0909044 <https://dx.doi.org/10.1137/0909044>`__.

    Unless otherwise indicated, all bulk data in this data structure is stored
    in a :class:`pyopencl.array.Array`. See also :meth:`get`.
@@ -761,6 +1319,20 @@ class FMMTraversalInfo(DeviceDataRecord):

        An instance of :class:`boxtree.Tree`.

+    .. attribute:: nboxes
+
+        Number of boxes in the tree.
+
+    .. attribute:: nlevels
+
+        Number of levels in the tree.
+
+    .. attribute:: well_sep_is_n_away
+
+        The distance (measured in target box diameters in the :math:`l^\infty`
+        norm) from the edge of the target box at which the 'well-separated'
+        (i.e. M2L-handled) 'far-field' starts.
+
    .. ------------------------------------------------------------------------
    .. rubric:: Basic box lists for iteration
    .. ------------------------------------------------------------------------
@@ -779,6 +1351,10 @@ class FMMTraversalInfo(DeviceDataRecord):
        If :attr:`boxtree.Tree.sources_are_targets`,
        then ``target_boxes is source_boxes``.

+    .. attribute:: ntarget_boxes
+
+        Number of :attr:`target_boxes`.
+
    .. attribute:: source_parent_boxes

        ``box_id_t [*]``
@@ -787,6 +1363,13 @@ class FMMTraversalInfo(DeviceDataRecord):
        of one of the :attr:`source_boxes`. These boxes may have sources of their
        own.

+    .. attribute:: level_start_source_box_nrs
+
+        ``box_id_t [nlevels+1]``
+
+        Indices into :attr:`source_boxes` indicating where
+        each level starts and ends.
+
    .. attribute:: level_start_source_parent_box_nrs

        ``box_id_t [nlevels+1]``
@@ -805,6 +1388,13 @@ class FMMTraversalInfo(DeviceDataRecord):

        Number of :attr:`target_or_target_parent_boxes`.

+    .. attribute:: level_start_target_box_nrs
+
+        ``box_id_t [nlevels+1]``
+
+        Indices into :attr:`target_boxes` indicating where
+        each level starts and ends.
+
    .. attribute:: level_start_target_or_target_parent_box_nrs

        ``box_id_t [nlevels+1]``
@@ -813,10 +1403,25 @@ class FMMTraversalInfo(DeviceDataRecord):
        each level starts and ends.

    .. ------------------------------------------------------------------------
-    .. rubric:: Colleagues
+    .. rubric:: Same-level non-well-separated boxes
    .. ------------------------------------------------------------------------

-    Immediately adjacent boxes on the same level. See :ref:`csr`.
+    Boxes considered to be within the 'non-well-separated area' according to
+    :attr:`well_sep_is_n_away` that are on the same level as their reference
+    box. See :ref:`csr`.
+
+    This is a generalization of the "colleagues" concept from the Carrier paper
+    to the case in which :attr:`well_sep_is_n_away` is not 1.
+
+    .. attribute:: same_level_non_well_sep_boxes_starts
+
+        ``box_id_t [nboxes+1]``
+
+    .. attribute:: same_level_non_well_sep_boxes_lists
+
+        ``box_id_t [*]``
+
+    The following attributes are deprecated.

    .. attribute:: colleagues_starts

@@ -831,7 +1436,9 @@ class FMMTraversalInfo(DeviceDataRecord):
    .. ------------------------------------------------------------------------

    List of source boxes immediately adjacent to each target box. Indexed like
-    :attr:`target_boxes`. See :ref:`csr`.
+    :attr:`target_boxes`. Includes the target box itself. See :ref:`csr`.
+    (Note: This list contains global box numbers, not indices into
+    :attr:`source_boxes`.)

    .. attribute:: neighbor_source_boxes_starts

@@ -848,11 +1455,11 @@ class FMMTraversalInfo(DeviceDataRecord):
    Well-separated boxes on the same level.  Indexed like
    :attr:`target_or_target_parent_boxes`. See :ref:`csr`.

-    .. attribute:: sep_siblings_starts
+    .. attribute:: from_sep_siblings_starts

        ``box_id_t [ntarget_or_target_parent_boxes+1]``

-    .. attribute:: sep_siblings_lists
+    .. attribute:: from_sep_siblings_lists

        ``box_id_t [*]``

@@ -863,26 +1470,44 @@ class FMMTraversalInfo(DeviceDataRecord):
    Smaller source boxes separated from the target box by their own size.

    If :attr:`boxtree.Tree.targets_have_extent`, then
-    :attr:`sep_close_smaller_starts` will be non-*None*. It records
+    :attr:`from_sep_close_smaller_starts` will be non-*None*. It records
    interactions between boxes that would ordinarily be handled
    through "List 3", but must be evaluated specially/directly
    because of :ref:`extent`.

-    Indexed like :attr:`target_or_target_parent_boxes`.  See :ref:`csr`.
+    .. attribute:: target_boxes_sep_smaller_by_source_level

-    .. attribute:: sep_smaller_starts
+        A list of arrays of global box numbers, one array per level, indicating
+        which boxes are used with the interaction list entries of
+        :attr:`from_sep_smaller_by_level`.
+        ``target_boxes_sep_smaller_by_source_level[i]`` has length
+        ``from_sep_smaller_by_level[i].num_nonempty_lists``.

-        ``box_id_t [ntargets+1]``

-    .. attribute:: sep_smaller_lists
+    .. attribute:: from_sep_smaller_by_level

-        ``box_id_t [*]``
+        A list of :attr:`boxtree.Tree.nlevels` (corresponding to the levels on
+        which each listed source box resides) objects, each of which has
+        attributes *count*, *starts*, *lists*, *num_nonempty_lists*, and
+        *nonempty_indices*, which form a CSR list of List 3 source boxes.

-    .. attribute:: sep_close_smaller_starts
+        *starts* has shape/type ``box_id_t [num_nonempty_lists+1]``. *lists* is of
+        type ``box_id_t``.  (Note: This list contains global box numbers, not
+        indices into :attr:`source_boxes`.)

-        ``box_id_t [ntargets+1]`` (or *None*)
+        Note that *starts* is indexed like
+        :attr:`target_boxes_sep_smaller_by_source_level`. For
+        example, for level *i*, *lists[starts[j]:starts[j+1]]* represents "List 3"
+        source boxes of *target_boxes_sep_smaller_by_source_level[i][j]* on level
+        *i*.

-    .. attribute:: sep_close_smaller_lists
+    .. attribute:: from_sep_close_smaller_starts
+
+        Indexed like :attr:`target_boxes`.  See :ref:`csr`.
+
+        ``box_id_t [ntarget_boxes+1]`` (or *None*)
+
+    .. attribute:: from_sep_close_smaller_lists

        ``box_id_t [*]`` (or *None*)

@@ -892,184 +1517,89 @@ class FMMTraversalInfo(DeviceDataRecord):

    Bigger source boxes separated from the target box by the (smaller) target
    box's size.
+    (Note: This list contains global box numbers, not indices into
+    :attr:`source_boxes`.)

-    If :attr:`boxtree.Tree.sources_have_extent`, then
-    :attr:`sep_close_bigger_starts` will be non-*None*. It records
-    interactions between boxes that would ordinarily be handled
-    through "List 4", but must be evaluated specially/directly
-    because of :ref:`extent`.
+    If :attr:`boxtree.Tree.sources_have_extent` or
+    :attr:`boxtree.Tree.targets_have_extent`, then
+    :attr:`from_sep_close_bigger_starts` will be non-*None*. It records
+    interactions between boxes that would ordinarily be handled through "List
+    4", but must be evaluated specially/directly because of :ref:`extent`.

-    Indexed like :attr:`target_or_target_parent_boxes`. See :ref:`csr`.
+    *from_sep_bigger_starts* is indexed like
+    :attr:`target_or_target_parent_boxes`. Similar to the other "close" lists,
+    *from_sep_close_bigger_starts* is indexed like :attr:`target_boxes`. See
+    :ref:`csr`.

-    .. attribute:: sep_bigger_starts
+    .. attribute:: from_sep_bigger_starts

        ``box_id_t [ntarget_or_target_parent_boxes+1]``

-    .. attribute:: sep_bigger_lists
+    .. attribute:: from_sep_bigger_lists

        ``box_id_t [*]``

-    .. attribute:: sep_close_bigger_starts
+    .. attribute:: from_sep_close_bigger_starts

-        ``box_id_t [ntarget_or_target_parent_boxes+1]`` (or *None*)
+        ``box_id_t [ntarget_boxes+1]`` (or *None*)

-    .. attribute:: sep_close_bigger_lists
+    .. attribute:: from_sep_close_bigger_lists

        ``box_id_t [*]`` (or *None*)
-    """

-    # {{{ "close" list merging -> "unified list 1"
+    .. versionchanged:: 2018.2

-    def merge_close_lists(self, queue, debug=False):
-        """Return a new :class:`FMMTraversalInfo` instance with the contents of
-        :attr:`sep_close_smaller_starts` and :attr:`sep_close_bigger_starts`
-        merged into :attr:`neighbor_source_boxes_starts` and these two
-        attributes set to *None*.
-        """
+        Changed index style of *from_sep_close_bigger_starts* from
+        :attr:`target_or_target_parent_boxes` to :attr:`target_boxes`.

-        from boxtree.tools import reverse_index_array
-        target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
-                self.target_or_target_parent_boxes, target_size=self.tree.nboxes,
-                queue=queue)
-        target_or_target_parent_boxes_from_tgt_boxes = cl.array.take(
-                target_or_target_parent_boxes_from_all_boxes,
-                self.target_boxes, queue=queue)
-
-        del target_or_target_parent_boxes_from_all_boxes
-
-        @memoize_in(self, "merge_close_lists_kernel")
-        def get_new_nb_sources_knl(write_counts):
-            from pyopencl.elementwise import ElementwiseTemplate
-            return ElementwiseTemplate("""//CL:mako//
-                /* input: */
-                box_id_t *target_or_target_parent_boxes_from_tgt_boxes,
-                box_id_t *neighbor_source_boxes_starts,
-                box_id_t *sep_close_smaller_starts,
-                box_id_t *sep_close_bigger_starts,
-
-                %if not write_counts:
-                    box_id_t *neighbor_source_boxes_lists,
-                    box_id_t *sep_close_smaller_lists,
-                    box_id_t *sep_close_bigger_lists,
-
-                    box_id_t *new_neighbor_source_boxes_starts,
-                %endif
-
-                /* output: */
-
-                %if write_counts:
-                    box_id_t *new_neighbor_source_boxes_counts,
-                %else:
-                    box_id_t *new_neighbor_source_boxes_lists,
-                %endif
-                """,
-                """//CL:mako//
-                box_id_t itgt_box = i;
-                box_id_t itarget_or_target_parent_box =
-                    target_or_target_parent_boxes_from_tgt_boxes[itgt_box];
-
-                box_id_t neighbor_source_boxes_start =
-                    neighbor_source_boxes_starts[itgt_box];
-                box_id_t neighbor_source_boxes_count =
-                    neighbor_source_boxes_starts[itgt_box + 1]
-                    - neighbor_source_boxes_start;
-
-                box_id_t sep_close_smaller_start =
-                    sep_close_smaller_starts[itgt_box];
-                box_id_t sep_close_smaller_count =
-                    sep_close_smaller_starts[itgt_box + 1]
-                    - sep_close_smaller_start;
-
-                box_id_t sep_close_bigger_start =
-                    sep_close_bigger_starts[itarget_or_target_parent_box];
-                box_id_t sep_close_bigger_count =
-                    sep_close_bigger_starts[itarget_or_target_parent_box + 1]
-                    - sep_close_bigger_start;
-
-                %if write_counts:
-                    if (itgt_box == 0)
-                        new_neighbor_source_boxes_counts[0] = 0;
-
-                    new_neighbor_source_boxes_counts[itgt_box + 1] =
-                        neighbor_source_boxes_count
-                        + sep_close_smaller_count
-                        + sep_close_bigger_count
-                        ;
-                %else:
-
-                    box_id_t cur_idx = new_neighbor_source_boxes_starts[itgt_box];
-
-                    #define COPY_FROM(NAME) \
-                        for (box_id_t i = 0; i < NAME##_count; ++i) \
-                            new_neighbor_source_boxes_lists[cur_idx++] = \
-                                NAME##_lists[NAME##_start+i];
-
-                    COPY_FROM(neighbor_source_boxes)
-                    COPY_FROM(sep_close_smaller)
-                    COPY_FROM(sep_close_bigger)
-
-                %endif
-                """).build(
-                        queue.context,
-                        type_aliases=(
-                            ("box_id_t", self.tree.box_id_dtype),
-                            ),
-                        var_values=(
-                            ("write_counts", write_counts),
-                            )
-                        )
-
-        ntarget_boxes = len(self.target_boxes)
-        new_neighbor_source_boxes_counts = cl.array.empty(
-                queue, ntarget_boxes+1, self.tree.box_id_dtype)
-        get_new_nb_sources_knl(True)(
-            # input:
-            target_or_target_parent_boxes_from_tgt_boxes,
-            self.neighbor_source_boxes_starts,
-            self.sep_close_smaller_starts,
-            self.sep_close_bigger_starts,
-
-            # output:
-            new_neighbor_source_boxes_counts,
-            range=slice(ntarget_boxes),
-            queue=queue)
-
-        new_neighbor_source_boxes_starts = cl.array.cumsum(
-                new_neighbor_source_boxes_counts)
-        del new_neighbor_source_boxes_counts
-
-        new_neighbor_source_boxes_lists = cl.array.empty(
-                queue,
-                int(new_neighbor_source_boxes_starts[ntarget_boxes].get()),
-                self.tree.box_id_dtype)

-        new_neighbor_source_boxes_lists.fill(999999999)
+    .. automethod:: get

-        get_new_nb_sources_knl(False)(
-            # input:
-            target_or_target_parent_boxes_from_tgt_boxes,
+    .. automethod:: merge_close_lists
+    """

-            self.neighbor_source_boxes_starts,
-            self.sep_close_smaller_starts,
-            self.sep_close_bigger_starts,
-            self.neighbor_source_boxes_lists,
-            self.sep_close_smaller_lists,
-            self.sep_close_bigger_lists,
+    # {{{ "close" list merging -> "unified list 1"

-            new_neighbor_source_boxes_starts,
+    def merge_close_lists(self, queue, debug=False):
+        """Return a copy of this :class:`FMMTraversalInfo` in which the lists
+        of :attr:`from_sep_close_smaller_starts` and
+        :attr:`from_sep_close_bigger_starts` are merged into those of
+        :attr:`neighbor_source_boxes_starts`, and in which both merged
+        attributes (starts and lists) are set to *None*.
+        """

-            # output:
-            new_neighbor_source_boxes_lists,
-            range=slice(ntarget_boxes),
-            queue=queue)
+        list_merger = _ListMerger(queue.context, self.tree.box_id_dtype)
+
+        result, evt = (
+                list_merger(
+                    queue,
+                    # starts
+                    (self.neighbor_source_boxes_starts,
+                     self.from_sep_close_smaller_starts,
+                     self.from_sep_close_bigger_starts),
+                    # lists
+                    (self.neighbor_source_boxes_lists,
+                     self.from_sep_close_smaller_lists,
+                     self.from_sep_close_bigger_lists),
+                    # input index style (shared by all three inputs)
+                    _IndexStyle.TARGET_BOXES,
+                    # output index style
+                    _IndexStyle.TARGET_BOXES,
+                    # box and tree data
+                    self.target_boxes,
+                    self.target_or_target_parent_boxes,
+                    self.tree.nboxes,
+                    debug))
+
+        cl.wait_for_events([evt])  # block until the merge kernel has finished

        return self.copy(
-            neighbor_source_boxes_starts=new_neighbor_source_boxes_starts,
-            neighbor_source_boxes_lists=new_neighbor_source_boxes_lists,
-            sep_close_smaller_starts=None,
-            sep_close_smaller_lists=None,
-            sep_close_bigger_starts=None,
-            sep_close_bigger_lists=None)
+                neighbor_source_boxes_starts=result["starts"].with_queue(None),
+                neighbor_source_boxes_lists=result["lists"].with_queue(None),
+                from_sep_close_smaller_starts=None,
+                from_sep_close_smaller_lists=None,
+                from_sep_close_bigger_starts=None,
+                from_sep_close_bigger_lists=None)

    # }}}

@@ -1083,6 +1613,18 @@ class FMMTraversalInfo(DeviceDataRecord):

    # }}}

+    @property
+    def nboxes(self):
+        return self.tree.nboxes  # forwarded from the underlying tree
+
+    @property
+    def nlevels(self):
+        return self.tree.nlevels  # forwarded from the underlying tree
+
+    @property
+    def ntarget_boxes(self):
+        return len(self.target_boxes)  # number of entries in target_boxes
+
    @property
    def ntarget_or_target_parent_boxes(self):
        return len(self.target_or_target_parent_boxes)
@@ -1095,41 +1637,110 @@ class _KernelInfo(Record):


 class FMMTraversalBuilder:
-    def __init__(self, context):
+    """
+    .. automethod:: __init__
+    """
+
+    def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None):
+        """
+        :arg well_sep_is_n_away: An integer 1 or greater.
+            (Only 1 and 2 are tested.)
+            The spacing between boxes that is considered "well-separated" for
+            :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts`
+            (List 2).
+        :arg from_sep_smaller_crit: The criterion used to determine
+            box dimensions and separation for
+            :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level`
+            (List 3). May be one of ``"static_linf"`` (use the box square,
+            possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`),
+            ``"precise_linf"`` (use the precise extent of targets in the box,
+            including their radii), or ``"static_l2"`` (use the circumcircle of
+            the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`).
+        """
        self.context = context
+        self.well_sep_is_n_away = well_sep_is_n_away
+        self.from_sep_smaller_crit = from_sep_smaller_crit

    # {{{ kernel builder

    @memoize_method
-    def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
+    @log_process(logger)
+    def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
            coord_dtype, box_level_dtype, max_levels,
            sources_are_targets, sources_have_extent, targets_have_extent,
-            stick_out_factor):
+            extent_norm,
+            source_boxes_has_mask,
+            source_parent_boxes_has_mask):

-        logging.info("building traversal build kernels")
+        # {{{ process from_sep_smaller_crit
+
+        from_sep_smaller_crit = self.from_sep_smaller_crit
+
+        if from_sep_smaller_crit is None:
+            from_sep_smaller_crit = "precise_linf"
+
+        if extent_norm == "linf":
+            # no special checks needed
+            pass
+
+        elif extent_norm == "l2":
+            if from_sep_smaller_crit == "static_linf":
+                # Not technically necessary, but static linf will assume box
+                # bounds that are not guaranteed to contain all particle
+                # extents.
+                raise ValueError(
+                        "The static l^inf from-sep-smaller criterion "
+                        "cannot be used with the l^2 extent norm")
+
+        elif extent_norm is None:
+            assert not (sources_have_extent or targets_have_extent)
+
+            if from_sep_smaller_crit is None:
+                # doesn't matter
+                from_sep_smaller_crit = "static_linf"
+
+        else:
+            raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}")
+
+        if from_sep_smaller_crit not in [
+                "static_linf", "precise_linf",
+                "static_l2",
+                ]:
+            raise ValueError(
+                "unexpected value of 'from_sep_smaller_crit': "
+                f"{from_sep_smaller_crit}")
+
+        # }}}

        debug = False

        from pyopencl.tools import dtype_to_ctype
+
        from boxtree.tree import box_flags_enum
-        render_vars = dict(
-                dimensions=dimensions,
-                dtype_to_ctype=dtype_to_ctype,
-                particle_id_dtype=particle_id_dtype,
-                box_id_dtype=box_id_dtype,
-                box_flags_enum=box_flags_enum,
-                coord_dtype=coord_dtype,
-                vec_types=cl.array.vec.types,
-                max_levels=max_levels,
-                AXIS_NAMES=AXIS_NAMES,
-                debug=debug,
-                sources_are_targets=sources_are_targets,
-                sources_have_extent=sources_have_extent,
-                targets_have_extent=targets_have_extent,
-                stick_out_factor=stick_out_factor,
-                )
+        render_vars = {
+                "np": np,
+                "dimensions": dimensions,
+                "dtype_to_ctype": dtype_to_ctype,
+                "particle_id_dtype": particle_id_dtype,
+                "box_id_dtype": box_id_dtype,
+                "box_flags_enum": box_flags_enum,
+                "coord_dtype": coord_dtype,
+                "get_coord_vec_dtype": get_coord_vec_dtype,
+                "cvec_sub": partial(coord_vec_subscript_code, dimensions),
+                "max_levels": max_levels,
+                "AXIS_NAMES": AXIS_NAMES,
+                "debug": debug,
+                "sources_are_targets": sources_are_targets,
+                "sources_have_extent": sources_have_extent,
+                "targets_have_extent": targets_have_extent,
+                "well_sep_is_n_away": self.well_sep_is_n_away,
+                "from_sep_smaller_crit": from_sep_smaller_crit,
+                "source_boxes_has_mask": source_boxes_has_mask,
+                "source_parent_boxes_has_mask": source_parent_boxes_has_mask,
+                }
        from pyopencl.algorithm import ListOfListsBuilder
-        from pyopencl.tools import VectorArg, ScalarArg
+
+        from boxtree.tools import ScalarArg, VectorArg

        result = {}

@@ -1140,6 +1751,12 @@ class FMMTraversalBuilder:
                + SOURCES_PARENTS_AND_TARGETS_TEMPLATE,
                strict_undefined=True).render(**render_vars)

+        arg_decls = [VectorArg(box_flags_enum.dtype, "box_flags")]
+        if source_boxes_has_mask:
+            arg_decls.append(VectorArg(np.int8, "source_boxes_mask"))
+        if source_parent_boxes_has_mask:
+            arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask"))
+
        result["sources_parents_and_targets_builder"] = \
                ListOfListsBuilder(self.context,
                        [
@@ -1151,9 +1768,7 @@ class FMMTraversalBuilder:
                                if not sources_are_targets
                                else []),
                        str(src),
-                        arg_decls=[
-                            VectorArg(box_flags_enum.dtype, "box_flags"),
-                            ],
+                        arg_decls=arg_decls,
                        debug=debug,
                        name_prefix="sources_parents_and_targets")

@@ -1170,46 +1785,68 @@ class FMMTraversalBuilder:
        # {{{ build list N builders

        base_args = [
-                VectorArg(coord_dtype, "box_centers"),
+                VectorArg(coord_dtype, "box_centers", with_offset=False),
                ScalarArg(coord_dtype, "root_extent"),
                VectorArg(np.uint8, "box_levels"),
                ScalarArg(box_id_dtype, "aligned_nboxes"),
-                VectorArg(box_id_dtype, "box_child_ids"),
+                VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
                VectorArg(box_flags_enum.dtype, "box_flags"),
                ]

-        for list_name, template, extra_args, extra_lists in [
-                ("colleagues", COLLEAGUES_TEMPLATE, [], []),
-                ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
+        for list_name, template, extra_args, extra_lists, eliminate_empty_list in [
+                ("same_level_non_well_sep_boxes",
+                    SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []),
+                ("neighbor_source_boxes", NEIGHBOR_SOURCE_BOXES_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_boxes"),
-                            ], []),
-                ("sep_siblings", SEP_SIBLINGS_TEMPLATE,
+                            ], [], []),
+                ("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
-                            VectorArg(box_id_dtype, "box_parent_ids"),
-                            VectorArg(box_id_dtype, "colleagues_starts"),
-                            VectorArg(box_id_dtype, "colleagues_list"),
-                            ], []),
-                ("sep_smaller", SEP_SMALLER_TEMPLATE,
+                            VectorArg(box_id_dtype, "box_parent_ids",
+                                with_offset=False),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_starts"),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_lists"),
+                            ], [], []),
+                ("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE,
                        [
+                            ScalarArg(coord_dtype, "stick_out_factor"),
                            VectorArg(box_id_dtype, "target_boxes"),
-                            VectorArg(box_id_dtype, "colleagues_starts"),
-                            VectorArg(box_id_dtype, "colleagues_list"),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_starts"),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_lists"),
+                            *([VectorArg(coord_dtype, "box_target_bounding_box_min",
+                                         with_offset=False),
+                               VectorArg(coord_dtype, "box_target_bounding_box_max",
+                                         with_offset=False),
+                               VectorArg(particle_id_dtype,
+                                         "box_source_counts_cumul"),
+                               ]
+                              if targets_have_extent else []),
+                            ScalarArg(particle_id_dtype,
+                                "from_sep_smaller_min_nsources_cumul"),
+                            ScalarArg(box_id_dtype, "from_sep_smaller_source_level"),
                            ],
-                            ["sep_close_smaller"]
+                            ["from_sep_close_smaller"]
                            if sources_have_extent or targets_have_extent
-                            else []),
-                ("sep_bigger", SEP_BIGGER_TEMPLATE,
+                            else [], ["from_sep_smaller"]),
+                ("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE,
                        [
+                            ScalarArg(coord_dtype, "stick_out_factor"),
                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
-                            VectorArg(box_id_dtype, "box_parent_ids"),
-                            VectorArg(box_id_dtype, "colleagues_starts"),
-                            VectorArg(box_id_dtype, "colleagues_list"),
+                            VectorArg(box_id_dtype, "box_parent_ids",
+                                with_offset=False),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_starts"),
+                            VectorArg(box_id_dtype,
+                                "same_level_non_well_sep_boxes_lists"),
                            ],
-                            ["sep_close_bigger"]
+                            ["from_sep_close_bigger"]
                            if sources_have_extent or targets_have_extent
-                            else []),
+                            else [], []),
                ]:
            src = Template(
                    TRAVERSAL_PREAMBLE_TEMPLATE
@@ -1217,51 +1854,84 @@ class FMMTraversalBuilder:
                    + template,
                    strict_undefined=True).render(**render_vars)

-            result[list_name+"_builder"] = ListOfListsBuilder(self.context,
+            result[f"{list_name}_builder"] = ListOfListsBuilder(
+                    self.context,
                    [(list_name, box_id_dtype)]
                    + [(extra_list_name, box_id_dtype)
                        for extra_list_name in extra_lists],
                    str(src),
                    arg_decls=base_args + extra_args,
                    debug=debug, name_prefix=list_name,
-                    complex_kernel=True)
+                    complex_kernel=True,
+                    eliminate_empty_output_lists=eliminate_empty_list)

        # }}}

-        logging.info("traversal build kernels built")
-
        return _KernelInfo(**result)

    # }}}

    # {{{ driver

-    def __call__(self, queue, tree, wait_for=None, debug=False):
+    def __call__(self, queue, tree, wait_for=None, debug=False,
+                 _from_sep_smaller_min_nsources_cumul=None,
+                 source_boxes_mask=None,
+                 source_parent_boxes_mask=None):
        """
        :arg queue: A :class:`pyopencl.CommandQueue` instance.
        :arg tree: A :class:`boxtree.Tree` instance.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
-            exeuction.
+            execution.
+        :arg source_boxes_mask: Only boxes passing this mask will be considered for
+            `source_boxes`. Used by the distributed implementation.
+        :arg source_parent_boxes_mask: Only boxes passing this mask will be
+            considered for `source_parent_boxes`. Used by the distributed
+            implementation.
        :return: A tuple *(trav, event)*, where *trav* is a new instance of
            :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """
+        from_sep_smaller_min_nsources_cumul = _from_sep_smaller_min_nsources_cumul
+
+        if from_sep_smaller_min_nsources_cumul is None:
+            # default to old no-threshold behavior
+            from_sep_smaller_min_nsources_cumul = 0

        if not tree._is_pruned:
            raise ValueError("tree must be pruned for traversal generation")

-        # Generated code shouldn't depend on tje *exact* number of tree levels.
+        if tree.sources_have_extent:
+            # YAGNI
+            raise NotImplementedError(
+                    "trees with source extent are not supported for "
+                    "traversal generation")
+
+        # FIXME: missing on TreeOfBoxes
+        sources_are_targets = getattr(tree, "sources_are_targets", True)
+
+        # Generated code shouldn't depend on the *exact* number of tree levels.
        # So round up to the next multiple of 5.
        from pytools import div_ceil
        max_levels = div_ceil(tree.nlevels, 5) * 5

+        level_start_box_nrs = (
+                None if tree.level_start_box_nrs is None else
+                cl.array.to_device(queue, tree.level_start_box_nrs))
+
        knl_info = self.get_kernel_info(
-                tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype,
-                tree.coord_dtype, tree.box_level_dtype, max_levels,
-                tree.sources_are_targets,
-                tree.sources_have_extent, tree.targets_have_extent,
-                tree.stick_out_factor)
+                dimensions=tree.dimensions,
+                particle_id_dtype=getattr(tree, "particle_id_dtype", None),
+                box_id_dtype=tree.box_id_dtype,
+                coord_dtype=tree.coord_dtype,
+                box_level_dtype=tree.box_level_dtype,
+                max_levels=max_levels,
+                sources_are_targets=sources_are_targets,
+                sources_have_extent=tree.sources_have_extent,
+                targets_have_extent=tree.targets_have_extent,
+                extent_norm=tree.extent_norm,
+                source_boxes_has_mask=source_boxes_mask is not None,
+                source_parent_boxes_has_mask=source_parent_boxes_mask is not None)

        def fin_debug(s):
            if debug:
@@ -1269,21 +1939,29 @@ class FMMTraversalBuilder:

            logger.debug(s)

-        logger.info("start building traversal")
+        traversal_plog = ProcessLogger(logger, "build traversal")

        # {{{ source boxes, their parents, and target boxes

        fin_debug("building list of source boxes, their parents, and target boxes")

+        extra_args = []
+        if source_boxes_mask is not None:
+            extra_args.append(source_boxes_mask)
+        if source_parent_boxes_mask is not None:
+            extra_args.append(source_parent_boxes_mask)
+
        result, evt = knl_info.sources_parents_and_targets_builder(
-                queue, tree.nboxes, tree.box_flags.data, wait_for=wait_for)
+            queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for
+        )
+
        wait_for = [evt]

        source_parent_boxes = result["source_parent_boxes"].lists
        source_boxes = result["source_boxes"].lists
        target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists

-        if not tree.sources_are_targets:
+        if not sources_are_targets:
            target_boxes = result["target_boxes"].lists
        else:
            target_boxes = source_boxes
@@ -1293,56 +1971,68 @@ class FMMTraversalBuilder:
        # {{{ figure out level starts in *_parent_boxes

        def extract_level_start_box_nrs(box_list, wait_for):
+            if level_start_box_nrs is None:
+                return None, []
+
            result = cl.array.empty(queue,
                    tree.nlevels+1, tree.box_id_dtype) \
                            .fill(len(box_list))
            evt = knl_info.level_start_box_nrs_extractor(
-                    tree.level_start_box_nrs_dev,
+                    level_start_box_nrs,
                    tree.box_levels,
                    box_list,
                    result,
-                    range=slice(1, len(box_list)),
+                    range=slice(0, len(box_list)),
                    queue=queue, wait_for=wait_for)

            result = result.get()

-            # We skipped box 0 above. This is always true, whether
-            # box 0 (=level 0) is a leaf or a parent.
-            result[0] = 0
-
            # Postprocess result for unoccupied levels
            prev_start = len(box_list)
            for ilev in range(tree.nlevels-1, -1, -1):
                result[ilev] = prev_start = \
                        min(result[ilev], prev_start)

-            return result, evt
+            return result, [evt]
+
+        fin_debug("finding level starts in source boxes array")
+        level_start_source_box_nrs, evt_s = \
+                extract_level_start_box_nrs(
+                        source_boxes, wait_for=wait_for)

        fin_debug("finding level starts in source parent boxes array")
-        level_start_source_parent_box_nrs, evt_s = \
+        level_start_source_parent_box_nrs, evt_sp = \
                extract_level_start_box_nrs(
                        source_parent_boxes, wait_for=wait_for)

+        fin_debug("finding level starts in target boxes array")
+        level_start_target_box_nrs, evt_t = \
+                extract_level_start_box_nrs(
+                        target_boxes, wait_for=wait_for)
+
        fin_debug("finding level starts in target or target parent boxes array")
-        level_start_target_or_target_parent_box_nrs, evt_t = \
+        level_start_target_or_target_parent_box_nrs, evt_tp = \
                extract_level_start_box_nrs(
                        target_or_target_parent_boxes, wait_for=wait_for)

-        wait_for = [evt_s, evt_t]
+        wait_for = evt_s + evt_sp + evt_t + evt_tp

        # }}}

-        # {{{ colleagues
+        # {{{ same-level non-well-separated boxes
+
+        # If well_sep_is_n_away is 1, this agrees with the definition of
+        # 'colleagues' from the classical FMM literature.

-        fin_debug("finding colleagues")
+        fin_debug("finding same-level near-field boxes")

-        result, evt = knl_info.colleagues_builder(
+        result, evt = knl_info.same_level_non_well_sep_boxes_builder(
                queue, tree.nboxes,
-                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
+                tree.box_centers.data, tree.root_extent, tree.box_levels,
+                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
                wait_for=wait_for)
        wait_for = [evt]
-        colleagues = result["colleagues"]
+        same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"]

        # }}}

@@ -1352,9 +2042,9 @@ class FMMTraversalBuilder:

        result, evt = knl_info.neighbor_source_boxes_builder(
                queue, len(target_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
-                target_boxes.data, wait_for=wait_for)
+                tree.box_centers.data, tree.root_extent, tree.box_levels,
+                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+                target_boxes, wait_for=wait_for)

        wait_for = [evt]
        neighbor_source_boxes = result["neighbor_source_boxes"]
@@ -1365,72 +2055,157 @@ class FMMTraversalBuilder:

        fin_debug("finding well-separated siblings ('list 2')")

-        result, evt = knl_info.sep_siblings_builder(
+        result, evt = knl_info.from_sep_siblings_builder(
                queue, len(target_or_target_parent_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
-                target_or_target_parent_boxes.data, tree.box_parent_ids.data,
-                colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
+                tree.box_centers.data, tree.root_extent, tree.box_levels,
+                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+                target_or_target_parent_boxes, tree.box_parent_ids.data,
+                same_level_non_well_sep_boxes.starts,
+                same_level_non_well_sep_boxes.lists,
+                wait_for=wait_for)
        wait_for = [evt]
-        sep_siblings = result["sep_siblings"]
+        from_sep_siblings = result["from_sep_siblings"]

        # }}}

+        with_extent = tree.sources_have_extent or tree.targets_have_extent
+
        # {{{ separated smaller ("list 3")

        fin_debug("finding separated smaller ('list 3')")

-        result, evt = knl_info.sep_smaller_builder(
+        from_sep_smaller_base_args = (
                queue, len(target_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
-                target_boxes.data,
-                colleagues.starts.data, colleagues.lists.data,
-                wait_for=wait_for)
-        wait_for = [evt]
-        sep_smaller = result["sep_smaller"]
+                # base_args
+                tree.box_centers.data, tree.root_extent, tree.box_levels,
+                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+                # list-specific args
+                tree.stick_out_factor, target_boxes,
+                same_level_non_well_sep_boxes.starts,
+                same_level_non_well_sep_boxes.lists,
+                *([tree.box_target_bounding_box_min.data,
+                   tree.box_target_bounding_box_max.data,
+                   tree.box_source_counts_cumul]
+                  if tree.targets_have_extent else []),
+                from_sep_smaller_min_nsources_cumul,
+                )
+
+        from_sep_smaller_wait_for = []
+        from_sep_smaller_by_level = []
+        target_boxes_sep_smaller_by_source_level = []
+
+        for ilevel in range(tree.nlevels):
+            fin_debug(f"finding separated smaller ('list 3 level {ilevel}')")
+
+            result, evt = knl_info.from_sep_smaller_builder(
+                    *from_sep_smaller_base_args, ilevel,
+                    omit_lists=("from_sep_close_smaller",) if with_extent else (),
+                    wait_for=wait_for)

-        if tree.sources_have_extent or tree.targets_have_extent:
-            sep_close_smaller_starts = result["sep_close_smaller"].starts
-            sep_close_smaller_lists = result["sep_close_smaller"].lists
+            target_boxes_sep_smaller = target_boxes[
+                result["from_sep_smaller"].nonempty_indices]
+
+            from_sep_smaller_by_level.append(result["from_sep_smaller"])
+            target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller)
+            from_sep_smaller_wait_for.append(evt)
+
+        if with_extent:
+            fin_debug("finding separated smaller close ('list 3 close')")
+            result, evt = knl_info.from_sep_smaller_builder(
+                    *from_sep_smaller_base_args,
+                     -1,
+                    omit_lists=("from_sep_smaller",),
+                    wait_for=wait_for)
+            from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts
+            from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists
+
+            from_sep_smaller_wait_for.append(evt)
        else:
-            sep_close_smaller_starts = None
-            sep_close_smaller_lists = None
+            from_sep_close_smaller_starts = None
+            from_sep_close_smaller_lists = None

        # }}}

+        wait_for = from_sep_smaller_wait_for
+        del from_sep_smaller_wait_for
+
        # {{{ separated bigger ("list 4")

        fin_debug("finding separated bigger ('list 4')")

-        result, evt = knl_info.sep_bigger_builder(
+        result, evt = knl_info.from_sep_bigger_builder(
                queue, len(target_or_target_parent_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
-                target_or_target_parent_boxes.data, tree.box_parent_ids.data,
-                colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
-        wait_for = [evt]
-        sep_bigger = result["sep_bigger"]
+                tree.box_centers.data, tree.root_extent, tree.box_levels,
+                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+                tree.stick_out_factor, target_or_target_parent_boxes,
+                tree.box_parent_ids.data,
+                same_level_non_well_sep_boxes.starts,
+                same_level_non_well_sep_boxes.lists,
+                wait_for=wait_for)

-        if tree.sources_have_extent or tree.targets_have_extent:
-            sep_close_bigger_starts = result["sep_close_bigger"].starts
-            sep_close_bigger_lists = result["sep_close_bigger"].lists
+        wait_for = [evt]
+        from_sep_bigger = result["from_sep_bigger"]
+
+        if with_extent:
+            # These are indexed by target_or_target_parent boxes; we rewrite
+            # them to be indexed by target_boxes.
+            from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts
+            from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists
+
+            list_merger = _ListMerger(queue.context, tree.box_id_dtype)
+            result, evt = list_merger(
+                    queue,
+                    # starts
+                    (from_sep_close_bigger_starts_raw,),
+                    # lists
+                    (from_sep_close_bigger_lists_raw,),
+                    # input index style
+                    _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES,
+                    # output index style
+                    _IndexStyle.TARGET_BOXES,
+                    # box and tree data
+                    target_boxes,
+                    target_or_target_parent_boxes,
+                    tree.nboxes,
+                    debug,
+                    wait_for=wait_for)
+
+            wait_for = [evt]
+
+            del from_sep_close_bigger_starts_raw
+            del from_sep_close_bigger_lists_raw
+
+            from_sep_close_bigger_starts = result["starts"]
+            from_sep_close_bigger_lists = result["lists"]
        else:
-            sep_close_bigger_starts = None
-            sep_close_bigger_lists = None
+            from_sep_close_bigger_starts = None
+            from_sep_close_bigger_lists = None

        # }}}

+        if self.well_sep_is_n_away == 1:
+            colleagues_starts = same_level_non_well_sep_boxes.starts
+            colleagues_lists = same_level_non_well_sep_boxes.lists
+        else:
+            colleagues_starts = None
+            colleagues_lists = None
+
        evt, = wait_for

-        logger.info("traversal built")
+        traversal_plog.done(
+                "from_sep_smaller_crit: %s",
+                self.from_sep_smaller_crit)

        return FMMTraversalInfo(
                tree=tree,
+                well_sep_is_n_away=self.well_sep_is_n_away,

                source_boxes=source_boxes,
                target_boxes=target_boxes,

+                level_start_source_box_nrs=level_start_source_box_nrs,
+                level_start_target_box_nrs=level_start_target_box_nrs,
+
                source_parent_boxes=source_parent_boxes,
                level_start_source_parent_box_nrs=level_start_source_parent_box_nrs,

@@ -1438,28 +2213,34 @@ class FMMTraversalBuilder:
                level_start_target_or_target_parent_box_nrs=(
                    level_start_target_or_target_parent_box_nrs),

-                colleagues_starts=colleagues.starts,
-                colleagues_lists=colleagues.lists,
+                same_level_non_well_sep_boxes_starts=(
+                    same_level_non_well_sep_boxes.starts),
+                same_level_non_well_sep_boxes_lists=(
+                    same_level_non_well_sep_boxes.lists),
+                # Deprecated, but we'll keep these alive for the time being.
+                colleagues_starts=colleagues_starts,
+                colleagues_lists=colleagues_lists,

                neighbor_source_boxes_starts=neighbor_source_boxes.starts,
                neighbor_source_boxes_lists=neighbor_source_boxes.lists,

-                sep_siblings_starts=sep_siblings.starts,
-                sep_siblings_lists=sep_siblings.lists,
+                from_sep_siblings_starts=from_sep_siblings.starts,
+                from_sep_siblings_lists=from_sep_siblings.lists,

-                sep_smaller_starts=sep_smaller.starts,
-                sep_smaller_lists=sep_smaller.lists,
+                from_sep_smaller_by_level=from_sep_smaller_by_level,
+                target_boxes_sep_smaller_by_source_level=(
+                    target_boxes_sep_smaller_by_source_level),

-                sep_close_smaller_starts=sep_close_smaller_starts,
-                sep_close_smaller_lists=sep_close_smaller_lists,
+                from_sep_close_smaller_starts=from_sep_close_smaller_starts,
+                from_sep_close_smaller_lists=from_sep_close_smaller_lists,

-                sep_bigger_starts=sep_bigger.starts,
-                sep_bigger_lists=sep_bigger.lists,
+                from_sep_bigger_starts=from_sep_bigger.starts,
+                from_sep_bigger_lists=from_sep_bigger.lists,

-                sep_close_bigger_starts=sep_close_bigger_starts,
-                sep_close_bigger_lists=sep_close_bigger_lists,
+                from_sep_close_bigger_starts=from_sep_close_bigger_starts,
+                from_sep_close_bigger_lists=from_sep_close_bigger_lists,
                ).with_queue(None), evt

    # }}}

-# vim: filetype=pyopencl:fdm=marker
+# vim: fdm=marker
--- a/boxtree/tree.py
+++ b/boxtree/tree.py
-from __future__ import division
+"""
+.. _tree-kinds:
+
+Supported tree kinds
+--------------------
+
+The following tree kinds are supported:
+
+- *Nonadaptive* trees have all leaves on the same (last) level.
+
+- *Adaptive* trees differ from nonadaptive trees in that they may have leaves on
+  more than one level. Adaptive trees have the option of being
+  *level-restricted*: in a level-restricted tree, neighboring leaves differ by
+  at most one level.
+
+All trees returned by the tree builder are pruned so that empty leaves have been
+removed. If a level-restricted tree is requested, the tree gets constructed in
+such a way that the version of the tree before pruning is also level-restricted.
+
+Tree data structure
+-------------------
+
+.. currentmodule:: boxtree
+
+.. autoclass:: box_flags_enum
+
+.. autoclass:: TreeOfBoxes
+
+.. autoclass:: Tree
+
+.. currentmodule:: boxtree.tree
+
+Tree with linked point sources
+------------------------------
+
+.. autoclass:: TreeWithLinkedPointSources
+
+.. autofunction:: link_point_sources
+
+Filtering the lists of targets
+------------------------------
+
+.. currentmodule:: boxtree.tree
+
+Data structures
+^^^^^^^^^^^^^^^
+
+.. autoclass:: FilteredTargetListsInUserOrder
+.. autoclass:: FilteredTargetListsInTreeOrder
+
+Tools
+^^^^^
+
+.. autoclass:: ParticleListFilter
+
+.. autofunction:: filter_target_lists_in_user_order
+
+.. autofunction:: filter_target_lists_in_tree_order
+"""

 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

@@ -22,40 +80,217 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import logging
+from dataclasses import dataclass
+from functools import cached_property

-import pyopencl as cl
 import numpy as np
-from boxtree.tools import DeviceDataRecord
+
+import pyopencl as cl
 from cgen import Enum
+from pytools import memoize_method
+
+from boxtree.tools import DeviceDataRecord
+

-import logging
 logger = logging.getLogger(__name__)


 # {{{ box flags

 class box_flags_enum(Enum):  # noqa
-    """Constants for box flags bit field."""
+    """Constants for box flags bit field.
+
+    .. rubric:: Flags for particle-based trees
+
+    .. attribute:: dtype
+
+    .. attribute:: IS_SOURCE_BOX
+    .. attribute:: IS_TARGET_BOX
+    .. attribute:: IS_SOURCE_OR_TARGET_BOX
+    .. attribute:: HAS_SOURCE_CHILD_BOXES
+    .. attribute:: HAS_TARGET_CHILD_BOXES
+    .. attribute:: HAS_SOURCE_OR_TARGET_CHILD_BOXES
+    .. attribute:: IS_LEAF_BOX
+
+    .. warning ::
+
+        :attr:`IS_LEAF_BOX` is only used for :class:`TreeOfBoxes` for the moment.
+    """

    c_name = "box_flags_t"
    dtype = np.dtype(np.uint8)
    c_value_prefix = "BOX_"

-    HAS_OWN_SOURCES = 1 << 0
-    HAS_OWN_TARGETS = 1 << 1
-    HAS_OWN_SRCNTGTS = (HAS_OWN_SOURCES | HAS_OWN_TARGETS)
-    HAS_CHILD_SOURCES = 1 << 2
-    HAS_CHILD_TARGETS = 1 << 3
-    HAS_CHILDREN = (HAS_CHILD_SOURCES | HAS_CHILD_TARGETS)
+    IS_SOURCE_BOX = 1 << 0
+    IS_TARGET_BOX = 1 << 1
+    IS_SOURCE_OR_TARGET_BOX = (IS_SOURCE_BOX | IS_TARGET_BOX)
+    HAS_SOURCE_CHILD_BOXES = 1 << 2
+    HAS_TARGET_CHILD_BOXES = 1 << 3
+    HAS_SOURCE_OR_TARGET_CHILD_BOXES = (
+            HAS_SOURCE_CHILD_BOXES | HAS_TARGET_CHILD_BOXES)
+
+    # FIXME: Only used for TreeOfBoxes for now
+    IS_LEAF_BOX = 1 << 4
+
+    # Deprecated alias, do not use.
+    HAS_CHILDREN = HAS_SOURCE_OR_TARGET_CHILD_BOXES
+
+# }}}
+
+
+# {{{ tree of boxes
+
+@dataclass
+class TreeOfBoxes:
+    """A quad/octree tree of pure boxes, excluding their contents (e.g.
+    particles).  It is a lightweight tree handled with :mod:`numpy`, intended
+    for mesh adaptivity. One may generate a :class:`meshmode.mesh.Mesh` object
+    consisting of leaf boxes using :func:`make_meshmode_mesh_from_leaves`.
+
+    .. attribute:: dimensions
+
+    .. attribute:: nlevels
+
+    .. attribute:: nboxes
+
+    .. attribute:: root_extent
+
+        (Scalar) extent of the root box.
+
+    .. attribute:: box_centers
+
+        :mod:`numpy` array of shape ``(dim, nboxes)`` of the centers of the boxes.
+
+    .. attribute:: box_parent_ids
+
+        :mod:`numpy` vector of parent box ids.
+
+    .. attribute:: box_child_ids
+
+        (2**dim)-by-nboxes :mod:`numpy` array of children box ids.
+
+    .. attribute:: box_levels
+
+        :mod:`numpy` vector of box levels in non-decreasing order.
+
+    .. attribute:: bounding_box
+
+        A :class:`tuple` ``(bbox_min, bbox_max)`` of :mod:`numpy` vectors
+        giving the (built) extent of the tree. Note that this may be slightly
+        larger than what is required to contain all particles, if any.
+
+    .. attribute:: box_flags
+
+        :attr:`box_flags_enum.dtype` ``[nboxes]``
+
+        A bitwise combination of :class:`box_flags_enum` constants.
+
+    .. attribute:: level_start_box_nrs
+
+        ``box_id_t [nlevels+1]``
+
+        An array of box ids indicating the ID at which each level starts. Levels
+        are contiguous in box ID space. To determine how many boxes there are
+        in each level, access the start of the next level. This array is
+        built so that this works even for the last level.
+
+    .. attribute:: box_id_dtype
+    .. attribute:: box_level_dtype
+    .. attribute:: coord_dtype
+
+        See :class:`Tree` documentation.
+
+    .. attribute:: leaf_boxes
+
+        Array of leaf boxes.
+
+    .. attribute:: sources_have_extent
+    .. attribute:: targets_have_extent
+    .. attribute:: extent_norm
+    .. attribute:: stick_out_factor
+
+        See :class:`Tree` documentation.
+
+    .. automethod:: __init__
+    """
+
+    root_extent: np.ndarray
+    box_centers: np.ndarray
+
+    box_parent_ids: np.ndarray
+    box_child_ids: np.ndarray
+    box_levels: np.ndarray
+
+    box_flags: np.ndarray | None
+    level_start_box_nrs: np.ndarray | None
+
+    # FIXME: these should be properties and take values from box_parent_ids, etc
+    box_id_dtype: np.dtype
+    box_level_dtype: np.dtype
+    coord_dtype: np.dtype
+
+    sources_have_extent: bool
+    targets_have_extent: bool
+    extent_norm: str
+    stick_out_factor: float
+
+    _is_pruned: bool
+
+    @property
+    def dimensions(self):
+        return self.box_centers.shape[0]
+
+    @property
+    def nboxes(self):
+        return self.box_centers.shape[1]
+
+    @property
+    def aligned_nboxes(self):
+        return self.box_child_ids.shape[-1]
+
+    @property
+    def nlevels(self):
+        # level starts from 0
+        if isinstance(self.box_levels, cl.array.Array):
+            return int(max(self.box_levels).get()) + 1
+        else:
+            return max(self.box_levels) + 1
+
+    @property
+    def leaf_boxes(self):
+        boxes = np.arange(self.nboxes)
+        return boxes[self.box_flags & box_flags_enum.IS_LEAF_BOX != 0]
+
+    @cached_property
+    def bounding_box(self) -> tuple[np.ndarray, np.ndarray]:
+        lows = self.box_centers[:, 0] - 0.5 * self.root_extent
+        highs = lows + self.root_extent
+        return lows, highs
+
+    # {{{ dummy interface for TreePlotter
+
+    def get_box_size(self, ibox):
+        lev = self.box_levels[ibox]
+        box_size = self.root_extent * 0.5**lev
+        return box_size
+
+    def get_box_extent(self, ibox):
+        box_size = self.get_box_size(ibox)
+        extent_low = self.box_centers[:, ibox] - 0.5*box_size
+        extent_high = extent_low + box_size
+        return extent_low, extent_high

+    # }}}

 # }}}


-# {{{ tree data structure
+# {{{ tree with particles
+
+class Tree(DeviceDataRecord, TreeOfBoxes):
+    r"""A quad/octree consisting of particles sorted into a hierarchy of boxes.

-class Tree(DeviceDataRecord):
-    """A quad/octree consisting of particles sorted into a hierarchy of boxes.
    Optionally, particles may be designated 'sources' and 'targets'. They
    may also be assigned radii which restrict the minimum size of the box
    into which they may be sorted.
@@ -66,6 +301,8 @@ class Tree(DeviceDataRecord):
    Unless otherwise indicated, all bulk data in this data structure is stored
    in a :class:`pyopencl.array.Array`. See also :meth:`get`.

+    Inherits from :class:`TreeOfBoxes`.
+
    .. rubric:: Flags

    .. attribute:: sources_are_targets
@@ -99,43 +336,52 @@ class Tree(DeviceDataRecord):
    .. rubric:: Counts and sizes
    .. ------------------------------------------------------------------------

-    .. attribute:: root_extent
+    .. attribute:: stick_out_factor

-        the root box size, a scalar
+        A scalar used for calculating how much particles with extent may
+        overextend their containing box.

-    .. attribute:: stick_out_factor
+        Each box in the tree can be thought of as being surrounded by a
+        fictitious box whose :math:`l^\infty` radius is larger by a factor of
+        ``1 + stick_out_factor``. Particles with extent are allowed to extend
+        inside (a) the fictitious box or (b) a disk surrounding the fictitious
+        box, depending on :attr:`extent_norm`.

-        The fraction of the box diameter by which the :math:`l^\infty` circles
-        given by :attr:`source_radii` may stick out the box in which they are
-        contained. A scalar.
+    .. attribute:: extent_norm

-    .. attribute:: nsources
+        One of ``None``, ``"l2"`` or ``"linf"``. If *None*, particles do not have
+        extent. If not *None*, indicates the norm with which extent-bearing particles
+        are determined to lie 'inside' a box, taking into account the box's
+        :attr:`stick_out_factor`.

-    .. attribute:: ntargets
+        This image illustrates the difference in semantics:
+
+        .. image:: images/linf-l2.png
+
+        In the figure, the box has (:math:`\ell^\infty`) radius :math:`R`, the
+        particle has radius :math:`r`, and :attr:`stick_out_factor` is denoted
+        :math:`\alpha`.

    .. attribute:: nlevels

-    .. attribute:: bounding_box
+    .. attribute:: nboxes

-        a tuple *(bbox_min, bbox_max)* of
-        :mod:`numpy` vectors giving the (built) extent
-        of the tree. Note that this may be slightly larger
-        than what is required to contain all particles.
+    .. attribute:: nsources
+
+    .. attribute:: ntargets

    .. attribute:: level_start_box_nrs

        ``box_id_t [nlevels+1]``

-        A :class:`numpy.ndarray` of box ids
-        indicating the ID at which each level starts. Levels
-        are contiguous in box ID space. To determine
-        how many boxes there are in each level,
-        access the start of the next level. This array is
+        An array of box ids indicating the ID at which each level starts. Levels
+        are contiguous in box ID space. To determine how many boxes there are
+        in each level, access the start of the next level. This array is
        built so that this works even for the last level.

    .. attribute:: level_start_box_nrs_dev

-        ``particle_id_t [nlevels+1``
+        ``box_id_t [nlevels+1]``

        The same array as :attr:`level_start_box_nrs`
        as a :class:`pyopencl.array.Array`.
@@ -169,10 +415,10 @@ class Tree(DeviceDataRecord):

    .. attribute:: target_radii

-        ``coord_t [nsources]``
+        ``coord_t [ntargets]``

        :math:`l^\infty` radii of the :attr:`targets`.
-        Available if :attr:`sources_have_extent` is *True*.
+        Available if :attr:`targets_have_extent` is *True*.

    .. ------------------------------------------------------------------------
    .. rubric:: Tree/user order indices
@@ -275,11 +521,43 @@ class Tree(DeviceDataRecord):

        :attr:`box_level_dtype` ``box_level_t [nboxes]``

-    .. attribute:: box_flags
+    .. ------------------------------------------------------------------------
+    .. rubric:: Particle-adaptive box extents
+    .. ------------------------------------------------------------------------

-        :attr:`box_flags_enum.dtype` ``[nboxes]``
+    These attributes capture the maximum extent of particles (including the
+    particles' extents) inside of the box.  If the box is empty, both *min* and *max*
+    will reflect the box center.  The purpose of this information is to reduce the
+    cost of some interactions through knowledge that some boxes are partially empty.
+    (See the *from_sep_smaller_crit* argument to the constructor of
+    :class:`boxtree.traversal.FMMTraversalBuilder` for an example.)

-        A bitwise combination of :class:`box_flags_enum` constants.
+    .. note::
+
+        To obtain the overall, non-adaptive box extent, use
+        :attr:`boxtree.Tree.box_centers` along with :attr:`boxtree.Tree.box_levels`.
+
+    If they are not available, the corresponding attributes will be *None*.
+
+    .. attribute:: box_source_bounding_box_min
+
+        ``coord_t [dimensions, aligned_nboxes]``
+
+    .. attribute:: box_source_bounding_box_max
+
+        ``coord_t [dimensions, aligned_nboxes]``
+
+    .. attribute:: box_target_bounding_box_min
+
+        ``coord_t [dimensions, aligned_nboxes]``
+
+    .. attribute:: box_target_bounding_box_max
+
+        ``coord_t [dimensions, aligned_nboxes]``
+
+    .. rubric:: Methods
+
+    .. automethod:: get
    """

    @property
@@ -294,20 +572,16 @@ class Tree(DeviceDataRecord):

    @property
    def nsources(self):
-        return len(self.user_source_ids)
+        return len(self.sources[0])

    @property
    def ntargets(self):
-        return len(self.sorted_target_ids)
+        return len(self.targets[0])

    @property
    def nlevels(self):
        return len(self.level_start_box_nrs) - 1

-    @property
-    def aligned_nboxes(self):
-        return self.box_child_ids.shape[-1]
-
    def plot(self, **kwargs):
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(self)
@@ -350,8 +624,8 @@ class Tree(DeviceDataRecord):
        """
        crit = (
                (self.box_target_starts <= itarget)
-                &
-                (itarget < self.box_target_starts + self.box_target_counts_nonchild))
+                & (itarget
+                    < self.box_target_starts + self.box_target_counts_nonchild))

        return int(np.where(crit)[0])

@@ -361,13 +635,28 @@ class Tree(DeviceDataRecord):
        """
        crit = (
                (self.box_source_starts <= isource)
-                &
-                (isource < self.box_source_starts + self.box_source_counts_nonchild))
+                & (isource
+                    < self.box_source_starts + self.box_source_counts_nonchild))

        return int(np.where(crit)[0])

    # }}}

+    def to_device(self, queue, exclude_fields=frozenset()):
+        """Transfer to device, keeping ``level_start_box_nrs`` on the host."""
+        # Always exclude level_start_box_nrs, on top of the caller's exclusions.
+        host_only = frozenset(exclude_fields) | {"level_start_box_nrs"}
+
+        return super().to_device(queue, host_only)
+
+    def to_host_device_array(self, queue, exclude_fields=frozenset()):
+        """Like the base-class method, but always skips ``level_start_box_nrs``."""
+        # level_start_box_nrs should remain in host memory, so it is excluded
+        # in addition to whatever the caller asked to exclude.
+        host_only = frozenset(exclude_fields) | {"level_start_box_nrs"}
+
+        return super().to_host_device_array(queue, host_only)
+
 # }}}


@@ -378,8 +667,8 @@ class TreeWithLinkedPointSources(Tree):
    linked with extent are expanded into point sources which are linked to the
    extent-having sources in the original tree. (In an FMM context, they may
    stand in for the 'underlying' source for the purpose of the far-field
-    calculation.) Has all the same attributes as :class:`Tree`.
-    :attr:`Tree.sources_have_extent` is always *True* for instances of this
+    calculation.) Has all the same attributes as :class:`boxtree.Tree`.
+    :attr:`boxtree.Tree.sources_have_extent` is always *True* for instances of this
    type. In addition, the following attributes are available.

    .. attribute:: npoint_sources
@@ -397,7 +686,8 @@ class TreeWithLinkedPointSources(Tree):
        is an object array.)

        This array is stored in :ref:`tree point source order <particle-orderings>`,
-        unlike the parameter to :meth:`TreeWithLinkedPointSources.___init__`
+        unlike the parameter to
+        :meth:`boxtree.tree.TreeWithLinkedPointSources.__init__`

    .. attribute:: point_source_counts

@@ -429,13 +719,22 @@ class TreeWithLinkedPointSources(Tree):
    .. attribute:: box_point_source_counts_cumul

        ``particle_id_t [nboxes]``
+
+    .. method:: __init__
+
+        This constructor is not intended to be called by users directly.
+        Call :func:`link_point_sources` instead.
+
+    .. rubric:: Methods
+
+    .. automethod:: get
    """


 def link_point_sources(queue, tree, point_source_starts, point_sources,
        debug=False):
-    """
-    *Construction:* Requires that :attr:`Tree.sources_have_extent` is *True*
+    r"""
+    *Construction:* Requires that :attr:`boxtree.Tree.sources_have_extent` is *True*
    on *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue` instance
@@ -445,9 +744,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
        original (extent-having) source number *isrc*. *isrc* is in :ref:`user
        source order <particle-orderings>`.

-        All the particles linked to *isrc* shoud fall within the :math:`l^\infty`
+        All the particles linked to *isrc* should fall within the :math:`l^\infty`
        'circle' around particle number *isrc* with the radius drawn from
-        :attr:`source_radii`.
+        :attr:`boxtree.Tree.source_radii`.

    :arg point_sources: an object array of (XYZ) point coordinate arrays.
    """
@@ -523,8 +822,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
            dest_indices=tree_order_point_source_starts,
            out=[source_boundaries])

-    from boxtree.tree_build_kernels import \
-            POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL
+    from boxtree.tree_build_kernels import (
+        POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL,
+    )

    logger.debug("point source linking: point source id scan")

@@ -591,7 +891,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,

    tree_attrs = {}
    for attr_name in tree.__class__.fields:
-        try:
+        try:  # noqa: SIM105
            tree_attrs[attr_name] = getattr(tree, attr_name)
        except AttributeError:
            pass
@@ -612,11 +912,11 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
 # }}}


-# {{{ filtered target lists
+# {{{ particle list filter

 class FilteredTargetListsInUserOrder(DeviceDataRecord):
-    """Use :func:`filter_target_lists_in_user_order` to create instances of this
-    class.
+    """Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create
+    instances of this class.

    This class represents subsets of the list of targets in each box (as given
    by :attr:`boxtree.Tree.box_target_starts` and
@@ -647,73 +947,16 @@ class FilteredTargetListsInUserOrder(DeviceDataRecord):
        child boxes).  Use together with :attr:`target_starts`.

        Target numbers are stored in user order, as the class name suggests.
-    """

+    .. rubric:: Methods

-def filter_target_lists_in_user_order(queue, tree, flags):
+    .. automethod:: get
    """
-    :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
-        :class:`numpy.int8` objects, which indicate by being zero that the
-        corresponding target (in user target order) is not part of the
-        filtered list, or by being nonzero that it is.
-
-    :returns: A :class:`FilteredTargetListsInUserOrder`
-    """
-
-    user_order_flags = flags
-    del flags
-
-    user_target_ids = cl.array.empty(queue, tree.ntargets,
-            tree.sorted_target_ids.dtype)
-    user_target_ids[tree.sorted_target_ids] = cl.array.arange(
-            queue, tree.ntargets, user_target_ids.dtype)
-
-    from pyopencl.tools import VectorArg, dtype_to_ctype
-    from pyopencl.algorithm import ListOfListsBuilder
-    from mako.template import Template
-    builder = ListOfListsBuilder(queue.context,
-        [("filt_tgt_list", tree.particle_id_dtype)], Template("""//CL//
-        typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
-
-        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
-        {
-            particle_id_t b_t_start = box_target_starts[i];
-            particle_id_t b_t_count = box_target_counts_nonchild[i];
-
-            for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
-            {
-                particle_id_t user_target_id = user_target_ids[j];
-                if (user_order_flags[user_target_id])
-                {
-                    APPEND_filt_tgt_list(user_target_id);
-                }
-            }
-        }
-        """, strict_undefined=True).render(
-            dtype_to_ctype=dtype_to_ctype,
-            particle_id_dtype=tree.particle_id_dtype
-            ), arg_decls=[
-                VectorArg(user_order_flags.dtype, "user_order_flags"),
-                VectorArg(tree.particle_id_dtype, "user_target_ids"),
-                VectorArg(tree.particle_id_dtype, "box_target_starts"),
-                VectorArg(tree.particle_id_dtype, "box_target_counts_nonchild"),
-            ])
-
-    result, evt = builder(queue, tree.nboxes,
-            user_order_flags.data,
-            user_target_ids.data,
-            tree.box_target_starts.data, tree.box_target_counts_nonchild.data)
-
-    return FilteredTargetListsInUserOrder(
-            nfiltered_targets=result["filt_tgt_list"].count,
-            target_starts=result["filt_tgt_list"].starts,
-            target_lists=result["filt_tgt_list"].lists,
-            ).with_queue(None)


 class FilteredTargetListsInTreeOrder(DeviceDataRecord):
-    """Use :func:`filter_target_lists_in_tree_order` to create instances of this
-    class.
+    """Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create
+    instances of this class.

    This class represents subsets of the list of targets in each box (as given by
    :attr:`boxtree.Tree.box_target_starts` and
@@ -756,90 +999,219 @@ class FilteredTargetListsInTreeOrder(DeviceDataRecord):
        Storing *to* these indices will reorder the targets
        from *filtered* tree target order into 'regular'
        :ref:`tree target order <particle-orderings>`.
+
+    .. rubric:: Methods
+
+    .. automethod:: get
    """


-def filter_target_lists_in_tree_order(queue, tree, flags):
+class ParticleListFilter:
    """
-    :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
-        :class:`numpy.int8` objects, which indicate by being zero that the
-        corresponding target (in user target order) is not part of the
-        filtered list, or by being nonzero that it is.
-    :returns: A :class:`FilteredTargetListsInTreeOrder`
+    .. automethod:: filter_target_lists_in_tree_order
+    .. automethod:: filter_target_lists_in_user_order
    """

-    tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8)
-    tree_order_flags[tree.sorted_target_ids] = flags
+    def __init__(self, context):
+        self.context = context  # passed to the kernel builders in this class

-    from boxtree.tree_build_kernels import (
-            TREE_ORDER_TARGET_FILTER_SCAN_TPL,
-            TREE_ORDER_TARGET_FILTER_INDEX_TPL)
+    @memoize_method
+    def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
+            user_order_flags_dtype):
+        from mako.template import Template

-    scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build(
-        queue.context,
-        type_aliases=(
-            ("scan_t", tree.particle_id_dtype),
-            ("particle_id_t", tree.particle_id_dtype),
-            ),
-        )
-    filtered_from_unfiltered_target_indices = cl.array.empty(
-            queue, tree.ntargets, tree.particle_id_dtype)
-    unfiltered_from_filtered_target_indices = cl.array.empty(
-            queue, tree.ntargets, tree.particle_id_dtype)
-
-    nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype)
-    scan_knl(tree_order_flags,
-            filtered_from_unfiltered_target_indices,
-            unfiltered_from_filtered_target_indices,
-            nfiltered_targets,
-            queue=queue)
+        from pyopencl.algorithm import ListOfListsBuilder
+        from pyopencl.tools import dtype_to_ctype

-    nfiltered_targets = int(nfiltered_targets.get())
+        from boxtree.tools import VectorArg

-    unfiltered_from_filtered_target_indices = \
-            unfiltered_from_filtered_target_indices[:nfiltered_targets]
+        builder = ListOfListsBuilder(self.context,
+            [("filt_tgt_list", particle_id_dtype)], Template("""//CL//
+            typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;

-    from pytools.obj_array import make_obj_array
-    filtered_targets = make_obj_array([
-        targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices]
-        for targets_i in tree.targets
-        ])
+            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
+            {
+                particle_id_t b_t_start = box_target_starts[i];
+                particle_id_t b_t_count = box_target_counts_nonchild[i];

-    index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build(
-        queue.context,
-        type_aliases=(
-            ("particle_id_t", tree.particle_id_dtype),
-            ),
+                for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
+                {
+                    particle_id_t user_target_id = user_target_ids[j];
+                    if (user_order_flags[user_target_id])
+                    {
+                        APPEND_filt_tgt_list(user_target_id);
+                    }
+                }
+            }
+            """, strict_undefined=True).render(
+                dtype_to_ctype=dtype_to_ctype,
+                particle_id_dtype=particle_id_dtype
+                ), arg_decls=[
+                    VectorArg(user_order_flags_dtype, "user_order_flags"),
+                    VectorArg(particle_id_dtype, "user_target_ids"),
+                    VectorArg(particle_id_dtype, "box_target_starts"),
+                    VectorArg(particle_id_dtype, "box_target_counts_nonchild"),
+                ])
+
+        return builder
+
+    def filter_target_lists_in_user_order(self, queue, tree, flags):
+        """Build per-box lists of the flagged targets, numbered in user order.
+
+        :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
+            :class:`numpy.int8` objects, which indicate by being zero that the
+            corresponding target (in user target order) is not part of the
+            filtered list, or by being nonzero that it is.
+        :returns: A :class:`FilteredTargetListsInUserOrder`
+        """
+        ntargets = tree.ntargets
+
+        # Scatter 0..ntargets-1 to the positions given by sorted_target_ids.
+        inverse_target_ids = cl.array.empty(
+                queue, ntargets, tree.sorted_target_ids.dtype)
+        inverse_target_ids[tree.sorted_target_ids] = cl.array.arange(
+                queue, ntargets, inverse_target_ids.dtype)
+
+        # The builder is memoized per (particle id dtype, flag dtype).
+        list_builder = self.get_filter_target_lists_in_user_order_kernel(
+                tree.particle_id_dtype, flags.dtype)
+
+        built, _ = list_builder(
+                queue, tree.nboxes, flags, inverse_target_ids,
+                tree.box_target_starts, tree.box_target_counts_nonchild)
+        filtered = built["filt_tgt_list"]
+
+        return FilteredTargetListsInUserOrder(
+                nfiltered_targets=filtered.count,
+                target_starts=filtered.starts,
+                target_lists=filtered.lists,
+                ).with_queue(None)
+
+    @memoize_method
+    def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype):
+        from boxtree.tree_build_kernels import (
+            TREE_ORDER_TARGET_FILTER_INDEX_TPL,
+            TREE_ORDER_TARGET_FILTER_SCAN_TPL,
        )

-    box_target_starts_filtered = \
-            cl.array.empty_like(tree.box_target_starts)
-    box_target_counts_nonchild_filtered = \
-            cl.array.empty_like(tree.box_target_counts_nonchild)
+        scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build(
+            self.context,
+            type_aliases=(
+                ("scan_t", particle_id_dtype),
+                ("particle_id_t", particle_id_dtype),
+                ),
+            )

-    index_knl(
-            # input
-            tree.box_target_starts,
-            tree.box_target_counts_nonchild,
-            filtered_from_unfiltered_target_indices,
-            tree.ntargets,
-            nfiltered_targets,
+        index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build(
+            self.context,
+            type_aliases=(
+                ("particle_id_t", particle_id_dtype),
+                ),
+            )

-            # output
-            box_target_starts_filtered,
-            box_target_counts_nonchild_filtered,
+        return scan_knl, index_knl

-            queue=queue)
+    def filter_target_lists_in_tree_order(self, queue, tree, flags):
+        """
+        :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
+            :class:`numpy.int8` objects, which indicate by being zero that the
+            corresponding target (in user target order) is not part of the
+            filtered list, or by being nonzero that it is.
+        :returns: A :class:`FilteredTargetListsInTreeOrder`
+        """
+
+        tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8)
+        tree_order_flags[tree.sorted_target_ids] = flags
+
+        filtered_from_unfiltered_target_indices = cl.array.empty(
+                queue, tree.ntargets, tree.particle_id_dtype)
+        unfiltered_from_filtered_target_indices = cl.array.empty(
+                queue, tree.ntargets, tree.particle_id_dtype)
+
+        nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype)
+
+        scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels(
+                tree.particle_id_dtype)
+
+        scan_knl(tree_order_flags,
+                filtered_from_unfiltered_target_indices,
+                unfiltered_from_filtered_target_indices,
+                nfiltered_targets,
+                queue=queue)
+
+        nfiltered_targets = int(nfiltered_targets.get().item())
+
+        unfiltered_from_filtered_target_indices = \
+                unfiltered_from_filtered_target_indices[:nfiltered_targets]
+
+        from pytools.obj_array import make_obj_array
+        filtered_targets = make_obj_array([
+            targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices]
+            for targets_i in tree.targets
+            ])

-    return FilteredTargetListsInTreeOrder(
-            nfiltered_targets=nfiltered_targets,
-            box_target_starts=box_target_starts_filtered,
-            box_target_counts_nonchild=box_target_counts_nonchild_filtered,
-            unfiltered_from_filtered_target_indices=(
-                unfiltered_from_filtered_target_indices),
-            targets=filtered_targets,
-            ).with_queue(None)
+        box_target_starts_filtered = \
+                cl.array.empty_like(tree.box_target_starts)
+        box_target_counts_nonchild_filtered = \
+                cl.array.empty_like(tree.box_target_counts_nonchild)
+
+        index_knl(
+                # input
+                tree.box_target_starts,
+                tree.box_target_counts_nonchild,
+                filtered_from_unfiltered_target_indices,
+                tree.ntargets,
+                nfiltered_targets,
+
+                # output
+                box_target_starts_filtered,
+                box_target_counts_nonchild_filtered,
+
+                queue=queue)
+
+        return FilteredTargetListsInTreeOrder(
+                nfiltered_targets=nfiltered_targets,
+                box_target_starts=box_target_starts_filtered,
+                box_target_counts_nonchild=box_target_counts_nonchild_filtered,
+                unfiltered_from_filtered_target_indices=(
+                    unfiltered_from_filtered_target_indices),
+                targets=filtered_targets,
+                ).with_queue(None)

 # }}}

+
+# {{{ filter_target_lists_in_*_order
+
+def filter_target_lists_in_user_order(queue, tree, flags):
+    """Deprecated shim that builds a throwaway :class:`ParticleListFilter`.
+
+    Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` instead.
+    """
+    import warnings
+
+    warnings.warn(
+            "filter_target_lists_in_user_order() is deprecated and will go "
+            "away in a future release. Use "
+            "ParticleListFilter.filter_target_lists_in_user_order() instead.",
+            DeprecationWarning, stacklevel=2)
+    particle_filter = ParticleListFilter(queue.context)
+    return particle_filter.filter_target_lists_in_user_order(queue, tree, flags)
+
+
+def filter_target_lists_in_tree_order(queue, tree, flags):
+    """Deprecated shim that builds a throwaway :class:`ParticleListFilter`.
+
+    Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` instead.
+    """
+    import warnings
+    warnings.warn(
+            "filter_target_lists_in_tree_order() is deprecated and will go "
+            "away in a future release. Use "
+            "ParticleListFilter.filter_target_lists_in_tree_order() instead.",
+            DeprecationWarning, stacklevel=2)
+    particle_filter = ParticleListFilter(queue.context)
+    return particle_filter.filter_target_lists_in_tree_order(queue, tree, flags)
+# }}}
+
 # vim: filetype=pyopencl:fdm=marker
--- a/boxtree/tree_build.py
+++ b/boxtree/tree_build.py
-from __future__ import division, absolute_import
+"""
+.. currentmodule:: boxtree
+
+Building Particle-Based Trees
+-----------------------------
+
+These functions produce instances of the particle-based :class:`Tree`.
+
+.. note::
+
+    These functions currently keep their bulk data in
+    :class:`pyopencl.array.Array` instances.  This contrasts with the box-based
+    tree (:class:`TreeOfBoxes`), which operates on data in :class:`numpy.ndarray`
+    instances. Along with the rest of :mod:`boxtree`, both will migrate to
+    :mod:`arraycontext` in the future.
+
+.. autoclass:: TreeBuilder
+"""

-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2022 University of Illinois Board of Trustees
+"""

 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -23,20 +43,34 @@ THE SOFTWARE.
 """


-from six.moves import range, zip
+import logging
+from functools import partial
+from itertools import pairwise

 import numpy as np
-from pytools import memoize_method
+
 import pyopencl as cl
-import pyopencl.array  # noqa
-from functools import partial
+import pyopencl.array
+from pytools import DebugProcessLogger, ProcessLogger, memoize_method
+
 from boxtree.tree import Tree

-import logging
+
 logger = logging.getLogger(__name__)


-class TreeBuilder(object):
+class MaxLevelsExceeded(RuntimeError):  # noqa: N818
+    pass  # NOTE(review): presumably signals too many tree levels; confirm use
+
+
+# {{{ tree builder
+
+class TreeBuilder:
+    """
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
    def __init__(self, context):
        """
        :arg context: A :class:`pyopencl.Context`.
@@ -50,35 +84,48 @@ class TreeBuilder(object):
        # This is used to map box IDs and compress box lists in empty leaf
        # pruning.

-        from boxtree.tools import GappyCopyAndMapKernel
+        from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel
        self.gappy_copy_and_map = GappyCopyAndMapKernel(self.context)
+        self.map_values_kernel = MapValuesKernel(self.context)

    morton_nr_dtype = np.dtype(np.int8)
    box_level_dtype = np.dtype(np.uint8)
+    ROOT_EXTENT_STRETCH_FACTOR = 1e-4

    @memoize_method
    def get_kernel_info(self, dimensions, coord_dtype,
            particle_id_dtype, box_id_dtype,
-            sources_are_targets, srcntgts_have_extent,
-            stick_out_factor, adaptive):
+            sources_are_targets, srcntgts_extent_norm,
+            kind):

        from boxtree.tree_build_kernels import get_tree_build_kernel_info
        return get_tree_build_kernel_info(self.context, dimensions, coord_dtype,
            particle_id_dtype, box_id_dtype,
-            sources_are_targets, srcntgts_have_extent,
-            stick_out_factor, self.morton_nr_dtype, self.box_level_dtype,
-            adaptive=adaptive)
+            sources_are_targets, srcntgts_extent_norm,
+            self.morton_nr_dtype, self.box_level_dtype,
+            kind=kind)

    # {{{ run control

-    def __call__(self, queue, particles, max_particles_in_box,
-            allocator=None, debug=False, targets=None,
-            source_radii=None, target_radii=None, stick_out_factor=0.25,
-            wait_for=None, non_adaptive=False,
+    def __call__(self, queue, particles, kind="adaptive",
+            max_particles_in_box=None, allocator=None, debug=False,
+            targets=None, source_radii=None, target_radii=None,
+            stick_out_factor=None, refine_weights=None,
+            max_leaf_refine_weight=None, wait_for=None,
+            extent_norm=None, bbox=None,
            **kwargs):
        """
        :arg queue: a :class:`pyopencl.CommandQueue` instance
        :arg particles: an object array of (XYZ) point coordinate arrays.
+        :arg kind: One of the following strings:
+
+            - 'adaptive'
+            - 'adaptive-level-restricted'
+            - 'non-adaptive'
+
+            'adaptive' requests an adaptive tree without level restriction.  See
+            :ref:`tree-kinds` for further explanation.
+
        :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
            If ``None``, *particles* act as targets, too.
            Must have the same (inner) dtype as *particles*.
@@ -90,12 +137,32 @@ class TreeBuilder(object):

        :arg target_radii: Like *source_radii*, but for targets.
        :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
+        :arg refine_weights: If not *None*, a :class:`pyopencl.array.Array` of the
+            type :class:`numpy.int32`. A box will be split if it has a cumulative
+            refine_weight greater than *max_leaf_refine_weight*. If this is given,
+            *max_leaf_refine_weight* must also be given and *max_particles_in_box*
+            must be *None*.
+        :arg max_leaf_refine_weight: If not *None*, specifies the maximum weight
+            of a leaf box.
+        :arg max_particles_in_box: If not *None*, specifies the maximum number
+            of particles in a leaf box. If this is given, both
+            *refine_weights* and *max_leaf_refine_weight* must be *None*.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
-            exeuction.
-        :arg non_adaptive: If *True*, return a tree in which all leaf boxes are
-            on the same (last) level. The tree is pruned, in the sense that empty
-            boxes have been eliminated.
+            execution.
+        :arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect
+            to which particle stick-out is measured. See :attr:`Tree.extent_norm`.
+        :arg bbox: Bounding box of either type:
+            1. A dim-by-2 array, with each row to be [min, max] coordinates
+            in its corresponding axis direction.
+            2. (Internal use only) of the same type as returned by
+            *boxtree.bounding_box.make_bounding_box_dtype*.
+            When given, this bounding box is used for tree
+            building. Otherwise, the bounding box is determined from particles
+            in such a way that it is square and is slightly larger at the top (so
+            that scaled coordinates are always < 1).
+            When supplied, the bounding box must be square and have all the
+            particles in its closure.
        :arg kwargs: Used internally for debugging.

        :returns: a tuple ``(tree, event)``, where *tree* is an instance of
@@ -105,11 +172,11 @@ class TreeBuilder(object):

        # {{{ input processing

+        if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]:
+            raise ValueError(f"unknown tree kind '{kind}'")
+
        # we'll modify this below, so copy it
-        if wait_for is None:
-            wait_for = []
-        else:
-            wait_for = list(wait_for)
+        wait_for = [] if wait_for is None else list(wait_for)

        dimensions = len(particles)

@@ -119,9 +186,21 @@ class TreeBuilder(object):
        sources_are_targets = targets is None
        sources_have_extent = source_radii is not None
        targets_have_extent = target_radii is not None
+
+        if extent_norm is None:
+            extent_norm = "linf"
+
+        if extent_norm not in ["linf", "l2"]:
+            raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}")
+
+        srcntgts_extent_norm = extent_norm
        srcntgts_have_extent = sources_have_extent or targets_have_extent
+        if not srcntgts_have_extent:
+            srcntgts_extent_norm = None

-        if srcntgts_have_extent and targets is None:
+        del extent_norm
+
+        if srcntgts_extent_norm and targets is None:
            raise ValueError("must specify targets when specifying "
                    "any kind of radii")

@@ -153,20 +232,37 @@ class TreeBuilder(object):
                raise TypeError("dtypes of coordinate arrays and "
                        "target_radii must agree")

+        if sources_have_extent or targets_have_extent:
+            if stick_out_factor is None:
+                raise ValueError("if sources or targets have extent, "
+                        "stick_out_factor must be explicitly specified")
+        else:
+            stick_out_factor = 0
+
        # }}}

        empty = partial(cl.array.empty, queue, allocator=allocator)

        def zeros(shape, dtype):
-            result = (cl.array.empty(queue, shape, dtype, allocator=allocator)
-                    .fill(0, wait_for=wait_for))
-            event, = result.events
+            result = cl.array.zeros(queue, shape, dtype, allocator=allocator)
+            if result.events:
+                event, = result.events
+            else:
+                from numbers import Number
+                if isinstance(shape, Number):
+                    shape = (shape,)
+                from pytools import product
+                assert product(shape) == 0
+                event = cl.enqueue_marker(queue)
+
            return result, event

        knl_info = self.get_kernel_info(dimensions, coord_dtype,
                particle_id_dtype, box_id_dtype,
-                sources_are_targets, srcntgts_have_extent,
-                stick_out_factor, adaptive=not non_adaptive)
+                sources_are_targets, srcntgts_extent_norm,
+                kind=kind)
+
+        logger.debug("tree build: start")

        # {{{ combine sources and targets into one array, if necessary

@@ -176,7 +272,13 @@ class TreeBuilder(object):
            # Targets weren't specified. Sources are also targets. Let's
            # call them "srcntgts".

-            srcntgts = particles
+            if isinstance(particles, np.ndarray) and particles.dtype.char == "O":
+                srcntgts = particles
+            else:
+                from pytools.obj_array import make_obj_array
+                srcntgts = make_obj_array([
+                    p.with_queue(queue).copy() for p in particles
+                    ])

            assert source_radii is None
            assert target_radii is None
@@ -197,10 +299,7 @@ class TreeBuilder(object):
                        "dtype")

            def combine_srcntgt_arrays(ary1, ary2=None):
-                if ary2 is None:
-                    dtype = ary1.dtype
-                else:
-                    dtype = ary2.dtype
+                dtype = ary1.dtype if ary2 is None else ary2.dtype

                result = empty(nsrcntgts, dtype)
                if (ary1 is None) or (ary2 is None):
@@ -217,7 +316,7 @@ class TreeBuilder(object):
            from pytools.obj_array import make_obj_array
            srcntgts = make_obj_array([
                combine_srcntgt_arrays(src_i, tgt_i)
-                for src_i, tgt_i in zip(particles, targets)
+                for src_i, tgt_i in zip(particles, targets, strict=True)
                ])

            if srcntgts_have_extent:
@@ -239,28 +338,108 @@ class TreeBuilder(object):

        # }}}

+        # {{{ process refine_weights
+
+        from boxtree.tree_build_kernels import refine_weight_dtype
+
+        specified_max_particles_in_box = max_particles_in_box is not None
+        specified_refine_weights = refine_weights is not None and \
+            max_leaf_refine_weight is not None
+
+        if specified_max_particles_in_box and specified_refine_weights:
+            raise ValueError("may only specify one of max_particles_in_box and "
+                    "refine_weights/max_leaf_refine_weight")
+        elif not specified_max_particles_in_box and not specified_refine_weights:
+            raise ValueError("must specify either max_particles_in_box or "
+                    "refine_weights/max_leaf_refine_weight")
+        elif specified_max_particles_in_box:
+            refine_weights = (
+                cl.array.empty(
+                    queue, nsrcntgts, refine_weight_dtype, allocator=allocator)
+                .fill(1))
+            event, = refine_weights.events
+            prep_events.append(event)
+            max_leaf_refine_weight = max_particles_in_box
+        elif specified_refine_weights:  # noqa: SIM102
+            if refine_weights.dtype != refine_weight_dtype:
+                raise TypeError(
+                        f"refine_weights must have dtype '{refine_weight_dtype}'")
+
+        if max_leaf_refine_weight < cl.array.max(refine_weights).get():
+            raise ValueError(
+                    "entries of refine_weights cannot exceed max_leaf_refine_weight")
+        if cl.array.min(refine_weights).get() < 0:
+            raise ValueError("all entries of refine_weights must be nonnegative")
+        if max_leaf_refine_weight <= 0:
+            raise ValueError("max_leaf_refine_weight must be positive")
+
+        total_refine_weight = cl.array.sum(
+                refine_weights, dtype=np.dtype(np.int64)).get()
+
+        del max_particles_in_box
+        del specified_max_particles_in_box
+        del specified_refine_weights
+
+        # }}}
+
        # {{{ find and process bounding box

-        bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
-        bbox = bbox.get()
+        if bbox is None:
+            bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
+            bbox = bbox.get()

-        root_extent = max(
+            root_extent = max(
                bbox["max_"+ax] - bbox["min_"+ax]
-                for ax in axis_names) * (1+1e-4)
+                for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR)
+
+            # make bbox square and slightly larger at the top, to ensure scaled
+            # coordinates are always < 1
+            bbox_min = np.empty(dimensions, coord_dtype)
+            for i, ax in enumerate(axis_names):
+                bbox_min[i] = bbox["min_"+ax]
+
+            bbox_max = bbox_min + root_extent
+            for i, ax in enumerate(axis_names):
+                bbox["max_"+ax] = bbox_max[i]
+        else:
+            # Validate that bbox is a superset of particle-derived bbox
+            bbox_auto, _ = self.bbox_finder(
+                    srcntgts, srcntgt_radii, wait_for=wait_for)
+            bbox_auto = bbox_auto.get()
+
+            # Convert unstructured numpy array to bbox_type
+            if isinstance(bbox, np.ndarray):
+                if len(bbox) == dimensions:
+                    bbox_bak = bbox.copy()
+                    bbox = np.empty(1, bbox_auto.dtype)
+                    for i, ax in enumerate(axis_names):
+                        bbox["min_"+ax] = bbox_bak[i][0]
+                        bbox["max_"+ax] = bbox_bak[i][1]
+                else:
+                    assert len(bbox) == 1
+            else:
+                raise NotImplementedError("Unsupported bounding box type: "
+                        + str(type(bbox)))

-        # make bbox square and slightly larger at the top, to ensure scaled
-        # coordinates are always < 1
-        bbox_min = np.empty(dimensions, coord_dtype)
-        for i, ax in enumerate(axis_names):
-            bbox_min[i] = bbox["min_"+ax]
+            # bbox must cover bbox_auto
+            bbox_min = np.empty(dimensions, coord_dtype)
+            bbox_max = np.empty(dimensions, coord_dtype)

-        bbox_max = bbox_min + root_extent
-        for i, ax in enumerate(axis_names):
-            bbox["max_"+ax] = bbox_max[i]
+            for i, ax in enumerate(axis_names):
+                bbox_min[i] = bbox["min_" + ax]
+                bbox_max[i] = bbox["max_" + ax]
+                assert bbox_min[i] < bbox_max[i]
+                assert bbox_min[i] <= bbox_auto["min_" + ax]
+                assert bbox_max[i] >= bbox_auto["max_" + ax]

-        # }}}
+            # bbox must be a square
+            bbox_exts = bbox_max - bbox_min
+            for ext in bbox_exts:
+                assert abs(ext - bbox_exts[0]) < 1e-15

-        from pytools import div_ceil
+            root_extent = bbox_exts[0]
+
+        # }}}

        # {{{ allocate data

@@ -282,26 +461,37 @@ class TreeBuilder(object):
        prep_events.append(evt)
        srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
        prep_events.append(evt)
-        split_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
-        prep_events.append(evt)
-
-        # number of boxes total, and a guess
-        nboxes_dev = empty((), dtype=box_id_dtype)
-        nboxes_dev.fill(1)
-
-        # /!\ If you're allocating an array here that depends on nboxes_guess,
-        # you *must* also write reallocation code down below for the case when
-        # nboxes_guess was too low.

        # Outside nboxes_guess feeding is solely for debugging purposes,
        # to test the reallocation code.
        nboxes_guess = kwargs.get("nboxes_guess")
        if nboxes_guess is None:
-            nboxes_guess = div_ceil(nsrcntgts, max_particles_in_box) * 2**dimensions
+            nboxes_guess = 2**dimensions * (
+                    (max_leaf_refine_weight + total_refine_weight - 1)
+                    // max_leaf_refine_weight)
+
+        assert nboxes_guess > 0
+
+        # /!\ IMPORTANT
+        #
+        # If you're allocating an array here that depends on nboxes_guess, or if
+        # your array contains box numbers, you have to write code for the
+        # following down below as well:
+        #
+        # * You *must* write reallocation code to handle box renumbering and
+        #   reallocation triggered at the top of the level loop.
+        #
+        # * If your array persists after the level loop, you *must* write code
+        #   to handle box renumbering and reallocation triggered by the box
+        #   pruning step.
+
+        split_box_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
+        prep_events.append(evt)

        # per-box morton bin counts
-        box_morton_bin_counts = empty(nboxes_guess,
-                dtype=knl_info.morton_bin_count_dtype)
+        box_morton_bin_counts, evt = zeros(nboxes_guess,
+                                      dtype=knl_info.morton_bin_count_dtype)
+        prep_events.append(evt)

        # particle# at which each box starts
        box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
@@ -311,9 +501,22 @@ class TreeBuilder(object):
        box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
        prep_events.append(evt)

-        # morton nr identifier {quadr,oct}ant of parent in which this box was created
-        box_morton_nrs, evt = zeros(nboxes_guess, dtype=self.morton_nr_dtype)
-        prep_events.append(evt)
+        # pointer to child box, by morton number
+        box_child_ids, evts = zip(
+            *(zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions)),
+            strict=True)
+        prep_events.extend(evts)
+
+        # box centers, by dimension
+        box_centers, evts = zip(
+            *(zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions)),
+            strict=True)
+        prep_events.extend(evts)
+
+        # Initialize box_centers[0] to contain the root box's center
+        for d, (ax, evt) in enumerate(zip(axis_names, evts, strict=True)):
+            center_ax = bbox["min_"+ax] + (bbox["max_"+ax] - bbox["min_"+ax]) / 2
+            box_centers[d][0].fill(center_ax, wait_for=[evt])

        # box -> level map
        box_levels, evt = zeros(nboxes_guess, self.box_level_dtype)
@@ -324,15 +527,39 @@ class TreeBuilder(object):
        box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
        prep_events.append(evt)

-        # Initalize box 0 to contain all particles
-        evt = box_srcntgt_counts_cumul[0].fill(
+        # Initialize box 0 to contain all particles
+        box_srcntgt_counts_cumul[0].fill(
                nsrcntgts, queue=queue, wait_for=[evt])

+        # box -> whether the box has a child. FIXME: use smaller integer type
+        box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32))
+        prep_events.append(evt)
+
+        # box -> whether the box must be split to enforce level restriction.
+        # FIXME: use smaller integer type
+        force_split_box, evt = zeros(nboxes_guess
+                                     if knl_info.level_restrict
+                                     else 0, dtype=np.dtype(np.int32))
+        prep_events.append(evt)
+
        # set parent of root box to itself
        evt = cl.enqueue_copy(
                queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype))
        prep_events.append(evt)

+        # 2*(num bits in the significand)
+        # https://gitlab.tiker.net/inducer/boxtree/issues/23
+        nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1)
+        assert nlevels_max <= np.iinfo(self.box_level_dtype).max
+
+        # level -> starting box on level
+        level_start_box_nrs_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
+        prep_events.append(evt)
+
+        # level -> number of used boxes on level
+        level_used_box_counts_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
+        prep_events.append(evt)
+
        # }}}

        def fin_debug(s):
@@ -345,48 +572,81 @@ class TreeBuilder(object):
        have_oversize_split_box, evt = zeros((), np.int32)
        prep_events.append(evt)

+        # True if and only if the level restrict kernel found a box to split in
+        # order to enforce level restriction.
+        have_upper_level_split_box, evt = zeros((), np.int32)
+        prep_events.append(evt)
+
        wait_for = prep_events

-        # {{{ level loop
+        from pytools import div_ceil

+        # {{{ level loop
        # Level 0 starts at 0 and always contains box 0 and nothing else.
        # Level 1 therefore starts at 1.
        level_start_box_nrs = [0, 1]
+        level_start_box_nrs_dev[0] = 0
+        level_start_box_nrs_dev[1] = 1
+        wait_for.extend(level_start_box_nrs_dev.events)

-        from time import time
-        start_time = time()
-        if nsrcntgts > max_particles_in_box:
-            level = 1
-        else:
-            level = 0
+        # This counts the number of boxes that have been used per level. Note
+        # that this could be fewer than the actual number of boxes allocated to
+        # the level (in the case of building a level restricted tree, more boxes
+        # are pre-allocated for a level than used since we may decide to split
+        # parent level boxes later).
+        level_used_box_counts = [1]
+        level_used_box_counts_dev[0] = 1
+        wait_for.extend(level_used_box_counts_dev.events)
+
+        # level -> number of leaf boxes on level. Initially the root node is a
+        # leaf.
+        level_leaf_counts = np.array([1])
+
+        tree_build_proc = ProcessLogger(logger, "tree build")
+
+        level = 1 if total_refine_weight > max_leaf_refine_weight else 0

        # INVARIANTS -- Upon entry to this loop:
        #
        # - level is the level being built.
        # - the last entry of level_start_box_nrs is the beginning of the level
        #   to be built
+        # - the last entry of level_used_box_counts is the number of boxes that
+        #   are used (not just allocated) at the previous level

        # This while condition prevents entering the loop in case there's just a
        # single box, by how 'level' is set above. Read this as 'while True' with
        # an edge case.

-        logger.debug("entering level loop with %s srcntgts" % nsrcntgts)
+        level_loop_proc = DebugProcessLogger(logger, "tree build level loop")
+
+        # When doing level restriction, the level loop may need to be entered
+        # one more time after creating all the levels (see fixme note below
+        # regarding this). This flag is set to True when that happens.
+        final_level_restrict_iteration = False

        while level:
            if debug:
                # More invariants:
                assert level == len(level_start_box_nrs) - 1
+                assert level == len(level_used_box_counts)
+                assert level == len(level_leaf_counts)

-            if level > np.iinfo(self.box_level_dtype).max:
-                raise RuntimeError("level count exceeded maximum")
+            if level + 1 >= nlevels_max:  # level is zero-based
+                raise MaxLevelsExceeded("Level count exceeded number of significant "
+                        "bits in coordinate dtype. That means that a large number "
+                        "of particles was indistinguishable up to floating point "
+                        "precision (because they ended up in the same box).")

            common_args = ((morton_bin_counts, morton_nrs,
-                    box_start_flags, srcntgt_box_ids, split_box_ids,
+                    box_start_flags,
+                    srcntgt_box_ids, split_box_ids,
                    box_morton_bin_counts,
+                    refine_weights,
+                    max_leaf_refine_weight,
                    box_srcntgt_starts, box_srcntgt_counts_cumul,
-                    box_parent_ids, box_morton_nrs,
-                    nboxes_dev,
-                    level, max_particles_in_box, bbox,
+                    box_parent_ids, box_levels,
+                    level, bbox,
                    user_srcntgt_ids)
                    + tuple(srcntgts)
                    + ((srcntgt_radii,) if srcntgts_have_extent else ())
@@ -394,153 +654,560 @@ class TreeBuilder(object):

            fin_debug("morton count scan")

-            # writes: box_morton_bin_counts, morton_nrs
+            morton_count_args = common_args
+            if srcntgts_have_extent:
+                morton_count_args += (stick_out_factor,)
+
+            # writes: box_morton_bin_counts
            evt = knl_info.morton_count_scan(
-                    *common_args, queue=queue, size=nsrcntgts,
+                    *morton_count_args, queue=queue, size=nsrcntgts,
                    wait_for=wait_for)
            wait_for = [evt]

            fin_debug("split box id scan")

-            # writes: nboxes_dev, split_box_ids
+            # writes: box_has_children, split_box_ids
            evt = knl_info.split_box_id_scan(
                    srcntgt_box_ids,
-                    box_srcntgt_starts,
                    box_srcntgt_counts_cumul,
-                    max_particles_in_box,
                    box_morton_bin_counts,
+                    refine_weights,
+                    max_leaf_refine_weight,
                    box_levels,
+                    level_start_box_nrs_dev,
+                    level_used_box_counts_dev,
+                    force_split_box,
                    level,

-                    # input/output:
-                    nboxes_dev,
-
                    # output:
+                    box_has_children,
                    split_box_ids,
-                    queue=queue, size=nsrcntgts, wait_for=wait_for)
+                    have_oversize_split_box,
+
+                    queue=queue,
+                    size=level_start_box_nrs[level],
+                    wait_for=wait_for)
            wait_for = [evt]

-            nboxes_new = int(nboxes_dev.get())
+            # {{{ compute new level_used_box_counts, level_leaf_counts
+
+            # The last split_box_id on each level tells us how many boxes are
+            # needed at the next level.
+            new_level_used_box_counts = [1]
+            for level_start_box_id in level_start_box_nrs[1:]:
+                last_box_on_prev_level = level_start_box_id - 1
+                new_level_used_box_counts.append(
+                    # FIXME: Get this all at once.
+                    int(split_box_ids[last_box_on_prev_level].get())
+                    - level_start_box_id)
+
+            # New leaf count =
+            #   old leaf count
+            #   + nr. new boxes from splitting parent's leaves
+            #   - nr. new boxes from splitting current level's leaves / 2**d
+            level_used_box_counts_diff = (new_level_used_box_counts
+                    - np.append(level_used_box_counts, [0]))
+            new_level_leaf_counts = (level_leaf_counts
+                    + level_used_box_counts_diff[:-1]
+                    - level_used_box_counts_diff[1:] // 2 ** dimensions)
+            new_level_leaf_counts = np.append(
+                    new_level_leaf_counts,
+                    [level_used_box_counts_diff[-1]])
+            del level_used_box_counts_diff
+
+            # }}}

            # Assumption: Everything between here and the top of the loop must
            # be repeatable, so that in an out-of-memory situation, we can just
            # rerun this bit of the code after reallocating and a minimal reset
            # procedure.

-            # {{{ reallocate and retry if nboxes_guess was too small
+            # The algorithm for deciding on level sizes is as follows:
+            # 1. Compute the minimal necessary size of each level, including the
+            #    new level being created.
+            # 2. If level restricting, add padding to the new level being created.
+            # 3. Check if there is enough existing space for each level.
+            # 4. If any level does not have sufficient space, reallocate all levels:
+            #    4a. Compute new sizes of upper levels
+            #    4b. If level restricting, add padding to all levels.
+
+            curr_upper_level_lengths = np.diff(level_start_box_nrs)
+            minimal_upper_level_lengths = np.max(
+                [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0)
+            minimal_new_level_length = new_level_used_box_counts[-1]
+
+            # Allocate extra space at the end of the current level for higher
+            # level leaves that may be split later.
+            #
+            # If there are no further levels to split (i.e.
+            # have_oversize_split_box = 0), then we do not need to allocate any
+            # extra space, since no new leaves can be created at the bottom
+            # level.
+            if knl_info.level_restrict and have_oversize_split_box.get():
+                # Currently undocumented.
+                lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1)
+                minimal_new_level_length += sum(
+                    2**(lev*dimensions) * new_level_leaf_counts[level - lev]
+                    for lev in range(1, 1 + min(level, lr_lookbehind_levels)))
+
+            nboxes_minimal = \
+                    sum(minimal_upper_level_lengths) + minimal_new_level_length
+
+            needs_renumbering = \
+                    (curr_upper_level_lengths < minimal_upper_level_lengths).any()
+
+            # {{{ prepare for reallocation/renumbering
+
+            if needs_renumbering:
+                assert knl_info.level_restrict
+
+                # {{{ compute new level_start_box_nrs
+
+                # Represents the amount of padding needed for upper levels.
+                upper_level_padding = np.zeros(level, dtype=int)
+
+                # Recompute the level padding.
+                for ulevel in range(level):
+                    upper_level_padding[ulevel] = sum(
+                        2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev]
+                        for lev in range(
+                            1, 1 + min(ulevel, lr_lookbehind_levels)))
+
+                new_upper_level_unused_box_counts = np.max(
+                    [upper_level_padding,
+                    minimal_upper_level_lengths - new_level_used_box_counts[:-1]],
+                    axis=0)
+
+                new_level_start_box_nrs = np.empty(level + 1, dtype=int)
+                new_level_start_box_nrs[0] = 0
+                new_level_start_box_nrs[1:] = np.cumsum(
+                    new_level_used_box_counts[:-1]
+                    + new_upper_level_unused_box_counts)
+
+                assert not (level_start_box_nrs == new_level_start_box_nrs).all()
+
+                # }}}
+
+                # {{{ set up reallocators
+
+                old_box_count = level_start_box_nrs[-1]
+                # Where should I put this box?
+                dst_box_id = cl.array.empty(queue,
+                        shape=old_box_count, dtype=box_id_dtype)
+
+                for level_start, new_level_start, level_len in zip(
+                        level_start_box_nrs[:-1],
+                        new_level_start_box_nrs[:-1],
+                        curr_upper_level_lengths, strict=True):
+                    dst_box_id[level_start:level_start + level_len] = \
+                            cl.array.arange(queue,
+                                            new_level_start,
+                                            new_level_start + level_len,
+                                            dtype=box_id_dtype)
+
+                wait_for.extend(dst_box_id.events)
+
+                realloc_array = partial(self.gappy_copy_and_map,
+                        dst_indices=dst_box_id, range=slice(old_box_count),
+                        debug=debug)
+                realloc_and_renumber_array = partial(self.gappy_copy_and_map,
+                        dst_indices=dst_box_id, map_values=dst_box_id,
+                        range=slice(old_box_count), debug=debug)
+                renumber_array = partial(self.map_values_kernel, dst_box_id)
+
+                # }}}
+
+                # Update level_start_box_nrs. This will be the
+                # level_start_box_nrs for the reallocated data.
+
+                level_start_box_nrs = list(new_level_start_box_nrs)
+                level_start_box_nrs_dev[:level + 1] = \
+                    np.array(new_level_start_box_nrs, dtype=box_id_dtype)
+                level_start_box_nrs_updated = True
+                wait_for.extend(level_start_box_nrs_dev.events)
+
+                nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length
+
+                del new_level_start_box_nrs
+            else:
+                from boxtree.tools import realloc_array
+                realloc_and_renumber_array = realloc_array
+                renumber_array = None
+                level_start_box_nrs_updated = False
+                nboxes_new = nboxes_minimal
+
+            del nboxes_minimal

-            if nboxes_new > nboxes_guess:
+            # }}}
+
+            # {{{ reallocate and/or renumber boxes if necessary
+
+            if level_start_box_nrs_updated or nboxes_new > nboxes_guess:
                fin_debug("starting nboxes_guess increase")

                while nboxes_guess < nboxes_new:
                    nboxes_guess *= 2

-                from boxtree.tools import realloc_array
-                my_realloc = partial(realloc_array, new_shape=nboxes_guess,
-                        zero_fill=False, queue=queue, wait_for=wait_for)
-                my_realloc_zeros = partial(realloc_array, new_shape=nboxes_guess,
-                        zero_fill=True, queue=queue, wait_for=wait_for)
+                def my_realloc_nocopy(ary, shape=nboxes_guess):
+                    return cl.array.empty(queue, allocator=allocator,
+                            shape=shape, dtype=ary.dtype)
+
+                def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
+                    result = cl.array.zeros(queue, allocator=allocator,
+                            shape=shape, dtype=ary.dtype)
+                    return result, result.events[0]
+
+                my_realloc = partial(realloc_array,
+                        queue, allocator, nboxes_guess, wait_for=wait_for)
+                my_realloc_zeros = partial(realloc_array,
+                        queue, allocator, nboxes_guess, zero_fill=True,
+                        wait_for=wait_for)
+                my_realloc_zeros_and_renumber = partial(realloc_and_renumber_array,
+                        queue, allocator, nboxes_guess, zero_fill=True,
+                        wait_for=wait_for)

                resize_events = []
-                box_morton_bin_counts, evt = my_realloc(box_morton_bin_counts)
+
+                split_box_ids = my_realloc_nocopy(split_box_ids)
+
+                # *Most*, but not *all* of the values in this array are
+                # rewritten when the morton scan is redone. Specifically,
+                # only the box morton bin counts of boxes on the level
+                # currently being processed are written, but we need to
+                # retain the box morton bin counts from the higher levels.
+                box_morton_bin_counts, evt = my_realloc_zeros(
+                        box_morton_bin_counts)
                resize_events.append(evt)

+                # force_split_box is unused unless level restriction is enabled.
+                if knl_info.level_restrict:
+                    force_split_box, evt = my_realloc_zeros(force_split_box)
+                    resize_events.append(evt)
+
                box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts)
                resize_events.append(evt)
-                box_parent_ids, evt = my_realloc_zeros(box_parent_ids)
-                resize_events.append(evt)
-                box_morton_nrs, evt = my_realloc_zeros(box_morton_nrs)
-                resize_events.append(evt)
-                box_levels, evt = my_realloc_zeros(box_levels)
-                resize_events.append(evt)
+
                box_srcntgt_counts_cumul, evt = \
                        my_realloc_zeros(box_srcntgt_counts_cumul)
                resize_events.append(evt)

-                del my_realloc
-                del my_realloc_zeros
+                box_has_children, evt = my_realloc_zeros(box_has_children)
+                resize_events.append(evt)
+
+                box_centers, evts = zip(
+                    *(my_realloc(ary) for ary in box_centers), strict=True)
+                resize_events.extend(evts)

-                # reset nboxes_dev to previous value
-                nboxes_dev.fill(level_start_box_nrs[-1])
+                box_child_ids, evts = zip(
+                    *(my_realloc_zeros_and_renumber(ary)
+                      for ary in box_child_ids), strict=True)
+                resize_events.extend(evts)
+
+                box_parent_ids, evt = my_realloc_zeros_and_renumber(box_parent_ids)
                resize_events.append(evt)

-                wait_for = resize_events
+                if not level_start_box_nrs_updated:
+                    box_levels, evt = my_realloc(box_levels)
+                    resize_events.append(evt)
+                else:
+                    box_levels, evt = my_realloc_zeros_nocopy(box_levels)
+                    cl.wait_for_events([evt])
+                    for box_level, (level_start, level_end) in enumerate(
+                            pairwise(level_start_box_nrs)):
+                        box_levels[level_start:level_end].fill(box_level)
+                    resize_events.extend(box_levels.events)
+
+                if level_start_box_nrs_updated:
+                    srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids)
+                    resize_events.append(evt)
+
+                del my_realloc_zeros
+                del my_realloc_nocopy
+                del my_realloc_zeros_nocopy
+                del renumber_array
+
+                # Can't del on Py2.7 - these are used in generator expressions
+                # above, which are nested scopes
+                my_realloc = None
+                my_realloc_zeros_and_renumber = None

                # retry
                logger.info("nboxes_guess exceeded: "
-                        "enlarged allocations, restarting level")
+                            "enlarged allocations, restarting level")

                continue

            # }}}

-            logger.info("LEVEL %d -> %d boxes" % (level, nboxes_new))
+            logger.debug("LEVEL %d -> %d boxes", level, nboxes_new)

-            assert level_start_box_nrs[-1] != nboxes_new or srcntgts_have_extent
+            assert (
+                level_start_box_nrs[-1] != nboxes_new
+                or srcntgts_have_extent
+                or final_level_restrict_iteration)

            if level_start_box_nrs[-1] == nboxes_new:
-                # We haven't created new boxes in this level loop trip.  Unless
-                # srcntgts have extent, this should never happen.  (I.e., we
-                # should've never entered this loop trip.)
+                # We haven't created new boxes in this level loop trip.
                #
                # If srcntgts have extent, this can happen if boxes were
                # in-principle overfull, but couldn't subdivide because of
                # extent restrictions.
+                if srcntgts_have_extent and not final_level_restrict_iteration:
+                    level -= 1
+                    break
+                assert final_level_restrict_iteration

-                assert srcntgts_have_extent
+            # {{{ update level_start_box_nrs, level_used_box_counts

-                level -= 1
+            level_start_box_nrs.append(nboxes_new)
+            level_start_box_nrs_dev[level + 1].fill(nboxes_new)
+            wait_for.extend(level_start_box_nrs_dev.events)

-                logger.debug("no new boxes created this loop trip")
-                break
+            level_used_box_counts = new_level_used_box_counts
+            level_used_box_counts_dev[:level + 1] = \
+                    np.array(level_used_box_counts, dtype=box_id_dtype)
+            wait_for.extend(level_used_box_counts_dev.events)
+
+            level_leaf_counts = new_level_leaf_counts
+            if debug:
+                for level_start, level_nboxes, leaf_count in zip(
+                        level_start_box_nrs[:-1],
+                        level_used_box_counts,
+                        level_leaf_counts, strict=True):
+                    if level_nboxes == 0:
+                        assert leaf_count == 0
+                        continue
+                    nleaves_actual = level_nboxes - int(
+                        cl.array.sum(box_has_children[
+                            level_start:level_start + level_nboxes]).get())
+                    assert leaf_count == nleaves_actual
+
+            # Can't del in Py2.7 - see note below
+            new_level_leaf_counts = None
+
+            # }}}

-            level_start_box_nrs.append(nboxes_new)
            del nboxes_new
+            del new_level_used_box_counts

-            new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
-            new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
-            split_and_sort_args = (
-                    common_args
-                    + (new_user_srcntgt_ids, have_oversize_split_box,
-                        new_srcntgt_box_ids, box_levels))
+            # {{{ split boxes

-            fin_debug("split and sort")
+            box_splitter_args = (
+                *common_args,
+                box_has_children,
+                force_split_box,
+                root_extent,
+                *box_child_ids,
+                *box_centers)

-            evt = knl_info.split_and_sort_kernel(*split_and_sort_args,
+            evt = knl_info.box_splitter_kernel(*box_splitter_args,
+                    range=slice(level_start_box_nrs[-1]),
                    wait_for=wait_for)
+
            wait_for = [evt]

+            fin_debug("box splitter")
+
+            # Mark the levels of boxes added for padding (these were not updated
+            # by the box splitter kernel).
+            last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1]
+            box_levels[last_used_box:level_start_box_nrs[-1]].fill(level)
+
+            wait_for.extend(box_levels.events)
+
            if debug:
+                box_levels.finish()
                level_bl_chunk = box_levels.get()[
                        level_start_box_nrs[-2]:level_start_box_nrs[-1]]
-                assert ((level_bl_chunk == level) | (level_bl_chunk == 0)).all()
+                assert (level_bl_chunk == level).all()
                del level_bl_chunk

            if debug:
                assert (box_srcntgt_starts.get() < nsrcntgts).all()

+            # }}}
+
+            # {{{ renumber particles within split boxes
+
+            new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
+            new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
+
+            particle_renumberer_args = (
+                *common_args,
+                box_has_children,
+                force_split_box,
+                new_user_srcntgt_ids,
+                new_srcntgt_box_ids)
+
+            evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args,
+                    range=slice(nsrcntgts), wait_for=wait_for)
+
+            wait_for = [evt]
+
+            fin_debug("particle renumbering")
+
            user_srcntgt_ids = new_user_srcntgt_ids
            del new_user_srcntgt_ids
            srcntgt_box_ids = new_srcntgt_box_ids
            del new_srcntgt_box_ids

+            # }}}
+
+            # {{{ enforce level restriction on upper levels
+
+            if final_level_restrict_iteration:
+                # Roll back level update.
+                #
+                # FIXME: The extra iteration at the end to split boxes should
+                # not be necessary. Instead, all the work for the final box
+                # split should be done in the last iteration of the level
+                # loop. Currently the main issue that forces the extra iteration
+                # to be there is the need to use the box renumbering and
+                # reallocation code. In order to fix this issue, the box
+                # numbering and reallocation code needs to be accessible after
+                # the final level restriction is done.
+                assert int(have_oversize_split_box.get()) == 0
+                assert level_used_box_counts[-1] == 0
+                del level_used_box_counts[-1]
+                del level_start_box_nrs[-1]
+                level -= 1
+                break
+
+            if knl_info.level_restrict:
+                # Avoid generating too many kernels.
+                LEVEL_STEP = 10  # noqa
+                if level % LEVEL_STEP == 1:
+                    level_restrict_kernel = knl_info.level_restrict_kernel_builder(
+                            LEVEL_STEP * div_ceil(level, LEVEL_STEP))
+
+                # Upward pass - check if leaf boxes at higher levels need
+                # further splitting.
+                assert len(force_split_box) > 0
+                force_split_box.fill(0)
+                wait_for.extend(force_split_box.events)
+
+                did_upper_level_split = False
+
+                if debug:
+                    boxes_split = []
+
+                for upper_level, upper_level_start, upper_level_box_count in zip(
+                        # We just built level. Our parent level doesn't need to
+                        # be rechecked for splitting because the smallest boxes
+                        # in the tree (ours) already have a 2-to-1 ratio with
+                        # that. Start checking at the level above our parent.
+                        range(level - 2, 0, -1),
+                        # At this point, the last entry in level_start_box_nrs
+                        # already refers to (level + 1).
+                        level_start_box_nrs[-4::-1],
+                        level_used_box_counts[-3::-1], strict=False):
+
+                    upper_level_slice = slice(
+                        upper_level_start, upper_level_start + upper_level_box_count)
+
+                    have_upper_level_split_box.fill(0)
+                    wait_for.extend(have_upper_level_split_box.events)
+
+                    # writes: force_split_box, have_upper_level_split_box
+                    evt = level_restrict_kernel(  # pylint: disable=possibly-used-before-assignment
+                        upper_level,
+                        root_extent,
+                        box_has_children,
+                        force_split_box,
+                        have_upper_level_split_box,
+                        *(box_child_ids + box_centers),
+                        slice=upper_level_slice,
+                        wait_for=wait_for)
+
+                    wait_for = [evt]
+
+                    if debug:
+                        force_split_box.finish()
+                        boxes_split.append(int(cl.array.sum(
+                            force_split_box[upper_level_slice]).get()))
+
+                    if int(have_upper_level_split_box.get()) == 0:
+                        break
+
+                    did_upper_level_split = True
+
+                if debug:
+                    total_boxes_split = sum(boxes_split)
+                    logger.debug("level restriction: %d boxes split",
+                                 total_boxes_split)
+                    from itertools import count
+                    for level_, nboxes_split in zip(
+                            count(level - 2, step=-1), boxes_split[:-1]):
+                        logger.debug("level %d: %d boxes split", level_, nboxes_split)
+                    del boxes_split
+
+                if int(have_oversize_split_box.get()) == 0 and did_upper_level_split:
+                    # We are in the situation where there are boxes left to
+                    # split on upper levels, and the level loop is done creating
+                    # lower levels.
+                    #
+                    # We re-run the level loop one more time to finish creating
+                    # the upper level boxes.
+                    final_level_restrict_iteration = True
+                    level += 1
+                    continue
+
+            # }}}
+
            if not int(have_oversize_split_box.get()):
-                logger.debug("no overfull boxes left")
+                logger.debug("no boxes left to split")
                break

            level += 1
-
            have_oversize_split_box.fill(0)

-        end_time = time()
-        elapsed = end_time-start_time
+            # {{{ check that nonchild part of box_morton_bin_counts is consistent
+
+            if debug and 0:
+                h_box_morton_bin_counts = box_morton_bin_counts.get()
+                h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
+                h_box_child_ids = tuple(bci.get() for bci in box_child_ids)
+
+                has_mismatch = False
+                for ibox in range(level_start_box_nrs[-1]):
+                    is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids)
+                    if is_leaf:
+                        # nonchild count only found in box_info kernel
+                        continue
+
+                    if h_box_srcntgt_counts_cumul[ibox] == 0:
+                        # empty boxes don't have box_morton_bin_counts written
+                        continue
+
+                    kid_sum = sum(
+                            h_box_srcntgt_counts_cumul[bci[ibox]]
+                            for bci in h_box_child_ids
+                            if bci[ibox] != 0)
+
+                    if (
+                            h_box_srcntgt_counts_cumul[ibox]
+                            != (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"]
+                                + kid_sum)):
+                        print("MISMATCH", level, ibox)
+                        has_mismatch = True
+
+                assert not has_mismatch
+                print(f"LEVEL {level} OK")
+
+                # Cannot delete in Py 2.7: referred to from nested scope.
+                h_box_srcntgt_counts_cumul = None
+
+                del h_box_morton_bin_counts
+                del h_box_child_ids
+
+            # }}}
+
+        nboxes = level_start_box_nrs[-1]
+
        npasses = level+1
-        logger.info("elapsed time: %g s (%g s/particle/pass)" % (
-                elapsed, elapsed/(npasses*nsrcntgts)))
+        level_loop_proc.done("%d levels, %d boxes", level, nboxes)
        del npasses

-        nboxes = int(nboxes_dev.get())
-
        # }}}

        # {{{ extract number of non-child srcntgts from box morton counts
@@ -567,44 +1234,88 @@ class TreeBuilder(object):
            del highest_possibly_split_box_nr

            if debug:
-                assert (box_srcntgt_counts_nonchild.get()
-                        <= box_srcntgt_counts_cumul.get()[:nboxes]).all()
+                h_box_srcntgt_counts_nonchild = box_srcntgt_counts_nonchild.get()
+                h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
+
+                assert (h_box_srcntgt_counts_nonchild
+                        <= h_box_srcntgt_counts_cumul[:nboxes]).all()
+
+                del h_box_srcntgt_counts_nonchild
+
+                # Cannot delete in Py 2.7: referred to from nested scope.
+                h_box_srcntgt_counts_cumul = None

        # }}}

        del morton_nrs
        del box_morton_bin_counts

-        # {{{ prune empty leaf boxes
+        # {{{ prune empty/unused leaf boxes

-        is_pruned = not kwargs.get("skip_prune")
-        if is_pruned:
+        prune_empty_leaves = not kwargs.get("skip_prune")

+        if prune_empty_leaves:
            # What is the original index of this box?
-            from_box_id = empty(nboxes, box_id_dtype)
+            src_box_id = empty(nboxes, box_id_dtype)

            # Where should I put this box?
-            to_box_id = empty(nboxes, box_id_dtype)
+            #
+            # Initialize to all zeros, because pruned boxes should be mapped to
+            # zero (e.g. when pruning child_box_ids).
+            dst_box_id, evt = zeros(nboxes, box_id_dtype)
+            wait_for.append(evt)

            fin_debug("find prune indices")

            nboxes_post_prune_dev = empty((), dtype=box_id_dtype)
            evt = knl_info.find_prune_indices_kernel(
                    box_srcntgt_counts_cumul,
-                    to_box_id, from_box_id, nboxes_post_prune_dev,
+                    src_box_id, dst_box_id, nboxes_post_prune_dev,
                    size=nboxes, wait_for=wait_for)
            wait_for = [evt]
-
-            fin_debug("prune copy")
-
            nboxes_post_prune = int(nboxes_post_prune_dev.get())
+            logger.debug("%d boxes after pruning "
+                        "(%d empty leaves and/or unused boxes removed)",
+                        nboxes_post_prune, nboxes - nboxes_post_prune)
+            should_prune = True
+        elif knl_info.level_restrict:
+            # Remove unused boxes from the tree.
+            src_box_id = empty(nboxes, box_id_dtype)
+            dst_box_id = empty(nboxes, box_id_dtype)
+
+            new_level_start_box_nrs = np.empty_like(level_start_box_nrs)
+            new_level_start_box_nrs[0] = 0
+            new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts)
+            for level_start, new_level_start, level_used_box_count in zip(
+                    level_start_box_nrs[:-1],
+                    new_level_start_box_nrs[:-1],
+                    level_used_box_counts, strict=True):
+                def make_slice(start, offset=level_used_box_count):
+                    return slice(start, start + offset)
+
+                def make_arange(start, offset=level_used_box_count):
+                    return cl.array.arange(
+                            queue, start, start + offset, dtype=box_id_dtype)
+
+                src_box_id[make_slice(new_level_start)] = make_arange(level_start)
+                dst_box_id[make_slice(level_start)] = make_arange(new_level_start)
+            wait_for.extend(src_box_id.events + dst_box_id.events)
+
+            nboxes_post_prune = new_level_start_box_nrs[-1]
+
+            logger.info("%d boxes after pruning (%d unused boxes removed)",
+                        nboxes_post_prune, nboxes - nboxes_post_prune)
+            should_prune = True
+        else:
+            should_prune = False

-            logger.info("%d empty leaves" % (nboxes-nboxes_post_prune))
-
+        if should_prune:
            prune_events = []

            prune_empty = partial(self.gappy_copy_and_map,
-                    queue, allocator, nboxes_post_prune, from_box_id)
+                    queue, allocator, nboxes_post_prune,
+                    src_indices=src_box_id,
+                    range=slice(nboxes_post_prune), debug=debug)

            box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts)
            prune_events.append(evt)
@@ -612,28 +1323,52 @@ class TreeBuilder(object):
            box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul)
            prune_events.append(evt)

-            if debug:
+            if debug and prune_empty_leaves:
                assert (box_srcntgt_counts_cumul.get() > 0).all()

-            srcntgt_box_ids = cl.array.take(to_box_id, srcntgt_box_ids)
-
-            box_parent_ids, evt = prune_empty(box_parent_ids, map_values=to_box_id)
+            srcntgt_box_ids, evt = self.map_values_kernel(
+                    dst_box_id, srcntgt_box_ids)
            prune_events.append(evt)
-            box_morton_nrs, evt = prune_empty(box_morton_nrs)
+
+            box_parent_ids, evt = prune_empty(box_parent_ids, map_values=dst_box_id)
            prune_events.append(evt)
+
            box_levels, evt = prune_empty(box_levels)
            prune_events.append(evt)
+
            if srcntgts_have_extent:
                box_srcntgt_counts_nonchild, evt = prune_empty(
                        box_srcntgt_counts_nonchild)
                prune_events.append(evt)

-            # Remap level_start_box_nrs to new box IDs.
-            # FIXME: It would be better to do this on the device.
-            level_start_box_nrs = list(
-                    to_box_id.get()
-                    [np.array(level_start_box_nrs[:-1], box_id_dtype)])
-            level_start_box_nrs = level_start_box_nrs + [nboxes_post_prune]
+            box_has_children, evt = prune_empty(box_has_children)
+            prune_events.append(evt)
+
+            box_child_ids, evts = zip(
+                *(prune_empty(ary, map_values=dst_box_id)
+                  for ary in box_child_ids), strict=True)
+            prune_events.extend(evts)
+
+            box_centers, evts = zip(
+                *(prune_empty(ary) for ary in box_centers), strict=True)
+            prune_events.extend(evts)
+
+            # Update box counts and level start box indices.
+            box_levels.finish()
+
+            evt = knl_info.find_level_box_counts_kernel(
+                box_levels, level_used_box_counts_dev)
+            cl.wait_for_events([evt])
+
+            nlevels = len(level_used_box_counts)
+            level_used_box_counts = level_used_box_counts_dev[:nlevels].get()
+
+            level_start_box_nrs = [0]
+            level_start_box_nrs.extend(np.cumsum(level_used_box_counts))
+
+            level_start_box_nrs_dev[:nlevels + 1] = np.array(
+                level_start_box_nrs, dtype=box_id_dtype)
+            prune_events.extend(level_start_box_nrs_dev.events)

            wait_for = prune_events
        else:
@@ -714,26 +1449,25 @@ class TreeBuilder(object):
                    box_target_starts, box_target_counts_cumul,
                    )
                + ((
-                    box_source_counts_nonchild,
-                    box_target_counts_nonchild,
+                    box_source_counts_nonchild,  # pylint: disable=possibly-used-before-assignment
+                    box_target_counts_nonchild,  # pylint: disable=possibly-used-before-assignment
                    ) if srcntgts_have_extent else ())
                ),
                queue=queue, range=slice(nsrcntgts),
                wait_for=wait_for)
            wait_for = [evt]

-            if srcntgts_have_extent:
+            if srcntgts_have_extent:  # noqa: SIM102
                if debug:
                    assert (
                            box_srcntgt_counts_nonchild.get()
-                            ==
-                            (box_source_counts_nonchild
-                            + box_target_counts_nonchild).get()).all()
+                            == (box_source_counts_nonchild
+                                + box_target_counts_nonchild).get()).all()

            if debug:
                usi_host = user_source_ids.get()
                assert (usi_host < nsources).all()
-                assert (0 <= usi_host).all()
+                assert (usi_host >= 0).all()
                del usi_host

                sti_host = srcntgt_target_ids.get()
@@ -811,22 +1545,44 @@ class TreeBuilder(object):
        del srcntgts

        nlevels = len(level_start_box_nrs) - 1
+
+        assert nlevels == len(level_used_box_counts)
        assert level + 1 == nlevels, (level+1, nlevels)
        if debug:
            max_level = np.max(box_levels.get())
-
            assert max_level + 1 == nlevels

-        # {{{ compute box info
+        # {{{ gather box child ids, box centers

        # A number of arrays below are nominally 2-dimensional and stored with
        # the box index as the fastest-moving index. To make sure that accesses
        # remain aligned, we round up the number of boxes used for indexing.
        aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32

-        box_child_ids, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
+        box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
        wait_for.append(evt)
-        box_centers = empty((dimensions, aligned_nboxes), coord_dtype)
+        box_centers_new = empty((dimensions, aligned_nboxes), coord_dtype)
+
+        for mnr, child_row in enumerate(box_child_ids):
+            box_child_ids_new[mnr, :nboxes_post_prune] = \
+                    child_row[:nboxes_post_prune]
+        wait_for.extend(box_child_ids_new.events)
+
+        for dim, center_row in enumerate(box_centers):
+            box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune]
+        wait_for.extend(box_centers_new.events)
+
+        cl.wait_for_events(wait_for)
+
+        box_centers = box_centers_new
+        box_child_ids = box_child_ids_new
+
+        del box_centers_new
+        del box_child_ids_new
+
+        # }}}
+
+        # {{{ compute box flags

        from boxtree.tree import box_flags_enum
        box_flags = empty(nboxes_post_prune, box_flags_enum.dtype)
@@ -838,21 +1594,22 @@ class TreeBuilder(object):
            # the cumulative counts and setting them to zero for non-leaves.

            # {{{ make sure box_{source,target}_counts_nonchild are not defined
+
            # (before we overwrite them)

            try:
-                box_source_counts_nonchild
+                box_source_counts_nonchild  # noqa: B018
            except NameError:
                pass
            else:
-                assert False
+                raise AssertionError

            try:
-                box_target_counts_nonchild
+                box_target_counts_nonchild  # noqa: B018
            except NameError:
                pass
            else:
-                assert False
+                raise AssertionError

            # }}}

@@ -871,24 +1628,106 @@ class TreeBuilder(object):
        evt = knl_info.box_info_kernel(
                *(
                    # input:
-                    box_parent_ids, box_morton_nrs, bbox, aligned_nboxes,
-
-                    box_srcntgt_counts_cumul,
+                    box_parent_ids, box_srcntgt_counts_cumul,
                    box_source_counts_cumul, box_target_counts_cumul,
-                    max_particles_in_box,
-                    box_levels, nlevels,
+                    box_has_children, box_levels, nlevels,

                    # output if srcntgts_have_extent, input+output otherwise
                    box_source_counts_nonchild, box_target_counts_nonchild,

                    # output:
-                    box_child_ids, box_centers, box_flags,
+                    box_flags,
                ),
                range=slice(nboxes_post_prune),
                wait_for=wait_for)

        # }}}

+        del box_has_children
+        wait_for = [evt]
+
+        # {{{ compute box bounding box
+
+        fin_debug("finding box extents")
+
+        box_source_bounding_box_min = cl.array.empty(
+                queue, (dimensions, aligned_nboxes),
+                dtype=coord_dtype)
+        box_source_bounding_box_max = cl.array.empty(
+                queue, (dimensions, aligned_nboxes),
+                dtype=coord_dtype)
+
+        if sources_are_targets:
+            box_target_bounding_box_min = box_source_bounding_box_min
+            box_target_bounding_box_max = box_source_bounding_box_max
+        else:
+            box_target_bounding_box_min = cl.array.empty(
+                    queue, (dimensions, aligned_nboxes),
+                    dtype=coord_dtype)
+            box_target_bounding_box_max = cl.array.empty(
+                    queue, (dimensions, aligned_nboxes),
+                    dtype=coord_dtype)
+
+        bogus_radii_array = cl.array.empty(queue, 1, dtype=coord_dtype)
+
+        # nlevels-1 is the highest valid level index
+        for level in range(nlevels-1, -1, -1):
+            start, stop = level_start_box_nrs[level:level+2]
+
+            for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max,
+                    pstarts, pcounts, particle_radii, particles) in [
+                    (
+                        # never skip
+                        False,
+
+                        sources_have_extent,
+                        box_source_bounding_box_min,
+                        box_source_bounding_box_max,
+                        box_source_starts,
+                        box_source_counts_nonchild,
+                        source_radii if sources_have_extent else bogus_radii_array,
+                        sources),
+                    (
+                        # skip the 'target' round if sources and targets
+                        # are the same.
+                        sources_are_targets,
+
+                        targets_have_extent,
+                        box_target_bounding_box_min,
+                        box_target_bounding_box_max,
+                        box_target_starts,
+                        box_target_counts_nonchild,
+                        target_radii if targets_have_extent else bogus_radii_array,
+                        targets),
+                    ]:
+
+                if skip:
+                    continue
+
+                args = (
+                    aligned_nboxes,
+                    box_child_ids,
+                    box_centers,
+                    pstarts,
+                    pcounts,
+                    *particles,
+                    particle_radii,
+                    enable_radii,
+                    box_bounding_box_min,
+                    box_bounding_box_max)
+
+                evt = knl_info.box_extents_finder_kernel(
+                        *args,
+
+                        range=slice(start, stop),
+                        queue=queue, wait_for=wait_for)
+
+            wait_for = [evt]
+
+        del bogus_radii_array
+
+        # }}}
+
        # {{{ build output

        extra_tree_attrs = {}
@@ -898,12 +1737,15 @@ class TreeBuilder(object):
        if targets_have_extent:
            extra_tree_attrs.update(target_radii=target_radii)

-        logger.info("tree build complete")
+        tree_build_proc.done(
+                "%d levels, %d boxes, %d particles, box extent norm: %s, "
+                "max_leaf_refine_weight: %d",
+                nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm,
+                max_leaf_refine_weight)

        return Tree(
                # If you change this, also change the documentation
                # of what's in the tree, above.
-
                sources_are_targets=sources_are_targets,
                sources_have_extent=sources_have_extent,
                targets_have_extent=targets_have_extent,
@@ -915,12 +1757,11 @@ class TreeBuilder(object):

                root_extent=root_extent,
                stick_out_factor=stick_out_factor,
+                extent_norm=srcntgts_extent_norm,

                bounding_box=(bbox_min, bbox_max),
                level_start_box_nrs=level_start_box_nrs,
-                level_start_box_nrs_dev=cl.array.to_device(
-                    queue, level_start_box_nrs,
-                    allocator=allocator),
+                level_start_box_nrs_dev=level_start_box_nrs_dev,

                sources=sources,
                targets=targets,
@@ -941,7 +1782,12 @@ class TreeBuilder(object):
                user_source_ids=user_source_ids,
                sorted_target_ids=sorted_target_ids,

-                _is_pruned=is_pruned,
+                box_source_bounding_box_min=box_source_bounding_box_min,
+                box_source_bounding_box_max=box_source_bounding_box_max,
+                box_target_bounding_box_min=box_target_bounding_box_min,
+                box_target_bounding_box_max=box_target_bounding_box_max,
+
+                _is_pruned=prune_empty_leaves,

                **extra_tree_attrs
                ).with_queue(None), evt
@@ -950,4 +1796,6 @@ class TreeBuilder(object):

    # }}}

-# vim: foldmethod=marker:filetype=pyopencl
+# }}}
+
+# vim: foldmethod=marker
--- a/boxtree/tree_build_kernels.py
+++ b/boxtree/tree_build_kernels.py
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-
-__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+# __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -25,30 +20,41 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import logging
+from functools import partial

 import numpy as np
-import pyopencl as cl
+from mako.template import Template
+
 from pyopencl.elementwise import ElementwiseTemplate
 from pyopencl.scan import ScanTemplate
-from mako.template import Template
-from pytools import Record, memoize
-from boxtree.tools import get_type_moniker
+from pytools import Record, log_process, memoize
+
+from boxtree.tools import (
+    coord_vec_subscript_code,
+    get_coord_vec_dtype,
+    get_type_moniker,
+)
+

-import logging
 logger = logging.getLogger(__name__)


 # TODO:
 # - Add *restrict where applicable.
+# - Split up the arrays so that there is one array per box level. This avoids
+#   having to reallocate the middle of an array.
+# - Use level-relative box numbering in parent_box_ids, child_box_ids. This
+#   avoids having to renumber these arrays after reallocation.

 # -----------------------------------------------------------------------------
 # CONTROL FLOW
 # ------------
 #
-# Since this file mostly fills in the blanks in the outer parallel 'scan'
+# Since this file mostly fills in the blanks in the tree build
 # implementation, control flow here can be a bit hard to see.
 #
-# - Everything starts and ends in the 'driver' bit at the end.
+# - Everything starts and ends in the driver in tree_build.py
 #
 # - The first thing that happens is that data types get built and
 #   kernels get compiled. Most of the file consists of type and
@@ -57,12 +63,11 @@ logger = logging.getLogger(__name__)
 # - We start with a reduction that determines the bounding box of all
 #   particles.
 #
-# - The level loop is in the driver below, which alternates between
-#   scans and local post processing ("split and sort"), according to
-#   the algorithm described below.
+# - The level loop is in the driver, which alternates between scans and local
+#   post processing, according to the algorithm described below.
 #
 # - Once the level loop finishes, a "box info" kernel is run
-#   that extracts some more information for each box. (center, level, ...)
+#   that extracts flags for each box.
 #
 # - As a last step, empty leaf boxes are eliminated. This is done by a
 #   scan kernel that computes indices, and by an elementwise kernel
@@ -70,24 +75,48 @@ logger = logging.getLogger(__name__)
 #
 # -----------------------------------------------------------------------------
 #
-# HOW DOES THE PRIMARY SCAN WORK?
-# -------------------------------
+# HOW DOES THE LEVEL LOOP WORK?
+# -----------------------------
 #
-# This code sorts particles into an nD-tree of boxes. It does this by doing a
-# (parallel) scan over particles and a (local, i.e. independent for each particle)
-# postprocessing step for each level.
+# This code sorts particles into an nD-tree of boxes.  It does this by doing two
+# successive (parallel) scans and a postprocessing step.
 #
-# The following information is being pushed around by the scan, which
-# proceeds over particles:
+# The following information is being pushed around by the scans, which
+# proceed over particles:
 #
-# - a cumulative count ("counts") of particles in each subbox ("morton_nr") at
-#   the current level, should the current box need to be subdivided.
+# - a cumulative count ("pcnt") and weight ("pwt") of particles in each subbox
+#   ("morton_nr"), should the current box need to be subdivided.
 #
-# - the "split_box_id". The very first entry here gets intialized to
-#   the number of boxes present at the previous level. If a box knows it needs to
-#   be subdivided, its first particle asks for 2**d new boxes. This gets scanned
-#   over by summing globally (unsegmented-ly). The splits are then realized in
-#   the post-processing step.
+# - the "split_box_id". This is an array that, for each box, answers the
+#   question, "After I am subdivided, what is the end of the range of boxes
+#   that my particles get pushed into?" The split_box_id is not meaningful
+#   unless the box is about to be subdivided.
+#
+# Using this data, the stages of the algorithm proceed as follows:
+#
+# 1. Count the number of particles in each subbox. This stage uses a segmented
+#    (per-box) scan to fill "pcnt" and "pwt". This information is kept
+#    per-particle ("morton_bin_counts") and per-box ("box_morton_bin_counts").
+#
+# 2. Using a scan over the boxes, segmented by level, make a decision whether to
+#    refine each box, and compute the split_box_id. This stage also computes the
+#    total number of new boxes needed. If a box knows it needs to be subdivided,
+#    it asks for 2**d new boxes at the next level.
+#
+# 3. Realize the splitting determined in #2. This part consists of splitting the
+#    boxes (done in the "box splitter kernel") and renumbering the particles so
+#    that particles in the same box are numbered contiguously (done in the
+#    "particle renumberer kernel").
+#
+# HOW DOES LEVEL RESTRICTION WORK?
+# --------------------------------
+#
+# This requires some post-processing in the level loop described above: as an
+# additional step, the "level restrict" kernel gets run at the end of the level
+# loop. The job of the level restrict kernel is to mark boxes on higher levels
+# to be split based on looking at the levels of their neighbor boxes. The
+# splitting is then realized by the next iteration of the level loop,
+# simultaneously with the creation of the next level.
 #
 # -----------------------------------------------------------------------------

@@ -98,7 +127,10 @@ class _KernelInfo(Record):

 # {{{ data types

-@memoize
+refine_weight_dtype = np.dtype(np.int32)
+
+
+@memoize(use_kwargs=True)
 def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
        srcntgts_have_extent):
    fields = []
@@ -109,7 +141,10 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,

    from boxtree.tools import padded_bin
    for mnr in range(2**dimensions):
-        fields.append(("pcnt%s" % padded_bin(mnr, dimensions), particle_id_dtype))
+        fields.append((f"pcnt{padded_bin(mnr, dimensions)}", particle_id_dtype))
+    # Morton bin weight totals
+    for mnr in range(2**dimensions):
+        fields.append((f"pwt{padded_bin(mnr, dimensions)}", refine_weight_dtype))

    dtype = np.dtype(fields)

@@ -117,10 +152,10 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
    if srcntgts_have_extent:
        name_suffix = "_ext"

-    name = "boxtree_morton_bin_count_%dd_p%s%s_t" % (
-            dimensions,
-            get_type_moniker(particle_id_dtype),
-            name_suffix)
+    type_moniker = get_type_moniker(particle_id_dtype)
+    name = (
+        f"boxtree_morton_bin_count_{dimensions}d_p{type_moniker}{name_suffix}_t"
+    )

    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
@@ -130,16 +165,19 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,

 # }}}

+
 # {{{ preamble

 TYPE_DECL_PREAMBLE_TPL = Template(r"""//CL//
    typedef ${dtype_to_ctype(morton_bin_count_dtype)} morton_counts_t;
    typedef morton_counts_t scan_t;
+    typedef ${dtype_to_ctype(refine_weight_dtype)} refine_weight_t;
    typedef ${dtype_to_ctype(bbox_dtype)} bbox_t;
    typedef ${dtype_to_ctype(coord_dtype)} coord_t;
    typedef ${dtype_to_ctype(coord_vec_dtype)} coord_vec_t;
    typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
    typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
+    typedef ${dtype_to_ctype(box_level_dtype)} box_level_t;

    // morton_nr == -1 is defined to mean that the srcntgt is
    // remaining at the present level and will not be sorted
@@ -148,7 +186,6 @@ TYPE_DECL_PREAMBLE_TPL = Template(r"""//CL//
    """, strict_undefined=True)

 GENERIC_PREAMBLE_TPL = Template(r"""//CL//
-    #define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor})

    // Use this as dbg_printf(("oh snap: %d\n", stuff)); Note the double
    // parentheses.
@@ -180,26 +217,37 @@ GENERIC_PREAMBLE_TPL = Template(r"""//CL//

 # BEGIN KERNELS IN THE LEVEL LOOP

-# {{{ scan primitive code template
+# {{{ morton scan

-SCAN_PREAMBLE_TPL = Template(r"""//CL//
+MORTON_NR_SCAN_PREAMBLE_TPL = Template(r"""//CL//

    // {{{ neutral element

    scan_t scan_t_neutral()
    {
        scan_t result;
+
        %if srcntgts_have_extent:
            result.nonchild_srcntgts = 0;
        %endif
+
        %for mnr in range(2**dimensions):
            result.pcnt${padded_bin(mnr, dimensions)} = 0;
        %endfor
+        %for mnr in range(2**dimensions):
+            result.pwt${padded_bin(mnr, dimensions)} = 0;
+        %endfor
        return result;
    }

    // }}}

+    inline int my_add_sat(int a, int b)
+    {
+        long result = (long) a + b;
+        return (result > INT_MAX) ? INT_MAX : result;
+    }
+
    // {{{ scan 'add' operation
    scan_t scan_t_add(scan_t a, scan_t b, bool across_seg_boundary)
    {
@@ -213,6 +261,16 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
                <% field = "pcnt"+padded_bin(mnr, dimensions) %>
                b.${field} = a.${field} + b.${field};
            %endfor
+            %for mnr in range(2**dimensions):
+                <% field = "pwt"+padded_bin(mnr, dimensions) %>
+                // XXX: The use of add_sat() seems to be causing trouble
+                // with multiple compilers. For d=3:
+                // 1. POCL will miscompile and either give wrong
+                //    results or crash.
+                // 2. Intel will use a large amount of memory.
+                // Versions tested: POCL 0.13, Intel OpenCL 16.1
+                b.${field} = my_add_sat(a.${field}, b.${field});
+            %endfor
        }

        return b;
@@ -224,41 +282,53 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//

    scan_t scan_t_from_particle(
        const int i,
-        const int level,
+        const int particle_level,
        bbox_t const *bbox,
        global morton_nr_t *morton_nrs, // output/side effect
-        global particle_id_t *user_srcntgt_ids
+        global particle_id_t *user_srcntgt_ids,
+        global refine_weight_t *refine_weights
        %for ax in axis_names:
            , global const coord_t *${ax}
        %endfor
        %if srcntgts_have_extent:
            , global const coord_t *srcntgt_radii
+            , const coord_t stick_out_factor
        %endif
    )
    {
        particle_id_t user_srcntgt_id = user_srcntgt_ids[i];

-        // Recall that 'level' is the level currently being built, e.g. 1 at
-        // the root.  This should be 0.5 at level 1. (Level 0 is the root.)
+        // The next level is 1 + the current level of the particle.
+        // This should be 0.5 when next level = 1. (Level 0 is the root.)
        coord_t next_level_box_size_factor =
-            ((coord_t) 1) / ((coord_t) (1U << level));
+            ((coord_t) 1) / ((coord_t) (1U << (1 + particle_level)));

        %if srcntgts_have_extent:
            bool stop_srcntgt_descent = false;
            coord_t srcntgt_radius = srcntgt_radii[user_srcntgt_id];
        %endif

+        %if not srcntgts_have_extent:
+            // This argument is only supplied with srcntgts_have_extent.
+            #define stick_out_factor 0.
+        %endif
+
        const coord_t one_half = ((coord_t) 1) / 2;
        const coord_t box_radius_factor =
            // AMD CPU seems to like to miscompile this--change with care.
            // (last seen on 13.4-2)
-            (1. + STICK_OUT_FACTOR)
+            (1. + stick_out_factor)
            * one_half; // convert diameter to radius

+        %if not srcntgts_have_extent:
+            #undef stick_out_factor
+        %endif
+
        %for ax in axis_names:
            // Most FMMs are isotropic, i.e. global_extent_{x,y,z} are all the same.
            // Nonetheless, the gain from exploiting this assumption seems so
-            // minimal that doing so here didn't seem worthwhile.
+            // minimal that doing so here didn't seem worthwhile in the
+            // srcntgts_extent_norm == "linf" case.

            coord_t global_min_${ax} = bbox->min_${ax};
            coord_t global_extent_${ax} = bbox->max_${ax} - global_min_${ax};
@@ -270,28 +340,34 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
            // level, and it isn't either by the fact that boxes are
            // [)-half-open in subsequent levels.

-            // So (1 << level) is 2 when building level 1.  Because the
-            // floating point factor is strictly less than 1, 2 is never
-            // reached, so when building level 1, the result is either 0 or 1.
+            // So (1 << (1 + particle_level)) is 2 when building level 1.
+            // Because the floating point factor is strictly less than 1, 2 is
+            // never reached, so when building level 1, the result is either
+            // 0 or 1.
            // After that, we just add one (less significant) bit per level.

            unsigned ${ax}_bits = (unsigned) (
                ((srcntgt_${ax} - global_min_${ax}) / global_extent_${ax})
-                * (1U << level));
+                * (1U << (1 + particle_level)));

-            %if srcntgts_have_extent:
-                // Need to compute center to compare excess with STICK_OUT_FACTOR.
-                coord_t next_level_box_center_${ax} =
-                    global_min_${ax}
-                    + global_extent_${ax}
-                    * (${ax}_bits + one_half)
-                    * next_level_box_size_factor;
+            // Need to compute center to compare excess with stick_out_factor.
+            // Unused if no stickout, relying on compiler to eliminate this.
+            const coord_t next_level_box_center_${ax} =
+                global_min_${ax}
+                + global_extent_${ax}
+                * (${ax}_bits + one_half)
+                * next_level_box_size_factor;
+
+        %endfor

-                coord_t next_level_box_stick_out_radius_${ax} =
+        %if srcntgts_extent_norm == "linf":
+            %for ax in axis_names:
+                const coord_t next_level_box_stick_out_radius_${ax} =
                    box_radius_factor
                    * global_extent_${ax}
                    * next_level_box_size_factor;

+                // stop descent here if particle sticks out of next-level box
                stop_srcntgt_descent = stop_srcntgt_descent ||
                    (srcntgt_${ax} + srcntgt_radius >=
                        next_level_box_center_${ax}
@@ -300,8 +376,41 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
                    (srcntgt_${ax} - srcntgt_radius <
                        next_level_box_center_${ax}
                        - next_level_box_stick_out_radius_${ax});
-            %endif
-        %endfor
+            %endfor
+
+        %elif srcntgts_extent_norm == "l2":
+
+            coord_t next_level_box_stick_out_radius =
+                box_radius_factor
+                * global_extent_x  /* assume isotropy */
+                * next_level_box_size_factor;
+
+            coord_t next_level_box_center_to_srcntgt_bdry_l2_dist =
+                sqrt(
+                %for ax in axis_names:
+                    +   (srcntgt_${ax} - next_level_box_center_${ax})
+                      * (srcntgt_${ax} - next_level_box_center_${ax})
+                %endfor
+                ) + srcntgt_radius;
+
+            // stop descent here if particle sticks out of next-level box
+            stop_srcntgt_descent = stop_srcntgt_descent ||
+                (
+                next_level_box_center_to_srcntgt_bdry_l2_dist
+                * next_level_box_center_to_srcntgt_bdry_l2_dist
+                    >= ${dimensions}
+                        * next_level_box_stick_out_radius
+                        * next_level_box_stick_out_radius);
+
+        %elif srcntgts_extent_norm is None:
+            // nothing to do
+
+        %else:
+            <%
+                raise ValueError("unexpected value of 'srcntgts_extent_norm': %s"
+                    % srcntgts_extent_norm)
+            %>
+        %endif

        // Pick off the lowest-order bit for each axis, put it in its place.
        int level_morton_number = 0
@@ -325,6 +434,11 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
            <% field = "pcnt"+padded_bin(mnr, dimensions) %>
            result.${field} = (level_morton_number == ${mnr});
        %endfor
+        %for mnr in range(2**dimensions):
+            <% field = "pwt"+padded_bin(mnr, dimensions) %>
+            result.${field} = (level_morton_number == ${mnr}) ?
+                    refine_weights[user_srcntgt_id] : 0;
+        %endfor
        morton_nrs[i] = level_morton_number;

        return result;
@@ -336,9 +450,9 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//

 # }}}

-# {{{ scan output code template
+# {{{ morton scan output

-SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
+MORTON_NR_SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
    {
        particle_id_t my_id_in_my_box = -1
        %if srcntgts_have_extent:
@@ -352,6 +466,7 @@ SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
        morton_bin_counts[i] = item;

        box_id_t current_box_id = srcntgt_box_ids[i];
+
        particle_id_t box_srcntgt_count = box_srcntgt_counts_cumul[current_box_id];

        // Am I the last particle in my current box?
@@ -372,40 +487,51 @@ SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
 # {{{ split box id scan

 SPLIT_BOX_ID_SCAN_TPL = ScanTemplate(
+    name_prefix="split_box_id_scan",
    arguments=r"""//CL:mako//
        /* input */
        box_id_t *srcntgt_box_ids,
-        particle_id_t *box_srcntgt_starts,
        particle_id_t *box_srcntgt_counts_cumul,
-        particle_id_t max_particles_in_box,
        morton_counts_t *box_morton_bin_counts,
+        refine_weight_t *refine_weights,
+        refine_weight_t max_leaf_refine_weight,
        box_level_t *box_levels,
-        box_level_t level,
-
-        /* input/output */
-        box_id_t *nboxes,
+        box_id_t *level_start_box_ids,
+        box_id_t *level_used_box_counts,
+        int *box_force_split,
+        box_level_t last_level,

        /* output */
+        int *box_has_children,
        box_id_t *split_box_ids,
+        int *have_oversize_split_box,
        """,
    preamble=r"""//CL:mako//
        scan_t count_new_boxes_needed(
-            particle_id_t i,
            box_id_t box_id,
-            __global box_id_t *nboxes,
-            __global particle_id_t *box_srcntgt_starts,
+            box_level_t level,
+            box_level_t last_level,
+            refine_weight_t max_leaf_refine_weight,
            __global particle_id_t *box_srcntgt_counts_cumul,
            __global morton_counts_t *box_morton_bin_counts,
-            particle_id_t max_particles_in_box,
-            __global box_level_t *box_levels,
-            box_level_t level
+            __global box_id_t *level_start_box_ids,
+            __global box_id_t *level_used_box_counts,
+            %if level_restrict:
+                __global int *box_force_split,
+            %endif
+            __global int *have_oversize_split_box, // output/side effect
+            __global int *box_has_children // output/side effect
            )
        {
            scan_t result = 0;

-            // First particle? Start counting at (the previous level's) nboxes.
-            if (i == 0)
-                result += *nboxes;
+            // First box at my level? Start counting at the number of boxes
+            // used at the child level.
+            if (box_id == level_start_box_ids[level])
+            {
+                result += level_start_box_ids[level + 1];
+                result += level_used_box_counts[level + 1];
+            }

            %if srcntgts_have_extent:
                const particle_id_t nonchild_srcntgts_in_box =
@@ -414,63 +540,156 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate(
                const particle_id_t nonchild_srcntgts_in_box = 0;
            %endif

-            particle_id_t first_particle_in_my_box =
-                box_srcntgt_starts[box_id];
+            // Get box refine weight.
+            refine_weight_t box_refine_weight = 0;
+            %for mnr in range(2**dimensions):
+                box_refine_weight = add_sat(box_refine_weight,
+                    box_morton_bin_counts[box_id].pwt${padded_bin(mnr, dimensions)});
+            %endfor

            // Add 2**d to make enough room for a split of the current box
-            // This will be the split_box_id for *all* particles in this box,
-            // including non-child srcntgts.
-
-            if (i == first_particle_in_my_box
-                %if srcntgts_have_extent:
-                    // Only last-level boxes get to produce new boxes.
-                    // If srcntgts have extent, then prior-level boxes
-                    // will keep asking for more boxes to be allocated.
-                    // Prevent that.
-
-                    &&
-                    box_levels[box_id] + 1 == level
-                %endif
+
+            if ((
+                level + 1 == last_level
                &&
                %if adaptive:
                    /* box overfull? */
-                    box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box
-                        > max_particles_in_box
+                    box_refine_weight
+                        > max_leaf_refine_weight
                %else:
                    /* box non-empty? */
+                    /* Note: Refine weights are allowed to be 0,
+                       so check # of particles directly. */
                    box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box
-                        > 0
+                        >= 0
+                %endif
+                )
+                %if level_restrict:
+                    || box_force_split[box_id]
                %endif
                )
            {
                result += ${2**dimensions};
+                box_has_children[box_id] = 1;
+
+                // Check if the box is oversized. This drives the level loop.
+                refine_weight_t max_subbox_refine_weight = 0;
+                %for mnr in range(2**dimensions):
+                    max_subbox_refine_weight = max(max_subbox_refine_weight,
+                        box_morton_bin_counts[box_id]
+                        .pwt${padded_bin(mnr, dimensions)});
+                %endfor
+                if (max_subbox_refine_weight > max_leaf_refine_weight)
+                {
+                    *have_oversize_split_box = 1;
+                }
            }

            return result;
        }
        """,
-    input_expr="""count_new_boxes_needed(
-            i, srcntgt_box_ids[i], nboxes,
-            box_srcntgt_starts, box_srcntgt_counts_cumul, box_morton_bin_counts,
-            max_particles_in_box, box_levels, level
+    input_expr=r"""//CL:mako//
+            count_new_boxes_needed(
+                i,
+                box_levels[i],
+                last_level,
+                max_leaf_refine_weight,
+                box_srcntgt_counts_cumul,
+                box_morton_bin_counts,
+                level_start_box_ids,
+                level_used_box_counts,
+                %if level_restrict:
+                    box_force_split,
+                %endif
+                have_oversize_split_box,
+                box_has_children
            )""",
-    scan_expr="a + b",
+    scan_expr="across_seg_boundary ? b : a + b",
    neutral="0",
-    output_statement="""//CL//
+    is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i-1]",
+    output_statement=r"""//CL//
        dbg_assert(item >= 0);

        split_box_ids[i] = item;

-        // Am I the last particle overall? If so, write box count
-        if (i+1 == N)
-            *nboxes = item;
        """)

 # }}}

-# {{{ split-and-sort kernel
+# {{{ box splitter kernel
+
+BOX_SPLITTER_KERNEL_TPL = Template(r"""//CL//
+    box_id_t ibox = i;
+
+    bool do_split_box =
+       (box_has_children[ibox] && box_levels[ibox] + 1 == level)
+       %if level_restrict:
+           || box_force_split[ibox]
+       %endif
+       ;
+
+    if (!do_split_box)
+    {
+        PYOPENCL_ELWISE_CONTINUE;
+    }
+
+    // {{{ Set up child box data structure.
+
+    morton_counts_t box_morton_bin_count = box_morton_bin_counts[ibox];
+
+    %for mnr in range(2**dimensions):
+    {
+        box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + ${mnr};
+
+        // Parent / child / level info
+        box_parent_ids[new_box_id] = ibox;
+        box_child_ids_mnr_${mnr}[ibox] = new_box_id;
+        box_level_t new_level = box_levels[ibox] + 1;
+        box_levels[new_box_id] = new_level;
+
+        // Box particle counts / starting particle number
+        particle_id_t new_count =
+            box_morton_bin_count.pcnt${padded_bin(mnr, dimensions)};
+        box_srcntgt_counts_cumul[new_box_id] = new_count;
+
+        // Only set the starting particle number / start flags if
+        // the new box has particles to begin with.
+        if (new_count > 0)
+        {
+            particle_id_t new_box_start = box_srcntgt_starts[ibox]
+            %if srcntgts_have_extent:
+                + box_morton_bin_count.nonchild_srcntgts
+            %endif
+            %for sub_mnr in range(mnr):
+                + box_morton_bin_count.pcnt${padded_bin(sub_mnr, dimensions)}
+            %endfor
+            ;
+
+            box_start_flags[new_box_start] = 1;
+            box_srcntgt_starts[new_box_id] = new_box_start;
+        }
+
+        // Compute box center.
+        coord_t radius = (root_extent * 1 / (coord_t) (1 << (1 + new_level)));
+
+        %for idim, ax in enumerate(axis_names):
+        {
+            <% has_bit = mnr & 2**(dimensions-1-idim) %>
+            box_centers_${ax}[new_box_id] = box_centers_${ax}[ibox]
+                ${"+" if has_bit else "-"} radius;
+        }
+        %endfor
+    }
+    %endfor
+
+    // }}}
+""", strict_undefined=True)
+
+# }}}
+
+# {{{ post-split particle renumbering

-SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL//
+PARTICLE_RENUMBERER_PREAMBLE_TPL = Template(r"""//CL//
    <%
      def get_count_for_branch(known_bits):
          if len(known_bits) == dimensions:
@@ -497,162 +716,244 @@ SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL//
 """, strict_undefined=True)


-SPLIT_AND_SORT_KERNEL_TPL = Template(r"""//CL//
+PARTICLE_RENUMBERER_KERNEL_TPL = Template(r"""//CL//
    box_id_t ibox = srcntgt_box_ids[i];
    dbg_assert(ibox >= 0);
-    dbg_assert(ibox < nboxes);

    dbg_printf(("postproc %d:\n", i));
    dbg_printf(("   my box id: %d\n", ibox));

-    particle_id_t box_srcntgt_count = box_srcntgt_counts_cumul[ibox];
+    bool do_split_box = (box_has_children[ibox] && box_levels[ibox] + 1 == level)
+       %if level_restrict:
+           || box_force_split[ibox]
+       %endif
+       ;

-    %if srcntgts_have_extent:
-        const particle_id_t nonchild_srcntgt_count =
-            box_morton_bin_counts[ibox].nonchild_srcntgts;
+    if (!do_split_box)
+    {
+        // Not splitting? Copy over existing particle info.
+        new_user_srcntgt_ids[i] = user_srcntgt_ids[i];
+        new_srcntgt_box_ids[i] = ibox;

-    %else:
-        const particle_id_t nonchild_srcntgt_count = 0;
-    %endif
+        PYOPENCL_ELWISE_CONTINUE;
+    }

-    %if adaptive:
-        bool do_split_box =
-            box_srcntgt_count - nonchild_srcntgt_count
-            > max_particles_in_box;
-    %else:
-        bool do_split_box =
-            box_srcntgt_count - nonchild_srcntgt_count
-            > 0;
-    %endif
+    morton_nr_t my_morton_nr = morton_nrs[i];
+    // printf("   my morton nr: %d\n", my_morton_nr);
+
+    morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox];
+
+    morton_counts_t my_morton_bin_counts = morton_bin_counts[i];
+    particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr);

+    // {{{ compute this srcntgt's new index
+
+    particle_id_t my_box_start = box_srcntgt_starts[ibox];
+    particle_id_t tgt_particle_idx = my_box_start + my_count-1;
    %if srcntgts_have_extent:
-        ## Only do split-box processing for srcntgts that were touched
-        ## on the immediately preceding level.
-        ##
-        ## If srcntgts have no extent, then subsequent levels
-        ## will never decide to split boxes that were kept unsplit on prior
-        ## levels either. If srcntgts do
-        ## have an extent, this could happen. Prevent running the
-        ## split code for such particles.
-
-        int box_level = box_levels[ibox];
-        do_split_box = do_split_box && box_level + 1 == level;
+        tgt_particle_idx +=
+            (my_morton_nr >= 0)
+                ? my_box_morton_bin_counts.nonchild_srcntgts
+                : 0;
    %endif
+    %for mnr in range(2**dimensions):
+        <% bin_nmr = padded_bin(mnr, dimensions) %>
+        tgt_particle_idx +=
+            (my_morton_nr > ${mnr})
+                ? my_box_morton_bin_counts.pcnt${bin_nmr}
+                : 0;
+    %endfor
+
+    dbg_assert(tgt_particle_idx < n);
+    dbg_printf(("   moving %ld -> %d "
+        "(ibox %d, my_box_start %d, my_count %d)\n",
+        i, tgt_particle_idx,
+        ibox, my_box_start, my_count));
+
+    new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i];

-    if (do_split_box)
-    {
-        morton_nr_t my_morton_nr = morton_nrs[i];
-        dbg_printf(("   my morton nr: %d\n", my_morton_nr));
+    // }}}

-        morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox];
+    // {{{ compute this srcntgt's new box id

-        morton_counts_t my_morton_bin_counts = morton_bin_counts[i];
-        particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr);
+    box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + my_morton_nr;

-        // {{{ compute this srcntgt's new index
+    %if srcntgts_have_extent:
+        if (my_morton_nr == -1)
+        {
+            new_box_id = ibox;
+        }
+    %endif

-        particle_id_t my_box_start = box_srcntgt_starts[ibox];
-        particle_id_t tgt_particle_idx = my_box_start + my_count-1;
-        %if srcntgts_have_extent:
-            tgt_particle_idx +=
-                (my_morton_nr >= 0)
-                    ? my_box_morton_bin_counts.nonchild_srcntgts
-                    : 0;
-        %endif
-        %for mnr in range(2**dimensions):
-            <% bin_nmr = padded_bin(mnr, dimensions) %>
-            tgt_particle_idx +=
-                (my_morton_nr > ${mnr})
-                    ? my_box_morton_bin_counts.pcnt${bin_nmr}
-                    : 0;
-        %endfor
+    dbg_printf(("   new_box_id: %d\n", new_box_id));
+    dbg_assert(new_box_id >= 0);

-        dbg_assert(tgt_particle_idx < n);
-        dbg_printf(("   moving %ld -> %d "
-            "(ibox %d, my_box_start %d, my_count %d)\n",
-            i, tgt_particle_idx,
-            ibox, my_box_start, my_count));
+    new_srcntgt_box_ids[tgt_particle_idx] = new_box_id;

-        new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i];
+    // }}}
+""", strict_undefined=True)

-        // }}}
+# }}}

-        // {{{ compute this srcntgt's new box id
+# {{{ level restrict kernel

-        box_id_t new_box_id = split_box_ids[i] - ${2**dimensions} + my_morton_nr;
+from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS

-        %if srcntgts_have_extent:
-            if (my_morton_nr == -1)
-                new_box_id = ibox;
-        %endif

-        dbg_printf(("   new_box_id: %d\n", new_box_id));
-        dbg_assert(new_box_id >= 0);
+LEVEL_RESTRICT_TPL = Template(
+    TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
+    <%def name="my_load_center(name, box_id)">
+        ## This differs from load_center() because in this kernel box centers
+        ## live in one array per axis.
+        coord_vec_t ${name};
+        %for i in range(dimensions):
+            ${name}.${AXIS_NAMES[i]} = box_centers_${AXIS_NAMES[i]}[${box_id}];
+        %endfor
+    </%def>

-        new_srcntgt_box_ids[tgt_particle_idx] = new_box_id;
+    #define NLEVELS (${max_levels})

-        // }}}
+    box_id_t box_id = i;

-        // {{{ set up child box data structure
+    // Skip unless this box is a leaf.
+    if (box_has_children[box_id])
+    {
+        PYOPENCL_ELWISE_CONTINUE;
+    }

-        %for mnr in range(2**dimensions):
-          /* Am I the last particle in my Morton bin? */
-            %if mnr > 0:
-                else
-            %endif
-            if (${mnr} == my_morton_nr
-                && my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)}
-                    == my_count)
-            {
-                dbg_printf(("   ## splitting\n"));
+    ${walk_init(0)}

-                particle_id_t new_box_start = my_box_start
-                %if srcntgts_have_extent:
-                    + my_box_morton_bin_counts.nonchild_srcntgts
-                %endif
-                %for sub_mnr in range(mnr):
-                    + my_box_morton_bin_counts.pcnt${padded_bin(sub_mnr, dimensions)}
-                %endfor
-                    ;
+    // Descend the tree searching for neighboring leaves.
+    while (continue_walk)
+    {
+        box_id_t child_box_id;
+        // Look for the child in the appropriate array.
+    %for morton_nr in range(2**dimensions):
+        if (walk_morton_nr == ${morton_nr})
+        {
+            child_box_id = box_child_ids_mnr_${morton_nr}[walk_parent_box_id];
+        }
+    %endfor

-                dbg_printf(("   new_box_start: %d\n", new_box_start));
+        if (child_box_id)
+        {
+            int child_level = walk_stack_size + 1;

-                box_start_flags[new_box_start] = 1;
-                box_srcntgt_starts[new_box_id] = new_box_start;
-                box_parent_ids[new_box_id] = ibox;
-                box_morton_nrs[new_box_id] = my_morton_nr;
+            // Check adjacency.
+            bool is_adjacent;

-                particle_id_t new_count =
-                    my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)};
-                box_srcntgt_counts_cumul[new_box_id] = new_count;
-                box_levels[new_box_id] = level;
+            if (child_box_id == box_id)
+            {
+                // Skip considering self.
+                is_adjacent = false;
+            }
+            else
+            {
+                ${my_load_center("box_center", "box_id")}
+                ${my_load_center("child_center", "child_box_id")}
+                is_adjacent = is_adjacent_or_overlapping(
+                    root_extent, child_center, child_level, box_center, level);
+            }

-                // For a non-adaptive run, max_particles_in_box drives the
-                // level loop.
-                if (new_count > max_particles_in_box)
+            if (is_adjacent)
+            {
+                // Invariant: When new leaves get added,
+                // they are never more than 2 levels deeper than
+                // all their adjacent leaves.
+                //
+                // Hence we only need to look at boxes up to
+                // (level + 2) deep.
+
+                if (box_has_children[child_box_id])
                {
-                    *have_oversize_split_box = 1;
+                    if (child_level <= 1 + level)
+                    {
+                        ${walk_push("child_box_id")}
+                        continue;
+                    }
+                }
+                else
+                {
+                    // We are looking at a neighboring leaf box.
+                    // Check if my box must be split to enforce level
+                    // restriction.
+                    if (child_level == 2 + level || (
+                        child_level == 1 + level &&
+                        box_force_split[child_box_id]))
+                    {
+                        box_force_split[box_id] = 1;
+                        atomic_or(have_upper_level_split_box, 1);
+                        continue_walk = false;
+                    }
                }
-
-                dbg_printf(("   box pcount: %d\n",
-                    box_srcntgt_counts_cumul[new_box_id]));
            }
-        %endfor
-
-        // }}}
-    }
-    else
-    {
-        // Not splitting? Copy over existing particle info.
-        new_user_srcntgt_ids[i] = user_srcntgt_ids[i];
-        new_srcntgt_box_ids[i] = ibox;
+        }
+        ${walk_advance()}
    }
 """, strict_undefined=True)

+
+def build_level_restrict_kernel(context, preamble_with_dtype_decls,
+            dimensions, axis_names, box_id_dtype, coord_dtype,
+            box_level_dtype, max_levels):
+    from boxtree.tools import ScalarArg, VectorArg
+
+    arguments = (
+        [
+            # input
+            ScalarArg(box_level_dtype, "level"),  # [1]
+            ScalarArg(coord_dtype, "root_extent"),  # [1]
+            VectorArg(np.int32, "box_has_children"),  # [nboxes]
+
+            # input/output
+            VectorArg(np.int32, "box_force_split"),  # [nboxes]
+
+            # output
+            VectorArg(np.int32, "have_upper_level_split_box"),  # [1]
+        ]
+        # input, length depends on dim
+        + [VectorArg(box_id_dtype, f"box_child_ids_mnr_{mnr}")
+             for mnr in range(2**dimensions)]  # [nboxes]
+        + [VectorArg(coord_dtype, f"box_centers_{ax}")
+             for ax in axis_names]  # [nboxes]
+        )
+
+    render_vars = {
+        "AXIS_NAMES": axis_names,
+        "dimensions": dimensions,
+        "max_levels": max_levels,
+        # Entries below are needed by HELPER_FUNCTION_TEMPLATE
+        # and/or TRAVERSAL_PREAMBLE_MAKO_DEFS:
+        "debug": False,
+        "targets_have_extent": False,
+        "sources_have_extent": False,
+        "get_coord_vec_dtype": get_coord_vec_dtype,
+        "cvec_sub": partial(coord_vec_subscript_code, dimensions),
+        }
+
+    from pyopencl.elementwise import ElementwiseKernel
+
+    from boxtree.traversal import HELPER_FUNCTION_TEMPLATE
+
+    return ElementwiseKernel(
+            context,
+            arguments=arguments,
+            operation=LEVEL_RESTRICT_TPL.render(**render_vars),
+            name="level_restrict",
+            preamble=(
+                str(preamble_with_dtype_decls)
+                + Template(r"""
+                    #define LEVEL_TO_RAD(level) \
+                        (root_extent * 1 / (coord_t) (1 << (level + 1)))
+                    """
+                    + HELPER_FUNCTION_TEMPLATE)
+                .render(**render_vars)))
+
 # }}}

 # END KERNELS IN THE LEVEL LOOP

+
 # {{{ nonchild srcntgt count extraction

 EXTRACT_NONCHILD_SRCNTGT_COUNT_TPL = ElementwiseTemplate(
@@ -784,9 +1085,6 @@ SOURCE_AND_TARGET_INDEX_FINDER = ElementwiseTemplate(
                    target_nr + 1 - (particle_id_t) is_source
                    - box_start_target_nr;
            }
-        %elif srcntgts_have_extent:
-            box_source_counts_nonchild[box_id] = 0;
-            box_target_counts_nonchild[box_id] = 0;
        %endif

        // {{{ last particle for this or the parents' boxes? update counts
@@ -869,20 +1167,16 @@ SRCNTGT_PERMUTER_TPL = ElementwiseTemplate(

 # }}}

-
 # {{{ box info kernel

 BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
    arguments="""//CL:mako//
        /* input */
        box_id_t *box_parent_ids,
-        morton_nr_t *box_morton_nrs,
-        bbox_t bbox,
-        box_id_t aligned_nboxes,
        particle_id_t *box_srcntgt_counts_cumul,
        particle_id_t *box_source_counts_cumul,
        particle_id_t *box_target_counts_cumul,
-        particle_id_t max_particles_in_box,
+        int *box_has_children,
        box_level_t *box_levels,
        box_level_t nlevels,

@@ -891,8 +1185,6 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
        particle_id_t *box_target_counts_nonchild,

        /* output */
-        box_id_t *box_child_ids, /* [2**dimensions, aligned_nboxes] */
-        coord_t *box_centers, /* [dimensions, aligned_nboxes] */
        box_flags_t *box_flags, /* [nboxes] */
        """,
    operation=r"""//CL:mako//
@@ -905,10 +1197,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
         *
         * box_srcntgt_counts_cumul is zero (here) exactly for empty leaves
         * because it gets initialized to zero and never gets set to another
-         * value. If you check above, most box info is only ever initialized
-         * *if* there's a particle in the box, because the sort/build is a
-         * repeated scan over *particles* (not boxes). Thus, no particle -> no
-         * work done.
+         * value.
         */

        particle_id_t particle_count = box_srcntgt_counts_cumul[box_id];
@@ -940,41 +1229,16 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(

        dbg_assert(particle_count >= nonchild_srcntgt_count);

-        if (particle_count == 0)
-        {
-            // Empty leaf: Lots of stuff uninitialized, prevent
-            // damage by quitting now.
-
-            // Also, those should have gotten pruned by this point,
-            // unless skip_prune is True.
-
-            box_flags[box_id] = 0; // no children, no sources, no targets, bye.
-
-            PYOPENCL_ELWISE_CONTINUE;
-        }
-        else if (
-            %if adaptive:
-                particle_count - nonchild_srcntgt_count > max_particles_in_box
-            %else:
-                particle_count - nonchild_srcntgt_count > 0
-            %endif
-            && box_levels[box_id] + 1 < nlevels)
+        if (box_has_children[box_id])
        {
            // This box has children, it is not a leaf.

-            // That second condition there covers a weird corner case.  It's
-            // obviously true--a last-level box won't have children.  But why
-            // is it necessary? It turns out that nonchild_srcntgt_count is not
-            // available (i.e. zero) for boxes on the last level. So these boxes
-            // look like they got split if they have enough non-child srcntgts,
-            // to the first part of the 'if' condition. But in fact they weren't,
-            // because of their non-child srcntgts.
-
-            my_box_flags |= BOX_HAS_CHILDREN;
+            my_box_flags |= BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES;

            %if sources_are_targets:
                if (particle_count - nonchild_srcntgt_count)
-                    my_box_flags |= BOX_HAS_CHILD_SOURCES | BOX_HAS_CHILD_TARGETS;
+                    my_box_flags |=
+                        BOX_HAS_SOURCE_CHILD_BOXES | BOX_HAS_TARGET_CHILD_BOXES;
            %else:
                particle_id_t source_count = box_source_counts_cumul[box_id];
                particle_id_t target_count = box_target_counts_cumul[box_id];
@@ -983,15 +1247,15 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
                dbg_assert(target_count >= nonchild_target_count);

                if (source_count - nonchild_source_count)
-                    my_box_flags |= BOX_HAS_CHILD_SOURCES;
+                    my_box_flags |= BOX_HAS_SOURCE_CHILD_BOXES;
                if (target_count - nonchild_target_count)
-                    my_box_flags |= BOX_HAS_CHILD_TARGETS;
+                    my_box_flags |= BOX_HAS_TARGET_CHILD_BOXES;
            %endif

            if (nonchild_source_count)
-                my_box_flags |= BOX_HAS_OWN_SOURCES;
+                my_box_flags |= BOX_IS_SOURCE_BOX;
            if (nonchild_target_count)
-                my_box_flags |= BOX_HAS_OWN_TARGETS;
+                my_box_flags |= BOX_IS_TARGET_BOX;
        }
        else
        {
@@ -999,7 +1263,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(

            %if sources_are_targets:
                if (particle_count)
-                    my_box_flags |= BOX_HAS_OWN_SOURCES | BOX_HAS_OWN_TARGETS;
+                    my_box_flags |= BOX_IS_SOURCE_BOX | BOX_IS_TARGET_BOX;

                box_source_counts_nonchild[box_id] = particle_count;
                dbg_assert(box_source_counts_nonchild == box_target_counts_nonchild);
@@ -1008,9 +1272,9 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
                particle_id_t my_target_count = particle_count - my_source_count;

                if (my_source_count)
-                    my_box_flags |= BOX_HAS_OWN_SOURCES;
+                    my_box_flags |= BOX_IS_SOURCE_BOX;
                if (my_target_count)
-                    my_box_flags |= BOX_HAS_OWN_TARGETS;
+                    my_box_flags |= BOX_IS_TARGET_BOX;

                box_source_counts_nonchild[box_id] = my_source_count;
                box_target_counts_nonchild[box_id] = my_target_count;
@@ -1018,57 +1282,118 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
        }

        box_flags[box_id] = my_box_flags;
+    """)
+
+# }}}
+
+# {{{ box extents
+
+BOX_EXTENTS_FINDER_TEMPLATE = ElementwiseTemplate(
+    arguments="""//CL:mako//
+    box_id_t aligned_nboxes,
+    box_id_t *box_child_ids,
+    coord_t *box_centers,
+    particle_id_t *box_particle_starts,
+    particle_id_t *box_particle_counts_nonchild
+
+    %for iaxis in range(dimensions):
+        , const coord_t *particle_${AXIS_NAMES[iaxis]}
+    %endfor
+    ,
+    const coord_t *particle_radii,
+    int enable_radii,
+
+    coord_t *box_particle_bounding_box_min,
+    coord_t *box_particle_bounding_box_max,
+    """,
+
+    operation=TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
+        box_id_t ibox = i;
+
+        ${load_center("box_center", "ibox")}

-        box_id_t parent_id = box_parent_ids[box_id];
-        morton_nr_t morton_nr = box_morton_nrs[box_id];
-        box_child_ids[parent_id + aligned_nboxes*morton_nr] = box_id;
+        <% axis_names = AXIS_NAMES[:dimensions] %>

-        /* walk up to root to find center */
-        %for idim in range(dimensions):
-            coord_t center_${idim} = 0;
+        // incorporate own particles
+        %for iaxis, ax in enumerate(axis_names):
+            coord_t min_particle_${ax} =
+                ${coord_vec_subscript_code("box_center", iaxis)};
+            coord_t max_particle_${ax} =
+                ${coord_vec_subscript_code("box_center", iaxis)};
        %endfor

-        box_id_t walk_parent_id = parent_id;
-        box_id_t current_box_id = box_id;
-        morton_nr_t walk_morton_nr = morton_nr;
-        while (walk_parent_id != current_box_id)
+        particle_id_t start = box_particle_starts[ibox];
+        particle_id_t stop = start + box_particle_counts_nonchild[ibox];
+
+        for (particle_id_t iparticle = start; iparticle < stop; ++iparticle)
        {
-            %for idim in range(dimensions):
-                {
-                    bool has_bit = (walk_morton_nr & ${2**(dimensions-1-idim)});
-                    center_${idim} = one_half*(
-                        center_${idim}
-                        - one_half
-                        + has_bit);
-                }
-            %endfor
+            coord_t particle_rad = 0;
+            %if srcntgts_have_extent:
+                // If only one of sources/targets has extent, the radius array
+                // for the other may well be a null pointer; hence the guard.
+                if (enable_radii)
+                    particle_rad = particle_radii[iparticle];
+            %endif

-            current_box_id = walk_parent_id;
-            walk_parent_id = box_parent_ids[walk_parent_id];
-            walk_morton_nr = box_morton_nrs[current_box_id];
+            %for iaxis, ax in enumerate(axis_names):
+                coord_t particle_coord_${ax} = particle_${ax}[iparticle];
+
+                min_particle_${ax} = min(
+                    min_particle_${ax},
+                    particle_coord_${ax} - particle_rad);
+                max_particle_${ax} = max(
+                    max_particle_${ax},
+                    particle_coord_${ax} + particle_rad);
+            %endfor
        }

-        coord_t extent = bbox.max_x - bbox.min_x;
-        %for idim in range(dimensions):
+        // incorporate child boxes
+        for (int morton_nr = 0; morton_nr < ${2**dimensions}; ++morton_nr)
        {
-            box_centers[box_id + aligned_nboxes*${idim}] =
-                bbox.min_${AXIS_NAMES[idim]} + extent*(one_half+center_${idim});
+            box_id_t child_id = box_child_ids[
+                    morton_nr * aligned_nboxes + ibox];
+
+            if (child_id == 0)
+                continue;
+
+            %for iaxis, ax in enumerate(axis_names):
+                min_particle_${ax} = min(
+                    min_particle_${ax},
+                    box_particle_bounding_box_min[
+                        ${iaxis} * aligned_nboxes + child_id]);
+                max_particle_${ax} = max(
+                    max_particle_${ax},
+                    box_particle_bounding_box_max[
+                        ${iaxis} * aligned_nboxes + child_id]);
+            %endfor
        }
+
+        // write result
+        %for iaxis, ax in enumerate(axis_names):
+            box_particle_bounding_box_min[
+                ${iaxis} * aligned_nboxes + ibox] = min_particle_${ax};
+            box_particle_bounding_box_max[
+                ${iaxis} * aligned_nboxes + ibox] = max_particle_${ax};
        %endfor
-    """)
+    """,
+    name="find_box_extents")

 # }}}

-
 # {{{ kernel creation top-level

+
+@log_process(logger)
 def get_tree_build_kernel_info(context, dimensions, coord_dtype,
        particle_id_dtype, box_id_dtype,
-        sources_are_targets, srcntgts_have_extent,
-        stick_out_factor, morton_nr_dtype, box_level_dtype,
-        adaptive):
+        sources_are_targets, srcntgts_extent_norm,
+        morton_nr_dtype, box_level_dtype, kind):
+    """
+    :arg srcntgts_extent_norm: one of ``None``, ``"l2"`` or ``"linf"``
+    """

-    logger.info("start building tree build kernels")
+    level_restrict = (kind == "adaptive-level-restricted")
+    adaptive = (kind != "non-adaptive")

    # {{{ preparation

@@ -1079,7 +1404,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
                "incorrect results.", stacklevel=4)

    from pyopencl.tools import dtype_to_c_struct, dtype_to_ctype
-    coord_vec_dtype = cl.array.vec.types[coord_dtype, dimensions]
+    coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)

    particle_id_dtype = np.dtype(particle_id_dtype)
    box_id_dtype = np.dtype(box_id_dtype)
@@ -1087,10 +1412,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
    dev = context.devices[0]
    morton_bin_count_dtype, _ = make_morton_bin_count_type(
            dev, dimensions, particle_id_dtype,
-            srcntgts_have_extent)
+            srcntgts_have_extent=srcntgts_extent_norm is not None)

    from boxtree.bounding_box import make_bounding_box_dtype
-    bbox_dtype, bbox_type_decl = make_bounding_box_dtype(
+    bbox_dtype, _bbox_type_decl = make_bounding_box_dtype(
            dev, dimensions, coord_dtype)

    from boxtree.tools import AXIS_NAMES
@@ -1098,31 +1423,33 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,

    from boxtree.tools import padded_bin
    from boxtree.tree import box_flags_enum
-    codegen_args = dict(
-            dimensions=dimensions,
-            axis_names=axis_names,
-            padded_bin=padded_bin,
-            coord_dtype=coord_dtype,
-            coord_vec_dtype=coord_vec_dtype,
-            bbox_dtype=bbox_dtype,
-            particle_id_dtype=particle_id_dtype,
-            morton_bin_count_dtype=morton_bin_count_dtype,
-            morton_nr_dtype=morton_nr_dtype,
-            box_id_dtype=box_id_dtype,
-            dtype_to_ctype=dtype_to_ctype,
-            AXIS_NAMES=AXIS_NAMES,
-            box_flags_enum=box_flags_enum,
-
-            adaptive=adaptive,
-
-            sources_are_targets=sources_are_targets,
-            srcntgts_have_extent=srcntgts_have_extent,
-
-            stick_out_factor=stick_out_factor,
-
-            enable_assert=False,
-            enable_printf=False,
-            )
+    codegen_args = {
+            "dimensions": dimensions,
+            "axis_names": axis_names,
+            "padded_bin": padded_bin,
+            "coord_dtype": coord_dtype,
+            "coord_vec_dtype": coord_vec_dtype,
+            "bbox_dtype": bbox_dtype,
+            "refine_weight_dtype": refine_weight_dtype,
+            "particle_id_dtype": particle_id_dtype,
+            "morton_bin_count_dtype": morton_bin_count_dtype,
+            "morton_nr_dtype": morton_nr_dtype,
+            "box_id_dtype": box_id_dtype,
+            "box_level_dtype": box_level_dtype,
+            "dtype_to_ctype": dtype_to_ctype,
+            "AXIS_NAMES": AXIS_NAMES,
+            "box_flags_enum": box_flags_enum,
+
+            "adaptive": adaptive,
+            "level_restrict": level_restrict,
+
+            "sources_are_targets": sources_are_targets,
+            "srcntgts_have_extent": srcntgts_extent_norm is not None,
+            "srcntgts_extent_norm": srcntgts_extent_norm,
+
+            "enable_assert": False,
+            "enable_printf": False,
+            }

    # }}}

@@ -1141,10 +1468,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,

    scan_preamble = (
            preamble_with_dtype_decls
-            + str(SCAN_PREAMBLE_TPL.render(**codegen_args))
+            + str(MORTON_NR_SCAN_PREAMBLE_TPL.render(**codegen_args))
            )

-    from pyopencl.tools import VectorArg, ScalarArg
+    from boxtree.tools import ScalarArg, VectorArg
    common_arguments = (
            [
                # box-local morton bin counts for each particle at the current level
@@ -1160,15 +1487,20 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
                # segment flags
                # invariant to sorting once set
                # (particles are only reordered within a box)
-                VectorArg(np.uint8, "box_start_flags"),  # [nsrcntgts]
+                VectorArg(np.uint8, "box_start_flags"),   # [nsrcntgts]

                VectorArg(box_id_dtype, "srcntgt_box_ids"),  # [nsrcntgts]
-                VectorArg(box_id_dtype, "split_box_ids"),  # [nsrcntgts]
+                VectorArg(box_id_dtype, "split_box_ids"),  # [nboxes]

                # per-box morton bin counts
                VectorArg(morton_bin_count_dtype, "box_morton_bin_counts"),
+                # [nboxes]
+
+                VectorArg(refine_weight_dtype, "refine_weights"),
                # [nsrcntgts]

+                ScalarArg(refine_weight_dtype, "max_leaf_refine_weight"),
+
                # particle# at which each box starts
                VectorArg(particle_id_dtype, "box_srcntgt_starts"),  # [nboxes]

@@ -1178,15 +1510,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
                # pointer to parent box
                VectorArg(box_id_dtype, "box_parent_ids"),  # [nboxes]

-                # morton nr identifier {quadr,oct}ant of parent in which this
-                # box was created
-                VectorArg(morton_nr_dtype, "box_morton_nrs"),  # [nboxes]
-
-                # number of boxes total
-                VectorArg(box_id_dtype, "nboxes"),  # [1]
+                # level number
+                VectorArg(box_level_dtype, "box_levels"),  # [nboxes]

                ScalarArg(np.int32, "level"),
-                ScalarArg(particle_id_dtype, "max_particles_in_box"),
                ScalarArg(bbox_dtype, "bbox"),

                VectorArg(particle_id_dtype, "user_srcntgt_ids"),  # [nsrcntgts]
@@ -1196,26 +1523,35 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
            + [VectorArg(coord_dtype, ax) for ax in axis_names]

            + ([VectorArg(coord_dtype, "srcntgt_radii")]
-                if srcntgts_have_extent else [])
+                if srcntgts_extent_norm is not None else [])
            )

+    morton_count_scan_arguments = list(common_arguments)
+
+    if srcntgts_extent_norm is not None:
+        morton_count_scan_arguments += [
+            (ScalarArg(coord_dtype, "stick_out_factor"))
+        ]
+
    from pyopencl.scan import GenericScanKernel
    morton_count_scan = GenericScanKernel(
            context, morton_bin_count_dtype,
-            arguments=common_arguments,
+            arguments=morton_count_scan_arguments,
            input_expr=(
-                "scan_t_from_particle(%s)"
-                % ", ".join([
-                    "i", "level", "&bbox", "morton_nrs",
+                "scan_t_from_particle({})".format(", ".join([
+                    "i", "box_levels[srcntgt_box_ids[i]]", "&bbox", "morton_nrs",
                    "user_srcntgt_ids",
+                    "refine_weights",
                    ]
-                    + ["%s" % ax for ax in axis_names]
-                    + (["srcntgt_radii"] if srcntgts_have_extent else []))),
+                    + [f"{ax}" for ax in axis_names]
+                    + (["srcntgt_radii, stick_out_factor"]
+                       if srcntgts_extent_norm is not None else [])))),
            scan_expr="scan_t_add(a, b, across_seg_boundary)",
            neutral="scan_t_neutral()",
            is_segment_start_expr="box_start_flags[i]",
-            output_statement=SCAN_OUTPUT_STMT_TPL.render(**codegen_args),
-            preamble=scan_preamble)
+            output_statement=MORTON_NR_SCAN_OUTPUT_STMT_TPL.render(**codegen_args),
+            preamble=scan_preamble,
+            name_prefix="morton_scan")

    # }}}

@@ -1231,51 +1567,100 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
                ("box_id_t", box_id_dtype),
                ("morton_counts_t", morton_bin_count_dtype),
                ("box_level_t", box_level_dtype),
+                ("refine_weight_t", refine_weight_dtype),
                ),
            var_values=(
                ("dimensions", dimensions),
-                ("srcntgts_have_extent", srcntgts_have_extent),
+                ("srcntgts_have_extent", srcntgts_extent_norm is not None),
+                ("srcntgts_extent_norm", srcntgts_extent_norm),
                ("adaptive", adaptive),
+                ("padded_bin", padded_bin),
+                ("level_restrict", level_restrict),
                ),
            more_preamble=generic_preamble)

    # }}}

-    # {{{ split-and-sort
+    # {{{ box splitter
+
+    # Work around a bug in Mako < 0.7.3
+    # FIXME: Is this needed?
+    box_s_codegen_args = codegen_args.copy()
+    box_s_codegen_args.update(
+        dim=None,
+        boundary_morton_nr=None)
+
+    box_splitter_kernel_source = BOX_SPLITTER_KERNEL_TPL.render(**box_s_codegen_args)
+
+    from pyopencl.elementwise import ElementwiseKernel
+    box_splitter_kernel = ElementwiseKernel(
+            context,
+            common_arguments
+            + [
+                VectorArg(np.int32, "box_has_children"),
+                VectorArg(np.int32, "box_force_split"),
+                ScalarArg(coord_dtype, "root_extent"),
+                ]
+            + [VectorArg(box_id_dtype, f"box_child_ids_mnr_{mnr}")
+                          for mnr in range(2**dimensions)]
+            + [VectorArg(coord_dtype, f"box_centers_{ax}")
+                          for ax in axis_names],
+            str(box_splitter_kernel_source),
+            name="box_splitter",
+            preamble=preamble_with_dtype_decls
+            )
+
+    # }}}
+
+    # {{{ particle renumberer

    # Work around a bug in Mako < 0.7.3
-    s_and_s_codegen_args = codegen_args.copy()
-    s_and_s_codegen_args.update(
+    # FIXME: Copied from above. It may not be necessary?
+    part_rn_codegen_args = codegen_args.copy()
+    part_rn_codegen_args.update(
            dim=None,
            boundary_morton_nr=None)

-    split_and_sort_preamble = \
-            SPLIT_AND_SORT_PREAMBLE_TPL.render(**s_and_s_codegen_args)
+    particle_renumberer_preamble = \
+            PARTICLE_RENUMBERER_PREAMBLE_TPL.render(**part_rn_codegen_args)

-    split_and_sort_kernel_source = SPLIT_AND_SORT_KERNEL_TPL.render(**codegen_args)
+    particle_renumberer_kernel_source = \
+            PARTICLE_RENUMBERER_KERNEL_TPL.render(**codegen_args)

    from pyopencl.elementwise import ElementwiseKernel
-    split_and_sort_kernel = ElementwiseKernel(
+    particle_renumberer_kernel = ElementwiseKernel(
            context,
-            common_arguments
-            + [
-                VectorArg(particle_id_dtype, "new_user_srcntgt_ids",
-                    with_offset=True),
-                VectorArg(np.int32, "have_oversize_split_box", with_offset=True),
-                VectorArg(box_id_dtype, "new_srcntgt_box_ids", with_offset=True),
-                VectorArg(box_level_dtype, "box_levels", with_offset=True),
-                ],
-            str(split_and_sort_kernel_source), name="split_and_sort",
+            [*common_arguments,
+                VectorArg(np.int32, "box_has_children"),
+                VectorArg(np.int32, "box_force_split"),
+                VectorArg(particle_id_dtype, "new_user_srcntgt_ids"),
+                VectorArg(box_id_dtype, "new_srcntgt_box_ids")],
+            str(particle_renumberer_kernel_source), name="renumber_particles",
            preamble=(
                preamble_with_dtype_decls
-                + str(split_and_sort_preamble))
+                + str(particle_renumberer_preamble))
            )

    # }}}

+    # {{{ level restrict propagator
+
+    if level_restrict:
+        # At compile time the level restrict kernel requires fixing a
+        # "max_levels" constant for traversing the tree. This constant cannot be
+        # known at this point, hence we return a kernel builder.
+
+        level_restrict_kernel_builder = partial(build_level_restrict_kernel,
+            context, preamble_with_dtype_decls, dimensions, axis_names, box_id_dtype,
+            coord_dtype, box_level_dtype)
+    else:
+        level_restrict_kernel_builder = None
+
+    # }}}
+
    # END KERNELS IN LEVEL LOOP

-    if srcntgts_have_extent:
+    if srcntgts_extent_norm is not None:
        extract_nonchild_srcntgt_count_kernel = \
                EXTRACT_NONCHILD_SRCNTGT_COUNT_TPL.build(
                        context,
@@ -1294,26 +1679,53 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,

    # FIXME: Turn me into a scan template

-    from pyopencl.tools import VectorArg
+    from boxtree.tools import VectorArg
    find_prune_indices_kernel = GenericScanKernel(
            context, box_id_dtype,
            arguments=[
                # input
                VectorArg(particle_id_dtype, "box_srcntgt_counts_cumul"),
                # output
-                VectorArg(box_id_dtype, "to_box_id"),
-                VectorArg(box_id_dtype, "from_box_id"),
+                VectorArg(box_id_dtype, "src_box_id"),
+                VectorArg(box_id_dtype, "dst_box_id"),
                VectorArg(box_id_dtype, "nboxes_post_prune"),
                ],
-            input_expr="box_srcntgt_counts_cumul[i] == 0 ? 1 : 0",
+            input_expr="box_srcntgt_counts_cumul[i] != 0",
            preamble=box_flags_enum.get_c_defines(),
            scan_expr="a+b", neutral="0",
            output_statement="""
-                to_box_id[i] = i-prev_item;
                if (box_srcntgt_counts_cumul[i])
-                    from_box_id[i-prev_item] = i;
-                if (i+1 == N) *nboxes_post_prune = N-item;
-                """)
+                {
+                    dst_box_id[i] = item - 1;
+                    src_box_id[item - 1] = i;
+                }
+                if (i+1 == N) *nboxes_post_prune = item;
+                """,
+            name_prefix="find_prune_indices_scan")
+
+    # }}}
+
+    # {{{ find new level box counts
+
+    find_level_box_counts_kernel = GenericScanKernel(
+        context, box_id_dtype,
+        arguments=[
+            # input
+            VectorArg(box_level_dtype, "box_levels"),  # [nboxes]
+            # output
+            VectorArg(box_id_dtype, "level_box_counts"),  # [nlevels]
+            ],
+        input_expr="1",
+        is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i - 1]",
+        scan_expr="across_seg_boundary ? b : a + b",
+        neutral="0",
+        output_statement=r"""//CL//
+        if (i + 1 == N || box_levels[i] != box_levels[i + 1])
+        {
+            level_box_counts[box_levels[i]] = item;
+        }
+        """,
+        name_prefix="find_level_box_counts_scan")

    # }}}

@@ -1364,7 +1776,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
                    ("box_id_t", box_id_dtype),
                    ),
                var_values=(
-                    ("srcntgts_have_extent", srcntgts_have_extent),
+                    ("srcntgts_have_extent", srcntgts_extent_norm is not None),
                    ("sources_are_targets", sources_are_targets),
                    ),
                more_preamble=generic_preamble)
@@ -1385,7 +1797,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
            ("box_flags_t", box_flags_enum.dtype),
            ("box_level_t", box_level_dtype),
            )
-    codegen_args_tuples = tuple(six.iteritems(codegen_args))
+    codegen_args_tuples = tuple(codegen_args.items())
    box_info_kernel = BOX_INFO_KERNEL_TPL.build(
            context,
            type_aliases,
@@ -1395,7 +1807,25 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,

    # }}}

-    logger.info("tree build kernels built")
+    # {{{ box extent
+
+    box_extents_finder_kernel = BOX_EXTENTS_FINDER_TEMPLATE.build(context,
+        type_aliases=(
+            ("box_id_t", box_id_dtype),
+            ("coord_t", coord_dtype),
+            ("coord_vec_t", get_coord_vec_dtype(coord_dtype, dimensions)),
+            ("particle_id_t", particle_id_dtype),
+            ),
+        var_values=(
+            ("coord_vec_subscript_code",
+                partial(coord_vec_subscript_code, dimensions)),
+            ("dimensions", dimensions),
+            ("AXIS_NAMES", AXIS_NAMES),
+            ("srcntgts_have_extent", srcntgts_extent_norm is not None),
+            ),
+    )
+
+    # }}}

    return _KernelInfo(
            particle_id_dtype=particle_id_dtype,
@@ -1404,15 +1834,20 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,

            morton_count_scan=morton_count_scan,
            split_box_id_scan=split_box_id_scan,
-            split_and_sort_kernel=split_and_sort_kernel,
+            box_splitter_kernel=box_splitter_kernel,
+            particle_renumberer_kernel=particle_renumberer_kernel,
+            level_restrict=level_restrict,
+            level_restrict_kernel_builder=level_restrict_kernel_builder,

            extract_nonchild_srcntgt_count_kernel=(
                extract_nonchild_srcntgt_count_kernel),
            find_prune_indices_kernel=find_prune_indices_kernel,
+            find_level_box_counts_kernel=find_level_box_counts_kernel,
            srcntgt_permuter=srcntgt_permuter,
            source_counter=source_counter,
            source_and_target_index_finder=source_and_target_index_finder,
            box_info_kernel=box_info_kernel,
+            box_extents_finder_kernel=box_extents_finder_kernel,
            )

 # }}}
@@ -1501,7 +1936,6 @@ POINT_SOURCE_LINKING_BOX_POINT_SOURCES = ElementwiseTemplate(

 # }}}

-
 # {{{ target filtering

 TREE_ORDER_TARGET_FILTER_SCAN_TPL = ScanTemplate(
@@ -1575,4 +2009,4 @@ TREE_ORDER_TARGET_FILTER_INDEX_TPL = ElementwiseTemplate(

 # }}}

-# vim: foldmethod=marker:filetype=pyopencl
+# vim: foldmethod=marker
--- a/boxtree/tree_of_boxes.py
+++ b/boxtree/tree_of_boxes.py
+"""
+.. currentmodule:: boxtree
+
+.. _tree-of-boxes:
+
+Manipulating Trees of Boxes
+---------------------------
+
+These functions manipulate instances of :class:`TreeOfBoxes`.
+
+.. note::
+
+    These functions currently keep their bulk data in :class:`numpy.ndarray`
+    instances.  This contrasts with the particle-based tree (:class:`Tree`),
+    which operates on data in :class:`pyopencl.array.Array` instances.  Along
+    with the rest of :mod:`boxtree`, this will migrate to :mod:`arraycontext`
+    in the future.
+
+.. autofunction:: make_tree_of_boxes_root
+.. autofunction:: refine_tree_of_boxes
+.. autofunction:: uniformly_refine_tree_of_boxes
+.. autofunction:: coarsen_tree_of_boxes
+.. autofunction:: refine_and_coarsen_tree_of_boxes
+.. autofunction:: make_meshmode_mesh_from_leaves
+"""
+
+__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+
+from boxtree.tree import TreeOfBoxes, box_flags_enum
+
+
+if TYPE_CHECKING or getattr(sys, "_BUILDING_SPHINX_DOCS", False):
+    from meshmode.mesh import Mesh
+
+
+# {{{ utils for tree of boxes
+
+def _compute_tob_box_flags(box_child_ids: np.ndarray) -> np.ndarray:
+    nboxes = box_child_ids.shape[1]
+    # For the time being, we will work with the assumption that each box
+    # in the tree is both a source and a target box.
+    box_flags = np.full(
+            nboxes,
+            box_flags_enum.IS_SOURCE_BOX | box_flags_enum.IS_TARGET_BOX,
+            dtype=box_flags_enum.dtype)
+
+    box_is_leaf = np.all(box_child_ids == 0, axis=0)
+    box_flags[box_is_leaf] = box_flags[box_is_leaf] | box_flags_enum.IS_LEAF_BOX
+
+    box_flags[~box_is_leaf] = box_flags[~box_is_leaf] | (
+            box_flags_enum.HAS_SOURCE_CHILD_BOXES
+            | box_flags_enum.HAS_TARGET_CHILD_BOXES)
+
+    return box_flags
+
+
+def _resized_array(arr: np.ndarray, new_size: int) -> np.ndarray:
+    """Return a resized copy of the array. The new_size is a scalar which is
+    applied to the last dimension.
+    """
+    old_size = arr.shape[-1]
+    prefix = (slice(None), ) * (arr.ndim - 1)
+    if old_size >= new_size:
+        key = (*prefix, slice(new_size))
+        return arr[key].copy()
+    else:
+        new_shape = list(arr.shape)
+        new_shape[-1] = new_size
+        new_arr = np.zeros(new_shape, arr.dtype)
+
+        key = (*prefix, slice(old_size))
+        new_arr[key] = arr
+        return new_arr
+
+
+def _vec_of_signs(dim: int, i: int) -> np.ndarray:
+    """The sign vector is obtained by converting i to a dim-bit binary.
+    """
+    # e.g. bin(10) = '0b1010'
+    binary_digits = [int(bd) for bd in bin(i)[2:]]
+    n = len(binary_digits)
+    assert n <= dim
+    return np.array([0]*(dim-n) + binary_digits) * 2 - 1
+
+# }}}
+
+
+# {{{ refine/coarsen a tree of boxes
+
+def refine_tree_of_boxes(tob: TreeOfBoxes, refine_flags: np.ndarray) -> TreeOfBoxes:
+    """Make a refined copy of `tob` where boxes flagged with `refine_flags` are
+    refined.
+    """
+    return refine_and_coarsen_tree_of_boxes(tob, refine_flags, None)
+
+
+def uniformly_refine_tree_of_boxes(tob: TreeOfBoxes) -> TreeOfBoxes:
+    """Make a uniformly refined copy of `tob`.
+    """
+    refine_flags = np.zeros(tob.nboxes, bool)
+    refine_flags[tob.box_flags & box_flags_enum.IS_LEAF_BOX != 0] = 1
+    return refine_tree_of_boxes(tob, refine_flags)
+
+
+def coarsen_tree_of_boxes(
+        tob: TreeOfBoxes, coarsen_flags: np.ndarray,
+        error_on_ignored_flags: bool = True
+        ) -> TreeOfBoxes:
+    """Make a coarsened copy of `tob` where boxes flagged with `coarsen_flags`
+    are coarsened.
+    """
+    return refine_and_coarsen_tree_of_boxes(
+        tob, None, coarsen_flags,
+        error_on_ignored_flags=error_on_ignored_flags)
+
+
+def _apply_refine_flags_without_sorting(refine_flags, tob):
+    box_is_leaf = tob.box_flags & box_flags_enum.IS_LEAF_BOX != 0
+
+    if refine_flags[~box_is_leaf].any():
+        raise ValueError("attempting to split non-leaf")
+
+    refine_parents, = np.where(refine_flags)
+    if len(refine_parents) == 0:
+        return tob
+
+    dim = tob.dimensions
+    nchildren = 2**dim
+    n_new_boxes = len(refine_parents) * nchildren
+    nboxes_new = tob.nboxes + n_new_boxes
+
+    child_box_starts = (
+            tob.nboxes
+            + nchildren * np.arange(len(refine_parents)))
+
+    refine_parents_per_child = np.empty(
+            (nchildren, len(refine_parents)), np.intp)
+    refine_parents_per_child[:] = refine_parents.reshape(-1)
+    refine_parents_per_child = refine_parents_per_child.reshape(-1)
+
+    box_parents = _resized_array(tob.box_parent_ids, nboxes_new)
+    box_centers = _resized_array(tob.box_centers, nboxes_new)
+    box_children = _resized_array(tob.box_child_ids, nboxes_new)
+    box_levels = _resized_array(tob.box_levels, nboxes_new)
+
+    # new boxes are appended at the end, so applying coarsen_flags wrt the
+    # original tree is still meaningful after this
+    box_parents[tob.nboxes:] = refine_parents_per_child
+    box_levels[tob.nboxes:] = tob.box_levels[box_parents[tob.nboxes:]] + 1
+    box_children[:, refine_parents] = (
+        child_box_starts + np.arange(nchildren).reshape(-1, 1))
+
+    for i in range(2**dim):
+        children_i = box_children[i, refine_parents]
+        offsets = (
+                tob.root_extent * _vec_of_signs(dim, i).reshape(-1, 1)
+                * (1/2**(1+box_levels[children_i])))
+        box_centers[:, children_i] = (
+                box_centers[:, refine_parents] + offsets)
+
+    return TreeOfBoxes(
+        box_centers=box_centers,
+        root_extent=tob.root_extent,
+        box_parent_ids=box_parents,
+        box_child_ids=box_children,
+        box_levels=box_levels,
+
+        box_flags=_compute_tob_box_flags(box_children),
+        level_start_box_nrs=None,
+        box_id_dtype=tob.box_id_dtype,
+        box_level_dtype=tob.box_level_dtype,
+        coord_dtype=tob.coord_dtype,
+        sources_have_extent=tob.sources_have_extent,
+        targets_have_extent=tob.targets_have_extent,
+        extent_norm=tob.extent_norm,
+        stick_out_factor=tob.stick_out_factor,
+        _is_pruned=tob._is_pruned,
+        )
+
+
+def _apply_coarsen_flags(coarsen_flags, tob, error_on_ignored_flags=True):
+    box_is_leaf = tob.box_flags & box_flags_enum.IS_LEAF_BOX != 0
+    if coarsen_flags[~box_is_leaf].any():
+        raise ValueError("attempting to coarsen non-leaf")
+    coarsen_sources, = np.where(coarsen_flags)
+    if coarsen_sources.size == 0:
+        return tob
+
+    coarsen_parents = tob.box_parent_ids[coarsen_sources]
+    coarsen_peers = tob.box_child_ids[:, coarsen_parents].reshape(-1)
+    coarsen_peer_is_leaf = box_is_leaf[coarsen_peers]
+    coarsen_exec_flags = np.all(coarsen_peer_is_leaf, axis=0)
+
+    # when a leaf box marked for coarsening has non-leaf peers
+    coarsen_flags_ignored = (coarsen_exec_flags != coarsen_flags)
+    if np.any(coarsen_flags_ignored):
+        msg = (f"{np.sum(coarsen_flags_ignored)} out of "
+               f"{np.sum(coarsen_flags)} coarsening flags ignored "
+               "to prevent removing non-leaf boxes")
+        if error_on_ignored_flags:
+            raise RuntimeError(msg)
+        else:
+            import warnings
+            warnings.warn(msg, stacklevel=3)
+
+    # deleted boxes are marked as:
+    # level = inf
+    # parent = -1
+    coarsen_parents = coarsen_parents[coarsen_exec_flags]
+    coarsen_peers = coarsen_peers[:, coarsen_exec_flags]
+    box_parents = tob.box_parent_ids.copy()
+    box_parents[coarsen_peers] = -1
+    box_children = tob.box_child_ids.copy()
+    box_children[:, coarsen_parents] = 0
+    box_levels = tob.box_levels.copy()
+    box_levels[coarsen_peers] = np.inf
+
+    return TreeOfBoxes(
+        box_centers=tob.box_centers,
+        root_extent=tob.root_extent,
+        box_parent_ids=box_parents,
+        box_child_ids=box_children,
+        box_levels=box_levels,
+
+        box_flags=_compute_tob_box_flags(box_children),
+        level_start_box_nrs=None,
+        box_id_dtype=tob.box_id_dtype,
+        box_level_dtype=tob.box_level_dtype,
+        coord_dtype=tob.coord_dtype,
+        sources_have_extent=tob.sources_have_extent,
+        targets_have_extent=tob.targets_have_extent,
+        extent_norm=tob.extent_norm,
+        stick_out_factor=tob.stick_out_factor,
+        _is_pruned=tob._is_pruned,
+        )
+
+
+def _sort_boxes_by_level(tob, queue=None):
+    if not np.any(np.diff(tob.box_levels) < 0):
+        return tob
+
+    # reorder boxes to into non-decreasing levels
+    neworder = np.argsort(tob.box_levels)
+    box_centers = tob.box_centers[:, neworder]
+    box_parent_ids = tob.box_parent_ids[neworder]
+    box_child_ids = tob.box_child_ids[:, neworder]
+    box_levels = tob.box_levels[neworder]
+
+    return TreeOfBoxes(
+        box_centers=box_centers,
+        root_extent=tob.root_extent,
+        box_parent_ids=box_parent_ids,
+        box_child_ids=box_child_ids,
+        box_levels=box_levels,
+
+        box_flags=_compute_tob_box_flags(box_child_ids),
+        level_start_box_nrs=None,
+        box_id_dtype=tob.box_id_dtype,
+        box_level_dtype=tob.box_level_dtype,
+        coord_dtype=tob.coord_dtype,
+        sources_have_extent=tob.sources_have_extent,
+        targets_have_extent=tob.targets_have_extent,
+        extent_norm=tob.extent_norm,
+        stick_out_factor=tob.stick_out_factor,
+        _is_pruned=tob._is_pruned,
+        )
+
+
+def _sort_and_prune_deleted_boxes(tob):
+    tob = _sort_boxes_by_level(tob)
+    n_stale_boxes = np.sum(tob.box_levels == np.inf)
+    newn = tob.nboxes - n_stale_boxes
+
+    return TreeOfBoxes(
+        root_extent=tob.root_extent,
+        box_parent_ids=tob.box_parent_ids[:newn],
+        box_child_ids=tob.box_child_ids[:, :newn],
+        box_levels=tob.box_levels[:newn],
+        box_centers=tob.box_centers[:, :newn],
+
+        box_flags=_compute_tob_box_flags(tob.box_child_ids[:, :newn]),
+        level_start_box_nrs=None,
+        box_id_dtype=tob.box_id_dtype,
+        box_level_dtype=tob.box_level_dtype,
+        coord_dtype=tob.coord_dtype,
+        sources_have_extent=tob.sources_have_extent,
+        targets_have_extent=tob.targets_have_extent,
+        extent_norm=tob.extent_norm,
+        stick_out_factor=tob.stick_out_factor,
+        _is_pruned=tob._is_pruned,
+        )
+
+
+def refine_and_coarsen_tree_of_boxes(
+        tob: TreeOfBoxes,
+        refine_flags: np.ndarray | None = None,
+        coarsen_flags: np.ndarray | None = None, *,
+        error_on_ignored_flags: bool = True,
+        ) -> TreeOfBoxes:
+    """Make a refined/coarsened copy. When children of the same parent box
+    are marked differently, the refinement flag takes priority.
+
+    Both refinement and coarsening flags can only be set of leaves.
+    To prevent drastic mesh change, coarsening is only executed when a leaf
+    box is marked for coarsening, and its parent's children are all leaf
+    boxes (so that change in the number of boxes is bounded per box flagged).
+    Please note that the above behavior may be subject to change in the future.
+
+    :arg refine_flags: a boolean array of size `nboxes`.
+    :arg coarsen_flags: a boolean array of size `nboxes`.
+    :arg error_on_ignored_flags: if true, an exception is raised when enforcing
+        level restriction requires ignoring some coarsening flags.
+    :returns: a processed copy of the tree.
+    """
+    if refine_flags is None:
+        refine_flags = np.zeros(tob.nboxes, dtype=bool)
+    if coarsen_flags is None:
+        coarsen_flags = np.zeros(tob.nboxes, dtype=bool)
+
+    if (refine_flags & coarsen_flags).any():
+        raise ValueError("some boxes are simultaneously marked "
+                         "to refine and coarsen")
+
+    tob = _apply_refine_flags_without_sorting(refine_flags, tob)
+    coarsen_flags = _resized_array(coarsen_flags, tob.nboxes)
+    tob = _apply_coarsen_flags(coarsen_flags, tob, error_on_ignored_flags)
+    return _sort_and_prune_deleted_boxes(tob)
+
+# }}}
+
+
+# {{{ make_tree_of_boxes_root
+
+def make_tree_of_boxes_root(
+        bbox: tuple[np.ndarray, np.ndarray], *,
+        box_id_dtype: Any = None,
+        box_level_dtype: Any = None,
+        coord_dtype: Any = None,
+        ) -> TreeOfBoxes:
+    """
+    Make the minimal tree of boxes, consisting of a single root box filling
+    *bbox*.
+
+    .. note::
+
+        *bbox* is expected to be square (with tolerances as accepted by
+        :func:`numpy.allclose`).
+
+    :arg bbox: a :class:`tuple` of ``(lower_bounds, upper_bounds)`` for the
+        bounding box.
+    """
+    assert len(bbox) == 2
+
+    from pytools import single_valued
+    dim = single_valued([len(bbox[0]), len(bbox[1])])
+
+    if box_id_dtype is None:
+        box_id_dtype = np.int32
+    box_id_dtype = np.dtype(box_id_dtype)
+
+    if box_level_dtype is None:
+        box_level_dtype = np.int32
+    box_level_dtype = np.dtype(box_level_dtype)
+
+    if coord_dtype is None:
+        coord_dtype = bbox[0].dtype
+    coord_dtype = np.dtype(coord_dtype)
+
+    box_centers = np.array(
+        [(bbox[0][iaxis] + bbox[1][iaxis]) * 0.5 for iaxis in range(dim)],
+        dtype=coord_dtype,
+        ).reshape(dim, 1)
+    root_extent = single_valued(
+        np.array(
+            [(bbox[1][iaxis] - bbox[0][iaxis]) for iaxis in range(dim)],
+            dtype=coord_dtype),
+        equality_pred=np.allclose)
+
+    box_parent_ids = np.array([0], dtype=box_id_dtype)
+    box_parent_ids[0] = -1  # root has no parent
+
+    box_child_ids = np.array([0] * 2**dim, box_id_dtype).reshape(2**dim, 1)
+
+    return TreeOfBoxes(
+            box_centers=box_centers,
+            root_extent=root_extent,
+            box_parent_ids=box_parent_ids,
+            box_child_ids=box_child_ids,
+            box_levels=np.array([0], box_level_dtype),
+
+            box_flags=_compute_tob_box_flags(box_child_ids),
+            level_start_box_nrs=np.array([0], dtype=box_level_dtype),
+
+            box_id_dtype=box_id_dtype,
+            box_level_dtype=box_level_dtype,
+            coord_dtype=coord_dtype,
+            sources_have_extent=False,
+            targets_have_extent=False,
+            extent_norm="linf",
+            stick_out_factor=0,
+            _is_pruned=True,
+            )
+
+# }}}
+
+
+# {{{ make_meshmode_mesh_from_leaves
+
+def make_meshmode_mesh_from_leaves(tob: TreeOfBoxes) -> tuple["Mesh", np.ndarray]:
+    """Make a :class:`~meshmode.mesh.Mesh` from the leaf boxes of the tree
+    of boxes *tob*.
+
+    :returns: A tuple of the mesh and a vector of the element number -> box number
+        mapping.
+    """
+    dim = tob.dimensions
+    lfboxes = tob.leaf_boxes
+    lfcenters = tob.box_centers[:, lfboxes]
+    lflevels = tob.box_levels[lfboxes]
+    lfradii = tob.root_extent / 2 / (2**lflevels)
+
+    # use tensor product nodes ordering
+    import modepy as mp
+    cell_nodes_1d = np.array([-1, 1])
+    cell_nodes = mp.tensor_product_nodes(dim, cell_nodes_1d)
+
+    lfvertices = (
+        np.repeat(lfcenters, 2**dim, axis=1)
+        + np.repeat(lfradii, 2**dim) * np.tile(cell_nodes, (1, len(lfboxes)))
+    )
+
+    # FIXME: purge redundant vertices
+    from meshmode.mesh import TensorProductElementGroup, make_mesh
+    from meshmode.mesh.generation import make_group_from_vertices
+
+    vertex_indices = np.arange(
+        len(lfboxes) * 2**dim, dtype=np.int32).reshape([-1, 2**dim])
+    group = make_group_from_vertices(
+        lfvertices, vertex_indices, 1,
+        group_cls=TensorProductElementGroup,
+        unit_nodes=None)
+
+    return make_mesh(lfvertices, [group]), tob.leaf_boxes
+
+# }}}
+
+# vim: foldmethod=marker
--- a/boxtree/version.py
+++ b/boxtree/version.py
-VERSION = (2013, 1)
-VERSION_TEXT = ".".join(str(i) for i in VERSION)
+from importlib import metadata
+
+
+def _parse_version(version: str) -> tuple[tuple[int, ...], str]:
+    import re
+
+    m = re.match(r"^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)
+    assert m is not None
+
+    return tuple(int(nr) for nr in m.group(1).split(".")), m.group(2)
+
+
+# Version string of the "boxtree" distribution as reported by
+# importlib.metadata.
+VERSION_TEXT = metadata.version("boxtree")
+# VERSION is the tuple of integer components of VERSION_TEXT, VERSION_STATUS
+# its trailing suffix, both as returned by _parse_version.
+VERSION, VERSION_STATUS = _parse_version(VERSION_TEXT)
--- a/boxtree/visualization.py
+++ b/boxtree/visualization.py
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-from six.moves import zip
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -25,6 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

+import numpy as np
+
+
+# {{{ utilities

 def int_to_roman(inp):
    """
@@ -34,13 +33,13 @@ def int_to_roman(inp):
    # https://code.activestate.com/recipes/81611-roman-numerals/

    if not isinstance(inp, int):
-        raise TypeError("expected integer, got %s" % type(inp))
+        raise TypeError(f"expected integer, got {type(inp)}")
    if inp == 0:
        return "Z"
    if not 0 < inp < 4000:
-        raise ValueError("Argument must be between 1 and 3999 (got %d)" % inp)
-    ints = (1000, 900,  500, 400, 100,  90, 50,  40, 10,  9,   5,  4,   1)
-    nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
+        raise ValueError(f"Argument must be between 1 and 3999 (got {inp})")
+    ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
+    nums = ("M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I")
    result = ""
    for i in range(len(ints)):
        count = int(inp / ints[i])
@@ -48,6 +47,10 @@ def int_to_roman(inp):
        inp -= ints[i] * count
    return result

+# }}}
+
+
+# {{{ tree plotting

 class TreePlotter:
    """Assumes that the tree has data living on the host.
@@ -81,13 +84,19 @@ class TreePlotter:
        """
        :arg kwargs: keyword arguments to pass on to
            :class:`matplotlib.patches.PathPatch`,
-            e.g. `facecolor='red', edgecolor='yellow', alpha=0.5`
+            e.g. `facecolor="red", edgecolor="yellow", alpha=0.5`
        """

        el, eh = self.tree.get_box_extent(ibox)

-        import matplotlib.pyplot as pt
+        shrink_factor = kwargs.pop("shrink_factor", 0)
+        if shrink_factor:
+            center = 0.5*(el+eh)
+            el += (center-el)*shrink_factor
+            eh += (center-eh)*shrink_factor
+
        import matplotlib.patches as mpatches
+        import matplotlib.pyplot as pt
        from matplotlib.path import Path

        pathdata = [
@@ -98,7 +107,7 @@ class TreePlotter:
            (Path.CLOSEPOLY, (el[0], el[1])),
            ]

-        codes, verts = zip(*pathdata)
+        codes, verts = zip(*pathdata, strict=True)
        path = Path(verts, codes)
        patch = mpatches.PathPatch(path, **kwargs)
        pt.gca().add_patch(patch)
@@ -113,7 +122,7 @@ class TreePlotter:
            lev = int(tree.box_levels[ibox])
            pt.text(x, y, str(ibox), fontsize=20*1.15**(-lev),
                    ha="center", va="center",
-                    bbox=dict(facecolor='white', alpha=0.5, lw=0))
+                    bbox={"facecolor": "white", "alpha": 0.5, "lw": 0})

    def get_tikz_for_tree(self):
        if self.tree.dimensions != 2:
@@ -121,28 +130,28 @@ class TreePlotter:

        lines = []

-        lines.append(r"\def\nboxes{%d}" % self.tree.nboxes)
-        lines.append(r"\def\lastboxnr{%d}" % (self.tree.nboxes-1))
+        lines.append(r"\def\nboxes{%d}" % self.tree.nboxes)  # noqa: UP031
+        lines.append(r"\def\lastboxnr{%d}" % (self.tree.nboxes-1))  # noqa: UP031
        for ibox in range(self.tree.nboxes):
            el, eh = self.tree.get_box_extent(ibox)
+            el_0, el_1 = float(el[0]), float(el[1])
+            eh_0, eh_1 = float(eh[0]), float(eh[1])

            c = self.tree.box_centers[:, ibox]
+            c_0, c_1 = float(c[0]), float(c[1])

            lines.append(
-                    r"\coordinate (boxl%d) at (%r, %r);"
-                    % (ibox, float(el[0]), float(el[1])))
+                fr"\coordinate (boxl{ibox}) at ({el_0!r}, {el_1!r});")
            lines.append(
-                    r"\coordinate (boxh%d) at (%r, %r);"
-                    % (ibox, float(eh[0]), float(eh[1])))
+                fr"\coordinate (boxh{ibox}) at ({eh_0!r}, {eh_1!r});")
            lines.append(
-                    r"\coordinate (boxc%d) at (%r, %r);"
-                    % (ibox, float(c[0]), float(c[1])))
+                fr"\coordinate (boxc{ibox}) at ({c_0!r}, {c_1!r});")
            lines.append(
-                    r"\def\boxsize%s{%r}"
-                    % (int_to_roman(ibox), float(eh[0]-el[0])))
+                r"\def\boxsize%s{%r}" % (int_to_roman(ibox), eh_0 - el_0)   # noqa: UP031
+                )
            lines.append(
-                    r"\def\boxlevel%s{%r}"
-                    % (int_to_roman(ibox), self.tree.box_levels[ibox]))
+                r"\dev\boxlevel%s{%r}" % (int_to_roman(ibox),               # noqa: UP031
+                                          self.tree.box_levels[ibox]))

        lines.append(
                r"\def\boxpath#1{(boxl#1) rectangle (boxh#1)}")
@@ -158,4 +167,111 @@ class TreePlotter:
                r"}}")
        return "\n".join(lines)

+# }}}
+
+
+# {{{ traversal plotting
+
+def _draw_box_list(tree_plotter, ibox, starts, lists, key_to_box=None, **kwargs):
+    """Draw the boxes contained in one list associated with *ibox*.
+
+    :arg tree_plotter: object whose ``draw_box(ibox, **kwargs)`` method
+        performs the actual drawing.
+    :arg ibox: number of the box whose list is drawn.
+    :arg starts: offsets into *lists*; the entries at ``key`` and ``key + 1``
+        delimit the part of *lists* that is drawn.
+    :arg lists: the concatenated list entries, each of which is passed to
+        ``tree_plotter.draw_box`` as a box number.
+    :arg key_to_box: if *None*, *ibox* itself is used as ``key``. Otherwise
+        ``key`` is the position at which *ibox* occurs in *key_to_box*. If
+        *ibox* does not occur, only an unfilled outline of *ibox* is drawn
+        (in the requested ``facecolor``) to indicate an empty list.
+    :arg kwargs: forwarded to ``tree_plotter.draw_box``, taking precedence
+        over the defaults chosen here. An ``rng`` entry is removed first and
+        used (via its ``random()`` method) to jitter the shrink factor; if
+        it is missing or *None*, :func:`numpy.random.default_rng` is used.
+    """
+    rng = kwargs.pop("rng", None)
+    if rng is None:
+        rng = np.random.default_rng()
+
+    default_facecolor = "blue"
+
+    if key_to_box is not None:
+        ind, = np.where(key_to_box == ibox)
+        if len(ind):
+            key, = ind
+        else:
+            # indicate empty list
+            actual_kwargs = {
+                    # kwargs is a dict: it has to be queried with .get(),
+                    # getattr() would always yield the default color.
+                    "edgecolor": kwargs.get("facecolor", default_facecolor),
+                    "fill": False,
+                    "alpha": 0.5,
+                    "shrink_factor": -0.1+0.1*rng.random(),
+                    }
+            tree_plotter.draw_box(ibox, **actual_kwargs)
+            return
+    else:
+        key = ibox
+
+    start, end = starts[key:key+2]
+    if start == end:
+        # nothing in the list, nothing to draw
+        return
+
+    actual_kwargs = {
+            "facecolor": default_facecolor,
+            "linewidth": 0,
+            "alpha": 0.5,
+            # random shrinkage keeps overlapping boxes distinguishable
+            "shrink_factor": 0.1 + rng.random()*0.2,
+            }
+    actual_kwargs.update(kwargs)
+    print(actual_kwargs["facecolor"], ibox, lists[start:end])
+    for jbox in lists[start:end]:
+        tree_plotter.draw_box(jbox, **actual_kwargs)
+
+
+def draw_same_level_non_well_sep_boxes(tree_plotter, traversal, ibox):
+    """Draw *ibox* (red) along with the boxes in its same-level
+    non-well-separated list (green), as stored in *traversal*.
+    """
+    # the box under consideration
+    tree_plotter.draw_box(ibox, facecolor="red", alpha=0.5)
+
+    # same-level non-well-sep: no key_to_box, so the list is looked up by ibox
+    nws_starts = traversal.same_level_non_well_sep_boxes_starts
+    nws_lists = traversal.same_level_non_well_sep_boxes_lists
+    _draw_box_list(tree_plotter, ibox, nws_starts, nws_lists, facecolor="green")
+
+
+def draw_box_lists(tree_plotter, traversal, ibox):
+    """Draw *ibox* (red) together with the interaction lists that *traversal*
+    holds for it, using one face color per list:
+
+    * green: near neighbors ("list 1")
+    * blue: well-separated siblings ("list 2")
+    * orange: separated smaller boxes ("list 3"), once per level; the
+      "close" variant, if present, is additionally drawn with ``hatch="."``
+    * purple: separated bigger boxes ("list 4"); the "close" variant, if
+      present, is additionally drawn with ``hatch="."``
+    """
+
+    def draw_list(starts, lists, key_to_box, **style):
+        # all lists are drawn for the same plotter and box
+        _draw_box_list(tree_plotter, ibox, starts, lists,
+                key_to_box=key_to_box, **style)
+
+    tree_plotter.draw_box(ibox, facecolor="red", alpha=0.5)
+
+    # from near neighbors ("list 1")
+    draw_list(
+            traversal.neighbor_source_boxes_starts,
+            traversal.neighbor_source_boxes_lists,
+            traversal.target_boxes,
+            facecolor="green")
+
+    # from well-separated siblings (list 2)
+    draw_list(
+            traversal.from_sep_siblings_starts,
+            traversal.from_sep_siblings_lists,
+            traversal.target_or_target_parent_boxes,
+            facecolor="blue")
+
+    # from separated smaller (list 3), stored separately for each level
+    for ilev in range(tree_plotter.tree.nlevels):
+        sep_smaller = traversal.from_sep_smaller_by_level[ilev]
+        draw_list(
+                sep_smaller.starts,
+                sep_smaller.lists,
+                traversal.target_boxes_sep_smaller_by_source_level[ilev],
+                facecolor="orange")
+
+    # list 3 close (only drawn if the traversal has it)
+    close_smaller_starts = traversal.from_sep_close_smaller_starts
+    if close_smaller_starts is not None:
+        draw_list(
+                close_smaller_starts,
+                traversal.from_sep_close_smaller_lists,
+                traversal.target_boxes,
+                facecolor="orange", hatch=".")
+
+    # from separated bigger (list 4)
+    draw_list(
+            traversal.from_sep_bigger_starts,
+            traversal.from_sep_bigger_lists,
+            traversal.target_or_target_parent_boxes,
+            facecolor="purple")
+
+    # list 4 close (only drawn if the traversal has it)
+    close_bigger_starts = traversal.from_sep_close_bigger_starts
+    if close_bigger_starts is not None:
+        draw_list(
+                close_bigger_starts,
+                traversal.from_sep_close_bigger_lists,
+                traversal.target_boxes,
+                facecolor="purple", hatch=".")
+
+# }}}
+
 # vim: filetype=pyopencl:fdm=marker
--- a/doc/.gitignore
+++ b/doc/.gitignore
-_build
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -2,7 +2,7 @@
 #

 # You can set these variables from the command line.
-SPHINXOPTS    =
+SPHINXOPTS    = -n
 SPHINXBUILD   = python $(shell which sphinx-build)
 PAPER         =
 BUILDDIR      = _build

--- a/doc/conf.py
+++ b/doc/conf.py
-from __future__ import absolute_import
-# -*- coding: utf-8 -*-
-#
-# boxtree documentation build configuration file.
-#
-# This file is execfile()d with the current directory set to its containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
+import sys
+from importlib import metadata
+from urllib.request import urlopen

-import sys, os

-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# Fetch the shared sphinx configuration and execute it in this module's
+# namespace, so that the names it defines become settings of this conf.py.
+# NOTE(review): this downloads and exec()s remote code on every docs build,
+# i.e. the build needs network access and fully trusts the URL below.
+_conf_url = \
+        "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
+with urlopen(_conf_url) as _inf:
+    exec(compile(_inf.read(), _conf_url, "exec"), globals())

-# -- General configuration -----------------------------------------------------
+copyright = "2013-21, Andreas Kloeckner"
+release = metadata.version("boxtree")
+version = ".".join(release.split(".")[:2])

-# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be extensions
-# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.mathjax',
-    #'sphinx.ext.viewcode',
-    ]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-autoclass_content = "both"
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'boxtree'
-copyright = u'2013, Andreas Kloeckner'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-ver_dic = {}
-exec(compile(open("../boxtree/version.py").read(), "../boxtree/version.py", 'exec'), ver_dic)
-version = ".".join(str(x) for x in ver_dic["VERSION"])
-# The full version, including alpha/beta/rc tags.
-release = ver_dic["VERSION_TEXT"]
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-
-# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-
-# -- Options for HTML output ---------------------------------------------------
-
-html_theme = "alabaster"
-
-html_theme_options = {
-        "extra_nav_links": {
-            "🚀 Github": "https://github.com/inducer/boxtree",
-            "💾 Download Releases": "https://pypi.python.org/pypi/boxtree",
-            }
-        }
-
-html_sidebars = {
-    '**': [
-        'about.html',
-        'navigation.html',
-        'relations.html',
-        'searchbox.html',
-    ]
+intersphinx_mapping = {
+    "arraycontext": ("https://documen.tician.de/arraycontext", None),
+    "meshmode": ("https://documen.tician.de/meshmode", None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "pyopencl": ("https://documen.tician.de/pyopencl", None),
+    "pytential": ("https://documen.tician.de/pytential", None),
+    "python": ("https://docs.python.org/3", None),
 }

-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = []
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_domain_indices = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'boxtreedoc'
-
-
-# -- Options for LaTeX output --------------------------------------------------
-
-# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
-
-# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-  ('index', 'boxtree.tex', u'boxtree Documentation',
-   u'Andreas Kloeckner', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# If true, show page references after internal links.
-#latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-#latex_show_urls = False
-
-# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_domain_indices = True
-
-
-# -- Options for manual page output --------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'boxtree', u'boxtree Documentation',
-     [u'Andreas Kloeckner'], 1)
+nitpick_ignore_regex = [
+    ["py:class", r"numpy._?typing._generic_alias.ScalarType"],
 ]

-
-# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {
-        'http://docs.python.org/': None,
-        'http://docs.scipy.org/doc/numpy/': None,
-        'http://documen.tician.de/pyopencl': None,
-        }
+# Some modules need to import things just so that sphinx can resolve symbols in
+# type annotations. Often, we do not want these imports (e.g. of PyOpenCL) when
+# in normal use (because they would introduce unintended side effects or hard
+# dependencies). This flag exists so that these imports only occur during doc
+# build. Since sphinx appears to resolve type hints lexically (as it should),
+# this needs to be cross-module (e.g. an inherited arraycontext docstring can
+# be read by sphinx when building meshmode, a dependent package). It therefore
+# needs a setting of the same name across all packages involved, which is why
+# this name is as global-sounding as it is.
+sys._BUILDING_SPHINX_DOCS = True
--- a/doc/cost.rst
+++ b/doc/cost.rst
+FMM Cost Model
+==============
+
+.. automodule:: boxtree.cost