# https://editorconfig.org/
# https://github.com/editorconfig/editorconfig-vim
# https://github.com/editorconfig/editorconfig-emacs
root = true
[*]
indent_style = space
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.py]
indent_size = 4
[*.rst]
indent_size = 4
[*.cpp]
indent_size = 2
[*.hpp]
indent_size = 2
# There may be one in doc/
[Makefile]
indent_style = tab
# https://github.com/microsoft/vscode/issues/1679
[*.md]
trim_trailing_whitespace = false
version: 2
updates:
# Set update schedule for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
# vim: sw=4
name: Gitlab mirror
on:
push:
branches:
- main
jobs:
autopush:
name: Automatic push to gitlab.tiker.net
if: startsWith(github.repository, 'inducer/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
mirror_github_to_gitlab
env:
GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }}
# vim: sw=4
name: CI
on:
push:
branches:
- main
pull_request:
schedule:
- cron: '17 3 * * 0'
jobs:
ruff:
name: Ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
pipx install ruff
ruff check
typos:
name: Typos
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@master
pylint:
name: Pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
docs:
name: Documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
pytest3:
name: Conda Python 3
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
py3example:
name: Python 3 Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_examples
downstream_tests:
strategy:
matrix:
downstream_project: [sumpy, pytential]
name: Tests for downstream project ${{ matrix.downstream_project }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
env:
DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }}
run: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
fi
test_downstream "$DOWNSTREAM_PROJECT"
# vim: sw=4
@@ -12,3 +12,5 @@ setuptools.pth
distribute*egg
distribute*tar.gz
a.out
.cache
Python 2.7 Intel CPU:
script:
- py_version=2.7
- export PYOPENCL_TEST=int:pu
- EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- intel-cl-cpu
except:
- tags
Python 2.7 POCL:
script:
- export PY_EXE=python2.7
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- pocl
except:
- tags
Python 3.5 POCL:
script:
- export PY_EXE=python3.5
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3.5
- pocl
except:
- tags
Python 3 POCL:
script: |
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan V:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-v
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan X:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-x
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL K40:
script: |
export PYOPENCL_TEST=portable:k40
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-k40
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL MPI:
script: |
export PYOPENCL_TEST=portable
export EXTRA_INSTALL="numpy mako mpi4py pybind11"
export PYTEST_ADDOPTS="-m 'mpi' --capture=no"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Examples:
script: |
test -n "$SKIP_EXAMPLES" && exit
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
run_examples
tags:
- python3
- pocl
except:
- tags
Pylint:
script: |
export EXTRA_INSTALL="pybind11 numpy mako matplotlib mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
tags:
- python3
except:
- tags
Documentation:
script: |
EXTRA_INSTALL="pybind11 numpy mako mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
tags:
- python3
Ruff:
script: |
pipx install ruff
ruff check
tags:
- docker-runner
except:
- tags
Downstream:
parallel:
matrix:
- DOWNSTREAM_PROJECT: [sumpy, pytential]
tags:
- large-node
- "docker-runner"
script: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
test_downstream "$DOWNSTREAM_PROJECT"
- arg: py-version
val: '3.10'
- arg: extension-pkg-whitelist
val: pyfmmlib
# Needed for boxtree.tools
- arg: init-hook
val: import sys; sys.setrecursionlimit(2000)
name: test-conda-env
channels:
- conda-forge
- nodefaults
dependencies:
- python=3
- git
- numpy
- pocl
- pocl-cuda
- mako
- pyopencl
- islpy
- pyfmmlib
- mpi4py
# Only needed to make pylint succeed
- matplotlib-base
# This is intended to prevent conda from selecting 'external' (i.e. empty) builds
# of OpenMPI to satisfy the MPI dependency of mpi4py. It did so in May 2024, leading
# to confusing failures saying
# 'libmpi.so.40: cannot open shared object file: No such file or directory'.
# https://github.com/conda-forge/openmpi-feedstock/issues/153
# https://conda-forge.org/docs/user/tipsandtricks/#using-external-message-passing-interface-mpi-libraries
- openmpi>=5=h*
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Kloeckner"
given-names: "Andreas"
orcid: "https://orcid.org/0000-0003-1228-519X"
- family-names: "Fikl"
given-names: "Alex"
- family-names: "Gao"
given-names: "Hao"
- family-names: "Fernando"
given-names: "Isuru"
- family-names: "Wala"
given-names: "Matt"
- family-names: "Wei"
given-names: "Xiaoyu"
title: "boxtree"
version: 2023.1
date-released: 2023-10-01
doi: 10.5281/zenodo.8397396
url: "https://github.com/inducer/boxtree"
license: MIT
include test/*.py
include examples/*.py
include doc/*.rst
include doc/Makefile
include doc/*.py
include doc/images/*.png
include doc/_static/*.css
include doc/_templates/*.html
include configure.py
include Makefile.in
include README.rst
include requirements.txt
boxtree: Quad/Octrees, FMM Traversals, Geometric Queries
========================================================
.. image:: https://gitlab.tiker.net/inducer/boxtree/badges/main/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/boxtree/commits/main
.. image:: https://github.com/inducer/boxtree/workflows/CI/badge.svg?branch=main&event=push
:alt: Github Build Status
:target: https://github.com/inducer/boxtree/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
.. image:: https://badge.fury.io/py/boxtree.png
:alt: Python Package Index Release Page
:target: https://pypi.org/project/boxtree/
.. image:: https://zenodo.org/badge/7193697.svg
:alt: Zenodo DOI for latest release
:target: https://zenodo.org/badge/latestdoi/7193697
``boxtree`` is a package that, given some point locations in two or three
dimensions, sorts them into an adaptive quad/octree of boxes, efficiently, in
parallel, using `PyOpenCL <https://mathema.tician.de/software/pyopencl>`__.
It can also generate traversal lists needed for adaptive fast multipole methods
and related algorithms and tree-based look-up tables for geometric proximity.
``boxtree`` is under the MIT license.
Resources:
* `Documentation <https://documen.tician.de/boxtree>`__
* `PyPI package <https://pypi.org/project/boxtree>`__
* `Source Code (GitHub) <https://github.com/inducer/boxtree>`__
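A minimal usage sketch (untested here, loosely modeled on ``examples/demo.py``;
the ``max_particles_in_box`` keyword and the per-axis coordinate layout are
assumptions about the builder interface and may need adjusting):

.. code-block:: python

    import numpy as np
    import pyopencl as cl
    import pyopencl.array
    from pytools.obj_array import make_obj_array

    from boxtree import TreeBuilder

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # 10000 random points in the unit square, one coordinate array per axis
    rng = np.random.default_rng(seed=15)
    particles = make_obj_array([
        cl.array.to_device(queue, rng.uniform(size=10000))
        for _ in range(2)])

    tb = TreeBuilder(ctx)
    tree, _ = tb(queue, particles, max_particles_in_box=30)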
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
@@ -22,31 +20,60 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from boxtree.tree import Tree, TreeOfBoxes, TreeWithLinkedPointSources, box_flags_enum
from boxtree.tree_build import TreeBuilder
from boxtree.tree_of_boxes import (
coarsen_tree_of_boxes,
make_meshmode_mesh_from_leaves,
make_tree_of_boxes_root,
refine_and_coarsen_tree_of_boxes,
refine_tree_of_boxes,
uniformly_refine_tree_of_boxes,
)
__all__ = [
"Tree",
"TreeBuilder",
"TreeOfBoxes",
"TreeWithLinkedPointSources",
"box_flags_enum",
"coarsen_tree_of_boxes",
"make_meshmode_mesh_from_leaves",
"make_tree_of_boxes_root",
"refine_and_coarsen_tree_of_boxes",
"refine_tree_of_boxes",
"uniformly_refine_tree_of_boxes",
]
__doc__ = r"""
:mod:`boxtree` can do three main things:
* it can build quad/octrees (in at least 1D/2D/3D), in one of two modes:
* First, trees can be built as purely a collection of boxes,
see :ref:`tree-of-boxes`. These trees are typically built iteratively,
through refinement and coarsening.
* Second, trees can be built from collections of points,
so that each box contains only a bounded number of these points.
This is the older, less general way to construct trees
that has a fairly heavy focus on point-based Fast Multipole
Methods (FMMs).
See :class:`~Tree` and :class:`~TreeBuilder`.
Note that :class:`~Tree` inherits from :class:`TreeOfBoxes`.
* it can compute fast-multipole-like interaction lists on this tree structure,
see :mod:`boxtree.traversal`. Note that while this traversal generation
builds on the result of particle sorting,
it is completely distinct in the software sense.
* it can compute geometric lookup structures based on a :class:`Tree`,
see :mod:`~boxtree.area_query`.
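As a rough, untested sketch of the first (tree-of-boxes) mode, with the call
signature of :func:`make_tree_of_boxes_root` assumed for illustration only:

.. code-block:: python

    from boxtree import (
        make_tree_of_boxes_root, uniformly_refine_tree_of_boxes)

    # assumed: the root box is described by its lower and upper corners
    tob = make_tree_of_boxes_root(((0., 0.), (1., 1.)))
    tob = uniformly_refine_tree_of_boxes(tob)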
Tree modes
----------
The particle-based :class:`Tree` can operate in three 'modes':
* one where no distinction is made between sources and targets. In this mode,
all participants in the interaction are called 'particles'.
@@ -101,8 +128,8 @@ the point sources have their own orderings:
* **user point source order**
* **tree point source order** (tree/box-sorted)
:attr:`boxtree.tree.TreeWithLinkedPointSources.user_point_source_ids` helps
translate point source arrays into tree order for processing.
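For example, as a sketch (``ltree`` and ``weights_user_order`` are hypothetical
names; the indexing direction is inferred by analogy with ``reorder_sources``
in :mod:`boxtree.constant_one` further below):

.. code-block:: python

    weights_tree_order = weights_user_order[ltree.user_point_source_ids]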
.. _csr:
@@ -133,4 +160,7 @@ This indexing scheme has the following properties:
is automatically the end of the previous one.
"""
# allow namespace packages
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
# vim: filetype=pyopencl:fdm=marker
__copyright__ = """
Copyright (C) 2013 Andreas Kloeckner
Copyright (C) 2016 Matt Wala"""
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,14 +23,25 @@ THE SOFTWARE.
"""
import logging
from functools import partial
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.cltypes
from mako.template import Template
from pytools import ProcessLogger, memoize_method
from boxtree.tools import (
AXIS_NAMES,
DeviceDataRecord,
coord_vec_subscript_code,
get_coord_vec_dtype,
)
logger = logging.getLogger(__name__)
@@ -44,6 +54,20 @@ Area queries (Balls -> overlapping leaves)
.. autoclass:: AreaQueryResult
Inverse of area query (Leaves -> overlapping balls)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: LeavesToBallsLookupBuilder
.. autoclass:: LeavesToBallsLookup
Space invader queries
^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: SpaceInvaderQueryBuilder
Peer Lists
^^^^^^^^^^
@@ -86,8 +110,8 @@ class AreaQueryResult(DeviceDataRecord):
.. attribute:: leaves_near_ball_starts
Indices into :attr:`leaves_near_ball_lists`.
``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]:
leaves_near_ball_starts[ball_nr + 1]]``
results in a list of leaf boxes that intersect `ball_nr`.
@@ -98,138 +122,218 @@ class AreaQueryResult(DeviceDataRecord):
.. versionadded:: 2016.1
"""
# }}}
# {{{ kernel templates
AREA_QUERY_TEMPLATE = r"""//CL//
typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
typedef ${dtype_to_ctype(peer_list_idx_dtype)} peer_list_idx_t;
class LeavesToBallsLookup(DeviceDataRecord):
"""
.. attribute:: tree
<%def name="add_box_to_list_if_overlaps_ball(box_id)">
{
bool is_overlapping;
The :class:`boxtree.Tree` instance used to build this lookup.
${check_l_infty_ball_overlap(
"is_overlapping", box_id, "ball_radius", "ball_center")}
.. attribute:: balls_near_box_starts
if (is_overlapping)
{
APPEND_leaves(${box_id});
}
}
</%def>
Indices into :attr:`balls_near_box_lists`.
``balls_near_box_lists[balls_near_box_starts[ibox]:
balls_near_box_starts[ibox + 1]]``
results in a list of balls that overlap leaf box *ibox*.
void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
{
coord_vec_t ball_center;
%for i in range(dimensions):
ball_center.${AXIS_NAMES[i]} = ball_${AXIS_NAMES[i]}[ball_nr];
%endfor
.. note:: Only leaf boxes have non-empty entries in this table. Nonetheless,
this list is indexed by the global box index.
coord_t ball_radius = ball_radii[ball_nr];
.. attribute:: balls_near_box_lists
///////////////////////////////////
// Step 1: Find the guiding box. //
///////////////////////////////////
.. automethod:: get
"""
${walk_init(0)}
box_id_t guiding_box;
# }}}
if (LEVEL_TO_RAD(0) < ball_radius / 2 || !(box_flags[0] & BOX_HAS_CHILDREN))
{
guiding_box = 0;
continue_walk = false;
}
while (continue_walk)
{
// Get the next child.
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
# {{{ kernel templates
bool last_child = walk_morton_nr == ${2**dimensions - 1};
GUIDING_BOX_FINDER_MACRO = r"""//CL:mako//
<%def name="initialize_coord_vec(vector_name, entries)">
<% assert len(entries) == dimensions %>
${vector_name} = (coord_vec_t) (${", ".join(entries)});
</%def>
if (child_box_id)
<%def name="find_guiding_box(ball_center, ball_radius, box='guiding_box')">
box_id_t ${box} = 0;
{
bool contains_ball_center;
int child_level = walk_level + 1;
coord_t child_rad = LEVEL_TO_RAD(child_level);
//
// Step 1: Ensure that the center is within the bounding box.
//
coord_vec_t query_center, bbox_min, bbox_max;
${initialize_coord_vec(
"bbox_min", ["bbox_min_" + ax for ax in AXIS_NAMES[:dimensions]])}
// bbox_max should be smaller than the true bounding box, so that
// (query_center - bbox_min) / root_extent entries are in the half open
// interval [0, 1).
bbox_max = bbox_min + (coord_t) (
root_extent / (1 + ${root_extent_stretch_factor}));
query_center = min(bbox_max, max(bbox_min, ${ball_center}));
//
// Step 2: Compute the query radius. This can be effectively
// smaller than the original ball radius, if the center
// isn't in the bounding box (see picture):
//
// +-----------+
// | |
// | |
// | |
// +-------|-+ |
// | | | |
// | +-----------+ <= bounding box
// | |
// | |
// +---------+ <= query box
//
// <---> <= original radius
// <> <= effective radius
//
coord_t query_radius = 0;
%for mnr in range(2**dimensions):
{
// Check if the child contains the ball's center.
${load_center("child_center", "child_box_id")}
coord_vec_t offset;
${initialize_coord_vec("offset",
["{sign}{ball_radius}".format(
sign="+" if (2**(dimensions-1-idim) & mnr) else "-",
ball_radius=ball_radius)
for idim in range(dimensions)])}
coord_vec_t corner = min(
bbox_max, max(bbox_min, ${ball_center} + offset));
coord_t max_dist = 0;
coord_vec_t dist = fabs(corner - query_center);
%for i in range(dimensions):
max_dist = fmax(max_dist,
fabs(ball_center.s${i} - child_center.s${i}));
query_radius = fmax(query_radius, ${cvec_sub("dist", i)});
%endfor
contains_ball_center = max_dist <= child_rad;
}
%endfor
//
// Step 3: Find the guiding box.
//
if (contains_ball_center)
// Descend when root is not the guiding box.
if (LEVEL_TO_RAD(0) / 2 >= query_radius)
{
if ((child_rad / 2 < ball_radius && ball_radius <= child_rad) ||
!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
for (unsigned box_level = 0;; ++box_level)
{
guiding_box = child_box_id;
break;
}
if (/* Found leaf? */
!(box_flags[${box}] & BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES)
/* Found guiding box? */
|| (LEVEL_TO_RAD(box_level) / 2 < query_radius
&& query_radius <= LEVEL_TO_RAD(box_level)))
{
break;
}
// Find the child containing the ball center.
//
// Logic intended to match the morton nr scan kernel.
coord_vec_t offset_scaled =
(query_center - bbox_min) / root_extent;
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
continue;
// Invariant: offset_scaled entries are in [0, 1).
%for ax in AXIS_NAMES[:dimensions]:
unsigned ${ax}_bits = (unsigned) (
offset_scaled.${ax} * (1U << (1 + box_level)));
%endfor
// Pick off the lowest-order bit for each axis, put it in
// its place.
int level_morton_number = 0
%for iax, ax in enumerate(AXIS_NAMES[:dimensions]):
| (${ax}_bits & 1U) << (${dimensions-1-iax})
%endfor
;
box_id_t next_box = box_child_ids[
level_morton_number * aligned_nboxes + ${box}];
if (next_box)
{
${box} = next_box;
}
else
{
// Child does not exist, this must be the guiding box
break;
}
}
}
}
</%def>
"""
if (last_child)
{
// This box has no children that contain the center, so it must
// be the guiding box.
guiding_box = walk_box_id;
break;
}
${walk_advance()}
}
AREA_QUERY_WALKER_BODY = r"""
coord_vec_t ball_center;
coord_t ball_radius;
${get_ball_center_and_radius("ball_center", "ball_radius", "i")}
///////////////////////////////////
// Step 1: Find the guiding box. //
///////////////////////////////////
${find_guiding_box("ball_center", "ball_radius")}
//////////////////////////////////////////////////////
// Step 2 - Walk the peer boxes to find the leaves. //
//////////////////////////////////////////////////////
for (peer_list_idx_t pb_i = peer_list_starts[guiding_box],
pb_e = peer_list_starts[guiding_box+1]; pb_i < pb_e; ++pb_i)
{
box_id_t peer_box = peer_lists[pb_i];
if (!(box_flags[peer_box] & BOX_HAS_CHILDREN))
if (!(box_flags[peer_box] & BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
${add_box_to_list_if_overlaps_ball("peer_box")}
bool is_overlapping;
${check_l_infty_ball_overlap(
"is_overlapping", "peer_box", "ball_radius", "ball_center")}
if (is_overlapping)
{
${leaf_found_op("peer_box", "ball_center", "ball_radius")}
}
}
else
{
${walk_reset("peer_box")}
${walk_init("peer_box")}
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
if (child_box_id)
if (walk_box_id)
{
if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
if (!(box_flags[walk_box_id]
& BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
${add_box_to_list_if_overlaps_ball("child_box_id")}
bool is_overlapping;
${check_l_infty_ball_overlap(
"is_overlapping", "walk_box_id",
"ball_radius", "ball_center")}
if (is_overlapping)
{
${leaf_found_op(
"walk_box_id", "ball_center", "ball_radius")}
}
}
else
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
}
@@ -238,9 +342,34 @@ void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
}
}
}
}
"""
AREA_QUERY_TEMPLATE = (
GUIDING_BOX_FINDER_MACRO + r"""//CL//
typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
typedef ${dtype_to_ctype(peer_list_idx_dtype)} peer_list_idx_t;
<%def name="get_ball_center_and_radius(ball_center, ball_radius, i)">
%for ax in AXIS_NAMES[:dimensions]:
${ball_center}.${ax} = ball_${ax}[${i}];
%endfor
${ball_radius} = ball_radii[${i}];
</%def>
<%def name="leaf_found_op(leaf_box_id, ball_center, ball_radius)">
APPEND_leaves(${leaf_box_id});
</%def>
void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t i)
{
"""
+ AREA_QUERY_WALKER_BODY
+ """
}
""")
PEER_LIST_FINDER_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
@@ -262,27 +391,27 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
if (child_box_id)
if (walk_box_id)
{
${load_center("child_center", "child_box_id")}
${load_center("walk_center", "walk_box_id")}
// child_box_id lives on walk_level+1.
// walk_box_id lives on level walk_stack_size+1.
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, level, child_center, walk_level+1, false);
center, level, walk_center, walk_stack_size+1);
if (a_or_o)
{
// child_box_id lives on walk_level+1.
if (walk_level+1 == level)
// walk_box_id lives on level walk_stack_size+1.
if (walk_stack_size+1 == level)
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
else if (!(box_flags[walk_box_id]
& BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else
{
@@ -295,25 +424,24 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
++morton_nr)
{
box_id_t next_child_id = box_child_ids[
morton_nr * aligned_nboxes + child_box_id];
morton_nr * aligned_nboxes + walk_box_id];
if (next_child_id)
{
${load_center("next_child_center", "next_child_id")}
${load_center("next_walk_center", "next_child_id")}
must_be_peer &= !is_adjacent_or_overlapping(root_extent,
center, level, next_child_center, walk_level+2,
false);
center, level, next_walk_center, walk_stack_size+2);
}
}
if (must_be_peer)
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
}
@@ -326,21 +454,203 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
"""
from pyopencl.elementwise import ElementwiseTemplate
from boxtree.tools import InlineBinarySearch
STARTS_EXPANDER_TEMPLATE = ElementwiseTemplate(
arguments=r"""
idx_t *dst,
idx_t *starts,
idx_t starts_len
""",
operation=r"""//CL//
/* Find my index in starts, place the index in dst. */
dst[i] = bsearch(starts, starts_len, i);
""",
name="starts_expander",
preamble=str(InlineBinarySearch("idx_t")))
# }}}
# {{{ area query elementwise template
class AreaQueryElementwiseTemplate:
"""
Experimental: Intended as a way to perform operations in the body of an area
query.
"""
@staticmethod
def unwrap_args(tree, peer_lists, *args):
return (tree.box_centers,
tree.root_extent,
tree.box_levels,
tree.aligned_nboxes,
tree.box_child_ids,
tree.box_flags,
peer_lists.peer_list_starts,
peer_lists.peer_lists,
*tree.bounding_box[0],
*args
)
def __init__(self, extra_args, ball_center_and_radius_expr,
leaf_found_op, preamble="", name="area_query_elwise"):
def wrap_in_macro(decl, expr):
return f"""
<%def name=\"{decl}\">
{expr}
</%def>
"""
from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
self.elwise_template = ElementwiseTemplate(
arguments=r"""//CL:mako//
coord_t *box_centers,
coord_t root_extent,
box_level_t *box_levels,
box_id_t aligned_nboxes,
box_id_t *box_child_ids,
box_flags_t *box_flags,
peer_list_idx_t *peer_list_starts,
box_id_t *peer_lists,
%for ax in AXIS_NAMES[:dimensions]:
coord_t bbox_min_${ax},
%endfor
""" + extra_args,
operation="//CL:mako//\n"
+ wrap_in_macro(
"get_ball_center_and_radius(ball_center, ball_radius, i)",
ball_center_and_radius_expr)
+ wrap_in_macro(
"leaf_found_op(leaf_box_id, ball_center, ball_radius)",
leaf_found_op)
+ TRAVERSAL_PREAMBLE_MAKO_DEFS
+ GUIDING_BOX_FINDER_MACRO
+ AREA_QUERY_WALKER_BODY,
name=name,
preamble=preamble)
def generate(self, context,
dimensions, coord_dtype, box_id_dtype,
peer_list_idx_dtype, max_levels,
extra_var_values=(), extra_type_aliases=(),
extra_preamble=""):
from pyopencl.cltypes import vec_types
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
from boxtree.tree_build import TreeBuilder
render_vars = (
("np", np),
("dimensions", dimensions),
("dtype_to_ctype", dtype_to_ctype),
("box_id_dtype", box_id_dtype),
("particle_id_dtype", None),
("coord_dtype", coord_dtype),
("get_coord_vec_dtype", get_coord_vec_dtype),
("cvec_sub", partial(coord_vec_subscript_code, dimensions)),
("max_levels", max_levels),
("AXIS_NAMES", AXIS_NAMES),
("box_flags_enum", box_flags_enum),
("peer_list_idx_dtype", peer_list_idx_dtype),
("debug", False),
("root_extent_stretch_factor", TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR),
# FIXME This gets used in pytential with a template that still uses this:
("vec_types", tuple(vec_types.items())),
)
preamble = Template(
# HACK: box_flags_t and coord_t are defined here and
# in the template below, so disable typedef redefinition warnings.
"""
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtypedef-redefinition"
"""
+ TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
+ """
#pragma clang diagnostic pop
""",
strict_undefined=True).render(**dict(render_vars))
return self.elwise_template.build(context,
type_aliases=(
("coord_t", coord_dtype),
("box_id_t", box_id_dtype),
("peer_list_idx_t", peer_list_idx_dtype),
("box_level_t", np.uint8),
("box_flags_t", box_flags_enum.dtype),
*extra_type_aliases,
),
var_values=render_vars + extra_var_values,
more_preamble=preamble + extra_preamble)
SPACE_INVADER_QUERY_TEMPLATE = AreaQueryElementwiseTemplate(
extra_args="""
coord_t *ball_radii,
float *outer_space_invader_dists,
%for ax in AXIS_NAMES[:dimensions]:
coord_t *ball_${ax},
%endfor
""",
ball_center_and_radius_expr=r"""
${ball_radius} = ball_radii[${i}];
%for ax in AXIS_NAMES[:dimensions]:
${ball_center}.${ax} = ball_${ax}[${i}];
%endfor
""",
leaf_found_op=r"""
{
${load_center("leaf_center", leaf_box_id)}
coord_t max_dist = 0;
%for i in range(dimensions):
max_dist = fmax(max_dist,
distance(
${cvec_sub(ball_center, i)},
${cvec_sub("leaf_center", i)}));
%endfor
// The atomic max operation supports only integer types.
// However, max_dist is of a floating point type.
// For comparison purposes we reinterpret the bits of max_dist
// as an integer. The comparison result is the same as for positive
// IEEE floating point numbers, so long as the float/int endianness
// matches (fingers crossed).
atomic_max(
(volatile __global int *)
&outer_space_invader_dists[${leaf_box_id}],
as_int((float) max_dist));
}""",
name="space_invader_query")
# }}}
# {{{ area query build
class AreaQueryBuilder:
r"""Given a set of :math:`l^\infty` "balls", this class helps build a
look-up table from ball to leaf boxes that intersect with the ball.
.. versionadded:: 2016.1
.. automethod:: __init__
.. automethod:: __call__
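A rough usage sketch (untested; ``ctx``, ``queue``, ``tree``, ``ball_centers``,
``ball_radii`` and the ball index ``i`` are assumed to exist as described in
:meth:`__call__`):

.. code-block:: python

    aqb = AreaQueryBuilder(ctx)
    aq, evt = aqb(queue, tree, ball_centers, ball_radii)

    # leaf boxes overlapping ball i, using the CSR-style convention above
    leaves_i = aq.leaves_near_ball_lists[
        aq.leaves_near_ball_starts[i]:aq.leaves_near_ball_starts[i + 1]]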
"""
def __init__(self, context):
self.context = context
self.peer_list_finder = PeerListFinder(self.context)
# {{{ Kernel generation
@@ -348,45 +658,52 @@ class AreaQueryBuilder(object):
def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype,
ball_id_dtype, peer_list_idx_dtype, max_levels):
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
logger.info("start building area query kernel")
logger.debug("start building area query kernel")
from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
from boxtree.tree_build import TreeBuilder
template = Template(
TRAVERSAL_PREAMBLE_TEMPLATE
+ AREA_QUERY_TEMPLATE,
strict_undefined=True)
render_vars = {
"np": np,
"dimensions": dimensions,
"dtype_to_ctype": dtype_to_ctype,
"box_id_dtype": box_id_dtype,
"particle_id_dtype": None,
"coord_dtype": coord_dtype,
"get_coord_vec_dtype": get_coord_vec_dtype,
"cvec_sub": partial(coord_vec_subscript_code, dimensions),
"max_levels": max_levels,
"AXIS_NAMES": AXIS_NAMES,
"box_flags_enum": box_flags_enum,
"peer_list_idx_dtype": peer_list_idx_dtype,
"ball_id_dtype": ball_id_dtype,
"debug": False,
"root_extent_stretch_factor": TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR,
}
from boxtree.tools import ScalarArg, VectorArg
arg_decls = [
VectorArg(coord_dtype, "box_centers"),
VectorArg(coord_dtype, "box_centers", with_offset=False),
ScalarArg(coord_dtype, "root_extent"),
VectorArg(np.uint8, "box_levels"),
ScalarArg(box_id_dtype, "aligned_nboxes"),
VectorArg(box_id_dtype, "box_child_ids"),
VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
VectorArg(box_flags_enum.dtype, "box_flags"),
VectorArg(peer_list_idx_dtype, "peer_list_starts"),
VectorArg(box_id_dtype, "peer_lists"),
VectorArg(coord_dtype, "ball_radii"),
] + [
ScalarArg(coord_dtype, "bbox_min_"+ax)
for ax in AXIS_NAMES[:dimensions]
] + [
VectorArg(coord_dtype, "ball_"+ax)
for ax in AXIS_NAMES[:dimensions]]
@@ -400,15 +717,11 @@ class AreaQueryBuilder(object):
count_sharing={},
complex_kernel=True)
logger.info("done building area query kernel")
logger.debug("done building area query kernel")
return area_query_kernel
# }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
@@ -427,8 +740,8 @@ class AreaQueryBuilder(object):
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(aq, event)*, where *aq* is an instance of
:class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
for dependency management.
"""
@@ -442,12 +755,12 @@ class AreaQueryBuilder(object):
ball_id_dtype = tree.particle_id_dtype # ?
from pytools import div_ceil
# Avoid generating too many kernels.
max_levels = div_ceil(tree.nlevels, 10) * 10
if peer_lists is None:
peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
wait_for = [evt]
if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
@@ -457,19 +770,20 @@ class AreaQueryBuilder(object):
tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
peer_lists.peer_list_starts.dtype, max_levels)
logger.info("area query: run area query")
aq_plog = ProcessLogger(logger, "area query")
result, evt = area_query_kernel(
queue, len(ball_radii),
tree.box_centers.data, tree.root_extent,
tree.box_levels, tree.aligned_nboxes,
tree.box_child_ids.data, tree.box_flags,
peer_lists.peer_list_starts,
peer_lists.peer_lists, ball_radii,
*(tuple(tree.bounding_box[0])
+ tuple(bc for bc in ball_centers)),
wait_for=wait_for)
logger.info("area query: done")
aq_plog.done()
return AreaQueryResult(
tree=tree,
@@ -479,9 +793,253 @@ class AreaQueryBuilder(object):
# }}}
# {{{ area query transpose (leaves-to-balls) lookup build
class LeavesToBallsLookupBuilder:
r"""Given a set of :math:`l^\infty` "balls", this class helps build a
look-up table from leaf boxes to balls that overlap with each leaf box.
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
self.context = context
from pyopencl.algorithm import KeyValueSorter
self.key_value_sorter = KeyValueSorter(context)
self.area_query_builder = AreaQueryBuilder(context)
@memoize_method
def get_starts_expander_kernel(self, idx_dtype):
"""
Expands a "starts" array into a length starts[-1] array of increasing
indices:
E.g.: [0 2 5 6] => [0 0 1 1 1 2]
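(For intuition, this matches ``np.repeat(np.arange(len(starts) - 1), np.diff(starts))``.)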
"""
return STARTS_EXPANDER_TEMPLATE.build(
self.context,
type_aliases=(("idx_t", idx_dtype),))
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue`
:arg tree: a :class:`boxtree.Tree`.
:arg ball_centers: an object array of coordinate
:class:`pyopencl.array.Array` instances.
Their *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg ball_radii: a
:class:`pyopencl.array.Array`
of positive numbers.
Its *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg peer_lists: may either be *None* or an instance of
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(lbl, event)*, where *lbl* is an instance of
:class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
for dependency management.
"""
from pytools import single_valued
if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
raise TypeError("ball_centers dtype must match tree.coord_dtype")
if ball_radii.dtype != tree.coord_dtype:
raise TypeError("ball_radii dtype must match tree.coord_dtype")
ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")
area_query, evt = self.area_query_builder(
queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
wait_for = [evt]
logger.debug("leaves-to-balls lookup: expand starts")
nkeys = tree.nboxes
nballs_p_1 = len(area_query.leaves_near_ball_starts)
assert nballs_p_1 == len(ball_radii) + 1
# We invert the area query in two steps:
#
# 1. Turn the area query result into (ball number, box number) pairs.
# This is done in the "starts expander kernel."
#
# 2. Key-value sort the (ball number, box number) pairs by box number.
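#
# For intuition only (toy data, not used in the computation below): with
# leaves_near_ball_starts = [0, 2, 3] (two balls) and
# leaves_near_ball_lists = [4, 7, 4] (leaf box ids), step 1 produces the
# ball numbers [0, 0, 1]; step 2 then groups by box:
# box 4 -> balls [0, 1], box 7 -> ball [0].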
starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
expanded_starts = cl.array.empty(
queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
evt = starts_expander_knl(
expanded_starts,
area_query.leaves_near_ball_starts.with_queue(queue),
nballs_p_1)
wait_for = [evt]
logger.debug("leaves-to-balls lookup: key-value sort")
balls_near_box_starts, balls_near_box_lists, evt \
= self.key_value_sorter(
queue,
# keys
area_query.leaves_near_ball_lists.with_queue(queue),
# values
expanded_starts,
nkeys, starts_dtype=tree.box_id_dtype,
wait_for=wait_for)
ltb_plog.done()
return LeavesToBallsLookup(
tree=tree,
balls_near_box_starts=balls_near_box_starts,
balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
# }}}
# {{{ space invader query build
class SpaceInvaderQueryBuilder:
r"""
Given a set of :math:`l^\infty` "balls", this class helps build a look-up
table which maps leaf boxes to the *outer space invader distance*.
This is defined below but roughly, from the point of view
of a leaf box, it is the farthest "leaf center to ball center" distance among
all balls that intersect the leaf box.
Formally, given a leaf box :math:`b`, the *outer space invader distance* is
defined by the following expression (here :math:`d_\infty` is the
:math:`\infty` norm):
.. math::
\max \left( \{ d_{\infty}(\text{center}(b), \text{center}(b^*))
: b^* \text{ is a ball}, b^* \cap b \neq \varnothing \}
\cup \{ 0 \} \right)
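(In particular, if no ball intersects :math:`b`, the maximum is taken over
:math:`\{0\}` alone and the outer space invader distance is zero.)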
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
self.context = context
self.peer_list_finder = PeerListFinder(self.context)
# {{{ Kernel generation
@memoize_method
def get_space_invader_query_kernel(self, dimensions, coord_dtype,
box_id_dtype, peer_list_idx_dtype, max_levels):
return SPACE_INVADER_QUERY_TEMPLATE.generate(
self.context,
dimensions,
coord_dtype,
box_id_dtype,
peer_list_idx_dtype,
max_levels)
# }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue`
:arg tree: a :class:`boxtree.Tree`.
:arg ball_centers: an object array of coordinate
:class:`pyopencl.array.Array` instances.
Their *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg ball_radii: a
:class:`pyopencl.array.Array`
of positive numbers.
Its *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg peer_lists: may either be *None* or an instance of
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(sqi, event)*, where *sqi* is an instance of
:class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
for dependency management. The *dtype* of *sqi* is
*tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
*(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
The entries of *sqi* are indexed by the global box index and are
as follows:
* if *i* is not the index of a leaf box, *sqi[i] = 0*.
* if *i* is the index of a leaf box, *sqi[i]* is the
outer space invader distance for *i*.
"""
from pytools import single_valued
if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
raise TypeError("ball_centers dtype must match tree.coord_dtype")
if ball_radii.dtype != tree.coord_dtype:
raise TypeError("ball_radii dtype must match tree.coord_dtype")
from pytools import div_ceil
# Avoid generating too many kernels.
max_levels = div_ceil(tree.nlevels, 10) * 10
if peer_lists is None:
peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
wait_for = [evt]
if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
raise ValueError("size of peer lists must match with number of boxes")
space_invader_query_kernel = self.get_space_invader_query_kernel(
tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
peer_lists.peer_list_starts.dtype, max_levels)
si_plog = ProcessLogger(logger, "space invader query")
outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
if not wait_for:
wait_for = []
wait_for = (wait_for
+ outer_space_invader_dists.events
+ ball_radii.events
+ [evt for bc in ball_centers for evt in bc.events])
evt = space_invader_query_kernel(
*SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
tree, peer_lists,
ball_radii,
outer_space_invader_dists,
*tuple(bc for bc in ball_centers)),
wait_for=wait_for,
queue=queue,
range=slice(len(ball_radii)))
if tree.coord_dtype != np.dtype(np.float32):
# The kernel output is always an array of float32 due to limited
# support for atomic operations with float64 in OpenCL.
# Here the output is cast to match the coord dtype.
outer_space_invader_dists.finish()
outer_space_invader_dists = outer_space_invader_dists.astype(
tree.coord_dtype)
evt, = outer_space_invader_dists.events
si_plog.done()
return outer_space_invader_dists, evt
# }}}
# {{{ peer list build
class PeerListFinder:
"""This class builds a look-up table from box numbers to peer boxes. The
full definition [1]_ of a peer box is as follows:
@@ -500,6 +1058,7 @@ class PeerListFinder(object):
.. versionadded:: 2016.1
.. automethod:: __init__
.. automethod:: __call__
"""
@@ -512,12 +1071,15 @@ class PeerListFinder(object):
def get_peer_list_finder_kernel(self, dimensions, coord_dtype,
box_id_dtype, max_levels):
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
logger.info("start building peer list finder kernel")
logger.debug("start building peer list finder kernel")
from boxtree.traversal import (
HELPER_FUNCTION_TEMPLATE,
TRAVERSAL_PREAMBLE_TEMPLATE,
)
template = Template(
TRAVERSAL_PREAMBLE_TEMPLATE
@@ -525,30 +1087,31 @@ class PeerListFinder(object):
+ PEER_LIST_FINDER_TEMPLATE,
strict_undefined=True)
render_vars = {
"np": np,
"dimensions": dimensions,
"dtype_to_ctype": dtype_to_ctype,
"box_id_dtype": box_id_dtype,
"particle_id_dtype": None,
"coord_dtype": coord_dtype,
"get_coord_vec_dtype": get_coord_vec_dtype,
"cvec_sub": partial(coord_vec_subscript_code, dimensions),
"max_levels": max_levels,
"AXIS_NAMES": AXIS_NAMES,
"box_flags_enum": box_flags_enum,
"debug": False,
# For calls to the helper is_adjacent_or_overlapping()
"targets_have_extent": False,
"sources_have_extent": False,
}
from boxtree.tools import ScalarArg, VectorArg
arg_decls = [
VectorArg(coord_dtype, "box_centers"),
VectorArg(coord_dtype, "box_centers", with_offset=False),
ScalarArg(coord_dtype, "root_extent"),
VectorArg(np.uint8, "box_levels"),
ScalarArg(box_id_dtype, "aligned_nboxes"),
VectorArg(box_id_dtype, "box_child_ids"),
VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
VectorArg(box_flags_enum.dtype, "box_flags"),
]
@@ -562,7 +1125,7 @@ class PeerListFinder(object):
count_sharing={},
complex_kernel=True)
logger.info("done building peer list finder kernel")
logger.debug("done building peer list finder kernel")
return peer_list_finder_kernel
# }}}
@@ -579,22 +1142,24 @@ class PeerListFinder(object):
for dependency management.
"""
from pytools import div_ceil
# Round up level count--this gets included in the kernel as
# a stack bound. Rounding avoids too many kernel versions.
max_levels = div_ceil(tree.nlevels, 10) * 10
peer_list_finder_kernel = self.get_peer_list_finder_kernel(
tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)
logger.info("peer list finder: find peer lists")
pl_plog = ProcessLogger(logger, "find peer lists")
result, evt = peer_list_finder_kernel(
queue, tree.nboxes,
tree.box_centers.data, tree.root_extent,
tree.box_levels, tree.aligned_nboxes,
tree.box_child_ids.data, tree.box_flags,
wait_for=wait_for)
logger.info("peer list finder: done")
pl_plog.done()
return PeerListLookup(
tree=tree,
@@ -603,4 +1168,4 @@ class PeerListFinder(object):
# }}}
# vim: fdm=marker
__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
from arraycontext.pytest import (
_PytestPyOpenCLArrayContextFactoryWithClass,
register_pytest_array_context_factory,
)
__doc__ = """
.. autoclass:: PyOpenCLArrayContext
"""
def _acf():
import pyopencl as cl
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
return PyOpenCLArrayContext(queue, force_device_scalars=True)
class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
def transform_loopy_program(self, t_unit):
default_ep = t_unit.default_entrypoint
options = default_ep.options
if not (options.return_dict and options.no_numpy):
raise ValueError("Loopy kernel passed to call_loopy must "
"have return_dict and no_numpy options set. "
"Did you use arraycontext.make_loopy_program "
"to create this kernel?")
return super().transform_loopy_program(t_unit)
class PytestPyOpenCLArrayContextFactory(
_PytestPyOpenCLArrayContextFactoryWithClass):
actx_class = PyOpenCLArrayContext
register_pytest_array_context_factory("boxtree.pyopencl",
PytestPyOpenCLArrayContextFactory)
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
@@ -23,11 +21,13 @@ THE SOFTWARE.
"""
import numpy as np
from pyopencl.reduction import ReductionTemplate
from pytools import memoize, memoize_method
from boxtree.tools import get_type_moniker
@memoize
@@ -35,12 +35,12 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
from boxtree.tools import AXIS_NAMES
fields = []
for i in range(dimensions):
fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype))
fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype))
fields.append((f"min_{AXIS_NAMES[i]}", coord_dtype))
fields.append((f"max_{AXIS_NAMES[i]}", coord_dtype))
dtype = np.dtype(fields)
name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype))
type_moniker = get_type_moniker(coord_dtype)
name = f"boxtree_bbox_{dimensions}d_{type_moniker}_t"
from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
@@ -132,7 +132,7 @@ class BoundingBoxFinder:
@memoize_method
def get_kernel(self, dimensions, coord_dtype, have_radii):
bbox_dtype, _bbox_cdecl = make_bounding_box_dtype(
self.context.devices[0], dimensions, coord_dtype)
from boxtree.tools import AXIS_NAMES
@@ -158,11 +158,7 @@ class BoundingBoxFinder:
from pytools import single_valued
coord_dtype = single_valued(coord.dtype for coord in particles)
radii_tuple = () if radii is None else (radii,)
knl = self.get_kernel(dimensions, coord_dtype,
# have_radii:
radii is not None)
"""
.. autoclass:: ConstantOneTreeIndependentDataForWrangler
.. autoclass:: ConstantOneExpansionWrangler
"""
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
from boxtree.timing import DummyTimingFuture
# {{{ constant one wrangler
class ConstantOneTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
"""
.. automethod:: __init__
"""
class ConstantOneExpansionWrangler(ExpansionWranglerInterface):
"""This implements the 'analytical routines' for a Green's function that is
constant 1 everywhere. For 'charges' of 'ones', this should get every particle
a copy of the particle count.
Timing results returned by this wrangler contain the field *ops_elapsed*,
which counts approximately the number of floating-point operations required.
"""
def _get_source_slice(self, ibox):
pstart = self.tree.box_source_starts[ibox]
return slice(
pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
def _get_target_slice(self, ibox):
pstart = self.tree.box_target_starts[ibox]
return slice(
pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
def multipole_expansion_zeros(self):
return np.zeros(self.tree.nboxes, dtype=np.float64)
local_expansion_zeros = multipole_expansion_zeros
def output_zeros(self):
return np.zeros(self.tree.ntargets, dtype=np.float64)
def reorder_sources(self, source_array):
return source_array[self.tree.user_source_ids]
def reorder_potentials(self, potentials):
return potentials[self.tree.sorted_target_ids]
def multipole_expansions_view(self, mpole_exps, level):
# FIXME
raise NotImplementedError
def local_expansions_view(self, local_exps, level):
# FIXME
raise NotImplementedError
@staticmethod
def timing_future(ops):
return DummyTimingFuture.from_op_count(ops)
def form_multipoles(self, level_start_source_box_nrs, source_boxes,
src_weight_vecs):
src_weights, = src_weight_vecs
mpoles = self.multipole_expansion_zeros()
ops = 0
for ibox in source_boxes:
pslice = self._get_source_slice(ibox)
mpoles[ibox] += np.sum(src_weights[pslice])
ops += src_weights[pslice].size
return mpoles, self.timing_future(ops)
def coarsen_multipoles(self, level_start_source_parent_box_nrs,
source_parent_boxes, mpoles):
tree = self.tree
ops = 0
# nlevels-1 is the last valid level index
# nlevels-2 is the last valid level that could have children
#
# 3 is the last relevant source_level.
# 2 is the last relevant target_level.
# (because no level 1 box will be well-separated from another)
for source_level in range(tree.nlevels-1, 2, -1):
target_level = source_level - 1
start, stop = level_start_source_parent_box_nrs[
target_level:target_level+2]
for ibox in source_parent_boxes[start:stop]:
for child in tree.box_child_ids[:, ibox]:
if child:
mpoles[ibox] += mpoles[child]
ops += 1
return mpoles, self.timing_future(ops)
def eval_direct(self, target_boxes, neighbor_sources_starts,
neighbor_sources_lists, src_weight_vecs):
src_weights, = src_weight_vecs
pot = self.output_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_boxes):
tgt_pslice = self._get_target_slice(tgt_ibox)
src_sum = 0
nsrcs = 0
start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
# print "DIR: %s <- %s" % (tgt_ibox, neighbor_sources_lists[start:end])
for src_ibox in neighbor_sources_lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
src_sum += np.sum(src_weights[src_pslice])
pot[tgt_pslice] = src_sum
ops += pot[tgt_pslice].size * nsrcs
return pot, self.timing_future(ops)
def multipole_to_local(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes,
starts, lists, mpole_exps):
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
contrib = 0
# print tgt_ibox, "<-", lists[start:end]
for src_ibox in lists[start:end]:
contrib += mpole_exps[src_ibox]
ops += 1
local_exps[tgt_ibox] += contrib
return local_exps, self.timing_future(ops)
def eval_multipoles(self,
target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
mpole_exps):
pot = self.output_zeros()
ops = 0
for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level):
for itgt_box, tgt_ibox in \
enumerate(target_boxes_by_source_level[level]):
tgt_pslice = self._get_target_slice(tgt_ibox)
contrib = 0
start, end = ssn.starts[itgt_box:itgt_box+2]
for src_ibox in ssn.lists[start:end]:
contrib += mpole_exps[src_ibox]
pot[tgt_pslice] += contrib
ops += pot[tgt_pslice].size * (end - start)
return pot, self.timing_future(ops)
def form_locals(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, starts, lists, src_weight_vecs):
src_weights, = src_weight_vecs
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
# print "LIST 4", tgt_ibox, "<-", lists[start:end]
contrib = 0
nsrcs = 0
for src_ibox in lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
contrib += np.sum(src_weights[src_pslice])
local_exps[tgt_ibox] += contrib
ops += nsrcs
return local_exps, self.timing_future(ops)
def refine_locals(self, level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, local_exps):
ops = 0
for target_lev in range(1, self.tree.nlevels):
start, stop = level_start_target_or_target_parent_box_nrs[
target_lev:target_lev+2]
for ibox in target_or_target_parent_boxes[start:stop]:
local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
ops += 1
return local_exps, self.timing_future(ops)
def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
pot = self.output_zeros()
ops = 0
for ibox in target_boxes:
tgt_pslice = self._get_target_slice(ibox)
pot[tgt_pslice] += local_exps[ibox]
ops += pot[tgt_pslice].size
return pot, self.timing_future(ops)
def finalize_potentials(self, potentials, template_ary):
return potentials
# }}}
# vim: foldmethod=marker:filetype=pyopencl
__copyright__ = """
Copyright (C) 2013 Andreas Kloeckner
Copyright (C) 2018 Matt Wala
Copyright (C) 2018 Hao Gao
"""
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
This module helps predict the running time of each step of the FMM.
:class:`FMMTranslationCostModel` describes the translation or evaluation cost of a
single operation. For example, *m2p* describes the cost of evaluating a single
multipole expansion at a single target.
:class:`AbstractFMMCostModel` uses :class:`FMMTranslationCostModel` and calibration
parameters to compute the total cost of each step of the FMM in each box. It is
implemented by :class:`FMMCostModel`.
:file:`examples/cost_model.py` demonstrates how the calibration and evaluation
are performed.
A similar module in *pytential* extends the functionality of this module to
incorporate QBX-specific operations.
Translation Cost of a Single Operation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: FMMTranslationCostModel
.. autofunction:: make_pde_aware_translation_cost_model
.. autofunction:: make_taylor_translation_cost_model
Cost Model Classes
^^^^^^^^^^^^^^^^^^
.. autoclass:: AbstractFMMCostModel
.. autoclass:: FMMCostModel
"""
from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import partial
from typing import ClassVar
import numpy as np
from mako.template import Template
import pyopencl as cl
import pyopencl.array
from pymbolic import evaluate, var
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
Template = partial(Template, strict_undefined=True)
# {{{ FMMTranslationCostModel
class FMMTranslationCostModel:
"""Provides modeled costs for individual translations or evaluations.
.. note::
        The current implementation assumes the modeled cost is linear in each
        calibration parameter. For example,
        `var("c_p2l") * self.ncoeffs_fmm_by_level[level]` is valid, but
        `var("c_p2l") ** 2 * self.ncoeffs_fmm_by_level[level]` is not.
    .. automethod:: __init__
"""
def __init__(self, ncoeffs_fmm_by_level, uses_point_and_shoot):
self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level
self.uses_point_and_shoot = uses_point_and_shoot
@staticmethod
def direct():
return var("c_p2p")
def p2l(self, level):
return var("c_p2l") * self.ncoeffs_fmm_by_level[level]
def l2p(self, level):
return var("c_l2p") * self.ncoeffs_fmm_by_level[level]
def p2m(self, level):
return var("c_p2m") * self.ncoeffs_fmm_by_level[level]
def m2p(self, level):
return var("c_m2p") * self.ncoeffs_fmm_by_level[level]
def m2m(self, src_level, tgt_level):
return var("c_m2m") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def l2l(self, src_level, tgt_level):
return var("c_l2l") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def m2l(self, src_level, tgt_level):
return var("c_m2l") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def e2e_cost(self, nsource_coeffs, ntarget_coeffs):
if self.uses_point_and_shoot:
return (
# Rotate the coordinate system to be z axis aligned.
nsource_coeffs ** (3 / 2)
# Translate the expansion along the z axis.
+ nsource_coeffs ** (1 / 2) * ntarget_coeffs
# Rotate the coordinate system back.
+ ntarget_coeffs ** (3 / 2))
return nsource_coeffs * ntarget_coeffs
# }}}
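# A worked instance of the point-and-shoot estimate above (the coefficient counts
# are made up for illustration): translating an expansion with 100 source
# coefficients into one with 225 target coefficients is modeled as
# 100**1.5 + 100**0.5 * 225 + 225**1.5 = 1000 + 2250 + 3375 = 6625 operations,
# as opposed to 100 * 225 = 22500 for the dense (non-rotated) translation.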
# {{{ translation cost model factories
def make_pde_aware_translation_cost_model(dim, nlevels):
"""Create a cost model for FMM translation operators that make use of the
knowledge that the potential satisfies a PDE.
For example, this factory is used for complex Taylor and Fourier-Bessel
expansions in 2D, and spherical harmonics (with point-and-shoot) in 3D.
"""
p_fmm = np.array([var(f"p_fmm_lev{i}") for i in range(nlevels)])
ncoeffs_fmm = (p_fmm + 1) ** (dim - 1)
uses_point_and_shoot = dim == 3
return FMMTranslationCostModel(
ncoeffs_fmm_by_level=ncoeffs_fmm,
uses_point_and_shoot=uses_point_and_shoot
)
def make_taylor_translation_cost_model(dim, nlevels):
"""Create a cost model for FMM translation based on Taylor expansions
in Cartesian coordinates.
"""
p_fmm = np.array([var(f"p_fmm_lev{i}") for i in range(nlevels)])
ncoeffs_fmm = (p_fmm + 1) ** dim
return FMMTranslationCostModel(
ncoeffs_fmm_by_level=ncoeffs_fmm,
uses_point_and_shoot=False
)
# }}}
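# A minimal usage sketch of the factories above. The function name and all numeric
# values here are illustrative only (not part of the library API): the factories
# return symbolic per-level costs, which become concrete numbers once expansion
# orders and calibration parameters are supplied via pymbolic.
def _demo_translation_cost_model():
    # 2D PDE-aware model on a 3-level tree: ncoeffs per level is p_fmm + 1.
    xlat_cost = make_pde_aware_translation_cost_model(dim=2, nlevels=3)
    context = {
        "p_fmm_lev0": 10, "p_fmm_lev1": 10, "p_fmm_lev2": 10,
        "c_p2m": 1.0, "c_m2l": 1.0,
    }
    # c_p2m * (p_fmm_lev1 + 1) evaluates to 11.0.
    p2m_cost_lev1 = evaluate(xlat_cost.p2m(1), context=context)
    # c_m2l * (p_fmm_lev1 + 1)**2 evaluates to 121.0 (no point-and-shoot in 2D).
    m2l_cost_lev1 = evaluate(xlat_cost.m2l(1, 1), context=context)
    return p2m_cost_lev1, m2l_cost_lev1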
# {{{ AbstractFMMCostModel
class AbstractFMMCostModel(ABC):
"""An interface to obtain both FMM operation counts and calibrated (e.g. in
seconds) cost estimates.
* To obtain operation counts only, use :meth:`get_unit_calibration_params`
with :meth:`cost_per_stage` or :meth:`cost_per_box`.
* To calibrate the model, pass operation counts together with timing data
to :meth:`estimate_calibration_params`.
* To evaluate the calibrated models, pass the calibration parameters
from :meth:`estimate_calibration_params` to :meth:`cost_per_stage` or
:meth:`cost_per_box`.
.. automethod:: __init__
.. ------------------------------------------------------------------------
.. rubric:: Evaluation
.. ------------------------------------------------------------------------
.. automethod:: cost_per_box
.. automethod:: cost_per_stage
.. ------------------------------------------------------------------------
.. rubric:: Calibration
.. ------------------------------------------------------------------------
.. automethod:: estimate_calibration_params
.. ------------------------------------------------------------------------
.. rubric:: Utilities
.. ------------------------------------------------------------------------
.. automethod:: aggregate_over_boxes
.. automethod:: get_unit_calibration_params
.. automethod:: get_ndirect_sources_per_target_box
"""
def __init__(
self,
translation_cost_model_factory=make_pde_aware_translation_cost_model):
"""
        :arg translation_cost_model_factory: a function that takes the tree
            dimension and the number of tree levels as arguments and returns an
            object of :class:`FMMTranslationCostModel`.
"""
self.translation_cost_model_factory = translation_cost_model_factory
@abstractmethod
def process_form_multipoles(self, queue, traversal, p2m_cost):
"""Cost for forming multipole expansions of each box.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) representing the cost of forming the multipole
expansion of one source at each level.
        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
            (nsource_boxes,), with each entry representing the cost of that box.
"""
pass
@abstractmethod
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
"""Cost for upward propagation.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels-1,), where the ith entry represents the
multipole-to-multipole cost from source level i+1 to target level i.
:return: a :class:`float`, the overall cost of upward propagation.
        .. note:: This method returns a number instead of an array, because it is
            not immediately clear how the per-box cost of upward propagation would
            be useful for distributed load balancing.
"""
pass
@abstractmethod
def get_ndirect_sources_per_target_box(self, queue, traversal):
"""Collect the number of direct evaluation sources (list 1, list 3 close and
list 4 close) for each target box.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_boxes,), with each entry representing the number of direct
evaluation sources for that target box.
"""
pass
@abstractmethod
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
"""Direct evaluation cost of each target box of *traversal*.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry
representing the number of direct evaluation sources for that target box.
:arg p2p_cost: a constant representing the cost of one point-to-point
evaluation.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
using direct evaluation in this box. For example, this is useful in QBX
by specifying the number of non-QBX targets. If None, all targets in
boxes are considered.
        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
            (ntarget_boxes,), with each entry representing the cost of that box.
"""
pass
@abstractmethod
def process_list2(self, queue, traversal, m2l_cost):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) representing the translation cost of each level.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_or_target_parent_boxes,), with each entry representing the cost
of multipole-to-local translations to this box.
"""
pass
@abstractmethod
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the evaluation cost
from multipole expansion at level i to a point.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
            using multipole-to-point translations in this box. For example, this is
useful in QBX by specifying the number of non-QBX targets. If None, all
targets in boxes are considered.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(nboxes,), with each entry representing the cost of evaluating all
targets inside this box from multipole expansions of list-3 boxes.
"""
pass
@abstractmethod
def process_list4(self, queue, traversal, p2l_cost):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the translation cost
from a point to the local expansion at level i.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_or_target_parent_boxes,), with each entry representing the cost
of point-to-local translations to this box.
"""
pass
@abstractmethod
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the cost of evaluating
the potential of a target in a box of level i using the box's local
expansion.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, use
traversal.tree.box_target_counts_nonchild.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_boxes,), the cost of evaluating the potentials of all targets
inside this box from its local expansion.
"""
pass
@abstractmethod
def process_refine_locals(self, queue, traversal, l2l_cost):
"""Cost of downward propagation.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape ``(nlevels-1,)``, where the :math:`i`th entry represents
the cost of translating local expansion from level :math:`i` to
level :math:`i+1`.
:return: a :class:`float`, the overall cost of downward propagation.
        .. note:: This method returns a number instead of an array, because it is
            not immediately clear how the per-box cost of downward propagation would
            be useful for distributed load balancing.
"""
pass
@abstractmethod
def aggregate_over_boxes(self, per_box_result):
"""Sum all entries of *per_box_result* into a number.
        :arg per_box_result: a :class:`numpy.ndarray` or
            :class:`pyopencl.array.Array` object, the result to be summed.
        :return: a :class:`float`, the result of the sum.
"""
pass
@staticmethod
def cost_factors_to_dev(cost_factors, queue):
cost_factors_dev = {}
for name in cost_factors:
if not isinstance(cost_factors[name], np.ndarray):
cost_factors_dev[name] = cost_factors[name]
continue
cost_factors_dev[name] = cl.array.to_device(
queue, cost_factors[name]
).with_queue(None)
return cost_factors_dev
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
"""Evaluate translation cost factors from symbolic model. The result of this
function can be used for process_* methods in this class.
:arg queue: If not None, the cost factor arrays will be transferred to device
using this queue.
:arg nlevels: the number of tree levels.
:arg xlat_cost: a :class:`FMMTranslationCostModel`.
:arg context: a :class:`dict` of parameters passed as context when
evaluating symbolic expressions in *xlat_cost*.
:return: a :class:`dict`, the translation cost of each step in FMM.
"""
cost_factors = {
"p2m_cost": np.array([
evaluate(xlat_cost.p2m(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"m2m_cost": np.array([
evaluate(xlat_cost.m2m(ilevel+1, ilevel), context=context)
for ilevel in range(nlevels-1)
], dtype=np.float64),
"c_p2p": evaluate(xlat_cost.direct(), context=context),
"m2l_cost": np.array([
evaluate(xlat_cost.m2l(ilevel, ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"m2p_cost": np.array([
evaluate(xlat_cost.m2p(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"p2l_cost": np.array([
evaluate(xlat_cost.p2l(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"l2l_cost": np.array([
evaluate(xlat_cost.l2l(ilevel, ilevel+1), context=context)
for ilevel in range(nlevels-1)
], dtype=np.float64),
"l2p_cost": np.array([
evaluate(xlat_cost.l2p(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64)
}
if queue:
cost_factors = self.cost_factors_to_dev(cost_factors, queue)
return cost_factors
@abstractmethod
def zero_cost_per_box(self, queue, nboxes):
"""Helper function for returning the per-box cost filled with 0.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:param nboxes: the number of boxes
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(*nboxes*,), representing the zero per-box cost.
"""
pass
def cost_per_box(self, queue, traversal, level_to_order,
calibration_params,
ndirect_sources_per_target_box=None,
box_target_counts_nonchild=None):
"""Predict the per-box costs of a new traversal object.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg level_to_order: a :class:`numpy.ndarray` of shape
(traversal.tree.nlevels,) representing the expansion orders
of different levels.
:arg calibration_params: a :class:`dict` of calibration parameters. These
parameters can be obtained via :meth:`estimate_calibration_params`
or :meth:`get_unit_calibration_params`.
:arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
direct evaluation sources (list 1, list 3 close, list 4 close) for each
target box. You may find :meth:`get_ndirect_sources_per_target_box`
helpful. This argument is useful because the same result can be reused
for p2p, p2qbxl and tsqbx.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, all targets are considered,
namely traversal.tree.box_target_counts_nonchild.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(nboxes,), where the ith entry represents the cost of all stages for box
i.
"""
if ndirect_sources_per_target_box is None:
ndirect_sources_per_target_box = (
self.get_ndirect_sources_per_target_box(queue, traversal)
)
tree = traversal.tree
nboxes = tree.nboxes
source_boxes = traversal.source_boxes
target_boxes = traversal.target_boxes
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
result = self.zero_cost_per_box(queue, nboxes)
for ilevel in range(tree.nlevels):
calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel]
xlat_cost = self.translation_cost_model_factory(
tree.dimensions, tree.nlevels
)
translation_cost = self.fmm_cost_factors_for_kernels_from_model(
queue, tree.nlevels, xlat_cost, calibration_params
)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
result[source_boxes] += self.process_form_multipoles(
queue, traversal, translation_cost["p2m_cost"]
)
result[target_boxes] += self.process_direct(
queue, traversal, ndirect_sources_per_target_box,
translation_cost["c_p2p"],
box_target_counts_nonchild=box_target_counts_nonchild
)
result[target_or_target_parent_boxes] += self.process_list2(
queue, traversal, translation_cost["m2l_cost"]
)
result += self.process_list3(
queue, traversal, translation_cost["m2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
result[target_or_target_parent_boxes] += self.process_list4(
queue, traversal, translation_cost["p2l_cost"]
)
result[target_boxes] += self.process_eval_locals(
queue, traversal, translation_cost["l2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
return result
def cost_per_stage(self, queue, traversal, level_to_order,
calibration_params,
ndirect_sources_per_target_box=None,
box_target_counts_nonchild=None):
"""Predict the per-stage costs of a new traversal object.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg level_to_order: a :class:`numpy.ndarray` of shape
(traversal.tree.nlevels,) representing the expansion orders
of different levels.
:arg calibration_params: a :class:`dict` of calibration parameters. These
parameters can be obtained via :meth:`estimate_calibration_params`
or :meth:`get_unit_calibration_params`.
:arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
direct evaluation sources (list 1, list 3 close, list 4 close) for each
            target box. You may find :meth:`get_ndirect_sources_per_target_box`
helpful. This argument is useful because the same result can be reused
for p2p, p2qbxl and tsqbx.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, all targets are considered,
namely traversal.tree.box_target_counts_nonchild.
:return: a :class:`dict`, mapping FMM stage names to cost numbers.
"""
if ndirect_sources_per_target_box is None:
ndirect_sources_per_target_box = (
self.get_ndirect_sources_per_target_box(queue, traversal)
)
tree = traversal.tree
result = {}
for ilevel in range(tree.nlevels):
calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel]
xlat_cost = self.translation_cost_model_factory(
tree.dimensions, tree.nlevels
)
translation_cost = self.fmm_cost_factors_for_kernels_from_model(
queue, tree.nlevels, xlat_cost, calibration_params
)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
result["form_multipoles"] = self.aggregate_over_boxes(
self.process_form_multipoles(
queue, traversal, translation_cost["p2m_cost"]
)
)
result["coarsen_multipoles"] = self.process_coarsen_multipoles(
queue, traversal, translation_cost["m2m_cost"]
)
result["eval_direct"] = self.aggregate_over_boxes(
self.process_direct(
queue, traversal, ndirect_sources_per_target_box,
translation_cost["c_p2p"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
result["multipole_to_local"] = self.aggregate_over_boxes(
self.process_list2(queue, traversal, translation_cost["m2l_cost"])
)
result["eval_multipoles"] = self.aggregate_over_boxes(
self.process_list3(
queue, traversal, translation_cost["m2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
result["form_locals"] = self.aggregate_over_boxes(
self.process_list4(queue, traversal, translation_cost["p2l_cost"])
)
result["refine_locals"] = self.process_refine_locals(
queue, traversal, translation_cost["l2l_cost"]
)
result["eval_locals"] = self.aggregate_over_boxes(
self.process_eval_locals(
queue, traversal, translation_cost["l2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
return result
@staticmethod
def get_unit_calibration_params():
return {
"c_l2l": 1.0,
"c_l2p": 1.0,
"c_m2l": 1.0,
"c_m2m": 1.0,
"c_m2p": 1.0,
"c_p2l": 1.0,
"c_p2m": 1.0,
"c_p2p": 1.0,
}
_FMM_STAGE_TO_CALIBRATION_PARAMETER: ClassVar[Mapping[str, str]] = {
"form_multipoles": "c_p2m",
"coarsen_multipoles": "c_m2m",
"eval_direct": "c_p2p",
"multipole_to_local": "c_m2l",
"eval_multipoles": "c_m2p",
"form_locals": "c_p2l",
"refine_locals": "c_l2l",
"eval_locals": "c_l2p"
}
def estimate_calibration_params(self, model_results, timing_results,
time_field_name="wall_elapsed",
additional_stage_to_param_names=()):
"""
        :arg model_results: a :class:`list` of the modeled costs for each step of
            the FMM, as returned by :meth:`cost_per_stage` with unit calibration
            parameters (from :meth:`get_unit_calibration_params`).
        :arg timing_results: a :class:`list` of the same length as *model_results*.
            Each entry is a :class:`dict` filled with timing data returned by
            :func:`boxtree.fmm.drive_fmm`.
:arg time_field_name: a :class:`str`, the field name from the timing result.
Usually this can be "wall_elapsed" or "process_elapsed".
:arg additional_stage_to_param_names: a :class:`dict` for mapping stage names
to parameter names. This is useful for supplying additional stages of
QBX.
        :return: a :class:`dict` of calibration parameters. If there is no model
            result for a particular stage, the estimated calibration parameter for
            that stage is set to zero.
"""
nresults = len(model_results)
assert len(timing_results) == nresults
stage_to_param_names = self._FMM_STAGE_TO_CALIBRATION_PARAMETER.copy()
stage_to_param_names.update(additional_stage_to_param_names)
params = set(stage_to_param_names.values())
uncalibrated_times = {}
actual_times = {}
for param in params:
uncalibrated_times[param] = np.zeros(nresults)
actual_times[param] = np.zeros(nresults)
for icase, model_result in enumerate(model_results):
for stage_name, param_name in stage_to_param_names.items():
if stage_name in model_result:
uncalibrated_times[param_name][icase] = (
model_result[stage_name])
for icase, timing_result in enumerate(timing_results):
for stage_name, time in timing_result.items():
param_name = stage_to_param_names[stage_name]
actual_times[param_name][icase] = time[time_field_name]
result = {}
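        # For each calibration parameter, fit the measured times against the
        # modeled operation counts with a one-dimensional least-squares fit:
        # the minimizer of |actual - c * uncalibrated|**2 is
        # c = <actual, uncalibrated> / <uncalibrated, uncalibrated>.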
for param in params:
uncalibrated = uncalibrated_times[param]
actual = actual_times[param]
if np.allclose(uncalibrated, 0):
result[param] = 0.0
continue
result[param] = (
actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated))
return result
# }}}
# {{{ FMMCostModel
class FMMCostModel(AbstractFMMCostModel):
"""An OpenCL-based realization of :class:`AbstractFMMCostModel`.
.. note:: For methods in this class, argument *traversal* should live in device
memory.
"""
# {{{ form multipoles
@memoize_method
def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype,
box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *np2m,
${box_id_t} *source_boxes,
${particle_id_t} *box_source_counts_nonchild,
${box_level_t} *box_levels,
double *p2m_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = source_boxes[i];
${particle_id_t} nsources = box_source_counts_nonchild[box_idx];
${box_level_t} ilevel = box_levels[box_idx];
np2m[i] = nsources * p2m_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_form_multipoles"
)
def process_form_multipoles(self, queue, traversal, p2m_cost):
tree = traversal.tree
np2m = cl.array.zeros(queue, len(traversal.source_boxes), dtype=np.float64)
process_form_multipoles_knl = self.process_form_multipoles_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_form_multipoles_knl(
np2m,
traversal.source_boxes,
tree.box_source_counts_nonchild,
tree.box_levels,
p2m_cost
)
return np2m
# }}}
# {{{ propagate multipoles upward
@memoize_method
def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype,
box_level_dtype, nlevels):
return ElementwiseKernel(
context,
Template(r"""
${box_id_t} *source_parent_boxes,
${box_level_t} *box_levels,
double *m2m_cost,
double *nm2m,
% for i in range(2**ndimensions):
% if i == 2**ndimensions - 1:
${box_id_t} *box_child_ids_${i}
% else:
${box_id_t} *box_child_ids_${i},
% endif
% endfor
""").render(
ndimensions=ndimensions,
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = source_parent_boxes[i];
${box_level_t} target_level = box_levels[box_idx];
if(target_level <= 1) {
nm2m[i] = 0.0;
} else {
int nchild = 0;
% for i in range(2**ndimensions):
if(box_child_ids_${i}[box_idx])
nchild += 1;
% endfor
nm2m[i] = nchild * m2m_cost[target_level];
}
""").render(
ndimensions=ndimensions,
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype),
nlevels=nlevels
),
name="process_coarsen_multipoles"
)
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
tree = traversal.tree
nm2m = cl.array.zeros(
queue, len(traversal.source_parent_boxes), dtype=np.float64
)
process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl(
queue.context,
tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels
)
process_coarsen_multipoles_knl(
traversal.source_parent_boxes,
tree.box_levels,
m2m_cost,
nm2m,
*tree.box_child_ids,
queue=queue
)
return self.aggregate_over_boxes(nm2m)
# }}}
# {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
@memoize_method
def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype):
return ElementwiseKernel(
context,
Template("""
${particle_id_t} *ndirect_sources_by_itgt_box,
${box_id_t} *source_boxes_starts,
${box_id_t} *source_boxes_lists,
${particle_id_t} *box_source_counts_nonchild
""").render(
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_id_t=dtype_to_ctype(box_id_dtype)
),
Template(r"""
${particle_id_t} nsources = 0;
${box_id_t} source_boxes_start_idx = source_boxes_starts[i];
${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1];
for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx;
cur_source_boxes_idx < source_boxes_end_idx;
cur_source_boxes_idx++)
{
${box_id_t} cur_source_box = source_boxes_lists[
cur_source_boxes_idx
];
nsources += box_source_counts_nonchild[cur_source_box];
}
ndirect_sources_by_itgt_box[i] += nsources;
""").render(
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_id_t=dtype_to_ctype(box_id_dtype)
),
name="get_ndirect_sources"
)
def get_ndirect_sources_per_target_box(self, queue, traversal):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
particle_id_dtype = tree.particle_id_dtype
box_id_dtype = tree.box_id_dtype
get_ndirect_sources_knl = self._get_ndirect_sources_knl(
queue.context, particle_id_dtype, box_id_dtype
)
ndirect_sources_by_itgt_box = cl.array.zeros(
queue, ntarget_boxes, dtype=particle_id_dtype
)
# List 1
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.neighbor_source_boxes_starts,
traversal.neighbor_source_boxes_lists,
tree.box_source_counts_nonchild
)
# List 3 close
if traversal.from_sep_close_smaller_starts is not None:
queue.finish()
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.from_sep_close_smaller_starts,
traversal.from_sep_close_smaller_lists,
tree.box_source_counts_nonchild
)
# List 4 close
if traversal.from_sep_close_bigger_starts is not None:
queue.finish()
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.from_sep_close_bigger_starts,
traversal.from_sep_close_bigger_lists,
tree.box_source_counts_nonchild
)
return ndirect_sources_by_itgt_box
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
from pyopencl.array import take
ntargets_by_itgt_box = take(
box_target_counts_nonchild,
traversal.target_boxes,
queue=queue
)
return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost
# }}}
# {{{ translate separated siblings' ("list 2") mpoles to local
@memoize_method
def process_list2_knl(self, context, box_id_dtype, box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *nm2l,
${box_id_t} *target_or_target_parent_boxes,
${box_id_t} *from_sep_siblings_starts,
${box_level_t} *box_levels,
double *m2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} start = from_sep_siblings_starts[i];
${box_id_t} end = from_sep_siblings_starts[i+1];
${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]];
nm2l[i] = (end - start) * m2l_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_list2"
)
def process_list2(self, queue, traversal, m2l_cost):
tree = traversal.tree
box_id_dtype = tree.box_id_dtype
box_level_dtype = tree.box_level_dtype
ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
nm2l = cl.array.zeros(
queue, (ntarget_or_target_parent_boxes,), dtype=np.float64
)
process_list2_knl = self.process_list2_knl(
queue.context, box_id_dtype, box_level_dtype
)
process_list2_knl(
nm2l,
traversal.target_or_target_parent_boxes,
traversal.from_sep_siblings_starts,
tree.box_levels,
m2l_cost
)
return nm2l
# }}}
# {{{ evaluate sep. smaller mpoles ("list 3") at particles
@memoize_method
def process_list3_knl(self, context, box_id_dtype, particle_id_dtype):
return ElementwiseKernel(
context,
Template(r"""
${box_id_t} *target_boxes_sep_smaller,
${box_id_t} *sep_smaller_start,
${particle_id_t} *box_target_counts_nonchild,
double m2p_cost_current_level,
double *nm2p
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype)
),
Template(r"""
${box_id_t} target_box = target_boxes_sep_smaller[i];
${box_id_t} start = sep_smaller_start[i];
${box_id_t} end = sep_smaller_start[i+1];
${particle_id_t} ntargets = box_target_counts_nonchild[target_box];
nm2p[target_box] += (
ntargets * (end - start) * m2p_cost_current_level
);
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype)
),
name="process_list3"
)
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
nm2p = cl.array.zeros(queue, tree.nboxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
process_list3_knl = self.process_list3_knl(
queue.context, tree.box_id_dtype, tree.particle_id_dtype
)
for ilevel, sep_smaller_list in enumerate(
traversal.from_sep_smaller_by_level):
process_list3_knl(
traversal.target_boxes_sep_smaller_by_source_level[ilevel],
sep_smaller_list.starts,
box_target_counts_nonchild,
m2p_cost[ilevel].get(queue=queue).reshape(-1)[0],
nm2p,
queue=queue
)
return nm2p
# }}}
# {{{ form locals for separated bigger source boxes ("list 4")
@memoize_method
def process_list4_knl(self, context,
box_id_dtype, particle_id_dtype, box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *nm2p,
${box_id_t} *from_sep_bigger_starts,
${box_id_t} *from_sep_bigger_lists,
${particle_id_t} *box_source_counts_nonchild,
${box_level_t} *box_levels,
double *p2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} start = from_sep_bigger_starts[i];
${box_id_t} end = from_sep_bigger_starts[i+1];
for(${box_id_t} idx=start; idx < end; idx++) {
${box_id_t} src_ibox = from_sep_bigger_lists[idx];
${particle_id_t} nsources = box_source_counts_nonchild[src_ibox];
${box_level_t} ilevel = box_levels[src_ibox];
nm2p[i] += nsources * p2l_cost[ilevel];
}
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_list4"
)
def process_list4(self, queue, traversal, p2l_cost):
tree = traversal.tree
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
nm2p = cl.array.zeros(
queue, len(target_or_target_parent_boxes), dtype=np.float64
)
process_list4_knl = self.process_list4_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_list4_knl(
nm2p,
traversal.from_sep_bigger_starts,
traversal.from_sep_bigger_lists,
tree.box_source_counts_nonchild,
tree.box_levels,
p2l_cost
)
return nm2p
# }}}
# {{{ evaluate local expansions at targets
@memoize_method
def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype,
box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *neval_locals,
${box_id_t} *target_boxes,
${particle_id_t} *box_target_counts_nonchild,
${box_level_t} *box_levels,
double *l2p_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = target_boxes[i];
${particle_id_t} ntargets = box_target_counts_nonchild[box_idx];
${box_level_t} ilevel = box_levels[box_idx];
neval_locals[i] = ntargets * l2p_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_eval_locals"
)
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
neval_locals = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
process_eval_locals_knl = self.process_eval_locals_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_eval_locals_knl(
neval_locals,
traversal.target_boxes,
box_target_counts_nonchild,
tree.box_levels,
l2p_cost
)
return neval_locals
# }}}
# {{{ propagate locals downward
@memoize_method
def process_refine_locals_knl(self, context, box_id_dtype):
from pyopencl.reduction import ReductionKernel
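        # The reduction index i runs over target levels; the caller restricts it
        # to 1, ..., nlevels-1 via range=slice(1, tree.nlevels). Each level
        # contributes its box count times the cost of translating a local
        # expansion from level i-1 to level i, hence l2l_cost[i - 1] below.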
return ReductionKernel(
context,
np.float64,
neutral="0.0",
reduce_expr="a+b",
map_expr=r"""
(level_start_target_or_target_parent_box_nrs[i + 1]
- level_start_target_or_target_parent_box_nrs[i])
* l2l_cost[i - 1]
""",
arguments=Template(r"""
${box_id_t} *level_start_target_or_target_parent_box_nrs,
double *l2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype)
),
name="process_refine_locals"
)
def process_refine_locals(self, queue, traversal, l2l_cost):
tree = traversal.tree
process_refine_locals_knl = self.process_refine_locals_knl(
queue.context, tree.box_id_dtype
)
level_start_target_or_target_parent_box_nrs = cl.array.to_device(
queue, traversal.level_start_target_or_target_parent_box_nrs
)
cost = process_refine_locals_knl(
level_start_target_or_target_parent_box_nrs,
l2l_cost,
range=slice(1, tree.nlevels)
).get()
return cost.reshape(-1)[0]
# }}}
def zero_cost_per_box(self, queue, nboxes):
return cl.array.zeros(queue, (nboxes,), dtype=np.float64)
def aggregate_over_boxes(self, per_box_result):
if isinstance(per_box_result, float):
return per_box_result
else:
return cl.array.sum(per_box_result).get().reshape(-1)[0]
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
if not isinstance(queue, cl.CommandQueue):
            raise TypeError(
                "an OpenCL command queue must be supplied for the cost model")
return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context
)
# }}}
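# A usage sketch of the calibration workflow described in AbstractFMMCostModel.
# The function name is illustrative, and the queue, traversal, expansion orders
# and timing data are assumed to be supplied by the caller (see
# examples/cost_model.py for a complete, runnable version):
def _demo_cost_model_calibration(
        queue, trav_dev, level_to_order, timing_results):
    cost_model = FMMCostModel()
    # 1. Model the traversal with unit calibration parameters to obtain
    #    per-stage operation counts.
    model_results = [
        cost_model.cost_per_stage(
            queue, trav_dev, level_to_order,
            FMMCostModel.get_unit_calibration_params())
    ]
    # 2. Fit calibration parameters against timing data collected from
    #    boxtree.fmm.drive_fmm runs of the same traversal(s).
    params = cost_model.estimate_calibration_params(
        model_results, timing_results)
    # 3. Predict calibrated per-stage (or per-box) costs.
    return cost_model.cost_per_stage(queue, trav_dev, level_to_order, params)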
# {{{ _PythonFMMCostModel (undocumented, only used for testing)
class _PythonFMMCostModel(AbstractFMMCostModel):
def process_form_multipoles(self, queue, traversal, p2m_cost):
tree = traversal.tree
np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64)
for ilevel in range(tree.nlevels):
start, stop = traversal.level_start_source_box_nrs[ilevel:ilevel + 2]
for isrc_box, src_ibox in enumerate(
traversal.source_boxes[start:stop], start):
nsources = tree.box_source_counts_nonchild[src_ibox]
np2m[isrc_box] = nsources * p2m_cost[ilevel]
return np2m
def get_ndirect_sources_per_target_box(self, queue, traversal):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
# target box index -> nsources
ndirect_sources_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.float64)
for itgt_box in range(ntarget_boxes):
nsources = 0
start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
# Could be None, if not using targets with extent.
if traversal.from_sep_close_smaller_starts is not None:
start, end = (
traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]
)
for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
# Could be None, if not using targets with extent.
if traversal.from_sep_close_bigger_starts is not None:
start, end = (
traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]
)
for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
ndirect_sources_by_itgt_box[itgt_box] = nsources
return ndirect_sources_by_itgt_box
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
ntargets_by_itgt_box = box_target_counts_nonchild[traversal.target_boxes]
return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost
def process_list2(self, queue, traversal, m2l_cost):
tree = traversal.tree
ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64)
for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
ilevel = tree.box_levels[tgt_ibox]
nm2l[itgt_box] += m2l_cost[ilevel] * (end - start)
return nm2l
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
nm2p = np.zeros(tree.nboxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
for ilevel, sep_smaller_list in enumerate(
traversal.from_sep_smaller_by_level):
for itgt_box, tgt_ibox in enumerate(
traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
ntargets = box_target_counts_nonchild[tgt_ibox]
start, end = sep_smaller_list.starts[itgt_box:itgt_box + 2]
nm2p[tgt_ibox] += ntargets * (end - start) * m2p_cost[ilevel]
return nm2p
def process_list4(self, queue, traversal, p2l_cost):
tree = traversal.tree
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64)
for itgt_box in range(len(target_or_target_parent_boxes)):
start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
for src_ibox in traversal.from_sep_bigger_lists[start:end]:
nsources = tree.box_source_counts_nonchild[src_ibox]
ilevel = tree.box_levels[src_ibox]
nm2p[itgt_box] += nsources * p2l_cost[ilevel]
return nm2p
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
neval_locals = np.zeros(ntarget_boxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
for target_lev in range(tree.nlevels):
start, stop = traversal.level_start_target_box_nrs[
target_lev:target_lev+2]
for itgt_box, tgt_ibox in enumerate(
traversal.target_boxes[start:stop], start):
neval_locals[itgt_box] += (box_target_counts_nonchild[tgt_ibox]
* l2p_cost[target_lev])
return neval_locals
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
tree = traversal.tree
result = 0.0
# nlevels-1 is the last valid level index
# nlevels-2 is the last valid level that could have children
#
# 3 is the last relevant source_level.
# 2 is the last relevant target_level.
# (because no level 1 box will be well-separated from another)
for source_level in range(tree.nlevels-1, 2, -1):
target_level = source_level - 1
cost = m2m_cost[target_level]
nmultipoles = 0
start, stop = traversal.level_start_source_parent_box_nrs[
target_level:target_level+2]
for ibox in traversal.source_parent_boxes[start:stop]:
for child in tree.box_child_ids[:, ibox]:
if child:
nmultipoles += 1
result += cost * nmultipoles
return result
def process_refine_locals(self, queue, traversal, l2l_cost):
tree = traversal.tree
result = 0.0
for target_lev in range(1, tree.nlevels):
start, stop = traversal.level_start_target_or_target_parent_box_nrs[
target_lev:target_lev+2]
source_lev = target_lev - 1
result += (stop-start) * l2l_cost[source_lev]
return result
def zero_cost_per_box(self, queue, nboxes):
return np.zeros(nboxes, dtype=np.float64)
def aggregate_over_boxes(self, per_box_result):
if isinstance(per_box_result, float):
return per_box_result
else:
return np.sum(per_box_result)
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
self, None, nlevels, xlat_cost, context
)
# }}}
# vim: foldmethod=marker
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
High-level Point FMM Interface
------------------------------
To perform a distributed point FMM, first construct a
:class:`boxtree.distributed.DistributedFMMRunner` object. The constructor
distributes the necessary information from the root rank to all worker ranks.
Then, :meth:`boxtree.distributed.DistributedFMMRunner.drive_dfmm` can be used to
launch the FMM.
.. autoclass:: boxtree.distributed.DistributedFMMRunner
Distributed Algorithm Overview
------------------------------
1. Construct the global tree and traversal lists on the root rank and broadcast to
all worker ranks.
2. Partition boxes into disjoint sets, where the number of sets is the number of MPI
ranks. (See :ref:`partition-boxes`)
3. Each rank constructs the local tree and traversal lists independently, according
to the partition. (See :ref:`construct-local-tree-traversal`)
4. Distribute source weights from the root rank to all worker ranks. (See
:ref:`distributed-wrangler`)
5. Each rank independently forms multipole expansions from the leaf nodes of the
local tree and propagates the partial multipole expansions upwards.
6. Communicate multipole expansions so that all ranks have the complete multipole
expansions needed.
7. Each rank independently forms local expansions, propagates the local expansions
   downwards, and evaluates the potentials of the target points in its partition.
   The calculated potentials are then assembled on the root rank.
For steps 5-7, see :ref:`distributed-fmm-evaluation`.
Note that steps 4-7 may be repeated multiple times with the same tree and traversal
objects built from steps 1-3. For example, when iteratively solving a PDE, steps 4-7
are executed for each iteration of the linear solver.
The next sections will cover the interfaces of these steps.
.. _partition-boxes:
Partition Boxes
---------------
.. autofunction:: boxtree.distributed.partition.partition_work
.. autoclass:: boxtree.distributed.partition.BoxMasks
.. autofunction:: boxtree.distributed.partition.get_box_masks
.. _construct-local-tree-traversal:
Construct Local Tree and Traversal
----------------------------------
.. autoclass:: boxtree.distributed.local_tree.LocalTree
.. autofunction:: boxtree.distributed.local_tree.generate_local_tree
.. autofunction:: boxtree.distributed.local_traversal.generate_local_travs
.. _distributed-wrangler:
Distributed Wrangler
--------------------
.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
.. _distributed-fmm-evaluation:
Distributed FMM Evaluation
--------------------------
The distributed version of the FMM evaluation shares the same interface as the
shared-memory version. To evaluate FMM in a distributed manner, use a subclass
of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
:func:`boxtree.fmm.drive_fmm`.
"""
import enum
import warnings
import numpy as np
from mpi4py import MPI
import pyopencl as cl
import pyopencl.array
from boxtree.cost import FMMCostModel
__all__ = ["DistributedFMMRunner"]
# {{{ MPI
@enum.unique
class MPITags(enum.IntEnum):
DIST_WEIGHT = 1
GATHER_POTENTIALS = 2
REDUCE_POTENTIALS = 3
REDUCE_INDICES = 4
def dtype_to_mpi(dtype):
""" This function translates a numpy datatype into the corresponding type used in
mpi4py.
"""
if hasattr(MPI, "_typedict"):
typedict = MPI._typedict
elif hasattr(MPI, "__TypeDict__"):
typedict = MPI.__TypeDict__
else:
raise RuntimeError(
"There is no dictionary to translate from Numpy dtype to MPI type")
mpi_type = typedict.get(np.dtype(dtype).char, None)
if mpi_type is None:
raise ValueError(f"Could not convert '{dtype}' to an MPI datatype")
return mpi_type
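# For example, dtype_to_mpi(np.float64) is expected to resolve to MPI.DOUBLE (via
# the "d" type character), and dtype_to_mpi(np.int32) to MPI.INT.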
# }}}
# {{{ DistributedFMMRunner
def make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm):
"""Helper function for constructing the distributed wrangler on each rank.
.. note::
This function needs to be called collectively on all ranks.
:returns: a tuple of ``(wrangler, src_idx_all_ranks, tgt_idx_all_ranks)``,
where the wrangler is constructed according to *wrangler_factory* and
the indices are passed to :func:`boxtree.fmm.drive_fmm`.
"""
mpi_rank = comm.Get_rank()
# `tree_in_device_memory` is True if the global tree is in the device memory
# `tree_in_device_memory` is False if the global tree is in the host memory
#
# Note that at this point, only the root rank has the valid global tree, so
# we test `tree_in_device_memory` on the root rank and broadcast to all
# worker ranks.
tree_in_device_memory = None
if mpi_rank == 0:
tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
# {{{ Broadcast the global tree
global_tree_host = None
if mpi_rank == 0:
if tree_in_device_memory:
global_tree_host = global_tree.get(queue)
else:
global_tree_host = global_tree
global_tree_host = comm.bcast(global_tree_host, root=0)
global_tree_dev = None
if mpi_rank == 0 and tree_in_device_memory:
global_tree_dev = global_tree
else:
global_tree_dev = global_tree_host.to_device(queue)
global_tree_dev = global_tree_dev.with_queue(queue)
global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
global_trav_host = global_trav_dev.get(queue)
global_trav = global_trav_dev if tree_in_device_memory else global_trav_host
# }}}
# {{{ Partition work
cost_per_box = None
if mpi_rank == 0:
cost_model = FMMCostModel()
if calibration_params is None:
# Use default calibration parameters if not supplied
# TODO: should replace the default calibration params with a more
# accurate one
warnings.warn("Calibration parameters for the cost model are not "
"supplied. The default one will be used.",
stacklevel=2)
calibration_params = \
FMMCostModel.get_unit_calibration_params()
# We need to construct a wrangler in order to access `level_orders`
global_wrangler = wrangler_factory(global_trav, global_trav)
cost_per_box = cost_model.cost_per_box(
queue, global_trav_dev, global_wrangler.level_orders,
calibration_params
).get()
from boxtree.distributed.partition import partition_work
responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
# }}}
# {{{ Compute local tree
from boxtree.distributed.local_tree import generate_local_tree
local_tree, src_idx, tgt_idx = generate_local_tree(
queue, global_trav_host, responsible_boxes_list, comm)
# }}}
# {{{ Gather source indices and target indices of each rank
src_idx_all_ranks = comm.gather(src_idx, root=0)
tgt_idx_all_ranks = comm.gather(tgt_idx, root=0)
# }}}
# {{{ Compute traversal object on each rank
from boxtree.distributed.local_traversal import generate_local_travs
local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
if not tree_in_device_memory:
local_trav = local_trav_dev.get(queue=queue)
else:
local_trav = local_trav_dev.with_queue(None)
# }}}
wrangler = wrangler_factory(local_trav, global_trav)
return wrangler, src_idx_all_ranks, tgt_idx_all_ranks
class DistributedFMMRunner:
"""Helper class for setting up and running distributed point FMM.
.. automethod:: __init__
.. automethod:: drive_dfmm
"""
def __init__(self, queue, global_tree,
traversal_builder,
wrangler_factory,
calibration_params=None, comm=MPI.COMM_WORLD):
"""Construct a DistributedFMMRunner object.
:arg global_tree: a :class:`boxtree.Tree` object. This tree could live in the
host or the device memory, depending on the wrangler. This argument is
only significant on the root rank.
:arg traversal_builder: an object which, when called, takes a
:class:`pyopencl.CommandQueue` object and a :class:`boxtree.Tree` object,
and generates a :class:`boxtree.traversal.FMMTraversalInfo` object from
the tree using the command queue.
:arg wrangler_factory: an object which, when called, takes the local
traversal and the global traversal objects and returns an
:class:`boxtree.fmm.ExpansionWranglerInterface` object.
        :arg calibration_params: calibration parameters for the cost model,
            if supplied. The cost model is used to estimate the execution time of
            each box, which in turn is used to improve load balancing.
:arg comm: MPI communicator.
"""
self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm)
def drive_dfmm(self, source_weights, timing_data=None):
"""Calculate potentials at target points.
"""
from boxtree.fmm import drive_fmm
return drive_fmm(
self.wrangler, source_weights,
timing_data=timing_data,
global_src_idx_all_ranks=self.src_idx_all_ranks,
global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
# }}}
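# A minimal usage sketch (the function name and its arguments are placeholders to
# be supplied by the calling application; traversal_builder and wrangler_factory
# are as described in DistributedFMMRunner.__init__). This must be called
# collectively on all ranks; the global tree argument is only significant on the
# root rank, and the calculated potentials are assembled on the root rank.
def _demo_distributed_fmm(queue, global_tree, traversal_builder,
        wrangler_factory, source_weights):
    runner = DistributedFMMRunner(
        queue, global_tree, traversal_builder, wrangler_factory,
        comm=MPI.COMM_WORLD)
    # drive_dfmm takes a sequence of source weight vectors, here a single one.
    return runner.drive_dfmm([source_weights])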
# vim: fdm=marker
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import numpy as np
from mako.template import Template
from mpi4py import MPI
import pyopencl as cl
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
from boxtree.distributed import MPITags
from boxtree.fmm import ExpansionWranglerInterface
from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
logger = logging.getLogger(__name__)
# {{{ Distributed FMM wrangler
class DistributedExpansionWrangler(ExpansionWranglerInterface):
"""Distributed expansion wrangler base class.
    This is an abstract class and should not be instantiated directly. Instead,
    all distributed wranglers are expected to be subclasses of this class.
.. automethod:: __init__
.. automethod:: distribute_source_weights
.. automethod:: gather_potential_results
.. automethod:: communicate_mpoles
"""
def __init__(self, context, comm, global_traversal,
traversal_in_device_memory,
communicate_mpoles_via_allreduce=False):
self.context = context
self.comm = comm
self.global_traversal = global_traversal
self.traversal_in_device_memory = traversal_in_device_memory
self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
if mpi_rank == 0:
distribute_weight_req = []
local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
for irank in range(mpi_size):
local_src_weight_vecs[irank] = [
source_weights[src_idx_all_ranks[irank]]
for source_weights in src_weight_vecs]
if irank != 0:
distribute_weight_req.append(self.comm.isend(
local_src_weight_vecs[irank], dest=irank,
tag=MPITags.DIST_WEIGHT
))
MPI.Request.Waitall(distribute_weight_req)
local_src_weight_vecs = local_src_weight_vecs[0]
else:
local_src_weight_vecs = self.comm.recv(source=0, tag=MPITags.DIST_WEIGHT)
return local_src_weight_vecs
def gather_potential_results(self, potentials, tgt_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
from boxtree.distributed import dtype_to_mpi
potentials_mpi_type = dtype_to_mpi(potentials.dtype)
gathered_potentials = None
if mpi_rank == 0:
            # The root rank receives the calculated potentials from all worker ranks
potentials_all_ranks = np.empty((mpi_size,), dtype=object)
potentials_all_ranks[0] = potentials
recv_reqs = []
for irank in range(1, mpi_size):
potentials_all_ranks[irank] = np.empty(
tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
recv_reqs.append(
self.comm.Irecv(
[potentials_all_ranks[irank], potentials_mpi_type],
source=irank, tag=MPITags.GATHER_POTENTIALS))
MPI.Request.Waitall(recv_reqs)
# Assemble potentials from worker ranks together on the root rank
gathered_potentials = np.empty(
self.global_traversal.tree.ntargets, dtype=potentials.dtype)
for irank in range(mpi_size):
gathered_potentials[tgt_idx_all_ranks[irank]] = (
potentials_all_ranks[irank])
else:
# Worker ranks send calculated potentials to the root rank
self.comm.Send([potentials, potentials_mpi_type],
dest=0, tag=MPITags.GATHER_POTENTIALS)
return gathered_potentials
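# _slice_mpoles and _update_mpoles below pack and accumulate, respectively,
# the multipole expansions of a sorted list of global box indices into/from
# a flat buffer, so that they can be exchanged between ranks in
# communicate_mpoles.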
def _slice_mpoles(self, mpoles, slice_indices):
if len(slice_indices) == 0:
return np.empty((0,), dtype=mpoles.dtype)
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
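# Because both slice_indices and level_start_box_nrs are sorted,
# level_start_slice_indices[ilevel] and [ilevel + 1] bracket the entries of
# slice_indices that lie on level ilevel.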
mpoles_list = []
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_list.append(
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
].reshape(-1)
)
return np.concatenate(mpoles_list)
def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
if len(slice_indices) == 0:
return
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
mpole_updates_start = 0
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_shape = (stop - start,) + mpoles_current_level.shape[1:]
from pytools import product
mpole_updates_end = mpole_updates_start + product(mpoles_shape)
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
] += mpole_updates[
mpole_updates_start:mpole_updates_end
].reshape(mpoles_shape)
mpole_updates_start = mpole_updates_end
@memoize_method
def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
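# One work item per entry of contributing_boxes_list: look up the user
# ranks of that box and set box_in_subrange[i] to 1 if any of them falls
# in [subrange_start, subrange_end).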
return ElementwiseKernel(
self.context,
Template(r"""
${box_id_t} *contributing_boxes_list,
int subrange_start,
int subrange_end,
${box_id_t} *box_to_user_rank_starts,
int *box_to_user_rank_lists,
char *box_in_subrange
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
),
Template(r"""
${box_id_t} ibox = contributing_boxes_list[i];
${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
int useri = box_to_user_rank_lists[iuser];
if(subrange_start <= useri && useri < subrange_end) {
box_in_subrange[i] = 1;
}
}
""").render(
box_id_t=dtype_to_ctype(box_id_dtype)
),
"find_boxes_used_by_subrange"
)
def find_boxes_used_by_subrange(
self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
contributing_boxes_list):
"""Test whether the multipole expansions of the contributing boxes are used
by at least one box in a range.
:arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
:arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
indicating the start and end index in *box_to_user_rank_lists* for each
box in *contributing_boxes_list*.
:arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
the users of each box in *contributing_boxes_list*.
:returns: a :class:`pyopencl.array.Array` object with the same shape as
*contributing_boxes_list*, where the i-th entry is 1 if
``contributing_boxes_list[i]`` is used by at least on box in the
subrange specified.
"""
box_in_subrange = cl.array.zeros(
contributing_boxes_list.queue,
contributing_boxes_list.shape[0],
dtype=np.int8
)
knl = self.find_boxes_used_by_subrange_kernel(
self.traversal.tree.box_id_dtype)
knl(
contributing_boxes_list,
subrange[0],
subrange[1],
box_to_user_rank_starts,
box_to_user_rank_lists,
box_in_subrange
)
return box_in_subrange
def communicate_mpoles(self, mpole_exps, return_stats=False):
"""Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
The main idea is to mimic an allreduce as done on a hypercube network, but to
decrease the bandwidth cost by sending only information that is relevant to
the rank receiving the message.
.. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
Lexing Ying, Denis Zorin, and George Biros. "A massively parallel
adaptive fast multipole method on heterogeneous architectures."
Communications of the ACM 55, no. 5 (2012): 101-109.
"""
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
tree = self.traversal.tree
if self.communicate_mpoles_via_allreduce:
# Use MPI allreduce for communicating multipole expansions. It is slower
# but might be helpful for debugging purposes.
mpole_exps_all = np.zeros_like(mpole_exps)
self.comm.Allreduce(mpole_exps, mpole_exps_all)
mpole_exps[:] = mpole_exps_all
return
stats = {}
# contributing_boxes:
#
# A mask of the set of boxes that the current process contributes
# to. This process contributes to a box when:
#
# (a) this process owns sources that contribute to the multipole expansion
# in the box (via the upward pass) or
# (b) this process has received a portion of the multipole expansion in this
# box from another process.
#
# Initially, this set consists of the boxes satisfying condition (a), which
# are precisely the boxes owned by this process and their ancestors.
if self.traversal_in_device_memory:
with cl.CommandQueue(self.context) as queue:
contributing_boxes = tree.ancestor_mask.get(queue=queue)
responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
else:
contributing_boxes = tree.ancestor_mask.copy()
responsible_boxes_list = tree.responsible_boxes_list
contributing_boxes[responsible_boxes_list] = 1
from boxtree.tools import AllReduceCommPattern
comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
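# AllReduceCommPattern models a hypercube-style all-reduce. Roughly, at
# each stage sources() and sinks() give the ranks to receive from and send
# to, and messages() gives the range of ranks whose data needs to be
# forwarded at this stage; it is used below to select the relevant subset
# of contributing boxes.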
# Temporary buffers for receiving data
mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
boxes_list_buf = np.empty(tree.nboxes, dtype=tree.box_id_dtype)
stats["bytes_sent_by_stage"] = []
stats["bytes_recvd_by_stage"] = []
if self.traversal_in_device_memory:
box_to_user_rank_starts_dev = \
tree.box_to_user_rank_starts.with_queue(None)
box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
else:
with cl.CommandQueue(self.context) as queue:
box_to_user_rank_starts_dev = cl.array.to_device(
queue, tree.box_to_user_rank_starts).with_queue(None)
box_to_user_rank_lists_dev = cl.array.to_device(
queue, tree.box_to_user_rank_lists).with_queue(None)
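# Each pass through the loop below is one stage of the hypercube-style
# reduction: select the contributing boxes whose data is needed by the
# destination ranks, send their multipole data, receive and accumulate the
# data arriving from the source ranks, then advance the pattern.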
while not comm_pattern.done():
send_requests = []
# Send data to other processors.
if comm_pattern.sinks():
# Compute the subset of boxes to be sent.
message_subrange = comm_pattern.messages()
contributing_boxes_list = np.nonzero(contributing_boxes)[0].astype(
tree.box_id_dtype
)
with cl.CommandQueue(self.context) as queue:
contributing_boxes_list_dev = cl.array.to_device(
queue, contributing_boxes_list)
box_in_subrange = self.find_boxes_used_by_subrange(
message_subrange,
box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
contributing_boxes_list_dev
)
box_in_subrange_host = box_in_subrange.get().astype(bool)
relevant_boxes_list = contributing_boxes_list[
box_in_subrange_host
].astype(tree.box_id_dtype)
"""
# Pure Python version for debugging purpose
relevant_boxes_list = []
for contrib_box in contributing_boxes_list:
iuser_start, iuser_end = tree.box_to_user_rank_starts[
contrib_box:contrib_box + 2
]
for user_box in tree.box_to_user_rank_lists[
iuser_start:iuser_end]:
if subrange_start <= user_box < subrange_end:
relevant_boxes_list.append(contrib_box)
break
"""
relevant_boxes_list = np.array(
relevant_boxes_list, dtype=tree.box_id_dtype
)
relevant_mpole_exps = self._slice_mpoles(
mpole_exps, relevant_boxes_list)
# Send the box subset to the other processors.
for sink in comm_pattern.sinks():
req = self.comm.Isend(relevant_mpole_exps, dest=sink,
tag=MPITags.REDUCE_POTENTIALS)
send_requests.append(req)
req = self.comm.Isend(relevant_boxes_list, dest=sink,
tag=MPITags.REDUCE_INDICES)
send_requests.append(req)
# Receive data from other processors.
for source in comm_pattern.sources():
self.comm.Recv(mpole_exps_buf, source=source,
tag=MPITags.REDUCE_POTENTIALS)
status = MPI.Status()
self.comm.Recv(
boxes_list_buf, source=source, tag=MPITags.REDUCE_INDICES,
status=status)
nboxes = status.Get_count() // boxes_list_buf.dtype.itemsize
# Update data structures.
self._update_mpoles(
mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
contributing_boxes[boxes_list_buf[:nboxes]] = 1
for req in send_requests:
req.wait()
comm_pattern.advance()
if return_stats:
return stats
def finalize_potentials(self, potentials, template_ary):
if self.comm.Get_rank() == 0:
return super().finalize_potentials(potentials, template_ary)
else:
return None
class DistributedFMMLibExpansionWrangler(
DistributedExpansionWrangler, FMMLibExpansionWrangler):
def __init__(
self, context, comm, tree_indep, local_traversal, global_traversal,
fmm_level_to_order=None,
communicate_mpoles_via_allreduce=False,
**kwargs):
DistributedExpansionWrangler.__init__(
self, context, comm, global_traversal, False,
communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
FMMLibExpansionWrangler.__init__(
self, tree_indep, local_traversal,
fmm_level_to_order=fmm_level_to_order, **kwargs)
# TODO: use log_process like FMMLibExpansionWrangler?
def reorder_sources(self, source_array):
if self.comm.Get_rank() == 0:
return source_array[..., self.global_traversal.tree.user_source_ids]
else:
return None
def reorder_potentials(self, potentials):
if self.comm.Get_rank() == 0:
return potentials[self.global_traversal.tree.sorted_target_ids]
else:
return None
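# A minimal construction sketch (illustrative only; `ctx`, `tree_indep`,
# `local_trav`, `global_trav` and `fmm_level_to_order` are placeholders
# assumed to come from the surrounding distributed FMM setup):
#
#     from mpi4py import MPI
#     wrangler = DistributedFMMLibExpansionWrangler(
#         ctx, MPI.COMM_WORLD, tree_indep, local_trav, global_trav,
#         fmm_level_to_order=fmm_level_to_order)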
# }}}
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import time
logger = logging.getLogger(__name__)
def generate_local_travs(
queue, local_tree, traversal_builder, merge_close_lists=False):
"""Generate local traversal from local tree.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg local_tree: the local tree (with arrays stored as
`boxtree.tools.ImmutableHostDeviceArray`) from which the local traversal
object will be constructed.
:arg traversal_builder: a function that takes a :class:`pyopencl.CommandQueue`
and a tree and returns a tuple whose first entry is the traversal object
built from the tree.
:return: the generated local traversal object, in device memory.
"""
start_time = time.time()
local_tree.with_queue(queue)
# We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict
# multipole formation and upward propagation to the region of boxes this rank
# is responsible for. Without these restrictions, sources distributed to more
# than one rank could be counted multiple times.
local_trav, _ = traversal_builder(
queue, local_tree.to_device(queue),
source_boxes_mask=local_tree.responsible_boxes_mask.device,
source_parent_boxes_mask=local_tree.ancestor_mask.device
)
if merge_close_lists and local_tree.targets_have_extent:
local_trav = local_trav.merge_close_lists(queue)
logger.info("Generate local traversal in %f sec.", time.time() - start_time)
return local_trav
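# A minimal usage sketch (illustrative only; `queue`, `local_tree` and
# `traversal_builder` are placeholders assumed to come from the surrounding
# distributed FMM setup):
#
#     local_trav = generate_local_travs(queue, local_tree, traversal_builder)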