02addfd6 · 4906c584 · f01bb0f3 · 5b8e8964 · 348416d8 · de1cc4d1
--- a/.editorconfig
+++ b/.editorconfig
+# https://editorconfig.org/
+# https://github.com/editorconfig/editorconfig-vim 
+# https://github.com/editorconfig/editorconfig-emacs 
+
+root = true
+
+[*]
+indent_style = space
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.py]
+indent_size = 4
+
+[*.rst]
+indent_size = 4
+
+[*.cpp]
+indent_size = 2
+
+[*.hpp]
+indent_size = 2
+
+# There may be one in doc/
+[Makefile]
+indent_style = tab
+
+# https://github.com/microsoft/vscode/issues/1679
+[*.md]
+trim_trailing_whitespace = false
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
+version: 2
+updates:
+    # Set update schedule for GitHub Actions
+    - package-ecosystem: "github-actions"
+      directory: "/"
+      schedule:
+          interval: "weekly"
+
+# vim: sw=4
--- a/.github/workflows/autopush.yml
+++ b/.github/workflows/autopush.yml
+name: Gitlab mirror
+on:
+    push:
+        branches:
+        - main
+
+jobs:
+    autopush:
+        name: Automatic push to gitlab.tiker.net
+        if: startsWith(github.repository, 'inducer/')
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ./ci-support-v0
+                mirror_github_to_gitlab
+
+            env:
+                GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }}
+
+# vim: sw=4
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
+name: CI
+on:
+    push:
+        branches:
+        - main
+    pull_request:
+    schedule:
+        - cron:  '17 3 * * 0'
+
+jobs:
+    ruff:
+        name: Ruff
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            run: |
+                pipx install ruff
+                ruff check
+
+    typos:
+        name: Typos
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   uses: crate-ci/typos@master
+
+    pylint:
+        name: Pylint
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ci-support-v0
+                build_py_project_in_conda_env
+                run_pylint "$(get_proj_name)" examples/*.py test/*.py
+
+    docs:
+        name: Documentation
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ci-support-v0
+                build_py_project_in_conda_env
+                build_docs
+
+    pytest3:
+        name: Conda Python 3
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ci-support-v0
+                build_py_project_in_conda_env
+                test_py_project
+
+    py3example:
+        name: Python 3 Examples
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ci-support-v0
+                build_py_project_in_conda_env
+                run_examples
+
+    downstream_tests:
+        strategy:
+            matrix:
+                downstream_project: [sumpy, pytential]
+        name: Tests for downstream project ${{ matrix.downstream_project }}
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   name: "Main Script"
+            env:
+                DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }}
+            run: |
+                export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
+                curl -L -O https://tiker.net/ci-support-v0
+                . ci-support-v0
+
+                if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
+                   DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
+                fi
+                test_downstream "$DOWNSTREAM_PROJECT"
+
+# vim: sw=4
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ setuptools.pth
 distribute*egg
 distribute*tar.gz
 a.out
+
+.cache
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
-Python 2.7 Intel CPU:
-  script:
-  - py_version=2.7
-  - export PYOPENCL_TEST=int:pu
-  - EXTRA_INSTALL="numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - intel-cl-cpu
-  except:
-  - tags
-Python 2.7 POCL:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - pocl
-  except:
-  - tags
-Python 3.5 POCL:
-  script:
-  - export PY_EXE=python3.5
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python3.5
-  - pocl
-  except:
-  - tags
+Python 3 POCL:
+    script: |
+      export PYOPENCL_TEST=portable:pthread
+      export EXTRA_INSTALL="pybind11 numpy mako matplotlib"
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_venv
+      test_py_project
+    tags:
+    - python3
+    - pocl
+    except:
+    - tags
+    artifacts:
+        reports:
+            junit: test/pytest.xml
+
+Python 3 POCL Titan V:
+    script: |
+      export PYOPENCL_TEST=portable:titan
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_conda_env
+      test_py_project
+    tags:
+    - nvidia-titan-v
+    except:
+    - tags
+    artifacts:
+        reports:
+            junit: test/pytest.xml
+
+Python 3 POCL Titan X:
+    script: |
+      export PYOPENCL_TEST=portable:titan
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_conda_env
+      test_py_project
+    tags:
+    - nvidia-titan-x
+    except:
+    - tags
+    artifacts:
+        reports:
+            junit: test/pytest.xml
+
+Python 3 POCL K40:
+    script: |
+      export PYOPENCL_TEST=portable:k40
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_conda_env
+      test_py_project
+    tags:
+    - nvidia-k40
+    except:
+    - tags
+    artifacts:
+        reports:
+            junit: test/pytest.xml
+
+Python 3 POCL MPI:
+    script: |
+      export PYOPENCL_TEST=portable
+      export EXTRA_INSTALL="numpy mako mpi4py pybind11"
+      export PYTEST_ADDOPTS="-m 'mpi' --capture=no"
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_venv
+      test_py_project
+    tags:
+    - python3
+    - pocl
+    except:
+    - tags
+    artifacts:
+      reports:
+        junit: test/pytest.xml
+
+Python 3 POCL Examples:
+    script: |
+      test -n "$SKIP_EXAMPLES" && exit
+      export PYOPENCL_TEST=portable:pthread
+      export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib"
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_venv
+      run_examples
+    tags:
+    - python3
+    - pocl
+    except:
+    - tags
+
+Pylint:
+    script: |
+      export EXTRA_INSTALL="pybind11 numpy mako matplotlib mpi4py"
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_conda_env
+      run_pylint "$(get_proj_name)" examples/*.py test/*.py
+    tags:
+    - python3
+    except:
+    - tags
+
+Documentation:
+    script: |
+      EXTRA_INSTALL="pybind11 numpy mako mpi4py"
+      curl -L -O https://tiker.net/ci-support-v0
+      . ci-support-v0
+      build_py_project_in_conda_env
+      build_docs
+    tags:
+    - python3
+
+Ruff:
+    script: |
+      pipx install ruff
+      ruff check
+    tags:
+    - docker-runner
+    except:
+    - tags
+
+Downstream:
+    parallel:
+      matrix:
+      - DOWNSTREAM_PROJECT: [sumpy, pytential]
+    tags:
+    - large-node
+    - "docker-runner"
+    script: |
+      export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
+
+      curl -L -O https://tiker.net/ci-support-v0
+      . ./ci-support-v0
+      test_downstream "$DOWNSTREAM_PROJECT"
--- a/.pylintrc-local.yml
+++ b/.pylintrc-local.yml
+- arg: py-version
+  val: '3.10'
+
+- arg: extension-pkg-whitelist
+  val: pyfmmlib
+
+# Needed for boxtree.tools
+- arg: init-hook
+  val: import sys; sys.setrecursionlimit(2000)
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
+name: test-conda-env
+channels:
+- conda-forge
+- nodefaults
+
+dependencies:
+- python=3
+- git
+- numpy
+- pocl
+- pocl-cuda
+- mako
+- pyopencl
+- islpy
+- pyfmmlib
+- mpi4py
+
+# Only needed to make pylint succeed
+- matplotlib-base
+
+# This is intended to prevent conda from selecting 'external' (i.e. empty) builds
+# of OpenMPI to satisfy the MPI dependency of mpi4py. It did so in May 2024, leading
+# to confusing failues saying
+# 'libmpi.so.40: cannot open shared object file: No such file or directory'.
+# https://github.com/conda-forge/openmpi-feedstock/issues/153
+# https://conda-forge.org/docs/user/tipsandtricks/#using-external-message-passing-interface-mpi-libraries
+- openmpi>=5=h*
--- a/CITATION.cff
+++ b/CITATION.cff
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Kloeckner"
+  given-names: "Andreas"
+  orcid: "https://orcid.org/0000-0003-1228-519X"
+- family-names: "Fikl"
+  given-names: "Alex"
+- family-names: "Gao"
+  given-names: "Hao"
+- family-names: "Fernando"
+  given-names: "Isuru"
+- family-names: "Wala"
+  given-names: "Matt"
+- family-names: "Wei"
+  given-names: "Xiaoyu"
+
+title: "boxtree"
+version: 2023.1
+date-released: 2023-10-01
+doi: 10.5281/zenodo.8397396
+url: "https://github.com/inducer/boxtree"
+license: MIT
--- a/MANIFEST.in
+++ b/MANIFEST.in
-include test/*.py
-include examples/*.py
-
-include doc/*.rst
-include doc/Makefile
-include doc/*.py
-include doc/images/*.png
-include doc/_static/*.css
-include doc/_templates/*.html
-
-include configure.py
-include Makefile.in
-include README.rst
-include requirements.txt
--- a/README.rst
+++ b/README.rst
-boxtree
-=======
+boxtree: Quad/Octrees, FMM Traversals, Geometric Queries
+========================================================

+.. image:: https://gitlab.tiker.net/inducer/boxtree/badges/main/pipeline.svg
+    :alt: Gitlab Build Status
+    :target: https://gitlab.tiker.net/inducer/boxtree/commits/main
+.. image:: https://github.com/inducer/boxtree/workflows/CI/badge.svg?branch=main&event=push
+    :alt: Github Build Status
+    :target: https://github.com/inducer/boxtree/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
 .. image:: https://badge.fury.io/py/boxtree.png
-    :target: http://pypi.python.org/pypi/boxtree
+    :alt: Python Package Index Release Page
+    :target: https://pypi.org/project/boxtree/
+.. image:: https://zenodo.org/badge/7193697.svg
+    :alt: Zenodo DOI for latest release
+    :target: https://zenodo.org/badge/latestdoi/7193697

-boxtree is a package that, given some point locations in two or three
+``boxtree`` is a package that, given some point locations in two or three
 dimensions, sorts them into an adaptive quad/octree of boxes, efficiently, in
-parallel, using `PyOpenCL <http://mathema.tician.de/software/pyopencl>`_.
+parallel, using `PyOpenCL <https://mathema.tician.de/software/pyopencl>`__.

 It can also generate traversal lists needed for adaptive fast multipole methods
 and related algorithms and tree-based look-up tables for geometric proximity.

-boxtree is under the MIT license.
+``boxtree`` is under the MIT license.

 Resources:

-* `documentation <http://documen.tician.de/boxtree>`_
-* `wiki home page <http://wiki.tiker.net/BoxTree>`_
-* `source code via git <https://github.com/inducer/boxtree>`_
+* `Documentation <https://documen.tician.de/boxtree>`__
+* `PyPI package <https://pypi.org/project/boxtree>`__
+* `Source Code (GitHub) <https://github.com/inducer/boxtree>`__
--- a/boxtree/__init__.py
+++ b/boxtree/__init__.py
-from __future__ import division
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -22,31 +20,60 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

-from boxtree.tree import Tree, TreeWithLinkedPointSources, box_flags_enum
+from boxtree.tree import Tree, TreeOfBoxes, TreeWithLinkedPointSources, box_flags_enum
 from boxtree.tree_build import TreeBuilder
+from boxtree.tree_of_boxes import (
+    coarsen_tree_of_boxes,
+    make_meshmode_mesh_from_leaves,
+    make_tree_of_boxes_root,
+    refine_and_coarsen_tree_of_boxes,
+    refine_tree_of_boxes,
+    uniformly_refine_tree_of_boxes,
+)

-__all__ = [
-    "Tree", "TreeWithLinkedPointSources",
-    "TreeBuilder", "box_flags_enum"]

-__doc__ = """
+__all__ = [
+    "Tree",
+    "TreeBuilder",
+    "TreeOfBoxes",
+    "TreeWithLinkedPointSources",
+    "box_flags_enum",
+    "coarsen_tree_of_boxes",
+    "make_meshmode_mesh_from_leaves",
+    "make_tree_of_boxes_root",
+    "refine_and_coarsen_tree_of_boxes",
+    "refine_tree_of_boxes",
+    "uniformly_refine_tree_of_boxes",
+]
+
+__doc__ = r"""
 :mod:`boxtree` can do three main things:

-* it can sort particles into an adaptively refined quad/octree,
-  see :class:`boxtree.Tree` and :class:`boxtree.TreeBuilder`.
+* it can build quad/octrees (in at least 1D/2D/3D), in one of two modes:
+
+  * First, trees can be built as purely a collection of boxes,
+    see :ref:`tree-of-boxes`.  These trees are typically built iteratively,
+    through refinement and coarsening.
+  * Second, trees can be built from collections of points,
+    so that each box contains only a bounded number of these points.
+    This is the older, less general way to construct trees
+    that has a fairly heavy focus on point-based Fast Multipole
+    Methods (FMMs).
+    See :class:`~Tree` and :class:`~TreeBuilder`.
+    Note that :class:`~Tree` inherits from :class:`TreeOfBoxes`.

 * it can compute fast-multipole-like interaction lists on this tree structure,
  see :mod:`boxtree.traversal`. Note that while this traversal generation
  builds on the result of particle sorting,
  it is completely distinct in the software sense.

-* It can compute geometric lookup structures based on a :class:`boxtree.Tree`,
-  see :mod:`boxtree.geo_lookup`.
+* it can compute geometric lookup structures based on a :class:`Tree`,
+  see :mod:`~boxtree.area_query`.

 Tree modes
 ----------

-:mod:`boxtree` can operate in three 'modes':
+The particle-based :class:`Tree` can operate in three 'modes':

 * one where no distinction is made between sources and targets. In this mode,
  all participants in the interaction are called 'particles'.
@@ -101,8 +128,8 @@ the point sources have their own orderings:
 * **user point source order**
 * **tree point source order** (tree/box-sorted)

-:attr:`TreeWithLinkedPointSources.user_point_source_ids` helps translate point
-source arrays into tree order for processing.
+:attr:`boxtree.tree.TreeWithLinkedPointSources.user_point_source_ids` helps
+translate point source arrays into tree order for processing.

 .. _csr:

@@ -133,4 +160,7 @@ This indexing scheme has the following properties:
  is automatically the end of the previous one.
 """

+# allow namespace packages
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
 # vim: filetype=pyopencl:fdm=marker
--- a/boxtree/area_query.py
+++ b/boxtree/area_query.py
--- a/boxtree/array_context.py
+++ b/boxtree/array_context.py
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
+from arraycontext.pytest import (
+    _PytestPyOpenCLArrayContextFactoryWithClass,
+    register_pytest_array_context_factory,
+)
+
+
+__doc__ = """
+.. autoclass:: PyOpenCLArrayContext
+"""
+
+
+def _acf():
+    import pyopencl as cl
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    return PyOpenCLArrayContext(queue, force_device_scalars=True)
+
+
+class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
+    def transform_loopy_program(self, t_unit):
+        default_ep = t_unit.default_entrypoint
+        options = default_ep.options
+
+        if not (options.return_dict and options.no_numpy):
+            raise ValueError("Loopy kernel passed to call_loopy must "
+                    "have return_dict and no_numpy options set. "
+                    "Did you use arraycontext.make_loopy_program "
+                    "to create this kernel?")
+
+        return super().transform_loopy_program(t_unit)
+
+
+class PytestPyOpenCLArrayContextFactory(
+        _PytestPyOpenCLArrayContextFactoryWithClass):
+    actx_class = PyOpenCLArrayContext
+
+
+register_pytest_array_context_factory("boxtree.pyopencl",
+        PytestPyOpenCLArrayContextFactory)
--- a/boxtree/bounding_box.py
+++ b/boxtree/bounding_box.py
-from __future__ import division
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -23,11 +21,13 @@ THE SOFTWARE.
 """


+import numpy as np
+
 import pyopencl as cl  # noqa
-from boxtree.tools import get_type_moniker
-from pytools import memoize, memoize_method
 from pyopencl.reduction import ReductionTemplate
-import numpy as np
+from pytools import memoize, memoize_method
+
+from boxtree.tools import get_type_moniker


 @memoize
@@ -35,12 +35,12 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
    from boxtree.tools import AXIS_NAMES
    fields = []
    for i in range(dimensions):
-        fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype))
-        fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype))
+        fields.append((f"min_{AXIS_NAMES[i]}", coord_dtype))
+        fields.append((f"max_{AXIS_NAMES[i]}", coord_dtype))

    dtype = np.dtype(fields)
-
-    name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype))
+    type_moniker = get_type_moniker(coord_dtype)
+    name = f"boxtree_bbox_{dimensions}d_{type_moniker}_t"

    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
@@ -132,7 +132,7 @@ class BoundingBoxFinder:

    @memoize_method
    def get_kernel(self, dimensions, coord_dtype, have_radii):
-        bbox_dtype, bbox_cdecl = make_bounding_box_dtype(
+        bbox_dtype, _bbox_cdecl = make_bounding_box_dtype(
                self.context.devices[0], dimensions, coord_dtype)

        from boxtree.tools import AXIS_NAMES
@@ -158,11 +158,7 @@ class BoundingBoxFinder:
        from pytools import single_valued
        coord_dtype = single_valued(coord.dtype for coord in particles)

-        if radii is None:
-            radii_tuple = ()
-        else:
-            radii_tuple = (radii,)
-
+        radii_tuple = () if radii is None else (radii,)
        knl = self.get_kernel(dimensions, coord_dtype,
                # have_radii:
                radii is not None)

--- a/boxtree/constant_one.py
+++ b/boxtree/constant_one.py
+"""
+.. autoclass:: ConstantOneTreeIndependentDataForWrangler
+.. autoclass:: ConstantOneExpansionWrangler
+"""
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+
+from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
+from boxtree.timing import DummyTimingFuture
+
+
+# {{{ constant one wrangler
+
+
+class ConstantOneTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
+    """
+    .. automethod:: __init__
+    """
+
+
+class ConstantOneExpansionWrangler(ExpansionWranglerInterface):
+    """This implements the 'analytical routines' for a Green's function that is
+    constant 1 everywhere. For 'charges' of 'ones', this should get every particle
+    a copy of the particle count.
+
+    Timing results returned by this wrangler contain the field *ops_elapsed*,
+    which counts approximately the number of floating-point operations required.
+    """
+
+    def _get_source_slice(self, ibox):
+        pstart = self.tree.box_source_starts[ibox]
+        return slice(
+                pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
+
+    def _get_target_slice(self, ibox):
+        pstart = self.tree.box_target_starts[ibox]
+        return slice(
+                pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
+
+    def multipole_expansion_zeros(self):
+        return np.zeros(self.tree.nboxes, dtype=np.float64)
+
+    local_expansion_zeros = multipole_expansion_zeros
+
+    def output_zeros(self):
+        return np.zeros(self.tree.ntargets, dtype=np.float64)
+
+    def reorder_sources(self, source_array):
+        return source_array[self.tree.user_source_ids]
+
+    def reorder_potentials(self, potentials):
+        return potentials[self.tree.sorted_target_ids]
+
+    def multipole_expansions_view(self, mpole_exps, level):
+        # FIXME
+        raise NotImplementedError
+
+    def local_expansions_view(self, local_exps, level):
+        # FIXME
+        raise NotImplementedError
+
+    @staticmethod
+    def timing_future(ops):
+        return DummyTimingFuture.from_op_count(ops)
+
+    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+            src_weight_vecs):
+        src_weights, = src_weight_vecs
+        mpoles = self.multipole_expansion_zeros()
+        ops = 0
+
+        for ibox in source_boxes:
+            pslice = self._get_source_slice(ibox)
+            mpoles[ibox] += np.sum(src_weights[pslice])
+            ops += src_weights[pslice].size
+
+        return mpoles, self.timing_future(ops)
+
+    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
+            source_parent_boxes, mpoles):
+        tree = self.tree
+        ops = 0
+
+        # nlevels-1 is the last valid level index
+        # nlevels-2 is the last valid level that could have children
+        #
+        # 3 is the last relevant source_level.
+        # 2 is the last relevant target_level.
+        # (because no level 1 box will be well-separated from another)
+        for source_level in range(tree.nlevels-1, 2, -1):
+            target_level = source_level - 1
+            start, stop = level_start_source_parent_box_nrs[
+                            target_level:target_level+2]
+            for ibox in source_parent_boxes[start:stop]:
+                for child in tree.box_child_ids[:, ibox]:
+                    if child:
+                        mpoles[ibox] += mpoles[child]
+                        ops += 1
+
+        return mpoles, self.timing_future(ops)
+
+    def eval_direct(self, target_boxes, neighbor_sources_starts,
+            neighbor_sources_lists, src_weight_vecs):
+        src_weights, = src_weight_vecs
+        pot = self.output_zeros()
+        ops = 0
+
+        for itgt_box, tgt_ibox in enumerate(target_boxes):
+            tgt_pslice = self._get_target_slice(tgt_ibox)
+
+            src_sum = 0
+            nsrcs = 0
+            start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
+            # print "DIR: %s <- %s" % (tgt_ibox, neighbor_sources_lists[start:end])
+            for src_ibox in neighbor_sources_lists[start:end]:
+                src_pslice = self._get_source_slice(src_ibox)
+                nsrcs += src_weights[src_pslice].size
+
+                src_sum += np.sum(src_weights[src_pslice])
+
+            pot[tgt_pslice] = src_sum
+            ops += pot[tgt_pslice].size * nsrcs
+
+        return pot, self.timing_future(ops)
+
+    def multipole_to_local(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes,
+            starts, lists, mpole_exps):
+        local_exps = self.local_expansion_zeros()
+        ops = 0
+
+        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
+            start, end = starts[itgt_box:itgt_box+2]
+
+            contrib = 0
+            # print tgt_ibox, "<-", lists[start:end]
+            for src_ibox in lists[start:end]:
+                contrib += mpole_exps[src_ibox]
+                ops += 1
+
+            local_exps[tgt_ibox] += contrib
+
+        return local_exps, self.timing_future(ops)
+
+    def eval_multipoles(self,
+            target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
+            mpole_exps):
+        pot = self.output_zeros()
+        ops = 0
+
+        for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level):
+            for itgt_box, tgt_ibox in \
+                    enumerate(target_boxes_by_source_level[level]):
+                tgt_pslice = self._get_target_slice(tgt_ibox)
+
+                contrib = 0
+
+                start, end = ssn.starts[itgt_box:itgt_box+2]
+                for src_ibox in ssn.lists[start:end]:
+                    contrib += mpole_exps[src_ibox]
+
+                pot[tgt_pslice] += contrib
+                ops += pot[tgt_pslice].size * (end - start)
+
+        return pot, self.timing_future(ops)
+
+    def form_locals(self,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+        src_weights, = src_weight_vecs
+        local_exps = self.local_expansion_zeros()
+        ops = 0
+
+        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
+            start, end = starts[itgt_box:itgt_box+2]
+
+            # print "LIST 4", tgt_ibox, "<-", lists[start:end]
+            contrib = 0
+            nsrcs = 0
+            for src_ibox in lists[start:end]:
+                src_pslice = self._get_source_slice(src_ibox)
+                nsrcs += src_weights[src_pslice].size
+
+                contrib += np.sum(src_weights[src_pslice])
+
+            local_exps[tgt_ibox] += contrib
+            ops += nsrcs
+
+        return local_exps, self.timing_future(ops)
+
+    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes, local_exps):
+        ops = 0
+
+        for target_lev in range(1, self.tree.nlevels):
+            start, stop = level_start_target_or_target_parent_box_nrs[
+                    target_lev:target_lev+2]
+            for ibox in target_or_target_parent_boxes[start:stop]:
+                local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
+                ops += 1
+
+        return local_exps, self.timing_future(ops)
+
+    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+        pot = self.output_zeros()
+        ops = 0
+
+        for ibox in target_boxes:
+            tgt_pslice = self._get_target_slice(ibox)
+            pot[tgt_pslice] += local_exps[ibox]
+            ops += pot[tgt_pslice].size
+
+        return pot, self.timing_future(ops)
+
+    def finalize_potentials(self, potentials, template_ary):
+        return potentials
+
+# }}}
+
+# vim: foldmethod=marker:filetype=pyopencl
--- a/boxtree/cost.py
+++ b/boxtree/cost.py
--- a/boxtree/distributed/__init__.py
+++ b/boxtree/distributed/__init__.py
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+__doc__ = """
+High-level Point FMM Interface
+------------------------------
+
+To perform point-FMM, first construct a
+:class:`boxtree.distributed.DistributedFMMRunner` object. The constructor will
+distribute the necessary information from the root rank to all worker ranks. Then,
+the :meth:`boxtree.distributed.DistributedFMMRunner.drive_dfmm` can be used for
+launching FMM.
+
+.. autoclass:: boxtree.distributed.DistributedFMMRunner
+
+
+Distributed Algorithm Overview
+------------------------------
+
+1. Construct the global tree and traversal lists on the root rank and broadcast to
+   all worker ranks.
+2. Partition boxes into disjoint sets, where the number of sets is the number of MPI
+   ranks. (See :ref:`partition-boxes`)
+3. Each rank constructs the local tree and traversal lists independently, according
+   to the partition. (See :ref:`construct-local-tree-traversal`)
+4. Distribute source weights from the root rank to all worker ranks. (See
+   :ref:`distributed-wrangler`)
+5. Each rank independently forms multipole expansions from the leaf nodes of the
+   local tree and propagates the partial multipole expansions upwards.
+6. Communicate multipole expansions so that all ranks have the complete multipole
+   expansions needed.
+7. Each rank independently forms local expansions, propagates the local expansions
+   downwards, and evaluate potentials of target points in its partition. The
+   calculated potentials are then assembled on the root rank.
+
+For step 5-7, see :ref:`distributed-fmm-evaluation`.
+
+Note that step 4-7 may be repeated multiple times with the same tree and traversal
+object built from step 1-3. For example, when iteratively solving a PDE, step 4-7 is
+executed for each iteration of the linear solver.
+
+The next sections will cover the interfaces of these steps.
+
+.. _partition-boxes:
+
+Partition Boxes
+---------------
+
+.. autofunction:: boxtree.distributed.partition.partition_work
+
+.. autoclass:: boxtree.distributed.partition.BoxMasks
+
+.. autofunction:: boxtree.distributed.partition.get_box_masks
+
+.. _construct-local-tree-traversal:
+
+Construct Local Tree and Traversal
+----------------------------------
+
+.. autoclass:: boxtree.distributed.local_tree.LocalTree
+
+.. autofunction:: boxtree.distributed.local_tree.generate_local_tree
+
+.. autofunction:: boxtree.distributed.local_traversal.generate_local_travs
+
+.. _distributed-wrangler:
+
+Distributed Wrangler
+--------------------
+
+.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
+
+.. _distributed-fmm-evaluation:
+
+Distributed FMM Evaluation
+--------------------------
+
+The distributed version of the FMM evaluation shares the same interface as the
+shared-memory version. To evaluate FMM in a distributed manner, use a subclass
+of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
+:func:`boxtree.fmm.drive_fmm`.
+
+"""
+
+import enum
+import warnings
+
+import numpy as np
+from mpi4py import MPI
+
+import pyopencl as cl
+import pyopencl.array
+
+from boxtree.cost import FMMCostModel
+
+
+__all__ = ["DistributedFMMRunner"]
+
+
+# {{{ MPI
+
+@enum.unique
+class MPITags(enum.IntEnum):
+    DIST_WEIGHT = 1
+    GATHER_POTENTIALS = 2
+    REDUCE_POTENTIALS = 3
+    REDUCE_INDICES = 4
+
+
+def dtype_to_mpi(dtype):
+    """ This function translates a numpy datatype into the corresponding type used in
+    mpi4py.
+    """
+    if hasattr(MPI, "_typedict"):
+        typedict = MPI._typedict
+    elif hasattr(MPI, "__TypeDict__"):
+        typedict = MPI.__TypeDict__
+    else:
+        raise RuntimeError(
+            "There is no dictionary to translate from Numpy dtype to MPI type")
+
+    mpi_type = typedict.get(np.dtype(dtype).char, None)
+    if mpi_type is None:
+        raise ValueError(f"Could not convert '{dtype}' to an MPI datatype")
+
+    return mpi_type
+
+# }}}
+
+
+# {{{ DistributedFMMRunner
+
+def make_distributed_wrangler(
+        queue, global_tree, traversal_builder, wrangler_factory,
+        calibration_params, comm):
+    """Helper function for constructing the distributed wrangler on each rank.
+
+    .. note::
+
+        This function needs to be called collectively on all ranks.
+
+    :returns: a tuple of ``(wrangler, src_idx_all_ranks, tgt_idx_all_ranks)``,
+        where the wrangler is constructed according to *wrangler_factory* and
+        the indices are passed to :func:`boxtree.fmm.drive_fmm`.
+    """
+
+    mpi_rank = comm.Get_rank()
+
+    # `tree_in_device_memory` is True if the global tree is in the device memory
+    # `tree_in_device_memory` is False if the global tree is in the host memory
+    #
+    # Note that at this point, only the root rank has the valid global tree, so
+    # we test `tree_in_device_memory` on the root rank and broadcast to all
+    # worker ranks.
+    tree_in_device_memory = None
+    if mpi_rank == 0:
+        tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
+    tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
+
+    # {{{ Broadcast the global tree
+
+    global_tree_host = None
+    if mpi_rank == 0:
+        if tree_in_device_memory:
+            global_tree_host = global_tree.get(queue)
+        else:
+            global_tree_host = global_tree
+
+    global_tree_host = comm.bcast(global_tree_host, root=0)
+
+    global_tree_dev = None
+    if mpi_rank == 0 and tree_in_device_memory:
+        global_tree_dev = global_tree
+    else:
+        global_tree_dev = global_tree_host.to_device(queue)
+    global_tree_dev = global_tree_dev.with_queue(queue)
+
+    global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
+    global_trav_host = global_trav_dev.get(queue)
+    global_trav = global_trav_dev if tree_in_device_memory else global_trav_host
+
+    # }}}
+
+    # {{{ Partition work
+
+    cost_per_box = None
+
+    if mpi_rank == 0:
+        cost_model = FMMCostModel()
+
+        if calibration_params is None:
+            # Use default calibration parameters if not supplied
+            # TODO: should replace the default calibration params with a more
+            # accurate one
+            warnings.warn("Calibration parameters for the cost model are not "
+                          "supplied. The default one will be used.",
+                          stacklevel=2)
+            calibration_params = \
+                FMMCostModel.get_unit_calibration_params()
+
+        # We need to construct a wrangler in order to access `level_orders`
+        global_wrangler = wrangler_factory(global_trav, global_trav)
+
+        cost_per_box = cost_model.cost_per_box(
+            queue, global_trav_dev, global_wrangler.level_orders,
+            calibration_params
+        ).get()
+
+    from boxtree.distributed.partition import partition_work
+    responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
+
+    # }}}
+
+    # {{{ Compute local tree
+
+    from boxtree.distributed.local_tree import generate_local_tree
+    local_tree, src_idx, tgt_idx = generate_local_tree(
+        queue, global_trav_host, responsible_boxes_list, comm)
+
+    # }}}
+
+    # {{{ Gather source indices and target indices of each rank
+
+    src_idx_all_ranks = comm.gather(src_idx, root=0)
+    tgt_idx_all_ranks = comm.gather(tgt_idx, root=0)
+
+    # }}}
+
+    # {{{ Compute traversal object on each rank
+
+    from boxtree.distributed.local_traversal import generate_local_travs
+    local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
+
+    if not tree_in_device_memory:
+        local_trav = local_trav_dev.get(queue=queue)
+    else:
+        local_trav = local_trav_dev.with_queue(None)
+
+    # }}}
+
+    wrangler = wrangler_factory(local_trav, global_trav)
+
+    return wrangler, src_idx_all_ranks, tgt_idx_all_ranks
+
+
+class DistributedFMMRunner:
+    """Helper class for setting up and running distributed point FMM.
+
+    .. automethod:: __init__
+    .. automethod:: drive_dfmm
+    """
+    def __init__(self, queue, global_tree,
+                 traversal_builder,
+                 wrangler_factory,
+                 calibration_params=None, comm=MPI.COMM_WORLD):
+        """Construct a DistributedFMMRunner object.
+
+        :arg global_tree: a :class:`boxtree.Tree` object. This tree could live in the
+            host or the device memory, depending on the wrangler. This argument is
+            only significant on the root rank.
+        :arg traversal_builder: an object which, when called, takes a
+            :class:`pyopencl.CommandQueue` object and a :class:`boxtree.Tree` object,
+            and generates a :class:`boxtree.traversal.FMMTraversalInfo` object from
+            the tree using the command queue.
+        :arg wrangler_factory: an object which, when called, takes the local
+            traversal and the global traversal objects and returns an
+            :class:`boxtree.fmm.ExpansionWranglerInterface` object.
+        :arg calibration_params: Calibration parameters for the cost model,
+            if supplied. The cost model is used for estimating the execution time of
+            each box, which is used for improving load balancing.
+        :arg comm: MPI communicator.
+        """
+        self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
+            make_distributed_wrangler(
+                queue, global_tree, traversal_builder, wrangler_factory,
+                calibration_params, comm)
+
+    def drive_dfmm(self, source_weights, timing_data=None):
+        """Calculate potentials at target points.
+        """
+        from boxtree.fmm import drive_fmm
+        return drive_fmm(
+            self.wrangler, source_weights,
+            timing_data=timing_data,
+            global_src_idx_all_ranks=self.src_idx_all_ranks,
+            global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
+
+# }}}
+
+# vim: fdm=marker
--- a/boxtree/distributed/calculation.py
+++ b/boxtree/distributed/calculation.py
+
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+
+import numpy as np
+from mako.template import Template
+from mpi4py import MPI
+
+import pyopencl as cl
+from pyopencl.elementwise import ElementwiseKernel
+from pyopencl.tools import dtype_to_ctype
+from pytools import memoize_method
+
+from boxtree.distributed import MPITags
+from boxtree.fmm import ExpansionWranglerInterface
+from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
+
+
+logger = logging.getLogger(__name__)
+
+
+# {{{ Distributed FMM wrangler
+
+class DistributedExpansionWrangler(ExpansionWranglerInterface):
+    """Distributed expansion wrangler base class.
+
+    This is an abstract class and should not be directly instantiated. Instead, it is
+    expected that all distributed wranglers should be subclasses of this class.
+
+    .. automethod:: __init__
+    .. automethod:: distribute_source_weights
+    .. automethod:: gather_potential_results
+    .. automethod:: communicate_mpoles
+    """
+    def __init__(self, context, comm, global_traversal,
+                 traversal_in_device_memory,
+                 communicate_mpoles_via_allreduce=False):
+        self.context = context
+        self.comm = comm
+        self.global_traversal = global_traversal
+        self.traversal_in_device_memory = traversal_in_device_memory
+        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
+
+    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+        mpi_rank = self.comm.Get_rank()
+        mpi_size = self.comm.Get_size()
+
+        if mpi_rank == 0:
+            distribute_weight_req = []
+            local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
+
+            for irank in range(mpi_size):
+                local_src_weight_vecs[irank] = [
+                    source_weights[src_idx_all_ranks[irank]]
+                    for source_weights in src_weight_vecs]
+
+                if irank != 0:
+                    distribute_weight_req.append(self.comm.isend(
+                        local_src_weight_vecs[irank], dest=irank,
+                        tag=MPITags.DIST_WEIGHT
+                    ))
+
+            MPI.Request.Waitall(distribute_weight_req)
+            local_src_weight_vecs = local_src_weight_vecs[0]
+        else:
+            local_src_weight_vecs = self.comm.recv(source=0, tag=MPITags.DIST_WEIGHT)
+
+        return local_src_weight_vecs
+
+    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
+        mpi_rank = self.comm.Get_rank()
+        mpi_size = self.comm.Get_size()
+
+        from boxtree.distributed import dtype_to_mpi
+        potentials_mpi_type = dtype_to_mpi(potentials.dtype)
+
+        gathered_potentials = None
+
+        if mpi_rank == 0:
+            # The root rank received calculated potentials from all worker ranks
+            potentials_all_ranks = np.empty((mpi_size,), dtype=object)
+            potentials_all_ranks[0] = potentials
+
+            recv_reqs = []
+
+            for irank in range(1, mpi_size):
+                potentials_all_ranks[irank] = np.empty(
+                    tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
+
+                recv_reqs.append(
+                    self.comm.Irecv(
+                        [potentials_all_ranks[irank], potentials_mpi_type],
+                        source=irank, tag=MPITags.GATHER_POTENTIALS))
+
+            MPI.Request.Waitall(recv_reqs)
+
+            # Assemble potentials from worker ranks together on the root rank
+            gathered_potentials = np.empty(
+                self.global_traversal.tree.ntargets, dtype=potentials.dtype)
+
+            for irank in range(mpi_size):
+                gathered_potentials[tgt_idx_all_ranks[irank]] = (
+                    potentials_all_ranks[irank])
+        else:
+            # Worker ranks send calculated potentials to the root rank
+            self.comm.Send([potentials, potentials_mpi_type],
+                           dest=0, tag=MPITags.GATHER_POTENTIALS)
+
+        return gathered_potentials
+
+    def _slice_mpoles(self, mpoles, slice_indices):
+        if len(slice_indices) == 0:
+            return np.empty((0,), dtype=mpoles.dtype)
+
+        level_start_slice_indices = np.searchsorted(
+            slice_indices, self.traversal.tree.level_start_box_nrs)
+        mpoles_list = []
+
+        for ilevel in range(self.traversal.tree.nlevels):
+            start, stop = level_start_slice_indices[ilevel:ilevel+2]
+            if stop > start:
+                level_start_box_idx, mpoles_current_level = (
+                    self.multipole_expansions_view(mpoles, ilevel))
+                mpoles_list.append(
+                    mpoles_current_level[
+                        slice_indices[start:stop] - level_start_box_idx
+                    ].reshape(-1)
+                )
+
+        return np.concatenate(mpoles_list)
+
+    def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
+        if len(slice_indices) == 0:
+            return
+
+        level_start_slice_indices = np.searchsorted(
+            slice_indices, self.traversal.tree.level_start_box_nrs)
+        mpole_updates_start = 0
+
+        for ilevel in range(self.traversal.tree.nlevels):
+            start, stop = level_start_slice_indices[ilevel:ilevel+2]
+            if stop > start:
+                level_start_box_idx, mpoles_current_level = (
+                    self.multipole_expansions_view(mpoles, ilevel))
+                mpoles_shape = (stop - start,) + mpoles_current_level.shape[1:]
+
+                from pytools import product
+                mpole_updates_end = mpole_updates_start + product(mpoles_shape)
+
+                mpoles_current_level[
+                    slice_indices[start:stop] - level_start_box_idx
+                ] += mpole_updates[
+                    mpole_updates_start:mpole_updates_end
+                ].reshape(mpoles_shape)
+
+                mpole_updates_start = mpole_updates_end
+
+    @memoize_method
+    def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
+        return ElementwiseKernel(
+            self.context,
+            Template(r"""
+                ${box_id_t} *contributing_boxes_list,
+                int subrange_start,
+                int subrange_end,
+                ${box_id_t} *box_to_user_rank_starts,
+                int *box_to_user_rank_lists,
+                char *box_in_subrange
+            """).render(
+                box_id_t=dtype_to_ctype(box_id_dtype),
+            ),
+            Template(r"""
+                ${box_id_t} ibox = contributing_boxes_list[i];
+                ${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
+                ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
+                for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
+                    int useri = box_to_user_rank_lists[iuser];
+                    if(subrange_start <= useri && useri < subrange_end) {
+                        box_in_subrange[i] = 1;
+                    }
+                }
+            """).render(
+                box_id_t=dtype_to_ctype(box_id_dtype)
+            ),
+            "find_boxes_used_by_subrange"
+        )
+
+    def find_boxes_used_by_subrange(
+            self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
+            contributing_boxes_list):
+        """Test whether the multipole expansions of the contributing boxes are used
+        by at least one box in a range.
+
+        :arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
+        :arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
+            indicating the start and end index in *box_to_user_rank_lists* for each
+            box in *contributing_boxes_list*.
+        :arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
+            the users of each box in *contributing_boxes_list*.
+        :returns: a :class:`pyopencl.array.Array` object with the same shape as
+            *contributing_boxes_list*, where the i-th entry is 1 if
+            ``contributing_boxes_list[i]`` is used by at least on box in the
+            subrange specified.
+        """
+        box_in_subrange = cl.array.zeros(
+            contributing_boxes_list.queue,
+            contributing_boxes_list.shape[0],
+            dtype=np.int8
+        )
+        knl = self.find_boxes_used_by_subrange_kernel(
+                self.traversal.tree.box_id_dtype)
+
+        knl(
+            contributing_boxes_list,
+            subrange[0],
+            subrange[1],
+            box_to_user_rank_starts,
+            box_to_user_rank_lists,
+            box_in_subrange
+        )
+
+        return box_in_subrange
+
+    def communicate_mpoles(self, mpole_exps, return_stats=False):
+        """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
+
+        The main idea is to mimic an allreduce as done on a hypercube network, but to
+        decrease the bandwidth cost by sending only information that is relevant to
+        the rank receiving the message.
+
+        .. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
+            Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
+            Lexing Ying, Denis Zorin, and George Biros. “A massively parallel
+            adaptive fast multipole method on heterogeneous architectures."
+            Communications of the ACM 55, no. 5 (2012): 101-109.
+        """
+        mpi_rank = self.comm.Get_rank()
+        mpi_size = self.comm.Get_size()
+        tree = self.traversal.tree
+
+        if self.communicate_mpoles_via_allreduce:
+            # Use MPI allreduce for communicating multipole expressions. It is slower
+            # but might be helpful for debugging purposes.
+            mpole_exps_all = np.zeros_like(mpole_exps)
+            self.comm.Allreduce(mpole_exps, mpole_exps_all)
+            mpole_exps[:] = mpole_exps_all
+            return
+
+        stats = {}
+
+        # contributing_boxes:
+        #
+        # A mask of the the set of boxes that the current process contributes
+        # to. This process contributes to a box when:
+        #
+        # (a) this process owns sources that contribute to the multipole expansion
+        # in the box (via the upward pass) or
+        # (b) this process has received a portion of the multipole expansion in this
+        # box from another process.
+        #
+        # Initially, this set consists of the boxes satisfying condition (a), which
+        # are precisely the boxes owned by this process and their ancestors.
+        if self.traversal_in_device_memory:
+            with cl.CommandQueue(self.context) as queue:
+                contributing_boxes = tree.ancestor_mask.get(queue=queue)
+                responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
+        else:
+            contributing_boxes = tree.ancestor_mask.copy()
+            responsible_boxes_list = tree.responsible_boxes_list
+        contributing_boxes[responsible_boxes_list] = 1
+
+        from boxtree.tools import AllReduceCommPattern
+        comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
+
+        # Temporary buffers for receiving data
+        mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
+        boxes_list_buf = np.empty(tree.nboxes, dtype=tree.box_id_dtype)
+
+        stats["bytes_sent_by_stage"] = []
+        stats["bytes_recvd_by_stage"] = []
+
+        if self.traversal_in_device_memory:
+            box_to_user_rank_starts_dev = \
+                tree.box_to_user_rank_starts.with_queue(None)
+            box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
+        else:
+            with cl.CommandQueue(self.context) as queue:
+                box_to_user_rank_starts_dev = cl.array.to_device(
+                    queue, tree.box_to_user_rank_starts).with_queue(None)
+                box_to_user_rank_lists_dev = cl.array.to_device(
+                    queue, tree.box_to_user_rank_lists).with_queue(None)
+
+        while not comm_pattern.done():
+            send_requests = []
+
+            # Send data to other processors.
+            if comm_pattern.sinks():
+                # Compute the subset of boxes to be sent.
+                message_subrange = comm_pattern.messages()
+
+                contributing_boxes_list = np.nonzero(contributing_boxes)[0].astype(
+                    tree.box_id_dtype
+                )
+
+                with cl.CommandQueue(self.context) as queue:
+                    contributing_boxes_list_dev = cl.array.to_device(
+                        queue, contributing_boxes_list)
+
+                    box_in_subrange = self.find_boxes_used_by_subrange(
+                        message_subrange,
+                        box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
+                        contributing_boxes_list_dev
+                    )
+
+                    box_in_subrange_host = box_in_subrange.get().astype(bool)
+
+                relevant_boxes_list = contributing_boxes_list[
+                    box_in_subrange_host
+                ].astype(tree.box_id_dtype)
+
+                """
+                # Pure Python version for debugging purpose
+                relevant_boxes_list = []
+                for contrib_box in contributing_boxes_list:
+                    iuser_start, iuser_end = tree.box_to_user_rank_starts[
+                        contrib_box:contrib_box + 2
+                    ]
+                    for user_box in tree.box_to_user_rank_lists[
+                            iuser_start:iuser_end]:
+                        if subrange_start <= user_box < subrange_end:
+                            relevant_boxes_list.append(contrib_box)
+                            break
+                """
+
+                relevant_boxes_list = np.array(
+                    relevant_boxes_list, dtype=tree.box_id_dtype
+                )
+
+                relevant_mpole_exps = self._slice_mpoles(
+                    mpole_exps, relevant_boxes_list)
+
+                # Send the box subset to the other processors.
+                for sink in comm_pattern.sinks():
+                    req = self.comm.Isend(relevant_mpole_exps, dest=sink,
+                                          tag=MPITags.REDUCE_POTENTIALS)
+                    send_requests.append(req)
+
+                    req = self.comm.Isend(relevant_boxes_list, dest=sink,
+                                          tag=MPITags.REDUCE_INDICES)
+                    send_requests.append(req)
+
+            # Receive data from other processors.
+            for source in comm_pattern.sources():
+                self.comm.Recv(mpole_exps_buf, source=source,
+                               tag=MPITags.REDUCE_POTENTIALS)
+
+                status = MPI.Status()
+                self.comm.Recv(
+                    boxes_list_buf, source=source, tag=MPITags.REDUCE_INDICES,
+                    status=status)
+                nboxes = status.Get_count() // boxes_list_buf.dtype.itemsize
+
+                # Update data structures.
+                self._update_mpoles(
+                        mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
+
+                contributing_boxes[boxes_list_buf[:nboxes]] = 1
+
+            for req in send_requests:
+                req.wait()
+
+            comm_pattern.advance()
+
+        if return_stats:
+            return stats
+
+    def finalize_potentials(self, potentials, template_ary):
+        if self.comm.Get_rank() == 0:
+            return super().finalize_potentials(potentials, template_ary)
+        else:
+            return None
+
+
+class DistributedFMMLibExpansionWrangler(
+        DistributedExpansionWrangler, FMMLibExpansionWrangler):
+    def __init__(
+            self, context, comm, tree_indep, local_traversal, global_traversal,
+            fmm_level_to_order=None,
+            communicate_mpoles_via_allreduce=False,
+            **kwargs):
+        DistributedExpansionWrangler.__init__(
+            self, context, comm, global_traversal, False,
+            communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
+        FMMLibExpansionWrangler.__init__(
+            self, tree_indep, local_traversal,
+            fmm_level_to_order=fmm_level_to_order, **kwargs)
+
+    # TODO: use log_process like FMMLibExpansionWrangler?
+    def reorder_sources(self, source_array):
+        if self.comm.Get_rank() == 0:
+            return source_array[..., self.global_traversal.tree.user_source_ids]
+        else:
+            return None
+
+    def reorder_potentials(self, potentials):
+        if self.comm.Get_rank() == 0:
+            return potentials[self.global_traversal.tree.sorted_target_ids]
+        else:
+            return None
+
+# }}}
--- a/boxtree/distributed/local_traversal.py
+++ b/boxtree/distributed/local_traversal.py
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
+                 Copyright (C) 2018 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import time
+
+
+logger = logging.getLogger(__name__)
+
+
+def generate_local_travs(
+        queue, local_tree, traversal_builder, merge_close_lists=False):
+    """Generate local traversal from local tree.
+
+    :arg queue: a :class:`pyopencl.CommandQueue` object.
+    :arg local_tree: the local tree of class
+        `boxtree.tools.ImmutableHostDeviceArray` on which the local traversal
+        object will be constructed.
+    :arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and
+        a tree, returns the traversal object based on the tree.
+
+    :return: generated local traversal object in device memory
+    """
+    start_time = time.time()
+
+    local_tree.with_queue(queue)
+
+    # We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the
+    # multipole formation and upward propagation within the rank's responsible boxes
+    # region. Had there not been such restrictions, some sources might be distributed
+    # to more than 1 rank and counted multiple times.
+    local_trav, _ = traversal_builder(
+        queue, local_tree.to_device(queue),
+        source_boxes_mask=local_tree.responsible_boxes_mask.device,
+        source_parent_boxes_mask=local_tree.ancestor_mask.device
+    )
+
+    if merge_close_lists and local_tree.targets_have_extent:
+        local_trav = local_trav.merge_close_lists(queue)
+
+    logger.info("Generate local traversal in %f sec.", time.time() - start_time)
+
+    return local_trav