Skip to content
Commits on Source (825)
# https://editorconfig.org/
# https://github.com/editorconfig/editorconfig-vim
# https://github.com/editorconfig/editorconfig-emacs
root = true
[*]
indent_style = space
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.py]
indent_size = 4
[*.rst]
indent_size = 4
[*.cpp]
indent_size = 2
[*.hpp]
indent_size = 2
# There may be one in doc/
[Makefile]
indent_style = tab
# https://github.com/microsoft/vscode/issues/1679
[*.md]
trim_trailing_whitespace = false
version: 2
updates:
# Set update schedule for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
# vim: sw=4
name: Gitlab mirror
on:
push:
branches:
- main
jobs:
autopush:
name: Automatic push to gitlab.tiker.net
if: startsWith(github.repository, 'inducer/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
mirror_github_to_gitlab
env:
GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }}
# vim: sw=4
name: CI
on:
push:
branches:
- main
pull_request:
schedule:
- cron: '17 3 * * 0'
jobs:
ruff:
name: Ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
pipx install ruff
ruff check
typos:
name: Typos
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@master
pylint:
name: Pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
docs:
name: Documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
pytest3:
name: Conda Python 3
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
py3example:
name: Python 3 Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_examples
downstream_tests:
strategy:
matrix:
downstream_project: [sumpy, pytential]
name: Tests for downstream project ${{ matrix.downstream_project }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
env:
DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }}
run: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
fi
test_downstream "$DOWNSTREAM_PROJECT"
# vim: sw=4
......@@ -12,3 +12,5 @@ setuptools.pth
distribute*egg
distribute*tar.gz
a.out
.cache
Python 2.7 Intel CPU:
script:
- py_version=2.7
- export PYOPENCL_TEST=int:pu
- EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- intel-cl-cpu
except:
- tags
Python 2.7 POCL:
script:
- export PY_EXE=python2.7
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- pocl
except:
- tags
Python 3.5 POCL:
script:
- export PY_EXE=python3.5
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3.5
- pocl
except:
- tags
Python 3 POCL:
script: |
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan V:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-v
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan X:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-x
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL K40:
script: |
export PYOPENCL_TEST=portable:k40
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-k40
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL MPI:
script: |
export PYOPENCL_TEST=portable
export EXTRA_INSTALL="numpy mako mpi4py pybind11"
export PYTEST_ADDOPTS="-m 'mpi' --capture=no"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Examples:
script: |
test -n "$SKIP_EXAMPLES" && exit
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
run_examples
tags:
- python3
- pocl
except:
- tags
Pylint:
script: |
export EXTRA_INSTALL="pybind11 numpy mako matplotlib mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
tags:
- python3
except:
- tags
Documentation:
script: |
EXTRA_INSTALL="pybind11 numpy mako mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
tags:
- python3
Ruff:
script: |
pipx install ruff
ruff check
tags:
- docker-runner
except:
- tags
Downstream:
parallel:
matrix:
- DOWNSTREAM_PROJECT: [sumpy, pytential]
tags:
- large-node
- "docker-runner"
script: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
test_downstream "$DOWNSTREAM_PROJECT"
- arg: py-version
val: '3.10'
- arg: extension-pkg-whitelist
val: pyfmmlib
# Needed for boxtree.tools
- arg: init-hook
val: import sys; sys.setrecursionlimit(2000)
name: test-conda-env
channels:
- conda-forge
- nodefaults
dependencies:
- python=3
- git
- numpy
- pocl
- pocl-cuda
- mako
- pyopencl
- islpy
- pyfmmlib
- mpi4py
# Only needed to make pylint succeed
- matplotlib-base
# This is intended to prevent conda from selecting 'external' (i.e. empty) builds
# of OpenMPI to satisfy the MPI dependency of mpi4py. It did so in May 2024, leading
# to confusing failues saying
# 'libmpi.so.40: cannot open shared object file: No such file or directory'.
# https://github.com/conda-forge/openmpi-feedstock/issues/153
# https://conda-forge.org/docs/user/tipsandtricks/#using-external-message-passing-interface-mpi-libraries
- openmpi>=5=h*
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Kloeckner"
given-names: "Andreas"
orcid: "https://orcid.org/0000-0003-1228-519X"
- family-names: "Fikl"
given-names: "Alex"
- family-names: "Gao"
given-names: "Hao"
- family-names: "Fernando"
given-names: "Isuru"
- family-names: "Wala"
given-names: "Matt"
- family-names: "Wei"
given-names: "Xiaoyu"
title: "boxtree"
version: 2023.1
date-released: 2023-10-01
doi: 10.5281/zenodo.8397396
url: "https://github.com/inducer/boxtree"
license: MIT
include test/*.py
include examples/*.py
include doc/*.rst
include doc/Makefile
include doc/*.py
include doc/images/*.png
include doc/_static/*.css
include doc/_templates/*.html
include configure.py
include Makefile.in
include README.rst
include requirements.txt
boxtree
=======
boxtree: Quad/Octrees, FMM Traversals, Geometric Queries
========================================================
.. image:: https://gitlab.tiker.net/inducer/boxtree/badges/main/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/boxtree/commits/main
.. image:: https://github.com/inducer/boxtree/workflows/CI/badge.svg?branch=main&event=push
:alt: Github Build Status
:target: https://github.com/inducer/boxtree/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
.. image:: https://badge.fury.io/py/boxtree.png
:target: http://pypi.python.org/pypi/boxtree
:alt: Python Package Index Release Page
:target: https://pypi.org/project/boxtree/
.. image:: https://zenodo.org/badge/7193697.svg
:alt: Zenodo DOI for latest release
:target: https://zenodo.org/badge/latestdoi/7193697
boxtree is a package that, given some point locations in two or three
``boxtree`` is a package that, given some point locations in two or three
dimensions, sorts them into an adaptive quad/octree of boxes, efficiently, in
parallel, using `PyOpenCL <http://mathema.tician.de/software/pyopencl>`_.
parallel, using `PyOpenCL <https://mathema.tician.de/software/pyopencl>`__.
It can also generate traversal lists needed for adaptive fast multipole methods
and related algorithms and tree-based look-up tables for geometric proximity.
boxtree is under the MIT license.
``boxtree`` is under the MIT license.
Resources:
* `documentation <http://documen.tician.de/boxtree>`_
* `wiki home page <http://wiki.tiker.net/BoxTree>`_
* `source code via git <https://github.com/inducer/boxtree>`_
* `Documentation <https://documen.tician.de/boxtree>`__
* `PyPI package <https://pypi.org/project/boxtree>`__
* `Source Code (GitHub) <https://github.com/inducer/boxtree>`__
from __future__ import division
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
......@@ -22,31 +20,60 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from boxtree.tree import Tree, TreeWithLinkedPointSources, box_flags_enum
from boxtree.tree import Tree, TreeOfBoxes, TreeWithLinkedPointSources, box_flags_enum
from boxtree.tree_build import TreeBuilder
from boxtree.tree_of_boxes import (
coarsen_tree_of_boxes,
make_meshmode_mesh_from_leaves,
make_tree_of_boxes_root,
refine_and_coarsen_tree_of_boxes,
refine_tree_of_boxes,
uniformly_refine_tree_of_boxes,
)
__all__ = [
"Tree", "TreeWithLinkedPointSources",
"TreeBuilder", "box_flags_enum"]
__doc__ = """
__all__ = [
"Tree",
"TreeBuilder",
"TreeOfBoxes",
"TreeWithLinkedPointSources",
"box_flags_enum",
"coarsen_tree_of_boxes",
"make_meshmode_mesh_from_leaves",
"make_tree_of_boxes_root",
"refine_and_coarsen_tree_of_boxes",
"refine_tree_of_boxes",
"uniformly_refine_tree_of_boxes",
]
__doc__ = r"""
:mod:`boxtree` can do three main things:
* it can sort particles into an adaptively refined quad/octree,
see :class:`boxtree.Tree` and :class:`boxtree.TreeBuilder`.
* it can build quad/octrees (in at least 1D/2D/3D), in one of two modes:
* First, trees can be built as purely a collection of boxes,
see :ref:`tree-of-boxes`. These trees are typically built iteratively,
through refinement and coarsening.
* Second, trees can be built from collections of points,
so that each box contains only a bounded number of these points.
This is the older, less general way to construct trees
that has a fairly heavy focus on point-based Fast Multipole
Methods (FMMs).
See :class:`~Tree` and :class:`~TreeBuilder`.
Note that :class:`~Tree` inherits from :class:`TreeOfBoxes`.
* it can compute fast-multipole-like interaction lists on this tree structure,
see :mod:`boxtree.traversal`. Note that while this traversal generation
builds on the result of particle sorting,
it is completely distinct in the software sense.
* It can compute geometric lookup structures based on a :class:`boxtree.Tree`,
see :mod:`boxtree.geo_lookup`.
* it can compute geometric lookup structures based on a :class:`Tree`,
see :mod:`~boxtree.area_query`.
Tree modes
----------
:mod:`boxtree` can operate in three 'modes':
The particle-based :class:`Tree` can operate in three 'modes':
* one where no distinction is made between sources and targets. In this mode,
all participants in the interaction are called 'particles'.
......@@ -101,8 +128,8 @@ the point sources have their own orderings:
* **user point source order**
* **tree point source order** (tree/box-sorted)
:attr:`TreeWithLinkedPointSources.user_point_source_ids` helps translate point
source arrays into tree order for processing.
:attr:`boxtree.tree.TreeWithLinkedPointSources.user_point_source_ids` helps
translate point source arrays into tree order for processing.
.. _csr:
......@@ -133,4 +160,7 @@ This indexing scheme has the following properties:
is automatically the end of the previous one.
"""
# allow namespace packages
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
# vim: filetype=pyopencl:fdm=marker
This diff is collapsed.
__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
from arraycontext.pytest import (
_PytestPyOpenCLArrayContextFactoryWithClass,
register_pytest_array_context_factory,
)
__doc__ = """
.. autoclass:: PyOpenCLArrayContext
"""
def _acf():
import pyopencl as cl
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
return PyOpenCLArrayContext(queue, force_device_scalars=True)
class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
def transform_loopy_program(self, t_unit):
default_ep = t_unit.default_entrypoint
options = default_ep.options
if not (options.return_dict and options.no_numpy):
raise ValueError("Loopy kernel passed to call_loopy must "
"have return_dict and no_numpy options set. "
"Did you use arraycontext.make_loopy_program "
"to create this kernel?")
return super().transform_loopy_program(t_unit)
class PytestPyOpenCLArrayContextFactory(
_PytestPyOpenCLArrayContextFactoryWithClass):
actx_class = PyOpenCLArrayContext
register_pytest_array_context_factory("boxtree.pyopencl",
PytestPyOpenCLArrayContextFactory)
from __future__ import division
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
......@@ -23,11 +21,13 @@ THE SOFTWARE.
"""
import numpy as np
import pyopencl as cl # noqa
from boxtree.tools import get_type_moniker
from pytools import memoize, memoize_method
from pyopencl.reduction import ReductionTemplate
import numpy as np
from pytools import memoize, memoize_method
from boxtree.tools import get_type_moniker
@memoize
......@@ -35,12 +35,12 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
from boxtree.tools import AXIS_NAMES
fields = []
for i in range(dimensions):
fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype))
fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype))
fields.append((f"min_{AXIS_NAMES[i]}", coord_dtype))
fields.append((f"max_{AXIS_NAMES[i]}", coord_dtype))
dtype = np.dtype(fields)
name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype))
type_moniker = get_type_moniker(coord_dtype)
name = f"boxtree_bbox_{dimensions}d_{type_moniker}_t"
from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
......@@ -132,7 +132,7 @@ class BoundingBoxFinder:
@memoize_method
def get_kernel(self, dimensions, coord_dtype, have_radii):
bbox_dtype, bbox_cdecl = make_bounding_box_dtype(
bbox_dtype, _bbox_cdecl = make_bounding_box_dtype(
self.context.devices[0], dimensions, coord_dtype)
from boxtree.tools import AXIS_NAMES
......@@ -158,11 +158,7 @@ class BoundingBoxFinder:
from pytools import single_valued
coord_dtype = single_valued(coord.dtype for coord in particles)
if radii is None:
radii_tuple = ()
else:
radii_tuple = (radii,)
radii_tuple = () if radii is None else (radii,)
knl = self.get_kernel(dimensions, coord_dtype,
# have_radii:
radii is not None)
......
"""
.. autoclass:: ConstantOneTreeIndependentDataForWrangler
.. autoclass:: ConstantOneExpansionWrangler
"""
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
from boxtree.timing import DummyTimingFuture
# {{{ constant one wrangler
class ConstantOneTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
"""
.. automethod:: __init__
"""
class ConstantOneExpansionWrangler(ExpansionWranglerInterface):
"""This implements the 'analytical routines' for a Green's function that is
constant 1 everywhere. For 'charges' of 'ones', this should get every particle
a copy of the particle count.
Timing results returned by this wrangler contain the field *ops_elapsed*,
which counts approximately the number of floating-point operations required.
"""
def _get_source_slice(self, ibox):
pstart = self.tree.box_source_starts[ibox]
return slice(
pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
def _get_target_slice(self, ibox):
pstart = self.tree.box_target_starts[ibox]
return slice(
pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
def multipole_expansion_zeros(self):
return np.zeros(self.tree.nboxes, dtype=np.float64)
local_expansion_zeros = multipole_expansion_zeros
def output_zeros(self):
return np.zeros(self.tree.ntargets, dtype=np.float64)
def reorder_sources(self, source_array):
return source_array[self.tree.user_source_ids]
def reorder_potentials(self, potentials):
return potentials[self.tree.sorted_target_ids]
def multipole_expansions_view(self, mpole_exps, level):
# FIXME
raise NotImplementedError
def local_expansions_view(self, local_exps, level):
# FIXME
raise NotImplementedError
@staticmethod
def timing_future(ops):
return DummyTimingFuture.from_op_count(ops)
def form_multipoles(self, level_start_source_box_nrs, source_boxes,
src_weight_vecs):
src_weights, = src_weight_vecs
mpoles = self.multipole_expansion_zeros()
ops = 0
for ibox in source_boxes:
pslice = self._get_source_slice(ibox)
mpoles[ibox] += np.sum(src_weights[pslice])
ops += src_weights[pslice].size
return mpoles, self.timing_future(ops)
def coarsen_multipoles(self, level_start_source_parent_box_nrs,
source_parent_boxes, mpoles):
tree = self.tree
ops = 0
# nlevels-1 is the last valid level index
# nlevels-2 is the last valid level that could have children
#
# 3 is the last relevant source_level.
# 2 is the last relevant target_level.
# (because no level 1 box will be well-separated from another)
for source_level in range(tree.nlevels-1, 2, -1):
target_level = source_level - 1
start, stop = level_start_source_parent_box_nrs[
target_level:target_level+2]
for ibox in source_parent_boxes[start:stop]:
for child in tree.box_child_ids[:, ibox]:
if child:
mpoles[ibox] += mpoles[child]
ops += 1
return mpoles, self.timing_future(ops)
def eval_direct(self, target_boxes, neighbor_sources_starts,
neighbor_sources_lists, src_weight_vecs):
src_weights, = src_weight_vecs
pot = self.output_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_boxes):
tgt_pslice = self._get_target_slice(tgt_ibox)
src_sum = 0
nsrcs = 0
start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
# print "DIR: %s <- %s" % (tgt_ibox, neighbor_sources_lists[start:end])
for src_ibox in neighbor_sources_lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
src_sum += np.sum(src_weights[src_pslice])
pot[tgt_pslice] = src_sum
ops += pot[tgt_pslice].size * nsrcs
return pot, self.timing_future(ops)
def multipole_to_local(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes,
starts, lists, mpole_exps):
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
contrib = 0
# print tgt_ibox, "<-", lists[start:end]
for src_ibox in lists[start:end]:
contrib += mpole_exps[src_ibox]
ops += 1
local_exps[tgt_ibox] += contrib
return local_exps, self.timing_future(ops)
def eval_multipoles(self,
target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
mpole_exps):
pot = self.output_zeros()
ops = 0
for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level):
for itgt_box, tgt_ibox in \
enumerate(target_boxes_by_source_level[level]):
tgt_pslice = self._get_target_slice(tgt_ibox)
contrib = 0
start, end = ssn.starts[itgt_box:itgt_box+2]
for src_ibox in ssn.lists[start:end]:
contrib += mpole_exps[src_ibox]
pot[tgt_pslice] += contrib
ops += pot[tgt_pslice].size * (end - start)
return pot, self.timing_future(ops)
def form_locals(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, starts, lists, src_weight_vecs):
src_weights, = src_weight_vecs
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
# print "LIST 4", tgt_ibox, "<-", lists[start:end]
contrib = 0
nsrcs = 0
for src_ibox in lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
contrib += np.sum(src_weights[src_pslice])
local_exps[tgt_ibox] += contrib
ops += nsrcs
return local_exps, self.timing_future(ops)
def refine_locals(self, level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, local_exps):
ops = 0
for target_lev in range(1, self.tree.nlevels):
start, stop = level_start_target_or_target_parent_box_nrs[
target_lev:target_lev+2]
for ibox in target_or_target_parent_boxes[start:stop]:
local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
ops += 1
return local_exps, self.timing_future(ops)
def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
pot = self.output_zeros()
ops = 0
for ibox in target_boxes:
tgt_pslice = self._get_target_slice(ibox)
pot[tgt_pslice] += local_exps[ibox]
ops += pot[tgt_pslice].size
return pot, self.timing_future(ops)
def finalize_potentials(self, potentials, template_ary):
return potentials
# }}}
# vim: foldmethod=marker:filetype=pyopencl
This diff is collapsed.
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
High-level Point FMM Interface
------------------------------
To perform point-FMM, first construct a
:class:`boxtree.distributed.DistributedFMMRunner` object. The constructor will
distribute the necessary information from the root rank to all worker ranks. Then,
the :meth:`boxtree.distributed.DistributedFMMRunner.drive_dfmm` can be used for
launching FMM.
.. autoclass:: boxtree.distributed.DistributedFMMRunner
Distributed Algorithm Overview
------------------------------
1. Construct the global tree and traversal lists on the root rank and broadcast to
all worker ranks.
2. Partition boxes into disjoint sets, where the number of sets is the number of MPI
ranks. (See :ref:`partition-boxes`)
3. Each rank constructs the local tree and traversal lists independently, according
to the partition. (See :ref:`construct-local-tree-traversal`)
4. Distribute source weights from the root rank to all worker ranks. (See
:ref:`distributed-wrangler`)
5. Each rank independently forms multipole expansions from the leaf nodes of the
local tree and propagates the partial multipole expansions upwards.
6. Communicate multipole expansions so that all ranks have the complete multipole
expansions needed.
7. Each rank independently forms local expansions, propagates the local expansions
downwards, and evaluate potentials of target points in its partition. The
calculated potentials are then assembled on the root rank.
For step 5-7, see :ref:`distributed-fmm-evaluation`.
Note that step 4-7 may be repeated multiple times with the same tree and traversal
object built from step 1-3. For example, when iteratively solving a PDE, step 4-7 is
executed for each iteration of the linear solver.
The next sections will cover the interfaces of these steps.
.. _partition-boxes:
Partition Boxes
---------------
.. autofunction:: boxtree.distributed.partition.partition_work
.. autoclass:: boxtree.distributed.partition.BoxMasks
.. autofunction:: boxtree.distributed.partition.get_box_masks
.. _construct-local-tree-traversal:
Construct Local Tree and Traversal
----------------------------------
.. autoclass:: boxtree.distributed.local_tree.LocalTree
.. autofunction:: boxtree.distributed.local_tree.generate_local_tree
.. autofunction:: boxtree.distributed.local_traversal.generate_local_travs
.. _distributed-wrangler:
Distributed Wrangler
--------------------
.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
.. _distributed-fmm-evaluation:
Distributed FMM Evaluation
--------------------------
The distributed version of the FMM evaluation shares the same interface as the
shared-memory version. To evaluate FMM in a distributed manner, use a subclass
of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
:func:`boxtree.fmm.drive_fmm`.
"""
import enum
import warnings
import numpy as np
from mpi4py import MPI
import pyopencl as cl
import pyopencl.array
from boxtree.cost import FMMCostModel
__all__ = ["DistributedFMMRunner"]
# {{{ MPI
@enum.unique
class MPITags(enum.IntEnum):
DIST_WEIGHT = 1
GATHER_POTENTIALS = 2
REDUCE_POTENTIALS = 3
REDUCE_INDICES = 4
def dtype_to_mpi(dtype):
""" This function translates a numpy datatype into the corresponding type used in
mpi4py.
"""
if hasattr(MPI, "_typedict"):
typedict = MPI._typedict
elif hasattr(MPI, "__TypeDict__"):
typedict = MPI.__TypeDict__
else:
raise RuntimeError(
"There is no dictionary to translate from Numpy dtype to MPI type")
mpi_type = typedict.get(np.dtype(dtype).char, None)
if mpi_type is None:
raise ValueError(f"Could not convert '{dtype}' to an MPI datatype")
return mpi_type
# }}}
# {{{ DistributedFMMRunner
def make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm):
"""Helper function for constructing the distributed wrangler on each rank.
.. note::
This function needs to be called collectively on all ranks.
:returns: a tuple of ``(wrangler, src_idx_all_ranks, tgt_idx_all_ranks)``,
where the wrangler is constructed according to *wrangler_factory* and
the indices are passed to :func:`boxtree.fmm.drive_fmm`.
"""
mpi_rank = comm.Get_rank()
# `tree_in_device_memory` is True if the global tree is in the device memory
# `tree_in_device_memory` is False if the global tree is in the host memory
#
# Note that at this point, only the root rank has the valid global tree, so
# we test `tree_in_device_memory` on the root rank and broadcast to all
# worker ranks.
tree_in_device_memory = None
if mpi_rank == 0:
tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
# {{{ Broadcast the global tree
global_tree_host = None
if mpi_rank == 0:
if tree_in_device_memory:
global_tree_host = global_tree.get(queue)
else:
global_tree_host = global_tree
global_tree_host = comm.bcast(global_tree_host, root=0)
global_tree_dev = None
if mpi_rank == 0 and tree_in_device_memory:
global_tree_dev = global_tree
else:
global_tree_dev = global_tree_host.to_device(queue)
global_tree_dev = global_tree_dev.with_queue(queue)
global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
global_trav_host = global_trav_dev.get(queue)
global_trav = global_trav_dev if tree_in_device_memory else global_trav_host
# }}}
# {{{ Partition work
cost_per_box = None
if mpi_rank == 0:
cost_model = FMMCostModel()
if calibration_params is None:
# Use default calibration parameters if not supplied
# TODO: should replace the default calibration params with a more
# accurate one
warnings.warn("Calibration parameters for the cost model are not "
"supplied. The default one will be used.",
stacklevel=2)
calibration_params = \
FMMCostModel.get_unit_calibration_params()
# We need to construct a wrangler in order to access `level_orders`
global_wrangler = wrangler_factory(global_trav, global_trav)
cost_per_box = cost_model.cost_per_box(
queue, global_trav_dev, global_wrangler.level_orders,
calibration_params
).get()
from boxtree.distributed.partition import partition_work
responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
# }}}
# {{{ Compute local tree
from boxtree.distributed.local_tree import generate_local_tree
local_tree, src_idx, tgt_idx = generate_local_tree(
queue, global_trav_host, responsible_boxes_list, comm)
# }}}
# {{{ Gather source indices and target indices of each rank
src_idx_all_ranks = comm.gather(src_idx, root=0)
tgt_idx_all_ranks = comm.gather(tgt_idx, root=0)
# }}}
# {{{ Compute traversal object on each rank
from boxtree.distributed.local_traversal import generate_local_travs
local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
if not tree_in_device_memory:
local_trav = local_trav_dev.get(queue=queue)
else:
local_trav = local_trav_dev.with_queue(None)
# }}}
wrangler = wrangler_factory(local_trav, global_trav)
return wrangler, src_idx_all_ranks, tgt_idx_all_ranks
class DistributedFMMRunner:
"""Helper class for setting up and running distributed point FMM.
.. automethod:: __init__
.. automethod:: drive_dfmm
"""
def __init__(self, queue, global_tree,
traversal_builder,
wrangler_factory,
calibration_params=None, comm=MPI.COMM_WORLD):
"""Construct a DistributedFMMRunner object.
:arg global_tree: a :class:`boxtree.Tree` object. This tree could live in the
host or the device memory, depending on the wrangler. This argument is
only significant on the root rank.
:arg traversal_builder: an object which, when called, takes a
:class:`pyopencl.CommandQueue` object and a :class:`boxtree.Tree` object,
and generates a :class:`boxtree.traversal.FMMTraversalInfo` object from
the tree using the command queue.
:arg wrangler_factory: an object which, when called, takes the local
traversal and the global traversal objects and returns an
:class:`boxtree.fmm.ExpansionWranglerInterface` object.
:arg calibration_params: Calibration parameters for the cost model,
if supplied. The cost model is used for estimating the execution time of
each box, which is used for improving load balancing.
:arg comm: MPI communicator.
"""
self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm)
def drive_dfmm(self, source_weights, timing_data=None):
"""Calculate potentials at target points.
"""
from boxtree.fmm import drive_fmm
return drive_fmm(
self.wrangler, source_weights,
timing_data=timing_data,
global_src_idx_all_ranks=self.src_idx_all_ranks,
global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
# }}}
# vim: fdm=marker
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import numpy as np
from mako.template import Template
from mpi4py import MPI
import pyopencl as cl
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
from boxtree.distributed import MPITags
from boxtree.fmm import ExpansionWranglerInterface
from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
logger = logging.getLogger(__name__)
# {{{ Distributed FMM wrangler
class DistributedExpansionWrangler(ExpansionWranglerInterface):
"""Distributed expansion wrangler base class.
This is an abstract class and should not be directly instantiated. Instead, it is
expected that all distributed wranglers should be subclasses of this class.
.. automethod:: __init__
.. automethod:: distribute_source_weights
.. automethod:: gather_potential_results
.. automethod:: communicate_mpoles
"""
def __init__(self, context, comm, global_traversal,
traversal_in_device_memory,
communicate_mpoles_via_allreduce=False):
self.context = context
self.comm = comm
self.global_traversal = global_traversal
self.traversal_in_device_memory = traversal_in_device_memory
self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
if mpi_rank == 0:
distribute_weight_req = []
local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
for irank in range(mpi_size):
local_src_weight_vecs[irank] = [
source_weights[src_idx_all_ranks[irank]]
for source_weights in src_weight_vecs]
if irank != 0:
distribute_weight_req.append(self.comm.isend(
local_src_weight_vecs[irank], dest=irank,
tag=MPITags.DIST_WEIGHT
))
MPI.Request.Waitall(distribute_weight_req)
local_src_weight_vecs = local_src_weight_vecs[0]
else:
local_src_weight_vecs = self.comm.recv(source=0, tag=MPITags.DIST_WEIGHT)
return local_src_weight_vecs
def gather_potential_results(self, potentials, tgt_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
from boxtree.distributed import dtype_to_mpi
potentials_mpi_type = dtype_to_mpi(potentials.dtype)
gathered_potentials = None
if mpi_rank == 0:
# The root rank received calculated potentials from all worker ranks
potentials_all_ranks = np.empty((mpi_size,), dtype=object)
potentials_all_ranks[0] = potentials
recv_reqs = []
for irank in range(1, mpi_size):
potentials_all_ranks[irank] = np.empty(
tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
recv_reqs.append(
self.comm.Irecv(
[potentials_all_ranks[irank], potentials_mpi_type],
source=irank, tag=MPITags.GATHER_POTENTIALS))
MPI.Request.Waitall(recv_reqs)
# Assemble potentials from worker ranks together on the root rank
gathered_potentials = np.empty(
self.global_traversal.tree.ntargets, dtype=potentials.dtype)
for irank in range(mpi_size):
gathered_potentials[tgt_idx_all_ranks[irank]] = (
potentials_all_ranks[irank])
else:
# Worker ranks send calculated potentials to the root rank
self.comm.Send([potentials, potentials_mpi_type],
dest=0, tag=MPITags.GATHER_POTENTIALS)
return gathered_potentials
def _slice_mpoles(self, mpoles, slice_indices):
if len(slice_indices) == 0:
return np.empty((0,), dtype=mpoles.dtype)
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
mpoles_list = []
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_list.append(
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
].reshape(-1)
)
return np.concatenate(mpoles_list)
def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
if len(slice_indices) == 0:
return
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
mpole_updates_start = 0
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_shape = (stop - start,) + mpoles_current_level.shape[1:]
from pytools import product
mpole_updates_end = mpole_updates_start + product(mpoles_shape)
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
] += mpole_updates[
mpole_updates_start:mpole_updates_end
].reshape(mpoles_shape)
mpole_updates_start = mpole_updates_end
@memoize_method
def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
return ElementwiseKernel(
self.context,
Template(r"""
${box_id_t} *contributing_boxes_list,
int subrange_start,
int subrange_end,
${box_id_t} *box_to_user_rank_starts,
int *box_to_user_rank_lists,
char *box_in_subrange
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
),
Template(r"""
${box_id_t} ibox = contributing_boxes_list[i];
${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
int useri = box_to_user_rank_lists[iuser];
if(subrange_start <= useri && useri < subrange_end) {
box_in_subrange[i] = 1;
}
}
""").render(
box_id_t=dtype_to_ctype(box_id_dtype)
),
"find_boxes_used_by_subrange"
)
def find_boxes_used_by_subrange(
self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
contributing_boxes_list):
"""Test whether the multipole expansions of the contributing boxes are used
by at least one box in a range.
:arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
:arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
indicating the start and end index in *box_to_user_rank_lists* for each
box in *contributing_boxes_list*.
:arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
the users of each box in *contributing_boxes_list*.
:returns: a :class:`pyopencl.array.Array` object with the same shape as
*contributing_boxes_list*, where the i-th entry is 1 if
``contributing_boxes_list[i]`` is used by at least on box in the
subrange specified.
"""
box_in_subrange = cl.array.zeros(
contributing_boxes_list.queue,
contributing_boxes_list.shape[0],
dtype=np.int8
)
knl = self.find_boxes_used_by_subrange_kernel(
self.traversal.tree.box_id_dtype)
knl(
contributing_boxes_list,
subrange[0],
subrange[1],
box_to_user_rank_starts,
box_to_user_rank_lists,
box_in_subrange
)
return box_in_subrange
def communicate_mpoles(self, mpole_exps, return_stats=False):
"""Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
The main idea is to mimic an allreduce as done on a hypercube network, but to
decrease the bandwidth cost by sending only information that is relevant to
the rank receiving the message.
.. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
Lexing Ying, Denis Zorin, and George Biros. “A massively parallel
adaptive fast multipole method on heterogeneous architectures."
Communications of the ACM 55, no. 5 (2012): 101-109.
"""
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
tree = self.traversal.tree
if self.communicate_mpoles_via_allreduce:
# Use MPI allreduce for communicating multipole expressions. It is slower
# but might be helpful for debugging purposes.
mpole_exps_all = np.zeros_like(mpole_exps)
self.comm.Allreduce(mpole_exps, mpole_exps_all)
mpole_exps[:] = mpole_exps_all
return
stats = {}
# contributing_boxes:
#
# A mask of the the set of boxes that the current process contributes
# to. This process contributes to a box when:
#
# (a) this process owns sources that contribute to the multipole expansion
# in the box (via the upward pass) or
# (b) this process has received a portion of the multipole expansion in this
# box from another process.
#
# Initially, this set consists of the boxes satisfying condition (a), which
# are precisely the boxes owned by this process and their ancestors.
if self.traversal_in_device_memory:
with cl.CommandQueue(self.context) as queue:
contributing_boxes = tree.ancestor_mask.get(queue=queue)
responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
else:
contributing_boxes = tree.ancestor_mask.copy()
responsible_boxes_list = tree.responsible_boxes_list
contributing_boxes[responsible_boxes_list] = 1
from boxtree.tools import AllReduceCommPattern
comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
# Temporary buffers for receiving data
mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
boxes_list_buf = np.empty(tree.nboxes, dtype=tree.box_id_dtype)
stats["bytes_sent_by_stage"] = []
stats["bytes_recvd_by_stage"] = []
if self.traversal_in_device_memory:
box_to_user_rank_starts_dev = \
tree.box_to_user_rank_starts.with_queue(None)
box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
else:
with cl.CommandQueue(self.context) as queue:
box_to_user_rank_starts_dev = cl.array.to_device(
queue, tree.box_to_user_rank_starts).with_queue(None)
box_to_user_rank_lists_dev = cl.array.to_device(
queue, tree.box_to_user_rank_lists).with_queue(None)
while not comm_pattern.done():
send_requests = []
# Send data to other processors.
if comm_pattern.sinks():
# Compute the subset of boxes to be sent.
message_subrange = comm_pattern.messages()
contributing_boxes_list = np.nonzero(contributing_boxes)[0].astype(
tree.box_id_dtype
)
with cl.CommandQueue(self.context) as queue:
contributing_boxes_list_dev = cl.array.to_device(
queue, contributing_boxes_list)
box_in_subrange = self.find_boxes_used_by_subrange(
message_subrange,
box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
contributing_boxes_list_dev
)
box_in_subrange_host = box_in_subrange.get().astype(bool)
relevant_boxes_list = contributing_boxes_list[
box_in_subrange_host
].astype(tree.box_id_dtype)
"""
# Pure Python version for debugging purpose
relevant_boxes_list = []
for contrib_box in contributing_boxes_list:
iuser_start, iuser_end = tree.box_to_user_rank_starts[
contrib_box:contrib_box + 2
]
for user_box in tree.box_to_user_rank_lists[
iuser_start:iuser_end]:
if subrange_start <= user_box < subrange_end:
relevant_boxes_list.append(contrib_box)
break
"""
relevant_boxes_list = np.array(
relevant_boxes_list, dtype=tree.box_id_dtype
)
relevant_mpole_exps = self._slice_mpoles(
mpole_exps, relevant_boxes_list)
# Send the box subset to the other processors.
for sink in comm_pattern.sinks():
req = self.comm.Isend(relevant_mpole_exps, dest=sink,
tag=MPITags.REDUCE_POTENTIALS)
send_requests.append(req)
req = self.comm.Isend(relevant_boxes_list, dest=sink,
tag=MPITags.REDUCE_INDICES)
send_requests.append(req)
# Receive data from other processors.
for source in comm_pattern.sources():
self.comm.Recv(mpole_exps_buf, source=source,
tag=MPITags.REDUCE_POTENTIALS)
status = MPI.Status()
self.comm.Recv(
boxes_list_buf, source=source, tag=MPITags.REDUCE_INDICES,
status=status)
nboxes = status.Get_count() // boxes_list_buf.dtype.itemsize
# Update data structures.
self._update_mpoles(
mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
contributing_boxes[boxes_list_buf[:nboxes]] = 1
for req in send_requests:
req.wait()
comm_pattern.advance()
if return_stats:
return stats
def finalize_potentials(self, potentials, template_ary):
if self.comm.Get_rank() == 0:
return super().finalize_potentials(potentials, template_ary)
else:
return None
class DistributedFMMLibExpansionWrangler(
DistributedExpansionWrangler, FMMLibExpansionWrangler):
def __init__(
self, context, comm, tree_indep, local_traversal, global_traversal,
fmm_level_to_order=None,
communicate_mpoles_via_allreduce=False,
**kwargs):
DistributedExpansionWrangler.__init__(
self, context, comm, global_traversal, False,
communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
FMMLibExpansionWrangler.__init__(
self, tree_indep, local_traversal,
fmm_level_to_order=fmm_level_to_order, **kwargs)
# TODO: use log_process like FMMLibExpansionWrangler?
def reorder_sources(self, source_array):
if self.comm.Get_rank() == 0:
return source_array[..., self.global_traversal.tree.user_source_ids]
else:
return None
def reorder_potentials(self, potentials):
if self.comm.Get_rank() == 0:
return potentials[self.global_traversal.tree.sorted_target_ids]
else:
return None
# }}}
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import time
logger = logging.getLogger(__name__)
def generate_local_travs(
queue, local_tree, traversal_builder, merge_close_lists=False):
"""Generate local traversal from local tree.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg local_tree: the local tree of class
`boxtree.tools.ImmutableHostDeviceArray` on which the local traversal
object will be constructed.
:arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and
a tree, returns the traversal object based on the tree.
:return: generated local traversal object in device memory
"""
start_time = time.time()
local_tree.with_queue(queue)
# We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the
# multipole formation and upward propagation within the rank's responsible boxes
# region. Had there not been such restrictions, some sources might be distributed
# to more than 1 rank and counted multiple times.
local_trav, _ = traversal_builder(
queue, local_tree.to_device(queue),
source_boxes_mask=local_tree.responsible_boxes_mask.device,
source_parent_boxes_mask=local_tree.ancestor_mask.device
)
if merge_close_lists and local_tree.targets_have_extent:
local_trav = local_trav.merge_close_lists(queue)
logger.info("Generate local traversal in %f sec.", time.time() - start_time)
return local_trav