# https://editorconfig.org/
# https://github.com/editorconfig/editorconfig-vim
# https://github.com/editorconfig/editorconfig-emacs
root = true
[*]
indent_style = space
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.py]
indent_size = 4
[*.rst]
indent_size = 4
[*.cpp]
indent_size = 2
[*.hpp]
indent_size = 2
# There may be one in doc/
[Makefile]
indent_style = tab
# https://github.com/microsoft/vscode/issues/1679
[*.md]
trim_trailing_whitespace = false
version: 2
updates:
# Set update schedule for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
# vim: sw=4
name: Gitlab mirror
on:
push:
branches:
- main
jobs:
autopush:
name: Automatic push to gitlab.tiker.net
if: startsWith(github.repository, 'inducer/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
mirror_github_to_gitlab
env:
GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }}
# vim: sw=4
name: CI
on:
push:
branches:
- main
pull_request:
schedule:
- cron: '17 3 * * 0'
jobs:
ruff:
name: Ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
pipx install ruff
ruff check
typos:
name: Typos
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@master
pylint:
name: Pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
docs:
name: Documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
pytest3:
name: Conda Python 3
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
py3example:
name: Python 3 Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_examples
downstream_tests:
strategy:
matrix:
downstream_project: [sumpy, pytential]
name: Tests for downstream project ${{ matrix.downstream_project }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
env:
DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }}
run: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
fi
test_downstream "$DOWNSTREAM_PROJECT"
# vim: sw=4
@@ -12,3 +12,5 @@ setuptools.pth
distribute*egg
distribute*tar.gz
a.out
.cache
Python 2.7 Intel CPU:
script:
- py_version=2.7
- export PYOPENCL_TEST=int:pu
- EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- intel-cl-cpu
except:
- tags
Python 2.7 POCL:
script:
- export PY_EXE=python2.7
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python2.7
- pocl
except:
- tags
Python 3.5 POCL:
script:
- export PY_EXE=python3.5
- export PYOPENCL_TEST=portable
- export EXTRA_INSTALL="numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3.5
- pocl
except:
- tags
Python 3 POCL:
script: |
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan V:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-v
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Titan X:
script: |
export PYOPENCL_TEST=portable:titan
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-titan-x
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL K40:
script: |
export PYOPENCL_TEST=portable:k40
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
test_py_project
tags:
- nvidia-k40
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL MPI:
script: |
export PYOPENCL_TEST=portable
export EXTRA_INSTALL="numpy mako mpi4py pybind11"
export PYTEST_ADDOPTS="-m 'mpi' --capture=no"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
test_py_project
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Python 3 POCL Examples:
script: |
test -n "$SKIP_EXAMPLES" && exit
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_venv
run_examples
tags:
- python3
- pocl
except:
- tags
Pylint:
script: |
export EXTRA_INSTALL="pybind11 numpy mako matplotlib mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
run_pylint "$(get_proj_name)" examples/*.py test/*.py
tags:
- python3
except:
- tags
Documentation:
script: |
EXTRA_INSTALL="pybind11 numpy mako mpi4py"
curl -L -O https://tiker.net/ci-support-v0
. ci-support-v0
build_py_project_in_conda_env
build_docs
tags:
- python3
Ruff:
script: |
pipx install ruff
ruff check
tags:
- docker-runner
except:
- tags
Downstream:
parallel:
matrix:
- DOWNSTREAM_PROJECT: [sumpy, pytential]
tags:
- large-node
- "docker-runner"
script: |
export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:-"-m 'not slowtest'"}
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
test_downstream "$DOWNSTREAM_PROJECT"
- arg: py-version
val: '3.10'
- arg: extension-pkg-whitelist
val: pyfmmlib
# Needed for boxtree.tools
- arg: init-hook
val: import sys; sys.setrecursionlimit(2000)
name: test-conda-env
channels:
- conda-forge
- nodefaults
dependencies:
- python=3
- git
- numpy
- pocl
- pocl-cuda
- mako
- pyopencl
- islpy
- pyfmmlib
- mpi4py
# Only needed to make pylint succeed
- matplotlib-base
# This is intended to prevent conda from selecting 'external' (i.e. empty) builds
# of OpenMPI to satisfy the MPI dependency of mpi4py. It did so in May 2024, leading
# to confusing failures saying
# 'libmpi.so.40: cannot open shared object file: No such file or directory'.
# https://github.com/conda-forge/openmpi-feedstock/issues/153
# https://conda-forge.org/docs/user/tipsandtricks/#using-external-message-passing-interface-mpi-libraries
- openmpi>=5=h*
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Kloeckner"
given-names: "Andreas"
orcid: "https://orcid.org/0000-0003-1228-519X"
- family-names: "Fikl"
given-names: "Alex"
- family-names: "Gao"
given-names: "Hao"
- family-names: "Fernando"
given-names: "Isuru"
- family-names: "Wala"
given-names: "Matt"
- family-names: "Wei"
given-names: "Xiaoyu"
title: "boxtree"
version: 2023.1
date-released: 2023-10-01
doi: 10.5281/zenodo.8397396
url: "https://github.com/inducer/boxtree"
license: MIT
include test/*.py
include examples/*.py
include doc/*.rst
include doc/Makefile
include doc/*.py
include doc/images/*.png
include doc/_static/*.css
include doc/_templates/*.html
include configure.py
include Makefile.in
include README.rst
include requirements.txt
boxtree: Quad/Octrees, FMM Traversals, Geometric Queries
========================================================
.. image:: https://gitlab.tiker.net/inducer/boxtree/badges/main/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/boxtree/commits/main
.. image:: https://github.com/inducer/boxtree/workflows/CI/badge.svg?branch=main&event=push
:alt: Github Build Status
:target: https://github.com/inducer/boxtree/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
.. image:: https://badge.fury.io/py/boxtree.png
:alt: Python Package Index Release Page
:target: https://pypi.org/project/boxtree/
.. image:: https://zenodo.org/badge/7193697.svg
:alt: Zenodo DOI for latest release
:target: https://zenodo.org/badge/latestdoi/7193697
``boxtree`` is a package that, given some point locations in two or three
dimensions, sorts them into an adaptive quad/octree of boxes, efficiently, in
parallel, using `PyOpenCL <https://mathema.tician.de/software/pyopencl>`__.
It can also generate traversal lists needed for adaptive fast multipole methods
and related algorithms and tree-based look-up tables for geometric proximity.
``boxtree`` is under the MIT license.
Resources:
* `Documentation <https://documen.tician.de/boxtree>`__
* `PyPI package <https://pypi.org/project/boxtree>`__
* `Source Code (GitHub) <https://github.com/inducer/boxtree>`__
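A minimal usage sketch (untested here, loosely modeled on ``examples/demo.py``;
the ``max_particles_in_box`` keyword and the per-axis coordinate layout are
assumptions about the builder interface and may need adjusting):

.. code-block:: python

    import numpy as np
    import pyopencl as cl
    import pyopencl.array
    from pytools.obj_array import make_obj_array

    from boxtree import TreeBuilder

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # 10000 random points in the unit square, one coordinate array per axis
    rng = np.random.default_rng(seed=15)
    particles = make_obj_array([
        cl.array.to_device(queue, rng.uniform(size=10000))
        for _ in range(2)])

    tb = TreeBuilder(ctx)
    tree, _ = tb(queue, particles, max_particles_in_box=30)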
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
@@ -22,31 +20,60 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from boxtree.tree import Tree, TreeOfBoxes, TreeWithLinkedPointSources, box_flags_enum
from boxtree.tree_build import TreeBuilder
from boxtree.tree_of_boxes import (
coarsen_tree_of_boxes,
make_meshmode_mesh_from_leaves,
make_tree_of_boxes_root,
refine_and_coarsen_tree_of_boxes,
refine_tree_of_boxes,
uniformly_refine_tree_of_boxes,
)
__all__ = [
"Tree",
"TreeBuilder",
"TreeOfBoxes",
"TreeWithLinkedPointSources",
"box_flags_enum",
"coarsen_tree_of_boxes",
"make_meshmode_mesh_from_leaves",
"make_tree_of_boxes_root",
"refine_and_coarsen_tree_of_boxes",
"refine_tree_of_boxes",
"uniformly_refine_tree_of_boxes",
]
__doc__ = r"""
:mod:`boxtree` can do three main things:
* it can build quad/octrees (in at least 1D/2D/3D), in one of two modes:
* First, trees can be built as purely a collection of boxes,
see :ref:`tree-of-boxes`. These trees are typically built iteratively,
through refinement and coarsening.
* Second, trees can be built from collections of points,
so that each box contains only a bounded number of these points.
This is the older, less general way to construct trees
that has a fairly heavy focus on point-based Fast Multipole
Methods (FMMs).
See :class:`~Tree` and :class:`~TreeBuilder`.
Note that :class:`~Tree` inherits from :class:`TreeOfBoxes`.
* it can compute fast-multipole-like interaction lists on this tree structure,
see :mod:`boxtree.traversal`. Note that while this traversal generation
builds on the result of particle sorting,
it is completely distinct in the software sense.
* it can compute geometric lookup structures based on a :class:`Tree`,
see :mod:`~boxtree.area_query`.
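As a rough, untested sketch of the first (tree-of-boxes) mode, with the call
signature of :func:`make_tree_of_boxes_root` assumed for illustration only:

.. code-block:: python

    from boxtree import (
        make_tree_of_boxes_root, uniformly_refine_tree_of_boxes)

    # assumed: the root box is described by its lower and upper corners
    tob = make_tree_of_boxes_root(((0., 0.), (1., 1.)))
    tob = uniformly_refine_tree_of_boxes(tob)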
Tree modes
----------
The particle-based :class:`Tree` can operate in three 'modes':
* one where no distinction is made between sources and targets. In this mode,
all participants in the interaction are called 'particles'.
@@ -101,8 +128,8 @@ the point sources have their own orderings:
* **user point source order**
* **tree point source order** (tree/box-sorted)
:attr:`boxtree.tree.TreeWithLinkedPointSources.user_point_source_ids` helps
translate point source arrays into tree order for processing.
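For example, as a sketch (``ltree`` and ``weights_user_order`` are hypothetical
names; the indexing direction is inferred by analogy with ``reorder_sources``
in :mod:`boxtree.constant_one` further below):

.. code-block:: python

    weights_tree_order = weights_user_order[ltree.user_point_source_ids]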
.. _csr:
@@ -133,4 +160,7 @@ This indexing scheme has the following properties:
is automatically the end of the previous one.
"""
# allow namespace packages
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
# vim: filetype=pyopencl:fdm=marker
__copyright__ = """
Copyright (C) 2013 Andreas Kloeckner
Copyright (C) 2016 Matt Wala"""
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,14 +23,25 @@ THE SOFTWARE.
"""
import logging
from functools import partial
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.cltypes
from mako.template import Template
from pytools import ProcessLogger, memoize_method
from boxtree.tools import (
AXIS_NAMES,
DeviceDataRecord,
coord_vec_subscript_code,
get_coord_vec_dtype,
)
logger = logging.getLogger(__name__)
@@ -44,6 +54,20 @@ Area queries (Balls -> overlapping leaves)
.. autoclass:: AreaQueryResult
Inverse of area query (Leaves -> overlapping balls)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: LeavesToBallsLookupBuilder
.. autoclass:: LeavesToBallsLookup
Space invader queries
^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: SpaceInvaderQueryBuilder
Peer Lists
^^^^^^^^^^
@@ -86,8 +110,8 @@ class AreaQueryResult(DeviceDataRecord):
.. attribute:: leaves_near_ball_starts
Indices into :attr:`leaves_near_ball_lists`.
``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]:
leaves_near_ball_starts[ball_nr + 1]]``
results in a list of leaf boxes that intersect `ball_nr`.
@@ -98,138 +122,218 @@ class AreaQueryResult(DeviceDataRecord):
.. versionadded:: 2016.1
"""
# }}}
# {{{ kernel templates
AREA_QUERY_TEMPLATE = r"""//CL//
typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
typedef ${dtype_to_ctype(peer_list_idx_dtype)} peer_list_idx_t;
class LeavesToBallsLookup(DeviceDataRecord):
"""
.. attribute:: tree
<%def name="add_box_to_list_if_overlaps_ball(box_id)">
{
bool is_overlapping;
The :class:`boxtree.Tree` instance used to build this lookup.
${check_l_infty_ball_overlap(
"is_overlapping", box_id, "ball_radius", "ball_center")}
.. attribute:: balls_near_box_starts
if (is_overlapping)
{
APPEND_leaves(${box_id});
}
}
</%def>
Indices into :attr:`balls_near_box_lists`.
``balls_near_box_lists[balls_near_box_starts[ibox]:
balls_near_box_starts[ibox + 1]]``
results in a list of balls that overlap leaf box *ibox*.
void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
{
coord_vec_t ball_center;
%for i in range(dimensions):
ball_center.${AXIS_NAMES[i]} = ball_${AXIS_NAMES[i]}[ball_nr];
%endfor
.. note:: Only leaf boxes have non-empty entries in this table. Nonetheless,
this list is indexed by the global box index.
coord_t ball_radius = ball_radii[ball_nr];
.. attribute:: balls_near_box_lists
///////////////////////////////////
// Step 1: Find the guiding box. //
///////////////////////////////////
.. automethod:: get
"""
${walk_init(0)}
box_id_t guiding_box;
# }}}
if (LEVEL_TO_RAD(0) < ball_radius / 2 || !(box_flags[0] & BOX_HAS_CHILDREN))
{
guiding_box = 0;
continue_walk = false;
}
while (continue_walk)
{
// Get the next child.
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
# {{{ kernel templates
bool last_child = walk_morton_nr == ${2**dimensions - 1};
GUIDING_BOX_FINDER_MACRO = r"""//CL:mako//
<%def name="initialize_coord_vec(vector_name, entries)">
<% assert len(entries) == dimensions %>
${vector_name} = (coord_vec_t) (${", ".join(entries)});
</%def>
if (child_box_id)
<%def name="find_guiding_box(ball_center, ball_radius, box='guiding_box')">
box_id_t ${box} = 0;
{
bool contains_ball_center;
int child_level = walk_level + 1;
coord_t child_rad = LEVEL_TO_RAD(child_level);
//
// Step 1: Ensure that the center is within the bounding box.
//
coord_vec_t query_center, bbox_min, bbox_max;
${initialize_coord_vec(
"bbox_min", ["bbox_min_" + ax for ax in AXIS_NAMES[:dimensions]])}
// bbox_max should be smaller than the true bounding box, so that
// (query_center - bbox_min) / root_extent entries are in the half open
// interval [0, 1).
bbox_max = bbox_min + (coord_t) (
root_extent / (1 + ${root_extent_stretch_factor}));
query_center = min(bbox_max, max(bbox_min, ${ball_center}));
//
// Step 2: Compute the query radius. This can be effectively
// smaller than the original ball radius, if the center
// isn't in the bounding box (see picture):
//
// +-----------+
// | |
// | |
// | |
// +-------|-+ |
// | | | |
// | +-----------+ <= bounding box
// | |
// | |
// +---------+ <= query box
//
// <---> <= original radius
// <> <= effective radius
//
coord_t query_radius = 0;
%for mnr in range(2**dimensions):
{
// Check if the child contains the ball's center.
${load_center("child_center", "child_box_id")}
coord_vec_t offset;
${initialize_coord_vec("offset",
["{sign}{ball_radius}".format(
sign="+" if (2**(dimensions-1-idim) & mnr) else "-",
ball_radius=ball_radius)
for idim in range(dimensions)])}
coord_vec_t corner = min(
bbox_max, max(bbox_min, ${ball_center} + offset));
coord_t max_dist = 0;
coord_vec_t dist = fabs(corner - query_center);
%for i in range(dimensions):
max_dist = fmax(max_dist,
fabs(ball_center.s${i} - child_center.s${i}));
query_radius = fmax(query_radius, ${cvec_sub("dist", i)});
%endfor
contains_ball_center = max_dist <= child_rad;
}
%endfor
//
// Step 3: Find the guiding box.
//
if (contains_ball_center)
// Descend when root is not the guiding box.
if (LEVEL_TO_RAD(0) / 2 >= query_radius)
{
if ((child_rad / 2 < ball_radius && ball_radius <= child_rad) ||
!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
for (unsigned box_level = 0;; ++box_level)
{
guiding_box = child_box_id;
break;
}
if (/* Found leaf? */
!(box_flags[${box}] & BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES)
/* Found guiding box? */
|| (LEVEL_TO_RAD(box_level) / 2 < query_radius
&& query_radius <= LEVEL_TO_RAD(box_level)))
{
break;
}
// Find the child containing the ball center.
//
// Logic intended to match the morton nr scan kernel.
coord_vec_t offset_scaled =
(query_center - bbox_min) / root_extent;
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
continue;
// Invariant: offset_scaled entries are in [0, 1).
%for ax in AXIS_NAMES[:dimensions]:
unsigned ${ax}_bits = (unsigned) (
offset_scaled.${ax} * (1U << (1 + box_level)));
%endfor
// Pick off the lowest-order bit for each axis, put it in
// its place.
int level_morton_number = 0
%for iax, ax in enumerate(AXIS_NAMES[:dimensions]):
| (${ax}_bits & 1U) << (${dimensions-1-iax})
%endfor
;
box_id_t next_box = box_child_ids[
level_morton_number * aligned_nboxes + ${box}];
if (next_box)
{
${box} = next_box;
}
else
{
// Child does not exist, this must be the guiding box
break;
}
}
}
}
</%def>
"""
if (last_child)
{
// This box has no children that contain the center, so it must
// be the guiding box.
guiding_box = walk_box_id;
break;
}
${walk_advance()}
}
AREA_QUERY_WALKER_BODY = r"""
coord_vec_t ball_center;
coord_t ball_radius;
${get_ball_center_and_radius("ball_center", "ball_radius", "i")}
///////////////////////////////////
// Step 1: Find the guiding box. //
///////////////////////////////////
${find_guiding_box("ball_center", "ball_radius")}
//////////////////////////////////////////////////////
// Step 2 - Walk the peer boxes to find the leaves. //
//////////////////////////////////////////////////////
for (peer_list_idx_t pb_i = peer_list_starts[guiding_box],
pb_e = peer_list_starts[guiding_box+1]; pb_i < pb_e; ++pb_i)
{
box_id_t peer_box = peer_lists[pb_i];
if (!(box_flags[peer_box] & BOX_HAS_CHILDREN))
if (!(box_flags[peer_box] & BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
${add_box_to_list_if_overlaps_ball("peer_box")}
bool is_overlapping;
${check_l_infty_ball_overlap(
"is_overlapping", "peer_box", "ball_radius", "ball_center")}
if (is_overlapping)
{
${leaf_found_op("peer_box", "ball_center", "ball_radius")}
}
}
else
{
${walk_reset("peer_box")}
${walk_init("peer_box")}
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
if (child_box_id)
if (walk_box_id)
{
if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
if (!(box_flags[walk_box_id]
& BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
${add_box_to_list_if_overlaps_ball("child_box_id")}
bool is_overlapping;
${check_l_infty_ball_overlap(
"is_overlapping", "walk_box_id",
"ball_radius", "ball_center")}
if (is_overlapping)
{
${leaf_found_op(
"walk_box_id", "ball_center", "ball_radius")}
}
}
else
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
}
@@ -238,9 +342,34 @@ void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
}
}
}
}
"""
AREA_QUERY_TEMPLATE = (
GUIDING_BOX_FINDER_MACRO + r"""//CL//
typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
typedef ${dtype_to_ctype(peer_list_idx_dtype)} peer_list_idx_t;
<%def name="get_ball_center_and_radius(ball_center, ball_radius, i)">
%for ax in AXIS_NAMES[:dimensions]:
${ball_center}.${ax} = ball_${ax}[${i}];
%endfor
${ball_radius} = ball_radii[${i}];
</%def>
<%def name="leaf_found_op(leaf_box_id, ball_center, ball_radius)">
APPEND_leaves(${leaf_box_id});
</%def>
void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t i)
{
"""
+ AREA_QUERY_WALKER_BODY
+ """
}
""")
PEER_LIST_FINDER_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
@@ -262,27 +391,27 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
if (child_box_id)
if (walk_box_id)
{
${load_center("child_center", "child_box_id")}
${load_center("walk_center", "walk_box_id")}
// child_box_id lives on walk_level+1.
// walk_box_id lives on level walk_stack_size+1.
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, level, child_center, walk_level+1, false);
center, level, walk_center, walk_stack_size+1);
if (a_or_o)
{
// child_box_id lives on walk_level+1.
if (walk_level+1 == level)
// walk_box_id lives on level walk_stack_size+1.
if (walk_stack_size+1 == level)
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
else if (!(box_flags[walk_box_id]
& BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES))
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else
{
@@ -295,25 +424,24 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
++morton_nr)
{
box_id_t next_child_id = box_child_ids[
morton_nr * aligned_nboxes + child_box_id];
morton_nr * aligned_nboxes + walk_box_id];
if (next_child_id)
{
${load_center("next_child_center", "next_child_id")}
${load_center("next_walk_center", "next_child_id")}
must_be_peer &= !is_adjacent_or_overlapping(root_extent,
center, level, next_child_center, walk_level+2,
false);
center, level, next_walk_center, walk_stack_size+2);
}
}
if (must_be_peer)
{
APPEND_peers(child_box_id);
APPEND_peers(walk_box_id);
}
else
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
}
@@ -326,21 +454,203 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
"""
from pyopencl.elementwise import ElementwiseTemplate
from boxtree.tools import InlineBinarySearch
STARTS_EXPANDER_TEMPLATE = ElementwiseTemplate(
arguments=r"""
idx_t *dst,
idx_t *starts,
idx_t starts_len
""",
operation=r"""//CL//
/* Find my index in starts, place the index in dst. */
dst[i] = bsearch(starts, starts_len, i);
""",
name="starts_expander",
preamble=str(InlineBinarySearch("idx_t")))
# }}}
# {{{ area query elementwise template
class AreaQueryElementwiseTemplate:
"""
Experimental: Intended as a way to perform operations in the body of an area
query.
"""
@staticmethod
def unwrap_args(tree, peer_lists, *args):
return (tree.box_centers,
tree.root_extent,
tree.box_levels,
tree.aligned_nboxes,
tree.box_child_ids,
tree.box_flags,
peer_lists.peer_list_starts,
peer_lists.peer_lists,
*tree.bounding_box[0],
*args
)
def __init__(self, extra_args, ball_center_and_radius_expr,
leaf_found_op, preamble="", name="area_query_elwise"):
def wrap_in_macro(decl, expr):
return f"""
<%def name=\"{decl}\">
{expr}
</%def>
"""
from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
self.elwise_template = ElementwiseTemplate(
arguments=r"""//CL:mako//
coord_t *box_centers,
coord_t root_extent,
box_level_t *box_levels,
box_id_t aligned_nboxes,
box_id_t *box_child_ids,
box_flags_t *box_flags,
peer_list_idx_t *peer_list_starts,
box_id_t *peer_lists,
%for ax in AXIS_NAMES[:dimensions]:
coord_t bbox_min_${ax},
%endfor
""" + extra_args,
operation="//CL:mako//\n"
+ wrap_in_macro(
"get_ball_center_and_radius(ball_center, ball_radius, i)",
ball_center_and_radius_expr)
+ wrap_in_macro(
"leaf_found_op(leaf_box_id, ball_center, ball_radius)",
leaf_found_op)
+ TRAVERSAL_PREAMBLE_MAKO_DEFS
+ GUIDING_BOX_FINDER_MACRO
+ AREA_QUERY_WALKER_BODY,
name=name,
preamble=preamble)
def generate(self, context,
dimensions, coord_dtype, box_id_dtype,
peer_list_idx_dtype, max_levels,
extra_var_values=(), extra_type_aliases=(),
extra_preamble=""):
from pyopencl.cltypes import vec_types
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
from boxtree.tree_build import TreeBuilder
render_vars = (
("np", np),
("dimensions", dimensions),
("dtype_to_ctype", dtype_to_ctype),
("box_id_dtype", box_id_dtype),
("particle_id_dtype", None),
("coord_dtype", coord_dtype),
("get_coord_vec_dtype", get_coord_vec_dtype),
("cvec_sub", partial(coord_vec_subscript_code, dimensions)),
("max_levels", max_levels),
("AXIS_NAMES", AXIS_NAMES),
("box_flags_enum", box_flags_enum),
("peer_list_idx_dtype", peer_list_idx_dtype),
("debug", False),
("root_extent_stretch_factor", TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR),
# FIXME This gets used in pytential with a template that still uses this:
("vec_types", tuple(vec_types.items())),
)
preamble = Template(
# HACK: box_flags_t and coord_t are defined here and
# in the template below, so disable typedef redefinition warnings.
"""
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtypedef-redefinition"
"""
+ TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
+ """
#pragma clang diagnostic pop
""",
strict_undefined=True).render(**dict(render_vars))
return self.elwise_template.build(context,
type_aliases=(
("coord_t", coord_dtype),
("box_id_t", box_id_dtype),
("peer_list_idx_t", peer_list_idx_dtype),
("box_level_t", np.uint8),
("box_flags_t", box_flags_enum.dtype),
*extra_type_aliases,
),
var_values=render_vars + extra_var_values,
more_preamble=preamble + extra_preamble)
SPACE_INVADER_QUERY_TEMPLATE = AreaQueryElementwiseTemplate(
extra_args="""
coord_t *ball_radii,
float *outer_space_invader_dists,
%for ax in AXIS_NAMES[:dimensions]:
coord_t *ball_${ax},
%endfor
""",
ball_center_and_radius_expr=r"""
${ball_radius} = ball_radii[${i}];
%for ax in AXIS_NAMES[:dimensions]:
${ball_center}.${ax} = ball_${ax}[${i}];
%endfor
""",
leaf_found_op=r"""
{
${load_center("leaf_center", leaf_box_id)}
coord_t max_dist = 0;
%for i in range(dimensions):
max_dist = fmax(max_dist,
distance(
${cvec_sub(ball_center, i)},
${cvec_sub("leaf_center", i)}));
%endfor
// The atomic max operation supports only integer types.
// However, max_dist is of a floating point type.
// For comparison purposes we reinterpret the bits of max_dist
// as an integer. The comparison result is the same as for positive
// IEEE floating point numbers, so long as the float/int endianness
// matches (fingers crossed).
atomic_max(
(volatile __global int *)
&outer_space_invader_dists[${leaf_box_id}],
as_int((float) max_dist));
}""",
name="space_invader_query")
# }}}
# {{{ area query build
class AreaQueryBuilder:
r"""Given a set of :math:`l^\infty` "balls", this class helps build a
look-up table from ball to leaf boxes that intersect with the ball.
.. versionadded:: 2016.1
.. automethod:: __init__
.. automethod:: __call__
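A rough usage sketch (untested; ``ctx``, ``queue``, ``tree``, ``ball_centers``,
``ball_radii`` and the ball index ``i`` are assumed to exist as described in
:meth:`__call__`):

.. code-block:: python

    aqb = AreaQueryBuilder(ctx)
    aq, evt = aqb(queue, tree, ball_centers, ball_radii)

    # leaf boxes overlapping ball i, using the CSR-style convention above
    leaves_i = aq.leaves_near_ball_lists[
        aq.leaves_near_ball_starts[i]:aq.leaves_near_ball_starts[i + 1]]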
"""
def __init__(self, context):
self.context = context
self.peer_list_finder = PeerListFinder(self.context)
# {{{ Kernel generation
@@ -348,45 +658,52 @@ class AreaQueryBuilder(object):
def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype,
ball_id_dtype, peer_list_idx_dtype, max_levels):
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
logger.info("start building area query kernel")
logger.debug("start building area query kernel")
from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
from boxtree.tree_build import TreeBuilder
template = Template(
TRAVERSAL_PREAMBLE_TEMPLATE
+ AREA_QUERY_TEMPLATE,
strict_undefined=True)
render_vars = {
"np": np,
"dimensions": dimensions,
"dtype_to_ctype": dtype_to_ctype,
"box_id_dtype": box_id_dtype,
"particle_id_dtype": None,
"coord_dtype": coord_dtype,
"get_coord_vec_dtype": get_coord_vec_dtype,
"cvec_sub": partial(coord_vec_subscript_code, dimensions),
"max_levels": max_levels,
"AXIS_NAMES": AXIS_NAMES,
"box_flags_enum": box_flags_enum,
"peer_list_idx_dtype": peer_list_idx_dtype,
"ball_id_dtype": ball_id_dtype,
"debug": False,
"root_extent_stretch_factor": TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR,
}
from boxtree.tools import ScalarArg, VectorArg
arg_decls = [
VectorArg(coord_dtype, "box_centers"),
VectorArg(coord_dtype, "box_centers", with_offset=False),
ScalarArg(coord_dtype, "root_extent"),
VectorArg(np.uint8, "box_levels"),
ScalarArg(box_id_dtype, "aligned_nboxes"),
VectorArg(box_id_dtype, "box_child_ids"),
VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
VectorArg(box_flags_enum.dtype, "box_flags"),
VectorArg(peer_list_idx_dtype, "peer_list_starts"),
VectorArg(box_id_dtype, "peer_lists"),
VectorArg(coord_dtype, "ball_radii"),
] + [
ScalarArg(coord_dtype, "bbox_min_"+ax)
for ax in AXIS_NAMES[:dimensions]
] + [
VectorArg(coord_dtype, "ball_"+ax)
for ax in AXIS_NAMES[:dimensions]]
@@ -400,15 +717,11 @@ class AreaQueryBuilder(object):
count_sharing={},
complex_kernel=True)
logger.info("done building area query kernel")
logger.debug("done building area query kernel")
return area_query_kernel
# }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
@@ -427,8 +740,8 @@ class AreaQueryBuilder(object):
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(aq, event)*, where *aq* is an instance of
:class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
for dependency management.
"""
@@ -442,12 +755,12 @@ class AreaQueryBuilder(object):
ball_id_dtype = tree.particle_id_dtype # ?
from pytools import div_ceil
# Avoid generating too many kernels.
max_levels = div_ceil(tree.nlevels, 10) * 10
if peer_lists is None:
peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
wait_for = [evt]
if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
@@ -457,19 +770,20 @@ class AreaQueryBuilder(object):
tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
peer_lists.peer_list_starts.dtype, max_levels)
logger.info("area query: run area query")
aq_plog = ProcessLogger(logger, "area query")
result, evt = area_query_kernel(
queue, len(ball_radii),
tree.box_centers.data, tree.root_extent,
tree.box_levels, tree.aligned_nboxes,
tree.box_child_ids.data, tree.box_flags,
peer_lists.peer_list_starts,
peer_lists.peer_lists, ball_radii,
*(tuple(tree.bounding_box[0])
+ tuple(bc for bc in ball_centers)),
wait_for=wait_for)
logger.info("area query: done")
aq_plog.done()
return AreaQueryResult(
tree=tree,
@@ -479,9 +793,253 @@ class AreaQueryBuilder(object):
# }}}
# {{{ area query transpose (leaves-to-balls) lookup build
class LeavesToBallsLookupBuilder:
r"""Given a set of :math:`l^\infty` "balls", this class helps build a
look-up table from leaf boxes to balls that overlap with each leaf box.
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
self.context = context
from pyopencl.algorithm import KeyValueSorter
self.key_value_sorter = KeyValueSorter(context)
self.area_query_builder = AreaQueryBuilder(context)
@memoize_method
def get_starts_expander_kernel(self, idx_dtype):
"""
Expands a "starts" array into a length starts[-1] array of increasing
indices:
E.g.: [0 2 5 6] => [0 0 1 1 1 2]
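(For intuition, this matches ``np.repeat(np.arange(len(starts) - 1), np.diff(starts))``.)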
"""
return STARTS_EXPANDER_TEMPLATE.build(
self.context,
type_aliases=(("idx_t", idx_dtype),))
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue`
:arg tree: a :class:`boxtree.Tree`.
:arg ball_centers: an object array of coordinate
:class:`pyopencl.array.Array` instances.
Their *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg ball_radii: a
:class:`pyopencl.array.Array`
of positive numbers.
Its *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg peer_lists: may either be *None* or an instance of
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(lbl, event)*, where *lbl* is an instance of
:class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
for dependency management.
"""
from pytools import single_valued
if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
raise TypeError("ball_centers dtype must match tree.coord_dtype")
if ball_radii.dtype != tree.coord_dtype:
raise TypeError("ball_radii dtype must match tree.coord_dtype")
ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")
area_query, evt = self.area_query_builder(
queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
wait_for = [evt]
logger.debug("leaves-to-balls lookup: expand starts")
nkeys = tree.nboxes
nballs_p_1 = len(area_query.leaves_near_ball_starts)
assert nballs_p_1 == len(ball_radii) + 1
# We invert the area query in two steps:
#
# 1. Turn the area query result into (ball number, box number) pairs.
# This is done in the "starts expander kernel."
#
# 2. Key-value sort the (ball number, box number) pairs by box number.
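#
# For intuition only (toy data, not used in the computation below): with
# leaves_near_ball_starts = [0, 2, 3] (two balls) and
# leaves_near_ball_lists = [4, 7, 4] (leaf box ids), step 1 produces the
# ball numbers [0, 0, 1]; step 2 then groups by box:
# box 4 -> balls [0, 1], box 7 -> ball [0].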
starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
expanded_starts = cl.array.empty(
queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
evt = starts_expander_knl(
expanded_starts,
area_query.leaves_near_ball_starts.with_queue(queue),
nballs_p_1)
wait_for = [evt]
logger.debug("leaves-to-balls lookup: key-value sort")
balls_near_box_starts, balls_near_box_lists, evt \
= self.key_value_sorter(
queue,
# keys
area_query.leaves_near_ball_lists.with_queue(queue),
# values
expanded_starts,
nkeys, starts_dtype=tree.box_id_dtype,
wait_for=wait_for)
ltb_plog.done()
return LeavesToBallsLookup(
tree=tree,
balls_near_box_starts=balls_near_box_starts,
balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
# }}}
# {{{ space invader query build
class SpaceInvaderQueryBuilder:
r"""
Given a set of :math:`l^\infty` "balls", this class helps build a look-up
table which maps leaf boxes to the *outer space invader distance*.
This is defined below but roughly, from the point of view
of a leaf box, it is the farthest "leaf center to ball center" distance among
all balls that intersect the leaf box.
Formally, given a leaf box :math:`b`, the *outer space invader distance* is
defined by the following expression (here :math:`d_\infty` is the
:math:`\infty` norm):
.. math::
\max \left( \{ d_{\infty}(\text{center}(b), \text{center}(b^*))
: b^* \text{ is a ball}, b^* \cap b \neq \varnothing \}
\cup \{ 0 \} \right)
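(In particular, if no ball intersects :math:`b`, the maximum is taken over
:math:`\{0\}` alone and the outer space invader distance is zero.)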
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
self.context = context
self.peer_list_finder = PeerListFinder(self.context)
# {{{ Kernel generation
@memoize_method
def get_space_invader_query_kernel(self, dimensions, coord_dtype,
box_id_dtype, peer_list_idx_dtype, max_levels):
return SPACE_INVADER_QUERY_TEMPLATE.generate(
self.context,
dimensions,
coord_dtype,
box_id_dtype,
peer_list_idx_dtype,
max_levels)
# }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
wait_for=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue`
:arg tree: a :class:`boxtree.Tree`.
:arg ball_centers: an object array of coordinate
:class:`pyopencl.array.Array` instances.
Their *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg ball_radii: a
:class:`pyopencl.array.Array`
of positive numbers.
Its *dtype* must match *tree*'s
:attr:`boxtree.Tree.coord_dtype`.
:arg peer_lists: may either be *None* or an instance of
:class:`PeerListLookup` associated with `tree`.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
execution.
:returns: a tuple *(sqi, event)*, where *sqi* is an instance of
:class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
for dependency management. The *dtype* of *sqi* is
*tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
*(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
The entries of *sqi* are indexed by the global box index and are
as follows:
* if *i* is not the index of a leaf box, *sqi[i] = 0*.
* if *i* is the index of a leaf box, *sqi[i]* is the
outer space invader distance for *i*.
"""
from pytools import single_valued
if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
raise TypeError("ball_centers dtype must match tree.coord_dtype")
if ball_radii.dtype != tree.coord_dtype:
raise TypeError("ball_radii dtype must match tree.coord_dtype")
from pytools import div_ceil
# Avoid generating too many kernels.
max_levels = div_ceil(tree.nlevels, 10) * 10
if peer_lists is None:
peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
wait_for = [evt]
if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
raise ValueError("size of peer lists must match with number of boxes")
space_invader_query_kernel = self.get_space_invader_query_kernel(
tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
peer_lists.peer_list_starts.dtype, max_levels)
si_plog = ProcessLogger(logger, "space invader query")
outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
if not wait_for:
wait_for = []
wait_for = (wait_for
+ outer_space_invader_dists.events
+ ball_radii.events
+ [evt for bc in ball_centers for evt in bc.events])
evt = space_invader_query_kernel(
*SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
tree, peer_lists,
ball_radii,
outer_space_invader_dists,
*tuple(bc for bc in ball_centers)),
wait_for=wait_for,
queue=queue,
range=slice(len(ball_radii)))
if tree.coord_dtype != np.dtype(np.float32):
# The kernel output is always an array of float32 due to limited
# support for atomic operations with float64 in OpenCL.
# Here the output is cast to match the coord dtype.
outer_space_invader_dists.finish()
outer_space_invader_dists = outer_space_invader_dists.astype(
tree.coord_dtype)
evt, = outer_space_invader_dists.events
si_plog.done()
return outer_space_invader_dists, evt
# }}}
# {{{ peer list build
class PeerListFinder:
"""This class builds a look-up table from box numbers to peer boxes. The
full definition [1]_ of a peer box is as follows:
@@ -500,6 +1058,7 @@ class PeerListFinder(object):
.. versionadded:: 2016.1
.. automethod:: __init__
.. automethod:: __call__
"""
@@ -512,12 +1071,15 @@ class PeerListFinder(object):
def get_peer_list_finder_kernel(self, dimensions, coord_dtype,
box_id_dtype, max_levels):
from pyopencl.tools import dtype_to_ctype
from boxtree import box_flags_enum
logger.info("start building peer list finder kernel")
logger.debug("start building peer list finder kernel")
from boxtree.traversal import (
HELPER_FUNCTION_TEMPLATE,
TRAVERSAL_PREAMBLE_TEMPLATE,
)
template = Template(
TRAVERSAL_PREAMBLE_TEMPLATE
@@ -525,30 +1087,31 @@ class PeerListFinder(object):
+ PEER_LIST_FINDER_TEMPLATE,
strict_undefined=True)
render_vars = {
"np": np,
"dimensions": dimensions,
"dtype_to_ctype": dtype_to_ctype,
"box_id_dtype": box_id_dtype,
"particle_id_dtype": None,
"coord_dtype": coord_dtype,
"get_coord_vec_dtype": get_coord_vec_dtype,
"cvec_sub": partial(coord_vec_subscript_code, dimensions),
"max_levels": max_levels,
"AXIS_NAMES": AXIS_NAMES,
"box_flags_enum": box_flags_enum,
"debug": False,
# For calls to the helper is_adjacent_or_overlapping()
"targets_have_extent": False,
"sources_have_extent": False,
}
from boxtree.tools import ScalarArg, VectorArg
arg_decls = [
VectorArg(coord_dtype, "box_centers"),
VectorArg(coord_dtype, "box_centers", with_offset=False),
ScalarArg(coord_dtype, "root_extent"),
VectorArg(np.uint8, "box_levels"),
ScalarArg(box_id_dtype, "aligned_nboxes"),
VectorArg(box_id_dtype, "box_child_ids"),
VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
VectorArg(box_flags_enum.dtype, "box_flags"),
]
@@ -562,7 +1125,7 @@ class PeerListFinder(object):
count_sharing={},
complex_kernel=True)
logger.info("done building peer list finder kernel")
logger.debug("done building peer list finder kernel")
return peer_list_finder_kernel
# }}}
@@ -579,22 +1142,24 @@ class PeerListFinder(object):
for dependency management.
"""
from pytools import div_ceil
# Round up level count--this gets included in the kernel as
# a stack bound. Rounding avoids too many kernel versions.
max_levels = div_ceil(tree.nlevels, 10) * 10
peer_list_finder_kernel = self.get_peer_list_finder_kernel(
tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)
logger.info("peer list finder: find peer lists")
pl_plog = ProcessLogger(logger, "find peer lists")
result, evt = peer_list_finder_kernel(
queue, tree.nboxes,
tree.box_centers.data, tree.root_extent,
tree.box_levels, tree.aligned_nboxes,
tree.box_child_ids.data, tree.box_flags,
wait_for=wait_for)
logger.info("peer list finder: done")
pl_plog.done()
return PeerListLookup(
tree=tree,
@@ -603,4 +1168,4 @@ class PeerListFinder(object):
# }}}
# vim: fdm=marker
__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
from arraycontext.pytest import (
_PytestPyOpenCLArrayContextFactoryWithClass,
register_pytest_array_context_factory,
)
__doc__ = """
.. autoclass:: PyOpenCLArrayContext
"""
def _acf():
import pyopencl as cl
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
return PyOpenCLArrayContext(queue, force_device_scalars=True)
class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
def transform_loopy_program(self, t_unit):
default_ep = t_unit.default_entrypoint
options = default_ep.options
if not (options.return_dict and options.no_numpy):
raise ValueError("Loopy kernel passed to call_loopy must "
"have return_dict and no_numpy options set. "
"Did you use arraycontext.make_loopy_program "
"to create this kernel?")
return super().transform_loopy_program(t_unit)
class PytestPyOpenCLArrayContextFactory(
_PytestPyOpenCLArrayContextFactoryWithClass):
actx_class = PyOpenCLArrayContext
register_pytest_array_context_factory("boxtree.pyopencl",
PytestPyOpenCLArrayContextFactory)
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
@@ -23,11 +21,13 @@ THE SOFTWARE.
"""
import numpy as np
from pyopencl.reduction import ReductionTemplate
from pytools import memoize, memoize_method
from boxtree.tools import get_type_moniker
@memoize
@@ -35,12 +35,12 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
from boxtree.tools import AXIS_NAMES
fields = []
for i in range(dimensions):
fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype))
fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype))
fields.append((f"min_{AXIS_NAMES[i]}", coord_dtype))
fields.append((f"max_{AXIS_NAMES[i]}", coord_dtype))
dtype = np.dtype(fields)
name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype))
type_moniker = get_type_moniker(coord_dtype)
name = f"boxtree_bbox_{dimensions}d_{type_moniker}_t"
from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
@@ -132,7 +132,7 @@ class BoundingBoxFinder:
@memoize_method
def get_kernel(self, dimensions, coord_dtype, have_radii):
bbox_dtype, _bbox_cdecl = make_bounding_box_dtype(
self.context.devices[0], dimensions, coord_dtype)
from boxtree.tools import AXIS_NAMES
@@ -158,11 +158,7 @@ class BoundingBoxFinder:
from pytools import single_valued
coord_dtype = single_valued(coord.dtype for coord in particles)
radii_tuple = () if radii is None else (radii,)
knl = self.get_kernel(dimensions, coord_dtype,
# have_radii:
radii is not None)
"""
.. autoclass:: ConstantOneTreeIndependentDataForWrangler
.. autoclass:: ConstantOneExpansionWrangler
"""
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
from boxtree.timing import DummyTimingFuture
# {{{ constant one wrangler
class ConstantOneTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
"""
.. automethod:: __init__
"""
class ConstantOneExpansionWrangler(ExpansionWranglerInterface):
"""This implements the 'analytical routines' for a Green's function that is
constant 1 everywhere. For 'charges' of 'ones', this should get every particle
a copy of the particle count.
Timing results returned by this wrangler contain the field *ops_elapsed*,
which counts approximately the number of floating-point operations required.
"""
def _get_source_slice(self, ibox):
pstart = self.tree.box_source_starts[ibox]
return slice(
pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
def _get_target_slice(self, ibox):
pstart = self.tree.box_target_starts[ibox]
return slice(
pstart, pstart + self.tree.box_target_counts_nonchild[ibox])
def multipole_expansion_zeros(self):
return np.zeros(self.tree.nboxes, dtype=np.float64)
local_expansion_zeros = multipole_expansion_zeros
def output_zeros(self):
return np.zeros(self.tree.ntargets, dtype=np.float64)
def reorder_sources(self, source_array):
return source_array[self.tree.user_source_ids]
def reorder_potentials(self, potentials):
return potentials[self.tree.sorted_target_ids]
def multipole_expansions_view(self, mpole_exps, level):
# FIXME
raise NotImplementedError
def local_expansions_view(self, local_exps, level):
# FIXME
raise NotImplementedError
@staticmethod
def timing_future(ops):
return DummyTimingFuture.from_op_count(ops)
def form_multipoles(self, level_start_source_box_nrs, source_boxes,
src_weight_vecs):
src_weights, = src_weight_vecs
mpoles = self.multipole_expansion_zeros()
ops = 0
for ibox in source_boxes:
pslice = self._get_source_slice(ibox)
mpoles[ibox] += np.sum(src_weights[pslice])
ops += src_weights[pslice].size
return mpoles, self.timing_future(ops)
def coarsen_multipoles(self, level_start_source_parent_box_nrs,
source_parent_boxes, mpoles):
tree = self.tree
ops = 0
# nlevels-1 is the last valid level index
# nlevels-2 is the last valid level that could have children
#
# 3 is the last relevant source_level.
# 2 is the last relevant target_level.
# (because no level 1 box will be well-separated from another)
for source_level in range(tree.nlevels-1, 2, -1):
target_level = source_level - 1
start, stop = level_start_source_parent_box_nrs[
target_level:target_level+2]
for ibox in source_parent_boxes[start:stop]:
for child in tree.box_child_ids[:, ibox]:
if child:
mpoles[ibox] += mpoles[child]
ops += 1
return mpoles, self.timing_future(ops)
def eval_direct(self, target_boxes, neighbor_sources_starts,
neighbor_sources_lists, src_weight_vecs):
src_weights, = src_weight_vecs
pot = self.output_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_boxes):
tgt_pslice = self._get_target_slice(tgt_ibox)
src_sum = 0
nsrcs = 0
start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
# print "DIR: %s <- %s" % (tgt_ibox, neighbor_sources_lists[start:end])
for src_ibox in neighbor_sources_lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
src_sum += np.sum(src_weights[src_pslice])
pot[tgt_pslice] = src_sum
ops += pot[tgt_pslice].size * nsrcs
return pot, self.timing_future(ops)
def multipole_to_local(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes,
starts, lists, mpole_exps):
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
contrib = 0
# print tgt_ibox, "<-", lists[start:end]
for src_ibox in lists[start:end]:
contrib += mpole_exps[src_ibox]
ops += 1
local_exps[tgt_ibox] += contrib
return local_exps, self.timing_future(ops)
def eval_multipoles(self,
target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
mpole_exps):
pot = self.output_zeros()
ops = 0
for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level):
for itgt_box, tgt_ibox in \
enumerate(target_boxes_by_source_level[level]):
tgt_pslice = self._get_target_slice(tgt_ibox)
contrib = 0
start, end = ssn.starts[itgt_box:itgt_box+2]
for src_ibox in ssn.lists[start:end]:
contrib += mpole_exps[src_ibox]
pot[tgt_pslice] += contrib
ops += pot[tgt_pslice].size * (end - start)
return pot, self.timing_future(ops)
def form_locals(self,
level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, starts, lists, src_weight_vecs):
src_weights, = src_weight_vecs
local_exps = self.local_expansion_zeros()
ops = 0
for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
start, end = starts[itgt_box:itgt_box+2]
# print "LIST 4", tgt_ibox, "<-", lists[start:end]
contrib = 0
nsrcs = 0
for src_ibox in lists[start:end]:
src_pslice = self._get_source_slice(src_ibox)
nsrcs += src_weights[src_pslice].size
contrib += np.sum(src_weights[src_pslice])
local_exps[tgt_ibox] += contrib
ops += nsrcs
return local_exps, self.timing_future(ops)
def refine_locals(self, level_start_target_or_target_parent_box_nrs,
target_or_target_parent_boxes, local_exps):
ops = 0
for target_lev in range(1, self.tree.nlevels):
start, stop = level_start_target_or_target_parent_box_nrs[
target_lev:target_lev+2]
for ibox in target_or_target_parent_boxes[start:stop]:
local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
ops += 1
return local_exps, self.timing_future(ops)
def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
pot = self.output_zeros()
ops = 0
for ibox in target_boxes:
tgt_pslice = self._get_target_slice(ibox)
pot[tgt_pslice] += local_exps[ibox]
ops += pot[tgt_pslice].size
return pot, self.timing_future(ops)
def finalize_potentials(self, potentials, template_ary):
return potentials
# }}}
# vim: foldmethod=marker:filetype=pyopencl
__copyright__ = """
Copyright (C) 2013 Andreas Kloeckner
Copyright (C) 2018 Matt Wala
Copyright (C) 2018 Hao Gao
"""
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
This module helps predict the running time of each step of the FMM.
:class:`FMMTranslationCostModel` describes the translation or evaluation cost of a
single operation. For example, *m2p* describes the cost of evaluating a single
multipole expansion at a single target.
:class:`AbstractFMMCostModel` uses :class:`FMMTranslationCostModel` and calibration
parameters to compute the total cost of each step of the FMM in each box. It is
implemented by :class:`FMMCostModel`.
:file:`examples/cost_model.py` demonstrates how the calibration and evaluation
are performed.
A similar module in *pytential* extends the functionality of this module to
incorporate QBX-specific operations.
Translation Cost of a Single Operation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: FMMTranslationCostModel
.. autofunction:: make_pde_aware_translation_cost_model
.. autofunction:: make_taylor_translation_cost_model
Cost Model Classes
^^^^^^^^^^^^^^^^^^
.. autoclass:: AbstractFMMCostModel
.. autoclass:: FMMCostModel
"""
from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import partial
from typing import ClassVar
import numpy as np
from mako.template import Template
import pyopencl as cl
import pyopencl.array
from pymbolic import evaluate, var
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
Template = partial(Template, strict_undefined=True)
# {{{ FMMTranslationCostModel
class FMMTranslationCostModel:
"""Provides modeled costs for individual translations or evaluations.
.. note::
        The current implementation assumes the modeled cost is linear in each
        calibration parameter. For example,
        `var("c_p2l") * self.ncoeffs_fmm_by_level[level]` is valid, but
        `var("c_p2l") ** 2 * self.ncoeffs_fmm_by_level[level]` is not.
    .. automethod:: __init__
"""
def __init__(self, ncoeffs_fmm_by_level, uses_point_and_shoot):
self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level
self.uses_point_and_shoot = uses_point_and_shoot
@staticmethod
def direct():
return var("c_p2p")
def p2l(self, level):
return var("c_p2l") * self.ncoeffs_fmm_by_level[level]
def l2p(self, level):
return var("c_l2p") * self.ncoeffs_fmm_by_level[level]
def p2m(self, level):
return var("c_p2m") * self.ncoeffs_fmm_by_level[level]
def m2p(self, level):
return var("c_m2p") * self.ncoeffs_fmm_by_level[level]
def m2m(self, src_level, tgt_level):
return var("c_m2m") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def l2l(self, src_level, tgt_level):
return var("c_l2l") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def m2l(self, src_level, tgt_level):
return var("c_m2l") * self.e2e_cost(
self.ncoeffs_fmm_by_level[src_level],
self.ncoeffs_fmm_by_level[tgt_level])
def e2e_cost(self, nsource_coeffs, ntarget_coeffs):
if self.uses_point_and_shoot:
return (
# Rotate the coordinate system to be z axis aligned.
nsource_coeffs ** (3 / 2)
# Translate the expansion along the z axis.
+ nsource_coeffs ** (1 / 2) * ntarget_coeffs
# Rotate the coordinate system back.
+ ntarget_coeffs ** (3 / 2))
return nsource_coeffs * ntarget_coeffs
# }}}
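# A worked instance of the point-and-shoot estimate above (the coefficient counts
# are made up for illustration): translating an expansion with 100 source
# coefficients into one with 225 target coefficients is modeled as
# 100**1.5 + 100**0.5 * 225 + 225**1.5 = 1000 + 2250 + 3375 = 6625 operations,
# as opposed to 100 * 225 = 22500 for the dense (non-rotated) translation.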
# {{{ translation cost model factories
def make_pde_aware_translation_cost_model(dim, nlevels):
"""Create a cost model for FMM translation operators that make use of the
knowledge that the potential satisfies a PDE.
For example, this factory is used for complex Taylor and Fourier-Bessel
expansions in 2D, and spherical harmonics (with point-and-shoot) in 3D.
"""
p_fmm = np.array([var(f"p_fmm_lev{i}") for i in range(nlevels)])
ncoeffs_fmm = (p_fmm + 1) ** (dim - 1)
uses_point_and_shoot = dim == 3
return FMMTranslationCostModel(
ncoeffs_fmm_by_level=ncoeffs_fmm,
uses_point_and_shoot=uses_point_and_shoot
)
def make_taylor_translation_cost_model(dim, nlevels):
"""Create a cost model for FMM translation based on Taylor expansions
in Cartesian coordinates.
"""
p_fmm = np.array([var(f"p_fmm_lev{i}") for i in range(nlevels)])
ncoeffs_fmm = (p_fmm + 1) ** dim
return FMMTranslationCostModel(
ncoeffs_fmm_by_level=ncoeffs_fmm,
uses_point_and_shoot=False
)
# }}}
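# A minimal usage sketch of the factories above. The function name and all numeric
# values here are illustrative only (not part of the library API): the factories
# return symbolic per-level costs, which become concrete numbers once expansion
# orders and calibration parameters are supplied via pymbolic.
def _demo_translation_cost_model():
    # 2D PDE-aware model on a 3-level tree: ncoeffs per level is p_fmm + 1.
    xlat_cost = make_pde_aware_translation_cost_model(dim=2, nlevels=3)
    context = {
        "p_fmm_lev0": 10, "p_fmm_lev1": 10, "p_fmm_lev2": 10,
        "c_p2m": 1.0, "c_m2l": 1.0,
    }
    # c_p2m * (p_fmm_lev1 + 1) evaluates to 11.0.
    p2m_cost_lev1 = evaluate(xlat_cost.p2m(1), context=context)
    # c_m2l * (p_fmm_lev1 + 1)**2 evaluates to 121.0 (no point-and-shoot in 2D).
    m2l_cost_lev1 = evaluate(xlat_cost.m2l(1, 1), context=context)
    return p2m_cost_lev1, m2l_cost_lev1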
# {{{ AbstractFMMCostModel
class AbstractFMMCostModel(ABC):
"""An interface to obtain both FMM operation counts and calibrated (e.g. in
seconds) cost estimates.
* To obtain operation counts only, use :meth:`get_unit_calibration_params`
with :meth:`cost_per_stage` or :meth:`cost_per_box`.
* To calibrate the model, pass operation counts together with timing data
to :meth:`estimate_calibration_params`.
* To evaluate the calibrated models, pass the calibration parameters
from :meth:`estimate_calibration_params` to :meth:`cost_per_stage` or
:meth:`cost_per_box`.
.. automethod:: __init__
.. ------------------------------------------------------------------------
.. rubric:: Evaluation
.. ------------------------------------------------------------------------
.. automethod:: cost_per_box
.. automethod:: cost_per_stage
.. ------------------------------------------------------------------------
.. rubric:: Calibration
.. ------------------------------------------------------------------------
.. automethod:: estimate_calibration_params
.. ------------------------------------------------------------------------
.. rubric:: Utilities
.. ------------------------------------------------------------------------
.. automethod:: aggregate_over_boxes
.. automethod:: get_unit_calibration_params
.. automethod:: get_ndirect_sources_per_target_box
"""
def __init__(
self,
translation_cost_model_factory=make_pde_aware_translation_cost_model):
"""
        :arg translation_cost_model_factory: a function that takes the tree
            dimension and the number of tree levels as arguments and returns an
            object of :class:`FMMTranslationCostModel`.
"""
self.translation_cost_model_factory = translation_cost_model_factory
@abstractmethod
def process_form_multipoles(self, queue, traversal, p2m_cost):
"""Cost for forming multipole expansions of each box.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) representing the cost of forming the multipole
expansion of one source at each level.
        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
            (nsource_boxes,), with each entry representing the cost of that box.
"""
pass
@abstractmethod
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
"""Cost for upward propagation.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels-1,), where the ith entry represents the
multipole-to-multipole cost from source level i+1 to target level i.
:return: a :class:`float`, the overall cost of upward propagation.
        .. note:: This method returns a number instead of an array, because it is
            not immediately clear how the per-box cost of upward propagation would
            be useful for distributed load balancing.
"""
pass
@abstractmethod
def get_ndirect_sources_per_target_box(self, queue, traversal):
"""Collect the number of direct evaluation sources (list 1, list 3 close and
list 4 close) for each target box.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_boxes,), with each entry representing the number of direct
evaluation sources for that target box.
"""
pass
@abstractmethod
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
"""Direct evaluation cost of each target box of *traversal*.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry
representing the number of direct evaluation sources for that target box.
:arg p2p_cost: a constant representing the cost of one point-to-point
evaluation.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
using direct evaluation in this box. For example, this is useful in QBX
by specifying the number of non-QBX targets. If None, all targets in
boxes are considered.
        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
            (ntarget_boxes,), with each entry representing the cost of that box.
"""
pass
@abstractmethod
def process_list2(self, queue, traversal, m2l_cost):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) representing the translation cost of each level.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_or_target_parent_boxes,), with each entry representing the cost
of multipole-to-local translations to this box.
"""
pass
@abstractmethod
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the evaluation cost
from multipole expansion at level i to a point.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
            using multipole-to-point translations in this box. For example, this is
useful in QBX by specifying the number of non-QBX targets. If None, all
targets in boxes are considered.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(nboxes,), with each entry representing the cost of evaluating all
targets inside this box from multipole expansions of list-3 boxes.
"""
pass
@abstractmethod
def process_list4(self, queue, traversal, p2l_cost):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the translation cost
from a point to the local expansion at level i.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_or_target_parent_boxes,), with each entry representing the cost
of point-to-local translations to this box.
"""
pass
@abstractmethod
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
"""
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape (nlevels,) where the ith entry represents the cost of evaluating
the potential of a target in a box of level i using the box's local
expansion.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, use
traversal.tree.box_target_counts_nonchild.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(ntarget_boxes,), the cost of evaluating the potentials of all targets
inside this box from its local expansion.
"""
pass
@abstractmethod
def process_refine_locals(self, queue, traversal, l2l_cost):
"""Cost of downward propagation.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
of shape ``(nlevels-1,)``, where the :math:`i`th entry represents
the cost of translating local expansion from level :math:`i` to
level :math:`i+1`.
:return: a :class:`float`, the overall cost of downward propagation.
        .. note:: This method returns a number instead of an array, because it is
            not immediately clear how the per-box cost of downward propagation would
            be useful for distributed load balancing.
"""
pass
@abstractmethod
def aggregate_over_boxes(self, per_box_result):
"""Sum all entries of *per_box_result* into a number.
        :arg per_box_result: a :class:`numpy.ndarray` or
            :class:`pyopencl.array.Array` object, the result to be summed.
        :return: a :class:`float`, the result of the sum.
"""
pass
@staticmethod
def cost_factors_to_dev(cost_factors, queue):
cost_factors_dev = {}
for name in cost_factors:
if not isinstance(cost_factors[name], np.ndarray):
cost_factors_dev[name] = cost_factors[name]
continue
cost_factors_dev[name] = cl.array.to_device(
queue, cost_factors[name]
).with_queue(None)
return cost_factors_dev
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
"""Evaluate translation cost factors from symbolic model. The result of this
function can be used for process_* methods in this class.
:arg queue: If not None, the cost factor arrays will be transferred to device
using this queue.
:arg nlevels: the number of tree levels.
:arg xlat_cost: a :class:`FMMTranslationCostModel`.
:arg context: a :class:`dict` of parameters passed as context when
evaluating symbolic expressions in *xlat_cost*.
:return: a :class:`dict`, the translation cost of each step in FMM.
"""
cost_factors = {
"p2m_cost": np.array([
evaluate(xlat_cost.p2m(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"m2m_cost": np.array([
evaluate(xlat_cost.m2m(ilevel+1, ilevel), context=context)
for ilevel in range(nlevels-1)
], dtype=np.float64),
"c_p2p": evaluate(xlat_cost.direct(), context=context),
"m2l_cost": np.array([
evaluate(xlat_cost.m2l(ilevel, ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"m2p_cost": np.array([
evaluate(xlat_cost.m2p(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"p2l_cost": np.array([
evaluate(xlat_cost.p2l(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64),
"l2l_cost": np.array([
evaluate(xlat_cost.l2l(ilevel, ilevel+1), context=context)
for ilevel in range(nlevels-1)
], dtype=np.float64),
"l2p_cost": np.array([
evaluate(xlat_cost.l2p(ilevel), context=context)
for ilevel in range(nlevels)
], dtype=np.float64)
}
if queue:
cost_factors = self.cost_factors_to_dev(cost_factors, queue)
return cost_factors
@abstractmethod
def zero_cost_per_box(self, queue, nboxes):
"""Helper function for returning the per-box cost filled with 0.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:param nboxes: the number of boxes
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(*nboxes*,), representing the zero per-box cost.
"""
pass
def cost_per_box(self, queue, traversal, level_to_order,
calibration_params,
ndirect_sources_per_target_box=None,
box_target_counts_nonchild=None):
"""Predict the per-box costs of a new traversal object.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg level_to_order: a :class:`numpy.ndarray` of shape
(traversal.tree.nlevels,) representing the expansion orders
of different levels.
:arg calibration_params: a :class:`dict` of calibration parameters. These
parameters can be obtained via :meth:`estimate_calibration_params`
or :meth:`get_unit_calibration_params`.
:arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
direct evaluation sources (list 1, list 3 close, list 4 close) for each
target box. You may find :meth:`get_ndirect_sources_per_target_box`
helpful. This argument is useful because the same result can be reused
for p2p, p2qbxl and tsqbx.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, all targets are considered,
namely traversal.tree.box_target_counts_nonchild.
:return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
(nboxes,), where the ith entry represents the cost of all stages for box
i.
"""
if ndirect_sources_per_target_box is None:
ndirect_sources_per_target_box = (
self.get_ndirect_sources_per_target_box(queue, traversal)
)
tree = traversal.tree
nboxes = tree.nboxes
source_boxes = traversal.source_boxes
target_boxes = traversal.target_boxes
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
result = self.zero_cost_per_box(queue, nboxes)
for ilevel in range(tree.nlevels):
calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel]
xlat_cost = self.translation_cost_model_factory(
tree.dimensions, tree.nlevels
)
translation_cost = self.fmm_cost_factors_for_kernels_from_model(
queue, tree.nlevels, xlat_cost, calibration_params
)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
result[source_boxes] += self.process_form_multipoles(
queue, traversal, translation_cost["p2m_cost"]
)
result[target_boxes] += self.process_direct(
queue, traversal, ndirect_sources_per_target_box,
translation_cost["c_p2p"],
box_target_counts_nonchild=box_target_counts_nonchild
)
result[target_or_target_parent_boxes] += self.process_list2(
queue, traversal, translation_cost["m2l_cost"]
)
result += self.process_list3(
queue, traversal, translation_cost["m2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
result[target_or_target_parent_boxes] += self.process_list4(
queue, traversal, translation_cost["p2l_cost"]
)
result[target_boxes] += self.process_eval_locals(
queue, traversal, translation_cost["l2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
return result
def cost_per_stage(self, queue, traversal, level_to_order,
calibration_params,
ndirect_sources_per_target_box=None,
box_target_counts_nonchild=None):
"""Predict the per-stage costs of a new traversal object.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
:arg level_to_order: a :class:`numpy.ndarray` of shape
(traversal.tree.nlevels,) representing the expansion orders
of different levels.
:arg calibration_params: a :class:`dict` of calibration parameters. These
parameters can be obtained via :meth:`estimate_calibration_params`
or :meth:`get_unit_calibration_params`.
:arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
direct evaluation sources (list 1, list 3 close, list 4 close) for each
            target box. You may find :meth:`get_ndirect_sources_per_target_box`
helpful. This argument is useful because the same result can be reused
for p2p, p2qbxl and tsqbx.
:arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
:class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
which need evaluation. For example, this is useful in QBX by specifying
the number of non-QBX targets. If None, all targets are considered,
namely traversal.tree.box_target_counts_nonchild.
:return: a :class:`dict`, mapping FMM stage names to cost numbers.
"""
if ndirect_sources_per_target_box is None:
ndirect_sources_per_target_box = (
self.get_ndirect_sources_per_target_box(queue, traversal)
)
tree = traversal.tree
result = {}
for ilevel in range(tree.nlevels):
calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel]
xlat_cost = self.translation_cost_model_factory(
tree.dimensions, tree.nlevels
)
translation_cost = self.fmm_cost_factors_for_kernels_from_model(
queue, tree.nlevels, xlat_cost, calibration_params
)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
result["form_multipoles"] = self.aggregate_over_boxes(
self.process_form_multipoles(
queue, traversal, translation_cost["p2m_cost"]
)
)
result["coarsen_multipoles"] = self.process_coarsen_multipoles(
queue, traversal, translation_cost["m2m_cost"]
)
result["eval_direct"] = self.aggregate_over_boxes(
self.process_direct(
queue, traversal, ndirect_sources_per_target_box,
translation_cost["c_p2p"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
result["multipole_to_local"] = self.aggregate_over_boxes(
self.process_list2(queue, traversal, translation_cost["m2l_cost"])
)
result["eval_multipoles"] = self.aggregate_over_boxes(
self.process_list3(
queue, traversal, translation_cost["m2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
result["form_locals"] = self.aggregate_over_boxes(
self.process_list4(queue, traversal, translation_cost["p2l_cost"])
)
result["refine_locals"] = self.process_refine_locals(
queue, traversal, translation_cost["l2l_cost"]
)
result["eval_locals"] = self.aggregate_over_boxes(
self.process_eval_locals(
queue, traversal, translation_cost["l2p_cost"],
box_target_counts_nonchild=box_target_counts_nonchild
)
)
return result
@staticmethod
def get_unit_calibration_params():
return {
"c_l2l": 1.0,
"c_l2p": 1.0,
"c_m2l": 1.0,
"c_m2m": 1.0,
"c_m2p": 1.0,
"c_p2l": 1.0,
"c_p2m": 1.0,
"c_p2p": 1.0,
}
_FMM_STAGE_TO_CALIBRATION_PARAMETER: ClassVar[Mapping[str, str]] = {
"form_multipoles": "c_p2m",
"coarsen_multipoles": "c_m2m",
"eval_direct": "c_p2p",
"multipole_to_local": "c_m2l",
"eval_multipoles": "c_m2p",
"form_locals": "c_p2l",
"refine_locals": "c_l2l",
"eval_locals": "c_l2p"
}
def estimate_calibration_params(self, model_results, timing_results,
time_field_name="wall_elapsed",
additional_stage_to_param_names=()):
"""
        :arg model_results: a :class:`list` of the modeled costs for each step of
            the FMM, as returned by :meth:`cost_per_stage` with unit calibration
            parameters (from :meth:`get_unit_calibration_params`).
        :arg timing_results: a :class:`list` of the same length as *model_results*.
            Each entry is a :class:`dict` filled with timing data returned by
            :func:`boxtree.fmm.drive_fmm`.
:arg time_field_name: a :class:`str`, the field name from the timing result.
Usually this can be "wall_elapsed" or "process_elapsed".
:arg additional_stage_to_param_names: a :class:`dict` for mapping stage names
to parameter names. This is useful for supplying additional stages of
QBX.
        :return: a :class:`dict` of calibration parameters. If there is no model
            result for a particular stage, the estimated calibration parameter for
            that stage is set to zero.
"""
nresults = len(model_results)
assert len(timing_results) == nresults
stage_to_param_names = self._FMM_STAGE_TO_CALIBRATION_PARAMETER.copy()
stage_to_param_names.update(additional_stage_to_param_names)
params = set(stage_to_param_names.values())
uncalibrated_times = {}
actual_times = {}
for param in params:
uncalibrated_times[param] = np.zeros(nresults)
actual_times[param] = np.zeros(nresults)
for icase, model_result in enumerate(model_results):
for stage_name, param_name in stage_to_param_names.items():
if stage_name in model_result:
uncalibrated_times[param_name][icase] = (
model_result[stage_name])
for icase, timing_result in enumerate(timing_results):
for stage_name, time in timing_result.items():
param_name = stage_to_param_names[stage_name]
actual_times[param_name][icase] = time[time_field_name]
result = {}
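        # For each calibration parameter, fit the measured times against the
        # modeled operation counts with a one-dimensional least-squares fit:
        # the minimizer of |actual - c * uncalibrated|**2 is
        # c = <actual, uncalibrated> / <uncalibrated, uncalibrated>.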
for param in params:
uncalibrated = uncalibrated_times[param]
actual = actual_times[param]
if np.allclose(uncalibrated, 0):
result[param] = 0.0
continue
result[param] = (
actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated))
return result
# }}}
# {{{ FMMCostModel
class FMMCostModel(AbstractFMMCostModel):
"""An OpenCL-based realization of :class:`AbstractFMMCostModel`.
.. note:: For methods in this class, argument *traversal* should live in device
memory.
"""
# {{{ form multipoles
@memoize_method
def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype,
box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *np2m,
${box_id_t} *source_boxes,
${particle_id_t} *box_source_counts_nonchild,
${box_level_t} *box_levels,
double *p2m_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = source_boxes[i];
${particle_id_t} nsources = box_source_counts_nonchild[box_idx];
${box_level_t} ilevel = box_levels[box_idx];
np2m[i] = nsources * p2m_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_form_multipoles"
)
def process_form_multipoles(self, queue, traversal, p2m_cost):
tree = traversal.tree
np2m = cl.array.zeros(queue, len(traversal.source_boxes), dtype=np.float64)
process_form_multipoles_knl = self.process_form_multipoles_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_form_multipoles_knl(
np2m,
traversal.source_boxes,
tree.box_source_counts_nonchild,
tree.box_levels,
p2m_cost
)
return np2m
# }}}
# {{{ propagate multipoles upward
@memoize_method
def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype,
box_level_dtype, nlevels):
return ElementwiseKernel(
context,
Template(r"""
${box_id_t} *source_parent_boxes,
${box_level_t} *box_levels,
double *m2m_cost,
double *nm2m,
% for i in range(2**ndimensions):
% if i == 2**ndimensions - 1:
${box_id_t} *box_child_ids_${i}
% else:
${box_id_t} *box_child_ids_${i},
% endif
% endfor
""").render(
ndimensions=ndimensions,
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = source_parent_boxes[i];
${box_level_t} target_level = box_levels[box_idx];
if(target_level <= 1) {
nm2m[i] = 0.0;
} else {
int nchild = 0;
% for i in range(2**ndimensions):
if(box_child_ids_${i}[box_idx])
nchild += 1;
% endfor
nm2m[i] = nchild * m2m_cost[target_level];
}
""").render(
ndimensions=ndimensions,
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype),
nlevels=nlevels
),
name="process_coarsen_multipoles"
)
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
tree = traversal.tree
nm2m = cl.array.zeros(
queue, len(traversal.source_parent_boxes), dtype=np.float64
)
process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl(
queue.context,
tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels
)
process_coarsen_multipoles_knl(
traversal.source_parent_boxes,
tree.box_levels,
m2m_cost,
nm2m,
*tree.box_child_ids,
queue=queue
)
return self.aggregate_over_boxes(nm2m)
# }}}
# {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
@memoize_method
def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype):
return ElementwiseKernel(
context,
Template("""
${particle_id_t} *ndirect_sources_by_itgt_box,
${box_id_t} *source_boxes_starts,
${box_id_t} *source_boxes_lists,
${particle_id_t} *box_source_counts_nonchild
""").render(
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_id_t=dtype_to_ctype(box_id_dtype)
),
Template(r"""
${particle_id_t} nsources = 0;
${box_id_t} source_boxes_start_idx = source_boxes_starts[i];
${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1];
for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx;
cur_source_boxes_idx < source_boxes_end_idx;
cur_source_boxes_idx++)
{
${box_id_t} cur_source_box = source_boxes_lists[
cur_source_boxes_idx
];
nsources += box_source_counts_nonchild[cur_source_box];
}
ndirect_sources_by_itgt_box[i] += nsources;
""").render(
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_id_t=dtype_to_ctype(box_id_dtype)
),
name="get_ndirect_sources"
)
def get_ndirect_sources_per_target_box(self, queue, traversal):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
particle_id_dtype = tree.particle_id_dtype
box_id_dtype = tree.box_id_dtype
get_ndirect_sources_knl = self._get_ndirect_sources_knl(
queue.context, particle_id_dtype, box_id_dtype
)
ndirect_sources_by_itgt_box = cl.array.zeros(
queue, ntarget_boxes, dtype=particle_id_dtype
)
# List 1
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.neighbor_source_boxes_starts,
traversal.neighbor_source_boxes_lists,
tree.box_source_counts_nonchild
)
# List 3 close
if traversal.from_sep_close_smaller_starts is not None:
queue.finish()
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.from_sep_close_smaller_starts,
traversal.from_sep_close_smaller_lists,
tree.box_source_counts_nonchild
)
# List 4 close
if traversal.from_sep_close_bigger_starts is not None:
queue.finish()
get_ndirect_sources_knl(
ndirect_sources_by_itgt_box,
traversal.from_sep_close_bigger_starts,
traversal.from_sep_close_bigger_lists,
tree.box_source_counts_nonchild
)
return ndirect_sources_by_itgt_box
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
from pyopencl.array import take
ntargets_by_itgt_box = take(
box_target_counts_nonchild,
traversal.target_boxes,
queue=queue
)
return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost
# }}}
# {{{ translate separated siblings' ("list 2") mpoles to local
@memoize_method
def process_list2_knl(self, context, box_id_dtype, box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *nm2l,
${box_id_t} *target_or_target_parent_boxes,
${box_id_t} *from_sep_siblings_starts,
${box_level_t} *box_levels,
double *m2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} start = from_sep_siblings_starts[i];
${box_id_t} end = from_sep_siblings_starts[i+1];
${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]];
nm2l[i] = (end - start) * m2l_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_list2"
)
def process_list2(self, queue, traversal, m2l_cost):
tree = traversal.tree
box_id_dtype = tree.box_id_dtype
box_level_dtype = tree.box_level_dtype
ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
nm2l = cl.array.zeros(
queue, (ntarget_or_target_parent_boxes,), dtype=np.float64
)
process_list2_knl = self.process_list2_knl(
queue.context, box_id_dtype, box_level_dtype
)
process_list2_knl(
nm2l,
traversal.target_or_target_parent_boxes,
traversal.from_sep_siblings_starts,
tree.box_levels,
m2l_cost
)
return nm2l
# }}}
# {{{ evaluate sep. smaller mpoles ("list 3") at particles
@memoize_method
def process_list3_knl(self, context, box_id_dtype, particle_id_dtype):
return ElementwiseKernel(
context,
Template(r"""
${box_id_t} *target_boxes_sep_smaller,
${box_id_t} *sep_smaller_start,
${particle_id_t} *box_target_counts_nonchild,
double m2p_cost_current_level,
double *nm2p
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype)
),
Template(r"""
${box_id_t} target_box = target_boxes_sep_smaller[i];
${box_id_t} start = sep_smaller_start[i];
${box_id_t} end = sep_smaller_start[i+1];
${particle_id_t} ntargets = box_target_counts_nonchild[target_box];
nm2p[target_box] += (
ntargets * (end - start) * m2p_cost_current_level
);
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype)
),
name="process_list3"
)
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
nm2p = cl.array.zeros(queue, tree.nboxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
process_list3_knl = self.process_list3_knl(
queue.context, tree.box_id_dtype, tree.particle_id_dtype
)
for ilevel, sep_smaller_list in enumerate(
traversal.from_sep_smaller_by_level):
process_list3_knl(
traversal.target_boxes_sep_smaller_by_source_level[ilevel],
sep_smaller_list.starts,
box_target_counts_nonchild,
m2p_cost[ilevel].get(queue=queue).reshape(-1)[0],
nm2p,
queue=queue
)
return nm2p
# }}}
# {{{ form locals for separated bigger source boxes ("list 4")
@memoize_method
def process_list4_knl(self, context,
box_id_dtype, particle_id_dtype, box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *nm2p,
${box_id_t} *from_sep_bigger_starts,
${box_id_t} *from_sep_bigger_lists,
${particle_id_t} *box_source_counts_nonchild,
${box_level_t} *box_levels,
double *p2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} start = from_sep_bigger_starts[i];
${box_id_t} end = from_sep_bigger_starts[i+1];
for(${box_id_t} idx=start; idx < end; idx++) {
${box_id_t} src_ibox = from_sep_bigger_lists[idx];
${particle_id_t} nsources = box_source_counts_nonchild[src_ibox];
${box_level_t} ilevel = box_levels[src_ibox];
nm2p[i] += nsources * p2l_cost[ilevel];
}
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_list4"
)
def process_list4(self, queue, traversal, p2l_cost):
tree = traversal.tree
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
nm2p = cl.array.zeros(
queue, len(target_or_target_parent_boxes), dtype=np.float64
)
process_list4_knl = self.process_list4_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_list4_knl(
nm2p,
traversal.from_sep_bigger_starts,
traversal.from_sep_bigger_lists,
tree.box_source_counts_nonchild,
tree.box_levels,
p2l_cost
)
return nm2p
# }}}
# {{{ evaluate local expansions at targets
@memoize_method
def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype,
box_level_dtype):
return ElementwiseKernel(
context,
Template(r"""
double *neval_locals,
${box_id_t} *target_boxes,
${particle_id_t} *box_target_counts_nonchild,
${box_level_t} *box_levels,
double *l2p_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
Template(r"""
${box_id_t} box_idx = target_boxes[i];
${particle_id_t} ntargets = box_target_counts_nonchild[box_idx];
${box_level_t} ilevel = box_levels[box_idx];
neval_locals[i] = ntargets * l2p_cost[ilevel];
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
particle_id_t=dtype_to_ctype(particle_id_dtype),
box_level_t=dtype_to_ctype(box_level_dtype)
),
name="process_eval_locals"
)
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
neval_locals = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
process_eval_locals_knl = self.process_eval_locals_knl(
queue.context,
tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
)
process_eval_locals_knl(
neval_locals,
traversal.target_boxes,
box_target_counts_nonchild,
tree.box_levels,
l2p_cost
)
return neval_locals
# }}}
# {{{ propagate locals downward
@memoize_method
def process_refine_locals_knl(self, context, box_id_dtype):
from pyopencl.reduction import ReductionKernel
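        # The reduction index i runs over target levels; the caller restricts it
        # to 1, ..., nlevels-1 via range=slice(1, tree.nlevels). Each level
        # contributes its box count times the cost of translating a local
        # expansion from level i-1 to level i, hence l2l_cost[i - 1] below.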
return ReductionKernel(
context,
np.float64,
neutral="0.0",
reduce_expr="a+b",
map_expr=r"""
(level_start_target_or_target_parent_box_nrs[i + 1]
- level_start_target_or_target_parent_box_nrs[i])
* l2l_cost[i - 1]
""",
arguments=Template(r"""
${box_id_t} *level_start_target_or_target_parent_box_nrs,
double *l2l_cost
""").render(
box_id_t=dtype_to_ctype(box_id_dtype)
),
name="process_refine_locals"
)
def process_refine_locals(self, queue, traversal, l2l_cost):
tree = traversal.tree
process_refine_locals_knl = self.process_refine_locals_knl(
queue.context, tree.box_id_dtype
)
level_start_target_or_target_parent_box_nrs = cl.array.to_device(
queue, traversal.level_start_target_or_target_parent_box_nrs
)
cost = process_refine_locals_knl(
level_start_target_or_target_parent_box_nrs,
l2l_cost,
range=slice(1, tree.nlevels)
).get()
return cost.reshape(-1)[0]
# }}}
def zero_cost_per_box(self, queue, nboxes):
return cl.array.zeros(queue, (nboxes,), dtype=np.float64)
def aggregate_over_boxes(self, per_box_result):
if isinstance(per_box_result, float):
return per_box_result
else:
return cl.array.sum(per_box_result).get().reshape(-1)[0]
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
if not isinstance(queue, cl.CommandQueue):
            raise TypeError(
                "an OpenCL command queue must be supplied for the cost model")
return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context
)
# }}}
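# A usage sketch of the calibration workflow described in AbstractFMMCostModel.
# The function name is illustrative, and the queue, traversal, expansion orders
# and timing data are assumed to be supplied by the caller (see
# examples/cost_model.py for a complete, runnable version):
def _demo_cost_model_calibration(
        queue, trav_dev, level_to_order, timing_results):
    cost_model = FMMCostModel()
    # 1. Model the traversal with unit calibration parameters to obtain
    #    per-stage operation counts.
    model_results = [
        cost_model.cost_per_stage(
            queue, trav_dev, level_to_order,
            FMMCostModel.get_unit_calibration_params())
    ]
    # 2. Fit calibration parameters against timing data collected from
    #    boxtree.fmm.drive_fmm runs of the same traversal(s).
    params = cost_model.estimate_calibration_params(
        model_results, timing_results)
    # 3. Predict calibrated per-stage (or per-box) costs.
    return cost_model.cost_per_stage(queue, trav_dev, level_to_order, params)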
# {{{ _PythonFMMCostModel (undocumented, only used for testing)
class _PythonFMMCostModel(AbstractFMMCostModel):
def process_form_multipoles(self, queue, traversal, p2m_cost):
tree = traversal.tree
np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64)
for ilevel in range(tree.nlevels):
start, stop = traversal.level_start_source_box_nrs[ilevel:ilevel + 2]
for isrc_box, src_ibox in enumerate(
traversal.source_boxes[start:stop], start):
nsources = tree.box_source_counts_nonchild[src_ibox]
np2m[isrc_box] = nsources * p2m_cost[ilevel]
return np2m
def get_ndirect_sources_per_target_box(self, queue, traversal):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
# target box index -> nsources
ndirect_sources_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.float64)
for itgt_box in range(ntarget_boxes):
nsources = 0
start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2]
for src_ibox in traversal.neighbor_source_boxes_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
# Could be None, if not using targets with extent.
if traversal.from_sep_close_smaller_starts is not None:
start, end = (
traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]
)
for src_ibox in traversal.from_sep_close_smaller_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
# Could be None, if not using targets with extent.
if traversal.from_sep_close_bigger_starts is not None:
start, end = (
traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]
)
for src_ibox in traversal.from_sep_close_bigger_lists[start:end]:
nsources += tree.box_source_counts_nonchild[src_ibox]
ndirect_sources_by_itgt_box[itgt_box] = nsources
return ndirect_sources_by_itgt_box
def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
box_target_counts_nonchild=None):
if box_target_counts_nonchild is None:
box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
ntargets_by_itgt_box = box_target_counts_nonchild[traversal.target_boxes]
return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost
def process_list2(self, queue, traversal, m2l_cost):
tree = traversal.tree
ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64)
for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes):
start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2]
ilevel = tree.box_levels[tgt_ibox]
nm2l[itgt_box] += m2l_cost[ilevel] * (end - start)
return nm2l
def process_list3(self, queue, traversal, m2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
nm2p = np.zeros(tree.nboxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
for ilevel, sep_smaller_list in enumerate(
traversal.from_sep_smaller_by_level):
for itgt_box, tgt_ibox in enumerate(
traversal.target_boxes_sep_smaller_by_source_level[ilevel]):
ntargets = box_target_counts_nonchild[tgt_ibox]
start, end = sep_smaller_list.starts[itgt_box:itgt_box + 2]
nm2p[tgt_ibox] += ntargets * (end - start) * m2p_cost[ilevel]
return nm2p
def process_list4(self, queue, traversal, p2l_cost):
tree = traversal.tree
target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64)
for itgt_box in range(len(target_or_target_parent_boxes)):
start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2]
for src_ibox in traversal.from_sep_bigger_lists[start:end]:
nsources = tree.box_source_counts_nonchild[src_ibox]
ilevel = tree.box_levels[src_ibox]
nm2p[itgt_box] += nsources * p2l_cost[ilevel]
return nm2p
def process_eval_locals(self, queue, traversal, l2p_cost,
box_target_counts_nonchild=None):
tree = traversal.tree
ntarget_boxes = len(traversal.target_boxes)
neval_locals = np.zeros(ntarget_boxes, dtype=np.float64)
if box_target_counts_nonchild is None:
box_target_counts_nonchild = tree.box_target_counts_nonchild
for target_lev in range(tree.nlevels):
start, stop = traversal.level_start_target_box_nrs[
target_lev:target_lev+2]
for itgt_box, tgt_ibox in enumerate(
traversal.target_boxes[start:stop], start):
neval_locals[itgt_box] += (box_target_counts_nonchild[tgt_ibox]
* l2p_cost[target_lev])
return neval_locals
def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
tree = traversal.tree
result = 0.0
# nlevels-1 is the last valid level index
# nlevels-2 is the last valid level that could have children
#
# 3 is the last relevant source_level.
# 2 is the last relevant target_level.
# (because no level 1 box will be well-separated from another)
for source_level in range(tree.nlevels-1, 2, -1):
target_level = source_level - 1
cost = m2m_cost[target_level]
nmultipoles = 0
start, stop = traversal.level_start_source_parent_box_nrs[
target_level:target_level+2]
for ibox in traversal.source_parent_boxes[start:stop]:
for child in tree.box_child_ids[:, ibox]:
if child:
nmultipoles += 1
result += cost * nmultipoles
return result
def process_refine_locals(self, queue, traversal, l2l_cost):
tree = traversal.tree
result = 0.0
for target_lev in range(1, tree.nlevels):
start, stop = traversal.level_start_target_or_target_parent_box_nrs[
target_lev:target_lev+2]
source_lev = target_lev - 1
result += (stop-start) * l2l_cost[source_lev]
return result
def zero_cost_per_box(self, queue, nboxes):
return np.zeros(nboxes, dtype=np.float64)
def aggregate_over_boxes(self, per_box_result):
if isinstance(per_box_result, float):
return per_box_result
else:
return np.sum(per_box_result)
def fmm_cost_factors_for_kernels_from_model(
self, queue, nlevels, xlat_cost, context):
return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
self, None, nlevels, xlat_cost, context
)
# }}}
# vim: foldmethod=marker
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
High-level Point FMM Interface
------------------------------
To perform a distributed point FMM, first construct a
:class:`boxtree.distributed.DistributedFMMRunner` object. The constructor
distributes the necessary information from the root rank to all worker ranks.
Then, :meth:`boxtree.distributed.DistributedFMMRunner.drive_dfmm` can be used to
launch the FMM.
.. autoclass:: boxtree.distributed.DistributedFMMRunner
Distributed Algorithm Overview
------------------------------
1. Construct the global tree and traversal lists on the root rank and broadcast to
all worker ranks.
2. Partition boxes into disjoint sets, where the number of sets is the number of MPI
ranks. (See :ref:`partition-boxes`)
3. Each rank constructs the local tree and traversal lists independently, according
to the partition. (See :ref:`construct-local-tree-traversal`)
4. Distribute source weights from the root rank to all worker ranks. (See
:ref:`distributed-wrangler`)
5. Each rank independently forms multipole expansions from the leaf nodes of the
local tree and propagates the partial multipole expansions upwards.
6. Communicate multipole expansions so that all ranks have the complete multipole
expansions needed.
7. Each rank independently forms local expansions, propagates the local expansions
   downwards, and evaluates the potentials of the target points in its partition.
   The calculated potentials are then assembled on the root rank.
For steps 5-7, see :ref:`distributed-fmm-evaluation`.
Note that steps 4-7 may be repeated multiple times with the same tree and traversal
objects built from steps 1-3. For example, when iteratively solving a PDE, steps 4-7
are executed for each iteration of the linear solver.
The next sections will cover the interfaces of these steps.
.. _partition-boxes:
Partition Boxes
---------------
.. autofunction:: boxtree.distributed.partition.partition_work
.. autoclass:: boxtree.distributed.partition.BoxMasks
.. autofunction:: boxtree.distributed.partition.get_box_masks
.. _construct-local-tree-traversal:
Construct Local Tree and Traversal
----------------------------------
.. autoclass:: boxtree.distributed.local_tree.LocalTree
.. autofunction:: boxtree.distributed.local_tree.generate_local_tree
.. autofunction:: boxtree.distributed.local_traversal.generate_local_travs
.. _distributed-wrangler:
Distributed Wrangler
--------------------
.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
.. _distributed-fmm-evaluation:
Distributed FMM Evaluation
--------------------------
The distributed version of the FMM evaluation shares the same interface as the
shared-memory version. To evaluate FMM in a distributed manner, use a subclass
of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
:func:`boxtree.fmm.drive_fmm`.
"""
import enum
import warnings
import numpy as np
from mpi4py import MPI
import pyopencl as cl
import pyopencl.array
from boxtree.cost import FMMCostModel
__all__ = ["DistributedFMMRunner"]
# {{{ MPI
@enum.unique
class MPITags(enum.IntEnum):
DIST_WEIGHT = 1
GATHER_POTENTIALS = 2
REDUCE_POTENTIALS = 3
REDUCE_INDICES = 4
def dtype_to_mpi(dtype):
""" This function translates a numpy datatype into the corresponding type used in
mpi4py.
"""
if hasattr(MPI, "_typedict"):
typedict = MPI._typedict
elif hasattr(MPI, "__TypeDict__"):
typedict = MPI.__TypeDict__
else:
raise RuntimeError(
"There is no dictionary to translate from Numpy dtype to MPI type")
mpi_type = typedict.get(np.dtype(dtype).char, None)
if mpi_type is None:
raise ValueError(f"Could not convert '{dtype}' to an MPI datatype")
return mpi_type
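# For example, dtype_to_mpi(np.float64) is expected to resolve to MPI.DOUBLE (via
# the "d" type character), and dtype_to_mpi(np.int32) to MPI.INT.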
# }}}
# {{{ DistributedFMMRunner
def make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm):
"""Helper function for constructing the distributed wrangler on each rank.
.. note::
This function needs to be called collectively on all ranks.
:returns: a tuple of ``(wrangler, src_idx_all_ranks, tgt_idx_all_ranks)``,
where the wrangler is constructed according to *wrangler_factory* and
the indices are passed to :func:`boxtree.fmm.drive_fmm`.
"""
mpi_rank = comm.Get_rank()
# `tree_in_device_memory` is True if the global tree is in the device memory
# `tree_in_device_memory` is False if the global tree is in the host memory
#
# Note that at this point, only the root rank has the valid global tree, so
# we test `tree_in_device_memory` on the root rank and broadcast to all
# worker ranks.
tree_in_device_memory = None
if mpi_rank == 0:
tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
# {{{ Broadcast the global tree
global_tree_host = None
if mpi_rank == 0:
if tree_in_device_memory:
global_tree_host = global_tree.get(queue)
else:
global_tree_host = global_tree
global_tree_host = comm.bcast(global_tree_host, root=0)
global_tree_dev = None
if mpi_rank == 0 and tree_in_device_memory:
global_tree_dev = global_tree
else:
global_tree_dev = global_tree_host.to_device(queue)
global_tree_dev = global_tree_dev.with_queue(queue)
global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
global_trav_host = global_trav_dev.get(queue)
global_trav = global_trav_dev if tree_in_device_memory else global_trav_host
# }}}
# {{{ Partition work
cost_per_box = None
if mpi_rank == 0:
cost_model = FMMCostModel()
if calibration_params is None:
# Use default calibration parameters if not supplied
# TODO: should replace the default calibration params with a more
# accurate one
warnings.warn("Calibration parameters for the cost model are not "
"supplied. The default one will be used.",
stacklevel=2)
calibration_params = \
FMMCostModel.get_unit_calibration_params()
# We need to construct a wrangler in order to access `level_orders`
global_wrangler = wrangler_factory(global_trav, global_trav)
cost_per_box = cost_model.cost_per_box(
queue, global_trav_dev, global_wrangler.level_orders,
calibration_params
).get()
from boxtree.distributed.partition import partition_work
responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
# }}}
# {{{ Compute local tree
from boxtree.distributed.local_tree import generate_local_tree
local_tree, src_idx, tgt_idx = generate_local_tree(
queue, global_trav_host, responsible_boxes_list, comm)
# }}}
# {{{ Gather source indices and target indices of each rank
src_idx_all_ranks = comm.gather(src_idx, root=0)
tgt_idx_all_ranks = comm.gather(tgt_idx, root=0)
# }}}
# {{{ Compute traversal object on each rank
from boxtree.distributed.local_traversal import generate_local_travs
local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
if not tree_in_device_memory:
local_trav = local_trav_dev.get(queue=queue)
else:
local_trav = local_trav_dev.with_queue(None)
# }}}
wrangler = wrangler_factory(local_trav, global_trav)
return wrangler, src_idx_all_ranks, tgt_idx_all_ranks
class DistributedFMMRunner:
"""Helper class for setting up and running distributed point FMM.
.. automethod:: __init__
.. automethod:: drive_dfmm
"""
def __init__(self, queue, global_tree,
traversal_builder,
wrangler_factory,
calibration_params=None, comm=MPI.COMM_WORLD):
"""Construct a DistributedFMMRunner object.
:arg global_tree: a :class:`boxtree.Tree` object. This tree could live in the
host or the device memory, depending on the wrangler. This argument is
only significant on the root rank.
:arg traversal_builder: an object which, when called, takes a
:class:`pyopencl.CommandQueue` object and a :class:`boxtree.Tree` object,
and generates a :class:`boxtree.traversal.FMMTraversalInfo` object from
the tree using the command queue.
:arg wrangler_factory: an object which, when called, takes the local
traversal and the global traversal objects and returns an
:class:`boxtree.fmm.ExpansionWranglerInterface` object.
        :arg calibration_params: calibration parameters for the cost model,
            if supplied. The cost model is used to estimate the execution time of
            each box, which in turn is used to improve load balancing.
:arg comm: MPI communicator.
"""
self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
make_distributed_wrangler(
queue, global_tree, traversal_builder, wrangler_factory,
calibration_params, comm)
def drive_dfmm(self, source_weights, timing_data=None):
"""Calculate potentials at target points.
"""
from boxtree.fmm import drive_fmm
return drive_fmm(
self.wrangler, source_weights,
timing_data=timing_data,
global_src_idx_all_ranks=self.src_idx_all_ranks,
global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
# }}}
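# A minimal usage sketch (the function name and its arguments are placeholders to
# be supplied by the calling application; traversal_builder and wrangler_factory
# are as described in DistributedFMMRunner.__init__). This must be called
# collectively on all ranks; the global tree argument is only significant on the
# root rank, and the calculated potentials are assembled on the root rank.
def _demo_distributed_fmm(queue, global_tree, traversal_builder,
        wrangler_factory, source_weights):
    runner = DistributedFMMRunner(
        queue, global_tree, traversal_builder, wrangler_factory,
        comm=MPI.COMM_WORLD)
    # drive_dfmm takes a sequence of source weight vectors, here a single one.
    return runner.drive_dfmm([source_weights])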
# vim: fdm=marker
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import numpy as np
from mako.template import Template
from mpi4py import MPI
import pyopencl as cl
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
from boxtree.distributed import MPITags
from boxtree.fmm import ExpansionWranglerInterface
from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
logger = logging.getLogger(__name__)
# {{{ Distributed FMM wrangler
class DistributedExpansionWrangler(ExpansionWranglerInterface):
"""Distributed expansion wrangler base class.
    This is an abstract class and should not be instantiated directly. Instead,
    all distributed wranglers are expected to be subclasses of this class.
.. automethod:: __init__
.. automethod:: distribute_source_weights
.. automethod:: gather_potential_results
.. automethod:: communicate_mpoles
"""
def __init__(self, context, comm, global_traversal,
traversal_in_device_memory,
communicate_mpoles_via_allreduce=False):
self.context = context
self.comm = comm
self.global_traversal = global_traversal
self.traversal_in_device_memory = traversal_in_device_memory
self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
if mpi_rank == 0:
distribute_weight_req = []
local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
for irank in range(mpi_size):
local_src_weight_vecs[irank] = [
source_weights[src_idx_all_ranks[irank]]
for source_weights in src_weight_vecs]
if irank != 0:
distribute_weight_req.append(self.comm.isend(
local_src_weight_vecs[irank], dest=irank,
tag=MPITags.DIST_WEIGHT
))
MPI.Request.Waitall(distribute_weight_req)
local_src_weight_vecs = local_src_weight_vecs[0]
else:
local_src_weight_vecs = self.comm.recv(source=0, tag=MPITags.DIST_WEIGHT)
return local_src_weight_vecs
def gather_potential_results(self, potentials, tgt_idx_all_ranks):
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
from boxtree.distributed import dtype_to_mpi
potentials_mpi_type = dtype_to_mpi(potentials.dtype)
gathered_potentials = None
if mpi_rank == 0:
            # The root rank receives the calculated potentials from all worker ranks
potentials_all_ranks = np.empty((mpi_size,), dtype=object)
potentials_all_ranks[0] = potentials
recv_reqs = []
for irank in range(1, mpi_size):
potentials_all_ranks[irank] = np.empty(
tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
recv_reqs.append(
self.comm.Irecv(
[potentials_all_ranks[irank], potentials_mpi_type],
source=irank, tag=MPITags.GATHER_POTENTIALS))
MPI.Request.Waitall(recv_reqs)
# Assemble potentials from worker ranks together on the root rank
gathered_potentials = np.empty(
self.global_traversal.tree.ntargets, dtype=potentials.dtype)
for irank in range(mpi_size):
gathered_potentials[tgt_idx_all_ranks[irank]] = (
potentials_all_ranks[irank])
else:
# Worker ranks send calculated potentials to the root rank
self.comm.Send([potentials, potentials_mpi_type],
dest=0, tag=MPITags.GATHER_POTENTIALS)
return gathered_potentials
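# _slice_mpoles and _update_mpoles below pack and accumulate, respectively,
# the multipole expansions of a sorted list of global box indices into/from
# a flat buffer, so that they can be exchanged between ranks in
# communicate_mpoles.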
def _slice_mpoles(self, mpoles, slice_indices):
if len(slice_indices) == 0:
return np.empty((0,), dtype=mpoles.dtype)
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
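# Because both slice_indices and level_start_box_nrs are sorted,
# level_start_slice_indices[ilevel] and [ilevel + 1] bracket the entries of
# slice_indices that lie on level ilevel.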
mpoles_list = []
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_list.append(
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
].reshape(-1)
)
return np.concatenate(mpoles_list)
def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
if len(slice_indices) == 0:
return
level_start_slice_indices = np.searchsorted(
slice_indices, self.traversal.tree.level_start_box_nrs)
mpole_updates_start = 0
for ilevel in range(self.traversal.tree.nlevels):
start, stop = level_start_slice_indices[ilevel:ilevel+2]
if stop > start:
level_start_box_idx, mpoles_current_level = (
self.multipole_expansions_view(mpoles, ilevel))
mpoles_shape = (stop - start,) + mpoles_current_level.shape[1:]
from pytools import product
mpole_updates_end = mpole_updates_start + product(mpoles_shape)
mpoles_current_level[
slice_indices[start:stop] - level_start_box_idx
] += mpole_updates[
mpole_updates_start:mpole_updates_end
].reshape(mpoles_shape)
mpole_updates_start = mpole_updates_end
@memoize_method
def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
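# One work item per entry of contributing_boxes_list: look up the user
# ranks of that box and set box_in_subrange[i] to 1 if any of them falls
# in [subrange_start, subrange_end).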
return ElementwiseKernel(
self.context,
Template(r"""
${box_id_t} *contributing_boxes_list,
int subrange_start,
int subrange_end,
${box_id_t} *box_to_user_rank_starts,
int *box_to_user_rank_lists,
char *box_in_subrange
""").render(
box_id_t=dtype_to_ctype(box_id_dtype),
),
Template(r"""
${box_id_t} ibox = contributing_boxes_list[i];
${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
int useri = box_to_user_rank_lists[iuser];
if(subrange_start <= useri && useri < subrange_end) {
box_in_subrange[i] = 1;
}
}
""").render(
box_id_t=dtype_to_ctype(box_id_dtype)
),
"find_boxes_used_by_subrange"
)
def find_boxes_used_by_subrange(
self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
contributing_boxes_list):
"""Test whether the multipole expansions of the contributing boxes are used
by at least one box in a range.
:arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
:arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
indicating the start and end index in *box_to_user_rank_lists* for each
box in *contributing_boxes_list*.
:arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
the users of each box in *contributing_boxes_list*.
:returns: a :class:`pyopencl.array.Array` object with the same shape as
*contributing_boxes_list*, where the i-th entry is 1 if
``contributing_boxes_list[i]`` is used by at least on box in the
subrange specified.
"""
box_in_subrange = cl.array.zeros(
contributing_boxes_list.queue,
contributing_boxes_list.shape[0],
dtype=np.int8
)
knl = self.find_boxes_used_by_subrange_kernel(
self.traversal.tree.box_id_dtype)
knl(
contributing_boxes_list,
subrange[0],
subrange[1],
box_to_user_rank_starts,
box_to_user_rank_lists,
box_in_subrange
)
return box_in_subrange
def communicate_mpoles(self, mpole_exps, return_stats=False):
"""Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
The main idea is to mimic an allreduce as done on a hypercube network, but to
decrease the bandwidth cost by sending only information that is relevant to
the rank receiving the message.
.. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
Lexing Ying, Denis Zorin, and George Biros. "A massively parallel
adaptive fast multipole method on heterogeneous architectures."
Communications of the ACM 55, no. 5 (2012): 101-109.
"""
mpi_rank = self.comm.Get_rank()
mpi_size = self.comm.Get_size()
tree = self.traversal.tree
if self.communicate_mpoles_via_allreduce:
# Use MPI allreduce for communicating multipole expansions. It is slower
# but might be helpful for debugging purposes.
mpole_exps_all = np.zeros_like(mpole_exps)
self.comm.Allreduce(mpole_exps, mpole_exps_all)
mpole_exps[:] = mpole_exps_all
return
stats = {}
# contributing_boxes:
#
# A mask of the set of boxes that the current process contributes
# to. This process contributes to a box when:
#
# (a) this process owns sources that contribute to the multipole expansion
# in the box (via the upward pass) or
# (b) this process has received a portion of the multipole expansion in this
# box from another process.
#
# Initially, this set consists of the boxes satisfying condition (a), which
# are precisely the boxes owned by this process and their ancestors.
if self.traversal_in_device_memory:
with cl.CommandQueue(self.context) as queue:
contributing_boxes = tree.ancestor_mask.get(queue=queue)
responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
else:
contributing_boxes = tree.ancestor_mask.copy()
responsible_boxes_list = tree.responsible_boxes_list
contributing_boxes[responsible_boxes_list] = 1
from boxtree.tools import AllReduceCommPattern
comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
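# AllReduceCommPattern models a hypercube-style all-reduce. Roughly, at
# each stage sources() and sinks() give the ranks to receive from and send
# to, and messages() gives the range of ranks whose data needs to be
# forwarded at this stage; it is used below to select the relevant subset
# of contributing boxes.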
# Temporary buffers for receiving data
mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
boxes_list_buf = np.empty(tree.nboxes, dtype=tree.box_id_dtype)
stats["bytes_sent_by_stage"] = []
stats["bytes_recvd_by_stage"] = []
if self.traversal_in_device_memory:
box_to_user_rank_starts_dev = \
tree.box_to_user_rank_starts.with_queue(None)
box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
else:
with cl.CommandQueue(self.context) as queue:
box_to_user_rank_starts_dev = cl.array.to_device(
queue, tree.box_to_user_rank_starts).with_queue(None)
box_to_user_rank_lists_dev = cl.array.to_device(
queue, tree.box_to_user_rank_lists).with_queue(None)
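# Each pass through the loop below is one stage of the hypercube-style
# reduction: select the contributing boxes whose data is needed by the
# destination ranks, send their multipole data, receive and accumulate the
# data arriving from the source ranks, then advance the pattern.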
while not comm_pattern.done():
send_requests = []
# Send data to other processors.
if comm_pattern.sinks():
# Compute the subset of boxes to be sent.
message_subrange = comm_pattern.messages()
contributing_boxes_list = np.nonzero(contributing_boxes)[0].astype(
tree.box_id_dtype
)
with cl.CommandQueue(self.context) as queue:
contributing_boxes_list_dev = cl.array.to_device(
queue, contributing_boxes_list)
box_in_subrange = self.find_boxes_used_by_subrange(
message_subrange,
box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
contributing_boxes_list_dev
)
box_in_subrange_host = box_in_subrange.get().astype(bool)
relevant_boxes_list = contributing_boxes_list[
box_in_subrange_host
].astype(tree.box_id_dtype)
"""
# Pure Python version for debugging purpose
relevant_boxes_list = []
for contrib_box in contributing_boxes_list:
iuser_start, iuser_end = tree.box_to_user_rank_starts[
contrib_box:contrib_box + 2
]
for user_box in tree.box_to_user_rank_lists[
iuser_start:iuser_end]:
if subrange_start <= user_box < subrange_end:
relevant_boxes_list.append(contrib_box)
break
"""
relevant_boxes_list = np.array(
relevant_boxes_list, dtype=tree.box_id_dtype
)
relevant_mpole_exps = self._slice_mpoles(
mpole_exps, relevant_boxes_list)
# Send the box subset to the other processors.
for sink in comm_pattern.sinks():
req = self.comm.Isend(relevant_mpole_exps, dest=sink,
tag=MPITags.REDUCE_POTENTIALS)
send_requests.append(req)
req = self.comm.Isend(relevant_boxes_list, dest=sink,
tag=MPITags.REDUCE_INDICES)
send_requests.append(req)
# Receive data from other processors.
for source in comm_pattern.sources():
self.comm.Recv(mpole_exps_buf, source=source,
tag=MPITags.REDUCE_POTENTIALS)
status = MPI.Status()
self.comm.Recv(
boxes_list_buf, source=source, tag=MPITags.REDUCE_INDICES,
status=status)
nboxes = status.Get_count() // boxes_list_buf.dtype.itemsize
# Update data structures.
self._update_mpoles(
mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
contributing_boxes[boxes_list_buf[:nboxes]] = 1
for req in send_requests:
req.wait()
comm_pattern.advance()
if return_stats:
return stats
def finalize_potentials(self, potentials, template_ary):
if self.comm.Get_rank() == 0:
return super().finalize_potentials(potentials, template_ary)
else:
return None
class DistributedFMMLibExpansionWrangler(
DistributedExpansionWrangler, FMMLibExpansionWrangler):
def __init__(
self, context, comm, tree_indep, local_traversal, global_traversal,
fmm_level_to_order=None,
communicate_mpoles_via_allreduce=False,
**kwargs):
DistributedExpansionWrangler.__init__(
self, context, comm, global_traversal, False,
communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
FMMLibExpansionWrangler.__init__(
self, tree_indep, local_traversal,
fmm_level_to_order=fmm_level_to_order, **kwargs)
# TODO: use log_process like FMMLibExpansionWrangler?
def reorder_sources(self, source_array):
if self.comm.Get_rank() == 0:
return source_array[..., self.global_traversal.tree.user_source_ids]
else:
return None
def reorder_potentials(self, potentials):
if self.comm.Get_rank() == 0:
return potentials[self.global_traversal.tree.sorted_target_ids]
else:
return None
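# A minimal construction sketch (illustrative only; `ctx`, `tree_indep`,
# `local_trav`, `global_trav` and `fmm_level_to_order` are placeholders
# assumed to come from the surrounding distributed FMM setup):
#
#     from mpi4py import MPI
#     wrangler = DistributedFMMLibExpansionWrangler(
#         ctx, MPI.COMM_WORLD, tree_indep, local_trav, global_trav,
#         fmm_level_to_order=fmm_level_to_order)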
# }}}
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import time
logger = logging.getLogger(__name__)
def generate_local_travs(
queue, local_tree, traversal_builder, merge_close_lists=False):
"""Generate local traversal from local tree.
:arg queue: a :class:`pyopencl.CommandQueue` object.
:arg local_tree: the local tree (with arrays stored as
`boxtree.tools.ImmutableHostDeviceArray`) from which the local traversal
object will be constructed.
:arg traversal_builder: a function that takes a :class:`pyopencl.CommandQueue`
and a tree and returns a tuple whose first entry is the traversal object
built from the tree.
:return: the generated local traversal object, in device memory.
"""
start_time = time.time()
local_tree.with_queue(queue)
# We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict
# multipole formation and upward propagation to the region of boxes this rank
# is responsible for. Without these restrictions, sources distributed to more
# than one rank could be counted multiple times.
local_trav, _ = traversal_builder(
queue, local_tree.to_device(queue),
source_boxes_mask=local_tree.responsible_boxes_mask.device,
source_parent_boxes_mask=local_tree.ancestor_mask.device
)
if merge_close_lists and local_tree.targets_have_extent:
local_trav = local_trav.merge_close_lists(queue)
logger.info("Generate local traversal in %f sec.", time.time() - start_time)
return local_trav
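# A minimal usage sketch (illustrative only; `queue`, `local_tree` and
# `traversal_builder` are placeholders assumed to come from the surrounding
# distributed FMM setup):
#
#     local_trav = generate_local_travs(queue, local_tree, traversal_builder)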