Skip to content
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import time
from dataclasses import dataclass
import numpy as np
from mako.template import Template
import pyopencl as cl
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
from boxtree import Tree
logger = logging.getLogger(__name__)
# FIXME: The logic in this file has a lot in common with
# the particle filtering functionality that already exists.
# We should refactor this to make use of this commonality.
# https://documen.tician.de/boxtree/tree.html#filtering-the-lists-of-targets
class LocalTreeGeneratorCodeContainer:
    """Objects of this type serve as a place to keep the code needed for
    :func:`generate_local_tree`.

    Each ``*_kernel`` method builds one OpenCL kernel for the context, number
    of dimensions and dtypes given at construction time. The methods are
    wrapped in :func:`pytools.memoize_method`.
    """

    def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype):
        """
        :arg cl_context: the OpenCL context all kernels are built for.
        :arg dimensions: number of coordinate arrays per particle set.
        :arg particle_id_dtype: dtype used for particle indices, masks and
            counts in the generated kernels.
        :arg coord_dtype: dtype of particle coordinates and radii.
        """
        self.cl_context = cl_context
        self.dimensions = dimensions
        self.particle_id_dtype = particle_id_dtype
        self.coord_dtype = coord_dtype

    @memoize_method
    def particle_mask_kernel(self):
        """Return an elementwise kernel over boxes: for every box ``i`` with
        a nonzero entry in ``responsible_boxes``, set ``particle_mask[pid] = 1``
        for each ``pid`` in ``[box_particle_starts[i],
        box_particle_starts[i] + box_particle_counts_nonchild[i])``.
        """
        particle_id_t = dtype_to_ctype(self.particle_id_dtype)

        # Both templates are rendered strictly so that a misspelled
        # placeholder fails at render time rather than producing bad C.
        return cl.elementwise.ElementwiseKernel(
            self.cl_context,
            arguments=Template("""
__global char *responsible_boxes,
__global ${particle_id_t} *box_particle_starts,
__global ${particle_id_t} *box_particle_counts_nonchild,
__global ${particle_id_t} *particle_mask
""", strict_undefined=True).render(
                particle_id_t=particle_id_t
            ),
            operation=Template("""
if(responsible_boxes[i]) {
for(${particle_id_t} pid = box_particle_starts[i];
pid < box_particle_starts[i]
+ box_particle_counts_nonchild[i];
++pid) {
particle_mask[pid] = 1;
}
}
""", strict_undefined=True).render(particle_id_t=particle_id_t)
        )

    @memoize_method
    def mask_scan_kernel(self):
        """Return a sum-scan kernel over ``ary`` whose result for element
        ``i`` is stored in ``scan[i + 1]``.

        ``scan[0]`` is not written by the kernel; the caller has to
        initialize it.
        """
        from pyopencl.scan import GenericScanKernel

        return GenericScanKernel(
            self.cl_context, self.particle_id_dtype,
            arguments=Template("""
__global ${mask_t} *ary,
__global ${mask_t} *scan
""", strict_undefined=True).render(
                mask_t=dtype_to_ctype(self.particle_id_dtype)
            ),
            input_expr="ary[i]",
            scan_expr="a+b", neutral="0",
            output_statement="scan[i + 1] = item;"
        )

    # Argument list of the fetch kernel: mask, scan, then one input and one
    # output coordinate array per dimension, optionally followed by the
    # input/output radii.
    fetch_local_particles_arguments = Template("""
__global const ${mask_t} *particle_mask,
__global const ${mask_t} *particle_scan
% for dim in range(ndims):
, __global const ${coord_t} *particles_${dim}
% endfor
% for dim in range(ndims):
, __global ${coord_t} *local_particles_${dim}
% endfor
% if particles_have_extent:
, __global const ${coord_t} *particle_radii
, __global ${coord_t} *local_particle_radii
% endif
""", strict_undefined=True)

    # Body of the fetch kernel: a masked particle i is copied to slot
    # particle_scan[i] of the local arrays.
    fetch_local_particles_prg = Template("""
if(particle_mask[i]) {
${particle_id_t} des = particle_scan[i];
% for dim in range(ndims):
local_particles_${dim}[des] = particles_${dim}[i];
% endfor
% if particles_have_extent:
local_particle_radii[des] = particle_radii[i];
% endif
}
""", strict_undefined=True)

    @memoize_method
    def fetch_local_particles_kernel(self, particles_have_extent):
        """Return the elementwise kernel that copies every masked particle
        into the local particle arrays.

        :arg particles_have_extent: if true, the kernel additionally takes
            and copies particle radii.
        """
        particle_id_t = dtype_to_ctype(self.particle_id_dtype)

        return cl.elementwise.ElementwiseKernel(
            self.cl_context,
            self.fetch_local_particles_arguments.render(
                mask_t=particle_id_t,
                coord_t=dtype_to_ctype(self.coord_dtype),
                ndims=self.dimensions,
                particles_have_extent=particles_have_extent
            ),
            self.fetch_local_particles_prg.render(
                particle_id_t=particle_id_t,
                ndims=self.dimensions,
                particles_have_extent=particles_have_extent
            )
        )

    @memoize_method
    def mask_compressor_kernel(self):
        """Return a :class:`boxtree.tools.MaskCompressorKernel` for the
        context of this container.
        """
        from boxtree.tools import MaskCompressorKernel
        return MaskCompressorKernel(self.cl_context)

    @memoize_method
    def modify_target_flags_kernel(self):
        """Return an elementwise kernel over boxes that recomputes the
        ``BOX_IS_TARGET_BOX`` and ``BOX_HAS_TARGET_CHILD_BOXES`` bits of
        ``box_flags`` from the given target counts.

        A box is flagged as a target box if its non-child target count is
        nonzero, and as having target child boxes if its cumulative count
        exceeds its non-child count. All other flag bits are left alone.
        """
        from boxtree import box_flags_enum
        box_flag_t = dtype_to_ctype(box_flags_enum.dtype)

        return cl.elementwise.ElementwiseKernel(
            self.cl_context,
            Template("""
__global ${particle_id_t} *box_target_counts_nonchild,
__global ${particle_id_t} *box_target_counts_cumul,
__global ${box_flag_t} *box_flags
""", strict_undefined=True).render(
                particle_id_t=dtype_to_ctype(self.particle_id_dtype),
                box_flag_t=box_flag_t
            ),
            r"""
// reset BOX_IS_TARGET_BOX and BOX_HAS_TARGET_CHILD_BOXES bits
// in the flag of each box
box_flags[i] &= (~BOX_IS_TARGET_BOX);
box_flags[i] &= (~BOX_HAS_TARGET_CHILD_BOXES);
// rebuild BOX_IS_TARGET_BOX and BOX_HAS_TARGET_CHILD_BOXES bits
if(box_target_counts_nonchild[i]) box_flags[i] |= BOX_IS_TARGET_BOX;
if(box_target_counts_nonchild[i] < box_target_counts_cumul[i])
box_flags[i] |= BOX_HAS_TARGET_CHILD_BOXES;
""",
            preamble=box_flags_enum.get_c_defines()
        )
@dataclass
class LocalParticlesAndLists:
    """Result record of :func:`construct_local_particles_and_lists` for one
    particle kind (sources or targets).
    """

    # object array with one device array of local coordinates per dimension
    particles: np.ndarray
    # device array of local particle radii, or None without extent
    particle_radii: cl.array.Array | None
    # per box: index of the first local particle
    box_particle_starts: cl.array.Array
    # per box: number of particles directly in the box, zero for boxes
    # that are not selected by the box mask
    box_particle_counts_nonchild: cl.array.Array
    # per box: number of local particles in the cumulative particle range
    box_particle_counts_cumul: cl.array.Array
    # host array of the global indices of the particles that were kept
    particle_idx: np.ndarray
def construct_local_particles_and_lists(
        queue, code, dimensions, num_boxes, num_global_particles,
        particle_id_dtype, coord_dtype, particles_have_extent,
        box_mask,
        global_particles, global_particle_radii,
        box_particle_starts, box_particle_counts_nonchild,
        box_particle_counts_cumul):
    """Extract the particles (either sources or targets) that belong to the
    local tree and rebuild the per-box particle lists for them.

    :arg code: a :class:`LocalTreeGeneratorCodeContainer`.
    :arg box_mask: device array with one entry per box; particles sitting
        directly in boxes with a nonzero entry are kept.
    :arg global_particles: object array of device coordinate arrays, one
        per dimension.
    :arg global_particle_radii: device array of radii, only used when
        *particles_have_extent* is true.
    :returns: a :class:`LocalParticlesAndLists`.
    """
    from pytools.obj_array import make_obj_array

    # {{{ flag the particles that are kept

    keep_flags = cl.array.zeros(
        queue, num_global_particles, dtype=particle_id_dtype)
    flag_particles = code.particle_mask_kernel()
    flag_particles(
        box_mask, box_particle_starts, box_particle_counts_nonchild, keep_flags)

    # }}}

    # {{{ scan the flags: entry i becomes the local index of global particle i

    local_index = cl.array.empty(
        queue, num_global_particles + 1, dtype=particle_id_dtype)
    # the scan kernel only writes entries 1 and up
    local_index[0] = 0
    scan_flags = code.mask_scan_kernel()
    scan_flags(keep_flags, local_index)

    # }}}

    # {{{ fetch the local particles

    # the last scan entry is the total number of kept particles
    num_local_particles = local_index[-1].get(queue).item()

    local_particles = make_obj_array([
        cl.array.empty(queue, num_local_particles, dtype=coord_dtype)
        for _ in range(dimensions)])

    fetch_args = [
        keep_flags, local_index,
        *global_particles.tolist(),
        *local_particles]

    if particles_have_extent:
        local_particle_radii = cl.array.empty(
            queue, num_local_particles, dtype=coord_dtype)
        fetch_args.extend([global_particle_radii, local_particle_radii])
        fetch_particles = code.fetch_local_particles_kernel(True)
    else:
        local_particle_radii = None
        fetch_particles = code.fetch_local_particles_kernel(False)

    fetch_particles(*fetch_args)

    # }}}

    # {{{ construct the list of list indices

    local_box_particle_starts = local_index[box_particle_starts]

    # boxes outside of the mask keep no particles of their own
    local_box_particle_counts_nonchild = cl.array.if_positive(
        box_mask,
        box_particle_counts_nonchild,
        cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype))

    global_ends_cumul = box_particle_starts + box_particle_counts_cumul
    local_box_particle_counts_cumul = (
        local_index[global_ends_cumul] - local_index[box_particle_starts])

    # }}}

    # global indices of the kept particles, computed on the host
    host_keep = keep_flags.get(queue=queue).astype(bool)
    kept_global_indices = np.arange(num_global_particles)[host_keep]

    return LocalParticlesAndLists(
        particles=local_particles,
        particle_radii=local_particle_radii,
        box_particle_starts=local_box_particle_starts,
        box_particle_counts_nonchild=local_box_particle_counts_nonchild,
        box_particle_counts_cumul=local_box_particle_counts_cumul,
        particle_idx=kept_global_indices)
class LocalTree(Tree):
    """
    Inherits from :class:`boxtree.Tree`.

    Instances are created by :func:`generate_local_tree`, which passes the
    two attributes below to the constructor.

    .. attribute:: box_to_user_rank_starts

        ``box_id_t [nboxes + 1]``

    .. attribute:: box_to_user_rank_lists

        ``int32 [*]``

        A :ref:`csr` array, together with :attr:`box_to_user_rank_starts`.
        For each box, the list of ranks which own targets that *use* the
        multipole expansion at this box, via either List 3 or (possibly downward
        propagated from an ancestor) List 2.
    """
def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
    """Generate the local tree for the current rank.

    This is an MPI-collective routine on *comm*.

    :arg queue: a :class:`pyopencl.CommandQueue` object.
    :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object
        on host memory.
    :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the
        responsible boxes of the current rank.
    :arg comm: MPI communicator. The multipole-user masks of all ranks are
        gathered on rank 0 and the resulting box-to-user-rank lists are
        broadcast from there.

    :return: a tuple of ``(local_tree, src_idx, tgt_idx)``, where ``local_tree`` is
        an object with class :class:`boxtree.distributed.local_tree.LocalTree` of the
        generated local tree, ``src_idx`` is the indices of the local sources in the
        global tree, and ``tgt_idx`` is the indices of the local targets in the
        global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source
        weights from root rank and assembling calculated potentials on the root rank.
    """
    global_tree = global_traversal.tree
    code = LocalTreeGeneratorCodeContainer(
        queue.context, global_tree.dimensions,
        global_tree.particle_id_dtype, global_tree.coord_dtype)

    mpi_rank = comm.Get_rank()
    mpi_size = comm.Get_size()

    start_time = time.time()

    from boxtree.distributed.partition import get_box_masks
    box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list)

    global_tree_dev = global_tree.to_device(queue).with_queue(queue)

    # Sources are kept for every box whose point sources are needed, which
    # may include boxes this rank is not responsible for.
    # The radii field of the tree is ``source_radii``, matching the keyword
    # passed to the LocalTree constructor below.
    local_sources_and_lists = construct_local_particles_and_lists(
        queue, code, global_tree.dimensions, global_tree.nboxes,
        global_tree.nsources,
        global_tree.particle_id_dtype, global_tree.coord_dtype,
        global_tree.sources_have_extent,
        box_masks.point_src_boxes,
        global_tree_dev.sources,
        global_tree_dev.source_radii if global_tree.sources_have_extent else None,
        global_tree_dev.box_source_starts,
        global_tree_dev.box_source_counts_nonchild,
        global_tree_dev.box_source_counts_cumul)

    # Targets are only kept for the responsible boxes.
    local_targets_and_lists = construct_local_particles_and_lists(
        queue, code, global_tree.dimensions, global_tree.nboxes,
        global_tree.ntargets,
        global_tree.particle_id_dtype, global_tree.coord_dtype,
        global_tree.targets_have_extent,
        box_masks.responsible_boxes,
        global_tree_dev.targets,
        global_tree_dev.target_radii if global_tree.targets_have_extent else None,
        global_tree_dev.box_target_starts,
        global_tree_dev.box_target_counts_nonchild,
        global_tree_dev.box_target_counts_cumul)

    # {{{ compute the users of multipole expansions of each box on the root rank

    # receive buffer, only needed on the root: one mask row per rank
    multipole_src_boxes_all_ranks = None
    if mpi_rank == 0:
        multipole_src_boxes_all_ranks = np.empty(
            (mpi_size, global_tree.nboxes),
            dtype=box_masks.multipole_src_boxes.dtype)

    comm.Gather(
        box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0)

    box_to_user_rank_starts = None
    box_to_user_rank_lists = None

    if mpi_rank == 0:
        multipole_src_boxes_all_ranks = cl.array.to_device(
            queue, multipole_src_boxes_all_ranks)

        # transposed so that the compressed lists are indexed by box and
        # contain ranks
        (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \
            code.mask_compressor_kernel()(
                queue, multipole_src_boxes_all_ranks.transpose(),
                list_dtype=np.int32)

        cl.wait_for_events([evt])

        box_to_user_rank_starts = box_to_user_rank_starts.get()
        box_to_user_rank_lists = box_to_user_rank_lists.get()

        logger.debug("computing box_to_user: done")

    box_to_user_rank_starts = comm.bcast(box_to_user_rank_starts, root=0)
    box_to_user_rank_lists = comm.bcast(box_to_user_rank_lists, root=0)

    # }}}

    # {{{ Reconstruct the target box flags

    # Note: We do not change the source box flags despite the local tree may only
    # contain a subset of sources. This is because evaluating target potentials in
    # the responsible boxes of the current rank may depend on the multipole
    # expansions formed by sources in other ranks. Modifying the source box flags
    # could result in incomplete interaction lists.

    local_box_flags = global_tree_dev.box_flags.copy(queue=queue)
    code.modify_target_flags_kernel()(
        local_targets_and_lists.box_particle_counts_nonchild,
        local_targets_and_lists.box_particle_counts_cumul,
        local_box_flags)

    # }}}

    from pytools.obj_array import make_obj_array

    # bring the local particle coordinates back to the host
    local_sources = make_obj_array([
        local_sources_idim.get(queue=queue)
        for local_sources_idim in local_sources_and_lists.particles])

    local_targets = make_obj_array([
        local_target_idim.get(queue=queue)
        for local_target_idim in local_targets_and_lists.particles])

    # Box geometry and hierarchy are taken over from the global tree;
    # particles, particle lists and target flags are the local ones.
    local_tree = LocalTree(
        sources_are_targets=global_tree.sources_are_targets,
        sources_have_extent=global_tree.sources_have_extent,
        targets_have_extent=global_tree.targets_have_extent,

        particle_id_dtype=global_tree.particle_id_dtype,
        box_id_dtype=global_tree.box_id_dtype,
        coord_dtype=global_tree.coord_dtype,
        box_level_dtype=global_tree.box_level_dtype,

        root_extent=global_tree.root_extent,
        stick_out_factor=global_tree.stick_out_factor,
        extent_norm=global_tree.extent_norm,

        bounding_box=global_tree.bounding_box,
        level_start_box_nrs=global_tree.level_start_box_nrs,
        level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev,

        sources=local_sources,
        targets=local_targets,
        source_radii=(local_sources_and_lists.particle_radii.get(queue=queue)
            if global_tree.sources_have_extent else None),
        target_radii=(local_targets_and_lists.particle_radii.get(queue=queue)
            if global_tree.targets_have_extent else None),

        box_source_starts=(
            local_sources_and_lists.box_particle_starts.get(queue=queue)),
        box_source_counts_nonchild=(
            local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)),
        box_source_counts_cumul=(
            local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)),
        box_target_starts=(
            local_targets_and_lists.box_particle_starts.get(queue=queue)),
        box_target_counts_nonchild=(
            local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)),
        box_target_counts_cumul=(
            local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)),

        box_parent_ids=global_tree.box_parent_ids,
        box_child_ids=global_tree.box_child_ids,
        box_centers=global_tree.box_centers,
        box_levels=global_tree.box_levels,
        box_flags=local_box_flags.get(queue=queue),

        user_source_ids=None,
        sorted_target_ids=None,

        box_source_bounding_box_min=global_tree.box_source_bounding_box_min,
        box_source_bounding_box_max=global_tree.box_source_bounding_box_max,
        box_target_bounding_box_min=global_tree.box_target_bounding_box_min,
        box_target_bounding_box_max=global_tree.box_target_bounding_box_max,

        _is_pruned=global_tree._is_pruned,

        responsible_boxes_list=responsible_boxes_list,
        responsible_boxes_mask=box_masks.responsible_boxes.get(),
        ancestor_mask=box_masks.ancestor_boxes.get(),
        box_to_user_rank_starts=box_to_user_rank_starts,
        box_to_user_rank_lists=box_to_user_rank_lists
    )

    local_tree = local_tree.to_host_device_array(queue)
    # NOTE(review): the return value is discarded here, so this only has an
    # effect if with_queue acts in place on the arrays held by the tree --
    # confirm against to_host_device_array/with_queue before changing.
    local_tree.with_queue(None)

    logger.info("Generate local tree on rank %d in %f sec.",
                mpi_rank, time.time() - start_time)

    return (
        local_tree,
        local_sources_and_lists.particle_idx,
        local_targets_and_lists.particle_idx)
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner \
Copyright (C) 2018 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from dataclasses import dataclass
import numpy as np
from mako.template import Template
import pyopencl as cl
from pyopencl.tools import dtype_to_ctype
from pytools import memoize_method
def get_box_ids_dfs_order(tree):
    """Return the box ids of *tree* in depth-first order, starting at box 0.

    The traversal uses an explicit stack: the children of a box are pushed
    in child-index order and popped from the end, so siblings are visited
    in reverse child-index order.

    :arg tree: A :class:`boxtree.Tree` object in the host memory. See
        :meth:`boxtree.Tree.get` for getting a tree object in host memory.
    :return: A numpy array of length ``tree.nboxes`` and dtype
        ``tree.box_id_dtype`` holding the box ids in depth-first order.
    """
    # FIXME: optimize the performance with OpenCL
    nchildren = 2**tree.dimensions
    child_ids = tree.box_child_ids

    visit_order = np.empty((tree.nboxes,), dtype=tree.box_id_dtype)
    nvisited = 0

    pending = [0]
    while pending:
        current = pending.pop()

        visit_order[nvisited] = current
        nvisited += 1

        # a child id that is not positive means "no child in this slot"
        candidates = [child_ids[ichild][current] for ichild in range(nchildren)]
        pending.extend(child for child in candidates if child > 0)

    return visit_order
def partition_work(cost_per_box, traversal, comm):
    """This function assigns responsible boxes for each rank.

    If a rank is responsible for a box, it will calculate the multipole expansion
    of the box and evaluate target potentials in the box.

    The boxes are put into depth-first order and cut into ``comm.Get_size()``
    contiguous, non-empty segments of roughly equal cost; rank *i* receives
    the *i*-th segment.

    :arg cost_per_box: The expected running time of each box. This argument is only
        significant on the root rank.
    :arg traversal: The global traversal object containing all particles. This
        argument is significant on all ranks.
    :arg comm: MPI communicator.
    :return: A numpy array containing the responsible boxes of the current rank.
    :raises RuntimeError: if there are fewer boxes than ranks.
    """
    tree = traversal.tree
    mpi_rank = comm.Get_rank()
    mpi_size = comm.Get_size()

    if mpi_size > tree.nboxes:
        raise RuntimeError("Fail to partition work because the number of boxes is "
                           "less than the number of processes.")

    # transform tree from the level order to the morton dfs order
    # dfs_order[i] stores the level-order box index of dfs index i
    dfs_order = get_box_ids_dfs_order(tree)

    # partition all boxes in dfs order evenly according to workload on the root rank

    responsible_boxes_segments = None
    # contains: [start_index, end_index)
    responsible_boxes_current_rank = np.empty(2, dtype=tree.box_id_dtype)

    # FIXME: Right now, the responsible boxes assigned to all ranks are computed
    # centrally on the root rank to avoid inconsistency risks of floating point
    # operations. We could improve the efficiency by letting each rank compute the
    # costs of a subset of boxes, and use MPI_Scan to aggregate the results.
    if mpi_rank == 0:
        total_workload = np.sum(cost_per_box)

        # second axis: [start_index, end_index)
        responsible_boxes_segments = np.empty((mpi_size, 2), dtype=tree.box_id_dtype)
        segment_idx = 0
        start = 0
        workload_count = 0
        for box_idx_dfs_order in range(tree.nboxes):
            if segment_idx + 1 == mpi_size:
                # the last rank takes every remaining box
                responsible_boxes_segments[segment_idx, :] = [start, tree.nboxes]
                break

            box_idx = dfs_order[box_idx_dfs_order]
            workload_count += cost_per_box[box_idx]

            # Boxes and ranks still unassigned once the current segment is
            # closed after this box.
            nboxes_left = tree.nboxes - (box_idx_dfs_order + 1)
            nranks_left = mpi_size - (segment_idx + 1)

            # Close the segment when its share of the workload is reached.
            # Also close it as soon as the remaining boxes are only just
            # enough to give each remaining rank one box; otherwise the
            # boxes can run out before every row of the segment table has
            # been written, leaving uninitialized ranges for some ranks.
            if (workload_count > (segment_idx + 1) * total_workload / mpi_size
                    or nboxes_left <= nranks_left):
                # record "end of rank segment"
                responsible_boxes_segments[segment_idx, :] = (
                    [start, box_idx_dfs_order + 1])
                start = box_idx_dfs_order + 1
                segment_idx += 1

    # every rank receives its own [start_index, end_index) pair
    comm.Scatter(responsible_boxes_segments, responsible_boxes_current_rank, root=0)

    return dfs_order[
        responsible_boxes_current_rank[0]:responsible_boxes_current_rank[1]]
class GetBoxMasksCodeContainer:
    """Holds the OpenCL kernels used to compute the box masks of a rank."""

    def __init__(self, cl_context, box_id_dtype):
        self.cl_context = cl_context
        self.box_id_dtype = box_id_dtype

    @memoize_method
    def add_interaction_list_boxes_kernel(self):
        """Return an elementwise kernel over the entries of ``box_list``.

        For entry ``i`` whose box is set in ``responsible_boxes_mask``, every
        box listed in ``interaction_boxes_lists`` between
        ``interaction_boxes_starts[i]`` and ``interaction_boxes_starts[i + 1]``
        is set to 1 in ``src_boxes_mask``. Entries already set in
        ``src_boxes_mask`` are never cleared.
        """
        box_id_t = dtype_to_ctype(self.box_id_dtype)

        kernel_arguments = Template("""
__global ${box_id_t} *box_list,
__global char *responsible_boxes_mask,
__global ${box_id_t} *interaction_boxes_starts,
__global ${box_id_t} *interaction_boxes_lists,
__global char *src_boxes_mask
""", strict_undefined=True).render(
            box_id_t=box_id_t
        )

        kernel_operation = Template(r"""
typedef ${box_id_t} box_id_t;
box_id_t current_box = box_list[i];
if(responsible_boxes_mask[current_box]) {
for(box_id_t box_idx = interaction_boxes_starts[i];
box_idx < interaction_boxes_starts[i + 1];
++box_idx)
src_boxes_mask[interaction_boxes_lists[box_idx]] = 1;
}
""", strict_undefined=True).render(
            box_id_t=box_id_t
        )

        return cl.elementwise.ElementwiseKernel(
            self.cl_context, kernel_arguments, kernel_operation)

    @memoize_method
    def add_parent_boxes_kernel(self):
        """Return an elementwise kernel over boxes that sets the parent of
        every box flagged in ``current`` to 1 in ``parent``. Box 0 is
        skipped.
        """
        box_id_t = dtype_to_ctype(self.box_id_dtype)

        kernel_arguments = (
            "__global char *current, __global char *parent, "
            f"__global {box_id_t} *box_parent_ids")
        kernel_operation = "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1"

        return cl.elementwise.ElementwiseKernel(
            self.cl_context, kernel_arguments, kernel_operation)
def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
    """Query the ancestors of responsible boxes.

    The mask is grown one generation at a time: starting from the
    responsible boxes, the parents of the most recently added boxes are
    marked until a generation adds no box that was not marked before.

    :arg code: a :class:`GetBoxMasksCodeContainer`.
    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
        i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by
        *responsible_boxes_mask*.
    """
    tree = traversal.tree
    mask_shape = (tree.nboxes,)

    all_ancestors = cl.array.zeros(queue, mask_shape, dtype=np.int8)
    frontier = responsible_boxes_mask.copy()

    while frontier.any():
        parents_of_frontier = cl.array.zeros(queue, mask_shape, dtype=np.int8)
        code.add_parent_boxes_kernel()(
            frontier, parents_of_frontier, tree.box_parent_ids)

        # only boxes not yet known as ancestors form the next frontier
        frontier = parents_of_frontier & (~all_ancestors)
        all_ancestors = all_ancestors | frontier

    return all_ancestors
def get_point_src_boxes_mask(
        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
    """Query the boxes whose sources are needed in order to evaluate potentials
    of boxes represented by *responsible_boxes_mask*.

    The result starts out as a copy of *responsible_boxes_mask* and is
    extended by the source boxes found in the interaction lists below.

    :arg code: a :class:`GetBoxMasksCodeContainer`.
    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
        or an ancestor of the responsible boxes.
    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
        i-th entry is 1 if sources of box ``i`` are needed for evaluating the
        potentials of targets in boxes represented by *responsible_boxes_mask*.
    """
    src_boxes_mask = responsible_boxes_mask.copy()
    mark_sources = code.add_interaction_list_boxes_kernel()

    def add_list(box_list, active_mask, starts, lists):
        # mark the list entries of every active box in src_boxes_mask
        mark_sources(
            box_list, active_mask, starts, lists, src_boxes_mask, queue=queue)

    # Add list 1 of responsible boxes
    add_list(
        traversal.target_boxes, responsible_boxes_mask,
        traversal.neighbor_source_boxes_starts,
        traversal.neighbor_source_boxes_lists)

    # Add list 4 of responsible boxes or ancestor boxes
    responsible_or_ancestor = responsible_boxes_mask | ancestor_boxes_mask
    add_list(
        traversal.target_or_target_parent_boxes, responsible_or_ancestor,
        traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists)

    if not traversal.tree.targets_have_extent:
        return src_boxes_mask

    # Add list 3 close of responsible boxes
    if traversal.from_sep_close_smaller_starts is not None:
        add_list(
            traversal.target_boxes, responsible_boxes_mask,
            traversal.from_sep_close_smaller_starts,
            traversal.from_sep_close_smaller_lists)

    # Add list 4 close of responsible boxes
    if traversal.from_sep_close_bigger_starts is not None:
        add_list(
            traversal.target_boxes, responsible_or_ancestor,
            traversal.from_sep_close_bigger_starts,
            traversal.from_sep_close_bigger_lists)

    return src_boxes_mask
def get_multipole_src_boxes_mask(
        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
    """Query the boxes whose multipoles are used in order to evaluate
    potentials of targets in boxes represented by *responsible_boxes_mask*.

    :arg code: a :class:`GetBoxMasksCodeContainer`.
    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
        or an ancestor of the responsible boxes.
    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)``
        whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating
        the potentials of targets in boxes represented by *responsible_boxes_mask*.
    """
    tree = traversal.tree
    mark_sources = code.add_interaction_list_boxes_kernel()

    multipole_boxes_mask = cl.array.zeros(queue, (tree.nboxes,), dtype=np.int8)

    # A mpole is used by process p if it is in the List 2 of either a box
    # owned by p or one of its ancestors.
    owned_or_ancestor = responsible_boxes_mask | ancestor_boxes_mask
    mark_sources(
        traversal.target_or_target_parent_boxes,
        owned_or_ancestor,
        traversal.from_sep_siblings_starts,
        traversal.from_sep_siblings_lists,
        multipole_boxes_mask,
        queue=queue
    )
    multipole_boxes_mask.finish()

    # A mpole is used by process p if it is in the List 3 of a box owned by p.
    for ilevel in range(tree.nlevels):
        level_targets = traversal.target_boxes_sep_smaller_by_source_level[ilevel]
        level_list = traversal.from_sep_smaller_by_level[ilevel]

        mark_sources(
            level_targets,
            responsible_boxes_mask,
            level_list.starts,
            level_list.lists,
            multipole_boxes_mask,
            queue=queue
        )
        multipole_boxes_mask.finish()

    return multipole_boxes_mask
@dataclass
class BoxMasks:
    """
    Box masks needed for the distributed calculation. Each of these masks is a
    PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
    set.

    .. attribute:: responsible_boxes

        Current process will evaluate target potentials and multipole expansions in
        these boxes. Sources and targets in these boxes are needed.

    .. attribute:: ancestor_boxes

        Ancestors of the responsible boxes.

    .. attribute:: point_src_boxes

        Current process needs sources but not targets in these boxes.

    .. attribute:: multipole_src_boxes

        Current process needs multipole expansions in these boxes.
    """

    responsible_boxes: cl.array.Array
    ancestor_boxes: cl.array.Array
    point_src_boxes: cl.array.Array
    multipole_src_boxes: cl.array.Array
def get_box_masks(queue, traversal, responsible_boxes_list):
    """Compute all box masks a rank needs from the list of boxes it is
    responsible for.

    :arg queue: the command queue used for the device traversal and for
        all mask computations.
    :arg traversal: the traversal whose interaction lists are consulted; it
        is copied to the device of *queue*.
    :arg responsible_boxes_list: A numpy array of responsible box indices.

    :returns: A :class:`BoxMasks` object of the relevant masks.
    """
    code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype)

    # FIXME: It is wasteful to copy the whole traversal object into device memory
    # here because
    # 1) Not all fields are needed.
    # 2) For sumpy wrangler, a device traversal object is already available.
    traversal = traversal.to_device(queue)

    # build the responsible-box mask on the host, then move it to the device
    host_responsible = np.zeros((traversal.tree.nboxes,), dtype=np.int8)
    host_responsible[responsible_boxes_list] = 1
    responsible = cl.array.to_device(queue, host_responsible)

    ancestors = get_ancestor_boxes_mask(queue, code, traversal, responsible)
    point_sources = get_point_src_boxes_mask(
        queue, code, traversal, responsible, ancestors)
    multipole_sources = get_multipole_src_boxes_mask(
        queue, code, traversal, responsible, ancestors)

    return BoxMasks(
        responsible_boxes=responsible,
        ancestor_boxes=ancestors,
        point_src_boxes=point_sources,
        multipole_src_boxes=multipole_sources)
from __future__ import division
"""
.. autofunction:: drive_fmm
.. autoclass:: TreeIndependentDataForWrangler
.. autoclass:: ExpansionWranglerInterface
"""
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
......@@ -23,75 +28,391 @@ THE SOFTWARE.
"""
import logging
from abc import ABC, abstractmethod
logger = logging.getLogger(__name__)
from pytools import ProcessLogger
from boxtree.traversal import FMMTraversalInfo
from boxtree.tree import Tree
# {{{ expansion wrangler interface
class TreeIndependentDataForWrangler:
    """An object that can be used to store information for efficient
    wrangler execution that depends on the kernel but not the tree and/or
    the traversal.

    Examples of such data include generated code for carrying out
    translations.

    This base class defines no attributes or methods of its own.

    .. note::

        Instances of this type should not hold a reference to (and thereby be
        specific to) a :class:`boxtree.Tree` instance. Their purpose is to
        host caches for generated translation code that is reusable across
        trees. It is OK for these instances to be specific to a given kernel
        (or set of kernels).
    """
class ExpansionWranglerInterface(ABC):
"""Abstract expansion handling interface for use with :func:`drive_fmm`.
See this
`test code <https://github.com/inducer/boxtree/blob/main/test/test_fmm.py>`__
for a very simple sample implementation.
.. note::
Wranglers may hold a reference (and thereby be specific to) a
:class:`boxtree.Tree` instance.
:class:`TreeIndependentDataForWrangler` exists to hold data that
is more broadly reusable.
Functions that support returning timing data return a value supporting the
:class:`~boxtree.timing.TimingFuture` interface.
.. versionchanged:: 2018.1
Changed (a subset of) functions to return timing data.
.. attribute:: tree_indep
An instance of (a typically wrangler-dependent subclass of)
:class:`TreeIndependentDataForWrangler`.
.. attribute:: traversal
An instance of :class:`~boxtree.traversal.FMMTraversalInfo`.
.. autoattribute:: tree
.. rubric:: Particle ordering
.. automethod:: reorder_sources
.. automethod:: reorder_potentials
.. rubric:: Views into arrays of expansions
.. automethod:: multipole_expansions_view
.. automethod:: local_expansions_view
.. rubric:: Translations
.. automethod:: form_multipoles
.. automethod:: coarsen_multipoles
.. automethod:: eval_direct
.. automethod:: multipole_to_local
.. automethod:: eval_multipoles
.. automethod:: form_locals
.. automethod:: refine_locals
.. automethod:: eval_locals
.. automethod:: finalize_potentials
"""
def __init__(self, tree_indep: TreeIndependentDataForWrangler,
traversal: FMMTraversalInfo):
self.tree_indep = tree_indep
self.traversal = traversal
@property
def tree(self) -> Tree:
return self.traversal.tree
@abstractmethod
def reorder_sources(self, source_array):
"""Return a copy of *source_array* in
:ref:`tree source order <particle-orderings>`.
*source_array* is in user source order.
"""
@abstractmethod
def reorder_potentials(self, potentials):
"""Return a copy of *potentials* in
:ref:`user target order <particle-orderings>`.
*source_weights* is in tree target order.
"""
# {{{ views into arrays of expansions
# Included here for the benefit of the distributed-memory FMM
@abstractmethod
def multipole_expansions_view(self, mpole_exps, level):
pass
@abstractmethod
def local_expansions_view(self, local_exps, level):
pass
# }}}
# {{{ translations
@abstractmethod
def form_multipoles(self,
level_start_source_box_nrs, source_boxes,
src_weight_vecs):
"""Return an expansions array
containing multipole expansions in *source_boxes* due to sources
with *src_weight_vecs*.
All other expansions must be zero.
:return: A pair (*mpoles*, *timing_future*).
"""
@abstractmethod
def coarsen_multipoles(self,
level_start_source_parent_box_nrs,
source_parent_boxes, mpoles):
"""For each box in *source_parent_boxes*,
gather (and translate) the box's children's multipole expansions in
*mpole* and add the resulting expansion into the box's multipole
expansion in *mpole*.
:returns: A pair (*mpoles*, *timing_future*).
"""
@abstractmethod
def eval_direct(self,
target_boxes, neighbor_sources_starts,
neighbor_sources_lists, src_weight_vecs):
"""For each box in *target_boxes*, evaluate the influence of the
neighbor sources due to *src_weight_vecs*, which use :ref:`csr` and are
indexed like *target_boxes*.
:returns: A pair (*pot*, *timing_future*), where *pot* is a
a new potential array.
"""
@abstractmethod
def multipole_to_local(self,
        level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes,
        starts, lists, mpole_exps):
    """For each box in *target_or_target_parent_boxes*, translate and add
    the influence of the multipole expansion in *mpole_exps* into a new
    array of local expansions. *starts* and *lists* use :ref:`csr`, and
    *starts* is indexed like *target_or_target_parent_boxes*.

    :returns: A pair (*local_exps*, *timing_future*) where *local_exps* is
        a new (local) expansion array.
    """
@abstractmethod
def eval_multipoles(self,
        target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps):
    """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate
    the multipole expansion in *mpole_exps* in the nearby boxes given in
    *from_sep_smaller_by_level*, and return a new potential array.
    *starts* and *lists* in *from_sep_smaller_by_level[i]* use :ref:`csr`
    and *starts* is indexed like *target_boxes_by_source_level[i]*.

    :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
        array.
    """
@abstractmethod
def form_locals(self,
        level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes, starts, lists, src_weight_vecs):
    """For each box in *target_or_target_parent_boxes*, form local
    expansions due to the sources in the nearby boxes given in *starts* and
    *lists*, and return a new local expansion array. *starts* and *lists*
    use :ref:`csr` and *starts* is indexed like
    *target_or_target_parent_boxes*.

    :returns: A pair (*local_exps*, *timing_future*) where *local_exps* is
        a new local expansion array.
    """
@abstractmethod
def refine_locals(self,
        level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes, local_exps):
    """For each box in *target_or_target_parent_boxes*,
    translate the box's parent's local expansion in *local_exps* and add
    the resulting expansion into the box's local expansion in *local_exps*.

    :returns: A pair (*local_exps*, *timing_future*).
    """
@abstractmethod
def eval_locals(self,
        level_start_target_box_nrs, target_boxes, local_exps):
    """For each box in *target_boxes*, evaluate the local expansion in
    *local_exps* and return a new potential array.

    :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
        array.
    """
# }}}
@abstractmethod
def finalize_potentials(self, potentials, template_ary):
    """
    Postprocess the reordered potentials. This is where global scaling
    factors could be applied. This is distinct from :meth:`reorder_potentials`
    because some derived FMMs (notably the QBX FMM) do their own reordering.

    :arg potentials: the potentials, already reordered.
    :arg template_ary: If the array type used inside of the FMM
        is different from the array type used by the user (e.g.
        :class:`boxtree.pyfmmlib_integration.FMMLibExpansionWrangler`
        uses :class:`numpy.ndarray` internally), this array can be used
        to help convert the output back to the user's array
        type (typically :class:`pyopencl.array.Array`).
    """
def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
    """Hook used by the distributed implementation to hand the needed source
    weights from the root rank to every worker rank of the communicator.

    Must be called collectively by all ranks in the communicator. The
    implementation in this base class performs no communication and hands
    back *src_weight_vecs* unchanged.

    :arg src_weight_vecs: a sequence of :class:`numpy.ndarray`, each of
        length ``nsources``, holding the source weights on the root rank.
        *None* on worker ranks.
    :arg src_idx_all_ranks: a :class:`list` of length ``nranks`` (root rank
        included) whose i-th entry is a :class:`numpy.ndarray` of indices
        selecting the entries of *src_weight_vecs* to send from the root
        rank to rank *i*. Each entry can be generated by
        :func:`.generate_local_tree`. *None* on worker ranks.
    :return: The source weights received by the current rank (the root rank
        included).
    """
    local_src_weight_vecs = src_weight_vecs
    return local_src_weight_vecs
def gather_potential_results(self, potentials, tgt_idx_all_ranks):
    """Hook used by the distributed implementation to collect the computed
    potentials from all worker ranks of the communicator on the root rank.

    Must be called collectively by all ranks in the communicator. The
    implementation in this base class performs no communication and hands
    back *potentials* unchanged.

    :arg potentials: The potentials computed on each rank. Significant on
        all ranks, the root rank included.
    :arg tgt_idx_all_ranks: a :class:`list` of length ``nranks`` whose i-th
        entry is a :class:`numpy.ndarray` with the global potential indices
        of the potentials coming from rank *i*. Only significant on the
        root rank.
    :return: The gathered potentials on the root rank. *None* on worker
        ranks.
    """
    gathered_potentials = potentials
    return gathered_potentials
def communicate_mpoles(self, mpole_exps, return_stats=False):  # noqa: B027
    """Used by the distributed implementation for forming the complete multipole
    expansions from the partial multipole expansions.

    This function accepts partial multipole expansions in the argument
    *mpole_exps*, and modifies *mpole_exps* in place with the communicated and
    reduced multipole expansions.

    This function needs to be called collectively by all ranks in the
    communicator.

    The implementation in this base class does nothing.

    :returns: Statistics of the communication if *return_stats* is True. *None*
        otherwise.
    """
    pass
# }}}
def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
        timing_data=None,
        global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
    `source code <https://github.com/inducer/boxtree/blob/main/boxtree/fmm.py>`__
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *wrangler* to this routine.

    :arg wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`. For distributed implementation, this
        wrangler should be a subclass of
        :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`.
        The traversal to process is taken from ``wrangler.traversal``.
    :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
        Passed unmodified to *wrangler*. For distributed
        implementation, this argument is only significant on the root rank, but
        worker ranks still need to supply a dummy vector.
    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
        timing information for the stages of the algorithm (in the form of
        :class:`~boxtree.timing.TimingResult`), if such information is available.
    :arg global_src_idx_all_ranks: Only used in the distributed implementation. A
        :class:`list` of length ``nranks``, where the i-th entry is a
        :class:`numpy.ndarray` representing the global indices of sources in the
        local tree on rank *i*. Each entry can be returned from
        *generate_local_tree*. This argument is only significant on the root rank.
    :arg global_tgt_idx_all_ranks: Only used in the distributed implementation. A
        :class:`list` of length ``nranks``, where the i-th entry is a
        :class:`numpy.ndarray` representing the global indices of targets in the
        local tree on rank *i*. Each entry can be returned from
        *generate_local_tree*. This argument is only significant on the root rank.

    :return: the potentials computed by *wrangler*. For the distributed
        implementation, the potentials are gathered and returned on the root rank;
        this function returns *None* on the worker ranks.
    """

    traversal = wrangler.traversal

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    # ProcessLogger reports start and completion of the whole FMM.
    fmm_proc = ProcessLogger(logger, "fmm")

    from boxtree.timing import TimingRecorder
    recorder = TimingRecorder()

    logger.debug("reorder source weights")

    src_weight_vecs = [wrangler.reorder_sources(weight) for
        weight in src_weight_vecs]

    src_weight_vecs = wrangler.distribute_source_weights(
        src_weight_vecs, global_src_idx_all_ranks)

    # {{{ "Step 2.1:" Construct local multipoles

    logger.debug("construct local multipoles")

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weight_vecs)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

    logger.debug("propagate multipoles upward")

    # The wrangler receives the level starts and handles all levels in
    # one call.
    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # mpole_exps is called Phi in [1]

    # }}}

    # Modifies mpole_exps in place; the return value is not used here.
    wrangler.communicate_mpoles(mpole_exps)

    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

    logger.debug("direct evaluation from neighbor source boxes ('list 1')")

    potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weight_vecs)

    recorder.add("eval_direct", timing_future)

    # these potentials are called alpha in [1]

    # }}}

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

    logger.debug("translate separated siblings' ('list 2') mpoles to local")

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

    logger.debug("evaluate sep. smaller mpoles at particles ('list 3 far')")

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    potentials = potentials + mpole_result

    # these potentials are called beta in [1]

    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")

        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_smaller_starts,
                traversal.from_sep_close_smaller_lists,
                src_weight_vecs)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")

    logger.debug("form locals for separated bigger mpoles ('list 4 far')")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weight_vecs)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    if traversal.from_sep_close_bigger_starts is not None:
        logger.debug("evaluate separated close bigger interactions directly "
                "('list 4 close')")

        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_bigger_starts,
                traversal.from_sep_close_bigger_lists,
                src_weight_vecs)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

    logger.debug("propagate local_exps downward")

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ "Stage 8:" evaluate locals

    logger.debug("evaluate locals")

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    potentials = potentials + local_result

    # }}}

    potentials = wrangler.gather_potential_results(
        potentials, global_tgt_idx_all_ranks)

    logger.debug("reorder potentials")

    result = wrangler.reorder_potentials(potentials)

    # The first weight vector serves as the template for the output array type.
    result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0])

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())

    return result
# vim: filetype=pyopencl:fdm=marker
from __future__ import division
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from pytools import memoize_method, Record
import numpy as np
import pyopencl as cl
import pyopencl.array # noqa
from mako.template import Template
from boxtree.tools import AXIS_NAMES, DeviceDataRecord
import logging
logger = logging.getLogger(__name__)
__doc__ = """
Leaves -> overlapping balls
---------------------------
.. autoclass:: LeavesToBallsLookupBuilder
.. autoclass:: LeavesToBallsLookup
"""
# {{{ output
class LeavesToBallsLookup(DeviceDataRecord):
    """
    .. attribute:: tree

        The :class:`boxtree.Tree` instance used to build this lookup.

    .. attribute:: balls_near_box_starts

        Indices into :attr:`balls_near_box_lists`.
        ``balls_near_box_lists[balls_near_box_starts[ibox]:
        balls_near_box_starts[ibox+1]]``
        results in a list of balls that overlap leaf box *ibox*.

        .. note:: Only leaf boxes have non-empty entries in this table.
            Nonetheless, this list is indexed by the global box index.

    .. attribute:: balls_near_box_lists

    .. automethod:: get
    """
# }}}
# {{{ kernel templates
# Mako template for the OpenCL ``generate`` routine that is run once per ball
# (``ball_nr``). It walks the box tree from the top and, for every
# overlapping box that has no children, appends the ball number and the box
# id to the two output lists. It is rendered after
# TRAVERSAL_PREAMBLE_TEMPLATE and passed to ListOfListsBuilder in
# LeavesToBallsLookupBuilder.get_balls_to_leaves_kernel.
BALLS_TO_LEAVES_TEMPLATE = r"""//CL//
typedef ${dtype_to_ctype(ball_id_dtype)} ball_id_t;
void generate(LIST_ARG_DECL USER_ARG_DECL ball_id_t ball_nr)
{
coord_vec_t ball_center;
%for i in range(dimensions):
ball_center.${AXIS_NAMES[i]} = ball_${AXIS_NAMES[i]}[ball_nr];
%endfor
coord_t ball_radius = ball_radii[ball_nr];
// To find overlapping leaves, start at the top of the tree, descend
// into overlapping boxes.
${walk_init(0)}
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
dbg_printf((" walk box id: %d morton: %d child id: %d level: %d\n",
walk_box_id, walk_morton_nr, child_box_id, walk_level));
if (child_box_id)
{
bool is_overlapping;
${check_l_infty_ball_overlap(
"is_overlapping", "child_box_id", "ball_radius", "ball_center")}
if (is_overlapping)
{
if (!(box_flags[child_box_id] & BOX_HAS_CHILDREN))
{
APPEND_ball_numbers(ball_nr);
APPEND_overlapping_leaves(child_box_id);
}
else
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
continue;
}
}
}
${walk_advance()}
}
}
"""
# Record subclass that adds no fields or behavior of its own.
# NOTE(review): not referenced in the visible part of this file -- confirm
# whether it is still needed.
class _KernelInfo(Record):
    pass
class LeavesToBallsLookupBuilder(object):
    r"""Given a set of :math:`l^\infty` "balls", this class helps build a
    look-up table from leaf boxes to balls that overlap with each leaf box.

    .. automethod:: __call__
    """

    def __init__(self, context):
        """
        :arg context: the OpenCL context in which the lookup kernel and the
            key-value sorter are built.
        """
        self.context = context

        from pyopencl.algorithm import KeyValueSorter
        self.key_value_sorter = KeyValueSorter(context)

    @memoize_method
    def get_balls_to_leaves_kernel(self, dimensions, coord_dtype, box_id_dtype,
            ball_id_dtype, max_levels, stick_out_factor):
        """Build (and memoize per argument combination) the list-of-lists
        kernel that records, for every ball, the leaf boxes it overlaps.

        :returns: a :class:`pyopencl.algorithm.ListOfListsBuilder` producing
            the lists ``ball_numbers`` and ``overlapping_leaves``.
        """
        from pyopencl.tools import dtype_to_ctype
        from boxtree import box_flags_enum

        render_vars = dict(
                dimensions=dimensions,
                dtype_to_ctype=dtype_to_ctype,
                box_id_dtype=box_id_dtype,
                particle_id_dtype=None,
                ball_id_dtype=ball_id_dtype,
                coord_dtype=coord_dtype,
                vec_types=cl.array.vec.types,
                max_levels=max_levels,
                AXIS_NAMES=AXIS_NAMES,
                box_flags_enum=box_flags_enum,
                debug=False,
                stick_out_factor=stick_out_factor,
                )

        logger.info("start building leaves-to-balls lookup kernel")

        from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE

        src = Template(
                TRAVERSAL_PREAMBLE_TEMPLATE
                + BALLS_TO_LEAVES_TEMPLATE,
                strict_undefined=True).render(**render_vars)

        from pyopencl.tools import VectorArg, ScalarArg
        from pyopencl.algorithm import ListOfListsBuilder

        result = ListOfListsBuilder(self.context,
                [
                    ("ball_numbers", ball_id_dtype),
                    ("overlapping_leaves", box_id_dtype),
                    ],
                str(src),
                arg_decls=[
                    VectorArg(box_flags_enum.dtype, "box_flags"),
                    VectorArg(coord_dtype, "box_centers"),
                    VectorArg(box_id_dtype, "box_child_ids"),
                    VectorArg(np.uint8, "box_levels"),
                    ScalarArg(coord_dtype, "root_extent"),
                    ScalarArg(box_id_dtype, "aligned_nboxes"),
                    VectorArg(coord_dtype, "ball_radii"),
                    ] + [
                    VectorArg(coord_dtype, "ball_"+ax)
                    for ax in AXIS_NAMES[:dimensions]],
                name_prefix="circles_to_balls",
                count_sharing={
                    # /!\ This makes a promise that APPEND_ball_numbers will
                    # always occur *before* APPEND_overlapping_leaves.
                    "overlapping_leaves": "ball_numbers"
                    },
                complex_kernel=True)

        logger.info("done building leaves-to-balls lookup kernel")

        return result

    def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            does not match ``tree.coord_dtype``.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ball_id_dtype = tree.particle_id_dtype  # ?

        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        b2l_knl = self.get_balls_to_leaves_kernel(
                tree.dimensions, tree.coord_dtype,
                tree.box_id_dtype, ball_id_dtype,
                max_levels, tree.stick_out_factor)

        logger.info("leaves-to-balls lookup: prepare ball list")

        nballs = len(ball_radii)
        result, evt = b2l_knl(
                queue, nballs,
                tree.box_flags.data, tree.box_centers.data,
                tree.box_child_ids.data, tree.box_levels.data,
                tree.root_extent, tree.aligned_nboxes,
                ball_radii.data, *tuple(bc.data for bc in ball_centers),
                wait_for=wait_for)
        # The sort below must wait for the list-building kernel.
        wait_for = [evt]

        logger.info("leaves-to-balls lookup: key-value sort")

        balls_near_box_starts, balls_near_box_lists, evt \
                = self.key_value_sorter(
                        queue,
                        # keys
                        result["overlapping_leaves"].lists,
                        # values
                        result["ball_numbers"].lists,
                        tree.nboxes, starts_dtype=tree.box_id_dtype,
                        wait_for=wait_for)

        logger.info("leaves-to-balls lookup: built")

        return LeavesToBallsLookup(
                tree=tree,
                balls_near_box_starts=balls_near_box_starts,
                balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
# }}}
# vim: filetype=pyopencl:fdm=marker
from __future__ import division
"""
Integration between boxtree and pyfmmlib: integrates :mod:`boxtree` with
`pyfmmlib <https://pypi.org/project/pyfmmlib>`__.

.. autoclass:: FMMLibTreeIndependentDataForWrangler
.. autoclass:: FMMLibExpansionWrangler

Internal bits
^^^^^^^^^^^^^

.. autoclass:: FMMLibRotationDataInterface
.. autoclass:: FMMLibRotationData
.. autoclass:: FMMLibRotationDataNotSuppliedWarning
"""
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
......@@ -25,31 +36,482 @@ THE SOFTWARE.
"""
import logging
logger = logging.getLogger(__name__)
import enum
import numpy as np
from pytools import log_process, memoize_method
__doc__ = """Integrates :mod:`boxtree` with
`pyfmmlib <http://pypi.python.org/pypi/pyfmmlib>`_.
"""
from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
from boxtree.timing import return_timing_data
# {{{ rotation data interface
class FMMLibRotationDataInterface:
    """Abstract interface for the optional, additional data that lets the
    expansion wrangler precompute rotation matrices.

    Both methods raise :exc:`NotImplementedError` here and are meant to be
    overridden.

    .. automethod:: m2l_rotation_lists

    .. automethod:: m2l_rotation_angles
    """

    def m2l_rotation_lists(self):
        """Return a :mod:`numpy` array that maps the entries of List 2 to
        rotation classes.
        """
        raise NotImplementedError()

    def m2l_rotation_angles(self):
        """Return a :mod:`numpy` array that maps the List 2 rotation classes
        to rotation angles.
        """
        raise NotImplementedError()
class FMMLibRotationData(FMMLibRotationDataInterface):
    """Concrete :class:`FMMLibRotationDataInterface` that obtains its data
    from :class:`boxtree.rotation_classes.RotationClassesBuilder`.

    .. automethod:: __init__
    """

    def __init__(self, queue, trav):
        """
        :arg queue: the queue used to move the traversal and tree to the
            device, to run the builder and to fetch its results.
        :arg trav: the traversal; its ``tree`` attribute is kept as well.
        """
        self.trav = trav
        self.tree = trav.tree
        self.queue = queue

    @property
    @memoize_method
    def rotation_classes_builder(self):
        # Memoized builder tied to the context of this object's queue.
        from boxtree.rotation_classes import RotationClassesBuilder
        cl_context = self.queue.context
        return RotationClassesBuilder(cl_context)

    @memoize_method
    def build_rotation_classes_lists(self):
        # Run the builder on device copies of the traversal and tree and
        # keep only the first item of what it returns.
        device_trav = self.trav.to_device(self.queue)
        device_tree = self.tree.to_device(self.queue)
        builder_output = self.rotation_classes_builder(
                self.queue, device_trav, device_tree)
        return builder_output[0]

    @memoize_method
    def m2l_rotation_lists(self):
        rotation_info = self.build_rotation_classes_lists()
        device_classes = rotation_info.from_sep_siblings_rotation_classes
        return device_classes.get(self.queue)

    @memoize_method
    def m2l_rotation_angles(self):
        rotation_info = self.build_rotation_classes_lists()
        device_angles = rotation_info.from_sep_siblings_rotation_class_to_angle
        return device_angles.get(self.queue)
class FMMLibRotationDataNotSuppliedWarning(UserWarning):
    """Warning category with no behavior of its own.

    NOTE(review): going by the name, presumably issued when no rotation data
    was passed to the wrangler -- confirm at the place where it is raised.
    """
    pass
# }}}
@enum.unique
class Kernel(enum.Enum):
    """The kernels that :class:`FMMLibTreeIndependentDataForWrangler`
    accepts.
    """
    # Selects the pyfmmlib routines prefixed with "l".
    LAPLACE = enum.auto()
    # Selects the pyfmmlib routines prefixed with "h".
    HELMHOLTZ = enum.auto()
# {{{ tree-independent data for wrangler
class FMMLibTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
    """Tree-independent settings (dimension, kernel, gradient flag) together
    with getters that look up the matching :mod:`pyfmmlib` routines and wrap
    them into a uniform calling convention.

    .. automethod:: __init__
    """

    def __init__(self, dim, kernel, ifgrad=False):
        """
        :arg dim: the number of dimensions; substituted into the routine
            names and used to choose between the 2D and 3D wrappers.
        :arg kernel: a member of :class:`Kernel`.
        :arg ifgrad: whether gradients are requested from the evaluation
            routines. When false, the wrappers return ``0`` as the gradient.
        :raises ValueError: if *kernel* is not a known :class:`Kernel`.
        """
        self.dim = dim
        self.ifgrad = ifgrad
        self.kernel = kernel

        # First letter of every pyfmmlib routine name for this kernel.
        if kernel == Kernel.LAPLACE:
            self.eqn_letter = "l"
        elif kernel == Kernel.HELMHOLTZ:
            self.eqn_letter = "h"
        else:
            raise ValueError(kernel)

        self.dtype = np.complex128

    # {{{ routine getters

    def get_routine(self, name, suffix=""):
        """Return the :mod:`pyfmmlib` routine named by the kernel letter,
        *name* with the dimension substituted for its ``%d``, and *suffix*.
        """
        import pyfmmlib
        return getattr(pyfmmlib, f"{self.eqn_letter}{name % self.dim}{suffix}")

    def get_vec_routine(self, name):
        """Return the ``_vec`` variant of the routine *name*."""
        return self.get_routine(name, "_vec")

    def get_translation_routine(self, wrangler, name, vec_suffix="_vec"):
        """Return a wrapper around the translation routine *name*.

        The wrapper discards a ``level_for_projection`` keyword argument.
        Outside of two dimensions it uses the ``quadu`` variant, adds
        ``wrangler.projection_quad_extra_kwargs(order=nterms2)`` to the
        keyword arguments, and raises :exc:`RuntimeError` if the routine
        reports a nonzero ``ier``.
        """
        suffix = ""
        if self.dim == 3:
            suffix = "quadu"
        suffix += vec_suffix

        rout = self.get_routine(name, suffix)

        if self.dim == 2:
            def wrapper(*args, **kwargs):
                # not used
                kwargs.pop("level_for_projection", None)

                return rout(*args, **kwargs)
        else:
            def wrapper(*args, **kwargs):
                kwargs.pop("level_for_projection", None)
                nterms2 = kwargs["nterms2"]
                kwargs.update(wrangler.projection_quad_extra_kwargs(order=nterms2))

                val, ier = rout(*args, **kwargs)
                if (ier != 0).any():
                    raise RuntimeError(f"{name} failed with nonzero ier")

                return val

        # Doesn't work in Py2
        # from functools import update_wrapper
        # update_wrapper(wrapper, rout)
        return wrapper

    def get_direct_eval_routine(self, use_dipoles):
        """Return a wrapper around the direct evaluation routine that
        returns a pair ``(pot, grad)``.

        :arg use_dipoles: if true, the ``_dp`` variant of the routine is used.
        :raises ValueError: if the dimension is neither 2 nor 3.
        """
        if self.dim == 2:
            rout = self.get_vec_routine(
                    "potgrad%ddall" + ("_dp" if use_dipoles else ""))

            def wrapper(*args, **kwargs):
                kwargs["ifgrad"] = self.ifgrad
                kwargs["ifhess"] = False
                pot, grad, _hess = rout(*args, **kwargs)

                if not self.ifgrad:
                    grad = 0

                return pot, grad

            # Doesn't work in Py2
            # from functools import update_wrapper
            # update_wrapper(wrapper, rout)
            return wrapper

        elif self.dim == 3:
            rout = self.get_vec_routine(
                    "potfld%ddall" + ("_dp" if use_dipoles else ""))

            def wrapper(*args, **kwargs):
                kwargs["iffld"] = self.ifgrad
                pot, fld = rout(*args, **kwargs)
                # The 3D routine returns the field; the gradient is its
                # negative.
                grad = -fld if self.ifgrad else 0
                return pot, grad

            # Doesn't work in Py2
            # from functools import update_wrapper
            # update_wrapper(wrapper, rout)
            return wrapper
        else:
            raise ValueError("unsupported dimensionality")

    def get_expn_eval_routine(self, expn_kind):
        """Return a wrapper around the ``<dim>d<expn_kind>eval`` routine that
        returns a pair ``(pot, grad)``.

        In three dimensions the wrapper raises :exc:`RuntimeError` if the
        routine reports a nonzero ``ier``.

        :raises ValueError: if the dimension is neither 2 nor 3.
        """
        name = f"%dd{expn_kind}eval"
        rout = self.get_routine(name, "_vec")

        if self.dim == 2:
            def wrapper(*args, **kwargs):
                kwargs["ifgrad"] = self.ifgrad
                kwargs["ifhess"] = False

                pot, grad, _hess = rout(*args, **kwargs)
                if not self.ifgrad:
                    grad = 0

                return pot, grad

            # Doesn't work in Py2
            # from functools import update_wrapper
            # update_wrapper(wrapper, rout)
            return wrapper

        elif self.dim == 3:
            def wrapper(*args, **kwargs):
                kwargs["iffld"] = self.ifgrad
                pot, fld, ier = rout(*args, **kwargs)

                if (ier != 0).any():
                    raise RuntimeError(f"{name} failed with nonzero ier")

                # The 3D routine returns the field; the gradient is its
                # negative.
                grad = -fld if self.ifgrad else 0

                return pot, grad

            # Doesn't work in Py2
            # from functools import update_wrapper
            # update_wrapper(wrapper, rout)
            return wrapper
        else:
            raise ValueError("unsupported dimensionality")

    # }}}
# }}}
# {{{ wrangler
class FMMLibExpansionWrangler(ExpansionWranglerInterface):
"""Implements the :class:`boxtree.fmm.ExpansionWranglerInterface`
by using pyfmmlib.
Timing results returned by this wrangler contains the values *wall_elapsed*
and (optionally, if supported) *process_elapsed*, which measure wall time
and process time in seconds, respectively.
"""
# {{{ constructor

def __init__(self, tree_indep, traversal, *,
        helmholtz_k=None, fmm_level_to_order=None,
        dipole_vec=None, dipoles_already_reordered=False, order=None,
        optimized_m2l_precomputation_memory_cutoff_bytes=10**8,
        rotation_data=None):
    """
    :arg fmm_level_to_order: A callable that, upon being passed the tree
        and the tree level as an integer, returns the order for the multipole and
        local expansions on that level.
    :arg rotation_data: Either *None* or an instance of the
        :class:`FMMLibRotationDataInterface`. In three dimensions, passing
        *rotation_data* enables optimized M2L (List 2) translations.
        In two dimensions, this does nothing.
    :arg optimized_m2l_precomputation_memory_cutoff_bytes: When using
        optimized List 2 translations, an upper bound in bytes on the
        amount of storage to use for a precomputed rotation matrix.
    :arg order: Deprecated. A single expansion order used on every level;
        mutually exclusive with *fmm_level_to_order*.
    :arg dipole_vec: Either *None* or an array of shape
        ``(dim, tree.nsources)``. It is reordered via
        :meth:`reorder_sources` unless *dipoles_already_reordered* is set.

    :raises TypeError: if both *order* and *fmm_level_to_order* are given.
    :raises ValueError: if *helmholtz_k* is inconsistent with the kernel, the
        kernel is unknown, or the kernel and tree dimensions differ.
    """

    if order is not None and fmm_level_to_order is not None:
        raise TypeError("may specify either fmm_level_to_order or order, "
                "but not both")

    if order is not None:
        from warnings import warn
        warn("Passing order is deprecated. Pass fmm_level_to_order instead.",
                DeprecationWarning, stacklevel=2)

        def fmm_level_to_order(tree, level):  # pylint:disable=function-redefined
            return order

    super().__init__(tree_indep, traversal)

    if tree_indep.kernel == Kernel.LAPLACE:
        self.kernel_kwargs = {}
        self.rscale_factor = 1

        if helmholtz_k:
            raise ValueError(
                    "helmholtz_k must be zero or unspecified for Laplace")

        helmholtz_k = 0

    elif tree_indep.kernel == Kernel.HELMHOLTZ:
        self.kernel_kwargs = {"zk": helmholtz_k}

        if not helmholtz_k:
            raise ValueError(
                    "helmholtz_k must be specified and nonzero")

        self.rscale_factor = abs(helmholtz_k)

    else:
        raise ValueError(tree_indep.kernel)

    self.helmholtz_k = helmholtz_k

    tree = traversal.tree

    if tree_indep.dim != tree.dimensions:
        raise ValueError(f"Kernel dim ({tree_indep.dim}) "
                f"does not match tree dim ({tree.dimensions})")

    # One expansion order per tree level.
    self.level_orders = np.array([
        fmm_level_to_order(tree, lev) for lev in range(tree.nlevels)
        ], dtype=np.int32)

    if tree_indep.kernel == Kernel.HELMHOLTZ:
        logger.info("expansion orders by level used in Helmholtz FMM: %s",
                self.level_orders)

    self.rotation_data = rotation_data
    self.rotmat_cutoff_bytes = optimized_m2l_precomputation_memory_cutoff_bytes

    if self.dim == 3:
        if rotation_data is None:
            from warnings import warn
            warn(
                    "List 2 (multipole-to-local) translations will be "
                    "unoptimized. Supply a rotation_data argument to "
                    "FMMLibExpansionWrangler for optimized List 2.",
                    FMMLibRotationDataNotSuppliedWarning,
                    stacklevel=2)

        self.supports_optimized_m2l = rotation_data is not None
    else:
        self.supports_optimized_m2l = False

    # FIXME: dipole_vec shouldn't be stored here! Otherwise, we'll recompute
    # bunches of tree-dependent stuff for every new dipole vector.

    # It's not super bad because the dipole vectors are typically geometry
    # normals and thus change about at the same time as the tree... but there's
    # still no reason for them to be here.
    self.use_dipoles = dipole_vec is not None
    if self.use_dipoles:
        assert dipole_vec.shape == (self.dim, self.tree.nsources)

        if not dipoles_already_reordered:
            dipole_vec = self.reorder_sources(dipole_vec)

        self.dipole_vec = dipole_vec.copy(order="F")
    else:
        self.dipole_vec = None
# }}}
@property
def dim(self):
    """Number of spatial dimensions, taken from ``self.tree.dimensions``."""
    tree = self.tree
    return tree.dimensions
def level_to_rscale(self, level):
    """Return the expansion scaling parameter used on tree level *level*.

    The value is ``root_extent * 2**-level * rscale_factor``, replaced by 1
    when its magnitude exceeds 1, and inverted for the 3D Laplace case.
    """
    scaled_extent = self.tree.root_extent * 2 ** -level * self.rscale_factor
    rscale = 1 if abs(scaled_extent) > 1 else scaled_extent

    if self.dim == 3 and self.tree_indep.eqn_letter == "l":
        # Laplace 3D uses the opposite convention compared to
        # all other cases.
        # https://gitlab.tiker.net/inducer/boxtree/merge_requests/81
        return 1 / rscale

    return rscale
@memoize_method
def projection_quad_extra_kwargs(self, level=None, order=None):
    """Return extra keyword arguments carrying Legendre quadrature data.

    Exactly one of *level* or *order* must be given; a *level* is converted
    to an order through ``self.level_orders``. The result is empty except
    for the 3D Helmholtz case, where it holds ``xnodes`` and ``wts``.

    :raises TypeError: if neither or both of *level* and *order* are passed.
    """
    if (level is None) == (order is None):
        raise TypeError("must pass exactly one of level or order")

    if level is not None:
        order = self.level_orders[level]

    if not (self.dim == 3 and self.tree_indep.eqn_letter == "h"):
        return {}

    nquad = max(6, int(2.5*order))
    from pyfmmlib import legewhts
    xnodes, weights = legewhts(nquad, ifwhts=1)

    return {
            "xnodes": xnodes,
            "wts": weights,
            }
# {{{ overridable target lists for the benefit of the QBX FMM
def box_target_starts(self):
    """Return the tree's ``box_target_starts`` array."""
    tree = self.tree
    return tree.box_target_starts
def box_target_counts_nonchild(self):
    """Return the tree's ``box_target_counts_nonchild`` array."""
    tree = self.tree
    return tree.box_target_counts_nonchild
def targets(self):
    """Return the tree's ``targets`` attribute."""
    tree = self.tree
    return tree.targets
# }}}
# {{{ level starts
def _expansions_level_starts(self, order_to_size):
    """Return cumulative start offsets of each level's expansion storage.

    *order_to_size* maps an expansion order to the number of entries in one
    expansion. The returned list has ``nlevels + 1`` entries, starting at 0.
    """
    tree = self.tree
    starts = [0]
    running_total = 0

    for lev in range(tree.nlevels):
        nboxes_on_level = (
                tree.level_start_box_nrs[lev+1]
                - tree.level_start_box_nrs[lev])
        size_per_box = order_to_size(self.level_orders[lev])

        running_total = running_total + size_per_box * nboxes_on_level
        starts.append(running_total)

    return starts
@memoize_method
def multipole_expansions_level_starts(self):
    """Return per-level start offsets into the multipole expansions array."""
    from pytools import product

    def entries_per_expansion(order):
        return product(self.expansion_shape(order))

    return self._expansions_level_starts(entries_per_expansion)
@memoize_method
def local_expansions_level_starts(self):
    """Return per-level start offsets into the local expansions array."""
    from pytools import product

    def entries_per_expansion(order):
        return product(self.expansion_shape(order))

    return self._expansions_level_starts(entries_per_expansion)
# }}}
# {{{ views into arrays of expansions
def multipole_expansions_view(self, mpole_exps, level):
    """Return ``(box_start, view)`` for the multipole expansions on *level*.

    *view* is the slice of *mpole_exps* belonging to *level*, reshaped to
    ``(nboxes_on_level, *expansion_shape)``; *box_start* is the number of
    the first box on that level.
    """
    first_box, end_box = self.tree.level_start_box_nrs[level:level+2]
    first_entry, end_entry = \
            self.multipole_expansions_level_starts()[level:level+2]

    per_box_shape = self.expansion_shape(self.level_orders[level])
    level_view = mpole_exps[first_entry:end_entry].reshape(
            end_box-first_box, *per_box_shape)

    return (first_box, level_view)
def local_expansions_view(self, local_exps, level):
    """Return ``(box_start, view)`` for the local expansions on *level*.

    *view* is the slice of *local_exps* belonging to *level*, reshaped to
    ``(nboxes_on_level, *expansion_shape)``; *box_start* is the number of
    the first box on that level.
    """
    first_box, end_box = self.tree.level_start_box_nrs[level:level+2]
    first_entry, end_entry = \
            self.local_expansions_level_starts()[level:level+2]

    per_box_shape = self.expansion_shape(self.level_orders[level])
    level_view = local_exps[first_entry:end_entry].reshape(
            end_box-first_box, *per_box_shape)

    return (first_box, level_view)
# }}}
def get_source_kwargs(self, src_weights, pslice):
    """Return the source-strength keyword arguments for sources in *pslice*.

    Without dipoles this is ``{"charge": ...}``. With dipoles, the 2D Laplace
    case folds the dipole direction into a complex ``dipstr``; every other
    case passes ``dipstr`` and ``dipvec`` separately.
    """
    if self.dipole_vec is None:
        return {"charge": src_weights[pslice]}

    if self.tree_indep.eqn_letter == "l" and self.dim == 2:
        direction = (
                self.dipole_vec[0, pslice]
                + 1j * self.dipole_vec[1, pslice])
        return {"dipstr": -src_weights[pslice] * direction}

    return {
            "dipstr": src_weights[pslice],
            "dipvec": self.dipole_vec[:, pslice],
            }
# {{{ source/target particle wrangling
def _get_source_slice(self, ibox):
    """Return the slice of source indices owned directly by box *ibox*.

    The slice starts at ``tree.box_source_starts[ibox]`` and spans
    ``tree.box_source_counts_nonchild[ibox]`` entries.
    """
    pstart = self.tree.box_source_starts[ibox]
    return slice(
            pstart, pstart + self.tree.box_source_counts_nonchild[ibox])
def _get_target_slice(self, ibox):
    """Return the slice of target indices owned directly by box *ibox*.

    Goes through the overridable :meth:`box_target_starts` and
    :meth:`box_target_counts_nonchild` accessors rather than reading the
    tree directly.
    """
    pstart = self.box_target_starts()[ibox]
    return slice(
            pstart, pstart + self.box_target_counts_nonchild()[ibox])
@memoize_method
def _get_single_sources_array(self):
    """Return all source coordinates as one Fortran-ordered array.

    Row ``i`` of the result is ``tree.sources[i]``.
    """
    coords_by_axis = [self.tree.sources[iaxis] for iaxis in range(self.dim)]
    return np.array(coords_by_axis, order="F")
def _get_sources(self, pslice):
    """Return the columns of the combined sources array selected by *pslice*."""
    # FIXME yuck!
    all_sources = self._get_single_sources_array()
    return all_sources[:, pslice]
@memoize_method
def _get_single_targets_array(self):
    """Return all target coordinates as one Fortran-ordered array.

    Row ``i`` of the result is ``self.targets()[i]``, so subclasses that
    override :meth:`targets` are honored.
    """
    targets = self.targets()
    return np.array([
        targets[idim]
        for idim in range(self.dim)
        ], order="F")
def _get_targets(self, pslice):
    """Return the columns of the combined targets array selected by *pslice*."""
    # FIXME yuck!
    all_targets = self._get_single_targets_array()
    return all_targets[:, pslice]
@memoize_method
def _get_single_box_centers_array(self):
    """Return all box centers as one Fortran-ordered array.

    Row ``i`` of the result is ``tree.box_centers[i]``.
    """
    box_centers = self.tree.box_centers
    return np.array([
        box_centers[idim]
        for idim in range(self.dim)
        ], order="F")
# }}}
# {{{ precompute rotation matrices for optimized m2l
@memoize_method
def m2l_rotation_matrices(self):
    """Precompute rotation matrices for optimized multipole-to-local.

    Returns a tuple ``(rotmatf, rotmatb, rotmat_order)``, consisting of the
    forward rotation matrices, backward rotation matrices, and the
    translation order of the matrices. ``rotmat_order`` is -1 (and both
    matrices are *None*) if optimized M2L is not supported, there are no
    rotation angles, or no level order fits within
    ``self.rotmat_cutoff_bytes``.
    """
    rotmatf = None
    rotmatb = None
    rotmat_order = -1

    if not self.supports_optimized_m2l:
        return (rotmatf, rotmatb, rotmat_order)

    m2l_rotation_angles = self.rotation_data.m2l_rotation_angles()

    if len(m2l_rotation_angles) == 0:
        # The pyfmmlib wrapper may or may not complain if you give it a
        # zero-length array.
        return (rotmatf, rotmatb, rotmat_order)

    def mem_estimate(order):
        # Rotation matrix memory cost estimate.
        #
        # level_orders holds np.int32 values. Convert to a Python int so
        # that the product cannot wrap around in fixed-width arithmetic.
        order = int(order)
        return (8
                * (order + 1)**2
                * (2*order + 1)
                * len(m2l_rotation_angles))

    # Find the largest order we can use. Because the memory cost of the
    # matrices could be large, only precompute them if the cost estimate
    # for the order does not exceed the cutoff.
    for order in sorted(self.level_orders, reverse=True):
        if mem_estimate(order) < self.rotmat_cutoff_bytes:
            rotmat_order = order
            break

    if rotmat_order == -1:
        return (rotmatf, rotmatb, rotmat_order)

    # Compute the rotation matrices.
    from pyfmmlib import rotviarecur3p_init_vec as rotmat_builder

    ier, rotmatf = (
            rotmat_builder(rotmat_order, m2l_rotation_angles))
    assert (ier == 0).all()

    ier, rotmatb = (
            rotmat_builder(rotmat_order, -m2l_rotation_angles))
    assert (ier == 0).all()

    return (rotmatf, rotmatb, rotmat_order)
# }}}
# {{{ data vector utilities
def expansion_shape(self, order):
    """Return the shape tuple of a single expansion of the given *order*.

    :raises ValueError: for unsupported dimension/kernel combinations.
    """
    if self.dim == 3:
        # This is the transpose of the Fortran format, to
        # minimize mismatch between C and Fortran orders.
        return (2*order+1, order+1,)

    if self.dim == 2:
        eqn_letter = self.tree_indep.eqn_letter
        if eqn_letter == "l":
            return (order+1,)
        if eqn_letter == "h":
            return (2*order+1,)

    raise ValueError("unsupported dimensionality")
def multipole_expansion_zeros(self):
    """Return a zero-filled expansions array (which must support addition)
    capable of holding one multipole expansion for every box in the tree.

    Its length is the last entry of
    :meth:`multipole_expansions_level_starts`.
    """
    total_entries = self.multipole_expansions_level_starts()[-1]
    return np.zeros(total_entries, dtype=self.tree_indep.dtype)
def local_expansion_zeros(self):
    """Return a zero-filled expansions array (which must support addition)
    capable of holding one local expansion for every box in the tree.

    Its length is the last entry of :meth:`local_expansions_level_starts`.
    """
    total_entries = self.local_expansions_level_starts()[-1]
    return np.zeros(total_entries, dtype=self.tree_indep.dtype)
def output_zeros(self):
    """Return a potentials array (which must support addition) capable of
    holding a potential value for each target in the tree. Note that
    :func:`drive_fmm` makes no assumptions about *potential* other than
    that it supports addition--it may consist of potentials, gradients of
    the potential, or arbitrary other per-target output data.

    When gradients are requested, the result is an object array with
    ``1 + dim`` per-target arrays.
    """
    if not self.tree_indep.ifgrad:
        return np.zeros(self.tree.ntargets, self.tree_indep.dtype)

    from pytools.obj_array import make_obj_array
    return make_obj_array([
            np.zeros(self.tree.ntargets, self.tree_indep.dtype)
            for _ in range(1 + self.dim)])
def add_potgrad_onto_output(self, output, output_slice, pot, grad):
    """Accumulate *pot* (and *grad*, if gradients are enabled) into *output*.

    Without gradients, *pot* is added to ``output[output_slice]``. With
    gradients, *pot* goes to row 0 and *grad* to the remaining rows.
    """
    if not self.tree_indep.ifgrad:
        output[output_slice] += pot
        return

    output[0, output_slice] += pot
    output[1:, output_slice] += grad
# }}}
@log_process(logger)
def reorder_sources(self, source_array):
    """Return *source_array* permuted from user order into tree order.

    The permutation ``tree.user_source_ids`` is applied along the last
    axis, so both per-source vectors and ``(dim, nsources)`` arrays (such
    as dipole vectors) are handled.
    """
    return source_array[..., self.tree.user_source_ids]
@log_process(logger)
def reorder_potentials(self, potentials):
    """Return *potentials* indexed by ``tree.sorted_target_ids``."""
    sorted_target_ids = self.tree.sorted_target_ids
    return potentials[sorted_target_ids]
@log_process(logger)
@return_timing_data
def form_multipoles(self, level_start_source_box_nrs, source_boxes,
        src_weight_vecs):
    """Form multipole expansions for *source_boxes*, level by level.

    :arg level_start_source_box_nrs: per-level start indices into
        *source_boxes*.
    :arg src_weight_vecs: a sequence containing exactly one source weight
        vector.
    :returns: the array of multipole expansions.
    :raises RuntimeError: if the formmp routine reports a nonzero ``ier``.
    """
    src_weights, = src_weight_vecs
    formmp = self.tree_indep.get_routine(
            "%ddformmp" + ("_dp" if self.use_dipoles else ""))

    mpoles = self.multipole_expansion_zeros()
    for lev in range(self.tree.nlevels):
        start, stop = level_start_source_box_nrs[lev:lev+2]
        if start == stop:
            continue

        level_start_ibox, mpoles_view = self.multipole_expansions_view(
                mpoles, lev)

        rscale = self.level_to_rscale(lev)

        for src_ibox in source_boxes[start:stop]:
            pslice = self._get_source_slice(src_ibox)

            # Boxes without sources of their own contribute nothing.
            if pslice.stop - pslice.start == 0:
                continue

            kwargs = {}
            kwargs.update(self.kernel_kwargs)
            kwargs.update(self.get_source_kwargs(src_weights, pslice))

            ier, mpole = formmp(
                    rscale=rscale,
                    source=self._get_sources(pslice),
                    center=self.tree.box_centers[:, src_ibox],
                    nterms=self.level_orders[lev],
                    **kwargs)

            if ier:
                raise RuntimeError("formmp failed")

            mpoles_view[src_ibox-level_start_ibox] = mpole.T

    return mpoles
@log_process(logger)
@return_timing_data
def coarsen_multipoles(self, level_start_source_parent_box_nrs,
        source_parent_boxes, mpoles):
    """Translate child multipole expansions upward into their parents.

    :arg level_start_source_parent_box_nrs: per-level start indices into
        *source_parent_boxes*.
    :returns: *mpoles*, updated in place.
    """
    tree = self.tree

    mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp")

    # nlevels-1 is the last valid level index
    # nlevels-2 is the last valid level that could have children
    #
    # 3 is the last relevant source_level.
    # 2 is the last relevant target_level.
    # (because no level 1 box will be well-separated from another)
    for source_level in range(tree.nlevels-1, 2, -1):
        target_level = source_level - 1
        start, stop = level_start_source_parent_box_nrs[
                target_level:target_level+2]

        source_level_start_ibox, source_mpoles_view = \
                self.multipole_expansions_view(mpoles, source_level)
        target_level_start_ibox, target_mpoles_view = \
                self.multipole_expansions_view(mpoles, target_level)

        source_rscale = self.level_to_rscale(source_level)
        target_rscale = self.level_to_rscale(target_level)

        for ibox in source_parent_boxes[start:stop]:
            parent_center = tree.box_centers[:, ibox]
            for child in tree.box_child_ids[:, ibox]:
                # A child id of zero is skipped.
                if child:
                    child_center = tree.box_centers[:, child]

                    kwargs = {}
                    if self.dim == 3 and self.tree_indep.eqn_letter == "h":
                        kwargs["radius"] = tree.root_extent * 2**(-target_level)

                    kwargs.update(self.kernel_kwargs)

                    new_mp = mpmp(
                            rscale1=source_rscale,
                            center1=child_center,
                            expn1=source_mpoles_view[
                                child - source_level_start_ibox].T,

                            rscale2=target_rscale,
                            center2=parent_center,
                            nterms2=self.level_orders[target_level],

                            **kwargs)

                    target_mpoles_view[
                            ibox - target_level_start_ibox] += new_mp[..., 0].T

    return mpoles
@log_process(logger)
@return_timing_data
def eval_direct(self, target_boxes, neighbor_sources_starts,
        neighbor_sources_lists, src_weight_vecs):
    """Evaluate direct source-to-target interactions for *target_boxes*.

    *neighbor_sources_starts* / *neighbor_sources_lists* form a CSR-style
    list giving, for each target box, the source boxes to evaluate.

    :arg src_weight_vecs: a sequence containing exactly one source weight
        vector.
    :returns: the output array produced by :meth:`output_zeros` with the
        contributions added.
    """
    src_weights, = src_weight_vecs
    output = self.output_zeros()

    ev = self.tree_indep.get_direct_eval_routine(self.use_dipoles)

    for itgt_box, tgt_ibox in enumerate(target_boxes):
        tgt_pslice = self._get_target_slice(tgt_ibox)

        if tgt_pslice.stop - tgt_pslice.start == 0:
            continue

        # tgt_result = np.zeros(
        #         tgt_pslice.stop - tgt_pslice.start, self.tree_indep.dtype)
        tgt_pot_result = 0
        tgt_grad_result = 0

        start, end = neighbor_sources_starts[itgt_box:itgt_box+2]
        for src_ibox in neighbor_sources_lists[start:end]:
            src_pslice = self._get_source_slice(src_ibox)

            if src_pslice.stop - src_pslice.start == 0:
                continue

            kwargs = {}
            kwargs.update(self.kernel_kwargs)
            kwargs.update(self.get_source_kwargs(src_weights, src_pslice))

            tmp_pot, tmp_grad = ev(
                    sources=self._get_sources(src_pslice),
                    targets=self._get_targets(tgt_pslice),
                    **kwargs)

            tgt_pot_result += tmp_pot
            tgt_grad_result += tmp_grad

        self.add_potgrad_onto_output(
                output, tgt_pslice, tgt_pot_result, tgt_grad_result)

    return output
@log_process(logger)
@return_timing_data
def multipole_to_local(self,
        level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes,
        starts, lists, mpole_exps):
    """Translate multipole expansions into local expansions (List 2).

    *starts* / *lists* form a CSR-style list giving, for each entry of
    *target_or_target_parent_boxes*, the source boxes whose multipole
    expansions are translated. Each level is processed with one vectorized
    call, using the precomputed rotation matrices when the level's order
    does not exceed their order.

    :returns: a newly created array of local expansions.
    """
    tree = self.tree
    local_exps = self.local_expansion_zeros()

    # Precomputed rotation matrices (matrices of larger order can be used
    # for translations of smaller order)
    rotmatf, rotmatb, rotmat_order = self.m2l_rotation_matrices()

    for lev in range(self.tree.nlevels):
        lstart, lstop = level_start_target_or_target_parent_box_nrs[lev:lev+2]
        if lstart == lstop:
            continue

        starts_on_lvl = starts[lstart:lstop+1]

        mploc = self.tree_indep.get_translation_routine(
                self, "%ddmploc", vec_suffix="_imany")

        kwargs = {}

        # {{{ set up optimized m2l, if applicable

        if self.level_orders[lev] <= rotmat_order:
            m2l_rotation_lists = self.rotation_data.m2l_rotation_lists()
            assert len(m2l_rotation_lists) == len(lists)

            mploc = self.tree_indep.get_translation_routine(
                    self, "%ddmploc", vec_suffix="2_trunc_imany")

            kwargs["ldm"] = rotmat_order
            kwargs["nterms"] = self.level_orders[lev]
            kwargs["nterms1"] = self.level_orders[lev]

            kwargs["rotmatf"] = rotmatf
            kwargs["rotmatf_offsets"] = m2l_rotation_lists
            kwargs["rotmatf_starts"] = starts_on_lvl

            kwargs["rotmatb"] = rotmatb
            kwargs["rotmatb_offsets"] = m2l_rotation_lists
            kwargs["rotmatb_starts"] = starts_on_lvl

        # }}}

        source_level_start_ibox, source_mpoles_view = \
                self.multipole_expansions_view(mpole_exps, lev)
        target_level_start_ibox, target_local_exps_view = \
                self.local_expansions_view(local_exps, lev)

        ntgt_boxes = lstop-lstart
        itgt_box_vec = np.arange(ntgt_boxes)
        tgt_ibox_vec = target_or_target_parent_boxes[lstart:lstop]

        nsrc_boxes_per_tgt_box = (
                starts[lstart + itgt_box_vec+1] - starts[lstart + itgt_box_vec])

        nsrc_boxes = np.sum(nsrc_boxes_per_tgt_box)

        src_boxes_starts = np.empty(ntgt_boxes+1, dtype=np.int32)
        src_boxes_starts[0] = 0
        src_boxes_starts[1:] = np.cumsum(nsrc_boxes_per_tgt_box)

        rscale = self.level_to_rscale(lev)

        rscale1 = np.ones(nsrc_boxes) * rscale
        rscale1_offsets = np.arange(nsrc_boxes)

        if self.dim == 3 and self.tree_indep.eqn_letter == "h":
            kwargs["radius"] = (
                    tree.root_extent * 2**(-lev)
                    * np.ones(ntgt_boxes))

        rscale2 = np.ones(ntgt_boxes, np.float64) * rscale

        # These get max'd/added onto: pass initialized versions.
        if self.dim == 3:
            ier = np.zeros(ntgt_boxes, dtype=np.int32)
            kwargs["ier"] = ier

        expn2 = np.zeros(
                (ntgt_boxes, *self.expansion_shape(self.level_orders[lev])),
                dtype=self.tree_indep.dtype)

        kwargs.update(self.kernel_kwargs)

        expn2 = mploc(
                rscale1=rscale1,
                rscale1_offsets=rscale1_offsets,
                rscale1_starts=src_boxes_starts,

                center1=tree.box_centers,
                center1_offsets=lists,
                center1_starts=starts_on_lvl,

                expn1=source_mpoles_view.T,
                expn1_offsets=lists - source_level_start_ibox,
                expn1_starts=starts_on_lvl,

                rscale2=rscale2,
                # FIXME: wrong layout, will copy
                center2=tree.box_centers[:, tgt_ibox_vec],
                expn2=expn2.T,

                nterms2=self.level_orders[lev],

                **kwargs).T

        target_local_exps_view[tgt_ibox_vec - target_level_start_ibox] += expn2

    return local_exps
@log_process(logger)
@return_timing_data
def eval_multipoles(self,
        target_boxes_by_source_level, sep_smaller_nonsiblings_by_level,
        mpole_exps):
    """Evaluate multipole expansions at the targets of the given boxes.

    :arg target_boxes_by_source_level: for each source level, the target
        boxes to process.
    :arg sep_smaller_nonsiblings_by_level: for each source level, an object
        with ``starts`` and ``lists`` attributes giving, per target box, the
        source boxes whose expansions are evaluated.
    :returns: the output array produced by :meth:`output_zeros` with the
        contributions added.
    """
    output = self.output_zeros()

    mpeval = self.tree_indep.get_expn_eval_routine("mp")

    for isrc_level, ssn in enumerate(sep_smaller_nonsiblings_by_level):
        source_level_start_ibox, source_mpoles_view = \
                self.multipole_expansions_view(mpole_exps, isrc_level)

        rscale = self.level_to_rscale(isrc_level)

        for itgt_box, tgt_ibox in \
                enumerate(target_boxes_by_source_level[isrc_level]):
            tgt_pslice = self._get_target_slice(tgt_ibox)

            if tgt_pslice.stop - tgt_pslice.start == 0:
                continue

            tgt_pot = 0
            tgt_grad = 0
            start, end = ssn.starts[itgt_box:itgt_box+2]
            for src_ibox in ssn.lists[start:end]:
                tmp_pot, tmp_grad = mpeval(
                        rscale=rscale,
                        center=self.tree.box_centers[:, src_ibox],
                        expn=source_mpoles_view[
                            src_ibox - source_level_start_ibox].T,
                        ztarg=self._get_targets(tgt_pslice),
                        **self.kernel_kwargs)

                tgt_pot = tgt_pot + tmp_pot
                tgt_grad = tgt_grad + tmp_grad

            self.add_potgrad_onto_output(
                    output, tgt_pslice, tgt_pot, tgt_grad)

    return output
@log_process(logger)
@return_timing_data
def form_locals(self,
        level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes, starts, lists, src_weight_vecs):
    """Form local expansions directly from sources, one call per level.

    *starts* / *lists* form a CSR-style list giving, for each entry of
    *target_or_target_parent_boxes*, the source boxes contributing to its
    local expansion.

    :arg src_weight_vecs: a sequence containing exactly one source weight
        vector.
    :returns: a newly created array of local expansions.
    :raises RuntimeError: if the formta routine reports a nonzero ``ier``.
    """
    src_weights, = src_weight_vecs
    local_exps = self.local_expansion_zeros()

    formta = self.tree_indep.get_routine(
            "%ddformta" + ("_dp" if self.use_dipoles else ""), suffix="_imany")

    sources = self._get_single_sources_array()
    # sources_starts / sources_lists is a CSR list mapping box centers to
    # lists of starting indices into the sources array. To get the starting
    # source indices we have to look at box_source_starts.
    sources_offsets = self.tree.box_source_starts[lists]

    # nsources_starts / nsources_lists is a CSR list mapping box centers to
    # lists of indices into nsources, each of which represents a source
    # count.
    nsources = self.tree.box_source_counts_nonchild
    nsources_offsets = lists

    # centers is indexed into by values of centers_offsets, which is a list
    # mapping box indices to box center indices.
    centers = self._get_single_box_centers_array()

    source_kwargs = self.get_source_kwargs(src_weights, slice(None))

    for lev in range(self.tree.nlevels):
        lev_start, lev_stop = \
                level_start_target_or_target_parent_box_nrs[lev:lev+2]

        if lev_start == lev_stop:
            continue

        target_box_start, target_local_exps_view = \
                self.local_expansions_view(local_exps, lev)

        centers_offsets = target_or_target_parent_boxes[lev_start:lev_stop]

        rscale = self.level_to_rscale(lev)

        sources_starts = starts[lev_start:1 + lev_stop]
        nsources_starts = sources_starts

        kwargs = {}
        kwargs.update(self.kernel_kwargs)
        for key, val in source_kwargs.items():
            kwargs[key] = val
            # Add CSR lists mapping box centers to lists of starting positions
            # in the array of source strengths.
            # Since the source strengths have the same order as the sources,
            # these lists are the same as those for starting position in the
            # sources array.
            kwargs[key + "_starts"] = sources_starts
            kwargs[key + "_offsets"] = sources_offsets

        ier, expn = formta(
                rscale=rscale,
                sources=sources,
                sources_offsets=sources_offsets,
                sources_starts=sources_starts,
                nsources=nsources,
                nsources_starts=nsources_starts,
                nsources_offsets=nsources_offsets,
                centers=centers,
                centers_offsets=centers_offsets,
                nterms=self.level_orders[lev],
                **kwargs)

        if ier.any():
            raise RuntimeError("formta failed")

        target_local_exps_view[
                target_or_target_parent_boxes[lev_start:lev_stop]
                - target_box_start] = expn.T

    return local_exps
@log_process(logger)
@return_timing_data
def refine_locals(self, level_start_target_or_target_parent_box_nrs,
        target_or_target_parent_boxes, local_exps):
    """Translate each parent's local expansion downward into its children.

    Levels are processed from 1 upward, and each box on a level receives
    the translated expansion of its parent from the level above.

    :returns: *local_exps*, updated in place.
    """
    locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc")

    for target_lev in range(1, self.tree.nlevels):
        start, stop = level_start_target_or_target_parent_box_nrs[
                target_lev:target_lev+2]

        source_lev = target_lev - 1

        source_level_start_ibox, source_local_exps_view = \
                self.local_expansions_view(local_exps, source_lev)
        target_level_start_ibox, target_local_exps_view = \
                self.local_expansions_view(local_exps, target_lev)
        source_rscale = self.level_to_rscale(source_lev)
        target_rscale = self.level_to_rscale(target_lev)

        for tgt_ibox in target_or_target_parent_boxes[start:stop]:
            tgt_center = self.tree.box_centers[:, tgt_ibox]
            src_ibox = self.tree.box_parent_ids[tgt_ibox]
            src_center = self.tree.box_centers[:, src_ibox]

            kwargs = {}
            if self.dim == 3 and self.tree_indep.eqn_letter == "h":
                kwargs["radius"] = self.tree.root_extent * 2**(-target_lev)

            kwargs.update(self.kernel_kwargs)
            tmp_loc_exp = locloc(
                    rscale1=source_rscale,
                    center1=src_center,
                    expn1=source_local_exps_view[
                        src_ibox - source_level_start_ibox].T,

                    rscale2=target_rscale,
                    center2=tgt_center,
                    nterms2=self.level_orders[target_lev],

                    **kwargs)[..., 0]

            target_local_exps_view[
                    tgt_ibox - target_level_start_ibox] += tmp_loc_exp.T

    return local_exps
@log_process(logger)
@return_timing_data
def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
    """Evaluate each target box's local expansion at its own targets.

    :arg level_start_target_box_nrs: per-level start indices into
        *target_boxes*.
    :returns: the output array produced by :meth:`output_zeros` with the
        contributions added.
    """
    output = self.output_zeros()
    taeval = self.tree_indep.get_expn_eval_routine("ta")

    for lev in range(self.tree.nlevels):
        start, stop = level_start_target_box_nrs[lev:lev+2]
        if start == stop:
            continue

        source_level_start_ibox, source_local_exps_view = \
                self.local_expansions_view(local_exps, lev)

        rscale = self.level_to_rscale(lev)

        for tgt_ibox in target_boxes[start:stop]:
            tgt_pslice = self._get_target_slice(tgt_ibox)

            if tgt_pslice.stop - tgt_pslice.start == 0:
                continue

            tmp_pot, tmp_grad = taeval(
                    rscale=rscale,
                    center=self.tree.box_centers[:, tgt_ibox],
                    expn=source_local_exps_view[
                        tgt_ibox - source_level_start_ibox].T,
                    ztarg=self._get_targets(tgt_pslice),
                    **self.kernel_kwargs)

            self.add_potgrad_onto_output(
                    output, tgt_pslice, tmp_pot, tmp_grad)

    return output
@log_process(logger)
def finalize_potentials(self, potential, template_ary):
    """Return *potential* multiplied by the kernel's scale factor.

    The factor is ``-1/(2*pi)`` for 2D Laplace, ``1`` for 2D Helmholtz and
    ``1/(4*pi)`` for 3D Laplace/Helmholtz. For 2D Laplace only the real part
    of *potential* is kept. *template_ary* is not used here.

    :raises NotImplementedError: for any other kernel/dimension combination.
    """
    if self.tree_indep.eqn_letter == "l" and self.dim == 2:
        scale_factor = -1/(2*np.pi)
    elif self.tree_indep.eqn_letter == "h" and self.dim == 2:
        scale_factor = 1
    elif self.tree_indep.eqn_letter in ["l", "h"] and self.dim == 3:
        scale_factor = 1/(4*np.pi)
    else:
        raise NotImplementedError(
                f"scale factor for pyfmmlib {self.tree_indep.eqn_letter} "
                f"for {self.dim} dimensions")

    if self.tree_indep.eqn_letter == "l" and self.dim == 2:
        potential = potential.real

    return potential * scale_factor

# }}}
# vim: foldmethod=marker
"""
Rotation classes data structure
-------------------------------
.. autoclass:: RotationClassesInfo
Build rotation classes
----------------------
.. autoclass:: RotationClassesBuilder
"""
__copyright__ = "Copyright (C) 2019 Matt Wala"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import numpy as np
import pyopencl as cl
import pyopencl.array
from boxtree.tools import DeviceDataRecord
from boxtree.translation_classes import TranslationClassesBuilder
logger = logging.getLogger(__name__)
from pytools import log_process
# {{{ rotation classes builder
class RotationClassesInfo(DeviceDataRecord):
    r"""Interaction lists that support matrix precomputation for
    rotation-based translations ("point and shoot").

    .. attribute:: nfrom_sep_siblings_rotation_classes

        The number of distinct rotation classes.

    .. attribute:: from_sep_siblings_rotation_classes

        ``int32 [*]``

        A list, corresponding to *from_sep_siblings_lists* of *trav*, of
        the rotation class of each box pair.

    .. attribute:: from_sep_siblings_rotation_class_to_angle

        ``coord_t [nfrom_sep_siblings_rotation_classes]``

        Maps rotation classes in *from_sep_siblings_rotation_classes* to
        rotation angles. This represents the angle between box translation
        pairs and the *z*-axis.
    """

    @property
    def nfrom_sep_siblings_rotation_classes(self):
        # One angle is stored per rotation class.
        class_to_angle = self.from_sep_siblings_rotation_class_to_angle
        return len(class_to_angle)
class RotationClassesBuilder:
"""Build rotation classes for List 2 translations.
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
    """Store *context* and create a :class:`TranslationClassesBuilder` for it."""
    self.context = context
    translation_classes_builder = TranslationClassesBuilder(context)
    self.tcb = translation_classes_builder
@staticmethod
def vec_gcd(vec):
    """Return the GCD of a list of integers."""
    running_gcd = abs(vec[0])

    for entry in vec[1:]:
        # Euclid's algorithm on (running_gcd, |entry|).
        remainder = abs(entry)
        while remainder:
            running_gcd, remainder = remainder, running_gcd % remainder

    return running_gcd
def compute_rotation_classes(self,
well_sep_is_n_away, dimensions, used_translation_classes):
"""Convert translation classes to a list of rotation classes and angles."""
angle_to_rot_class = {}
angles = []
ntranslation_classes_per_level = (
self.tcb.ntranslation_classes_per_level(well_sep_is_n_away,
dimensions))
translation_class_to_rot_class = (
np.empty(ntranslation_classes_per_level, dtype=np.int32))
translation_class_to_rot_class[:] = -1
for cls in used_translation_classes:
vec = self.tcb.translation_class_to_normalized_vector(
well_sep_is_n_away, dimensions, cls)
# Normalize the translation vector (by dividing by its GCD).
#
# We need this before computing the cosine of the rotation angle,
# because generally in in floating point arithmetic, if k is a
# positive scalar and v is a vector, we can't assume
#
# kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2).
#
# Normalizing ensures vectors that are positive integer multiples of
# each other get classified into the same equivalence class of
# rotations.
vec //= self.vec_gcd(vec)
# Compute the rotation angle for the vector.
norm = np.linalg.norm(vec)
assert norm != 0
angle = np.arccos(vec[-1] / norm)
# Find the rotation class.
if angle in angle_to_rot_class:
rot_class = angle_to_rot_class[angle]
else:
rot_class = len(angles)
angle_to_rot_class[angle] = rot_class
angles.append(angle)
translation_class_to_rot_class[cls] = rot_class
return translation_class_to_rot_class, angles
@log_process(logger, "build m2l rotation classes")
def __call__(self, queue, trav, tree, wait_for=None):
"""Returns a pair *info*, *evt* where info is a :class:`RotationClassesInfo`.
"""
evt, translation_class_is_used, translation_classes_lists = \
self.tcb.compute_translation_classes(queue, trav, tree, wait_for, False)
d = tree.dimensions
n = trav.well_sep_is_n_away
# convert translation classes to rotation classes
used_translation_classes = (
np.flatnonzero(translation_class_is_used.get()))
translation_class_to_rotation_class, rotation_angles = (
self.compute_rotation_classes(n, d, used_translation_classes))
# There should be no more than 2^(d-1) * (2n+1)^d distinct rotation
# classes, since that is an upper bound on the number of distinct
# positions for list 2 boxes.
assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d
rotation_classes_lists = (
cl.array.take(
cl.array.to_device(queue, translation_class_to_rotation_class),
translation_classes_lists))
rotation_angles = cl.array.to_device(queue, np.array(rotation_angles))
return RotationClassesInfo(
from_sep_siblings_rotation_classes=rotation_classes_lists,
from_sep_siblings_rotation_class_to_angle=rotation_angles,
).with_queue(None), evt
# }}}
# vim: filetype=pyopencl:fdm=marker
"""
.. autoclass:: TimingResult
.. autoclass:: TimingFuture
"""
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from collections.abc import Mapping
# {{{ timing result
class TimingResult(Mapping):
    """Interface for returned timing data.

    Timing values are exposed read-only through the mapping protocol, and two
    results can be combined with :meth:`merge`.

    .. automethod:: merge
    """

    def __init__(self, *args, **kwargs):
        """See constructor for :class:`dict`."""
        self._mapping = dict(*args, **kwargs)

    def __len__(self):
        return len(self._mapping)

    def __iter__(self):
        return iter(self._mapping)

    def __getitem__(self, key):
        return self._mapping[key]

    def merge(self, other):
        """Merge this result with another by adding together common fields.

        Fields that are missing (or *None*) on either side are dropped from
        the merged result.
        """
        summed = {}

        for key in self:
            mine = self.get(key)
            theirs = other.get(key)

            if mine is not None and theirs is not None:
                summed[key] = mine + theirs

        return type(self)(summed)
# }}}
# {{{ timing future
class TimingFuture:
    """Abstract handle on the timing data of a possibly asynchronous operation.

    Subclasses are expected to override both methods.

    .. automethod:: result
    .. automethod:: done
    """

    def result(self):
        """Return a :class:`TimingResult`. May block."""
        raise NotImplementedError()

    def done(self):
        """Return *True* if the operation is complete."""
        raise NotImplementedError()
# }}}
# {{{ timing recorder
class TimingRecorder:
    """Collects timing futures under string descriptions and merges them."""

    def __init__(self):
        from collections import defaultdict
        # description -> list of futures recorded under that description
        self.futures = defaultdict(list)

    def add(self, description, future):
        """Record *future* under *description*."""
        self.futures[description].append(future)

    def summarize(self):
        """Return a dict mapping each description to the merged result of
        all futures recorded under it.

        Results are obtained via ``future.result()`` and combined pairwise,
        in recording order, with the result's ``merge`` method.
        """
        from functools import reduce

        summary = {}

        for description, futures_list in self.futures.items():
            pending = iter(futures_list)
            first_result = next(pending).result()
            summary[description] = reduce(
                    lambda merged, fut: merged.merge(fut.result()),
                    pending, first_result)

        return summary
# }}}
# {{{ time recording tool
class DummyTimingFuture(TimingFuture):
    """A :class:`TimingFuture` whose result is already available at
    construction time.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are forwarded unchanged to TimingResult.
        self._result = TimingResult(*args, **kwargs)

    @classmethod
    def from_timer(cls, timer):
        """Build a future from the ``wall_elapsed`` and ``process_elapsed``
        attributes of *timer*.
        """
        return cls(
                wall_elapsed=timer.wall_elapsed,
                process_elapsed=timer.process_elapsed)

    @classmethod
    def from_op_count(cls, op_count):
        """Build a future carrying *op_count* as ``ops_elapsed``."""
        return cls(ops_elapsed=op_count)

    def done(self):
        return True

    def result(self):
        return self._result
def return_timing_data(wrapped):
    """A decorator for recording timing data for a function call.

    Calling the decorated function yields a tuple (*retval*, *timing_future*):
    *retval* is what *wrapped* returned, and *timing_future* supports the
    timing data future interface in :mod:`boxtree.fmm`.
    """
    from pytools import ProcessTimer
    from functools import wraps

    @wraps(wrapped)
    def wrapper(*args, **kwargs):
        # The timer spans exactly the wrapped call.
        timer = ProcessTimer()
        retval = wrapped(*args, **kwargs)
        timer.done()

        return (retval, DummyTimingFuture.from_timer(timer))

    return wrapper
# }}}
# vim: foldmethod=marker
from __future__ import division
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
......@@ -22,37 +20,47 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
from functools import partial
from typing import Any
import numpy as np
from pytools import Record, memoize_method
import pyopencl as cl
import pyopencl.array # noqa
from pyopencl.tools import first_arg_dependent_memoize_nested
from mako.template import Template
import pyopencl as cl
import pyopencl.array
import pyopencl.cltypes as cltypes
from pyopencl.tools import ScalarArg, VectorArg as _VectorArg, dtype_to_c_struct
from pytools import Record, memoize_method
from pytools.obj_array import make_obj_array
# Use offsets in VectorArg by default.
# Every VectorArg created through this module-level name is built with
# with_offset=True; the unwrapped class remains available as _VectorArg.
VectorArg = partial(_VectorArg, with_offset=True)
# Axis letters in axis order.
# NOTE(review): presumably indexed by axis number when generating code; the
# users of this tuple are not visible in this part of the file.
AXIS_NAMES = ("x", "y", "z", "w")
def padded_bin(i, nbits):
    """Format *i* as binary number, pad it to length *nbits*.

    The result is left-padded with ``"0"``; it is not truncated if *i* needs
    more than *nbits* digits.
    """
    # bin() yields "0b..."; drop the prefix before padding.
    return bin(i)[2:].rjust(nbits, "0")
# NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__()
def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=None):
    """Allocate a new array of shape *new_shape* and copy *ary* into it.

    :arg queue: the command queue used for allocation and the copy.
    :arg allocator: passed on to the array constructor.
    :arg new_shape: shape of the newly allocated array.
    :arg ary: the source array; its dtype is reused and ``ary.nbytes`` bytes
        are copied.
    :arg zero_fill: if true, the new array is created with
        :func:`pyopencl.array.zeros` instead of :func:`pyopencl.array.empty`.
    :arg wait_for: optional list of events the copy should wait for.
    :returns: a tuple ``(new_ary, evt)`` where *evt* is the copy event.
    """
    if wait_for is None:
        wait_for = []

    if zero_fill:  # noqa: SIM108
        array_maker = cl.array.zeros
    else:
        array_maker = cl.array.empty

    new_ary = array_maker(queue, shape=new_shape, dtype=ary.dtype,
                          allocator=allocator)

    # The copy must also wait for whatever events are attached to the new
    # array (e.g. a pending zero fill).
    evt = cl.enqueue_copy(queue, new_ary.data, ary.data, byte_count=ary.nbytes,
                          wait_for=wait_for + new_ary.events)

    return new_ary, evt
......@@ -91,8 +99,8 @@ def reverse_index_array(indices, target_size=None, result_fill_value=None,
# {{{ particle distribution generators
def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15):
from pyopencl.clrandom import RanluxGenerator
rng = RanluxGenerator(queue, seed=seed)
from pyopencl.clrandom import PhiloxGenerator
rng = PhiloxGenerator(queue.context, seed=seed)
return make_obj_array([
rng.normal(queue, nparticles, dtype=dtype)
......@@ -103,25 +111,29 @@ def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15):
import loopy as lp
if dims == 2:
@first_arg_dependent_memoize_nested
def get_2d_knl(context, dtype):
def get_2d_knl(dtype):
knl = lp.make_kernel(
"{[i]: 0<=i<n}",
"""
<> phi = 2*M_PI/n * i
x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
""",
for i
<> phi = 2*M_PI/n * i
x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
end
""",
[
lp.GlobalArg("x,y", dtype, shape=lp.auto),
lp.ValueArg("n", np.int32),
])
],
name="make_surface_particles_2d",
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
)
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
return lp.CompiledKernel(context, knl)
return knl.executor(queue.context)
evt, result = get_2d_knl(queue.context, dtype)(queue, n=nparticles)
_evt, result = get_2d_knl(dtype)(queue, n=nparticles)
result = [x.ravel() for x in result]
......@@ -129,28 +141,33 @@ def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15):
elif dims == 3:
n = int(nparticles**0.5)
@first_arg_dependent_memoize_nested
def get_3d_knl(context, dtype):
def get_3d_knl(dtype):
knl = lp.make_kernel(
"{[i,j]: 0<=i,j<n}",
"""
<> phi = 2*M_PI/n * i
<> theta = 2*M_PI/n * j
x[i,j] = 5*cos(phi) * (3 + cos(theta))
y[i,j] = 5*sin(phi) * (3 + cos(theta))
z[i,j] = 5*sin(theta)
""",
for i,j
<> phi = 2*M_PI/n * i
<> theta = 2*M_PI/n * j
x[i,j] = 5*cos(phi) * (3 + cos(theta))
y[i,j] = 5*sin(phi) * (3 + cos(theta))
z[i,j] = 5*sin(theta)
end
""",
[
lp.GlobalArg("x,y,z,", dtype, shape=lp.auto),
lp.ValueArg("n", np.int32),
])
],
assumptions="n>0",
name="make_surface_particles_3d",
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
)
knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
return lp.CompiledKernel(context, knl)
return knl.executor(queue.context)
evt, result = get_3d_knl(queue.context, dtype)(queue, n=n)
_evt, result = get_3d_knl(dtype)(queue, n=n)
result = [x.ravel() for x in result]
......@@ -165,30 +182,35 @@ def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
if dims == 2:
n = int(nparticles**0.5)
@first_arg_dependent_memoize_nested
def get_2d_knl(context, dtype):
def get_2d_knl(dtype):
knl = lp.make_kernel(
"{[i,j]: 0<=i,j<n}",
"""
<> xx = 4*i/(n-1)
<> yy = 4*j/(n-1)
<float64> angle = 0.3
<> s = sin(angle)
<> c = cos(angle)
x[i,j] = c*xx + s*yy - 2
y[i,j] = -s*xx + c*yy - 2
""",
for i,j
<> xx = 4*i/(n-1)
<> yy = 4*j/(n-1)
<float64> angle = 0.3
<> s = sin(angle)
<> c = cos(angle)
x[i,j] = c*xx + s*yy - 2
y[i,j] = -s*xx + c*yy - 2
end
""",
[
lp.GlobalArg("x,y", dtype, shape=lp.auto),
lp.ValueArg("n", np.int32),
], assumptions="n>0")
],
assumptions="n>0",
name="make_uniform_particles_2d",
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
)
knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
return lp.CompiledKernel(context, knl)
return knl.executor(queue.context)
evt, result = get_2d_knl(queue.context, dtype)(queue, n=n)
_evt, result = get_2d_knl(dtype)(queue, n=n)
result = [x.ravel() for x in result]
......@@ -196,42 +218,47 @@ def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
elif dims == 3:
n = int(nparticles**(1/3))
@first_arg_dependent_memoize_nested
def get_3d_knl(context, dtype):
def get_3d_knl(dtype):
knl = lp.make_kernel(
"{[i,j,k]: 0<=i,j,k<n}",
"""
<> xx = i/(n-1)
<> yy = j/(n-1)
<> zz = k/(n-1)
<float64> phi = 0.3
<> s1 = sin(phi)
<> c1 = cos(phi)
<> xxx = c1*xx + s1*yy
<> yyy = -s1*xx + c1*yy
<> zzz = zz
<float64> theta = 0.7
<> s2 = sin(theta)
<> c2 = cos(theta)
x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
y[i,j,k] = 4 * yyy - 2
z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
""",
for i,j,k
<> xx = i/(n-1)
<> yy = j/(n-1)
<> zz = k/(n-1)
<float64> phi = 0.3
<> s1 = sin(phi)
<> c1 = cos(phi)
<> xxx = c1*xx + s1*yy
<> yyy = -s1*xx + c1*yy
<> zzz = zz
<float64> theta = 0.7
<> s2 = sin(theta)
<> c2 = cos(theta)
x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
y[i,j,k] = 4 * yyy - 2
z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
end
""",
[
lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
lp.ValueArg("n", np.int32),
], assumptions="n>0")
],
assumptions="n>0",
name="make_uniform_particles_3d",
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
)
knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")
return lp.CompiledKernel(context, knl)
return knl.executor(queue.context)
evt, result = get_3d_knl(queue.context, dtype)(queue, n=n)
_evt, result = get_3d_knl(dtype)(queue, n=n)
result = [x.ravel() for x in result]
......@@ -259,35 +286,53 @@ class DeviceDataRecord(Record):
instances on the host.
"""
def _transform_arrays(self, f, exclude_fields=frozenset()):
    """Return a copy of *self* with *f* applied to the value of every field.

    :arg f: callable applied to each field value. Object arrays are mapped
        entry by entry, lists element by element (recursively), and for
        ``BuiltList`` values every public attribute other than ``count``
        is mapped.
    :arg exclude_fields: names of fields that are left untouched.
    """
    result = {}

    def transform_val(val):
        from pyopencl.algorithm import BuiltList

        if isinstance(val, np.ndarray) and val.dtype == object:
            from pytools.obj_array import obj_array_vectorize
            return obj_array_vectorize(f, val)
        elif isinstance(val, list):
            return [transform_val(i) for i in val]
        elif isinstance(val, BuiltList):
            transformed_list = {}
            for field in val.__dict__:
                # "count" is carried over unchanged below; private
                # attributes are skipped.
                if field != "count" and not field.startswith("_"):
                    transformed_list[field] = f(getattr(val, field))
            return BuiltList(count=val.count, **transformed_list)
        else:
            return f(val)

    for field_name in self.__class__.fields:
        if field_name in exclude_fields:
            continue

        try:
            attr = getattr(self, field_name)
        except AttributeError:
            # Declared fields that are not set on this instance are skipped.
            pass
        else:
            result[field_name] = transform_val(attr)

    return self.copy(**result)
def get(self, queue, **kwargs):
    """Return a copy of `self` in which all data lives on the host, i.e.
    all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` objects are
    replaced by corresponding :class:`numpy.ndarray` instances on the host.

    :arg queue: passed as ``queue=`` to each value's ``get`` method.
    :arg kwargs: additional keyword arguments forwarded to ``get``.
    """
    def try_get(attr):
        if isinstance(attr, ImmutableHostDeviceArray):
            return attr.host

        # Values without a ``get`` method are returned unchanged.
        try:
            get_meth = attr.get
        except AttributeError:
            return attr

        return get_meth(queue=queue, **kwargs)

    return self._transform_arrays(try_get)
......@@ -311,13 +356,52 @@ class DeviceDataRecord(Record):
return self._transform_arrays(try_with_queue)
def to_device(self, queue, exclude_fields=frozenset()):
    """Return a copy of `self` in which all :class:`numpy.ndarray` arrays are
    transferred to device memory as :class:`pyopencl.array.Array` objects.

    :arg exclude_fields: a :class:`frozenset` containing the fields to
        exclude from the transfer to device memory.
    """
    def _to_device(attr):
        if isinstance(attr, np.ndarray):
            # Transfer, then detach the queue from the resulting array.
            return cl.array.to_device(queue, attr).with_queue(None)

        if isinstance(attr, ImmutableHostDeviceArray):
            return attr.device

        if isinstance(attr, DeviceDataRecord):
            # Nested records are converted recursively (without exclusions).
            return attr.to_device(queue)

        return attr

    return self._transform_arrays(_to_device, exclude_fields=exclude_fields)
def to_host_device_array(self, queue, exclude_fields=frozenset()):
    """Return a copy of `self` where all device and host arrays are transformed
    to `ImmutableHostDeviceArray` objects.

    :arg exclude_fields: a :class:`frozenset` containing the fields to
        exclude from the transformation to `ImmutableHostDeviceArray`.
    """
    def _to_host_device_array(attr):
        if isinstance(attr, (np.ndarray, cl.array.Array)):
            return ImmutableHostDeviceArray(queue, attr)

        if isinstance(attr, DeviceDataRecord):
            # Nested records are converted recursively (without exclusions).
            return attr.to_host_device_array(queue)

        return attr

    return self._transform_arrays(
        _to_host_device_array, exclude_fields=exclude_fields
    )
# }}}
# {{{ type mangling
def get_type_moniker(dtype):
    """Return a short string naming *dtype*: its ``kind`` character followed
    by its ``itemsize``, e.g. ``"f8"``.
    """
    return f"{dtype.kind}{dtype.itemsize}"
# }}}
......@@ -328,14 +412,22 @@ GAPPY_COPY_TPL = Template(r"""//CL//
typedef ${dtype_to_ctype(dtype)} value_t;
value_t val = input_ary[from_indices[i]];
%if from_indices:
value_t val = input_ary[from_indices[i]];
%else:
value_t val = input_ary[i];
%endif
// Optionally, noodle values through a lookup table.
%if map_values:
val = value_map[val];
%endif
output_ary[i] = val;
%if to_indices:
output_ary[to_indices[i]] = val;
%else:
output_ary[i] = val;
%endif
""", strict_undefined=True)
......@@ -345,51 +437,514 @@ class GappyCopyAndMapKernel:
self.context = context
@memoize_method
def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
                have_src_indices, have_dst_indices, map_values):
    """Build the elementwise kernel rendered from ``GAPPY_COPY_TPL``.

    :arg dtype: dtype of the input and output arrays (and of the value map).
    :arg src_index_dtype: dtype of ``from_indices``; only used if
        *have_src_indices* is true.
    :arg dst_index_dtype: dtype of ``to_indices``; only used if
        *have_dst_indices* is true.
    :arg have_src_indices: whether the kernel takes a ``from_indices``
        argument.
    :arg have_dst_indices: whether the kernel takes a ``to_indices`` argument.
    :arg map_values: whether the kernel takes a ``value_map`` argument.
    """
    from boxtree.tools import VectorArg

    # The order here defines the positional argument order of the kernel.
    args = [
            VectorArg(dtype, "input_ary"),
            VectorArg(dtype, "output_ary"),
            ]

    if have_src_indices:
        args.append(VectorArg(src_index_dtype, "from_indices"))

    if have_dst_indices:
        args.append(VectorArg(dst_index_dtype, "to_indices"))

    if map_values:
        args.append(VectorArg(dtype, "value_map"))

    from pyopencl.tools import dtype_to_ctype
    src = GAPPY_COPY_TPL.render(
            dtype=dtype,
            dtype_to_ctype=dtype_to_ctype,
            from_dtype=src_index_dtype,
            to_dtype=dst_index_dtype,
            from_indices=have_src_indices,
            to_indices=have_dst_indices,
            map_values=map_values)

    from pyopencl.elementwise import ElementwiseKernel
    return ElementwiseKernel(self.context,
            args, str(src),
            preamble=dtype_to_c_struct(self.context.devices[0], dtype),
            name="gappy_copy_and_map")
# NOTE: Order of positional args should match realloc_array()
def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
             dst_indices=None, map_values=None, zero_fill=False,
             wait_for=None, range=None, debug=False):
    """Compresses box info arrays after empty leaf pruning and, optionally,
    maps old box IDs to new box IDs (if the array being operated on contains
    box IDs).

    :arg new_shape: shape of the newly allocated result array.
    :arg ary: the input array; the result has the same dtype.
    :arg src_indices: optional indices into *ary* to read from.
    :arg dst_indices: optional indices into the result to write to.
    :arg map_values: optional lookup table applied to each copied value.
    :arg zero_fill: if true, the result is created zero-initialized.
    :arg range: the index range the kernel runs over. If omitted, it covers
        the single index array that was passed; it is required when both
        *src_indices* and *dst_indices* are given.
    :arg debug: if true (and *range* was omitted), assert that the largest
        index is in bounds.
    :returns: a tuple ``(result, evt)``.
    :raises ValueError: if neither index array is given, or if both are
        given without *range*.
    """
    have_src_indices = src_indices is not None
    have_dst_indices = dst_indices is not None
    have_map_values = map_values is not None

    if not (have_src_indices or have_dst_indices):
        raise ValueError("must specify at least one of src or dest indices")

    if range is None:
        if have_src_indices and have_dst_indices:
            raise ValueError(
                "must supply range when passing both src and dest indices")
        elif have_src_indices:
            range = slice(src_indices.shape[0])
            if debug:
                assert int(cl.array.max(src_indices).get()) < len(ary)
        elif have_dst_indices:
            range = slice(dst_indices.shape[0])
            if debug:
                assert int(cl.array.max(dst_indices).get()) < new_shape

    if zero_fill:  # noqa: SIM108
        array_maker = cl.array.zeros
    else:
        array_maker = cl.array.empty

    result = array_maker(queue, new_shape, ary.dtype, allocator=allocator)

    kernel = self._get_kernel(ary.dtype,
                              src_indices.dtype if have_src_indices else None,
                              dst_indices.dtype if have_dst_indices else None,
                              have_src_indices,
                              have_dst_indices,
                              have_map_values)

    # Argument order must match the order built up in _get_kernel().
    args = (ary, result)
    args += (src_indices,) if have_src_indices else ()
    args += (dst_indices,) if have_dst_indices else ()
    args += (map_values,) if have_map_values else ()

    evt = kernel(*args, queue=queue, range=range, wait_for=wait_for)

    return result, evt

# }}}

# {{{ map values through table
from pyopencl.elementwise import ElementwiseTemplate
# Elementwise kernel template computing dst[i] = map_values[src[i]].
# The type names src_value_t and dst_value_t are placeholders; they are bound
# to concrete dtypes via the type aliases passed to .build().
MAP_VALUES_TPL = ElementwiseTemplate(
arguments="""//CL//
dst_value_t *dst,
src_value_t *src,
dst_value_t *map_values
""",
operation=r"""//CL//
dst[i] = map_values[src[i]];
""",
name="map_values")
class MapValuesKernel:
    """Callable that sends the entries of an array through a lookup table."""

    def __init__(self, context):
        self.context = context

    @memoize_method
    def _get_kernel(self, dst_dtype, src_dtype):
        # Bind the template's placeholder type names to concrete dtypes.
        return MAP_VALUES_TPL.build(
                self.context,
                (
                    ("src_value_t", src_dtype),
                    ("dst_value_t", dst_dtype),
                ))

    def __call__(self, map_values, src, dst=None):
        """
        Map the entries of the array `src` through the table `map_values`.

        If `dst` is omitted, the result is written back into `src`.

        :returns: a tuple ``(dst, evt)``.
        """
        target = src if dst is None else dst

        kernel = self._get_kernel(target.dtype, src.dtype)
        evt = kernel(target, src, map_values)

        return target, evt
# }}}
# {{{ binary search
from mako.template import Template
# Mako template for an inline OpenCL C helper function named "bsearch".
# ${elem_t} is substituted with the element type name at render time.
# NOTE(review): the generated code reads arr[0] before looking at len, so
# callers presumably must pass a non-empty array -- confirm against users.
BINARY_SEARCH_TEMPLATE = Template("""
/*
* Returns the largest value of i such that arr[i] <= val, or (size_t) -1 if val
* is less than all values.
*/
inline size_t bsearch(
__global const ${elem_t} *arr,
size_t len,
const ${elem_t} val)
{
if (val < arr[0])
{
return -1;
}
size_t l = 0, r = len, i;
while (1)
{
i = l + (r - l) / 2;
if (arr[i] <= val && (i == len - 1 || val < arr[i + 1]))
{
return i;
}
if (arr[i] <= val)
{
l = i;
}
else
{
r = i;
}
}
}
""")
class InlineBinarySearch:
    """Source snippet for the ``bsearch`` helper, specialized to one element
    type. Convert to :class:`str` to obtain the rendered code.
    """

    def __init__(self, elem_type_name):
        # Variables handed to the template when rendering.
        self.render_vars = dict(elem_t=elem_type_name)

    @memoize_method
    def __str__(self):
        return BINARY_SEARCH_TEMPLATE.render(**self.render_vars)
# }}}
# vim: foldmethod=marker:filetype=pyopencl
# {{{ compress a masked array into a list / list of lists
# Generator body for compressing a 1D mask: index i is appended to the
# "output" list whenever mask[i] is nonzero.
MASK_LIST_COMPRESSOR_BODY = r"""
void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
{
if (mask[i])
{
APPEND_output(i);
}
}
"""
# Generator body for compressing a 2D mask row by row: for row i, every
# column j with a nonzero mask entry is appended to the "output" list. The
# entry is addressed as outer_stride * i + j * inner_stride.
MASK_MATRIX_COMPRESSOR_BODY = r"""
void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
{
for (int j = 0; j < ncols; ++j)
{
if (mask[outer_stride * i + j * inner_stride])
{
APPEND_output(j);
}
}
}
"""
class MaskCompressorKernel:
    """
    .. automethod:: __call__
    """

    def __init__(self, context):
        self.context = context

    @memoize_method
    def get_list_compressor_kernel(self, mask_dtype, list_dtype):
        """Return the list builder used for 1D masks."""
        from pyopencl.algorithm import ListOfListsBuilder

        kernel_args = [
                _VectorArg(mask_dtype, "mask"),
                ]

        return ListOfListsBuilder(
                self.context,
                [("output", list_dtype)],
                MASK_LIST_COMPRESSOR_BODY,
                kernel_args,
                name_prefix="compress_list")

    @memoize_method
    def get_matrix_compressor_kernel(self, mask_dtype, list_dtype):
        """Return the list builder used for 2D masks."""
        from pyopencl.algorithm import ListOfListsBuilder

        kernel_args = [
                ScalarArg(np.int32, "ncols"),
                ScalarArg(np.int32, "outer_stride"),
                ScalarArg(np.int32, "inner_stride"),
                _VectorArg(mask_dtype, "mask"),
                ]

        return ListOfListsBuilder(
                self.context,
                [("output", list_dtype)],
                MASK_MATRIX_COMPRESSOR_BODY,
                kernel_args,
                name_prefix="compress_matrix")

    def __call__(self, queue, mask, list_dtype=None):
        """Convert a mask to a list in :ref:`csr` format.

        :arg mask: Either a 1D or 2D array.

            * If *mask* is 1D, it should represent a masked list, where
              *mask[i]* is true if and only if *i* is in the list.
            * If *mask* is 2D, it should represent a list of masked lists,
              so that *mask[i,j]* is true if and only if *j* is in list *i*.

        :arg list_dtype: The dtype for the output list(s). Defaults to the mask
            dtype.

        :returns: The return value depends on the type of the input.

            * If *mask* is 1D, returns a tuple *(list, evt)*.
            * If *mask* is 2D, returns a tuple *(starts, lists, event)*, as a
              :ref:`csr` list.
        """
        if list_dtype is None:
            list_dtype = mask.dtype

        ndim = len(mask.shape)

        if ndim == 1:
            knl = self.get_list_compressor_kernel(mask.dtype, list_dtype)
            result, evt = knl(queue, mask.shape[0], mask.data)
            return (result["output"].lists, evt)

        if ndim == 2:
            # FIXME: This is efficient for small column sizes but may not be
            # for larger ones since the work is partitioned by row.
            knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype)

            # Strides are passed to the kernel in units of elements, not bytes.
            size = mask.dtype.itemsize
            assert size > 0
            outer_stride = mask.strides[0] // size
            inner_stride = mask.strides[1] // size

            result, evt = knl(queue, mask.shape[0], mask.shape[1],
                              outer_stride, inner_stride,
                              mask.data)
            output = result["output"]
            return (output.starts, output.lists, evt)

        raise ValueError("unsupported dimensionality")
# }}}
# {{{ Communication pattern for partial multipole expansions
class AllReduceCommPattern:
    """Describes a tree-like communication pattern for exchanging and reducing
    multipole expansions. Supports an arbitrary number of processes.

    A user must instantiate a version of this with identical *size* and varying
    *rank* on each rank. During each stage, each rank sends its contribution to
    the reduction results on ranks returned by :meth:`sinks` and listens for
    contributions from :meth:`sources`. :meth:`messages` can be used for
    determining array indices whose partial results need to be sent during the
    current stage. Then, all ranks call :meth:`advance` and use :meth:`done` to
    check whether the communication is complete. In the use case of multipole
    communication, the reduction result is a vector of multipole expansions to
    which all ranks add contribution. These contributions are communicated
    sparsely via arrays of box indices and expansions.

    .. automethod:: __init__
    .. automethod:: sources
    .. automethod:: sinks
    .. automethod:: messages
    .. automethod:: advance
    .. automethod:: done
    """

    def __init__(self, rank, size):
        """
        :arg rank: Current rank.
        :arg size: Total number of ranks.
        """
        assert 0 <= rank < size
        self.rank = rank
        # The current rank interval is [left, right), split at midpoint.
        self.left, self.right = 0, size
        self.midpoint = size // 2

    def _in_lower_half(self):
        # Whether this rank lies in [left, midpoint) of the current interval.
        return self.rank < self.midpoint

    def _partner_and_bound(self):
        # Returns the rank at the same offset in the opposite half, plus the
        # exclusive upper bound of that opposite half.
        if self._in_lower_half():
            return self.midpoint + (self.rank - self.left), self.right
        return self.left + (self.rank - self.midpoint), self.midpoint

    def sources(self):
        """Return the set of source nodes at the current communication stage. The
        current rank receives messages from these ranks.
        """
        partner, bound = self._partner_and_bound()

        if self._in_lower_half():
            last_in_half = self.rank == self.midpoint - 1
        else:
            last_in_half = self.rank == self.right - 1

        if last_in_half:
            # The halves may differ in size by one; the last rank of a half
            # either has no counterpart or covers two ranks.
            if partner == bound:
                return set()
            if partner == bound - 2:
                return {partner, partner + 1}

        return {partner}

    def sinks(self):
        """Return the set of sink nodes at this communication stage. The current rank
        sends a message to these ranks.
        """
        partner, bound = self._partner_and_bound()

        # If the opposite half is one rank shorter, fall back to its last rank.
        if partner == bound:
            partner -= 1

        return {partner}

    def messages(self):
        """Return a range of ranks, such that the partial results of array indices
        used by these ranks are sent to the sinks. This is returned as a
        [start, end) pair. By design, it is a consecutive range.
        """
        if self._in_lower_half():
            return (self.midpoint, self.right)
        return (self.left, self.midpoint)

    def advance(self):
        """Advance to the next stage in the communication pattern.
        """
        if self.done():
            raise RuntimeError("finished communicating")

        # Shrink the interval to the half containing this rank.
        if self._in_lower_half():
            self.right = self.midpoint
            self.midpoint = (self.midpoint + self.left) // 2
        else:
            self.left = self.midpoint
            self.midpoint = (self.midpoint + self.right) // 2

    def done(self):
        """Return whether the current rank is finished communicating.
        """
        return self.right == self.left + 1
# }}}
# {{{ MPI launcher
def run_mpi(script: str, num_processes: int, env: dict[str, Any]) -> None:
    """Launch MPI processes.

    Runs ``mpiexec`` in a subprocess to start *num_processes* MPI processes
    executing *script*. A non-zero exit status raises
    :class:`subprocess.CalledProcessError`.

    :arg script: the Python script to run.
    :arg num_processes: the number of MPI process to launch.
    :arg env: a :class:`dict` of environment variables; values are converted
        to :class:`str` and layered on top of the current environment.
    """
    import os
    overrides = {key: str(value) for key, value in env.items()}
    env = {**os.environ, **overrides}

    import subprocess
    from mpi4py import MPI

    # Using "-m mpi4py" is necessary for avoiding deadlocks on exception cleanup
    # See https://mpi4py.readthedocs.io/en/stable/mpi4py.run.html for details.
    interpreter_args = [sys.executable, "-m", "mpi4py", script]
    command = ["mpiexec", "-np", str(num_processes)]

    if MPI.Get_library_version().startswith("Open MPI"):
        command.append("--oversubscribe")
        # Each variable is named explicitly with "-x" for Open MPI.
        for env_variable_name in env:
            command.extend(["-x", env_variable_name])

    command.extend(interpreter_args)

    subprocess.run(command, env=env, check=True)
# }}}
# {{{ HostDeviceArray
class ImmutableHostDeviceArray:
    """Interface for arrays on both host and device.

    The copy on the other side is created lazily on first access of
    :attr:`host` or :attr:`device` and kept afterwards.

    .. note:: This interface assumes the array is immutable. The behavior of
        modifying the content of either the host array or the device array is
        undefined.

    @TODO: Once available, replace this implementation with PyOpenCL's in-house
    implementation.
    """

    def __init__(self, queue, array):
        self.queue = queue
        self.shape = array.shape

        # Exactly one of these is filled in initially, depending on the type
        # of *array*; both stay None for any other type.
        self.host_array = array if isinstance(array, np.ndarray) else None
        self.device_array = None
        if self.host_array is None and isinstance(array, cl.array.Array):
            self.device_array = array

    def with_queue(self, queue):
        # Updates the queue in place; nothing is returned.
        self.queue = queue

    @property
    def svm_capable(self):
        capabilities = self.queue.device.get_info(
                cl.device_info.SVM_CAPABILITIES)
        fine_grain = cl.device_svm_capabilities.FINE_GRAIN_BUFFER
        return (capabilities & fine_grain) != 0

    @property
    def host(self):
        if self.host_array is not None:
            return self.host_array

        self.host_array = self.device_array.get(self.queue)
        return self.host_array

    @property
    def device(self):
        if self.device_array is None:
            # @TODO: Use SVM
            self.device_array = cl.array.to_device(self.queue, self.host_array)

        # NOTE(review): the return value of with_queue() is discarded here --
        # confirm whether the intent was to hand out an array bound to
        # self.queue.
        self.device_array.with_queue(self.queue)
        return self.device_array
# }}}
# {{{ coord_vec tools
def get_coord_vec_dtype(
        coord_dtype: np.dtype, dimensions: int) -> np.dtype:
    """Return the dtype of a coordinate vector with *dimensions* entries of
    type *coord_dtype*.

    In one dimension this is *coord_dtype* itself; otherwise the matching
    entry of ``cltypes.vec_types`` is returned.
    """
    is_scalar = dimensions == 1
    return coord_dtype if is_scalar else cltypes.vec_types[coord_dtype, dimensions]
def coord_vec_subscript_code(dimensions: int, vec_name: str, iaxis: int) -> str:
    """Return the code accessing component *iaxis* of the coordinate vector
    named *vec_name*.

    For vectors, this is ``<vec_name>.s<iaxis>``.
    """
    assert 0 <= iaxis < dimensions

    if dimensions != 1:
        return f"{vec_name}.s{iaxis}"

    # a coord_vec_t is just a scalar
    return vec_name
# }}}
# vim: foldmethod=marker
"""
Translation classes data structure
----------------------------------
.. autoclass:: TranslationClassesInfo
Build translation classes
-------------------------
.. autoclass:: TranslationClassesBuilder
"""
__copyright__ = "Copyright (C) 2019 Matt Wala"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
from functools import partial
import numpy as np
from mako.template import Template
import pyopencl as cl
import pyopencl.array
import pyopencl.cltypes
from pyopencl.elementwise import ElementwiseTemplate
from pytools import Record, memoize_method
from boxtree.tools import (
DeviceDataRecord,
InlineBinarySearch,
coord_vec_subscript_code,
get_coord_vec_dtype,
)
from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
logger = logging.getLogger(__name__)
from pytools import log_process
# {{{ translation classes builder
# Mako template for the OpenCL preamble of the translation class finder.
# It is rendered in get_kernel_info() with 'dimensions' and 'cvec_sub' and
# defines LEVEL_TO_RAD, get_normalized_translation_vector() and
# get_translation_class(). The text produced by
# str(InlineBinarySearch("box_id_t")) is appended to the template source
# before it is handed to mako.
TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE = Template(r"""//CL:mako//
#define LEVEL_TO_RAD(level) \
(root_extent * 1 / (coord_t) (1 << (level + 1)))
// Return an integer vector indicating the a translation direction
// as a multiple of the box diameter.
inline int_coord_vec_t get_normalized_translation_vector(
coord_t root_extent,
int level,
coord_vec_t source_center,
coord_vec_t target_center)
{
int_coord_vec_t result = (int_coord_vec_t) 0;
coord_t diam = 2 * LEVEL_TO_RAD(level);
%for i in range(dimensions):
${cvec_sub("result", i)} = rint(
(${cvec_sub("target_center", i)} - ${cvec_sub("source_center", i)})
/ diam);
%endfor
return result;
}
// Compute the translation class for the given translation vector. The
// translation class maps a translation vector (a_1, a_2, ..., a_d) into
// a dense range of integers [0, ..., (4*n+3)^d - 1], where
// d is the dimension and n is well_sep_is_n_away.
//
// The translation vector should be normalized for a box diameter of 1.
//
// This relies on the fact that the entries of the vector will
// always be in the range [-2n-1,...,2n+1].
//
// The mapping from vector to class is:
//
// \~~ d k-1
// cls(a ,a ,...,a ) = > (2n+1+a ) (4n+3)
// 1 2 d /__ k=1 k
//
// Returns -1 on error.
inline int get_translation_class(int_coord_vec_t vec, int well_sep_is_n_away)
{
int dim_bound = 2 * well_sep_is_n_away + 1;
%for i in range(dimensions):
if (!(-dim_bound <= ${cvec_sub("vec", i)}
&& ${cvec_sub("vec", i)} <= dim_bound))
{
return -1;
}
%endfor
int result = 0;
int base = 4 * well_sep_is_n_away + 3;
int mult = 1;
%for i in range(dimensions):
result += (2 * well_sep_is_n_away + 1 + ${cvec_sub("vec", i)}) * mult;
mult *= base;
%endfor
return result;
}
""" + str(InlineBinarySearch("box_id_t")),
strict_undefined=True)
# Elementwise kernel template that assigns a translation class to every
# entry of from_sep_siblings_lists. For index i it:
#   * reads the source box from from_sep_siblings_lists[i] and finds the
#     matching target box via bsearch() on from_sep_siblings_starts,
#   * sets error_flag (atomic_or) and skips the entry if the two boxes are on
#     different levels or get_translation_class() returns -1,
#   * optionally offsets the class by level * ntranslation_classes_per_level
#     (when translation_class_per_level is set at build time),
#   * stores the class in translation_classes[i] and marks it in
#     translation_class_is_used.
# The helper functions it calls come from the preamble template above and are
# supplied through more_preamble in get_kernel_info().
TRANSLATION_CLASS_FINDER_TEMPLATE = ElementwiseTemplate(
arguments=r"""//CL:mako//
/* input: */
box_id_t *from_sep_siblings_lists,
box_id_t *from_sep_siblings_starts,
box_id_t *target_or_target_parent_boxes,
int ntarget_or_target_parent_boxes,
coord_t *box_centers,
int aligned_nboxes,
coord_t root_extent,
box_level_t *box_levels,
int well_sep_is_n_away,
/* output: */
int *translation_classes,
int *translation_class_is_used,
int *error_flag,
""",
operation=TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
// Find the target box for this source box.
box_id_t source_box_id = from_sep_siblings_lists[i];
size_t itarget_box = bsearch(
from_sep_siblings_starts, 1 + ntarget_or_target_parent_boxes, i);
box_id_t target_box_id = target_or_target_parent_boxes[itarget_box];
// Ensure levels are the same.
if (box_levels[source_box_id] != box_levels[target_box_id])
{
atomic_or(error_flag, 1);
PYOPENCL_ELWISE_CONTINUE;
}
// Compute the translation vector and translation class.
${load_center("source_center", "source_box_id")}
${load_center("target_center", "target_box_id")}
int_coord_vec_t vec = get_normalized_translation_vector(
root_extent, box_levels[source_box_id], source_center, target_center);
int translation_class = get_translation_class(vec, well_sep_is_n_away);
// Ensure valid translation class.
if (translation_class == -1)
{
atomic_or(error_flag, 1);
PYOPENCL_ELWISE_CONTINUE;
}
% if translation_class_per_level:
translation_class += box_levels[source_box_id] * \
${ntranslation_classes_per_level};
% endif
translation_classes[i] = translation_class;
atomic_or(&translation_class_is_used[translation_class], 1);
""")
class _KernelInfo(Record):
    """Record returned by :meth:`TranslationClassesBuilder.get_kernel_info`.

    It is created there with a single field, *translation_class_finder*.
    """
class TranslationClassesInfo(DeviceDataRecord):
    r"""Interaction list data to help with translations that benefit from
    precomputing distance-related values.

    .. attribute:: nfrom_sep_siblings_translation_classes

        The number of distinct translation classes.

    .. attribute:: from_sep_siblings_translation_classes

        ``int32 [*]``

        A list, corresponding to *from_sep_siblings_lists* of :attr:`traversal`,
        of the translation classes of each box pair.

    .. attribute:: from_sep_siblings_translation_class_to_distance_vector

        ``coord_vec_t [nfrom_sep_siblings_translation_classes]``

        Maps translation classes in *from_sep_siblings_translation_classes*
        to distance (translation) vectors from source box center to
        target box center.

    .. attribute:: from_sep_siblings_translation_classes_level_starts

        ``int32 [nlevels + 1]``

        A list with an entry for each level giving the starting translation
        class id for that level. Translation classes are numbered contiguously
        by level.

    .. attribute:: traversal

        A :class:`boxtree.traversal.FMMTraversalInfo` object corresponding to
        the traversal that these translation classes refer to.
    """

    def __init__(self, traversal, **kwargs):
        # Everything except the traversal is stored by the base record.
        super().__init__(**kwargs)
        self.traversal = traversal

    def copy(self, **kwargs):
        """Return a copy of this record; *kwargs* override stored fields.

        The traversal is not part of the base record's fields, so it is
        carried over (or replaced via a ``traversal`` keyword) separately.
        """
        new_traversal = kwargs.pop("traversal", self.traversal)
        field_kwargs = self.get_copy_kwargs(**kwargs)
        return self.__class__(traversal=new_traversal, **field_kwargs)

    @property
    def nfrom_sep_siblings_translation_classes(self):
        # NOTE(review): len() reports the extent of the first axis only. The
        # builder in this file creates the distance array with shape
        # (dimensions, n) -- confirm that this property yields the intended
        # class count for such an array.
        distance_vectors = \
            self.from_sep_siblings_translation_class_to_distance_vector
        return len(distance_vectors)
class TranslationClassesBuilder:
    """Build translation classes for List 2 translations.

    .. automethod:: __init__
    .. automethod:: __call__
    """

    def __init__(self, context):
        """
        :arg context: the context that the translation class finder kernel
            is built for; kept as ``self.context``.
        """
        self.context = context
@memoize_method
def get_kernel_info(self, dimensions, well_sep_is_n_away,
        box_id_dtype, box_level_dtype, coord_dtype, translation_class_per_level):
    """Build the translation class finder kernel for the given configuration.

    Renders the preamble template, builds
    ``TRANSLATION_CLASS_FINDER_TEMPLATE`` for ``self.context`` and returns a
    :class:`_KernelInfo` whose *translation_class_finder* field holds the
    built kernel.

    :raises ValueError: if the number of translation classes per level
        exceeds what a 32-bit signed integer can index.
    """
    coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
    int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions)

    nclasses_per_level = self.ntranslation_classes_per_level(
        well_sep_is_n_away, dimensions)

    # Make sure translation classes can fit inside a 32 bit integer.
    if nclasses_per_level > 1 + np.iinfo(np.int32).max:
        raise ValueError("would overflow")

    # Shared by the preamble and the kernel body to subscript coord vectors.
    cvec_sub = partial(coord_vec_subscript_code, dimensions)

    preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render(
        dimensions=dimensions,
        cvec_sub=cvec_sub)

    type_aliases = (
        ("int_coord_vec_t", int_coord_vec_dtype),
        ("coord_vec_t", coord_vec_dtype),
        ("coord_t", coord_dtype),
        ("box_id_t", box_id_dtype),
        ("box_level_t", box_level_dtype),
    )
    var_values = (
        ("dimensions", dimensions),
        ("ntranslation_classes_per_level", nclasses_per_level),
        ("translation_class_per_level", translation_class_per_level),
        ("cvec_sub", cvec_sub),
    )

    finder = TRANSLATION_CLASS_FINDER_TEMPLATE.build(
        self.context,
        type_aliases=type_aliases,
        var_values=var_values,
        more_preamble=preamble)

    return _KernelInfo(translation_class_finder=finder)
@staticmethod
def ntranslation_classes_per_level(well_sep_is_n_away, dimensions):
    """Return the number of translation classes available on one level.

    This is ``(4 * well_sep_is_n_away + 3) ** dimensions``: each of the
    *dimensions* vector entries can take ``4 * well_sep_is_n_away + 3``
    distinct values.
    """
    values_per_axis = 4 * well_sep_is_n_away + 3
    return values_per_axis ** dimensions
def translation_class_to_normalized_vector(self, well_sep_is_n_away,
        dimensions, cls):
    """Return the integer translation vector encoded by class id *cls*.

    This inverts the formula in get_translation_class() defined in
    TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE: *cls* is read as a number
    with *dimensions* digits in base ``4 * well_sep_is_n_away + 3`` (least
    significant digit first), and every digit is shifted down by
    ``2 * well_sep_is_n_away + 1``.

    :returns: an ``int32`` array of length *dimensions*.
    """
    assert 0 <= cls < self.ntranslation_classes_per_level(
        well_sep_is_n_away, dimensions)

    offset = 2 * well_sep_is_n_away + 1
    radix = 4 * well_sep_is_n_away + 3

    vector = np.zeros(dimensions, dtype=np.int32)
    remainder = cls
    for iaxis in range(dimensions):
        remainder, digit = divmod(remainder, radix)
        vector[iaxis] = digit - offset

    return vector
def compute_translation_classes(self, queue, trav, tree, wait_for,
        is_translation_per_level):
    """
    Returns a tuple *evt*, *translation_class_is_used* and
    *translation_classes_lists*.

    *translation_classes_lists* has one ``int32`` entry per entry of
    ``trav.from_sep_siblings_lists``; *translation_class_is_used* has one
    ``int32`` flag per possible translation class (per level, if
    *is_translation_per_level* is true).

    :raises ValueError: if the kernel set its error flag.
    """

    # {{{ compute translation classes for list 2

    n_away = trav.well_sep_is_n_away
    ndim = tree.dimensions

    knl_info = self.get_kernel_info(
        ndim, n_away, tree.box_id_dtype,
        tree.box_level_dtype, tree.coord_dtype, is_translation_per_level)

    # One block of class ids per level when classes are kept per level.
    nclasses = self.ntranslation_classes_per_level(n_away, ndim)
    if is_translation_per_level:
        nclasses = nclasses * tree.nlevels

    translation_classes_lists = cl.array.empty(
        queue, len(trav.from_sep_siblings_lists), dtype=np.int32)
    translation_class_is_used = cl.array.zeros(
        queue, nclasses, dtype=np.int32)
    error_flag = cl.array.zeros(queue, 1, dtype=np.int32)

    kernel_args = (
        # input
        trav.from_sep_siblings_lists,
        trav.from_sep_siblings_starts,
        trav.target_or_target_parent_boxes,
        trav.ntarget_or_target_parent_boxes,
        tree.box_centers,
        tree.aligned_nboxes,
        tree.root_extent,
        tree.box_levels,
        n_away,
        # output
        translation_classes_lists,
        translation_class_is_used,
        error_flag,
    )
    evt = knl_info.translation_class_finder(
        *kernel_args, queue=queue, wait_for=wait_for)

    if error_flag.get():
        raise ValueError("could not compute translation classes")

    return (evt, translation_class_is_used, translation_classes_lists)

    # }}}
@log_process(logger, "build m2l translation classes")
def __call__(self, queue, trav, tree, wait_for=None,
        is_translation_per_level=True):
    """Returns a pair *info*, *evt* where info is a
    :class:`TranslationClassesInfo`.

    :arg queue: queue used for the kernel launch and host/device transfers.
    :arg trav: the traversal; its *well_sep_is_n_away* and List 2 data
        are used.
    :arg tree: the tree; its *dimensions*, *nlevels*, *root_extent* and
        *coord_dtype* are used here.
    :arg wait_for: forwarded to :meth:`compute_translation_classes`.
    :arg is_translation_per_level: if true, translation classes on
        different levels get different ids.

    The classes flagged as used by the kernel are renumbered contiguously
    (in increasing order of their original id), and the translation vector
    of each used class is stored in column *new id* of the distance array.

    The distance array and the level-starts array are zero-initialized:
    columns of the distance array beyond the number of used classes are
    never written, and with ``is_translation_per_level=False`` only
    entries ``0`` and ``nlevels`` of the level-starts array are written.
    """
    evt, translation_class_is_used, translation_classes_lists = \
        self.compute_translation_classes(queue, trav, tree, wait_for,
                                         is_translation_per_level)

    well_sep_is_n_away = trav.well_sep_is_n_away
    dimensions = tree.dimensions
    nlevels = tree.nlevels

    nclasses_per_level = \
        self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)

    is_used = translation_class_is_used.get()

    # -1 marks classes that never occur.
    used_translation_classes_map = np.full(len(is_used), -1, dtype=np.int32)

    # Zeros rather than np.empty: only part of these arrays is written
    # below, and uninitialized memory must not be uploaded to the device.
    distances = np.zeros((dimensions, len(is_used)), dtype=tree.coord_dtype)
    level_starts = np.zeros(nlevels + 1, dtype=np.int32)

    count = 0
    prev_level = -1

    for i, used in enumerate(is_used):
        # Class ids are laid out as level * nclasses_per_level + class.
        level, cls_without_level = divmod(i, nclasses_per_level)

        if prev_level != level:
            level_starts[level] = count
            prev_level = level

        if not used:
            continue

        used_translation_classes_map[i] = count
        unit_vector = self.translation_class_to_normalized_vector(
            well_sep_is_n_away, dimensions, cls_without_level)

        # root_extent / 2**level is the box diameter on that level, which
        # is the unit the kernel normalizes translation vectors by.
        distances[:, count] = unit_vector * tree.root_extent / (1 << level)
        count += 1

    level_starts[nlevels] = count

    # Replace raw class ids by their compacted numbers.
    translation_classes_lists = (
        cl.array.take(
            cl.array.to_device(queue, used_translation_classes_map),
            translation_classes_lists))

    distances = cl.array.to_device(queue, distances)
    from_sep_siblings_translation_classes_level_starts = cl.array.to_device(
        queue, level_starts)

    info = TranslationClassesInfo(
        traversal=trav,
        from_sep_siblings_translation_classes=translation_classes_lists,
        from_sep_siblings_translation_class_to_distance_vector=distances,
        from_sep_siblings_translation_classes_level_starts=(
            from_sep_siblings_translation_classes_level_starts),
    ).with_queue(None)

    return info, evt
# }}}
# vim: fdm=marker
from __future__ import division
"""
Traversal data structure
------------------------
.. autoclass:: FMMTraversalInfo
Build Entrypoint
----------------
.. autoclass:: FMMTraversalBuilder
.. automethod:: __call__
"""
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
......@@ -22,66 +34,50 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
from functools import partial
import numpy as np
from pytools import Record, memoize_method, memoize_in
import pyopencl as cl
import pyopencl.array # noqa
from pyopencl.elementwise import ElementwiseTemplate
from mako.template import Template
from boxtree.tools import AXIS_NAMES, DeviceDataRecord
import logging
logger = logging.getLogger(__name__)
import pyopencl as cl
import pyopencl.array
import pyopencl.cltypes
from pyopencl.elementwise import ElementwiseTemplate
from pytools import Record, memoize_method
# {{{ preamble
from boxtree.tools import (
AXIS_NAMES,
DeviceDataRecord,
coord_vec_subscript_code,
get_coord_vec_dtype,
)
TRAVERSAL_PREAMBLE_TEMPLATE = r"""//CL//
${box_flags_enum.get_c_defines()}
${box_flags_enum.get_c_typedef()}
typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
%if particle_id_dtype is not None:
typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
%endif
typedef ${dtype_to_ctype(coord_dtype)} coord_t;
typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
logger = logging.getLogger(__name__)
#define NLEVELS ${max_levels}
#define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor})
from pytools import ProcessLogger, log_process
<%def name="load_center(name, box_id)">
coord_vec_t ${name};
%for i in range(dimensions):
${name}.${AXIS_NAMES[i]} = box_centers[aligned_nboxes * ${i} + ${box_id}];
%endfor
</%def>
#define LEVEL_TO_RAD(level) \
(root_extent * 1 / (coord_t) (1 << (level + 1)))
# {{{ preamble
%if 0:
#define dbg_printf(ARGS) printf ARGS
%else:
#define dbg_printf(ARGS) /* */
%endif
# This 'walk' mechanism walks over 'child' boxes in the tree.
TRAVERSAL_PREAMBLE_MAKO_DEFS = r"""//CL:mako//
<%def name="walk_init(start_box_id)">
box_id_t box_stack[NLEVELS];
int morton_nr_stack[NLEVELS];
box_id_t walk_box_stack[NLEVELS];
int walk_morton_nr_stack[NLEVELS];
// start at root
int walk_level = 0;
box_id_t walk_box_id = ${start_box_id};
int walk_stack_size = 0;
box_id_t walk_parent_box_id = ${start_box_id};
int walk_morton_nr = 0;
bool continue_walk = true;
</%def>
<%def name="walk_reset(start_box_id)">
walk_level = 0;
walk_box_id = ${start_box_id};
walk_morton_nr = 0;
continue_walk = true;
<%def name="walk_get_box_id()">
box_id_t walk_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_parent_box_id];
</%def>
<%def name="walk_advance()">
......@@ -94,13 +90,17 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
// Ran out of children, pull the next guy off the stack
// and advance him.
continue_walk = walk_level > 0;
continue_walk = (
// Stack empty? Abort.
walk_stack_size > 0
);
if (continue_walk)
{
--walk_level;
--walk_stack_size;
dbg_printf((" ascend\n"));
walk_box_id = box_stack[walk_level];
walk_morton_nr = morton_nr_stack[walk_level];
walk_parent_box_id = walk_box_stack[walk_stack_size];
walk_morton_nr = walk_morton_nr_stack[walk_stack_size];
}
else
{
......@@ -111,87 +111,178 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t;
</%def>
<%def name="walk_push(new_box)">
box_stack[walk_level] = walk_box_id;
morton_nr_stack[walk_level] = walk_morton_nr;
++walk_level;
walk_box_stack[walk_stack_size] = walk_parent_box_id;
walk_morton_nr_stack[walk_stack_size] = walk_morton_nr;
++walk_stack_size;
%if debug:
if (walk_level >= NLEVELS)
if (walk_stack_size >= NLEVELS)
{
dbg_printf((" ** ERROR: overran levels stack\n"));
return;
}
%endif
walk_box_id = ${new_box};
walk_parent_box_id = ${new_box};
walk_morton_nr = 0;
</%def>
<%def name="load_center(name, box_id, declare=True)">
%if declare:
coord_vec_t ${name} = (coord_vec_t)(
%else:
${name} = (coord_vec_t)(
%endif
%for i in range(dimensions):
box_centers[aligned_nboxes * ${i} + ${box_id}]
%if i + 1 < dimensions:
,
%endif
%endfor
);
</%def>
<%def name="load_true_box_extent(name, box_id, kind, declare=True)">
%if declare:
coord_vec_t ${name}_ext_center, ${name}_radii_vec;
%endif
{
%for bound in ["min", "max"]:
coord_vec_t ${name}_${bound} = (coord_vec_t)(
%for iaxis in range(dimensions):
box_${kind}_bounding_box_${bound}[
${iaxis} * aligned_nboxes + ${box_id}]
%if iaxis + 1 < dimensions:
,
%endif
%endfor
);
%endfor
${name}_ext_center = ((coord_vec_t) 0.5) * (${name}_min + ${name}_max);
${name}_radii_vec = ((coord_vec_t) 0.5) * (${name}_max - ${name}_min);
}
</%def>
<%def name="check_l_infty_ball_overlap(
is_overlapping, box_id, ball_radius, ball_center)">
{
${load_center("box_center", box_id)}
int box_level = box_levels[${box_id}];
coord_t size_sum = LEVEL_TO_RAD(box_level) + ${ball_radius};
coord_t max_dist = 0;
%for i in range(dimensions):
max_dist = fmax(max_dist,
fabs(${ball_center}.s${i} - box_center.s${i}));
fabs(${cvec_sub(ball_center, i)}
- ${cvec_sub("box_center", i)}));
%endfor
${is_overlapping} = max_dist <= size_sum;
}
</%def>
"""
TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES = r"""//CL//
${box_flags_enum.get_c_defines()}
${box_flags_enum.get_c_typedef()}
typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
%if particle_id_dtype is not None:
typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
%endif
## Convert to dict first, as this may be passed as a tuple-of-tuples.
typedef ${dtype_to_ctype(coord_dtype)} coord_t;
typedef ${dtype_to_ctype(get_coord_vec_dtype(coord_dtype, dimensions))} coord_vec_t;
#define COORD_T_MACH_EPS ((coord_t) ${ repr(float(np.finfo(coord_dtype).eps)) })
#define NLEVELS ${max_levels}
#define LEVEL_TO_RAD(level) \
(root_extent * 1 / (coord_t) (1 << (level + 1)))
%if 0:
#define dbg_printf(ARGS) printf ARGS
%else:
#define dbg_printf(ARGS) /* */
%endif
#define square(x) ((x)*(x))
"""
TRAVERSAL_PREAMBLE_TEMPLATE = (
TRAVERSAL_PREAMBLE_MAKO_DEFS
+ TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES)
# }}}
# {{{ adjacency test
HELPER_FUNCTION_TEMPLATE = r"""//CL//
inline bool is_adjacent_or_overlapping(
/*
These adjacency tests check the l^\infty distance between centers to check whether
two boxes are adjacent or overlapping.
Rather than a 'small floating point number', these adjacency test routines use the
smaller of the source/target box radii as the floating point tolerance, which
calls the following configuration 'adjacent' even though it actually is not:
+---------+ +---------+
| | | |
| | | |
| o | | o<--->
| | r | r |
| |<--->| |
+---------+ +---------+
This is generically OK since one would expect the distance between the edge of
a large box and the edge of a smaller box to be a integer multiple of the
smaller box's diameter (which is twice its radius, our tolerance).
*/
inline bool is_adjacent_or_overlapping_with_neighborhood(
coord_t root_extent,
// target and source order only matter if include_stick_out is true.
coord_vec_t target_center, int target_level,
coord_vec_t source_center, int source_level,
// this is expected to be constant so that the inliner will kill the if.
const bool include_stick_out
)
coord_t target_box_neighborhood_size,
coord_vec_t source_center, int source_level)
{
// This checks if the two boxes overlap
// with an amount of 'slack' corresponding to half the
// width of the smaller of the two boxes.
// (Without the 'slack', there wouldn't be any
// overlap.)
// This checks if the source box overlaps the target box
// including a neighborhood of target_box_neighborhood_size boxes
// of the same size as the target box.
coord_t target_rad = LEVEL_TO_RAD(target_level);
coord_t source_rad = LEVEL_TO_RAD(source_level);
coord_t rad_sum = target_rad + source_rad;
coord_t rad_sum = (
(2*(target_box_neighborhood_size-1) + 1) * target_rad
+ source_rad);
coord_t slack = rad_sum + fmin(target_rad, source_rad);
if (include_stick_out)
{
slack += STICK_OUT_FACTOR * (
0
%if targets_have_extent:
+ target_rad
%endif
%if sources_have_extent:
+ source_rad
%endif
);
}
coord_t max_dist = 0;
coord_t l_inf_dist = 0;
%for i in range(dimensions):
max_dist = fmax(max_dist, fabs(target_center.s${i} - source_center.s${i}));
l_inf_dist = fmax(
l_inf_dist,
fabs(${cvec_sub("target_center", i)}
- ${cvec_sub("source_center", i)}));
%endfor
return max_dist <= slack;
return l_inf_dist <= slack;
}
inline bool is_adjacent_or_overlapping(
coord_t root_extent,
// note: order does not matter
coord_vec_t target_center, int target_level,
coord_vec_t source_center, int source_level)
{
return is_adjacent_or_overlapping_with_neighborhood(
root_extent,
target_center, target_level,
1,
source_center, source_level);
}
"""
......@@ -206,17 +297,27 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
{
box_flags_t flags = box_flags[box_id];
if (flags & BOX_HAS_OWN_SOURCES)
{ APPEND_source_boxes(box_id); }
%if source_boxes_has_mask:
if (flags & BOX_IS_SOURCE_BOX && source_boxes_mask[box_id])
{ APPEND_source_boxes(box_id); }
%else:
if (flags & BOX_IS_SOURCE_BOX)
{ APPEND_source_boxes(box_id); }
%endif
if (flags & BOX_HAS_CHILD_SOURCES)
{ APPEND_source_parent_boxes(box_id); }
%if source_parent_boxes_has_mask:
if (flags & BOX_HAS_SOURCE_CHILD_BOXES && source_parent_boxes_mask[box_id])
{ APPEND_source_parent_boxes(box_id); }
%else:
if (flags & BOX_HAS_SOURCE_CHILD_BOXES)
{ APPEND_source_parent_boxes(box_id); }
%endif
%if not sources_are_targets:
if (flags & BOX_HAS_OWN_TARGETS)
if (flags & BOX_IS_TARGET_BOX)
{ APPEND_target_boxes(box_id); }
%endif
if (flags & (BOX_HAS_CHILD_TARGETS | BOX_HAS_OWN_TARGETS))
if (flags & (BOX_HAS_TARGET_CHILD_BOXES | BOX_IS_TARGET_BOX))
{ APPEND_target_or_target_parent_boxes(box_id); }
}
"""
......@@ -238,21 +339,31 @@ LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE = ElementwiseTemplate(
// assert(i > 0);
box_id_t my_box_id = box_list[i];
box_id_t prev_box_id = box_list[i-1];
int my_level = box_levels[my_box_id];
box_id_t my_level_start = level_start_box_nrs[my_level];
if (prev_box_id < my_level_start && my_level_start <= my_box_id)
bool is_level_leading_box;
if (i == 0)
is_level_leading_box = true;
else
{
box_id_t prev_box_id = box_list[i-1];
box_id_t my_level_start = level_start_box_nrs[my_level];
is_level_leading_box = (
prev_box_id < my_level_start
&& my_level_start <= my_box_id);
}
if (is_level_leading_box)
list_level_start_box_nrs[my_level] = i;
""",
name="extract_level_start_box_nrs")
# }}}
# {{{ colleagues
# {{{ same-level non-well-separated boxes (generalization of "colleagues")
COLLEAGUES_TEMPLATE = r"""//CL//
SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
{
......@@ -260,7 +371,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
if (box_id == 0)
{
// The root has no colleagues.
// The root has no boxes on the same level, nws or not.
return;
}
......@@ -268,31 +379,34 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
dbg_printf(("box id: %d level: %d\n", box_id, level));
// To find this box's colleagues, start at the top of the tree, descend
// To find this box's same-level nws boxes, start at the top of the tree, descend
// into adjacent (or overlapping) parents.
${walk_init(0)}
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
dbg_printf((" level: %d walk box id: %d morton: %d child id: %d\n",
walk_level, walk_box_id, walk_morton_nr, child_box_id));
${walk_get_box_id()}
dbg_printf((" level: %d walk parent box id: %d morton: %d child id: %d\n",
walk_stack_size, walk_parent_box_id, walk_morton_nr, walk_box_id));
if (child_box_id)
if (walk_box_id)
{
${load_center("child_center", "child_box_id")}
${load_center("walk_center", "walk_box_id")}
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, level, child_center, box_levels[child_box_id], false);
bool a_or_o = is_adjacent_or_overlapping_with_neighborhood(
root_extent,
center, level,
${well_sep_is_n_away},
walk_center, box_levels[walk_box_id]);
if (a_or_o)
{
// child_box_id lives on walk_level+1.
if (walk_level+1 == level && child_box_id != box_id)
// walk_box_id lives on level walk_stack_size+1.
if (walk_stack_size+1 == level && walk_box_id != box_id)
{
dbg_printf((" colleague\n"));
APPEND_colleagues(child_box_id);
dbg_printf((" found same-lev nws\n"));
APPEND_same_level_non_well_sep_boxes(walk_box_id);
}
else
{
......@@ -300,7 +414,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
// on the stack.
dbg_printf((" descend\n"));
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
......@@ -321,7 +435,7 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t box_id)
# {{{ neighbor source boxes ("list 1")
NEIGBHOR_SOURCE_BOXES_TEMPLATE = r"""//CL//
NEIGHBOR_SOURCE_BOXES_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
{
......@@ -342,50 +456,51 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
{
box_flags_t root_flags = box_flags[0];
if (root_flags & BOX_HAS_OWN_SOURCES)
if (root_flags & BOX_IS_SOURCE_BOX)
{
APPEND_neighbor_source_boxes(0);
}
}
// To find this box's colleagues, start at the top of the tree, descend
// To find this box's adjacent boxes, start at the top of the tree, descend
// into adjacent (or overlapping) parents.
${walk_init(0)}
while (continue_walk)
{
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
dbg_printf((" walk box id: %d morton: %d child id: %d level: %d\n",
walk_box_id, walk_morton_nr, child_box_id, walk_level));
dbg_printf((" walk parent box id: %d morton: %d child id: %d level: %d\n",
walk_parent_box_id, walk_morton_nr, walk_box_id, walk_stack_size));
if (child_box_id)
if (walk_box_id)
{
${load_center("child_center", "child_box_id")}
${load_center("walk_center", "walk_box_id")}
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, level, child_center, box_levels[child_box_id], false);
bool a_or_o = is_adjacent_or_overlapping(
root_extent,
center, level,
walk_center, box_levels[walk_box_id]);
if (a_or_o)
{
box_flags_t flags = box_flags[child_box_id];
/* child_box_id == box_id is ok */
if (flags & BOX_HAS_OWN_SOURCES)
box_flags_t flags = box_flags[walk_box_id];
/* walk_box_id == box_id is ok */
if (flags & BOX_IS_SOURCE_BOX)
{
dbg_printf((" neighbor source box\n"));
APPEND_neighbor_source_boxes(child_box_id);
APPEND_neighbor_source_boxes(walk_box_id);
}
if (flags & BOX_HAS_CHILD_SOURCES)
if (flags & BOX_HAS_SOURCE_CHILD_BOXES)
{
// We want to descend into this box. Put the current state
// on the stack.
dbg_printf((" descend\n"));
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
......@@ -404,9 +519,9 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
# }}}
# {{{ well-separated siblings ("list 2")
# {{{ from well-separated siblings ("list 2")
SEP_SIBLINGS_TEMPLATE = r"""//CL//
FROM_SEP_SIBLINGS_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
{
......@@ -420,27 +535,33 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
if (parent == box_id)
return;
box_id_t parent_coll_start = colleagues_starts[parent];
box_id_t parent_coll_stop = colleagues_starts[parent+1];
box_id_t parent_slnf_start = same_level_non_well_sep_boxes_starts[parent];
box_id_t parent_slnf_stop = same_level_non_well_sep_boxes_starts[parent+1];
// /!\ i is not a box_id, it's an index into colleagues_list.
for (box_id_t i = parent_coll_start; i < parent_coll_stop; ++i)
// /!\ i is not a box_id, it's an index into same_level_non_well_sep_boxes_list.
for (box_id_t i = parent_slnf_start; i < parent_slnf_stop; ++i)
{
box_id_t parent_colleague = colleagues_list[i];
box_id_t parent_nf = same_level_non_well_sep_boxes_lists[i];
for (int morton_nr = 0; morton_nr < ${2**dimensions}; ++morton_nr)
{
box_id_t sib_box_id = box_child_ids[
morton_nr * aligned_nboxes + parent_colleague];
morton_nr * aligned_nboxes + parent_nf];
if (sib_box_id == 0)
continue;
${load_center("sib_center", "sib_box_id")}
bool sep = !is_adjacent_or_overlapping(root_extent,
center, level, sib_center, box_levels[sib_box_id], false);
bool sep = !is_adjacent_or_overlapping_with_neighborhood(
root_extent,
center, level,
${well_sep_is_n_away},
sib_center, box_levels[sib_box_id]);
if (sep)
{
APPEND_sep_siblings(sib_box_id);
APPEND_from_sep_siblings(sib_box_id);
}
}
}
......@@ -449,117 +570,272 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
# }}}
# {{{ separated smaller ("list 3")
# {{{ from separated smaller ("list 3")
SEP_SMALLER_TEMPLATE = r"""//CL//
FROM_SEP_SMALLER_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
{
// /!\ target_box_number is *not* a box_id, despite the type.
// It's the number of the target box we're currently processing.
box_id_t box_id = target_boxes[target_box_number];
box_id_t tgt_box_id = target_boxes[target_box_number];
${load_center("center", "box_id")}
${load_center("tgt_center", "tgt_box_id")}
int level = box_levels[box_id];
int tgt_level = box_levels[tgt_box_id];
%if targets_have_extent:
%if from_sep_smaller_crit in ["static_linf", "static_l2"]:
coord_t tgt_stickout_l_inf_rad =
(1 + stick_out_factor) * LEVEL_TO_RAD(tgt_level);
box_id_t coll_start = colleagues_starts[box_id];
box_id_t coll_stop = colleagues_starts[box_id+1];
%elif from_sep_smaller_crit == "precise_linf":
${load_true_box_extent("tgt", "tgt_box_id", "target")}
// defines tgt_ext_center, tgt_radii_vec
// /!\ i is not a box_id, it's an index into colleagues_list.
for (box_id_t i = coll_start; i < coll_stop; ++i)
%endif
%endif
box_id_t slnws_start = same_level_non_well_sep_boxes_starts[tgt_box_id];
box_id_t slnws_stop = same_level_non_well_sep_boxes_starts[tgt_box_id+1];
// /!\ i is not a box_id, it's an index into same_level_non_well_sep_boxes_lists.
for (box_id_t i = slnws_start; i < slnws_stop; ++i)
{
box_id_t colleague = colleagues_list[i];
box_id_t same_lev_nws_box = same_level_non_well_sep_boxes_lists[i];
if (same_lev_nws_box == tgt_box_id)
continue;
${walk_init("colleague")}
// Colleagues (same-level NWS boxes) for 1-away are always adjacent, so
// we always want to descend into them. For 2-away, we may already
// satisfy the criteria for being in list 3 and therefore may never
// need to descend. Hence include the start box in the search here
// if we're in the two-or-more-away case.
${walk_init("same_lev_nws_box")}
while (continue_walk)
{
// Loop invariant: walk_box_id is, at first, always adjacent to box_id.
// This is true at the first level because colleagues are by adjacent
// Loop invariant:
// walk_parent_box_id is, at first, always adjacent to tgt_box_id.
//
// This is true at the first level because colleagues are adjacent
// by definition, and is kept true throughout the walk by only descending
// into adjacent boxes.
//
// As we descend, we may find a child of an adjacent box that is
// non-adjacent to box_id.
// non-adjacent to tgt_box_id.
//
// If neither sources nor targets have extent, then that
// nonadjacent child box is added to box_id's sep_smaller ("list 3
// far") and that's it.
// nonadjacent child box is added to tgt_box_id's from_sep_smaller
// ("list 3far") and that's it.
//
// If they have extent, then while they may be separated, the
// intersection of box_id's and the child box's stick-out region
// intersection of tgt_box_id's and the child box's stick-out region
// may be non-empty, and we thus need to add that child to
// sep_close_smaller ("list 3 close") for the interaction to be
// from_sep_close_smaller ("list 3 close") for the interaction to be
// done by direct evaluation. We also need to descend into that
// child.
box_id_t child_box_id = box_child_ids[
walk_morton_nr * aligned_nboxes + walk_box_id];
${walk_get_box_id()}
dbg_printf((" walk box id: %d morton: %d child id: %d\n",
walk_box_id, walk_morton_nr, child_box_id));
dbg_printf((" walk parent box id: %d morton: %d child id: %d\n",
walk_parent_box_id, walk_morton_nr, walk_box_id));
box_flags_t child_box_flags = box_flags[child_box_id];
box_flags_t child_box_flags = box_flags[walk_box_id];
if (child_box_id &&
if (walk_box_id &&
(child_box_flags &
(BOX_HAS_OWN_SOURCES | BOX_HAS_CHILD_SOURCES)))
(BOX_IS_SOURCE_BOX | BOX_HAS_SOURCE_CHILD_BOXES)))
{
${load_center("child_center", "child_box_id")}
${load_center("walk_center", "walk_box_id")}
int walk_level = box_levels[walk_box_id];
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, level, child_center, box_levels[child_box_id], false);
bool in_list_1 = is_adjacent_or_overlapping(root_extent,
tgt_center, tgt_level, walk_center, walk_level);
if (a_or_o)
if (in_list_1)
{
if (child_box_flags & BOX_HAS_CHILD_SOURCES)
if (child_box_flags & BOX_HAS_SOURCE_CHILD_BOXES)
{
// We want to descend into this box. Put the current state
// on the stack.
${walk_push("child_box_id")}
continue;
if (walk_level <= from_sep_smaller_source_level
|| from_sep_smaller_source_level == -1)
{
${walk_push("walk_box_id")}
continue;
}
// otherwise there's no point to descending further.
}
}
else
{
%if sources_have_extent or targets_have_extent:
const bool a_or_o_with_stick_out =
is_adjacent_or_overlapping(root_extent,
center, level, child_center,
box_levels[child_box_id], true);
bool meets_sep_crit;
<% assert not sources_have_extent %>
%if not targets_have_extent:
meets_sep_crit = true;
%elif from_sep_smaller_crit == "static_linf":
{
coord_t source_rad = LEVEL_TO_RAD(walk_level);
// l^infty distance between source box and target box.
// Negative indicates overlap.
coord_t l_inf_dist = 0;
%for i in range(dimensions):
l_inf_dist = fmax(
l_inf_dist,
fabs(${cvec_sub("tgt_center", i)}
- ${cvec_sub("walk_center", i)})
- tgt_stickout_l_inf_rad
- source_rad);
%endfor
meets_sep_crit = l_inf_dist >=
(2 - 8 * COORD_T_MACH_EPS) * source_rad;
}
%elif from_sep_smaller_crit == "precise_linf":
{
coord_t source_rad = LEVEL_TO_RAD(walk_level);
// l^infty distance between source box and target box.
// Negative indicates overlap.
coord_t l_inf_dist = 0;
%for i in range(dimensions):
l_inf_dist = fmax(
l_inf_dist,
fabs(
${cvec_sub("tgt_ext_center", i)}
- ${cvec_sub("walk_center", i)}
)
- ${cvec_sub("tgt_radii_vec", i)}
- source_rad);
%endfor
meets_sep_crit = l_inf_dist >=
(2 - 8 * COORD_T_MACH_EPS) * source_rad;
}
%elif from_sep_smaller_crit == "static_l2":
{
coord_t source_l_inf_rad = LEVEL_TO_RAD(walk_level);
// l^2 distance between source box and target centers.
coord_t l_2_squared_center_dist =
0
%for i in range(dimensions):
+ square(
${cvec_sub("tgt_center", i)}
- ${cvec_sub("walk_center", i)})
%endfor
;
<% assert not sources_have_extent %>
// We're considering convergence of a multipole
// in the (square) source box at all locations
// in the (round) target box. We need
// src_box_l2_radius
// / d_2(src_box_center, tgt_box) <= sqrt(d)/3
// <=>
// src_box_linf_radius * sqrt(d)
// / d_2(src_box_center, tgt_box) <= sqrt(d)/3
// <=>
// 3 * src_box_linf_radius
// <= d_2(src_box_center, tgt_box)
// <=>
// 3 * src_box_linf_radius
// <= d_2(src_box_center, tgt_box_center)
// - sqrt(d) * tgt_stickout_l_inf_rad
// <=> (because why not)
// 2 * src_box_linf_radius
// <= d_2(src_box_center, tgt_box_center)
// - sqrt(d) * tgt_stickout_l_inf_rad
// - src_box_linf_radius
coord_t rhs =
sqrt(l_2_squared_center_dist)
- sqrt((coord_t) (${dimensions}))
* tgt_stickout_l_inf_rad
- source_l_inf_rad;
meets_sep_crit = (
(2 - 8 * COORD_T_MACH_EPS) * source_l_inf_rad
<= rhs);
}
%else:
const bool a_or_o_with_stick_out = false;
<% raise ValueError(
"unknown value of from_sep_smaller_crit: %s"
% from_sep_smaller_crit) %>
%endif
// We're no longer *immediately* adjacent to our target
// box, but our stick-out regions might still have a
// non-empty intersection.
if (!a_or_o_with_stick_out)
// If the number of particles in this box is below the
// source count threshold, it can be moved to a "close" list.
// This is a performance optimization.
<% close_lists_exist = \
sources_have_extent or targets_have_extent %>
bool close_lists_exist = ${ str(close_lists_exist).lower() };
bool force_close_list_for_low_interaction_count =
%if close_lists_exist:
close_lists_exist &&
(box_source_counts_cumul[walk_box_id]
< from_sep_smaller_min_nsources_cumul);
%else:
false;
%endif
if (meets_sep_crit &&
!force_close_list_for_low_interaction_count)
{
APPEND_sep_smaller(child_box_id);
if (from_sep_smaller_source_level == walk_level)
APPEND_from_sep_smaller(walk_box_id);
}
else
{
%if sources_have_extent or targets_have_extent:
if (child_box_flags & BOX_HAS_OWN_SOURCES)
{
APPEND_sep_close_smaller(child_box_id);
}
if (child_box_flags & BOX_HAS_CHILD_SOURCES)
// from_sep_smaller_source_level == -1 means "only build
// list 3 close", with sources on any level.
// This kernel will be run once per source level to
// generate per-level list 3, and once
// (not per level) to generate list 3 close.
if (
(child_box_flags & BOX_IS_SOURCE_BOX)
&& (from_sep_smaller_source_level == -1))
APPEND_from_sep_close_smaller(walk_box_id);
if (child_box_flags & BOX_HAS_SOURCE_CHILD_BOXES)
{
${walk_push("child_box_id")}
${walk_push("walk_box_id")}
continue;
}
%endif
}
}
}
${walk_advance()}
}
}
......@@ -568,167 +844,266 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t target_box_number)
# }}}
# {{{ separated bigger ("list 4")
# {{{ from separated bigger ("list 4")
# "Normal" case: Sources/targets without extent
# ---------------------------------------------
#
# List 4 interactions for box "B" are about a parent P's colleague A not
# adjacent to B.
#
# -------|----------|----------|
# Case | 1 | 2 |
# | adj to A | adj to A |
# -------|----------|----------|
# | | |
# A---P | X ! | X ! |
# | | | |
# o | X | X |
# | | | |
# o | X | X |
# | | | |
# o | X | O |
# | | | |
# B | O ! | O ! |
# List 4 consists of source boxes that 'missed the boat' on entering the downward
# propagation through list 2. That is, they are non-well-separated from the
# target box itself or a box in its chain of parents. In addition, they are
# not adjacent to the target box and have the same size or are bigger.
#
# Note that once a parent is no longer adjacent, its children won't be either.
# To be in list 4, a box must have its own sources. In the no-extents case,
# this will happen only if that box is a leaf, but for the with-extents case,
# any box can have sources.
#
# (X: yes, O:no, exclamation marks denote that this *must* be the case. Entries
# without exclamation mark are choices for this case)
# (Yes, you read that right--same-level non-well separated boxes *can* be in
# list 4, although only for 2+-away. They *could* also use list 3, but that
# would be less efficient because it would not make use of the downward
# propagation.)
#
# Case 1: A->B interaction enters the downward propagation at B, i.e. A is in
# B's "sep_bigger". (list 4)
#
# Case 2: A->B interaction entered the downward propagation at B's parent, i.e.
# A is not in B's "sep_bigger". (list 4)
# Sources/targets with extent
# ---------------------------
# For a box not well-separated from the target box or one of its parents, we
# check whether the box is adjacent to our target box (in its list 1). If so,
# we don't need to consider it (because the interaction to this box will be
# mediated by list 1).
#
# List 4 interactions for box "B" are about a parent P's colleague A not
# adjacent to B.
# Case I: Neither sources nor targets have extent
#
# -------|----------|----------|----------|
# Case | 1 | 2 | 3 |
# | so adj | so adj | so adj |
# -------|----------|----------|----------|
# | | | |
# A---P | X! X! | X! X! | X! X! |
# | | | | |
# o | X ? | X ? | X ? |
# | | | | |
# o | X ? | X ? | X ? |
# | | | | |
# o | X ? | X ? | O O |
# | | | | |
# B | X O! | O O! | O O! |
# In this case and once non-membership in list 1 has been verified, list 4
# membership is simply a matter of deciding whether the source box's
# contribution should enter the downward propagation at this target box or
# whether it has already entered it at a parent of the target box.
#
# "so": adjacent or overlapping when stick-out is taken into account (to A)
# "adj": adjacent to A without stick-out
# It suffices to check this for the immediate parent because the check has to
# be monotone: Child boxes are subsets of parent boxes, and therefore any
# minimum distance requirement satisfied by the parent will also be satisfied
# by the child. Thus, if the source box is in the target box's parent's list 4,
# then it entered downward propagation with it or another ancestor.
#
# Note that once a parent is no longer "adj" or "so", its children won't be
# either. Also note that "adj" => "so". (And thereby "not so" => "not adj".)
# Case II: Sources or targets have extent
#
# (X: yes, O:no, ?: doesn't matter, exclamation marks denote that this *must*
# be the case. Entries without exclamation mark are choices for this case)
# The with-extents case is conceptually similar to the no-extents case, however
# there is an extra 'separation requirement' based on the extents that, if not
# satisfied, may prevent a source box from entering the downward propagation
# at a given box. If we once again assume monotonicity of this 'separation
# requirement' check, then simply verifying whether or not the interaction from
# the source box would be *allowed* to enter the downward propagation at the
# parent suffices to determine whether the target box may be responsible for
# entering the source interaction into the downward propagation.
#
# Case 1: A->B interaction must be processed by direct eval because of "so",
# i.e. it is in B's "sep_close_bigger".
#
# Case 2: A->B interaction enters the downward propagation at B,
# i.e. it is in B's "sep_bigger".
#
# Case 3: A->B interaction enters the downward propagation at B's parent,
# i.e. A is not in B's "sep*bigger"
# In cases where the source box is not yet part of the downward propagation
# received from the parent and also not eligible for entering downward
# propagation at this box (noting that this can only happen in the with-extents
# case), the interaction is added to the (non-downward-propagating) 'list 4
# close' (from_sep_close_bigger).
FROM_SEP_BIGGER_TEMPLATE = r"""//CL//
inline bool meets_sep_bigger_criterion(
    coord_t root_extent,
    coord_vec_t target_center, int target_level,
    coord_vec_t source_center, int source_level,
    coord_t stick_out_factor)
{
    <% assert not sources_have_extent %>

    /*
     * Returns true when the source box is separated far enough from the
     * target box, with the target box enlarged by its stick-out.
     *
     * The goal is to guarantee
     *
     *   (*)  d_2(src_box, tgt_center)
     *          >= 3 * (radius of tgt box, potentially including stick-out)
     *
     * (convergence factors are in l^2, irrespective of how we measure).
     * Since d_2(a, b) >= d_inf(a, b), establishing (*) with d_inf also
     * establishes it with d_2, so only l^infty distances are used below.
     *
     * NOTE(review): root_extent is not referenced by name in this body;
     * presumably the LEVEL_TO_RAD macro picks it up -- confirm before
     * renaming or removing the parameter.
     */
    const coord_t tgt_rad = LEVEL_TO_RAD(target_level);
    const coord_t src_rad = LEVEL_TO_RAD(source_level);

    // Smallest center-to-center l^infty distance that satisfies (*):
    // three (stick-out-enlarged) target radii, measured from the edge of
    // the source box rather than from its center.
    const coord_t min_center_l_inf_dist =
        3 * (1 + stick_out_factor) * tgt_rad + src_rad;

    // l^infty distance between the two box centers, accumulated axis by axis.
    coord_t center_l_inf_dist = 0;
    %for iaxis in range(dimensions):
        center_l_inf_dist = fmax(
            center_l_inf_dist,
            fabs(${cvec_sub("target_center", iaxis)}
                - ${cvec_sub("source_center", iaxis)}));
    %endfor

    // The threshold is shrunk by 8 * COORD_T_MACH_EPS (relative),
    // presumably so that rounding error does not reject boxes that sit
    // exactly on the threshold.
    return center_l_inf_dist
        >= min_center_l_inf_dist * (1 - 8 * COORD_T_MACH_EPS);
}
SEP_BIGGER_TEMPLATE = r"""//CL//
void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
{
box_id_t tgt_ibox = target_or_target_parent_boxes[itarget_or_target_parent_box];
${load_center("center", "tgt_ibox")}
${load_center("tgt_box_center", "tgt_ibox")}
int box_level = box_levels[tgt_ibox];
int tgt_box_level = box_levels[tgt_ibox];
// The root box has no parents, so no list 4.
if (box_level == 0)
if (tgt_box_level == 0)
return;
box_id_t parent_box_id = box_parent_ids[tgt_ibox];
${load_center("parent_center", "parent_box_id")}
box_id_t current_parent_box_id = parent_box_id;
int walk_level = box_level - 1;
box_id_t tgt_parent_box_id = box_parent_ids[tgt_ibox];
const int tgt_parent_level = tgt_box_level - 1;
${load_center("parent_center", "tgt_parent_box_id")}
box_flags_t tgt_box_flags = box_flags[tgt_ibox];
// Look for colleagues of parents that are non-adjacent to tgt_ibox.
// Walk up the tree from tgt_ibox.
%if well_sep_is_n_away == 1:
// In a 1-away FMM, tgt_ibox's colleagues are by default uninteresting
// (i.e. not in list 4) because they're adjacent. So in this case, we
// may directly jump to the parent level.
int walk_level = tgt_box_level - 1;
box_id_t current_tgt_parent_box_id = tgt_parent_box_id;
%else:
// In a 2+-away FMM, tgt_ibox's same-level non-well-separated boxes *may*
// be sufficiently separated from tgt_ibox to be in its list 4.
int walk_level = tgt_box_level;
box_id_t current_tgt_parent_box_id = tgt_ibox;
%endif
/*
Look for same-level non-well-separated boxes of parents that are
non-adjacent to tgt_ibox.
Walk up the tree from tgt_ibox.
// Box 0 (== level 0) doesn't have any colleagues, so we can stop the
// search for such colleagues there.
for (int walk_level = box_level - 1; walk_level != 0;
Box 0 (== level 0) doesn't have any slnws boxes, so we can stop the
search for such slnws boxes there.
*/
for (; walk_level != 0;
// {{{ advance
--walk_level,
current_parent_box_id = box_parent_ids[current_parent_box_id]
current_tgt_parent_box_id = box_parent_ids[current_tgt_parent_box_id]
// }}}
)
{
box_id_t coll_start = colleagues_starts[current_parent_box_id];
box_id_t coll_stop = colleagues_starts[current_parent_box_id+1];
// /!\ i is not a box id, it's an index into colleagues_list.
for (box_id_t i = coll_start; i < coll_stop; ++i)
box_id_t slnws_start =
same_level_non_well_sep_boxes_starts[current_tgt_parent_box_id];
box_id_t slnws_stop =
same_level_non_well_sep_boxes_starts[current_tgt_parent_box_id+1];
// /!\ i is not a box id, it's an index into
// same_level_non_well_sep_boxes_lists.
for (box_id_t i = slnws_start; i < slnws_stop; ++i)
{
box_id_t colleague_box_id = colleagues_list[i];
box_id_t slnws_box_id = same_level_non_well_sep_boxes_lists[i];
if (box_flags[colleague_box_id] & BOX_HAS_OWN_SOURCES)
if (box_flags[slnws_box_id] & BOX_IS_SOURCE_BOX)
{
${load_center("colleague_center", "colleague_box_id")}
bool a_or_o = is_adjacent_or_overlapping(root_extent,
center, box_level, colleague_center, walk_level, false);
${load_center("slnws_center", "slnws_box_id")}
if (!a_or_o)
{
// Found one.
bool in_list_1 = is_adjacent_or_overlapping(root_extent,
tgt_box_center, tgt_box_level,
slnws_center, walk_level);
if (!in_list_1)
{
%if sources_have_extent or targets_have_extent:
const bool a_or_o_with_stick_out =
is_adjacent_or_overlapping(root_extent,
center, box_level, colleague_center,
walk_level, true);
if (a_or_o_with_stick_out)
/*
With-extent list 4 separation criterion.
Needs to be monotone. (see main comment narrative
above for what that means) If you change this, also
change the equivalent check for the parent, below.
*/
const bool tgt_meets_with_ext_sep_criterion =
meets_sep_bigger_criterion(root_extent,
tgt_box_center, tgt_box_level,
slnws_center, walk_level,
stick_out_factor);
if (!tgt_meets_with_ext_sep_criterion)
{
// "Case 1" above: colleague_box_id is too close and
// overlaps our stick_out region. We're obliged to do
// the interaction directly.
/*
slnws_box_id failed the separation criterion (i.e. is
too close to the target box) for list 4 proper. Stick
it in list 4 close.
*/
if (tgt_box_flags & BOX_HAS_OWN_TARGETS)
if (tgt_box_flags & BOX_IS_TARGET_BOX)
{
APPEND_sep_close_bigger(colleague_box_id);
APPEND_from_sep_close_bigger(slnws_box_id);
}
}
else
%endif
{
bool parent_a_or_o_with_stick_out =
bool in_parent_list_1 =
is_adjacent_or_overlapping(root_extent,
parent_center, box_level-1, colleague_center,
walk_level, true);
if (parent_a_or_o_with_stick_out)
parent_center, tgt_parent_level,
slnws_center, walk_level);
bool would_be_in_parent_list_4_not_considering_stickout = (
!in_parent_list_1
%if well_sep_is_n_away > 1:
/*
From-sep-bigger boxes can only be in the
parent's from-sep-bigger list if they're
actually bigger (or equal) to the parent
box size.
For 1-away, that's guaranteed at this
point, because we only start ascending the
tree at the parent's level, so any box we
find here is naturally big enough. For
2-away, we start looking at the target
box's level, so slnws_box_id may actually
be too small (at too deep a level) to be in
the parent's from-sep-bigger list.
*/
&& walk_level < tgt_box_level
%endif
);
if (would_be_in_parent_list_4_not_considering_stickout)
{
// "Case 2" above: We're the first box down the chain
// to be far enough away to let the interaction into
// our local downward subtree.
APPEND_sep_bigger(colleague_box_id);
/*
Our immediate parent box was already far enough
away to (hypothetically) let the interaction into
its downward propagation--so this happened either
there or at a more distant ancestor. We'll get the
interaction that way. Nothing to do, unless the box
was too close to the parent and ended up in the
parent's from_sep_close_bigger. If that's the case,
we'll simply let it enter the downward propagation
here.
With-extent list 4 separation criterion.
Needs to be monotone. (see main comment narrative
above for what that means) If you change this, also
change the equivalent check for the target box, above.
*/
%if sources_have_extent or targets_have_extent:
const bool parent_meets_with_ext_sep_criterion =
meets_sep_bigger_criterion(root_extent,
parent_center, tgt_parent_level,
slnws_center, walk_level,
stick_out_factor);
if (!parent_meets_with_ext_sep_criterion)
{
APPEND_from_sep_bigger(slnws_box_id);
}
%endif
}
else
{
// "Case 2" above: A parent box was already far
// enough away to let the interaction into its
// local downward subtree. We'll get the interaction
// that way. Nothing to do.
/*
We're the first box down the chain to be far enough
away to let the interaction into our local downward
propagation.
*/
APPEND_from_sep_bigger(slnws_box_id);
}
}
}
......@@ -741,18 +1116,201 @@ void generate(LIST_ARG_DECL USER_ARG_DECL box_id_t itarget_or_target_parent_box)
# }}}
# {{{ list merger
# Elementwise kernel template ("merge_lists") that concatenates several
# CSR-style lists (each given as a starts array plus a lists array) into one.
#
# Template variables:
#   nlists        number of input lists to concatenate
#   write_counts  if true, generate the counting variant of the kernel;
#                 otherwise generate the copying variant
#
# box_id_t is left unresolved here; it has to be supplied as a type alias
# when the template is built.
#
# For every output box index i, output_to_input_box[i] gives the index used
# to look up that box's segment in each of the input starts arrays.
LIST_MERGER_TEMPLATE = ElementwiseTemplate(
# Kernel arguments. The per-list lists arrays and new_starts are only part
# of the signature in the copying variant; the output is new_lists in the
# copying variant and new_counts in the counting variant.
arguments=r"""//CL:mako//
/* input: */
box_id_t *output_to_input_box,
%for ilist in range(nlists):
box_id_t *list${ilist}_starts,
%endfor
%if not write_counts:
%for ilist in range(nlists):
const box_id_t *list${ilist}_lists,
%endfor
const box_id_t *new_starts,
%endif
/* output: */
%if not write_counts:
box_id_t *new_lists,
%else:
box_id_t *new_counts,
%endif
""",
# Per-work-item body.
#
# Counting variant: stores the summed segment length of all input lists for
# output box i into new_counts[i + 1]. new_counts[0] is set to zero by the
# work item for output box 0 only.
#
# Copying variant: starting at new_starts[i], copies the segments of
# list0, list1, ... (in that order) into new_lists.
operation=r"""//CL:mako//
/* Compute output and input indices. */
const box_id_t ioutput_box = i;
const box_id_t ibox = output_to_input_box[ioutput_box];
/* Count the size of the input at the current index. */
%for ilist in range(nlists):
const box_id_t list${ilist}_start = list${ilist}_starts[ibox];
const box_id_t list${ilist}_count =
list${ilist}_starts[ibox + 1] - list${ilist}_start;
%endfor
/* Update the counts or copy the elements. */
%if write_counts:
if (ioutput_box == 0)
new_counts[0] = 0;
new_counts[ioutput_box + 1] =
%for ilist in range(nlists):
+ list${ilist}_count
%endfor
;
%else:
box_id_t cur_idx = new_starts[ioutput_box];
%for ilist in range(nlists):
for (box_id_t j = 0; j < list${ilist}_count; ++j)
{
new_lists[cur_idx++] =
list${ilist}_lists[list${ilist}_start + j];
}
%endfor
%endif
""",
name="merge_lists")
class _IndexStyle:
TARGET_BOXES = 0
TARGET_OR_TARGET_PARENT_BOXES = 1
class _ListMerger:
    """Utility class for combining box lists optionally changing indexing style.

    Several CSR-style lists (each a *starts*/*lists* pair of arrays) are
    concatenated, box by box, into a single CSR-style list. The work is done
    by two kernels built from ``LIST_MERGER_TEMPLATE``: one that counts the
    merged length per output box and one that copies the entries.
    """

    def __init__(self, context, box_id_dtype):
        """
        :arg context: the context the merge kernels are built for
        :arg box_id_dtype: dtype substituted for ``box_id_t`` in the kernels
            and used for all arrays allocated by :meth:`__call__`
        """
        self.context = context
        self.box_id_dtype = box_id_dtype

    @memoize_method
    def get_list_merger_kernel(self, nlists, write_counts):
        """
        :arg nlists: Number of input lists
        :arg write_counts: A :class:`bool`, indicating whether to generate a
            kernel that produces box counts or box lists
        """
        assert nlists >= 1

        return LIST_MERGER_TEMPLATE.build(
            self.context,
            type_aliases=(
                ("box_id_t", self.box_id_dtype),
            ),
            var_values=(
                ("nlists", nlists),
                ("write_counts", write_counts),
            ))

    def __call__(self, queue, input_starts, input_lists, input_index_style,
                 output_index_style, target_boxes, target_or_target_parent_boxes,
                 nboxes, debug=False, wait_for=None):
        """
        :arg queue: queue on which all arrays are allocated and all kernels
            are enqueued
        :arg input_starts: Starts arrays of input
        :arg input_lists: Lists arrays of input
        :arg input_index_style: A :class:`_IndexStyle`
        :arg output_index_style: A :class:`_IndexStyle`
        :arg target_boxes: array of target boxes; its length is the number of
            output boxes when *output_index_style* is
            ``_IndexStyle.TARGET_BOXES``
        :arg target_or_target_parent_boxes: array of target or target parent
            boxes; its length is the number of output boxes otherwise
        :arg nboxes: size of the reverse lookup table built when the index
            style is converted
        :arg debug: accepted for interface compatibility; not used by this
            implementation
        :arg wait_for: events the counting kernel waits for (*None* means
            no events)
        :returns: A pair *results_dict, event*, where *results_dict*
            contains entries *starts* and *lists*
        :raises ValueError: when asked to merge lists indexed by target boxes
            into a list indexed by target or target parent boxes
        """
        if wait_for is None:
            wait_for = []

        if (
                output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
                and input_index_style == _IndexStyle.TARGET_BOXES):
            raise ValueError(
                "unsupported: merging a list indexed by target boxes "
                "into a list indexed by target or target parent boxes")

        ntarget_boxes = len(target_boxes)
        ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes)

        noutput_boxes = (ntarget_boxes
                         if output_index_style == _IndexStyle.TARGET_BOXES
                         else ntarget_or_ntarget_parent_boxes)

        if (
                input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
                and output_index_style == _IndexStyle.TARGET_BOXES):
            # The input is indexed by target-or-target-parent box, the output
            # by target box: build the table mapping each output (target box)
            # index to the matching input index.
            from boxtree.tools import reverse_index_array

            target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
                target_or_target_parent_boxes, target_size=nboxes,
                queue=queue)
            target_or_target_parent_boxes_from_target_boxes = cl.array.take(
                target_or_target_parent_boxes_from_all_boxes,
                target_boxes, queue=queue)

            output_to_input_box = target_or_target_parent_boxes_from_target_boxes
        else:
            # Same index style on both sides: identity mapping.
            output_to_input_box = cl.array.arange(
                queue, noutput_boxes, dtype=self.box_id_dtype)

        # Zero-initialize instead of leaving the array uninitialized: entry 0
        # is otherwise written only by the work item for output box 0, so with
        # no output boxes it would hold garbage that then determines the size
        # of the merged list below.
        new_counts = cl.array.zeros(queue, noutput_boxes+1, self.box_id_dtype)

        assert len(input_starts) == len(input_lists)
        nlists = len(input_starts)

        # Pass 1: merged list length per output box, stored at index box+1.
        evt = self.get_list_merger_kernel(nlists, True)(
            # input:
            output_to_input_box,
            *input_starts,
            # output:
            new_counts,
            range=slice(noutput_boxes),
            queue=queue,
            wait_for=wait_for)

        # Prefix sum turns the counts into CSR starts; the last entry is the
        # total number of merged entries.
        new_starts = cl.array.cumsum(new_counts)
        del new_counts

        new_lists = cl.array.empty(
            queue,
            int(new_starts[-1].get()),
            self.box_id_dtype)

        # Conspicuous sentinel; every entry should be overwritten by the copy
        # kernel below.
        new_lists.fill(999999999)

        # Pass 2: copy the entries of every input list into place.
        evt = self.get_list_merger_kernel(nlists, False)(
            # input:
            output_to_input_box,
            *input_starts,
            *input_lists,
            new_starts,
            # output:
            new_lists,
            range=slice(noutput_boxes),
            queue=queue,
            wait_for=[evt])

        return {"starts": new_starts, "lists": new_lists}, evt
# }}}
# {{{ traversal info (output)
class FMMTraversalInfo(DeviceDataRecord):
"""Interaction lists needed for a fast-multipole-like linear-time gather of
r"""Interaction lists needed for a fast-multipole-like linear-time gather of
particle interactions.
Terminology follows this article:
Terminology (largely) follows this article:
Carrier, J., Greengard, L. and Rokhlin, V. "A Fast
Adaptive Multipole Algorithm for Particle Simulations." SIAM Journal on
Scientific and Statistical Computing 9, no. 4 (July 1988): 669-686.
`DOI: 10.1137/0909044 <http://dx.doi.org/10.1137/0909044>`_.
`DOI: 10.1137/0909044 <https://dx.doi.org/10.1137/0909044>`__.
Unless otherwise indicated, all bulk data in this data structure is stored
in a :class:`pyopencl.array.Array`. See also :meth:`get`.
......@@ -761,6 +1319,20 @@ class FMMTraversalInfo(DeviceDataRecord):
An instance of :class:`boxtree.Tree`.
.. attribute:: nboxes
Number of boxes in the tree.
.. attribute:: nlevels
Number of levels in the tree.
.. attribute:: well_sep_is_n_away
The distance (measured in target box diameters in the :math:`l^\infty`
norm) from the edge of the target box at which the 'well-separated'
(i.e. M2L-handled) 'far-field' starts.
.. ------------------------------------------------------------------------
.. rubric:: Basic box lists for iteration
.. ------------------------------------------------------------------------
......@@ -779,6 +1351,10 @@ class FMMTraversalInfo(DeviceDataRecord):
If :attr:`boxtree.Tree.sources_are_targets`,
then ``target_boxes is source_boxes``.
.. attribute:: ntarget_boxes
Number of :attr:`target_boxes`.
.. attribute:: source_parent_boxes
``box_id_t [*]``
......@@ -787,6 +1363,13 @@ class FMMTraversalInfo(DeviceDataRecord):
of one of the :attr:`source_boxes`. These boxes may have sources of their
own.
.. attribute:: level_start_source_box_nrs
``box_id_t [nlevels+1]``
Indices into :attr:`source_boxes` indicating where
each level starts and ends.
.. attribute:: level_start_source_parent_box_nrs
``box_id_t [nlevels+1]``
......@@ -805,6 +1388,13 @@ class FMMTraversalInfo(DeviceDataRecord):
Number of :attr:`target_or_target_parent_boxes`.
.. attribute:: level_start_target_box_nrs
``box_id_t [nlevels+1]``
Indices into :attr:`target_boxes` indicating where
each level starts and ends.
.. attribute:: level_start_target_or_target_parent_box_nrs
``box_id_t [nlevels+1]``
......@@ -813,10 +1403,25 @@ class FMMTraversalInfo(DeviceDataRecord):
each level starts and ends.
.. ------------------------------------------------------------------------
.. rubric:: Colleagues
.. rubric:: Same-level non-well-separated boxes
.. ------------------------------------------------------------------------
Immediately adjacent boxes on the same level. See :ref:`csr`.
Boxes considered to be within the 'non-well-separated area' according to
:attr:`well_sep_is_n_away` that are on the same level as their reference
box. See :ref:`csr`.
This is a generalization of the "colleagues" concept from the Carrier paper
to the case in which :attr:`well_sep_is_n_away` is not 1.
.. attribute:: same_level_non_well_sep_boxes_starts
``box_id_t [nboxes+1]``
.. attribute:: same_level_non_well_sep_boxes_lists
``box_id_t [*]``
The following attributes are deprecated.
.. attribute:: colleagues_starts
......@@ -831,7 +1436,9 @@ class FMMTraversalInfo(DeviceDataRecord):
.. ------------------------------------------------------------------------
List of source boxes immediately adjacent to each target box. Indexed like
:attr:`target_boxes`. See :ref:`csr`.
:attr:`target_boxes`. Includes the target box itself. See :ref:`csr`.
(Note: This list contains global box numbers, not indices into
:attr:`source_boxes`.)
.. attribute:: neighbor_source_boxes_starts
......@@ -848,11 +1455,11 @@ class FMMTraversalInfo(DeviceDataRecord):
Well-separated boxes on the same level. Indexed like
:attr:`target_or_target_parent_boxes`. See :ref:`csr`.
.. attribute:: sep_siblings_starts
.. attribute:: from_sep_siblings_starts
``box_id_t [ntarget_or_target_parent_boxes+1]``
.. attribute:: sep_siblings_lists
.. attribute:: from_sep_siblings_lists
``box_id_t [*]``
......@@ -863,26 +1470,44 @@ class FMMTraversalInfo(DeviceDataRecord):
Smaller source boxes separated from the target box by their own size.
If :attr:`boxtree.Tree.targets_have_extent`, then
:attr:`sep_close_smaller_starts` will be non-*None*. It records
:attr:`from_sep_close_smaller_starts` will be non-*None*. It records
interactions between boxes that would ordinarily be handled
through "List 3", but must be evaluated specially/directly
because of :ref:`extent`.
Indexed like :attr:`target_or_target_parent_boxes`. See :ref:`csr`.
.. attribute:: target_boxes_sep_smaller_by_source_level
.. attribute:: sep_smaller_starts
A list of arrays of global box numbers, one array per level, indicating
which boxes are used with the interaction list entries of
:attr:`from_sep_smaller_by_level`.
``target_boxes_sep_smaller_by_source_level[i]`` has length
``from_sep_smaller_by_level[i].num_nonempty_lists``.
``box_id_t [ntargets+1]``
.. attribute:: sep_smaller_lists
.. attribute:: from_sep_smaller_by_level
``box_id_t [*]``
A list of :attr:`boxtree.Tree.nlevels` (corresponding to the levels on
which each listed source box resides) objects, each of which has
attributes *count*, *starts*, *lists*, *num_nonempty_lists*, and
*nonempty_indices*, which form a CSR list of List 3 source boxes.
.. attribute:: sep_close_smaller_starts
*starts* has shape/type ``box_id_t [num_nonempty_lists+1]``. *lists* is of
type ``box_id_t``. (Note: This list contains global box numbers, not
indices into :attr:`source_boxes`.)
``box_id_t [ntargets+1]`` (or *None*)
Note *starts* are indexed along with
`target_boxes_sep_smaller_by_source_level`. For
example, for level *i*, *lists[starts[j]:starts[j+1]]* represents "List 3"
source boxes of *target_boxes_sep_smaller_by_source_level[i][j]* on level
*i*.
.. attribute:: sep_close_smaller_lists
.. attribute:: from_sep_close_smaller_starts
Indexed like :attr:`target_boxes`. See :ref:`csr`.
``box_id_t [ntarget_boxes+1]`` (or *None*)
.. attribute:: from_sep_close_smaller_lists
``box_id_t [*]`` (or *None*)
......@@ -892,184 +1517,89 @@ class FMMTraversalInfo(DeviceDataRecord):
Bigger source boxes separated from the target box by the (smaller) target
box's size.
(Note: This list contains global box numbers, not indices into
:attr:`source_boxes`.)
If :attr:`boxtree.Tree.sources_have_extent`, then
:attr:`sep_close_bigger_starts` will be non-*None*. It records
interactions between boxes that would ordinarily be handled
through "List 4", but must be evaluated specially/directly
because of :ref:`extent`.
If :attr:`boxtree.Tree.sources_have_extent` or
:attr:`boxtree.Tree.targets_have_extent`, then
:attr:`from_sep_close_bigger_starts` will be non-*None*. It records
interactions between boxes that would ordinarily be handled through "List
4", but must be evaluated specially/directly because of :ref:`extent`.
Indexed like :attr:`target_or_target_parent_boxes`. See :ref:`csr`.
*from_sep_bigger_starts* is indexed like
:attr:`target_or_target_parent_boxes`. Similar to the other "close" lists,
*from_sep_close_bigger_starts* is indexed like :attr:`target_boxes`. See
:ref:`csr`.
.. attribute:: sep_bigger_starts
.. attribute:: from_sep_bigger_starts
``box_id_t [ntarget_or_target_parent_boxes+1]``
.. attribute:: sep_bigger_lists
.. attribute:: from_sep_bigger_lists
``box_id_t [*]``
.. attribute:: sep_close_bigger_starts
.. attribute:: from_sep_close_bigger_starts
``box_id_t [ntarget_or_target_parent_boxes+1]`` (or *None*)
``box_id_t [ntarget_boxes+1]`` (or *None*)
.. attribute:: sep_close_bigger_lists
.. attribute:: from_sep_close_bigger_lists
``box_id_t [*]`` (or *None*)
"""
# {{{ "close" list merging -> "unified list 1"
.. versionchanged:: 2018.2
def merge_close_lists(self, queue, debug=False):
"""Return a new :class:`FMMTraversalInfo` instance with the contents of
:attr:`sep_close_smaller_starts` and :attr:`sep_close_bigger_starts`
merged into :attr:`neighbor_source_boxes_starts` and these two
attributes set to *None*.
"""
Changed index style of *from_sep_close_bigger_starts* from
:attr:`target_or_target_parent_boxes` to :attr:`target_boxes`.
from boxtree.tools import reverse_index_array
target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
self.target_or_target_parent_boxes, target_size=self.tree.nboxes,
queue=queue)
target_or_target_parent_boxes_from_tgt_boxes = cl.array.take(
target_or_target_parent_boxes_from_all_boxes,
self.target_boxes, queue=queue)
del target_or_target_parent_boxes_from_all_boxes
@memoize_in(self, "merge_close_lists_kernel")
def get_new_nb_sources_knl(write_counts):
from pyopencl.elementwise import ElementwiseTemplate
return ElementwiseTemplate("""//CL:mako//
/* input: */
box_id_t *target_or_target_parent_boxes_from_tgt_boxes,
box_id_t *neighbor_source_boxes_starts,
box_id_t *sep_close_smaller_starts,
box_id_t *sep_close_bigger_starts,
%if not write_counts:
box_id_t *neighbor_source_boxes_lists,
box_id_t *sep_close_smaller_lists,
box_id_t *sep_close_bigger_lists,
box_id_t *new_neighbor_source_boxes_starts,
%endif
/* output: */
%if write_counts:
box_id_t *new_neighbor_source_boxes_counts,
%else:
box_id_t *new_neighbor_source_boxes_lists,
%endif
""",
"""//CL:mako//
box_id_t itgt_box = i;
box_id_t itarget_or_target_parent_box =
target_or_target_parent_boxes_from_tgt_boxes[itgt_box];
box_id_t neighbor_source_boxes_start =
neighbor_source_boxes_starts[itgt_box];
box_id_t neighbor_source_boxes_count =
neighbor_source_boxes_starts[itgt_box + 1]
- neighbor_source_boxes_start;
box_id_t sep_close_smaller_start =
sep_close_smaller_starts[itgt_box];
box_id_t sep_close_smaller_count =
sep_close_smaller_starts[itgt_box + 1]
- sep_close_smaller_start;
box_id_t sep_close_bigger_start =
sep_close_bigger_starts[itarget_or_target_parent_box];
box_id_t sep_close_bigger_count =
sep_close_bigger_starts[itarget_or_target_parent_box + 1]
- sep_close_bigger_start;
%if write_counts:
if (itgt_box == 0)
new_neighbor_source_boxes_counts[0] = 0;
new_neighbor_source_boxes_counts[itgt_box + 1] =
neighbor_source_boxes_count
+ sep_close_smaller_count
+ sep_close_bigger_count
;
%else:
box_id_t cur_idx = new_neighbor_source_boxes_starts[itgt_box];
#define COPY_FROM(NAME) \
for (box_id_t i = 0; i < NAME##_count; ++i) \
new_neighbor_source_boxes_lists[cur_idx++] = \
NAME##_lists[NAME##_start+i];
COPY_FROM(neighbor_source_boxes)
COPY_FROM(sep_close_smaller)
COPY_FROM(sep_close_bigger)
%endif
""").build(
queue.context,
type_aliases=(
("box_id_t", self.tree.box_id_dtype),
),
var_values=(
("write_counts", write_counts),
)
)
ntarget_boxes = len(self.target_boxes)
new_neighbor_source_boxes_counts = cl.array.empty(
queue, ntarget_boxes+1, self.tree.box_id_dtype)
get_new_nb_sources_knl(True)(
# input:
target_or_target_parent_boxes_from_tgt_boxes,
self.neighbor_source_boxes_starts,
self.sep_close_smaller_starts,
self.sep_close_bigger_starts,
# output:
new_neighbor_source_boxes_counts,
range=slice(ntarget_boxes),
queue=queue)
new_neighbor_source_boxes_starts = cl.array.cumsum(
new_neighbor_source_boxes_counts)
del new_neighbor_source_boxes_counts
new_neighbor_source_boxes_lists = cl.array.empty(
queue,
int(new_neighbor_source_boxes_starts[ntarget_boxes].get()),
self.tree.box_id_dtype)
new_neighbor_source_boxes_lists.fill(999999999)
.. automethod:: get
get_new_nb_sources_knl(False)(
# input:
target_or_target_parent_boxes_from_tgt_boxes,
.. automethod:: merge_close_lists
"""
self.neighbor_source_boxes_starts,
self.sep_close_smaller_starts,
self.sep_close_bigger_starts,
self.neighbor_source_boxes_lists,
self.sep_close_smaller_lists,
self.sep_close_bigger_lists,
# {{{ "close" list merging -> "unified list 1"
new_neighbor_source_boxes_starts,
def merge_close_lists(self, queue, debug=False):
"""Return a new :class:`FMMTraversalInfo` instance with the contents of
:attr:`from_sep_close_smaller_starts` and
:attr:`from_sep_close_bigger_starts` merged into
:attr:`neighbor_source_boxes_starts` and these two attributes set to
*None*.
"""
# output:
new_neighbor_source_boxes_lists,
range=slice(ntarget_boxes),
queue=queue)
list_merger = _ListMerger(queue.context, self.tree.box_id_dtype)
result, evt = (
list_merger(
queue,
# starts
(self.neighbor_source_boxes_starts,
self.from_sep_close_smaller_starts,
self.from_sep_close_bigger_starts),
# lists
(self.neighbor_source_boxes_lists,
self.from_sep_close_smaller_lists,
self.from_sep_close_bigger_lists),
# input index styles
_IndexStyle.TARGET_BOXES,
# output index style
_IndexStyle.TARGET_BOXES,
# box and tree data
self.target_boxes,
self.target_or_target_parent_boxes,
self.tree.nboxes,
debug))
cl.wait_for_events([evt])
return self.copy(
neighbor_source_boxes_starts=new_neighbor_source_boxes_starts,
neighbor_source_boxes_lists=new_neighbor_source_boxes_lists,
sep_close_smaller_starts=None,
sep_close_smaller_lists=None,
sep_close_bigger_starts=None,
sep_close_bigger_lists=None)
neighbor_source_boxes_starts=result["starts"].with_queue(None),
neighbor_source_boxes_lists=result["lists"].with_queue(None),
from_sep_close_smaller_starts=None,
from_sep_close_smaller_lists=None,
from_sep_close_bigger_starts=None,
from_sep_close_bigger_lists=None)
# }}}
......@@ -1083,6 +1613,18 @@ class FMMTraversalInfo(DeviceDataRecord):
# }}}
# Convenience accessor: forwards to ``self.tree.nboxes``.
@property
def nboxes(self):
return self.tree.nboxes
# Convenience accessor: forwards to ``self.tree.nlevels``.
@property
def nlevels(self):
return self.tree.nlevels
# Number of entries in ``self.target_boxes``.
@property
def ntarget_boxes(self):
return len(self.target_boxes)
# Number of entries in ``self.target_or_target_parent_boxes``.
@property
def ntarget_or_target_parent_boxes(self):
return len(self.target_or_target_parent_boxes)
......@@ -1095,41 +1637,110 @@ class _KernelInfo(Record):
class FMMTraversalBuilder:
def __init__(self, context):
"""
.. automethod:: __init__
"""
def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None):
"""
:arg well_sep_is_n_away: An integer 1 or greater.
(Only 1 and 2 are tested.)
The spacing between boxes that is considered "well-separated" for
:attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts`
(List 2).
:arg from_sep_smaller_crit: The criterion used to determine
box dimensions and separation for
:attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level`
(List 3). May be one of ``"static_linf"`` (use the box square,
possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`),
``"precise_linf"`` (use the precise extent of targets in the box,
including their radii), or ``"static_l2"`` (use the circumcircle of
the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`).
"""
self.context = context
self.well_sep_is_n_away = well_sep_is_n_away
self.from_sep_smaller_crit = from_sep_smaller_crit
# {{{ kernel builder
@memoize_method
def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
@log_process(logger)
def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
coord_dtype, box_level_dtype, max_levels,
sources_are_targets, sources_have_extent, targets_have_extent,
stick_out_factor):
extent_norm,
source_boxes_has_mask,
source_parent_boxes_has_mask):
logging.info("building traversal build kernels")
# {{{ process from_sep_smaller_crit
from_sep_smaller_crit = self.from_sep_smaller_crit
if from_sep_smaller_crit is None:
from_sep_smaller_crit = "precise_linf"
if extent_norm == "linf":
# no special checks needed
pass
elif extent_norm == "l2":
if from_sep_smaller_crit == "static_linf":
# Not technically necessary, but static linf will assume box
# bounds that are not guaranteed to contain all particle
# extents.
raise ValueError(
"The static l^inf from-sep-smaller criterion "
"cannot be used with the l^2 extent norm")
elif extent_norm is None:
assert not (sources_have_extent or targets_have_extent)
if from_sep_smaller_crit is None:
# doesn't matter
from_sep_smaller_crit = "static_linf"
else:
raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}")
if from_sep_smaller_crit not in [
"static_linf", "precise_linf",
"static_l2",
]:
raise ValueError(
"unexpected value of 'from_sep_smaller_crit': "
f"{from_sep_smaller_crit}")
# }}}
debug = False
from pyopencl.tools import dtype_to_ctype
from boxtree.tree import box_flags_enum
render_vars = dict(
dimensions=dimensions,
dtype_to_ctype=dtype_to_ctype,
particle_id_dtype=particle_id_dtype,
box_id_dtype=box_id_dtype,
box_flags_enum=box_flags_enum,
coord_dtype=coord_dtype,
vec_types=cl.array.vec.types,
max_levels=max_levels,
AXIS_NAMES=AXIS_NAMES,
debug=debug,
sources_are_targets=sources_are_targets,
sources_have_extent=sources_have_extent,
targets_have_extent=targets_have_extent,
stick_out_factor=stick_out_factor,
)
render_vars = {
"np": np,
"dimensions": dimensions,
"dtype_to_ctype": dtype_to_ctype,
"particle_id_dtype": particle_id_dtype,
"box_id_dtype": box_id_dtype,
"box_flags_enum": box_flags_enum,
"coord_dtype": coord_dtype,
"get_coord_vec_dtype": get_coord_vec_dtype,
"cvec_sub": partial(coord_vec_subscript_code, dimensions),
"max_levels": max_levels,
"AXIS_NAMES": AXIS_NAMES,
"debug": debug,
"sources_are_targets": sources_are_targets,
"sources_have_extent": sources_have_extent,
"targets_have_extent": targets_have_extent,
"well_sep_is_n_away": self.well_sep_is_n_away,
"from_sep_smaller_crit": from_sep_smaller_crit,
"source_boxes_has_mask": source_boxes_has_mask,
"source_parent_boxes_has_mask": source_parent_boxes_has_mask,
}
from pyopencl.algorithm import ListOfListsBuilder
from pyopencl.tools import VectorArg, ScalarArg
from boxtree.tools import ScalarArg, VectorArg
result = {}
......@@ -1140,6 +1751,12 @@ class FMMTraversalBuilder:
+ SOURCES_PARENTS_AND_TARGETS_TEMPLATE,
strict_undefined=True).render(**render_vars)
arg_decls = [VectorArg(box_flags_enum.dtype, "box_flags")]
if source_boxes_has_mask:
arg_decls.append(VectorArg(np.int8, "source_boxes_mask"))
if source_parent_boxes_has_mask:
arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask"))
result["sources_parents_and_targets_builder"] = \
ListOfListsBuilder(self.context,
[
......@@ -1151,9 +1768,7 @@ class FMMTraversalBuilder:
if not sources_are_targets
else []),
str(src),
arg_decls=[
VectorArg(box_flags_enum.dtype, "box_flags"),
],
arg_decls=arg_decls,
debug=debug,
name_prefix="sources_parents_and_targets")
......@@ -1170,46 +1785,68 @@ class FMMTraversalBuilder:
# {{{ build list N builders
base_args = [
VectorArg(coord_dtype, "box_centers"),
VectorArg(coord_dtype, "box_centers", with_offset=False),
ScalarArg(coord_dtype, "root_extent"),
VectorArg(np.uint8, "box_levels"),
ScalarArg(box_id_dtype, "aligned_nboxes"),
VectorArg(box_id_dtype, "box_child_ids"),
VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
VectorArg(box_flags_enum.dtype, "box_flags"),
]
for list_name, template, extra_args, extra_lists in [
("colleagues", COLLEAGUES_TEMPLATE, [], []),
("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
for list_name, template, extra_args, extra_lists, eliminate_empty_list in [
("same_level_non_well_sep_boxes",
SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []),
("neighbor_source_boxes", NEIGHBOR_SOURCE_BOXES_TEMPLATE,
[
VectorArg(box_id_dtype, "target_boxes"),
], []),
("sep_siblings", SEP_SIBLINGS_TEMPLATE,
], [], []),
("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE,
[
VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
VectorArg(box_id_dtype, "box_parent_ids"),
VectorArg(box_id_dtype, "colleagues_starts"),
VectorArg(box_id_dtype, "colleagues_list"),
], []),
("sep_smaller", SEP_SMALLER_TEMPLATE,
VectorArg(box_id_dtype, "box_parent_ids",
with_offset=False),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_starts"),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_lists"),
], [], []),
("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE,
[
ScalarArg(coord_dtype, "stick_out_factor"),
VectorArg(box_id_dtype, "target_boxes"),
VectorArg(box_id_dtype, "colleagues_starts"),
VectorArg(box_id_dtype, "colleagues_list"),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_starts"),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_lists"),
*([VectorArg(coord_dtype, "box_target_bounding_box_min",
with_offset=False),
VectorArg(coord_dtype, "box_target_bounding_box_max",
with_offset=False),
VectorArg(particle_id_dtype,
"box_source_counts_cumul"),
]
if targets_have_extent else []),
ScalarArg(particle_id_dtype,
"from_sep_smaller_min_nsources_cumul"),
ScalarArg(box_id_dtype, "from_sep_smaller_source_level"),
],
["sep_close_smaller"]
["from_sep_close_smaller"]
if sources_have_extent or targets_have_extent
else []),
("sep_bigger", SEP_BIGGER_TEMPLATE,
else [], ["from_sep_smaller"]),
("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE,
[
ScalarArg(coord_dtype, "stick_out_factor"),
VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
VectorArg(box_id_dtype, "box_parent_ids"),
VectorArg(box_id_dtype, "colleagues_starts"),
VectorArg(box_id_dtype, "colleagues_list"),
VectorArg(box_id_dtype, "box_parent_ids",
with_offset=False),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_starts"),
VectorArg(box_id_dtype,
"same_level_non_well_sep_boxes_lists"),
],
["sep_close_bigger"]
["from_sep_close_bigger"]
if sources_have_extent or targets_have_extent
else []),
else [], []),
]:
src = Template(
TRAVERSAL_PREAMBLE_TEMPLATE
......@@ -1217,51 +1854,84 @@ class FMMTraversalBuilder:
+ template,
strict_undefined=True).render(**render_vars)
result[list_name+"_builder"] = ListOfListsBuilder(self.context,
result[f"{list_name}_builder"] = ListOfListsBuilder(
self.context,
[(list_name, box_id_dtype)]
+ [(extra_list_name, box_id_dtype)
for extra_list_name in extra_lists],
str(src),
arg_decls=base_args + extra_args,
debug=debug, name_prefix=list_name,
complex_kernel=True)
complex_kernel=True,
eliminate_empty_output_lists=eliminate_empty_list)
# }}}
logging.info("traversal build kernels built")
return _KernelInfo(**result)
# }}}
# {{{ driver
def __call__(self, queue, tree, wait_for=None, debug=False):
def __call__(self, queue, tree, wait_for=None, debug=False,
_from_sep_smaller_min_nsources_cumul=None,
source_boxes_mask=None,
source_parent_boxes_mask=None):
"""
:arg queue: A :class:`pyopencl.CommandQueue` instance.
:arg tree: A :class:`boxtree.Tree` instance.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
exeuction.
execution.
:arg source_boxes_mask: Only boxes passing this mask will be considered for
`source_boxes`. Used by the distributed implementation.
:arg source_parent_boxes_mask: Only boxes passing this mask will be
considered for `source_parent_boxes`. Used by the distributed
implementation.
:return: A tuple *(trav, event)*, where *trav* is a new instance of
:class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
for dependency management.
"""
from_sep_smaller_min_nsources_cumul = _from_sep_smaller_min_nsources_cumul
if from_sep_smaller_min_nsources_cumul is None:
# default to old no-threshold behavior
from_sep_smaller_min_nsources_cumul = 0
if not tree._is_pruned:
raise ValueError("tree must be pruned for traversal generation")
# Generated code shouldn't depend on the *exact* number of tree levels.
if tree.sources_have_extent:
# YAGNI
raise NotImplementedError(
"trees with source extent are not supported for "
"traversal generation")
# FIXME: missing on TreeOfBoxes
sources_are_targets = getattr(tree, "sources_are_targets", True)
# Generated code shouldn't depend on the *exact* number of tree levels.
# So round up to the next multiple of 5.
from pytools import div_ceil
max_levels = div_ceil(tree.nlevels, 5) * 5
level_start_box_nrs = (
None if tree.level_start_box_nrs is None else
cl.array.to_device(queue, tree.level_start_box_nrs))
knl_info = self.get_kernel_info(
tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype,
tree.coord_dtype, tree.box_level_dtype, max_levels,
tree.sources_are_targets,
tree.sources_have_extent, tree.targets_have_extent,
tree.stick_out_factor)
dimensions=tree.dimensions,
particle_id_dtype=getattr(tree, "particle_id_dtype", None),
box_id_dtype=tree.box_id_dtype,
coord_dtype=tree.coord_dtype,
box_level_dtype=tree.box_level_dtype,
max_levels=max_levels,
sources_are_targets=sources_are_targets,
sources_have_extent=tree.sources_have_extent,
targets_have_extent=tree.targets_have_extent,
extent_norm=tree.extent_norm,
source_boxes_has_mask=source_boxes_mask is not None,
source_parent_boxes_has_mask=source_parent_boxes_mask is not None)
def fin_debug(s):
if debug:
......@@ -1269,21 +1939,29 @@ class FMMTraversalBuilder:
logger.debug(s)
logger.info("start building traversal")
traversal_plog = ProcessLogger(logger, "build traversal")
# {{{ source boxes, their parents, and target boxes
fin_debug("building list of source boxes, their parents, and target boxes")
extra_args = []
if source_boxes_mask is not None:
extra_args.append(source_boxes_mask)
if source_parent_boxes_mask is not None:
extra_args.append(source_parent_boxes_mask)
result, evt = knl_info.sources_parents_and_targets_builder(
queue, tree.nboxes, tree.box_flags.data, wait_for=wait_for)
queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for
)
wait_for = [evt]
source_parent_boxes = result["source_parent_boxes"].lists
source_boxes = result["source_boxes"].lists
target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists
if not tree.sources_are_targets:
if not sources_are_targets:
target_boxes = result["target_boxes"].lists
else:
target_boxes = source_boxes
......@@ -1293,56 +1971,68 @@ class FMMTraversalBuilder:
# {{{ figure out level starts in *_parent_boxes
def extract_level_start_box_nrs(box_list, wait_for):
if level_start_box_nrs is None:
return None, []
result = cl.array.empty(queue,
tree.nlevels+1, tree.box_id_dtype) \
.fill(len(box_list))
evt = knl_info.level_start_box_nrs_extractor(
tree.level_start_box_nrs_dev,
level_start_box_nrs,
tree.box_levels,
box_list,
result,
range=slice(1, len(box_list)),
range=slice(0, len(box_list)),
queue=queue, wait_for=wait_for)
result = result.get()
# We skipped box 0 above. This is always true, whether
# box 0 (=level 0) is a leaf or a parent.
result[0] = 0
# Postprocess result for unoccupied levels
prev_start = len(box_list)
for ilev in range(tree.nlevels-1, -1, -1):
result[ilev] = prev_start = \
min(result[ilev], prev_start)
return result, evt
return result, [evt]
fin_debug("finding level starts in source boxes array")
level_start_source_box_nrs, evt_s = \
extract_level_start_box_nrs(
source_boxes, wait_for=wait_for)
fin_debug("finding level starts in source parent boxes array")
level_start_source_parent_box_nrs, evt_s = \
level_start_source_parent_box_nrs, evt_sp = \
extract_level_start_box_nrs(
source_parent_boxes, wait_for=wait_for)
fin_debug("finding level starts in target boxes array")
level_start_target_box_nrs, evt_t = \
extract_level_start_box_nrs(
target_boxes, wait_for=wait_for)
fin_debug("finding level starts in target or target parent boxes array")
level_start_target_or_target_parent_box_nrs, evt_t = \
level_start_target_or_target_parent_box_nrs, evt_tp = \
extract_level_start_box_nrs(
target_or_target_parent_boxes, wait_for=wait_for)
wait_for = [evt_s, evt_t]
wait_for = evt_s + evt_sp + evt_t + evt_tp
# }}}
# {{{ colleagues
# {{{ same-level non-well-separated boxes
# If well_sep_is_n_away is 1, this agrees with the definition of
# 'colleagues' from the classical FMM literature.
fin_debug("finding colleagues")
fin_debug("finding same-level near-field boxes")
result, evt = knl_info.colleagues_builder(
result, evt = knl_info.same_level_non_well_sep_boxes_builder(
queue, tree.nboxes,
tree.box_centers.data, tree.root_extent, tree.box_levels.data,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
tree.box_centers.data, tree.root_extent, tree.box_levels,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
wait_for=wait_for)
wait_for = [evt]
colleagues = result["colleagues"]
same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"]
# }}}
......@@ -1352,9 +2042,9 @@ class FMMTraversalBuilder:
result, evt = knl_info.neighbor_source_boxes_builder(
queue, len(target_boxes),
tree.box_centers.data, tree.root_extent, tree.box_levels.data,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
target_boxes.data, wait_for=wait_for)
tree.box_centers.data, tree.root_extent, tree.box_levels,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
target_boxes, wait_for=wait_for)
wait_for = [evt]
neighbor_source_boxes = result["neighbor_source_boxes"]
......@@ -1365,72 +2055,157 @@ class FMMTraversalBuilder:
fin_debug("finding well-separated siblings ('list 2')")
result, evt = knl_info.sep_siblings_builder(
result, evt = knl_info.from_sep_siblings_builder(
queue, len(target_or_target_parent_boxes),
tree.box_centers.data, tree.root_extent, tree.box_levels.data,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
target_or_target_parent_boxes.data, tree.box_parent_ids.data,
colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
tree.box_centers.data, tree.root_extent, tree.box_levels,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
target_or_target_parent_boxes, tree.box_parent_ids.data,
same_level_non_well_sep_boxes.starts,
same_level_non_well_sep_boxes.lists,
wait_for=wait_for)
wait_for = [evt]
sep_siblings = result["sep_siblings"]
from_sep_siblings = result["from_sep_siblings"]
# }}}
with_extent = tree.sources_have_extent or tree.targets_have_extent
# {{{ separated smaller ("list 3")
fin_debug("finding separated smaller ('list 3')")
result, evt = knl_info.sep_smaller_builder(
from_sep_smaller_base_args = (
queue, len(target_boxes),
tree.box_centers.data, tree.root_extent, tree.box_levels.data,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
target_boxes.data,
colleagues.starts.data, colleagues.lists.data,
wait_for=wait_for)
wait_for = [evt]
sep_smaller = result["sep_smaller"]
# base_args
tree.box_centers.data, tree.root_extent, tree.box_levels,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
# list-specific args
tree.stick_out_factor, target_boxes,
same_level_non_well_sep_boxes.starts,
same_level_non_well_sep_boxes.lists,
*([tree.box_target_bounding_box_min.data,
tree.box_target_bounding_box_max.data,
tree.box_source_counts_cumul]
if tree.targets_have_extent else []),
from_sep_smaller_min_nsources_cumul,
)
from_sep_smaller_wait_for = []
from_sep_smaller_by_level = []
target_boxes_sep_smaller_by_source_level = []
for ilevel in range(tree.nlevels):
fin_debug(f"finding separated smaller ('list 3 level {ilevel}')")
result, evt = knl_info.from_sep_smaller_builder(
*from_sep_smaller_base_args, ilevel,
omit_lists=("from_sep_close_smaller",) if with_extent else (),
wait_for=wait_for)
if tree.sources_have_extent or tree.targets_have_extent:
sep_close_smaller_starts = result["sep_close_smaller"].starts
sep_close_smaller_lists = result["sep_close_smaller"].lists
target_boxes_sep_smaller = target_boxes[
result["from_sep_smaller"].nonempty_indices]
from_sep_smaller_by_level.append(result["from_sep_smaller"])
target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller)
from_sep_smaller_wait_for.append(evt)
if with_extent:
fin_debug("finding separated smaller close ('list 3 close')")
result, evt = knl_info.from_sep_smaller_builder(
*from_sep_smaller_base_args,
-1,
omit_lists=("from_sep_smaller",),
wait_for=wait_for)
from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts
from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists
from_sep_smaller_wait_for.append(evt)
else:
sep_close_smaller_starts = None
sep_close_smaller_lists = None
from_sep_close_smaller_starts = None
from_sep_close_smaller_lists = None
# }}}
wait_for = from_sep_smaller_wait_for
del from_sep_smaller_wait_for
# {{{ separated bigger ("list 4")
fin_debug("finding separated bigger ('list 4')")
result, evt = knl_info.sep_bigger_builder(
result, evt = knl_info.from_sep_bigger_builder(
queue, len(target_or_target_parent_boxes),
tree.box_centers.data, tree.root_extent, tree.box_levels.data,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
target_or_target_parent_boxes.data, tree.box_parent_ids.data,
colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
wait_for = [evt]
sep_bigger = result["sep_bigger"]
tree.box_centers.data, tree.root_extent, tree.box_levels,
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
tree.stick_out_factor, target_or_target_parent_boxes,
tree.box_parent_ids.data,
same_level_non_well_sep_boxes.starts,
same_level_non_well_sep_boxes.lists,
wait_for=wait_for)
if tree.sources_have_extent or tree.targets_have_extent:
sep_close_bigger_starts = result["sep_close_bigger"].starts
sep_close_bigger_lists = result["sep_close_bigger"].lists
wait_for = [evt]
from_sep_bigger = result["from_sep_bigger"]
if with_extent:
# These are indexed by target_or_target_parent boxes; we rewrite
# them to be indexed by target_boxes.
from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts
from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists
list_merger = _ListMerger(queue.context, tree.box_id_dtype)
result, evt = list_merger(
queue,
# starts
(from_sep_close_bigger_starts_raw,),
# lists
(from_sep_close_bigger_lists_raw,),
# input index style
_IndexStyle.TARGET_OR_TARGET_PARENT_BOXES,
# output index style
_IndexStyle.TARGET_BOXES,
# box and tree data
target_boxes,
target_or_target_parent_boxes,
tree.nboxes,
debug,
wait_for=wait_for)
wait_for = [evt]
del from_sep_close_bigger_starts_raw
del from_sep_close_bigger_lists_raw
from_sep_close_bigger_starts = result["starts"]
from_sep_close_bigger_lists = result["lists"]
else:
sep_close_bigger_starts = None
sep_close_bigger_lists = None
from_sep_close_bigger_starts = None
from_sep_close_bigger_lists = None
# }}}
if self.well_sep_is_n_away == 1:
colleagues_starts = same_level_non_well_sep_boxes.starts
colleagues_lists = same_level_non_well_sep_boxes.lists
else:
colleagues_starts = None
colleagues_lists = None
evt, = wait_for
logger.info("traversal built")
traversal_plog.done(
"from_sep_smaller_crit: %s",
self.from_sep_smaller_crit)
return FMMTraversalInfo(
tree=tree,
well_sep_is_n_away=self.well_sep_is_n_away,
source_boxes=source_boxes,
target_boxes=target_boxes,
level_start_source_box_nrs=level_start_source_box_nrs,
level_start_target_box_nrs=level_start_target_box_nrs,
source_parent_boxes=source_parent_boxes,
level_start_source_parent_box_nrs=level_start_source_parent_box_nrs,
......@@ -1438,28 +2213,34 @@ class FMMTraversalBuilder:
level_start_target_or_target_parent_box_nrs=(
level_start_target_or_target_parent_box_nrs),
colleagues_starts=colleagues.starts,
colleagues_lists=colleagues.lists,
same_level_non_well_sep_boxes_starts=(
same_level_non_well_sep_boxes.starts),
same_level_non_well_sep_boxes_lists=(
same_level_non_well_sep_boxes.lists),
# Deprecated, but we'll keep these alive for the time being.
colleagues_starts=colleagues_starts,
colleagues_lists=colleagues_lists,
neighbor_source_boxes_starts=neighbor_source_boxes.starts,
neighbor_source_boxes_lists=neighbor_source_boxes.lists,
sep_siblings_starts=sep_siblings.starts,
sep_siblings_lists=sep_siblings.lists,
from_sep_siblings_starts=from_sep_siblings.starts,
from_sep_siblings_lists=from_sep_siblings.lists,
sep_smaller_starts=sep_smaller.starts,
sep_smaller_lists=sep_smaller.lists,
from_sep_smaller_by_level=from_sep_smaller_by_level,
target_boxes_sep_smaller_by_source_level=(
target_boxes_sep_smaller_by_source_level),
sep_close_smaller_starts=sep_close_smaller_starts,
sep_close_smaller_lists=sep_close_smaller_lists,
from_sep_close_smaller_starts=from_sep_close_smaller_starts,
from_sep_close_smaller_lists=from_sep_close_smaller_lists,
sep_bigger_starts=sep_bigger.starts,
sep_bigger_lists=sep_bigger.lists,
from_sep_bigger_starts=from_sep_bigger.starts,
from_sep_bigger_lists=from_sep_bigger.lists,
sep_close_bigger_starts=sep_close_bigger_starts,
sep_close_bigger_lists=sep_close_bigger_lists,
from_sep_close_bigger_starts=from_sep_close_bigger_starts,
from_sep_close_bigger_lists=from_sep_close_bigger_lists,
).with_queue(None), evt
# }}}
# vim: filetype=pyopencl:fdm=marker
# vim: fdm=marker
from __future__ import division
"""
.. _tree-kinds:
Supported tree kinds
--------------------
The following tree kinds are supported:
- *Nonadaptive* trees have all leaves on the same (last) level.
- *Adaptive* trees differ from nonadaptive trees in that they may have leaves on
more than one level. Adaptive trees have the option of being
*level-restricted*: in a level-restricted tree, neighboring leaves differ by
at most one level.
All trees returned by the tree builder are pruned so that empty leaves have been
removed. If a level-restricted tree is requested, the tree gets constructed in
such a way that the version of the tree before pruning is also level-restricted.
Tree data structure
-------------------
.. currentmodule:: boxtree
.. autoclass:: box_flags_enum
.. autoclass:: TreeOfBoxes
.. autoclass:: Tree
.. currentmodule:: boxtree.tree
Tree with linked point sources
------------------------------
.. autoclass:: TreeWithLinkedPointSources
.. autofunction:: link_point_sources
Filtering the lists of targets
------------------------------
.. currentmodule:: boxtree.tree
Data structures
^^^^^^^^^^^^^^^
.. autoclass:: FilteredTargetListsInUserOrder
.. autoclass:: FilteredTargetListsInTreeOrder
Tools
^^^^^
.. autoclass:: ParticleListFilter
.. autofunction:: filter_target_lists_in_user_order
.. autofunction:: filter_target_lists_in_tree_order
"""
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
......@@ -22,40 +80,217 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
from dataclasses import dataclass
from functools import cached_property
import pyopencl as cl
import numpy as np
from boxtree.tools import DeviceDataRecord
import pyopencl as cl
from cgen import Enum
from pytools import memoize_method
from boxtree.tools import DeviceDataRecord
import logging
logger = logging.getLogger(__name__)
# {{{ box flags
class box_flags_enum(Enum): # noqa
"""Constants for box flags bit field."""
"""Constants for box flags bit field.
.. rubric:: Flags for particle-based trees
.. attribute:: dtype
.. attribute:: IS_SOURCE_BOX
.. attribute:: IS_TARGET_BOX
.. attribute:: IS_SOURCE_OR_TARGET_BOX
.. attribute:: HAS_SOURCE_CHILD_BOXES
.. attribute:: HAS_TARGET_CHILD_BOXES
.. attribute:: HAS_SOURCE_OR_TARGET_CHILD_BOXES
.. attribute:: IS_LEAF_BOX
.. warning ::
:attr:`IS_LEAF_BOX` is only used for :class:`TreeOfBoxes` for the moment.
"""
c_name = "box_flags_t"
dtype = np.dtype(np.uint8)
c_value_prefix = "BOX_"
HAS_OWN_SOURCES = 1 << 0
HAS_OWN_TARGETS = 1 << 1
HAS_OWN_SRCNTGTS = (HAS_OWN_SOURCES | HAS_OWN_TARGETS)
HAS_CHILD_SOURCES = 1 << 2
HAS_CHILD_TARGETS = 1 << 3
HAS_CHILDREN = (HAS_CHILD_SOURCES | HAS_CHILD_TARGETS)
IS_SOURCE_BOX = 1 << 0
IS_TARGET_BOX = 1 << 1
IS_SOURCE_OR_TARGET_BOX = (IS_SOURCE_BOX | IS_TARGET_BOX)
HAS_SOURCE_CHILD_BOXES = 1 << 2
HAS_TARGET_CHILD_BOXES = 1 << 3
HAS_SOURCE_OR_TARGET_CHILD_BOXES = (
HAS_SOURCE_CHILD_BOXES | HAS_TARGET_CHILD_BOXES)
# FIXME: Only used for TreeOfBoxes for now
IS_LEAF_BOX = 1 << 4
# Deprecated alias, do not use.
HAS_CHILDREN = HAS_SOURCE_OR_TARGET_CHILD_BOXES
# }}}
# {{{ tree of boxes
@dataclass
class TreeOfBoxes:
"""A quad/octree tree of pure boxes, excluding their contents (e.g.
particles). It is a lightweight tree handled with :mod:`numpy`, intended
for mesh adaptivity. One may generate a :class:`meshmode.mesh.Mesh` object
consisting of leaf boxes using :func:`make_meshmode_mesh_from_leaves`.
.. attribute:: dimensions
.. attribute:: nlevels
.. attribute:: nboxes
.. attribute:: root_extent
(Scalar) extent of the root box.
.. attribute:: box_centers
mod:`numpy` array of shape ``(dim, nboxes)`` of the centers of the boxes.
.. attribute:: box_parent_ids
:mod:`numpy` vector of parent box ids.
.. attribute:: box_child_ids
(2**dim)-by-nboxes :mod:`numpy` array of children box ids.
.. attribute:: box_levels
:mod:`numpy` vector of box levels in non-decreasing order.
.. attribute:: bounding_box
A :class:`tuple` ``(bbox_min, bbox_max)`` of :mod:`numpy` vectors
giving the (built) extent of the tree. Note that this may be slightly
larger than what is required to contain all particles, if any.
.. attribute:: box_flags
:attr:`box_flags_enum.dtype` ``[nboxes]``
A bitwise combination of :class:`box_flags_enum` constants.
.. attribute:: level_start_box_nrs
``box_id_t [nlevels+1]``
An array of box ids indicating the ID at which each level starts. Levels
are contiguous in box ID space. To determine how many boxes there are
in each level, access the start of the next level. This array is
built so that this works even for the last level.
.. attribute:: box_id_dtype
.. attribute:: box_level_dtype
.. attribute:: coord_dtype
See :class:`Tree` documentation.
.. attribute:: leaf_boxes
Array of leaf boxes.
.. attribute:: sources_have_extent
.. attribute:: targets_have_extent
.. attribute:: extent_norm
.. attribute:: stick_out_factor
See :class:`Tree` documentation.
.. automethod:: __init__
"""
root_extent: np.ndarray
box_centers: np.ndarray
box_parent_ids: np.ndarray
box_child_ids: np.ndarray
box_levels: np.ndarray
box_flags: np.ndarray | None
level_start_box_nrs: np.ndarray | None
# FIXME: these should be properties and take values from box_parent_ids, etc
box_id_dtype: np.dtype
box_level_dtype: np.dtype
coord_dtype: np.dtype
sources_have_extent: bool
targets_have_extent: bool
extent_norm: str
stick_out_factor: float
_is_pruned: bool
@property
def dimensions(self):
return self.box_centers.shape[0]
@property
def nboxes(self):
return self.box_centers.shape[1]
@property
def aligned_nboxes(self):
return self.box_child_ids.shape[-1]
@property
def nlevels(self):
# level starts from 0
if isinstance(self.box_levels, cl.array.Array):
return int(max(self.box_levels).get()) + 1
else:
return max(self.box_levels) + 1
@property
def leaf_boxes(self):
boxes = np.arange(self.nboxes)
return boxes[self.box_flags & box_flags_enum.IS_LEAF_BOX != 0]
@cached_property
def bounding_box(self) -> tuple[np.ndarray, np.ndarray]:
lows = self.box_centers[:, 0] - 0.5 * self.root_extent
highs = lows + self.root_extent
return lows, highs
# {{{ dummy interface for TreePlotter
def get_box_size(self, ibox):
    """Return the size of box *ibox*: :attr:`root_extent` halved once per
    level recorded for that box in :attr:`box_levels`.
    """
    return self.root_extent * 0.5**self.box_levels[ibox]
def get_box_extent(self, ibox):
    """Return ``(extent_low, extent_high)`` for box *ibox*.

    The low corner is the box center minus half the box size (as given by
    :meth:`get_box_size`); the high corner is the low corner plus the box
    size.
    """
    size = self.get_box_size(ibox)
    center = self.box_centers[:, ibox]

    low_corner = center - 0.5*size
    return low_corner, low_corner + size
# }}}
# }}}
# {{{ tree data structure
# {{{ tree with particles
class Tree(DeviceDataRecord, TreeOfBoxes):
r"""A quad/octree consisting of particles sorted into a hierarchy of boxes.
class Tree(DeviceDataRecord):
"""A quad/octree consisting of particles sorted into a hierarchy of boxes.
Optionally, particles may be designated 'sources' and 'targets'. They
may also be assigned radii which restrict the minimum size of the box
into which they may be sorted.
......@@ -66,6 +301,8 @@ class Tree(DeviceDataRecord):
Unless otherwise indicated, all bulk data in this data structure is stored
in a :class:`pyopencl.array.Array`. See also :meth:`get`.
Inherits from :class:`TreeOfBoxes`.
.. rubric:: Flags
.. attribute:: sources_are_targets
......@@ -99,43 +336,52 @@ class Tree(DeviceDataRecord):
.. rubric:: Counts and sizes
.. ------------------------------------------------------------------------
.. attribute:: root_extent
.. attribute:: stick_out_factor
the root box size, a scalar
A scalar used for calculating how much particles with extent may
overextend their containing box.
.. attribute:: stick_out_factor
Each box in the tree can be thought of as being surrounded by a
fictitious box whose :math:`l^\infty` radius is `1 + stick_out_factor`
larger. Particles with extent are allowed to extend inside (a) the
fictitious box or (b) a disk surrounding the fictitious box, depending on
:attr:`extent_norm`.
The fraction of the box diameter by which the :math:`l^\infty` circles
given by :attr:`source_radii` may stick out the box in which they are
contained. A scalar.
.. attribute:: extent_norm
.. attribute:: nsources
One of ``None``, ``"l2"`` or ``"linf"``. If *None*, particles do not have
extent. If not *None*, indicates the norm with which extent-bearing particles
are determined to lie 'inside' a box, taking into account the box's
:attr:`stick_out_factor`.
.. attribute:: ntargets
This image illustrates the difference in semantics:
.. image:: images/linf-l2.png
In the figure, the box has (:math:`\ell^\infty`) radius :math:`R`, the
particle has radius :math:`r`, and :attr:`stick_out_factor` is denoted
:math:`\alpha`.
.. attribute:: nlevels
.. attribute:: bounding_box
.. attribute:: nboxes
a tuple *(bbox_min, bbox_max)* of
:mod:`numpy` vectors giving the (built) extent
of the tree. Note that this may be slightly larger
than what is required to contain all particles.
.. attribute:: nsources
.. attribute:: ntargets
.. attribute:: level_start_box_nrs
``box_id_t [nlevels+1]``
A :class:`numpy.ndarray` of box ids
indicating the ID at which each level starts. Levels
are contiguous in box ID space. To determine
how many boxes there are in each level,
access the start of the next level. This array is
An array of box ids indicating the ID at which each level starts. Levels
are contiguous in box ID space. To determine how many boxes there are
in each level, access the start of the next level. This array is
built so that this works even for the last level.
.. attribute:: level_start_box_nrs_dev
``particle_id_t [nlevels+1``
``particle_id_t [nlevels+1]``
The same array as :attr:`level_start_box_nrs`
as a :class:`pyopencl.array.Array`.
......@@ -169,10 +415,10 @@ class Tree(DeviceDataRecord):
.. attribute:: target_radii
``coord_t [nsources]``
``coord_t [ntargets]``
:math:`l^\infty` radii of the :attr:`targets`.
Available if :attr:`sources_have_extent` is *True*.
Available if :attr:`targets_have_extent` is *True*.
.. ------------------------------------------------------------------------
.. rubric:: Tree/user order indices
......@@ -275,11 +521,43 @@ class Tree(DeviceDataRecord):
:attr:`box_level_dtype` ``box_level_t [nboxes]``
.. attribute:: box_flags
.. ------------------------------------------------------------------------
.. rubric:: Particle-adaptive box extents
.. ------------------------------------------------------------------------
:attr:`box_flags_enum.dtype` ``[nboxes]``
These attributes capture the maximum extent of particles (including the
particle's extents) inside of the box. If the box is empty, both *min* and *max*
will reflect the box center. The purpose of this information is to reduce the
cost of some interactions through knowledge that some boxes are partially empty.
(See the *from_sep_smaller_crit* argument to the constructor of
:class:`boxtree.traversal.FMMTraversalBuilder` for an example.)
A bitwise combination of :class:`box_flags_enum` constants.
.. note::
To obtain the overall, non-adaptive box extent, use
:attr:`boxtree.Tree.box_centers` along with :attr:`boxtree.Tree.box_levels`.
If they are not available, the corresponding attributes will be *None*.
.. attribute:: box_source_bounding_box_min
``coord_t [dimensions, aligned_nboxes]``
.. attribute:: box_source_bounding_box_max
``coord_t [dimensions, aligned_nboxes]``
.. attribute:: box_target_bounding_box_min
``coord_t [dimensions, aligned_nboxes]``
.. attribute:: box_target_bounding_box_max
``coord_t [dimensions, aligned_nboxes]``
.. rubric:: Methods
.. automethod:: get
"""
@property
......@@ -294,20 +572,16 @@ class Tree(DeviceDataRecord):
@property
def nsources(self):
return len(self.user_source_ids)
return len(self.sources[0])
@property
def ntargets(self):
return len(self.sorted_target_ids)
return len(self.targets[0])
@property
def nlevels(self):
    """Return one less than the length of :attr:`level_start_box_nrs`."""
    n_level_starts = len(self.level_start_box_nrs)
    return n_level_starts - 1
@property
def aligned_nboxes(self):
    """Return the length of the last axis of :attr:`box_child_ids`."""
    child_ids_shape = self.box_child_ids.shape
    return child_ids_shape[-1]
def plot(self, **kwargs):
from boxtree.visualization import TreePlotter
plotter = TreePlotter(self)
......@@ -350,8 +624,8 @@ class Tree(DeviceDataRecord):
"""
crit = (
(self.box_target_starts <= itarget)
&
(itarget < self.box_target_starts + self.box_target_counts_nonchild))
& (itarget
< self.box_target_starts + self.box_target_counts_nonchild))
return int(np.where(crit)[0])
......@@ -361,13 +635,28 @@ class Tree(DeviceDataRecord):
"""
crit = (
(self.box_source_starts <= isource)
&
(isource < self.box_source_starts + self.box_source_counts_nonchild))
& (isource
< self.box_source_starts + self.box_source_counts_nonchild))
return int(np.where(crit)[0])
# }}}
def to_device(self, queue, exclude_fields=frozenset()):
    """Delegate to the parent class's ``to_device``, always adding
    ``level_start_box_nrs`` to *exclude_fields*.
    """
    # level_start_box_nrs should remain in host memory
    host_only_fields = frozenset(exclude_fields) | {"level_start_box_nrs"}
    return super().to_device(queue, host_only_fields)
def to_host_device_array(self, queue, exclude_fields=frozenset()):
    """Delegate to the parent class's ``to_host_device_array``, always
    adding ``level_start_box_nrs`` to *exclude_fields*.
    """
    # level_start_box_nrs should remain in host memory
    host_only_fields = frozenset(exclude_fields) | {"level_start_box_nrs"}
    return super().to_host_device_array(queue, host_only_fields)
# }}}
......@@ -378,8 +667,8 @@ class TreeWithLinkedPointSources(Tree):
linked with extent are expanded into point sources which are linked to the
extent-having sources in the original tree. (In an FMM context, they may
stand in for the 'underlying' source for the purpose of the far-field
calculation.) Has all the same attributes as :class:`Tree`.
:attr:`Tree.sources_have_extent` is always *True* for instances of this
calculation.) Has all the same attributes as :class:`boxtree.Tree`.
:attr:`boxtree.Tree.sources_have_extent` is always *True* for instances of this
type. In addition, the following attributes are available.
.. attribute:: npoint_sources
......@@ -397,7 +686,8 @@ class TreeWithLinkedPointSources(Tree):
is an object array.)
This array is stored in :ref:`tree point source order <particle-orderings>`,
unlike the parameter to :meth:`TreeWithLinkedPointSources.___init__`
unlike the parameter to
:meth:`boxtree.tree.TreeWithLinkedPointSources.__init__`
.. attribute:: point_source_counts
......@@ -429,13 +719,22 @@ class TreeWithLinkedPointSources(Tree):
.. attribute:: box_point_source_counts_cumul
``particle_id_t [nboxes]``
.. method:: __init__
This constructor is not intended to be called by users directly.
Call :func:`link_point_sources` instead.
.. rubric:: Methods
.. automethod:: get
"""
def link_point_sources(queue, tree, point_source_starts, point_sources,
debug=False):
"""
*Construction:* Requires that :attr:`Tree.sources_have_extent` is *True*
r"""
*Construction:* Requires that :attr:`boxtree.Tree.sources_have_extent` is *True*
on *tree*.
:arg queue: a :class:`pyopencl.CommandQueue` instance
......@@ -445,9 +744,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
original (extent-having) source number *isrc*. *isrc* is in :ref:`user
source order <particle-orderings>`.
All the particles linked to *isrc* shoud fall within the :math:`l^\infty`
All the particles linked to *isrc* should fall within the :math:`l^\infty`
'circle' around particle number *isrc* with the radius drawn from
:attr:`source_radii`.
:attr:`boxtree.Tree.source_radii`.
:arg point_sources: an object array of (XYZ) point coordinate arrays.
"""
......@@ -523,8 +822,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
dest_indices=tree_order_point_source_starts,
out=[source_boundaries])
from boxtree.tree_build_kernels import \
POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL
from boxtree.tree_build_kernels import (
POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL,
)
logger.debug("point source linking: point source id scan")
......@@ -591,7 +891,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
tree_attrs = {}
for attr_name in tree.__class__.fields:
try:
try: # noqa: SIM105
tree_attrs[attr_name] = getattr(tree, attr_name)
except AttributeError:
pass
......@@ -612,11 +912,11 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
# }}}
# {{{ filtered target lists
# {{{ particle list filter
class FilteredTargetListsInUserOrder(DeviceDataRecord):
"""Use :func:`filter_target_lists_in_user_order` to create instances of this
class.
"""Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create
instances of this class.
This class represents subsets of the list of targets in each box (as given
by :attr:`boxtree.Tree.box_target_starts` and
......@@ -647,73 +947,16 @@ class FilteredTargetListsInUserOrder(DeviceDataRecord):
child boxes). Use together with :attr:`target_starts`.
Target numbers are stored in user order, as the class name suggests.
"""
.. rubric:: Methods
def filter_target_lists_in_user_order(queue, tree, flags):
.. automethod:: get
"""
:arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
:class:`numpy.int8` objects, which indicate by being zero that the
corresponding target (in user target order) is not part of the
filtered list, or by being nonzero that it is.
:returns: A :class:`FilteredTargetListsInUserOrder`
"""
user_order_flags = flags
del flags
user_target_ids = cl.array.empty(queue, tree.ntargets,
tree.sorted_target_ids.dtype)
user_target_ids[tree.sorted_target_ids] = cl.array.arange(
queue, tree.ntargets, user_target_ids.dtype)
from pyopencl.tools import VectorArg, dtype_to_ctype
from pyopencl.algorithm import ListOfListsBuilder
from mako.template import Template
builder = ListOfListsBuilder(queue.context,
[("filt_tgt_list", tree.particle_id_dtype)], Template("""//CL//
typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
{
particle_id_t b_t_start = box_target_starts[i];
particle_id_t b_t_count = box_target_counts_nonchild[i];
for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
{
particle_id_t user_target_id = user_target_ids[j];
if (user_order_flags[user_target_id])
{
APPEND_filt_tgt_list(user_target_id);
}
}
}
""", strict_undefined=True).render(
dtype_to_ctype=dtype_to_ctype,
particle_id_dtype=tree.particle_id_dtype
), arg_decls=[
VectorArg(user_order_flags.dtype, "user_order_flags"),
VectorArg(tree.particle_id_dtype, "user_target_ids"),
VectorArg(tree.particle_id_dtype, "box_target_starts"),
VectorArg(tree.particle_id_dtype, "box_target_counts_nonchild"),
])
result, evt = builder(queue, tree.nboxes,
user_order_flags.data,
user_target_ids.data,
tree.box_target_starts.data, tree.box_target_counts_nonchild.data)
return FilteredTargetListsInUserOrder(
nfiltered_targets=result["filt_tgt_list"].count,
target_starts=result["filt_tgt_list"].starts,
target_lists=result["filt_tgt_list"].lists,
).with_queue(None)
class FilteredTargetListsInTreeOrder(DeviceDataRecord):
"""Use :func:`filter_target_lists_in_tree_order` to create instances of this
class.
"""Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create
instances of this class.
This class represents subsets of the list of targets in each box (as given by
:attr:`boxtree.Tree.box_target_starts` and
......@@ -756,90 +999,219 @@ class FilteredTargetListsInTreeOrder(DeviceDataRecord):
Storing *to* these indices will reorder the targets
from *filtered* tree target order into 'regular'
:ref:`tree target order <particle-orderings>`.
.. rubric:: Methods
.. automethod:: get
"""
def filter_target_lists_in_tree_order(queue, tree, flags):
class ParticleListFilter:
"""
:arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
:class:`numpy.int8` objects, which indicate by being zero that the
corresponding target (in user target order) is not part of the
filtered list, or by being nonzero that it is.
:returns: A :class:`FilteredTargetListsInTreeOrder`
.. automethod:: filter_target_lists_in_tree_order
.. automethod:: filter_target_lists_in_user_order
"""
tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8)
tree_order_flags[tree.sorted_target_ids] = flags
def __init__(self, context):
self.context = context
from boxtree.tree_build_kernels import (
TREE_ORDER_TARGET_FILTER_SCAN_TPL,
TREE_ORDER_TARGET_FILTER_INDEX_TPL)
@memoize_method
def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
user_order_flags_dtype):
from mako.template import Template
scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build(
queue.context,
type_aliases=(
("scan_t", tree.particle_id_dtype),
("particle_id_t", tree.particle_id_dtype),
),
)
filtered_from_unfiltered_target_indices = cl.array.empty(
queue, tree.ntargets, tree.particle_id_dtype)
unfiltered_from_filtered_target_indices = cl.array.empty(
queue, tree.ntargets, tree.particle_id_dtype)
nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype)
scan_knl(tree_order_flags,
filtered_from_unfiltered_target_indices,
unfiltered_from_filtered_target_indices,
nfiltered_targets,
queue=queue)
from pyopencl.algorithm import ListOfListsBuilder
from pyopencl.tools import dtype_to_ctype
nfiltered_targets = int(nfiltered_targets.get())
from boxtree.tools import VectorArg
unfiltered_from_filtered_target_indices = \
unfiltered_from_filtered_target_indices[:nfiltered_targets]
builder = ListOfListsBuilder(self.context,
[("filt_tgt_list", particle_id_dtype)], Template("""//CL//
typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
from pytools.obj_array import make_obj_array
filtered_targets = make_obj_array([
targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices]
for targets_i in tree.targets
])
void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
{
particle_id_t b_t_start = box_target_starts[i];
particle_id_t b_t_count = box_target_counts_nonchild[i];
index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build(
queue.context,
type_aliases=(
("particle_id_t", tree.particle_id_dtype),
),
for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
{
particle_id_t user_target_id = user_target_ids[j];
if (user_order_flags[user_target_id])
{
APPEND_filt_tgt_list(user_target_id);
}
}
}
""", strict_undefined=True).render(
dtype_to_ctype=dtype_to_ctype,
particle_id_dtype=particle_id_dtype
), arg_decls=[
VectorArg(user_order_flags_dtype, "user_order_flags"),
VectorArg(particle_id_dtype, "user_target_ids"),
VectorArg(particle_id_dtype, "box_target_starts"),
VectorArg(particle_id_dtype, "box_target_counts_nonchild"),
])
return builder
def filter_target_lists_in_user_order(self, queue, tree, flags):
    """Build per-box lists of the targets selected by *flags*.

    :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
        :class:`numpy.int8` objects, given in user target order. A zero
        entry leaves the corresponding target out of the filtered list,
        a nonzero entry keeps it.
    :returns: A :class:`FilteredTargetListsInUserOrder`
    """
    ntargets = tree.ntargets
    target_id_dtype = tree.sorted_target_ids.dtype

    # Scatter 0, 1, ..., ntargets-1 to the positions named by
    # sorted_target_ids.
    user_target_ids = cl.array.empty(queue, ntargets, target_id_dtype)
    user_target_ids[tree.sorted_target_ids] = cl.array.arange(
        queue, ntargets, target_id_dtype)

    list_builder = self.get_filter_target_lists_in_user_order_kernel(
        tree.particle_id_dtype, flags.dtype)

    built_lists, _ = list_builder(
        queue, tree.nboxes,
        flags,
        user_target_ids,
        tree.box_target_starts,
        tree.box_target_counts_nonchild)

    filtered = built_lists["filt_tgt_list"]
    filtered_lists = FilteredTargetListsInUserOrder(
        nfiltered_targets=filtered.count,
        target_starts=filtered.starts,
        target_lists=filtered.lists,
    )
    return filtered_lists.with_queue(None)
@memoize_method
def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype):
from boxtree.tree_build_kernels import (
TREE_ORDER_TARGET_FILTER_INDEX_TPL,
TREE_ORDER_TARGET_FILTER_SCAN_TPL,
)
box_target_starts_filtered = \
cl.array.empty_like(tree.box_target_starts)
box_target_counts_nonchild_filtered = \
cl.array.empty_like(tree.box_target_counts_nonchild)
scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build(
self.context,
type_aliases=(
("scan_t", particle_id_dtype),
("particle_id_t", particle_id_dtype),
),
)
index_knl(
# input
tree.box_target_starts,
tree.box_target_counts_nonchild,
filtered_from_unfiltered_target_indices,
tree.ntargets,
nfiltered_targets,
index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build(
self.context,
type_aliases=(
("particle_id_t", particle_id_dtype),
),
)
# output
box_target_starts_filtered,
box_target_counts_nonchild_filtered,
return scan_knl, index_knl
queue=queue)
def filter_target_lists_in_tree_order(self, queue, tree, flags):
"""
:arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
:class:`numpy.int8` objects, which indicate by being zero that the
corresponding target (in user target order) is not part of the
filtered list, or by being nonzero that it is.
:returns: A :class:`FilteredTargetListsInTreeOrder`
"""
tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8)
tree_order_flags[tree.sorted_target_ids] = flags
filtered_from_unfiltered_target_indices = cl.array.empty(
queue, tree.ntargets, tree.particle_id_dtype)
unfiltered_from_filtered_target_indices = cl.array.empty(
queue, tree.ntargets, tree.particle_id_dtype)
nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype)
scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels(
tree.particle_id_dtype)
scan_knl(tree_order_flags,
filtered_from_unfiltered_target_indices,
unfiltered_from_filtered_target_indices,
nfiltered_targets,
queue=queue)
nfiltered_targets = int(nfiltered_targets.get().item())
unfiltered_from_filtered_target_indices = \
unfiltered_from_filtered_target_indices[:nfiltered_targets]
from pytools.obj_array import make_obj_array
filtered_targets = make_obj_array([
targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices]
for targets_i in tree.targets
])
return FilteredTargetListsInTreeOrder(
nfiltered_targets=nfiltered_targets,
box_target_starts=box_target_starts_filtered,
box_target_counts_nonchild=box_target_counts_nonchild_filtered,
unfiltered_from_filtered_target_indices=(
unfiltered_from_filtered_target_indices),
targets=filtered_targets,
).with_queue(None)
box_target_starts_filtered = \
cl.array.empty_like(tree.box_target_starts)
box_target_counts_nonchild_filtered = \
cl.array.empty_like(tree.box_target_counts_nonchild)
index_knl(
# input
tree.box_target_starts,
tree.box_target_counts_nonchild,
filtered_from_unfiltered_target_indices,
tree.ntargets,
nfiltered_targets,
# output
box_target_starts_filtered,
box_target_counts_nonchild_filtered,
queue=queue)
return FilteredTargetListsInTreeOrder(
nfiltered_targets=nfiltered_targets,
box_target_starts=box_target_starts_filtered,
box_target_counts_nonchild=box_target_counts_nonchild_filtered,
unfiltered_from_filtered_target_indices=(
unfiltered_from_filtered_target_indices),
targets=filtered_targets,
).with_queue(None)
# }}}
# {{{ filter_target_lists_in_*_order
def filter_target_lists_in_user_order(queue, tree, flags):
    """
    Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_user_order`.

    Emits a :class:`DeprecationWarning` and forwards to a
    :class:`ParticleListFilter` created on ``queue.context``.
    """
    import warnings

    deprecation_message = (
        "filter_target_lists_in_user_order() is deprecated and will go "
        "away in a future release. Use "
        "ParticleListFilter.filter_target_lists_in_user_order() instead.")
    # stacklevel=2 attributes the warning to the caller of this wrapper.
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    particle_filter = ParticleListFilter(queue.context)
    return particle_filter.filter_target_lists_in_user_order(queue, tree, flags)
def filter_target_lists_in_tree_order(queue, tree, flags):
    """
    Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_tree_order`.

    Emits a :class:`DeprecationWarning` and forwards to a
    :class:`ParticleListFilter` created on ``queue.context``.
    """
    import warnings

    deprecation_message = (
        "filter_target_lists_in_tree_order() is deprecated and will go "
        "away in a future release. Use "
        "ParticleListFilter.filter_target_lists_in_tree_order() instead.")
    # stacklevel=2 attributes the warning to the caller of this wrapper.
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    particle_filter = ParticleListFilter(queue.context)
    return particle_filter.filter_target_lists_in_tree_order(queue, tree, flags)
# }}}
# vim: filetype=pyopencl:fdm=marker
from __future__ import division, absolute_import
"""
.. currentmodule:: boxtree
Building Particle-Based Trees
-----------------------------
These functions produce instances of the particle-based :class:`Tree`.
.. note::
These functions currently keep their bulk data in
:class:`pyopencl.array.Array` instances. This contrasts with the box-based
tree (:class:`TreeOfBoxes`), which operates on data in :class:`numpy.ndarray`
instances. Along with the rest of :mod:`boxtree`, both will migrate to
:mod:`arraycontext` in the future.
.. autoclass:: TreeBuilder
"""
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__copyright__ = """
Copyright (C) 2012 Andreas Kloeckner
Copyright (C) 2022 University of Illinois Board of Trustees
"""
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
......@@ -23,20 +43,34 @@ THE SOFTWARE.
"""
from six.moves import range, zip
import logging
from functools import partial
from itertools import pairwise
import numpy as np
from pytools import memoize_method
import pyopencl as cl
import pyopencl.array # noqa
from functools import partial
import pyopencl.array
from pytools import DebugProcessLogger, ProcessLogger, memoize_method
from boxtree.tree import Tree
import logging
logger = logging.getLogger(__name__)
class TreeBuilder(object):
class MaxLevelsExceeded(RuntimeError):  # noqa: N818
    """A :class:`RuntimeError` subclass defined alongside the tree builder.

    NOTE(review): judging by the name, this is presumably raised when tree
    construction would need more levels than are supported -- confirm
    against the raise site.
    """
# {{{ tree builder
class TreeBuilder:
"""
.. automethod:: __init__
.. automethod:: __call__
"""
def __init__(self, context):
"""
:arg context: A :class:`pyopencl.Context`.
......@@ -50,35 +84,48 @@ class TreeBuilder(object):
# This is used to map box IDs and compress box lists in empty leaf
# pruning.
from boxtree.tools import GappyCopyAndMapKernel
from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel
self.gappy_copy_and_map = GappyCopyAndMapKernel(self.context)
self.map_values_kernel = MapValuesKernel(self.context)
morton_nr_dtype = np.dtype(np.int8)
box_level_dtype = np.dtype(np.uint8)
ROOT_EXTENT_STRETCH_FACTOR = 1e-4
@memoize_method
def get_kernel_info(self, dimensions, coord_dtype,
particle_id_dtype, box_id_dtype,
sources_are_targets, srcntgts_have_extent,
stick_out_factor, adaptive):
sources_are_targets, srcntgts_extent_norm,
kind):
from boxtree.tree_build_kernels import get_tree_build_kernel_info
return get_tree_build_kernel_info(self.context, dimensions, coord_dtype,
particle_id_dtype, box_id_dtype,
sources_are_targets, srcntgts_have_extent,
stick_out_factor, self.morton_nr_dtype, self.box_level_dtype,
adaptive=adaptive)
sources_are_targets, srcntgts_extent_norm,
self.morton_nr_dtype, self.box_level_dtype,
kind=kind)
# {{{ run control
def __call__(self, queue, particles, max_particles_in_box,
allocator=None, debug=False, targets=None,
source_radii=None, target_radii=None, stick_out_factor=0.25,
wait_for=None, non_adaptive=False,
def __call__(self, queue, particles, kind="adaptive",
max_particles_in_box=None, allocator=None, debug=False,
targets=None, source_radii=None, target_radii=None,
stick_out_factor=None, refine_weights=None,
max_leaf_refine_weight=None, wait_for=None,
extent_norm=None, bbox=None,
**kwargs):
"""
:arg queue: a :class:`pyopencl.CommandQueue` instance
:arg particles: an object array of (XYZ) point coordinate arrays.
:arg kind: One of the following strings:
- 'adaptive'
- 'adaptive-level-restricted'
- 'non-adaptive'
'adaptive' requests an adaptive tree without level restriction. See
:ref:`tree-kinds` for further explanation.
:arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
If ``None``, *particles* act as targets, too.
Must have the same (inner) dtype as *particles*.
......@@ -90,12 +137,32 @@ class TreeBuilder(object):
:arg target_radii: Like *source_radii*, but for targets.
:arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
:arg refine_weights: If not *None*, a :class:`pyopencl.array.Array` of the
type :class:`numpy.int32`. A box will be split if it has a cumulative
refine_weight greater than *max_leaf_refine_weight*. If this is given,
*max_leaf_refine_weight* must also be given and *max_particles_in_box*
must be *None*.
:arg max_leaf_refine_weight: If not *None*, specifies the maximum weight
of a leaf box.
:arg max_particles_in_box: If not *None*, specifies the maximum number
of particles in a leaf box. If this is given, both
*refine_weights* and *max_leaf_refine_weight* must be *None*.
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
instances for whose completion this command waits before starting
exeuction.
:arg non_adaptive: If *True*, return a tree in which all leaf boxes are
on the same (last) level. The tree is pruned, in the sense that empty
boxes have been eliminated.
execution.
:arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect
to which particle stick-out is measured. See :attr:`Tree.extent_norm`.
:arg bbox: Bounding box of either type:
1. A dim-by-2 array, with each row to be [min, max] coordinates
in its corresponding axis direction.
2. (Internal use only) of the same type as returned by
*boxtree.bounding_box.make_bounding_box_dtype*.
When given, this bounding box is used for tree
building. Otherwise, the bounding box is determined from particles
in such a way that it is square and is slightly larger at the top (so
that scaled coordinates are always < 1).
When supplied, the bounding box must be square and have all the
particles in its closure.
:arg kwargs: Used internally for debugging.
:returns: a tuple ``(tree, event)``, where *tree* is an instance of
......@@ -105,11 +172,11 @@ class TreeBuilder(object):
# {{{ input processing
if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]:
raise ValueError(f"unknown tree kind '{kind}'")
# we'll modify this below, so copy it
if wait_for is None:
wait_for = []
else:
wait_for = list(wait_for)
wait_for = [] if wait_for is None else list(wait_for)
dimensions = len(particles)
......@@ -119,9 +186,21 @@ class TreeBuilder(object):
sources_are_targets = targets is None
sources_have_extent = source_radii is not None
targets_have_extent = target_radii is not None
if extent_norm is None:
extent_norm = "linf"
if extent_norm not in ["linf", "l2"]:
raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}")
srcntgts_extent_norm = extent_norm
srcntgts_have_extent = sources_have_extent or targets_have_extent
if not srcntgts_have_extent:
srcntgts_extent_norm = None
if srcntgts_have_extent and targets is None:
del extent_norm
if srcntgts_extent_norm and targets is None:
raise ValueError("must specify targets when specifying "
"any kind of radii")
......@@ -153,20 +232,37 @@ class TreeBuilder(object):
raise TypeError("dtypes of coordinate arrays and "
"target_radii must agree")
if sources_have_extent or targets_have_extent:
if stick_out_factor is None:
raise ValueError("if sources or targets have extent, "
"stick_out_factor must be explicitly specified")
else:
stick_out_factor = 0
# }}}
empty = partial(cl.array.empty, queue, allocator=allocator)
def zeros(shape, dtype):
result = (cl.array.empty(queue, shape, dtype, allocator=allocator)
.fill(0, wait_for=wait_for))
event, = result.events
result = cl.array.zeros(queue, shape, dtype, allocator=allocator)
if result.events:
event, = result.events
else:
from numbers import Number
if isinstance(shape, Number):
shape = (shape,)
from pytools import product
assert product(shape) == 0
event = cl.enqueue_marker(queue)
return result, event
knl_info = self.get_kernel_info(dimensions, coord_dtype,
particle_id_dtype, box_id_dtype,
sources_are_targets, srcntgts_have_extent,
stick_out_factor, adaptive=not non_adaptive)
sources_are_targets, srcntgts_extent_norm,
kind=kind)
logger.debug("tree build: start")
# {{{ combine sources and targets into one array, if necessary
......@@ -176,7 +272,13 @@ class TreeBuilder(object):
# Targets weren't specified. Sources are also targets. Let's
# call them "srcntgts".
srcntgts = particles
if isinstance(particles, np.ndarray) and particles.dtype.char == "O":
srcntgts = particles
else:
from pytools.obj_array import make_obj_array
srcntgts = make_obj_array([
p.with_queue(queue).copy() for p in particles
])
assert source_radii is None
assert target_radii is None
......@@ -197,10 +299,7 @@ class TreeBuilder(object):
"dtype")
def combine_srcntgt_arrays(ary1, ary2=None):
if ary2 is None:
dtype = ary1.dtype
else:
dtype = ary2.dtype
dtype = ary1.dtype if ary2 is None else ary2.dtype
result = empty(nsrcntgts, dtype)
if (ary1 is None) or (ary2 is None):
......@@ -217,7 +316,7 @@ class TreeBuilder(object):
from pytools.obj_array import make_obj_array
srcntgts = make_obj_array([
combine_srcntgt_arrays(src_i, tgt_i)
for src_i, tgt_i in zip(particles, targets)
for src_i, tgt_i in zip(particles, targets, strict=True)
])
if srcntgts_have_extent:
......@@ -239,28 +338,108 @@ class TreeBuilder(object):
# }}}
# {{{ process refine_weights
from boxtree.tree_build_kernels import refine_weight_dtype
specified_max_particles_in_box = max_particles_in_box is not None
specified_refine_weights = refine_weights is not None and \
max_leaf_refine_weight is not None
if specified_max_particles_in_box and specified_refine_weights:
raise ValueError("may only specify one of max_particles_in_box and "
"refine_weights/max_leaf_refine_weight")
elif not specified_max_particles_in_box and not specified_refine_weights:
raise ValueError("must specify either max_particles_in_box or "
"refine_weights/max_leaf_refine_weight")
elif specified_max_particles_in_box:
refine_weights = (
cl.array.empty(
queue, nsrcntgts, refine_weight_dtype, allocator=allocator)
.fill(1))
event, = refine_weights.events
prep_events.append(event)
max_leaf_refine_weight = max_particles_in_box
elif specified_refine_weights: # noqa: SIM102
if refine_weights.dtype != refine_weight_dtype:
raise TypeError(
f"refine_weights must have dtype '{refine_weight_dtype}'")
if max_leaf_refine_weight < cl.array.max(refine_weights).get():
raise ValueError(
"entries of refine_weights cannot exceed max_leaf_refine_weight")
if cl.array.min(refine_weights).get() < 0:
raise ValueError("all entries of refine_weights must be nonnegative")
if max_leaf_refine_weight <= 0:
raise ValueError("max_leaf_refine_weight must be positive")
total_refine_weight = cl.array.sum(
refine_weights, dtype=np.dtype(np.int64)).get()
del max_particles_in_box
del specified_max_particles_in_box
del specified_refine_weights
# }}}
# {{{ find and process bounding box
bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
bbox = bbox.get()
if bbox is None:
bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
bbox = bbox.get()
root_extent = max(
root_extent = max(
bbox["max_"+ax] - bbox["min_"+ax]
for ax in axis_names) * (1+1e-4)
for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR)
# make bbox square and slightly larger at the top, to ensure scaled
# coordinates are always < 1
bbox_min = np.empty(dimensions, coord_dtype)
for i, ax in enumerate(axis_names):
bbox_min[i] = bbox["min_"+ax]
bbox_max = bbox_min + root_extent
for i, ax in enumerate(axis_names):
bbox["max_"+ax] = bbox_max[i]
else:
# Validate that bbox is a superset of particle-derived bbox
bbox_auto, _ = self.bbox_finder(
srcntgts, srcntgt_radii, wait_for=wait_for)
bbox_auto = bbox_auto.get()
# Convert unstructured numpy array to bbox_type
if isinstance(bbox, np.ndarray):
if len(bbox) == dimensions:
bbox_bak = bbox.copy()
bbox = np.empty(1, bbox_auto.dtype)
for i, ax in enumerate(axis_names):
bbox["min_"+ax] = bbox_bak[i][0]
bbox["max_"+ax] = bbox_bak[i][1]
else:
assert len(bbox) == 1
else:
raise NotImplementedError("Unsupported bounding box type: "
+ str(type(bbox)))
# make bbox square and slightly larger at the top, to ensure scaled
# coordinates are always < 1
bbox_min = np.empty(dimensions, coord_dtype)
for i, ax in enumerate(axis_names):
bbox_min[i] = bbox["min_"+ax]
# bbox must cover bbox_auto
bbox_min = np.empty(dimensions, coord_dtype)
bbox_max = np.empty(dimensions, coord_dtype)
bbox_max = bbox_min + root_extent
for i, ax in enumerate(axis_names):
bbox["max_"+ax] = bbox_max[i]
for i, ax in enumerate(axis_names):
bbox_min[i] = bbox["min_" + ax]
bbox_max[i] = bbox["max_" + ax]
assert bbox_min[i] < bbox_max[i]
assert bbox_min[i] <= bbox_auto["min_" + ax]
assert bbox_max[i] >= bbox_auto["max_" + ax]
# }}}
# bbox must be a square
bbox_exts = bbox_max - bbox_min
for ext in bbox_exts:
assert abs(ext - bbox_exts[0]) < 1e-15
from pytools import div_ceil
root_extent = bbox_exts[0]
# }}}
# {{{ allocate data
......@@ -282,26 +461,37 @@ class TreeBuilder(object):
prep_events.append(evt)
srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
prep_events.append(evt)
split_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
prep_events.append(evt)
# number of boxes total, and a guess
nboxes_dev = empty((), dtype=box_id_dtype)
nboxes_dev.fill(1)
# /!\ If you're allocating an array here that depends on nboxes_guess,
# you *must* also write reallocation code down below for the case when
# nboxes_guess was too low.
# Outside nboxes_guess feeding is solely for debugging purposes,
# to test the reallocation code.
nboxes_guess = kwargs.get("nboxes_guess")
if nboxes_guess is None:
nboxes_guess = div_ceil(nsrcntgts, max_particles_in_box) * 2**dimensions
nboxes_guess = 2**dimensions * (
(max_leaf_refine_weight + total_refine_weight - 1)
// max_leaf_refine_weight)
assert nboxes_guess > 0
# /!\ IMPORTANT
#
# If you're allocating an array here that depends on nboxes_guess, or if
# your array contains box numbers, you have to write code for the
# following down below as well:
#
# * You *must* write reallocation code to handle box renumbering and
# reallocation triggered at the top of the level loop.
#
# * If your array persists after the level loop, you *must* write code
# to handle box renumbering and reallocation triggered by the box
# pruning step.
split_box_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
prep_events.append(evt)
# per-box morton bin counts
box_morton_bin_counts = empty(nboxes_guess,
dtype=knl_info.morton_bin_count_dtype)
box_morton_bin_counts, evt = zeros(nboxes_guess,
dtype=knl_info.morton_bin_count_dtype)
prep_events.append(evt)
# particle# at which each box starts
box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
......@@ -311,9 +501,22 @@ class TreeBuilder(object):
box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
prep_events.append(evt)
# morton nr identifier {quadr,oct}ant of parent in which this box was created
box_morton_nrs, evt = zeros(nboxes_guess, dtype=self.morton_nr_dtype)
prep_events.append(evt)
# pointer to child box, by morton number
box_child_ids, evts = zip(
*(zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions)),
strict=True)
prep_events.extend(evts)
# box centers, by dimension
box_centers, evts = zip(
*(zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions)),
strict=True)
prep_events.extend(evts)
# Initialize box_centers[0] to contain the root box's center
for d, (ax, evt) in enumerate(zip(axis_names, evts, strict=True)):
center_ax = bbox["min_"+ax] + (bbox["max_"+ax] - bbox["min_"+ax]) / 2
box_centers[d][0].fill(center_ax, wait_for=[evt])
# box -> level map
box_levels, evt = zeros(nboxes_guess, self.box_level_dtype)
......@@ -324,15 +527,39 @@ class TreeBuilder(object):
box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
prep_events.append(evt)
# Initialize box 0 to contain all particles
evt = box_srcntgt_counts_cumul[0].fill(
# Initialize box 0 to contain all particles
box_srcntgt_counts_cumul[0].fill(
nsrcntgts, queue=queue, wait_for=[evt])
# box -> whether the box has a child. FIXME: use smaller integer type
box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32))
prep_events.append(evt)
# box -> whether the box needs a splitting to enforce level restriction.
# FIXME: use smaller integer type
force_split_box, evt = zeros(nboxes_guess
if knl_info.level_restrict
else 0, dtype=np.dtype(np.int32))
prep_events.append(evt)
# set parent of root box to itself
evt = cl.enqueue_copy(
queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype))
prep_events.append(evt)
# 2*(num bits in the significand)
# https://gitlab.tiker.net/inducer/boxtree/issues/23
nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1)
assert nlevels_max <= np.iinfo(self.box_level_dtype).max
# level -> starting box on level
level_start_box_nrs_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
prep_events.append(evt)
# level -> number of used boxes on level
level_used_box_counts_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
prep_events.append(evt)
# }}}
def fin_debug(s):
......@@ -345,48 +572,81 @@ class TreeBuilder(object):
have_oversize_split_box, evt = zeros((), np.int32)
prep_events.append(evt)
# True if and only if the level restrict kernel found a box to split in
# order to enforce level restriction.
have_upper_level_split_box, evt = zeros((), np.int32)
prep_events.append(evt)
wait_for = prep_events
# {{{ level loop
from pytools import div_ceil
# {{{ level loop
# Level 0 starts at 0 and always contains box 0 and nothing else.
# Level 1 therefore starts at 1.
level_start_box_nrs = [0, 1]
level_start_box_nrs_dev[0] = 0
level_start_box_nrs_dev[1] = 1
wait_for.extend(level_start_box_nrs_dev.events)
from time import time
start_time = time()
if nsrcntgts > max_particles_in_box:
level = 1
else:
level = 0
# This counts the number of boxes that have been used per level. Note
# that this could be fewer than the actual number of boxes allocated to
# the level (in the case of building a level restricted tree, more boxes
# are pre-allocated for a level than used since we may decide to split
# parent level boxes later).
level_used_box_counts = [1]
level_used_box_counts_dev[0] = 1
wait_for.extend(level_used_box_counts_dev.events)
# level -> number of leaf boxes on level. Initially the root node is a
# leaf.
level_leaf_counts = np.array([1])
tree_build_proc = ProcessLogger(logger, "tree build")
level = 1 if total_refine_weight > max_leaf_refine_weight else 0
# INVARIANTS -- Upon entry to this loop:
#
# - level is the level being built.
# - the last entry of level_start_box_nrs is the beginning of the level
# to be built
# - the last entry of level_used_box_counts is the number of boxes that
# are used (not just allocated) at the previous level
# This while condition prevents entering the loop in case there's just a
# single box, by how 'level' is set above. Read this as 'while True' with
# an edge case.
logger.debug("entering level loop with %s srcntgts" % nsrcntgts)
level_loop_proc = DebugProcessLogger(logger, "tree build level loop")
# When doing level restriction, the level loop may need to be entered
# one more time after creating all the levels (see fixme note below
# regarding this). This flag is set to True when that happens.
final_level_restrict_iteration = False
while level:
if debug:
# More invariants:
assert level == len(level_start_box_nrs) - 1
assert level == len(level_used_box_counts)
assert level == len(level_leaf_counts)
if level > np.iinfo(self.box_level_dtype).max:
raise RuntimeError("level count exceeded maximum")
if level + 1 >= nlevels_max: # level is zero-based
raise MaxLevelsExceeded("Level count exceeded number of significant "
"bits in coordinate dtype. That means that a large number "
"of particles was indistinguishable up to floating point "
"precision (because they ended up in the same box).")
common_args = ((morton_bin_counts, morton_nrs,
box_start_flags, srcntgt_box_ids, split_box_ids,
box_start_flags,
srcntgt_box_ids, split_box_ids,
box_morton_bin_counts,
refine_weights,
max_leaf_refine_weight,
box_srcntgt_starts, box_srcntgt_counts_cumul,
box_parent_ids, box_morton_nrs,
nboxes_dev,
level, max_particles_in_box, bbox,
box_parent_ids, box_levels,
level, bbox,
user_srcntgt_ids)
+ tuple(srcntgts)
+ ((srcntgt_radii,) if srcntgts_have_extent else ())
......@@ -394,153 +654,560 @@ class TreeBuilder(object):
fin_debug("morton count scan")
# writes: box_morton_bin_counts, morton_nrs
morton_count_args = common_args
if srcntgts_have_extent:
morton_count_args += (stick_out_factor,)
# writes: box_morton_bin_counts
evt = knl_info.morton_count_scan(
*common_args, queue=queue, size=nsrcntgts,
*morton_count_args, queue=queue, size=nsrcntgts,
wait_for=wait_for)
wait_for = [evt]
fin_debug("split box id scan")
# writes: nboxes_dev, split_box_ids
# writes: box_has_children, split_box_ids
evt = knl_info.split_box_id_scan(
srcntgt_box_ids,
box_srcntgt_starts,
box_srcntgt_counts_cumul,
max_particles_in_box,
box_morton_bin_counts,
refine_weights,
max_leaf_refine_weight,
box_levels,
level_start_box_nrs_dev,
level_used_box_counts_dev,
force_split_box,
level,
# input/output:
nboxes_dev,
# output:
box_has_children,
split_box_ids,
queue=queue, size=nsrcntgts, wait_for=wait_for)
have_oversize_split_box,
queue=queue,
size=level_start_box_nrs[level],
wait_for=wait_for)
wait_for = [evt]
nboxes_new = int(nboxes_dev.get())
# {{{ compute new level_used_box_counts, level_leaf_counts
# The last split_box_id on each level tells us how many boxes are
# needed at the next level.
new_level_used_box_counts = [1]
for level_start_box_id in level_start_box_nrs[1:]:
last_box_on_prev_level = level_start_box_id - 1
new_level_used_box_counts.append(
# FIXME: Get this all at once.
int(split_box_ids[last_box_on_prev_level].get())
- level_start_box_id)
# New leaf count =
# old leaf count
# + nr. new boxes from splitting parent's leaves
# - nr. new boxes from splitting current level's leaves / 2**d
level_used_box_counts_diff = (new_level_used_box_counts
- np.append(level_used_box_counts, [0]))
new_level_leaf_counts = (level_leaf_counts
+ level_used_box_counts_diff[:-1]
- level_used_box_counts_diff[1:] // 2 ** dimensions)
new_level_leaf_counts = np.append(
new_level_leaf_counts,
[level_used_box_counts_diff[-1]])
del level_used_box_counts_diff
# }}}
# Assumption: Everything between here and the top of the loop must
# be repeatable, so that in an out-of-memory situation, we can just
# rerun this bit of the code after reallocating and a minimal reset
# procedure.
# {{{ reallocate and retry if nboxes_guess was too small
# The algorithm for deciding on level sizes is as follows:
# 1. Compute the minimal necessary size of each level, including the
# new level being created.
# 2. If level restricting, add padding to the new level being created.
# 3. Check if there is enough existing space for each level.
# 4. If any level does not have sufficient space, reallocate all levels:
# 4a. Compute new sizes of upper levels
# 4b. If level restricting, add padding to all levels.
curr_upper_level_lengths = np.diff(level_start_box_nrs)
minimal_upper_level_lengths = np.max(
[new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0)
minimal_new_level_length = new_level_used_box_counts[-1]
# Allocate extra space at the end of the current level for higher
# level leaves that may be split later.
#
# If there are no further levels to split (i.e.
# have_oversize_split_box = 0), then we do not need to allocate any
# extra space, since no new leaves can be created at the bottom
# level.
if knl_info.level_restrict and have_oversize_split_box.get():
# Currently undocumented.
lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1)
minimal_new_level_length += sum(
2**(lev*dimensions) * new_level_leaf_counts[level - lev]
for lev in range(1, 1 + min(level, lr_lookbehind_levels)))
nboxes_minimal = \
sum(minimal_upper_level_lengths) + minimal_new_level_length
needs_renumbering = \
(curr_upper_level_lengths < minimal_upper_level_lengths).any()
# {{{ prepare for reallocation/renumbering
if needs_renumbering:
assert knl_info.level_restrict
# {{{ compute new level_start_box_nrs
# Represents the amount of padding needed for upper levels.
upper_level_padding = np.zeros(level, dtype=int)
# Recompute the level padding.
for ulevel in range(level):
upper_level_padding[ulevel] = sum(
2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev]
for lev in range(
1, 1 + min(ulevel, lr_lookbehind_levels)))
new_upper_level_unused_box_counts = np.max(
[upper_level_padding,
minimal_upper_level_lengths - new_level_used_box_counts[:-1]],
axis=0)
new_level_start_box_nrs = np.empty(level + 1, dtype=int)
new_level_start_box_nrs[0] = 0
new_level_start_box_nrs[1:] = np.cumsum(
new_level_used_box_counts[:-1]
+ new_upper_level_unused_box_counts)
assert not (level_start_box_nrs == new_level_start_box_nrs).all()
# }}}
# {{{ set up reallocators
old_box_count = level_start_box_nrs[-1]
# Where should I put this box?
dst_box_id = cl.array.empty(queue,
shape=old_box_count, dtype=box_id_dtype)
for level_start, new_level_start, level_len in zip(
level_start_box_nrs[:-1],
new_level_start_box_nrs[:-1],
curr_upper_level_lengths, strict=True):
dst_box_id[level_start:level_start + level_len] = \
cl.array.arange(queue,
new_level_start,
new_level_start + level_len,
dtype=box_id_dtype)
wait_for.extend(dst_box_id.events)
realloc_array = partial(self.gappy_copy_and_map,
dst_indices=dst_box_id, range=slice(old_box_count),
debug=debug)
realloc_and_renumber_array = partial(self.gappy_copy_and_map,
dst_indices=dst_box_id, map_values=dst_box_id,
range=slice(old_box_count), debug=debug)
renumber_array = partial(self.map_values_kernel, dst_box_id)
# }}}
# Update level_start_box_nrs. This will be the
# level_start_box_nrs for the reallocated data.
level_start_box_nrs = list(new_level_start_box_nrs)
level_start_box_nrs_dev[:level + 1] = \
np.array(new_level_start_box_nrs, dtype=box_id_dtype)
level_start_box_nrs_updated = True
wait_for.extend(level_start_box_nrs_dev.events)
nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length
del new_level_start_box_nrs
else:
from boxtree.tools import realloc_array
realloc_and_renumber_array = realloc_array
renumber_array = None
level_start_box_nrs_updated = False
nboxes_new = nboxes_minimal
del nboxes_minimal
if nboxes_new > nboxes_guess:
# }}}
# {{{ reallocate and/or renumber boxes if necessary
if level_start_box_nrs_updated or nboxes_new > nboxes_guess:
fin_debug("starting nboxes_guess increase")
while nboxes_guess < nboxes_new:
nboxes_guess *= 2
from boxtree.tools import realloc_array
my_realloc = partial(realloc_array, new_shape=nboxes_guess,
zero_fill=False, queue=queue, wait_for=wait_for)
my_realloc_zeros = partial(realloc_array, new_shape=nboxes_guess,
zero_fill=True, queue=queue, wait_for=wait_for)
def my_realloc_nocopy(ary, shape=nboxes_guess):
    """Return a new, uninitialized array of *shape* with the dtype of *ary*.

    Nothing is copied from *ary*; only its dtype is consulted. *shape*
    defaults to the value ``nboxes_guess`` had when this helper was defined.
    """
    return cl.array.empty(
        queue, shape=shape, dtype=ary.dtype, allocator=allocator)
def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
    """Return ``(array, event)`` for a new zero-filled array of *shape*.

    The new array takes its dtype from *ary* but none of its contents.
    The returned event is the first entry of the new array's ``events``
    list. *shape* defaults to the value ``nboxes_guess`` had when this
    helper was defined.
    """
    zeroed = cl.array.zeros(
        queue, shape=shape, dtype=ary.dtype, allocator=allocator)
    first_event = zeroed.events[0]
    return zeroed, first_event
my_realloc = partial(realloc_array,
queue, allocator, nboxes_guess, wait_for=wait_for)
my_realloc_zeros = partial(realloc_array,
queue, allocator, nboxes_guess, zero_fill=True,
wait_for=wait_for)
my_realloc_zeros_and_renumber = partial(realloc_and_renumber_array,
queue, allocator, nboxes_guess, zero_fill=True,
wait_for=wait_for)
resize_events = []
box_morton_bin_counts, evt = my_realloc(box_morton_bin_counts)
split_box_ids = my_realloc_nocopy(split_box_ids)
# *Most*, but not *all* of the values in this array are
# rewritten when the morton scan is redone. Specifically,
# only the box morton bin counts of boxes on the level
# currently being processed are written, but we need to
# retain the box morton bin counts from the higher levels.
box_morton_bin_counts, evt = my_realloc_zeros(
box_morton_bin_counts)
resize_events.append(evt)
# force_split_box is unused unless level restriction is enabled.
if knl_info.level_restrict:
force_split_box, evt = my_realloc_zeros(force_split_box)
resize_events.append(evt)
box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts)
resize_events.append(evt)
box_parent_ids, evt = my_realloc_zeros(box_parent_ids)
resize_events.append(evt)
box_morton_nrs, evt = my_realloc_zeros(box_morton_nrs)
resize_events.append(evt)
box_levels, evt = my_realloc_zeros(box_levels)
resize_events.append(evt)
box_srcntgt_counts_cumul, evt = \
my_realloc_zeros(box_srcntgt_counts_cumul)
resize_events.append(evt)
del my_realloc
del my_realloc_zeros
box_has_children, evt = my_realloc_zeros(box_has_children)
resize_events.append(evt)
box_centers, evts = zip(
*(my_realloc(ary) for ary in box_centers), strict=True)
resize_events.extend(evts)
# reset nboxes_dev to previous value
nboxes_dev.fill(level_start_box_nrs[-1])
box_child_ids, evts = zip(
*(my_realloc_zeros_and_renumber(ary)
for ary in box_child_ids), strict=True)
resize_events.extend(evts)
box_parent_ids, evt = my_realloc_zeros_and_renumber(box_parent_ids)
resize_events.append(evt)
wait_for = resize_events
if not level_start_box_nrs_updated:
box_levels, evt = my_realloc(box_levels)
resize_events.append(evt)
else:
box_levels, evt = my_realloc_zeros_nocopy(box_levels)
cl.wait_for_events([evt])
for box_level, (level_start, level_end) in enumerate(
pairwise(level_start_box_nrs)):
box_levels[level_start:level_end].fill(box_level)
resize_events.extend(box_levels.events)
if level_start_box_nrs_updated:
srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids)
resize_events.append(evt)
del my_realloc_zeros
del my_realloc_nocopy
del my_realloc_zeros_nocopy
del renumber_array
# Can't del on Py2.7 - these are used in generator expressions
# above, which are nested scopes
my_realloc = None
my_realloc_zeros_and_renumber = None
# retry
logger.info("nboxes_guess exceeded: "
"enlarged allocations, restarting level")
"enlarged allocations, restarting level")
continue
# }}}
logger.info("LEVEL %d -> %d boxes" % (level, nboxes_new))
logger.debug("LEVEL %d -> %d boxes", level, nboxes_new)
assert level_start_box_nrs[-1] != nboxes_new or srcntgts_have_extent
assert (
level_start_box_nrs[-1] != nboxes_new
or srcntgts_have_extent
or final_level_restrict_iteration)
if level_start_box_nrs[-1] == nboxes_new:
# We haven't created new boxes in this level loop trip. Unless
# srcntgts have extent, this should never happen. (I.e., we
# should've never entered this loop trip.)
# We haven't created new boxes in this level loop trip.
#
# If srcntgts have extent, this can happen if boxes were
# in-principle overfull, but couldn't subdivide because of
# extent restrictions.
if srcntgts_have_extent and not final_level_restrict_iteration:
level -= 1
break
assert final_level_restrict_iteration
assert srcntgts_have_extent
# {{{ update level_start_box_nrs, level_used_box_counts
level -= 1
level_start_box_nrs.append(nboxes_new)
level_start_box_nrs_dev[level + 1].fill(nboxes_new)
wait_for.extend(level_start_box_nrs_dev.events)
logger.debug("no new boxes created this loop trip")
break
level_used_box_counts = new_level_used_box_counts
level_used_box_counts_dev[:level + 1] = \
np.array(level_used_box_counts, dtype=box_id_dtype)
wait_for.extend(level_used_box_counts_dev.events)
level_leaf_counts = new_level_leaf_counts
if debug:
for level_start, level_nboxes, leaf_count in zip(
level_start_box_nrs[:-1],
level_used_box_counts,
level_leaf_counts, strict=True):
if level_nboxes == 0:
assert leaf_count == 0
continue
nleaves_actual = level_nboxes - int(
cl.array.sum(box_has_children[
level_start:level_start + level_nboxes]).get())
assert leaf_count == nleaves_actual
# Can't del in Py2.7 - see note below
new_level_leaf_counts = None
# }}}
level_start_box_nrs.append(nboxes_new)
del nboxes_new
del new_level_used_box_counts
new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
split_and_sort_args = (
common_args
+ (new_user_srcntgt_ids, have_oversize_split_box,
new_srcntgt_box_ids, box_levels))
# {{{ split boxes
fin_debug("split and sort")
box_splitter_args = (
*common_args,
box_has_children,
force_split_box,
root_extent,
*box_child_ids,
*box_centers)
evt = knl_info.split_and_sort_kernel(*split_and_sort_args,
evt = knl_info.box_splitter_kernel(*box_splitter_args,
range=slice(level_start_box_nrs[-1]),
wait_for=wait_for)
wait_for = [evt]
fin_debug("box splitter")
# Mark the levels of boxes added for padding (these were not updated
# by the box splitter kernel).
last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1]
box_levels[last_used_box:level_start_box_nrs[-1]].fill(level)
wait_for.extend(box_levels.events)
if debug:
box_levels.finish()
level_bl_chunk = box_levels.get()[
level_start_box_nrs[-2]:level_start_box_nrs[-1]]
assert ((level_bl_chunk == level) | (level_bl_chunk == 0)).all()
assert (level_bl_chunk == level).all()
del level_bl_chunk
if debug:
assert (box_srcntgt_starts.get() < nsrcntgts).all()
# }}}
# {{{ renumber particles within split boxes
new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
particle_renumberer_args = (
*common_args,
box_has_children,
force_split_box,
new_user_srcntgt_ids,
new_srcntgt_box_ids)
evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args,
range=slice(nsrcntgts), wait_for=wait_for)
wait_for = [evt]
fin_debug("particle renumbering")
user_srcntgt_ids = new_user_srcntgt_ids
del new_user_srcntgt_ids
srcntgt_box_ids = new_srcntgt_box_ids
del new_srcntgt_box_ids
# }}}
# {{{ enforce level restriction on upper levels
if final_level_restrict_iteration:
# Roll back level update.
#
# FIXME: The extra iteration at the end to split boxes should
# not be necessary. Instead, all the work for the final box
# split should be done in the last iteration of the level
# loop. Currently the main issue that forces the extra iteration
# to be there is the need to use the box renumbering and
# reallocation code. In order to fix this issue, the box
# numbering and reallocation code needs to be accessible after
# the final level restriction is done.
assert int(have_oversize_split_box.get()) == 0
assert level_used_box_counts[-1] == 0
del level_used_box_counts[-1]
del level_start_box_nrs[-1]
level -= 1
break
if knl_info.level_restrict:
# Avoid generating too many kernels.
LEVEL_STEP = 10 # noqa
if level % LEVEL_STEP == 1:
level_restrict_kernel = knl_info.level_restrict_kernel_builder(
LEVEL_STEP * div_ceil(level, LEVEL_STEP))
# Upward pass - check if leaf boxes at higher levels need
# further splitting.
assert len(force_split_box) > 0
force_split_box.fill(0)
wait_for.extend(force_split_box.events)
did_upper_level_split = False
if debug:
boxes_split = []
for upper_level, upper_level_start, upper_level_box_count in zip(
# We just built level. Our parent level doesn't need to
# be rechecked for splitting because the smallest boxes
# in the tree (ours) already have a 2-to-1 ratio with
# that. Start checking at the level above our parent.
range(level - 2, 0, -1),
# At this point, the last entry in level_start_box_nrs
# already refers to (level + 1).
level_start_box_nrs[-4::-1],
level_used_box_counts[-3::-1], strict=False):
upper_level_slice = slice(
upper_level_start, upper_level_start + upper_level_box_count)
have_upper_level_split_box.fill(0)
wait_for.extend(have_upper_level_split_box.events)
# writes: force_split_box, have_upper_level_split_box
evt = level_restrict_kernel( # pylint: disable=possibly-used-before-assignment
upper_level,
root_extent,
box_has_children,
force_split_box,
have_upper_level_split_box,
*(box_child_ids + box_centers),
slice=upper_level_slice,
wait_for=wait_for)
wait_for = [evt]
if debug:
force_split_box.finish()
boxes_split.append(int(cl.array.sum(
force_split_box[upper_level_slice]).get()))
if int(have_upper_level_split_box.get()) == 0:
break
did_upper_level_split = True
if debug:
total_boxes_split = sum(boxes_split)
logger.debug("level restriction: %d boxes split",
total_boxes_split)
from itertools import count
for level_, nboxes_split in zip(
count(level - 2, step=-1), boxes_split[:-1]):
logger.debug("level %d: %d boxes split", level_, nboxes_split)
del boxes_split
if int(have_oversize_split_box.get()) == 0 and did_upper_level_split:
# We are in the situation where there are boxes left to
# split on upper levels, and the level loop is done creating
# lower levels.
#
# We re-run the level loop one more time to finish creating
# the upper level boxes.
final_level_restrict_iteration = True
level += 1
continue
# }}}
if not int(have_oversize_split_box.get()):
logger.debug("no overfull boxes left")
logger.debug("no boxes left to split")
break
level += 1
have_oversize_split_box.fill(0)
end_time = time()
elapsed = end_time-start_time
# {{{ check that nonchild part of box_morton_bin_counts is consistent
if debug and 0:
h_box_morton_bin_counts = box_morton_bin_counts.get()
h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
h_box_child_ids = tuple(bci.get() for bci in box_child_ids)
has_mismatch = False
for ibox in range(level_start_box_nrs[-1]):
is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids)
if is_leaf:
# nonchild count only found in box_info kernel
continue
if h_box_srcntgt_counts_cumul[ibox] == 0:
# empty boxes don't have box_morton_bin_counts written
continue
kid_sum = sum(
h_box_srcntgt_counts_cumul[bci[ibox]]
for bci in h_box_child_ids
if bci[ibox] != 0)
if (
h_box_srcntgt_counts_cumul[ibox]
!= (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"]
+ kid_sum)):
print("MISMATCH", level, ibox)
has_mismatch = True
assert not has_mismatch
print(f"LEVEL {level} OK")
# Cannot delete in Py 2.7: referred to from nested scope.
h_box_srcntgt_counts_cumul = None
del h_box_morton_bin_counts
del h_box_child_ids
# }}}
nboxes = level_start_box_nrs[-1]
npasses = level+1
logger.info("elapsed time: %g s (%g s/particle/pass)" % (
elapsed, elapsed/(npasses*nsrcntgts)))
level_loop_proc.done("%d levels, %d boxes", level, nboxes)
del npasses
nboxes = int(nboxes_dev.get())
# }}}
# {{{ extract number of non-child srcntgts from box morton counts
......@@ -567,44 +1234,88 @@ class TreeBuilder(object):
del highest_possibly_split_box_nr
if debug:
assert (box_srcntgt_counts_nonchild.get()
<= box_srcntgt_counts_cumul.get()[:nboxes]).all()
h_box_srcntgt_counts_nonchild = box_srcntgt_counts_nonchild.get()
h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
assert (h_box_srcntgt_counts_nonchild
<= h_box_srcntgt_counts_cumul[:nboxes]).all()
del h_box_srcntgt_counts_nonchild
# Cannot delete in Py 2.7: referred to from nested scope.
h_box_srcntgt_counts_cumul = None
# }}}
del morton_nrs
del box_morton_bin_counts
# {{{ prune empty leaf boxes
# {{{ prune empty/unused leaf boxes
is_pruned = not kwargs.get("skip_prune")
if is_pruned:
prune_empty_leaves = not kwargs.get("skip_prune")
if prune_empty_leaves:
# What is the original index of this box?
from_box_id = empty(nboxes, box_id_dtype)
src_box_id = empty(nboxes, box_id_dtype)
# Where should I put this box?
to_box_id = empty(nboxes, box_id_dtype)
#
# Initialize to all zeros, because pruned boxes should be mapped to
# zero (e.g. when pruning child_box_ids).
dst_box_id, evt = zeros(nboxes, box_id_dtype)
wait_for.append(evt)
fin_debug("find prune indices")
nboxes_post_prune_dev = empty((), dtype=box_id_dtype)
evt = knl_info.find_prune_indices_kernel(
box_srcntgt_counts_cumul,
to_box_id, from_box_id, nboxes_post_prune_dev,
src_box_id, dst_box_id, nboxes_post_prune_dev,
size=nboxes, wait_for=wait_for)
wait_for = [evt]
fin_debug("prune copy")
nboxes_post_prune = int(nboxes_post_prune_dev.get())
logger.debug("%d boxes after pruning "
"(%d empty leaves and/or unused boxes removed)",
nboxes_post_prune, nboxes - nboxes_post_prune)
should_prune = True
elif knl_info.level_restrict:
# Remove unused boxes from the tree.
src_box_id = empty(nboxes, box_id_dtype)
dst_box_id = empty(nboxes, box_id_dtype)
new_level_start_box_nrs = np.empty_like(level_start_box_nrs)
new_level_start_box_nrs[0] = 0
new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts)
for level_start, new_level_start, level_used_box_count in zip(
level_start_box_nrs[:-1],
new_level_start_box_nrs[:-1],
level_used_box_counts, strict=True):
def make_slice(start, offset=level_used_box_count):
    """Return the slice covering *offset* entries beginning at *start*.

    *offset* defaults to the value ``level_used_box_count`` had when this
    helper was defined.
    """
    stop = start + offset
    return slice(start, stop)
def make_arange(start, offset=level_used_box_count):
return cl.array.arange(
queue, start, start + offset, dtype=box_id_dtype)
src_box_id[make_slice(new_level_start)] = make_arange(level_start)
dst_box_id[make_slice(level_start)] = make_arange(new_level_start)
wait_for.extend(src_box_id.events + dst_box_id.events)
nboxes_post_prune = new_level_start_box_nrs[-1]
logger.info("%d boxes after pruning (%d unused boxes removed)",
nboxes_post_prune, nboxes - nboxes_post_prune)
should_prune = True
else:
should_prune = False
logger.info("%d empty leaves" % (nboxes-nboxes_post_prune))
if should_prune:
prune_events = []
prune_empty = partial(self.gappy_copy_and_map,
queue, allocator, nboxes_post_prune, from_box_id)
queue, allocator, nboxes_post_prune,
src_indices=src_box_id,
range=slice(nboxes_post_prune), debug=debug)
box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts)
prune_events.append(evt)
......@@ -612,28 +1323,52 @@ class TreeBuilder(object):
box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul)
prune_events.append(evt)
if debug:
if debug and prune_empty_leaves:
assert (box_srcntgt_counts_cumul.get() > 0).all()
srcntgt_box_ids = cl.array.take(to_box_id, srcntgt_box_ids)
box_parent_ids, evt = prune_empty(box_parent_ids, map_values=to_box_id)
srcntgt_box_ids, evt = self.map_values_kernel(
dst_box_id, srcntgt_box_ids)
prune_events.append(evt)
box_morton_nrs, evt = prune_empty(box_morton_nrs)
box_parent_ids, evt = prune_empty(box_parent_ids, map_values=dst_box_id)
prune_events.append(evt)
box_levels, evt = prune_empty(box_levels)
prune_events.append(evt)
if srcntgts_have_extent:
box_srcntgt_counts_nonchild, evt = prune_empty(
box_srcntgt_counts_nonchild)
prune_events.append(evt)
# Remap level_start_box_nrs to new box IDs.
# FIXME: It would be better to do this on the device.
level_start_box_nrs = list(
to_box_id.get()
[np.array(level_start_box_nrs[:-1], box_id_dtype)])
level_start_box_nrs = level_start_box_nrs + [nboxes_post_prune]
box_has_children, evt = prune_empty(box_has_children)
prune_events.append(evt)
box_child_ids, evts = zip(
*(prune_empty(ary, map_values=dst_box_id)
for ary in box_child_ids), strict=True)
prune_events.extend(evts)
box_centers, evts = zip(
*(prune_empty(ary) for ary in box_centers), strict=True)
prune_events.extend(evts)
# Update box counts and level start box indices.
box_levels.finish()
evt = knl_info.find_level_box_counts_kernel(
box_levels, level_used_box_counts_dev)
cl.wait_for_events([evt])
nlevels = len(level_used_box_counts)
level_used_box_counts = level_used_box_counts_dev[:nlevels].get()
level_start_box_nrs = [0]
level_start_box_nrs.extend(np.cumsum(level_used_box_counts))
level_start_box_nrs_dev[:nlevels + 1] = np.array(
level_start_box_nrs, dtype=box_id_dtype)
prune_events.extend(level_start_box_nrs_dev.events)
wait_for = prune_events
else:
......@@ -714,26 +1449,25 @@ class TreeBuilder(object):
box_target_starts, box_target_counts_cumul,
)
+ ((
box_source_counts_nonchild,
box_target_counts_nonchild,
box_source_counts_nonchild, # pylint: disable=possibly-used-before-assignment
box_target_counts_nonchild, # pylint: disable=possibly-used-before-assignment
) if srcntgts_have_extent else ())
),
queue=queue, range=slice(nsrcntgts),
wait_for=wait_for)
wait_for = [evt]
if srcntgts_have_extent:
if srcntgts_have_extent: # noqa: SIM102
if debug:
assert (
box_srcntgt_counts_nonchild.get()
==
(box_source_counts_nonchild
+ box_target_counts_nonchild).get()).all()
== (box_source_counts_nonchild
+ box_target_counts_nonchild).get()).all()
if debug:
usi_host = user_source_ids.get()
assert (usi_host < nsources).all()
assert (0 <= usi_host).all()
assert (usi_host >= 0).all()
del usi_host
sti_host = srcntgt_target_ids.get()
......@@ -811,22 +1545,44 @@ class TreeBuilder(object):
del srcntgts
nlevels = len(level_start_box_nrs) - 1
assert nlevels == len(level_used_box_counts)
assert level + 1 == nlevels, (level+1, nlevels)
if debug:
max_level = np.max(box_levels.get())
assert max_level + 1 == nlevels
# {{{ compute box info
# {{{ gather box child ids, box centers
# A number of arrays below are nominally 2-dimensional and stored with
# the box index as the fastest-moving index. To make sure that accesses
# remain aligned, we round up the number of boxes used for indexing.
aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32
box_child_ids, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
wait_for.append(evt)
box_centers = empty((dimensions, aligned_nboxes), coord_dtype)
box_centers_new = empty((dimensions, aligned_nboxes), coord_dtype)
for mnr, child_row in enumerate(box_child_ids):
box_child_ids_new[mnr, :nboxes_post_prune] = \
child_row[:nboxes_post_prune]
wait_for.extend(box_child_ids_new.events)
for dim, center_row in enumerate(box_centers):
box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune]
wait_for.extend(box_centers_new.events)
cl.wait_for_events(wait_for)
box_centers = box_centers_new
box_child_ids = box_child_ids_new
del box_centers_new
del box_child_ids_new
# }}}
# {{{ compute box flags
from boxtree.tree import box_flags_enum
box_flags = empty(nboxes_post_prune, box_flags_enum.dtype)
......@@ -838,21 +1594,22 @@ class TreeBuilder(object):
# the cumulative counts and setting them to zero for non-leaves.
# {{{ make sure box_{source,target}_counts_nonchild are not defined
# (before we overwrite them)
try:
box_source_counts_nonchild
box_source_counts_nonchild # noqa: B018
except NameError:
pass
else:
assert False
raise AssertionError
try:
box_target_counts_nonchild
box_target_counts_nonchild # noqa: B018
except NameError:
pass
else:
assert False
raise AssertionError
# }}}
......@@ -871,24 +1628,106 @@ class TreeBuilder(object):
evt = knl_info.box_info_kernel(
*(
# input:
box_parent_ids, box_morton_nrs, bbox, aligned_nboxes,
box_srcntgt_counts_cumul,
box_parent_ids, box_srcntgt_counts_cumul,
box_source_counts_cumul, box_target_counts_cumul,
max_particles_in_box,
box_levels, nlevels,
box_has_children, box_levels, nlevels,
# output if srcntgts_have_extent, input+output otherwise
box_source_counts_nonchild, box_target_counts_nonchild,
# output:
box_child_ids, box_centers, box_flags,
box_flags,
),
range=slice(nboxes_post_prune),
wait_for=wait_for)
# }}}
del box_has_children
wait_for = [evt]
# {{{ compute box bounding box
fin_debug("finding box extents")
box_source_bounding_box_min = cl.array.empty(
queue, (dimensions, aligned_nboxes),
dtype=coord_dtype)
box_source_bounding_box_max = cl.array.empty(
queue, (dimensions, aligned_nboxes),
dtype=coord_dtype)
if sources_are_targets:
box_target_bounding_box_min = box_source_bounding_box_min
box_target_bounding_box_max = box_source_bounding_box_max
else:
box_target_bounding_box_min = cl.array.empty(
queue, (dimensions, aligned_nboxes),
dtype=coord_dtype)
box_target_bounding_box_max = cl.array.empty(
queue, (dimensions, aligned_nboxes),
dtype=coord_dtype)
bogus_radii_array = cl.array.empty(queue, 1, dtype=coord_dtype)
# nlevels-1 is the highest valid level index
for level in range(nlevels-1, -1, -1):
start, stop = level_start_box_nrs[level:level+2]
for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max,
pstarts, pcounts, particle_radii, particles) in [
(
# never skip
False,
sources_have_extent,
box_source_bounding_box_min,
box_source_bounding_box_max,
box_source_starts,
box_source_counts_nonchild,
source_radii if sources_have_extent else bogus_radii_array,
sources),
(
# skip the 'target' round if sources and targets
# are the same.
sources_are_targets,
targets_have_extent,
box_target_bounding_box_min,
box_target_bounding_box_max,
box_target_starts,
box_target_counts_nonchild,
target_radii if targets_have_extent else bogus_radii_array,
targets),
]:
if skip:
continue
args = (
aligned_nboxes,
box_child_ids,
box_centers,
pstarts,
pcounts,
*particles,
particle_radii,
enable_radii,
box_bounding_box_min,
box_bounding_box_max)
evt = knl_info.box_extents_finder_kernel(
*args,
range=slice(start, stop),
queue=queue, wait_for=wait_for)
wait_for = [evt]
del bogus_radii_array
# }}}
# {{{ build output
extra_tree_attrs = {}
......@@ -898,12 +1737,15 @@ class TreeBuilder(object):
if targets_have_extent:
extra_tree_attrs.update(target_radii=target_radii)
logger.info("tree build complete")
tree_build_proc.done(
"%d levels, %d boxes, %d particles, box extent norm: %s, "
"max_leaf_refine_weight: %d",
nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm,
max_leaf_refine_weight)
return Tree(
# If you change this, also change the documentation
# of what's in the tree, above.
sources_are_targets=sources_are_targets,
sources_have_extent=sources_have_extent,
targets_have_extent=targets_have_extent,
......@@ -915,12 +1757,11 @@ class TreeBuilder(object):
root_extent=root_extent,
stick_out_factor=stick_out_factor,
extent_norm=srcntgts_extent_norm,
bounding_box=(bbox_min, bbox_max),
level_start_box_nrs=level_start_box_nrs,
level_start_box_nrs_dev=cl.array.to_device(
queue, level_start_box_nrs,
allocator=allocator),
level_start_box_nrs_dev=level_start_box_nrs_dev,
sources=sources,
targets=targets,
......@@ -941,7 +1782,12 @@ class TreeBuilder(object):
user_source_ids=user_source_ids,
sorted_target_ids=sorted_target_ids,
_is_pruned=is_pruned,
box_source_bounding_box_min=box_source_bounding_box_min,
box_source_bounding_box_max=box_source_bounding_box_max,
box_target_bounding_box_min=box_target_bounding_box_min,
box_target_bounding_box_max=box_target_bounding_box_max,
_is_pruned=prune_empty_leaves,
**extra_tree_attrs
).with_queue(None), evt
......@@ -950,4 +1796,6 @@ class TreeBuilder(object):
# }}}
# vim: foldmethod=marker:filetype=pyopencl
# }}}
# vim: foldmethod=marker
from __future__ import division
from __future__ import absolute_import
import six
from six.moves import range
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
# __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
......@@ -25,30 +20,41 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
from functools import partial
import numpy as np
import pyopencl as cl
from mako.template import Template
from pyopencl.elementwise import ElementwiseTemplate
from pyopencl.scan import ScanTemplate
from mako.template import Template
from pytools import Record, memoize
from boxtree.tools import get_type_moniker
from pytools import Record, log_process, memoize
from boxtree.tools import (
coord_vec_subscript_code,
get_coord_vec_dtype,
get_type_moniker,
)
import logging
logger = logging.getLogger(__name__)
# TODO:
# - Add *restrict where applicable.
# - Split up the arrays so that there is one array per box level. This avoids
# having to reallocate the middle of an array.
# - Use level-relative box numbering in parent_box_ids, child_box_ids. This
# avoids having to renumber these arrays after reallocation.
# -----------------------------------------------------------------------------
# CONTROL FLOW
# ------------
#
# Since this file mostly fills in the blanks in the outer parallel 'scan'
# Since this file mostly fills in the blanks in the tree build
# implementation, control flow here can be a bit hard to see.
#
# - Everything starts and ends in the 'driver' bit at the end.
# - Everything starts and ends in the driver in tree_build.py
#
# - The first thing that happens is that data types get built and
# kernels get compiled. Most of the file consists of type and
......@@ -57,12 +63,11 @@ logger = logging.getLogger(__name__)
# - We start with a reduction that determines the bounding box of all
# particles.
#
# - The level loop is in the driver below, which alternates between
# scans and local post processing ("split and sort"), according to
# the algorithm described below.
# - The level loop is in the driver, which alternates between scans and local
# post processing, according to the algorithm described below.
#
# - Once the level loop finishes, a "box info" kernel is run
# that extracts some more information for each box. (center, level, ...)
# that extracts flags for each box.
#
# - As a last step, empty leaf boxes are eliminated. This is done by a
# scan kernel that computes indices, and by an elementwise kernel
......@@ -70,24 +75,48 @@ logger = logging.getLogger(__name__)
#
# -----------------------------------------------------------------------------
#
# HOW DOES THE PRIMARY SCAN WORK?
# -------------------------------
# HOW DOES THE LEVEL LOOP WORK?
# -----------------------------
#
# This code sorts particles into an nD-tree of boxes. It does this by doing a
# (parallel) scan over particles and a (local, i.e. independent for each particle)
# postprocessing step for each level.
# This code sorts particles into an nD-tree of boxes. It does this by doing two
# successive (parallel) scans and a postprocessing step.
#
# The following information is being pushed around by the scan, which
# proceeds over particles:
# The following information is being pushed around by the scans, which
# proceed over particles:
#
# - a cumulative count ("counts") of particles in each subbox ("morton_nr") at
# the current level, should the current box need to be subdivided.
# - a cumulative count ("pcnt") and weight ("pwt") of particles in each subbox
# ("morton_nr"), should the current box need to be subdivided.
#
# - the "split_box_id". The very first entry here gets initialized to
# the number of boxes present at the previous level. If a box knows it needs to
# be subdivided, its first particle asks for 2**d new boxes. This gets scanned
# over by summing globally (unsegmented-ly). The splits are then realized in
# the post-processing step.
# - the "split_box_id". This is an array that, for each box, answers the
# question, "After I am subdivided, what is the end of the range of boxes
# that my particles get pushed into?" The split_box_id is not meaningful
# unless the box is about to be subdivided.
#
# Using this data, the stages of the algorithm proceed as follows:
#
# 1. Count the number of particles in each subbox. This stage uses a segmented
# (per-box) scan to fill "pcnt" and "pwt". This information is kept
# per-particle ("morton_bin_counts") and per-box ("box_morton_bin_counts").
#
# 2. Using a scan over the boxes, segmented by level, make a decision whether to
# refine each box, and compute the split_box_id. This stage also computes the
# total number of new boxes needed. If a box knows it needs to be subdivided,
# it asks for 2**d new boxes at the next level.
#
# 3. Realize the splitting determined in #2. This part consists of splitting the
# boxes (done in the "box splitter kernel") and renumbering the particles so
# that particles in the same box are numbered contiguously (done in the
# "particle renumberer kernel").
#
# HOW DOES LEVEL RESTRICTION WORK?
# --------------------------------
#
# This requires some post-processing in the level loop described above: as an
# additional step, the "level restrict" kernel gets run at the end of the level
# loop. The job of the level restrict kernel is to mark boxes on higher levels
# to be split based on looking at the levels of their neighbor boxes. The
# splitting is then realized by the next iteration of the level loop,
# simultaneously with the creation of the next level.
#
# -----------------------------------------------------------------------------
......@@ -98,7 +127,10 @@ class _KernelInfo(Record):
# {{{ data types
@memoize
refine_weight_dtype = np.dtype(np.int32)
@memoize(use_kwargs=True)
def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
srcntgts_have_extent):
fields = []
......@@ -109,7 +141,10 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
from boxtree.tools import padded_bin
for mnr in range(2**dimensions):
fields.append(("pcnt%s" % padded_bin(mnr, dimensions), particle_id_dtype))
fields.append((f"pcnt{padded_bin(mnr, dimensions)}", particle_id_dtype))
# Morton bin weight totals
for mnr in range(2**dimensions):
fields.append((f"pwt{padded_bin(mnr, dimensions)}", refine_weight_dtype))
dtype = np.dtype(fields)
......@@ -117,10 +152,10 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
if srcntgts_have_extent:
name_suffix = "_ext"
name = "boxtree_morton_bin_count_%dd_p%s%s_t" % (
dimensions,
get_type_moniker(particle_id_dtype),
name_suffix)
type_moniker = get_type_moniker(particle_id_dtype)
name = (
f"boxtree_morton_bin_count_{dimensions}d_p{type_moniker}{name_suffix}_t"
)
from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
......@@ -130,16 +165,19 @@ def make_morton_bin_count_type(device, dimensions, particle_id_dtype,
# }}}
# {{{ preamble
TYPE_DECL_PREAMBLE_TPL = Template(r"""//CL//
typedef ${dtype_to_ctype(morton_bin_count_dtype)} morton_counts_t;
typedef morton_counts_t scan_t;
typedef ${dtype_to_ctype(refine_weight_dtype)} refine_weight_t;
typedef ${dtype_to_ctype(bbox_dtype)} bbox_t;
typedef ${dtype_to_ctype(coord_dtype)} coord_t;
typedef ${dtype_to_ctype(coord_vec_dtype)} coord_vec_t;
typedef ${dtype_to_ctype(box_id_dtype)} box_id_t;
typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
typedef ${dtype_to_ctype(box_level_dtype)} box_level_t;
// morton_nr == -1 is defined to mean that the srcntgt is
// remaining at the present level and will not be sorted
......@@ -148,7 +186,6 @@ TYPE_DECL_PREAMBLE_TPL = Template(r"""//CL//
""", strict_undefined=True)
GENERIC_PREAMBLE_TPL = Template(r"""//CL//
#define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor})
// Use this as dbg_printf(("oh snap: %d\n", stuff)); Note the double
// parentheses.
......@@ -180,26 +217,37 @@ GENERIC_PREAMBLE_TPL = Template(r"""//CL//
# BEGIN KERNELS IN THE LEVEL LOOP
# {{{ scan primitive code template
# {{{ morton scan
SCAN_PREAMBLE_TPL = Template(r"""//CL//
MORTON_NR_SCAN_PREAMBLE_TPL = Template(r"""//CL//
// {{{ neutral element
scan_t scan_t_neutral()
{
scan_t result;
%if srcntgts_have_extent:
result.nonchild_srcntgts = 0;
%endif
%for mnr in range(2**dimensions):
result.pcnt${padded_bin(mnr, dimensions)} = 0;
%endfor
%for mnr in range(2**dimensions):
result.pwt${padded_bin(mnr, dimensions)} = 0;
%endfor
return result;
}
// }}}
inline int my_add_sat(int a, int b)
{
long result = (long) a + b;
return (result > INT_MAX) ? INT_MAX : result;
}
// {{{ scan 'add' operation
scan_t scan_t_add(scan_t a, scan_t b, bool across_seg_boundary)
{
......@@ -213,6 +261,16 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
<% field = "pcnt"+padded_bin(mnr, dimensions) %>
b.${field} = a.${field} + b.${field};
%endfor
%for mnr in range(2**dimensions):
<% field = "pwt"+padded_bin(mnr, dimensions) %>
// XXX: The use of add_sat() seems to be causing trouble
// with multiple compilers. For d=3:
// 1. POCL will miscompile and either give wrong
// results or crash.
// 2. Intel will use a large amount of memory.
// Versions tested: POCL 0.13, Intel OpenCL 16.1
b.${field} = my_add_sat(a.${field}, b.${field});
%endfor
}
return b;
......@@ -224,41 +282,53 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
scan_t scan_t_from_particle(
const int i,
const int level,
const int particle_level,
bbox_t const *bbox,
global morton_nr_t *morton_nrs, // output/side effect
global particle_id_t *user_srcntgt_ids
global particle_id_t *user_srcntgt_ids,
global refine_weight_t *refine_weights
%for ax in axis_names:
, global const coord_t *${ax}
%endfor
%if srcntgts_have_extent:
, global const coord_t *srcntgt_radii
, const coord_t stick_out_factor
%endif
)
{
particle_id_t user_srcntgt_id = user_srcntgt_ids[i];
// Recall that 'level' is the level currently being built, e.g. 1 at
// the root. This should be 0.5 at level 1. (Level 0 is the root.)
// The next level is 1 + the current level of the particle.
// This should be 0.5 when next level = 1. (Level 0 is the root.)
coord_t next_level_box_size_factor =
((coord_t) 1) / ((coord_t) (1U << level));
((coord_t) 1) / ((coord_t) (1U << (1 + particle_level)));
%if srcntgts_have_extent:
bool stop_srcntgt_descent = false;
coord_t srcntgt_radius = srcntgt_radii[user_srcntgt_id];
%endif
%if not srcntgts_have_extent:
// This argument is only supplied with srcntgts_have_extent.
#define stick_out_factor 0.
%endif
const coord_t one_half = ((coord_t) 1) / 2;
const coord_t box_radius_factor =
// AMD CPU seems to like to miscompile this--change with care.
// (last seen on 13.4-2)
(1. + STICK_OUT_FACTOR)
(1. + stick_out_factor)
* one_half; // convert diameter to radius
%if not srcntgts_have_extent:
#undef stick_out_factor
%endif
%for ax in axis_names:
// Most FMMs are isotropic, i.e. global_extent_{x,y,z} are all the same.
// Nonetheless, the gain from exploiting this assumption seems so
// minimal that doing so here didn't seem worthwhile.
// minimal that doing so here didn't seem worthwhile in the
// srcntgts_extent_norm == "linf" case.
coord_t global_min_${ax} = bbox->min_${ax};
coord_t global_extent_${ax} = bbox->max_${ax} - global_min_${ax};
......@@ -270,28 +340,34 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
// level, and it isn't either by the fact that boxes are
// [)-half-open in subsequent levels.
// So (1 << level) is 2 when building level 1. Because the
// floating point factor is strictly less than 1, 2 is never
// reached, so when building level 1, the result is either 0 or 1.
// So (1 << (1 + particle_level)) is 2 when building level 1.
// Because the floating point factor is strictly less than 1, 2 is
// never reached, so when building level 1, the result is either
// 0 or 1.
// After that, we just add one (less significant) bit per level.
unsigned ${ax}_bits = (unsigned) (
((srcntgt_${ax} - global_min_${ax}) / global_extent_${ax})
* (1U << level));
* (1U << (1 + particle_level)));
%if srcntgts_have_extent:
// Need to compute center to compare excess with STICK_OUT_FACTOR.
coord_t next_level_box_center_${ax} =
global_min_${ax}
+ global_extent_${ax}
* (${ax}_bits + one_half)
* next_level_box_size_factor;
// Need to compute center to compare excess with stick_out_factor.
// Unused if no stickout, relying on compiler to eliminate this.
const coord_t next_level_box_center_${ax} =
global_min_${ax}
+ global_extent_${ax}
* (${ax}_bits + one_half)
* next_level_box_size_factor;
%endfor
coord_t next_level_box_stick_out_radius_${ax} =
%if srcntgts_extent_norm == "linf":
%for ax in axis_names:
const coord_t next_level_box_stick_out_radius_${ax} =
box_radius_factor
* global_extent_${ax}
* next_level_box_size_factor;
// stop descent here if particle sticks out of next-level box
stop_srcntgt_descent = stop_srcntgt_descent ||
(srcntgt_${ax} + srcntgt_radius >=
next_level_box_center_${ax}
......@@ -300,8 +376,41 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
(srcntgt_${ax} - srcntgt_radius <
next_level_box_center_${ax}
- next_level_box_stick_out_radius_${ax});
%endif
%endfor
%endfor
%elif srcntgts_extent_norm == "l2":
coord_t next_level_box_stick_out_radius =
box_radius_factor
* global_extent_x /* assume isotropy */
* next_level_box_size_factor;
coord_t next_level_box_center_to_srcntgt_bdry_l2_dist =
sqrt(
%for ax in axis_names:
+ (srcntgt_${ax} - next_level_box_center_${ax})
* (srcntgt_${ax} - next_level_box_center_${ax})
%endfor
) + srcntgt_radius;
// stop descent here if particle sticks out of next-level box
stop_srcntgt_descent = stop_srcntgt_descent ||
(
next_level_box_center_to_srcntgt_bdry_l2_dist
* next_level_box_center_to_srcntgt_bdry_l2_dist
>= ${dimensions}
* next_level_box_stick_out_radius
* next_level_box_stick_out_radius);
%elif srcntgts_extent_norm is None:
// nothing to do
%else:
<%
raise ValueError("unexpected value of 'srcntgts_extent_norm': %s"
% srcntgts_extent_norm)
%>
%endif
// Pick off the lowest-order bit for each axis, put it in its place.
int level_morton_number = 0
......@@ -325,6 +434,11 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
<% field = "pcnt"+padded_bin(mnr, dimensions) %>
result.${field} = (level_morton_number == ${mnr});
%endfor
%for mnr in range(2**dimensions):
<% field = "pwt"+padded_bin(mnr, dimensions) %>
result.${field} = (level_morton_number == ${mnr}) ?
refine_weights[user_srcntgt_id] : 0;
%endfor
morton_nrs[i] = level_morton_number;
return result;
......@@ -336,9 +450,9 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL//
# }}}
# {{{ scan output code template
# {{{ morton scan output
SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
MORTON_NR_SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
{
particle_id_t my_id_in_my_box = -1
%if srcntgts_have_extent:
......@@ -352,6 +466,7 @@ SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
morton_bin_counts[i] = item;
box_id_t current_box_id = srcntgt_box_ids[i];
particle_id_t box_srcntgt_count = box_srcntgt_counts_cumul[current_box_id];
// Am I the last particle in my current box?
......@@ -372,40 +487,51 @@ SCAN_OUTPUT_STMT_TPL = Template(r"""//CL//
# {{{ split box id scan
SPLIT_BOX_ID_SCAN_TPL = ScanTemplate(
name_prefix="split_box_id_scan",
arguments=r"""//CL:mako//
/* input */
box_id_t *srcntgt_box_ids,
particle_id_t *box_srcntgt_starts,
particle_id_t *box_srcntgt_counts_cumul,
particle_id_t max_particles_in_box,
morton_counts_t *box_morton_bin_counts,
refine_weight_t *refine_weights,
refine_weight_t max_leaf_refine_weight,
box_level_t *box_levels,
box_level_t level,
/* input/output */
box_id_t *nboxes,
box_id_t *level_start_box_ids,
box_id_t *level_used_box_counts,
int *box_force_split,
box_level_t last_level,
/* output */
int *box_has_children,
box_id_t *split_box_ids,
int *have_oversize_split_box,
""",
preamble=r"""//CL:mako//
scan_t count_new_boxes_needed(
particle_id_t i,
box_id_t box_id,
__global box_id_t *nboxes,
__global particle_id_t *box_srcntgt_starts,
box_level_t level,
box_level_t last_level,
refine_weight_t max_leaf_refine_weight,
__global particle_id_t *box_srcntgt_counts_cumul,
__global morton_counts_t *box_morton_bin_counts,
particle_id_t max_particles_in_box,
__global box_level_t *box_levels,
box_level_t level
__global box_id_t *level_start_box_ids,
__global box_id_t *level_used_box_counts,
%if level_restrict:
__global int *box_force_split,
%endif
__global int *have_oversize_split_box, // output/side effect
__global int *box_has_children // output/side effect
)
{
scan_t result = 0;
// First particle? Start counting at (the previous level's) nboxes.
if (i == 0)
result += *nboxes;
// First box at my level? Start counting at the number of boxes
// used at the child level.
if (box_id == level_start_box_ids[level])
{
result += level_start_box_ids[level + 1];
result += level_used_box_counts[level + 1];
}
%if srcntgts_have_extent:
const particle_id_t nonchild_srcntgts_in_box =
......@@ -414,63 +540,156 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate(
const particle_id_t nonchild_srcntgts_in_box = 0;
%endif
particle_id_t first_particle_in_my_box =
box_srcntgt_starts[box_id];
// Get box refine weight.
refine_weight_t box_refine_weight = 0;
%for mnr in range(2**dimensions):
box_refine_weight = add_sat(box_refine_weight,
box_morton_bin_counts[box_id].pwt${padded_bin(mnr, dimensions)});
%endfor
// Add 2**d to make enough room for a split of the current box
// This will be the split_box_id for *all* particles in this box,
// including non-child srcntgts.
if (i == first_particle_in_my_box
%if srcntgts_have_extent:
// Only last-level boxes get to produce new boxes.
// If srcntgts have extent, then prior-level boxes
// will keep asking for more boxes to be allocated.
// Prevent that.
&&
box_levels[box_id] + 1 == level
%endif
if ((
level + 1 == last_level
&&
%if adaptive:
/* box overfull? */
box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box
> max_particles_in_box
box_refine_weight
> max_leaf_refine_weight
%else:
/* box non-empty? */
/* Note: Refine weights are allowed to be 0,
so check # of particles directly. */
box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box
> 0
>= 0
%endif
)
%if level_restrict:
|| box_force_split[box_id]
%endif
)
{
result += ${2**dimensions};
box_has_children[box_id] = 1;
// Check if the box is oversized. This drives the level loop.
refine_weight_t max_subbox_refine_weight = 0;
%for mnr in range(2**dimensions):
max_subbox_refine_weight = max(max_subbox_refine_weight,
box_morton_bin_counts[box_id]
.pwt${padded_bin(mnr, dimensions)});
%endfor
if (max_subbox_refine_weight > max_leaf_refine_weight)
{
*have_oversize_split_box = 1;
}
}
return result;
}
""",
input_expr="""count_new_boxes_needed(
i, srcntgt_box_ids[i], nboxes,
box_srcntgt_starts, box_srcntgt_counts_cumul, box_morton_bin_counts,
max_particles_in_box, box_levels, level
input_expr=r"""//CL:mako//
count_new_boxes_needed(
i,
box_levels[i],
last_level,
max_leaf_refine_weight,
box_srcntgt_counts_cumul,
box_morton_bin_counts,
level_start_box_ids,
level_used_box_counts,
%if level_restrict:
box_force_split,
%endif
have_oversize_split_box,
box_has_children
)""",
scan_expr="a + b",
scan_expr="across_seg_boundary ? b : a + b",
neutral="0",
output_statement="""//CL//
is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i-1]",
output_statement=r"""//CL//
dbg_assert(item >= 0);
split_box_ids[i] = item;
// Am I the last particle overall? If so, write box count
if (i+1 == N)
*nboxes = item;
""")
# }}}
# {{{ split-and-sort kernel
# {{{ box splitter kernel
# Mako template for the per-box "box splitter" kernel body (OpenCL C).
#
# The generated code runs once per box index ``i`` (``ibox``) and:
#
# - skips the box (via PYOPENCL_ELWISE_CONTINUE) unless it is to be split,
#   i.e. unless ``box_has_children[ibox]`` is set and the box sits one level
#   above ``level``, or -- only when the template is rendered with
#   ``level_restrict`` -- ``box_force_split[ibox]`` is set;
# - for each of the ``2**dimensions`` Morton numbers ``mnr``, derives the
#   child's id as ``split_box_ids[ibox] - 2**dimensions + mnr`` and records
#   parent id, child id, level and the particle count taken from the
#   matching ``pcnt...`` field of ``box_morton_bin_counts[ibox]``;
# - only for children with a nonzero count, writes the child's start index
#   (parent's start, plus ``nonchild_srcntgts`` when rendered with
#   ``srcntgts_have_extent``, plus the counts of all lower Morton bins) and
#   sets ``box_start_flags`` at that index;
# - writes the child's center as the parent's center offset per axis by
#   ``root_extent / 2**(1 + new_level)``, with the sign chosen by the bit of
#   ``mnr`` that belongs to that axis.
#
# Names used at render time: ``dimensions``, ``level_restrict``,
# ``srcntgts_have_extent``, ``axis_names``, ``padded_bin``.
# NOTE(review): presumably rendered as the operation of a PyOpenCL
# elementwise kernel (it relies on ``i`` and PYOPENCL_ELWISE_CONTINUE) --
# confirm against the code that instantiates this template.
BOX_SPLITTER_KERNEL_TPL = Template(r"""//CL//
box_id_t ibox = i;
bool do_split_box =
(box_has_children[ibox] && box_levels[ibox] + 1 == level)
%if level_restrict:
|| box_force_split[ibox]
%endif
;
if (!do_split_box)
{
PYOPENCL_ELWISE_CONTINUE;
}
// {{{ Set up child box data structure.
morton_counts_t box_morton_bin_count = box_morton_bin_counts[ibox];
%for mnr in range(2**dimensions):
{
box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + ${mnr};
// Parent / child / level info
box_parent_ids[new_box_id] = ibox;
box_child_ids_mnr_${mnr}[ibox] = new_box_id;
box_level_t new_level = box_levels[ibox] + 1;
box_levels[new_box_id] = new_level;
// Box particle counts / starting particle number
particle_id_t new_count =
box_morton_bin_count.pcnt${padded_bin(mnr, dimensions)};
box_srcntgt_counts_cumul[new_box_id] = new_count;
// Only set the starting particle number / start flags if
// the new box has particles to begin with.
if (new_count > 0)
{
particle_id_t new_box_start = box_srcntgt_starts[ibox]
%if srcntgts_have_extent:
+ box_morton_bin_count.nonchild_srcntgts
%endif
%for sub_mnr in range(mnr):
+ box_morton_bin_count.pcnt${padded_bin(sub_mnr, dimensions)}
%endfor
;
box_start_flags[new_box_start] = 1;
box_srcntgt_starts[new_box_id] = new_box_start;
}
// Compute box center.
coord_t radius = (root_extent * 1 / (coord_t) (1 << (1 + new_level)));
%for idim, ax in enumerate(axis_names):
{
<% has_bit = mnr & 2**(dimensions-1-idim) %>
box_centers_${ax}[new_box_id] = box_centers_${ax}[ibox]
${"+" if has_bit else "-"} radius;
}
%endfor
}
%endfor
// }}}
""", strict_undefined=True)
# }}}
# {{{ post-split particle renumbering
SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL//
PARTICLE_RENUMBERER_PREAMBLE_TPL = Template(r"""//CL//
<%
def get_count_for_branch(known_bits):
if len(known_bits) == dimensions:
......@@ -497,162 +716,244 @@ SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL//
""", strict_undefined=True)
SPLIT_AND_SORT_KERNEL_TPL = Template(r"""//CL//
PARTICLE_RENUMBERER_KERNEL_TPL = Template(r"""//CL//
box_id_t ibox = srcntgt_box_ids[i];
dbg_assert(ibox >= 0);
dbg_assert(ibox < nboxes);
dbg_printf(("postproc %d:\n", i));
dbg_printf((" my box id: %d\n", ibox));
particle_id_t box_srcntgt_count = box_srcntgt_counts_cumul[ibox];
bool do_split_box = (box_has_children[ibox] && box_levels[ibox] + 1 == level)
%if level_restrict:
|| box_force_split[ibox]
%endif
;
%if srcntgts_have_extent:
const particle_id_t nonchild_srcntgt_count =
box_morton_bin_counts[ibox].nonchild_srcntgts;
if (!do_split_box)
{
// Not splitting? Copy over existing particle info.
new_user_srcntgt_ids[i] = user_srcntgt_ids[i];
new_srcntgt_box_ids[i] = ibox;
%else:
const particle_id_t nonchild_srcntgt_count = 0;
%endif
PYOPENCL_ELWISE_CONTINUE;
}
%if adaptive:
bool do_split_box =
box_srcntgt_count - nonchild_srcntgt_count
> max_particles_in_box;
%else:
bool do_split_box =
box_srcntgt_count - nonchild_srcntgt_count
> 0;
%endif
morton_nr_t my_morton_nr = morton_nrs[i];
// printf(" my morton nr: %d\n", my_morton_nr);
morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox];
morton_counts_t my_morton_bin_counts = morton_bin_counts[i];
particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr);
// {{{ compute this srcntgt's new index
particle_id_t my_box_start = box_srcntgt_starts[ibox];
particle_id_t tgt_particle_idx = my_box_start + my_count-1;
%if srcntgts_have_extent:
## Only do split-box processing for srcntgts that were touched
## on the immediately preceding level.
##
## If srcntgts have no extent, then subsequent levels
## will never decide to split boxes that were kept unsplit on prior
## levels either. If srcntgts do
## have an extent, this could happen. Prevent running the
## split code for such particles.
int box_level = box_levels[ibox];
do_split_box = do_split_box && box_level + 1 == level;
tgt_particle_idx +=
(my_morton_nr >= 0)
? my_box_morton_bin_counts.nonchild_srcntgts
: 0;
%endif
%for mnr in range(2**dimensions):
<% bin_nmr = padded_bin(mnr, dimensions) %>
tgt_particle_idx +=
(my_morton_nr > ${mnr})
? my_box_morton_bin_counts.pcnt${bin_nmr}
: 0;
%endfor
dbg_assert(tgt_particle_idx < n);
dbg_printf((" moving %ld -> %d "
"(ibox %d, my_box_start %d, my_count %d)\n",
i, tgt_particle_idx,
ibox, my_box_start, my_count));
new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i];
if (do_split_box)
{
morton_nr_t my_morton_nr = morton_nrs[i];
dbg_printf((" my morton nr: %d\n", my_morton_nr));
// }}}
morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox];
// {{{ compute this srcntgt's new box id
morton_counts_t my_morton_bin_counts = morton_bin_counts[i];
particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr);
box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + my_morton_nr;
// {{{ compute this srcntgt's new index
%if srcntgts_have_extent:
if (my_morton_nr == -1)
{
new_box_id = ibox;
}
%endif
particle_id_t my_box_start = box_srcntgt_starts[ibox];
particle_id_t tgt_particle_idx = my_box_start + my_count-1;
%if srcntgts_have_extent:
tgt_particle_idx +=
(my_morton_nr >= 0)
? my_box_morton_bin_counts.nonchild_srcntgts
: 0;
%endif
%for mnr in range(2**dimensions):
<% bin_nmr = padded_bin(mnr, dimensions) %>
tgt_particle_idx +=
(my_morton_nr > ${mnr})
? my_box_morton_bin_counts.pcnt${bin_nmr}
: 0;
%endfor
dbg_printf((" new_box_id: %d\n", new_box_id));
dbg_assert(new_box_id >= 0);
dbg_assert(tgt_particle_idx < n);
dbg_printf((" moving %ld -> %d "
"(ibox %d, my_box_start %d, my_count %d)\n",
i, tgt_particle_idx,
ibox, my_box_start, my_count));
new_srcntgt_box_ids[tgt_particle_idx] = new_box_id;
new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i];
// }}}
""", strict_undefined=True)
// }}}
# }}}
// {{{ compute this srcntgt's new box id
# {{{ level restrict kernel
box_id_t new_box_id = split_box_ids[i] - ${2**dimensions} + my_morton_nr;
from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
%if srcntgts_have_extent:
if (my_morton_nr == -1)
new_box_id = ibox;
%endif
dbg_printf((" new_box_id: %d\n", new_box_id));
dbg_assert(new_box_id >= 0);
LEVEL_RESTRICT_TPL = Template(
TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
<%def name="my_load_center(name, box_id)">
## This differs from load_center() because in this kernel box centers
## live in one array per axis.
coord_vec_t ${name};
%for i in range(dimensions):
${name}.${AXIS_NAMES[i]} = box_centers_${AXIS_NAMES[i]}[${box_id}];
%endfor
</%def>
new_srcntgt_box_ids[tgt_particle_idx] = new_box_id;
#define NLEVELS (${max_levels})
// }}}
box_id_t box_id = i;
// {{{ set up child box data structure
// Skip unless this box is a leaf.
if (box_has_children[box_id])
{
PYOPENCL_ELWISE_CONTINUE;
}
%for mnr in range(2**dimensions):
/* Am I the last particle in my Morton bin? */
%if mnr > 0:
else
%endif
if (${mnr} == my_morton_nr
&& my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)}
== my_count)
{
dbg_printf((" ## splitting\n"));
${walk_init(0)}
particle_id_t new_box_start = my_box_start
%if srcntgts_have_extent:
+ my_box_morton_bin_counts.nonchild_srcntgts
%endif
%for sub_mnr in range(mnr):
+ my_box_morton_bin_counts.pcnt${padded_bin(sub_mnr, dimensions)}
%endfor
;
// Descend the tree searching for neighboring leaves.
while (continue_walk)
{
box_id_t child_box_id;
// Look for the child in the appropriate array.
%for morton_nr in range(2**dimensions):
if (walk_morton_nr == ${morton_nr})
{
child_box_id = box_child_ids_mnr_${morton_nr}[walk_parent_box_id];
}
%endfor
dbg_printf((" new_box_start: %d\n", new_box_start));
if (child_box_id)
{
int child_level = walk_stack_size + 1;
box_start_flags[new_box_start] = 1;
box_srcntgt_starts[new_box_id] = new_box_start;
box_parent_ids[new_box_id] = ibox;
box_morton_nrs[new_box_id] = my_morton_nr;
// Check adjacency.
bool is_adjacent;
particle_id_t new_count =
my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)};
box_srcntgt_counts_cumul[new_box_id] = new_count;
box_levels[new_box_id] = level;
if (child_box_id == box_id)
{
// Skip considering self.
is_adjacent = false;
}
else
{
${my_load_center("box_center", "box_id")}
${my_load_center("child_center", "child_box_id")}
is_adjacent = is_adjacent_or_overlapping(
root_extent, child_center, child_level, box_center, level);
}
// For a non-adaptive run, max_particles_in_box drives the
// level loop.
if (new_count > max_particles_in_box)
if (is_adjacent)
{
// Invariant: When new leaves get added,
// they are never more than 2 levels deeper than
// all their adjacent leaves.
//
// Hence in we only need to look at boxes up to
// (level + 2) deep.
if (box_has_children[child_box_id])
{
*have_oversize_split_box = 1;
if (child_level <= 1 + level)
{
${walk_push("child_box_id")}
continue;
}
}
else
{
// We are looking at a neighboring leaf box.
// Check if my box must be split to enforce level
// restriction.
if (child_level == 2 + level || (
child_level == 1 + level &&
box_force_split[child_box_id]))
{
box_force_split[box_id] = 1;
atomic_or(have_upper_level_split_box, 1);
continue_walk = false;
}
}
dbg_printf((" box pcount: %d\n",
box_srcntgt_counts_cumul[new_box_id]));
}
%endfor
// }}}
}
else
{
// Not splitting? Copy over existing particle info.
new_user_srcntgt_ids[i] = user_srcntgt_ids[i];
new_srcntgt_box_ids[i] = ibox;
}
${walk_advance()}
}
""", strict_undefined=True)
def build_level_restrict_kernel(context, preamble_with_dtype_decls,
        dimensions, axis_names, box_id_dtype, coord_dtype,
        box_level_dtype, max_levels):
    """Assemble and return the elementwise kernel named ``level_restrict``.

    The kernel body is obtained by rendering ``LEVEL_RESTRICT_TPL``; its
    preamble is *preamble_with_dtype_decls* followed by a rendered
    ``LEVEL_TO_RAD`` macro and ``HELPER_FUNCTION_TEMPLATE``.

    :arg context: passed through as the first argument of
        :class:`pyopencl.elementwise.ElementwiseKernel`.
    :arg preamble_with_dtype_decls: converted with :func:`str` and placed at
        the start of the kernel preamble.
    :arg dimensions: determines the number of ``box_child_ids_mnr_*``
        arguments (``2**dimensions``) and is forwarded to the templates.
    :arg axis_names: one ``box_centers_<axis>`` argument is created per entry;
        also forwarded to the templates as ``AXIS_NAMES``.
    :arg box_id_dtype: dtype of the ``box_child_ids_mnr_*`` arguments.
    :arg coord_dtype: dtype of ``root_extent`` and the ``box_centers_*``
        arguments.
    :arg box_level_dtype: dtype of the scalar ``level`` argument.
    :arg max_levels: forwarded to the templates as ``max_levels``.
    """
    from pyopencl.elementwise import ElementwiseKernel

    from boxtree.tools import ScalarArg, VectorArg
    from boxtree.traversal import HELPER_FUNCTION_TEMPLATE

    kernel_args = [
        # input
        ScalarArg(box_level_dtype, "level"),  # [1]
        ScalarArg(coord_dtype, "root_extent"),  # [1]
        VectorArg(np.int32, "box_has_children"),  # [nboxes]

        # input/output
        VectorArg(np.int32, "box_force_split"),  # [nboxes]

        # output
        VectorArg(np.int32, "have_upper_level_split_box"),  # [1]
    ]

    # input, count depends on the dimension; each is [nboxes]
    kernel_args.extend(
        VectorArg(box_id_dtype, f"box_child_ids_mnr_{mnr}")
        for mnr in range(2**dimensions))
    kernel_args.extend(
        VectorArg(coord_dtype, f"box_centers_{ax}")
        for ax in axis_names)

    template_vars = {
        "AXIS_NAMES": axis_names,
        "dimensions": dimensions,
        "max_levels": max_levels,
        # The remaining entries are required by HELPER_FUNCTION_TEMPLATE
        # and/or TRAVERSAL_PREAMBLE_MAKO_DEFS.
        "debug": False,
        "targets_have_extent": False,
        "sources_have_extent": False,
        "get_coord_vec_dtype": get_coord_vec_dtype,
        "cvec_sub": partial(coord_vec_subscript_code, dimensions),
    }

    kernel_body = LEVEL_RESTRICT_TPL.render(**template_vars)

    # The macro and the helper functions are rendered as a single template.
    helper_template = Template(r"""
#define LEVEL_TO_RAD(level) \
(root_extent * 1 / (coord_t) (1 << (level + 1)))
"""
        + HELPER_FUNCTION_TEMPLATE)
    kernel_preamble = (
        str(preamble_with_dtype_decls)
        + helper_template.render(**template_vars))

    return ElementwiseKernel(
        context,
        arguments=kernel_args,
        operation=kernel_body,
        name="level_restrict",
        preamble=kernel_preamble)
# }}}
# END KERNELS IN THE LEVEL LOOP
# {{{ nonchild srcntgt count extraction
EXTRACT_NONCHILD_SRCNTGT_COUNT_TPL = ElementwiseTemplate(
......@@ -784,9 +1085,6 @@ SOURCE_AND_TARGET_INDEX_FINDER = ElementwiseTemplate(
target_nr + 1 - (particle_id_t) is_source
- box_start_target_nr;
}
%elif srcntgts_have_extent:
box_source_counts_nonchild[box_id] = 0;
box_target_counts_nonchild[box_id] = 0;
%endif
// {{{ last particle for this or the parents' boxes? update counts
......@@ -869,20 +1167,16 @@ SRCNTGT_PERMUTER_TPL = ElementwiseTemplate(
# }}}
# {{{ box info kernel
BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
arguments="""//CL:mako//
/* input */
box_id_t *box_parent_ids,
morton_nr_t *box_morton_nrs,
bbox_t bbox,
box_id_t aligned_nboxes,
particle_id_t *box_srcntgt_counts_cumul,
particle_id_t *box_source_counts_cumul,
particle_id_t *box_target_counts_cumul,
particle_id_t max_particles_in_box,
int *box_has_children,
box_level_t *box_levels,
box_level_t nlevels,
......@@ -891,8 +1185,6 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
particle_id_t *box_target_counts_nonchild,
/* output */
box_id_t *box_child_ids, /* [2**dimensions, aligned_nboxes] */
coord_t *box_centers, /* [dimensions, aligned_nboxes] */
box_flags_t *box_flags, /* [nboxes] */
""",
operation=r"""//CL:mako//
......@@ -905,10 +1197,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
*
* box_srcntgt_counts_cumul is zero (here) exactly for empty leaves
* because it gets initialized to zero and never gets set to another
* value. If you check above, most box info is only ever initialized
* *if* there's a particle in the box, because the sort/build is a
* repeated scan over *particles* (not boxes). Thus, no particle -> no
* work done.
* value.
*/
particle_id_t particle_count = box_srcntgt_counts_cumul[box_id];
......@@ -940,41 +1229,16 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
dbg_assert(particle_count >= nonchild_srcntgt_count);
if (particle_count == 0)
{
// Empty leaf: Lots of stuff uninitialized, prevent
// damage by quitting now.
// Also, those should have gotten pruned by this point,
// unless skip_prune is True.
box_flags[box_id] = 0; // no children, no sources, no targets, bye.
PYOPENCL_ELWISE_CONTINUE;
}
else if (
%if adaptive:
particle_count - nonchild_srcntgt_count > max_particles_in_box
%else:
particle_count - nonchild_srcntgt_count > 0
%endif
&& box_levels[box_id] + 1 < nlevels)
if (box_has_children[box_id])
{
// This box has children, it is not a leaf.
// That second condition there covers a weird corner case. It's
// obviously true--a last-level box won't have children. But why
// is it necessary? It turns out that nonchild_srcntgt_count is not
// available (i.e. zero) for boxes on the last level. So these boxes
// look like they got split if they have enough non-child srcntgts,
// to the first part of the 'if' condition. But in fact they weren't,
// because of their non-child srcntgts.
my_box_flags |= BOX_HAS_CHILDREN;
my_box_flags |= BOX_HAS_SOURCE_OR_TARGET_CHILD_BOXES;
%if sources_are_targets:
if (particle_count - nonchild_srcntgt_count)
my_box_flags |= BOX_HAS_CHILD_SOURCES | BOX_HAS_CHILD_TARGETS;
my_box_flags |=
BOX_HAS_SOURCE_CHILD_BOXES | BOX_HAS_TARGET_CHILD_BOXES;
%else:
particle_id_t source_count = box_source_counts_cumul[box_id];
particle_id_t target_count = box_target_counts_cumul[box_id];
......@@ -983,15 +1247,15 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
dbg_assert(target_count >= nonchild_target_count);
if (source_count - nonchild_source_count)
my_box_flags |= BOX_HAS_CHILD_SOURCES;
my_box_flags |= BOX_HAS_SOURCE_CHILD_BOXES;
if (target_count - nonchild_target_count)
my_box_flags |= BOX_HAS_CHILD_TARGETS;
my_box_flags |= BOX_HAS_TARGET_CHILD_BOXES;
%endif
if (nonchild_source_count)
my_box_flags |= BOX_HAS_OWN_SOURCES;
my_box_flags |= BOX_IS_SOURCE_BOX;
if (nonchild_target_count)
my_box_flags |= BOX_HAS_OWN_TARGETS;
my_box_flags |= BOX_IS_TARGET_BOX;
}
else
{
......@@ -999,7 +1263,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
%if sources_are_targets:
if (particle_count)
my_box_flags |= BOX_HAS_OWN_SOURCES | BOX_HAS_OWN_TARGETS;
my_box_flags |= BOX_IS_SOURCE_BOX | BOX_IS_TARGET_BOX;
box_source_counts_nonchild[box_id] = particle_count;
dbg_assert(box_source_counts_nonchild == box_target_counts_nonchild);
......@@ -1008,9 +1272,9 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
particle_id_t my_target_count = particle_count - my_source_count;
if (my_source_count)
my_box_flags |= BOX_HAS_OWN_SOURCES;
my_box_flags |= BOX_IS_SOURCE_BOX;
if (my_target_count)
my_box_flags |= BOX_HAS_OWN_TARGETS;
my_box_flags |= BOX_IS_TARGET_BOX;
box_source_counts_nonchild[box_id] = my_source_count;
box_target_counts_nonchild[box_id] = my_target_count;
......@@ -1018,57 +1282,118 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate(
}
box_flags[box_id] = my_box_flags;
""")
# }}}
# {{{ box extents
BOX_EXTENTS_FINDER_TEMPLATE = ElementwiseTemplate(
arguments="""//CL:mako//
box_id_t aligned_nboxes,
box_id_t *box_child_ids,
coord_t *box_centers,
particle_id_t *box_particle_starts,
particle_id_t *box_particle_counts_nonchild
%for iaxis in range(dimensions):
, const coord_t *particle_${AXIS_NAMES[iaxis]}
%endfor
,
const coord_t *particle_radii,
int enable_radii,
coord_t *box_particle_bounding_box_min,
coord_t *box_particle_bounding_box_max,
""",
operation=TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
box_id_t ibox = i;
${load_center("box_center", "ibox")}
box_id_t parent_id = box_parent_ids[box_id];
morton_nr_t morton_nr = box_morton_nrs[box_id];
box_child_ids[parent_id + aligned_nboxes*morton_nr] = box_id;
<% axis_names = AXIS_NAMES[:dimensions] %>
/* walk up to root to find center */
%for idim in range(dimensions):
coord_t center_${idim} = 0;
// incorporate own particles
%for iaxis, ax in enumerate(axis_names):
coord_t min_particle_${ax} =
${coord_vec_subscript_code("box_center", iaxis)};
coord_t max_particle_${ax} =
${coord_vec_subscript_code("box_center", iaxis)};
%endfor
box_id_t walk_parent_id = parent_id;
box_id_t current_box_id = box_id;
morton_nr_t walk_morton_nr = morton_nr;
while (walk_parent_id != current_box_id)
particle_id_t start = box_particle_starts[ibox];
particle_id_t stop = start + box_particle_counts_nonchild[ibox];
for (particle_id_t iparticle = start; iparticle < stop; ++iparticle)
{
%for idim in range(dimensions):
{
bool has_bit = (walk_morton_nr & ${2**(dimensions-1-idim)});
center_${idim} = one_half*(
center_${idim}
- one_half
+ has_bit);
}
%endfor
coord_t particle_rad = 0;
%if srcntgts_have_extent:
// If only one has extent, then the radius array for the other
// may well be a null pointer.
if (enable_radii)
particle_rad = particle_radii[iparticle];
%endif
current_box_id = walk_parent_id;
walk_parent_id = box_parent_ids[walk_parent_id];
walk_morton_nr = box_morton_nrs[current_box_id];
%for iaxis, ax in enumerate(axis_names):
coord_t particle_coord_${ax} = particle_${ax}[iparticle];
min_particle_${ax} = min(
min_particle_${ax},
particle_coord_${ax} - particle_rad);
max_particle_${ax} = max(
max_particle_${ax},
particle_coord_${ax} + particle_rad);
%endfor
}
coord_t extent = bbox.max_x - bbox.min_x;
%for idim in range(dimensions):
// incorporate child boxes
for (int morton_nr = 0; morton_nr < ${2**dimensions}; ++morton_nr)
{
box_centers[box_id + aligned_nboxes*${idim}] =
bbox.min_${AXIS_NAMES[idim]} + extent*(one_half+center_${idim});
box_id_t child_id = box_child_ids[
morton_nr * aligned_nboxes + ibox];
if (child_id == 0)
continue;
%for iaxis, ax in enumerate(axis_names):
min_particle_${ax} = min(
min_particle_${ax},
box_particle_bounding_box_min[
${iaxis} * aligned_nboxes + child_id]);
max_particle_${ax} = max(
max_particle_${ax},
box_particle_bounding_box_max[
${iaxis} * aligned_nboxes + child_id]);
%endfor
}
// write result
%for iaxis, ax in enumerate(axis_names):
box_particle_bounding_box_min[
${iaxis} * aligned_nboxes + ibox] = min_particle_${ax};
box_particle_bounding_box_max[
${iaxis} * aligned_nboxes + ibox] = max_particle_${ax};
%endfor
""")
""",
name="find_box_extents")
# }}}
# {{{ kernel creation top-level
@log_process(logger)
def get_tree_build_kernel_info(context, dimensions, coord_dtype,
particle_id_dtype, box_id_dtype,
sources_are_targets, srcntgts_have_extent,
stick_out_factor, morton_nr_dtype, box_level_dtype,
adaptive):
sources_are_targets, srcntgts_extent_norm,
morton_nr_dtype, box_level_dtype, kind):
"""
:arg srcntgts_extent_norm: one of ``None``, ``"l2"`` or ``"linf"``
"""
logger.info("start building tree build kernels")
level_restrict = (kind == "adaptive-level-restricted")
adaptive = (kind != "non-adaptive")
# {{{ preparation
......@@ -1079,7 +1404,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
"incorrect results.", stacklevel=4)
from pyopencl.tools import dtype_to_c_struct, dtype_to_ctype
coord_vec_dtype = cl.array.vec.types[coord_dtype, dimensions]
coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
particle_id_dtype = np.dtype(particle_id_dtype)
box_id_dtype = np.dtype(box_id_dtype)
......@@ -1087,10 +1412,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
dev = context.devices[0]
morton_bin_count_dtype, _ = make_morton_bin_count_type(
dev, dimensions, particle_id_dtype,
srcntgts_have_extent)
srcntgts_have_extent=srcntgts_extent_norm is not None)
from boxtree.bounding_box import make_bounding_box_dtype
bbox_dtype, bbox_type_decl = make_bounding_box_dtype(
bbox_dtype, _bbox_type_decl = make_bounding_box_dtype(
dev, dimensions, coord_dtype)
from boxtree.tools import AXIS_NAMES
......@@ -1098,31 +1423,33 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
from boxtree.tools import padded_bin
from boxtree.tree import box_flags_enum
codegen_args = dict(
dimensions=dimensions,
axis_names=axis_names,
padded_bin=padded_bin,
coord_dtype=coord_dtype,
coord_vec_dtype=coord_vec_dtype,
bbox_dtype=bbox_dtype,
particle_id_dtype=particle_id_dtype,
morton_bin_count_dtype=morton_bin_count_dtype,
morton_nr_dtype=morton_nr_dtype,
box_id_dtype=box_id_dtype,
dtype_to_ctype=dtype_to_ctype,
AXIS_NAMES=AXIS_NAMES,
box_flags_enum=box_flags_enum,
adaptive=adaptive,
sources_are_targets=sources_are_targets,
srcntgts_have_extent=srcntgts_have_extent,
stick_out_factor=stick_out_factor,
enable_assert=False,
enable_printf=False,
)
codegen_args = {
"dimensions": dimensions,
"axis_names": axis_names,
"padded_bin": padded_bin,
"coord_dtype": coord_dtype,
"coord_vec_dtype": coord_vec_dtype,
"bbox_dtype": bbox_dtype,
"refine_weight_dtype": refine_weight_dtype,
"particle_id_dtype": particle_id_dtype,
"morton_bin_count_dtype": morton_bin_count_dtype,
"morton_nr_dtype": morton_nr_dtype,
"box_id_dtype": box_id_dtype,
"box_level_dtype": box_level_dtype,
"dtype_to_ctype": dtype_to_ctype,
"AXIS_NAMES": AXIS_NAMES,
"box_flags_enum": box_flags_enum,
"adaptive": adaptive,
"level_restrict": level_restrict,
"sources_are_targets": sources_are_targets,
"srcntgts_have_extent": srcntgts_extent_norm is not None,
"srcntgts_extent_norm": srcntgts_extent_norm,
"enable_assert": False,
"enable_printf": False,
}
# }}}
......@@ -1141,10 +1468,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
scan_preamble = (
preamble_with_dtype_decls
+ str(SCAN_PREAMBLE_TPL.render(**codegen_args))
+ str(MORTON_NR_SCAN_PREAMBLE_TPL.render(**codegen_args))
)
from pyopencl.tools import VectorArg, ScalarArg
from boxtree.tools import ScalarArg, VectorArg
common_arguments = (
[
# box-local morton bin counts for each particle at the current level
......@@ -1160,15 +1487,20 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
# segment flags
# invariant to sorting once set
# (particles are only reordered within a box)
VectorArg(np.uint8, "box_start_flags"), # [nsrcntgts]
VectorArg(np.uint8, "box_start_flags"), # [nsrcntgts]
VectorArg(box_id_dtype, "srcntgt_box_ids"), # [nsrcntgts]
VectorArg(box_id_dtype, "split_box_ids"), # [nsrcntgts]
VectorArg(box_id_dtype, "split_box_ids"), # [nboxes]
# per-box morton bin counts
VectorArg(morton_bin_count_dtype, "box_morton_bin_counts"),
# [nboxes]
VectorArg(refine_weight_dtype, "refine_weights"),
# [nsrcntgts]
ScalarArg(refine_weight_dtype, "max_leaf_refine_weight"),
# particle# at which each box starts
VectorArg(particle_id_dtype, "box_srcntgt_starts"), # [nboxes]
......@@ -1178,15 +1510,10 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
# pointer to parent box
VectorArg(box_id_dtype, "box_parent_ids"), # [nboxes]
# morton nr identifier {quadr,oct}ant of parent in which this
# box was created
VectorArg(morton_nr_dtype, "box_morton_nrs"), # [nboxes]
# number of boxes total
VectorArg(box_id_dtype, "nboxes"), # [1]
# level number
VectorArg(box_level_dtype, "box_levels"), # [nboxes]
ScalarArg(np.int32, "level"),
ScalarArg(particle_id_dtype, "max_particles_in_box"),
ScalarArg(bbox_dtype, "bbox"),
VectorArg(particle_id_dtype, "user_srcntgt_ids"), # [nsrcntgts]
......@@ -1196,26 +1523,35 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
+ [VectorArg(coord_dtype, ax) for ax in axis_names]
+ ([VectorArg(coord_dtype, "srcntgt_radii")]
if srcntgts_have_extent else [])
if srcntgts_extent_norm is not None else [])
)
morton_count_scan_arguments = list(common_arguments)
if srcntgts_extent_norm is not None:
morton_count_scan_arguments += [
(ScalarArg(coord_dtype, "stick_out_factor"))
]
from pyopencl.scan import GenericScanKernel
morton_count_scan = GenericScanKernel(
context, morton_bin_count_dtype,
arguments=common_arguments,
arguments=morton_count_scan_arguments,
input_expr=(
"scan_t_from_particle(%s)"
% ", ".join([
"i", "level", "&bbox", "morton_nrs",
"scan_t_from_particle({})".format(", ".join([
"i", "box_levels[srcntgt_box_ids[i]]", "&bbox", "morton_nrs",
"user_srcntgt_ids",
"refine_weights",
]
+ ["%s" % ax for ax in axis_names]
+ (["srcntgt_radii"] if srcntgts_have_extent else []))),
+ [f"{ax}" for ax in axis_names]
+ (["srcntgt_radii, stick_out_factor"]
if srcntgts_extent_norm is not None else [])))),
scan_expr="scan_t_add(a, b, across_seg_boundary)",
neutral="scan_t_neutral()",
is_segment_start_expr="box_start_flags[i]",
output_statement=SCAN_OUTPUT_STMT_TPL.render(**codegen_args),
preamble=scan_preamble)
output_statement=MORTON_NR_SCAN_OUTPUT_STMT_TPL.render(**codegen_args),
preamble=scan_preamble,
name_prefix="morton_scan")
# }}}
......@@ -1231,51 +1567,100 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
("box_id_t", box_id_dtype),
("morton_counts_t", morton_bin_count_dtype),
("box_level_t", box_level_dtype),
("refine_weight_t", refine_weight_dtype),
),
var_values=(
("dimensions", dimensions),
("srcntgts_have_extent", srcntgts_have_extent),
("srcntgts_have_extent", srcntgts_extent_norm is not None),
("srcntgts_extent_norm", srcntgts_extent_norm),
("adaptive", adaptive),
("padded_bin", padded_bin),
("level_restrict", level_restrict),
),
more_preamble=generic_preamble)
# }}}
# {{{ split-and-sort
# {{{ box splitter
# Work around a bug in Mako < 0.7.3
# FIXME: Is this needed?
box_s_codegen_args = codegen_args.copy()
box_s_codegen_args.update(
dim=None,
boundary_morton_nr=None)
box_splitter_kernel_source = BOX_SPLITTER_KERNEL_TPL.render(**box_s_codegen_args)
from pyopencl.elementwise import ElementwiseKernel
box_splitter_kernel = ElementwiseKernel(
context,
common_arguments
+ [
VectorArg(np.int32, "box_has_children"),
VectorArg(np.int32, "box_force_split"),
ScalarArg(coord_dtype, "root_extent"),
]
+ [VectorArg(box_id_dtype, f"box_child_ids_mnr_{mnr}")
for mnr in range(2**dimensions)]
+ [VectorArg(coord_dtype, f"box_centers_{ax}")
for ax in axis_names],
str(box_splitter_kernel_source),
name="box_splitter",
preamble=preamble_with_dtype_decls
)
# }}}
# {{{ particle renumberer
# Work around a bug in Mako < 0.7.3
s_and_s_codegen_args = codegen_args.copy()
s_and_s_codegen_args.update(
# FIXME: Copied from above. It may not be necessary?
part_rn_codegen_args = codegen_args.copy()
part_rn_codegen_args.update(
dim=None,
boundary_morton_nr=None)
split_and_sort_preamble = \
SPLIT_AND_SORT_PREAMBLE_TPL.render(**s_and_s_codegen_args)
particle_renumberer_preamble = \
PARTICLE_RENUMBERER_PREAMBLE_TPL.render(**part_rn_codegen_args)
split_and_sort_kernel_source = SPLIT_AND_SORT_KERNEL_TPL.render(**codegen_args)
particle_renumberer_kernel_source = \
PARTICLE_RENUMBERER_KERNEL_TPL.render(**codegen_args)
from pyopencl.elementwise import ElementwiseKernel
split_and_sort_kernel = ElementwiseKernel(
particle_renumberer_kernel = ElementwiseKernel(
context,
common_arguments
+ [
VectorArg(particle_id_dtype, "new_user_srcntgt_ids",
with_offset=True),
VectorArg(np.int32, "have_oversize_split_box", with_offset=True),
VectorArg(box_id_dtype, "new_srcntgt_box_ids", with_offset=True),
VectorArg(box_level_dtype, "box_levels", with_offset=True),
],
str(split_and_sort_kernel_source), name="split_and_sort",
[*common_arguments,
VectorArg(np.int32, "box_has_children"),
VectorArg(np.int32, "box_force_split"),
VectorArg(particle_id_dtype, "new_user_srcntgt_ids"),
VectorArg(box_id_dtype, "new_srcntgt_box_ids")],
str(particle_renumberer_kernel_source), name="renumber_particles",
preamble=(
preamble_with_dtype_decls
+ str(split_and_sort_preamble))
+ str(particle_renumberer_preamble))
)
# }}}
# {{{ level restrict propagator
if level_restrict:
# At compile time the level restrict kernel requires fixing a
# "max_levels" constant for traversing the tree. This constant cannot be
# known at this point, hence we return a kernel builder.
level_restrict_kernel_builder = partial(build_level_restrict_kernel,
context, preamble_with_dtype_decls, dimensions, axis_names, box_id_dtype,
coord_dtype, box_level_dtype)
else:
level_restrict_kernel_builder = None
# }}}
# END KERNELS IN LEVEL LOOP
if srcntgts_have_extent:
if srcntgts_extent_norm is not None:
extract_nonchild_srcntgt_count_kernel = \
EXTRACT_NONCHILD_SRCNTGT_COUNT_TPL.build(
context,
......@@ -1294,26 +1679,53 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
# FIXME: Turn me into a scan template
from pyopencl.tools import VectorArg
from boxtree.tools import VectorArg
find_prune_indices_kernel = GenericScanKernel(
context, box_id_dtype,
arguments=[
# input
VectorArg(particle_id_dtype, "box_srcntgt_counts_cumul"),
# output
VectorArg(box_id_dtype, "to_box_id"),
VectorArg(box_id_dtype, "from_box_id"),
VectorArg(box_id_dtype, "src_box_id"),
VectorArg(box_id_dtype, "dst_box_id"),
VectorArg(box_id_dtype, "nboxes_post_prune"),
],
input_expr="box_srcntgt_counts_cumul[i] == 0 ? 1 : 0",
input_expr="box_srcntgt_counts_cumul[i] != 0",
preamble=box_flags_enum.get_c_defines(),
scan_expr="a+b", neutral="0",
output_statement="""
to_box_id[i] = i-prev_item;
if (box_srcntgt_counts_cumul[i])
from_box_id[i-prev_item] = i;
if (i+1 == N) *nboxes_post_prune = N-item;
""")
{
dst_box_id[i] = item - 1;
src_box_id[item - 1] = i;
}
if (i+1 == N) *nboxes_post_prune = item;
""",
name_prefix="find_prune_indices_scan")
# }}}
# {{{ find new level box counts
find_level_box_counts_kernel = GenericScanKernel(
context, box_id_dtype,
arguments=[
# input
VectorArg(box_level_dtype, "box_levels"), # [nboxes]
# output
VectorArg(box_id_dtype, "level_box_counts"), # [nlevels]
],
input_expr="1",
is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i - 1]",
scan_expr="across_seg_boundary ? b : a + b",
neutral="0",
output_statement=r"""//CL//
if (i + 1 == N || box_levels[i] != box_levels[i + 1])
{
level_box_counts[box_levels[i]] = item;
}
""",
name_prefix="find_level_box_counts_scan")
# }}}
......@@ -1364,7 +1776,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
("box_id_t", box_id_dtype),
),
var_values=(
("srcntgts_have_extent", srcntgts_have_extent),
("srcntgts_have_extent", srcntgts_extent_norm is not None),
("sources_are_targets", sources_are_targets),
),
more_preamble=generic_preamble)
......@@ -1385,7 +1797,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
("box_flags_t", box_flags_enum.dtype),
("box_level_t", box_level_dtype),
)
codegen_args_tuples = tuple(six.iteritems(codegen_args))
codegen_args_tuples = tuple(codegen_args.items())
box_info_kernel = BOX_INFO_KERNEL_TPL.build(
context,
type_aliases,
......@@ -1395,7 +1807,25 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
# }}}
logger.info("tree build kernels built")
# {{{ box extent
box_extents_finder_kernel = BOX_EXTENTS_FINDER_TEMPLATE.build(context,
type_aliases=(
("box_id_t", box_id_dtype),
("coord_t", coord_dtype),
("coord_vec_t", get_coord_vec_dtype(coord_dtype, dimensions)),
("particle_id_t", particle_id_dtype),
),
var_values=(
("coord_vec_subscript_code",
partial(coord_vec_subscript_code, dimensions)),
("dimensions", dimensions),
("AXIS_NAMES", AXIS_NAMES),
("srcntgts_have_extent", srcntgts_extent_norm is not None),
),
)
# }}}
return _KernelInfo(
particle_id_dtype=particle_id_dtype,
......@@ -1404,15 +1834,20 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
morton_count_scan=morton_count_scan,
split_box_id_scan=split_box_id_scan,
split_and_sort_kernel=split_and_sort_kernel,
box_splitter_kernel=box_splitter_kernel,
particle_renumberer_kernel=particle_renumberer_kernel,
level_restrict=level_restrict,
level_restrict_kernel_builder=level_restrict_kernel_builder,
extract_nonchild_srcntgt_count_kernel=(
extract_nonchild_srcntgt_count_kernel),
find_prune_indices_kernel=find_prune_indices_kernel,
find_level_box_counts_kernel=find_level_box_counts_kernel,
srcntgt_permuter=srcntgt_permuter,
source_counter=source_counter,
source_and_target_index_finder=source_and_target_index_finder,
box_info_kernel=box_info_kernel,
box_extents_finder_kernel=box_extents_finder_kernel,
)
# }}}
......@@ -1501,7 +1936,6 @@ POINT_SOURCE_LINKING_BOX_POINT_SOURCES = ElementwiseTemplate(
# }}}
# {{{ target filtering
TREE_ORDER_TARGET_FILTER_SCAN_TPL = ScanTemplate(
......@@ -1575,4 +2009,4 @@ TREE_ORDER_TARGET_FILTER_INDEX_TPL = ElementwiseTemplate(
# }}}
# vim: foldmethod=marker:filetype=pyopencl
# vim: foldmethod=marker
"""
.. currentmodule:: boxtree
.. _tree-of-boxes:
Manipulating Trees of Boxes
---------------------------
These functions manipulate instances of :class:`TreeOfBoxes`.
.. note::
These functions currently keep their bulk data in :class:`numpy.ndarray`
instances. This contrasts with the particle-based tree (:class:`Tree`),
which operates on data in :class:`pyopencl.array.Array` instances. Along
with the rest of :mod:`boxtree`, this will migrate to :mod:`arraycontext`
in the future.
.. autofunction:: make_tree_of_boxes_root
.. autofunction:: refine_tree_of_boxes
.. autofunction:: uniformly_refine_tree_of_boxes
.. autofunction:: coarsen_tree_of_boxes
.. autofunction:: refine_and_coarsen_tree_of_boxes
.. autofunction:: make_meshmode_mesh_from_leaves
"""
__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
from typing import TYPE_CHECKING, Any
import numpy as np
from boxtree.tree import TreeOfBoxes, box_flags_enum
if TYPE_CHECKING or getattr(sys, "_BUILDING_SPHINX_DOCS", False):
from meshmode.mesh import Mesh
# {{{ utils for tree of boxes
def _compute_tob_box_flags(box_child_ids: np.ndarray) -> np.ndarray:
    """Derive the per-box flag array from the child-id table.

    :arg box_child_ids: array whose second axis enumerates the boxes; a box
        whose entries are all zero is treated as a leaf.
    :returns: an array of dtype ``box_flags_enum.dtype`` with one entry per
        box.
    """
    # A box with no recorded children is a leaf.
    childless = np.all(box_child_ids == 0, axis=0)
    with_children = ~childless

    # For the time being, every box is taken to be both a source box and
    # a target box.
    flags = np.full(
        box_child_ids.shape[1],
        box_flags_enum.IS_SOURCE_BOX | box_flags_enum.IS_TARGET_BOX,
        dtype=box_flags_enum.dtype)

    parent_bits = (
        box_flags_enum.HAS_SOURCE_CHILD_BOXES
        | box_flags_enum.HAS_TARGET_CHILD_BOXES)

    flags[childless] = flags[childless] | box_flags_enum.IS_LEAF_BOX
    flags[with_children] = flags[with_children] | parent_bits

    return flags
def _resized_array(arr: np.ndarray, new_size: int) -> np.ndarray:
"""Return a resized copy of the array. The new_size is a scalar which is
applied to the last dimension.
"""
old_size = arr.shape[-1]
prefix = (slice(None), ) * (arr.ndim - 1)
if old_size >= new_size:
key = (*prefix, slice(new_size))
return arr[key].copy()
else:
new_shape = list(arr.shape)
new_shape[-1] = new_size
new_arr = np.zeros(new_shape, arr.dtype)
key = (*prefix, slice(old_size))
new_arr[key] = arr
return new_arr
def _vec_of_signs(dim: int, i: int) -> np.ndarray:
"""The sign vector is obtained by converting i to a dim-bit binary.
"""
# e.g. bin(10) = '0b1010'
binary_digits = [int(bd) for bd in bin(i)[2:]]
n = len(binary_digits)
assert n <= dim
return np.array([0]*(dim-n) + binary_digits) * 2 - 1
# }}}
# {{{ refine/coarsen a tree of boxes
def refine_tree_of_boxes(tob: TreeOfBoxes, refine_flags: np.ndarray) -> TreeOfBoxes:
    """Return a copy of *tob* in which every box marked in *refine_flags*
    has been refined. No coarsening is requested.
    """
    return refine_and_coarsen_tree_of_boxes(
        tob, refine_flags=refine_flags, coarsen_flags=None)
def uniformly_refine_tree_of_boxes(tob: TreeOfBoxes) -> TreeOfBoxes:
    """Return a copy of *tob* in which every leaf box has been refined."""
    leaf_mask = (tob.box_flags & box_flags_enum.IS_LEAF_BOX) != 0

    refine_flags = np.zeros(tob.nboxes, bool)
    refine_flags[leaf_mask] = True

    return refine_tree_of_boxes(tob, refine_flags)
def coarsen_tree_of_boxes(
        tob: TreeOfBoxes, coarsen_flags: np.ndarray,
        error_on_ignored_flags: bool = True
        ) -> TreeOfBoxes:
    """Return a copy of *tob* in which the boxes marked in *coarsen_flags*
    have been coarsened. No refinement is requested.

    *error_on_ignored_flags* is forwarded unchanged to
    :func:`refine_and_coarsen_tree_of_boxes`.
    """
    return refine_and_coarsen_tree_of_boxes(
        tob,
        refine_flags=None,
        coarsen_flags=coarsen_flags,
        error_on_ignored_flags=error_on_ignored_flags)
def _apply_refine_flags_without_sorting(refine_flags, tob):
    """Return a copy of *tob* in which every box flagged in *refine_flags*
    has been split into ``2**dim`` children.

    The new boxes are appended after the existing ones, with the children
    of each refined box occupying consecutive box numbers. Existing boxes
    keep their numbers. The result is not sorted by level.

    :arg refine_flags: an array of size ``tob.nboxes``; only leaf boxes may
        be flagged.
    :returns: *tob* itself if nothing is flagged, otherwise a new
        :class:`TreeOfBoxes` whose ``level_start_box_nrs`` is *None*.
    :raises ValueError: if a non-leaf box is flagged.
    """
    box_is_leaf = tob.box_flags & box_flags_enum.IS_LEAF_BOX != 0
    if refine_flags[~box_is_leaf].any():
        raise ValueError("attempting to split non-leaf")

    refine_parents, = np.where(refine_flags)
    if len(refine_parents) == 0:
        return tob

    dim = tob.dimensions
    nchildren = 2**dim
    nboxes_new = tob.nboxes + len(refine_parents) * nchildren

    # First child of the k-th refined box; its siblings follow directly.
    child_box_starts = (
        tob.nboxes
        + nchildren * np.arange(len(refine_parents)))

    box_parents = _resized_array(tob.box_parent_ids, nboxes_new)
    box_centers = _resized_array(tob.box_centers, nboxes_new)
    box_children = _resized_array(tob.box_child_ids, nboxes_new)
    box_levels = _resized_array(tob.box_levels, nboxes_new)

    # new boxes are appended at the end, so applying coarsen_flags wrt the
    # original tree is still meaningful after this

    # Children are numbered parent-major (see child_box_starts), so each
    # parent id has to be repeated nchildren times in a row. Tiling the
    # parent list instead would disagree with box_children as soon as more
    # than one box is refined.
    box_parents[tob.nboxes:] = np.repeat(refine_parents, nchildren)
    box_levels[tob.nboxes:] = tob.box_levels[box_parents[tob.nboxes:]] + 1
    box_children[:, refine_parents] = (
        child_box_starts + np.arange(nchildren).reshape(-1, 1))

    for i in range(nchildren):
        children_i = box_children[i, refine_parents]
        # The i-th child is displaced from its parent's center by
        # root_extent / 2**(child level + 1) along each axis, with the
        # direction per axis given by the sign vector of i.
        offsets = (
            tob.root_extent * _vec_of_signs(dim, i).reshape(-1, 1)
            * (1/2**(1+box_levels[children_i])))
        box_centers[:, children_i] = (
            box_centers[:, refine_parents] + offsets)

    return TreeOfBoxes(
        box_centers=box_centers,
        root_extent=tob.root_extent,
        box_parent_ids=box_parents,
        box_child_ids=box_children,
        box_levels=box_levels,
        box_flags=_compute_tob_box_flags(box_children),
        level_start_box_nrs=None,
        box_id_dtype=tob.box_id_dtype,
        box_level_dtype=tob.box_level_dtype,
        coord_dtype=tob.coord_dtype,
        sources_have_extent=tob.sources_have_extent,
        targets_have_extent=tob.targets_have_extent,
        extent_norm=tob.extent_norm,
        stick_out_factor=tob.stick_out_factor,
        _is_pruned=tob._is_pruned,
        )
def _deleted_box_level_marker(level_dtype):
    """Return the level value used to mark a box as deleted.

    Infinity is used for floating-point level arrays. Integer arrays cannot
    hold infinity, so the largest representable value is used there. In both
    cases the marker sorts after every real level.
    """
    level_dtype = np.dtype(level_dtype)
    if np.issubdtype(level_dtype, np.floating):
        return np.inf
    return np.iinfo(level_dtype).max


def _apply_coarsen_flags(coarsen_flags, tob, error_on_ignored_flags=True):
    """Return a copy of *tob* in which flagged leaf boxes are merged into
    their parents.

    A flag is only honored if the flagged box has a parent and all children
    of that parent are leaves. Honoring it deletes all of those children.
    Deleted boxes stay in the arrays, marked with parent ``-1`` and the
    level given by :func:`_deleted_box_level_marker`, so box numbers are
    unchanged. :func:`_sort_and_prune_deleted_boxes` removes them.

    :arg coarsen_flags: an array of size ``tob.nboxes``; only leaf boxes may
        be flagged.
    :arg error_on_ignored_flags: if true, raise when a flag cannot be
        honored, otherwise only warn.
    :returns: *tob* itself if nothing is flagged, otherwise a new
        :class:`TreeOfBoxes` whose ``level_start_box_nrs`` is *None*.
    :raises ValueError: if a non-leaf box is flagged.
    :raises RuntimeError: if flags are ignored and *error_on_ignored_flags*
        is true.
    """
    box_is_leaf = tob.box_flags & box_flags_enum.IS_LEAF_BOX != 0
    if coarsen_flags[~box_is_leaf].any():
        raise ValueError("attempting to coarsen non-leaf")

    coarsen_sources, = np.where(coarsen_flags)
    if coarsen_sources.size == 0:
        return tob

    coarsen_parents = tob.box_parent_ids[coarsen_sources]

    # A box without a parent (the root) has nothing to be merged into.
    # Without this guard, parent id -1 would be read as "the last box".
    has_parent = coarsen_parents >= 0

    # Shape (nchildren, nsources): column k lists the flagged box k together
    # with its siblings. Keeping this two-dimensional makes the reduction
    # below yield one decision per flagged box.
    coarsen_peers = tob.box_child_ids[:, coarsen_parents]
    coarsen_exec_flags = (
        has_parent & np.all(box_is_leaf[coarsen_peers], axis=0))

    # when a leaf box marked for coarsening has non-leaf peers
    # (every entry of coarsen_sources is flagged, so anything not executed
    # has been ignored)
    coarsen_flags_ignored = ~coarsen_exec_flags
    if np.any(coarsen_flags_ignored):
        msg = (f"{np.sum(coarsen_flags_ignored)} out of "
               f"{np.sum(coarsen_flags)} coarsening flags ignored "
               "to prevent removing non-leaf boxes")
        if error_on_ignored_flags:
            raise RuntimeError(msg)
        else:
            import warnings
            warnings.warn(msg, stacklevel=3)

    # Flagged siblings share a parent; handle each parent once.
    coarsen_parents = np.unique(coarsen_parents[coarsen_exec_flags])
    coarsen_peers = tob.box_child_ids[:, coarsen_parents]

    # deleted boxes are marked as:
    #     level = deleted-box marker (sorts after all real levels)
    #     parent = -1
    box_parents = tob.box_parent_ids.copy()
    box_parents[coarsen_peers] = -1
    box_children = tob.box_child_ids.copy()
    box_children[:, coarsen_parents] = 0
    box_levels = tob.box_levels.copy()
    box_levels[coarsen_peers] = _deleted_box_level_marker(box_levels.dtype)

    return TreeOfBoxes(
        box_centers=tob.box_centers,
        root_extent=tob.root_extent,
        box_parent_ids=box_parents,
        box_child_ids=box_children,
        box_levels=box_levels,
        box_flags=_compute_tob_box_flags(box_children),
        level_start_box_nrs=None,
        box_id_dtype=tob.box_id_dtype,
        box_level_dtype=tob.box_level_dtype,
        coord_dtype=tob.coord_dtype,
        sources_have_extent=tob.sources_have_extent,
        targets_have_extent=tob.targets_have_extent,
        extent_norm=tob.extent_norm,
        stick_out_factor=tob.stick_out_factor,
        _is_pruned=tob._is_pruned,
        )


def _sort_boxes_by_level(tob, queue=None):
    """Return *tob* with its boxes reordered into non-decreasing level.

    Box ids are positions in the per-box arrays, so the ids stored in the
    parent and child arrays are renumbered to match the new order.

    :arg queue: unused.
    :returns: *tob* itself if the levels are already non-decreasing,
        otherwise a new :class:`TreeOfBoxes` whose ``level_start_box_nrs``
        is *None*.
    """
    levels = tob.box_levels
    if np.all(levels[:-1] <= levels[1:]):
        return tob

    # reorder boxes into non-decreasing levels; a stable sort keeps the
    # relative order of boxes on the same level
    neworder = np.argsort(levels, kind="stable")

    # old_to_new[old id] == new id
    old_to_new = np.empty(len(neworder), dtype=np.intp)
    old_to_new[neworder] = np.arange(len(neworder))

    box_centers = tob.box_centers[:, neworder]
    box_levels = levels[neworder]

    # Fancy indexing already returns fresh arrays, which are updated in
    # place below. Negative parent ids (root, deleted boxes) and zero child
    # ids ("no child") are placeholders rather than references and are
    # left untouched.
    box_parent_ids = tob.box_parent_ids[neworder]
    has_parent = box_parent_ids >= 0
    box_parent_ids[has_parent] = old_to_new[box_parent_ids[has_parent]]

    box_child_ids = tob.box_child_ids[:, neworder]
    has_child = box_child_ids != 0
    box_child_ids[has_child] = old_to_new[box_child_ids[has_child]]

    return TreeOfBoxes(
        box_centers=box_centers,
        root_extent=tob.root_extent,
        box_parent_ids=box_parent_ids,
        box_child_ids=box_child_ids,
        box_levels=box_levels,
        box_flags=_compute_tob_box_flags(box_child_ids),
        level_start_box_nrs=None,
        box_id_dtype=tob.box_id_dtype,
        box_level_dtype=tob.box_level_dtype,
        coord_dtype=tob.coord_dtype,
        sources_have_extent=tob.sources_have_extent,
        targets_have_extent=tob.targets_have_extent,
        extent_norm=tob.extent_norm,
        stick_out_factor=tob.stick_out_factor,
        _is_pruned=tob._is_pruned,
        )


def _sort_and_prune_deleted_boxes(tob):
    """Return a copy of *tob* sorted by level with deleted boxes removed.

    Deleted boxes carry the level given by
    :func:`_deleted_box_level_marker`, so after sorting they sit at the end
    of the arrays and can be cut off.
    """
    tob = _sort_boxes_by_level(tob)

    deleted_marker = _deleted_box_level_marker(tob.box_levels.dtype)
    n_stale_boxes = int(np.sum(tob.box_levels == deleted_marker))
    newn = tob.nboxes - n_stale_boxes

    return TreeOfBoxes(
        root_extent=tob.root_extent,
        box_parent_ids=tob.box_parent_ids[:newn],
        box_child_ids=tob.box_child_ids[:, :newn],
        box_levels=tob.box_levels[:newn],
        box_centers=tob.box_centers[:, :newn],
        box_flags=_compute_tob_box_flags(tob.box_child_ids[:, :newn]),
        level_start_box_nrs=None,
        box_id_dtype=tob.box_id_dtype,
        box_level_dtype=tob.box_level_dtype,
        coord_dtype=tob.coord_dtype,
        sources_have_extent=tob.sources_have_extent,
        targets_have_extent=tob.targets_have_extent,
        extent_norm=tob.extent_norm,
        stick_out_factor=tob.stick_out_factor,
        _is_pruned=tob._is_pruned,
        )
def refine_and_coarsen_tree_of_boxes(
        tob: TreeOfBoxes,
        refine_flags: np.ndarray | None = None,
        coarsen_flags: np.ndarray | None = None, *,
        error_on_ignored_flags: bool = True,
        ) -> TreeOfBoxes:
    """Make a refined/coarsened copy of *tob*.

    Refinement is applied before coarsening, so when children of the same
    parent box are marked differently, the refinement flag takes priority.

    Both refinement and coarsening flags may only be set on leaves.
    To prevent drastic mesh change, coarsening is only executed when a leaf
    box is marked for coarsening, and its parent's children are all leaf
    boxes (so that change in the number of boxes is bounded per box flagged).

    Please note that the above behavior may be subject to change in the
    future.

    :arg refine_flags: a boolean array of size `nboxes`; *None* means that
        no box is refined.
    :arg coarsen_flags: a boolean array of size `nboxes`; *None* means that
        no box is coarsened.
    :arg error_on_ignored_flags: if true, an exception is raised when
        enforcing level restriction requires ignoring some coarsening flags.
    :returns: a processed copy of the tree.
    :raises ValueError: if a box is flagged for both refinement and
        coarsening.
    """
    # A missing flag array stands for "no box flagged".
    refine_flags, coarsen_flags = (
        np.zeros(tob.nboxes, dtype=bool) if flags is None else flags
        for flags in (refine_flags, coarsen_flags))

    doubly_flagged = refine_flags & coarsen_flags
    if doubly_flagged.any():
        raise ValueError("some boxes are simultaneously marked "
                         "to refine and coarsen")

    refined = _apply_refine_flags_without_sorting(refine_flags, tob)

    # Refinement may have appended boxes; pad the coarsening flags to the
    # new box count.
    padded_coarsen_flags = _resized_array(coarsen_flags, refined.nboxes)
    coarsened = _apply_coarsen_flags(
        padded_coarsen_flags, refined, error_on_ignored_flags)

    return _sort_and_prune_deleted_boxes(coarsened)
# }}}
# {{{ make_tree_of_boxes_root
def make_tree_of_boxes_root(
        bbox: tuple[np.ndarray, np.ndarray], *,
        box_id_dtype: Any = None,
        box_level_dtype: Any = None,
        coord_dtype: Any = None,
        ) -> TreeOfBoxes:
    """
    Make the minimal tree of boxes, consisting of a single root box filling
    *bbox*.

    .. note::

        *bbox* is expected to be square (with tolerances as accepted by
        :func:`numpy.allclose`).

    :arg bbox: a :class:`tuple` of ``(lower_bounds, upper_bounds)`` for the
        bounding box.
    :arg box_id_dtype: dtype for box numbers, 32-bit integers if *None*.
    :arg box_level_dtype: dtype for box levels, 32-bit integers if *None*.
    :arg coord_dtype: dtype for coordinates, the dtype of the lower bounds
        if *None*.
    """
    assert len(bbox) == 2

    from pytools import single_valued

    lower, upper = bbox
    dim = single_valued([len(lower), len(upper)])

    box_id_dtype = np.dtype(
        np.int32 if box_id_dtype is None else box_id_dtype)
    box_level_dtype = np.dtype(
        np.int32 if box_level_dtype is None else box_level_dtype)
    coord_dtype = np.dtype(
        lower.dtype if coord_dtype is None else coord_dtype)

    # The single box is centered on the bounding box ...
    box_centers = np.array(
        [(lo + hi) * 0.5 for lo, hi in zip(lower, upper)],
        dtype=coord_dtype,
        ).reshape(dim, 1)

    # ... and its extent must agree along all axes.
    root_extent = single_valued(
        np.array(
            [hi - lo for lo, hi in zip(lower, upper)],
            dtype=coord_dtype),
        equality_pred=np.allclose)

    box_parent_ids = np.zeros(1, dtype=box_id_dtype)
    box_parent_ids[0] = -1  # root has no parent

    # all-zero child ids: the root has no children yet
    box_child_ids = np.zeros((2**dim, 1), dtype=box_id_dtype)

    return TreeOfBoxes(
        box_centers=box_centers,
        root_extent=root_extent,
        box_parent_ids=box_parent_ids,
        box_child_ids=box_child_ids,
        box_levels=np.zeros(1, dtype=box_level_dtype),
        box_flags=_compute_tob_box_flags(box_child_ids),
        level_start_box_nrs=np.zeros(1, dtype=box_level_dtype),
        box_id_dtype=box_id_dtype,
        box_level_dtype=box_level_dtype,
        coord_dtype=coord_dtype,
        sources_have_extent=False,
        targets_have_extent=False,
        extent_norm="linf",
        stick_out_factor=0,
        _is_pruned=True,
        )
# }}}
# {{{ make_meshmode_mesh_from_leaves
def make_meshmode_mesh_from_leaves(tob: TreeOfBoxes) -> tuple["Mesh", np.ndarray]:
    """Make a :class:`~meshmode.mesh.Mesh` from the leaf boxes of the tree
    of boxes *tob*.

    Each leaf box becomes one tensor-product element with its own ``2**dim``
    vertices; vertices shared between neighboring boxes are not merged.

    :returns: A tuple of the mesh and a vector of the element number -> box
        number mapping.
    """
    dim = tob.dimensions
    nbox_vertices = 2**dim

    leaf_boxes = tob.leaf_boxes
    nleaves = len(leaf_boxes)

    leaf_centers = tob.box_centers[:, leaf_boxes]
    leaf_radii = tob.root_extent / 2 / (2**tob.box_levels[leaf_boxes])

    # use tensor product nodes ordering
    import modepy as mp
    corner_signs = mp.tensor_product_nodes(dim, np.array([-1, 1]))

    # Every leaf contributes its center shifted by +/- radius per axis.
    vertex_centers = np.repeat(leaf_centers, nbox_vertices, axis=1)
    vertex_radii = np.repeat(leaf_radii, nbox_vertices)
    vertices = vertex_centers + vertex_radii * np.tile(corner_signs, (1, nleaves))

    # FIXME: purge redundant vertices
    from meshmode.mesh import TensorProductElementGroup, make_mesh
    from meshmode.mesh.generation import make_group_from_vertices

    vertex_indices = np.arange(
        nleaves * nbox_vertices, dtype=np.int32).reshape([-1, nbox_vertices])

    group = make_group_from_vertices(
        vertices, vertex_indices, 1,
        group_cls=TensorProductElementGroup,
        unit_nodes=None)

    return make_mesh(vertices, [group]), tob.leaf_boxes
# }}}
# vim: foldmethod=marker
VERSION = (2013, 1)
VERSION_TEXT = ".".join(str(i) for i in VERSION)
from importlib import metadata
def _parse_version(version: str) -> tuple[tuple[int, ...], str]:
import re
m = re.match(r"^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)
assert m is not None
return tuple(int(nr) for nr in m.group(1).split(".")), m.group(2)
VERSION_TEXT = metadata.version("boxtree")
VERSION, VERSION_STATUS = _parse_version(VERSION_TEXT)
from __future__ import division
from __future__ import absolute_import
from six.moves import range
from six.moves import zip
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
__license__ = """
......@@ -25,6 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
# {{{ utilities
def int_to_roman(inp):
"""
......@@ -34,13 +33,13 @@ def int_to_roman(inp):
# https://code.activestate.com/recipes/81611-roman-numerals/
if not isinstance(inp, int):
raise TypeError("expected integer, got %s" % type(inp))
raise TypeError(f"expected integer, got {type(inp)}")
if inp == 0:
return "Z"
if not 0 < inp < 4000:
raise ValueError("Argument must be between 1 and 3999 (got %d)" % inp)
ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
raise ValueError(f"Argument must be between 1 and 3999 (got {inp})")
ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
nums = ("M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I")
result = ""
for i in range(len(ints)):
count = int(inp / ints[i])
......@@ -48,6 +47,10 @@ def int_to_roman(inp):
inp -= ints[i] * count
return result
# }}}
# {{{ tree plotting
class TreePlotter:
"""Assumes that the tree has data living on the host.
......@@ -81,13 +84,19 @@ class TreePlotter:
"""
:arg kwargs: keyword arguments to pass on to
:class:`matplotlib.patches.PathPatch`,
e.g. `facecolor='red', edgecolor='yellow', alpha=0.5`
e.g. `facecolor="red", edgecolor="yellow", alpha=0.5`
"""
el, eh = self.tree.get_box_extent(ibox)
import matplotlib.pyplot as pt
shrink_factor = kwargs.pop("shrink_factor", 0)
if shrink_factor:
center = 0.5*(el+eh)
el += (center-el)*shrink_factor
eh += (center-eh)*shrink_factor
import matplotlib.patches as mpatches
import matplotlib.pyplot as pt
from matplotlib.path import Path
pathdata = [
......@@ -98,7 +107,7 @@ class TreePlotter:
(Path.CLOSEPOLY, (el[0], el[1])),
]
codes, verts = zip(*pathdata)
codes, verts = zip(*pathdata, strict=True)
path = Path(verts, codes)
patch = mpatches.PathPatch(path, **kwargs)
pt.gca().add_patch(patch)
......@@ -113,7 +122,7 @@ class TreePlotter:
lev = int(tree.box_levels[ibox])
pt.text(x, y, str(ibox), fontsize=20*1.15**(-lev),
ha="center", va="center",
bbox=dict(facecolor='white', alpha=0.5, lw=0))
bbox={"facecolor": "white", "alpha": 0.5, "lw": 0})
def get_tikz_for_tree(self):
if self.tree.dimensions != 2:
......@@ -121,28 +130,28 @@ class TreePlotter:
lines = []
lines.append(r"\def\nboxes{%d}" % self.tree.nboxes)
lines.append(r"\def\lastboxnr{%d}" % (self.tree.nboxes-1))
lines.append(r"\def\nboxes{%d}" % self.tree.nboxes) # noqa: UP031
lines.append(r"\def\lastboxnr{%d}" % (self.tree.nboxes-1)) # noqa: UP031
for ibox in range(self.tree.nboxes):
el, eh = self.tree.get_box_extent(ibox)
el_0, el_1 = float(el[0]), float(el[1])
eh_0, eh_1 = float(eh[0]), float(eh[1])
c = self.tree.box_centers[:, ibox]
c_0, c_1 = float(c[0]), float(c[1])
lines.append(
r"\coordinate (boxl%d) at (%r, %r);"
% (ibox, float(el[0]), float(el[1])))
fr"\coordinate (boxl{ibox}) at ({el_0!r}, {el_1!r});")
lines.append(
r"\coordinate (boxh%d) at (%r, %r);"
% (ibox, float(eh[0]), float(eh[1])))
fr"\coordinate (boxh{ibox}) at ({eh_0!r}, {eh_1!r});")
lines.append(
r"\coordinate (boxc%d) at (%r, %r);"
% (ibox, float(c[0]), float(c[1])))
fr"\coordinate (boxc{ibox}) at ({c_0!r}, {c_1!r});")
lines.append(
r"\def\boxsize%s{%r}"
% (int_to_roman(ibox), float(eh[0]-el[0])))
r"\def\boxsize%s{%r}" % (int_to_roman(ibox), eh_0 - el_0) # noqa: UP031
)
lines.append(
r"\def\boxlevel%s{%r}"
% (int_to_roman(ibox), self.tree.box_levels[ibox]))
r"\dev\boxlevel%s{%r}" % (int_to_roman(ibox), # noqa: UP031
self.tree.box_levels[ibox]))
lines.append(
r"\def\boxpath#1{(boxl#1) rectangle (boxh#1)}")
......@@ -158,4 +167,111 @@ class TreePlotter:
r"}}")
return "\n".join(lines)
# }}}
# {{{ traversal plotting
def _draw_box_list(tree_plotter, ibox, starts, lists, key_to_box=None, **kwargs):
rng = kwargs.pop("rng", None)
if rng is None:
rng = np.random.default_rng()
default_facecolor = "blue"
if key_to_box is not None:
ind, = np.where(key_to_box == ibox)
if len(ind):
key, = ind
else:
# indicate empty list
actual_kwargs = {
"edgecolor": getattr(kwargs, "facecolor", default_facecolor),
"fill": False,
"alpha": 0.5,
"shrink_factor": -0.1+0.1*rng.random(),
}
tree_plotter.draw_box(ibox, **actual_kwargs)
return
else:
key = ibox
start, end = starts[key:key+2]
if start == end:
return
actual_kwargs = {
"facecolor": default_facecolor,
"linewidth": 0,
"alpha": 0.5,
"shrink_factor": 0.1 + rng.random()*0.2,
}
actual_kwargs.update(kwargs)
print(actual_kwargs["facecolor"], ibox, lists[start:end])
for jbox in lists[start:end]:
tree_plotter.draw_box(jbox, **actual_kwargs)
def draw_same_level_non_well_sep_boxes(tree_plotter, traversal, ibox):
    """Draw box *ibox* in red and, in green, the boxes in its same-level
    non-well-separated list taken from *traversal*.
    """
    tree_plotter.draw_box(ibox, facecolor="red", alpha=0.5)

    # same-level non-well-sep
    starts = traversal.same_level_non_well_sep_boxes_starts
    lists = traversal.same_level_non_well_sep_boxes_lists
    _draw_box_list(tree_plotter, ibox, starts, lists, facecolor="green")
def draw_box_lists(tree_plotter, traversal, ibox):
    """Draw box *ibox* in red together with its interaction lists from
    *traversal*, one color per list:

    * green: near neighbors ("list 1")
    * blue: well-separated siblings (list 2)
    * orange: separated smaller (list 3), per source level; the "close"
      variant is additionally hatched
    * purple: separated bigger (list 4); the "close" variant is
      additionally hatched

    The "close" variants are only drawn if the traversal has them.
    """
    def draw_list(starts, lists, key_to_box, **style):
        _draw_box_list(tree_plotter, ibox, starts, lists,
                       key_to_box=key_to_box, **style)

    tree_plotter.draw_box(ibox, facecolor="red", alpha=0.5)

    # from near neighbors ("list 1")
    draw_list(
        traversal.neighbor_source_boxes_starts,
        traversal.neighbor_source_boxes_lists,
        traversal.target_boxes,
        facecolor="green")

    # from well-separated siblings (list 2)
    draw_list(
        traversal.from_sep_siblings_starts,
        traversal.from_sep_siblings_lists,
        traversal.target_or_target_parent_boxes,
        facecolor="blue")

    # from separated smaller (list 3)
    for ilev in range(tree_plotter.tree.nlevels):
        sep_smaller = traversal.from_sep_smaller_by_level[ilev]
        draw_list(
            sep_smaller.starts,
            sep_smaller.lists,
            traversal.target_boxes_sep_smaller_by_source_level[ilev],
            facecolor="orange")

    # list 3 close
    if traversal.from_sep_close_smaller_starts is not None:
        draw_list(
            traversal.from_sep_close_smaller_starts,
            traversal.from_sep_close_smaller_lists,
            traversal.target_boxes,
            facecolor="orange", hatch=".")

    # from separated bigger (list 4)
    draw_list(
        traversal.from_sep_bigger_starts,
        traversal.from_sep_bigger_lists,
        traversal.target_or_target_parent_boxes,
        facecolor="purple")

    # list 4 close
    if traversal.from_sep_close_bigger_starts is not None:
        draw_list(
            traversal.from_sep_close_bigger_starts,
            traversal.from_sep_close_bigger_lists,
            traversal.target_boxes,
            facecolor="purple", hatch=".")
# }}}
# vim: filetype=pyopencl:fdm=marker
......@@ -2,7 +2,7 @@
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXOPTS = -n
SPHINXBUILD = python $(shell which sphinx-build)
PAPER =
BUILDDIR = _build
......
from __future__ import absolute_import
# -*- coding: utf-8 -*-
#
# boxtree documentation build configuration file.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
from importlib import metadata
from urllib.request import urlopen
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
_conf_url = \
"https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
with urlopen(_conf_url) as _inf:
exec(compile(_inf.read(), _conf_url, "exec"), globals())
# -- General configuration -----------------------------------------------------
copyright = "2013-21, Andreas Kloeckner"
release = metadata.version("boxtree")
version = ".".join(release.split(".")[:2])
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.mathjax',
#'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
autoclass_content = "both"
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'boxtree'
copyright = u'2013, Andreas Kloeckner'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
ver_dic = {}
exec(compile(open("../boxtree/version.py").read(), "../boxtree/version.py", 'exec'), ver_dic)
version = ".".join(str(x) for x in ver_dic["VERSION"])
# The full version, including alpha/beta/rc tags.
release = ver_dic["VERSION_TEXT"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
html_theme = "alabaster"
html_theme_options = {
"extra_nav_links": {
"🚀 Github": "https://github.com/inducer/boxtree",
"💾 Download Releases": "https://pypi.python.org/pypi/boxtree",
}
}
html_sidebars = {
'**': [
'about.html',
'navigation.html',
'relations.html',
'searchbox.html',
]
intersphinx_mapping = {
"arraycontext": ("https://documen.tician.de/arraycontext", None),
"meshmode": ("https://documen.tician.de/meshmode", None),
"numpy": ("https://numpy.org/doc/stable", None),
"pyopencl": ("https://documen.tician.de/pyopencl", None),
"pytential": ("https://documen.tician.de/pytential", None),
"python": ("https://docs.python.org/3", None),
}
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'boxtreedoc'
# -- Options for LaTeX output --------------------------------------------------
# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'boxtree.tex', u'boxtree Documentation',
u'Andreas Kloeckner', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Additional stuff for the LaTeX preamble.
#latex_preamble = ''
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'boxtree', u'boxtree Documentation',
[u'Andreas Kloeckner'], 1)
nitpick_ignore_regex = [
["py:class", r"numpy._?typing._generic_alias.ScalarType"],
]
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
'http://docs.python.org/': None,
'http://docs.scipy.org/doc/numpy/': None,
'http://documen.tician.de/pyopencl': None,
}
# Some modules need to import things just so that sphinx can resolve symbols in
# type annotations. Often, we do not want these imports (e.g. of PyOpenCL) when
# in normal use (because they would introduce unintended side effects or hard
# dependencies). This flag exists so that these imports only occur during doc
# build. Since sphinx appears to resolve type hints lexically (as it should),
# this needs to be cross-module (since, e.g. an inherited arraycontext
# docstring can be read by sphinx when building meshmode, a dependent package),
# this needs a setting of the same name across all packages involved, that's
# why this name is as global-sounding as it is.
sys._BUILDING_SPHINX_DOCS = True
FMM Cost Model
==============
.. automodule:: boxtree.cost