From 2f402fc90f3d3422def2cdda283e03fa500be0b2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 14:53:32 -0500 Subject: [PATCH 01/13] Implement level restriction. --- boxtree/tools.py | 142 +++++-- boxtree/traversal.py | 67 ++-- boxtree/tree_build.py | 725 +++++++++++++++++++++++++++------ boxtree/tree_build_kernels.py | 728 ++++++++++++++++++++++------------ doc/tree.rst | 18 + test/test_tree.py | 88 +++- 6 files changed, 1327 insertions(+), 441 deletions(-) diff --git a/boxtree/tools.py b/boxtree/tools.py index fcc2c08..638c5fa 100644 --- a/boxtree/tools.py +++ b/boxtree/tools.py @@ -44,15 +44,18 @@ def padded_bin(i, l): return s -def realloc_array(ary, new_shape, zero_fill, queue, wait_for): - new_ary = cl.array.empty(queue, shape=new_shape, dtype=ary.dtype, - allocator=ary.allocator) +# NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__() +def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=[]): if zero_fill: - new_ary.fill(0, wait_for=wait_for) - wait_for = new_ary.events + array_maker = cl.array.zeros + else: + array_maker = cl.array.empty + + new_ary = array_maker(queue, shape=new_shape, dtype=ary.dtype, + allocator=allocator) evt = cl.enqueue_copy(queue, new_ary.data, ary.data, byte_count=ary.nbytes, - wait_for=wait_for) + wait_for=wait_for + new_ary.events) return new_ary, evt @@ -341,14 +344,22 @@ GAPPY_COPY_TPL = Template(r"""//CL// typedef ${dtype_to_ctype(dtype)} value_t; - value_t val = input_ary[from_indices[i]]; + %if from_indices: + value_t val = input_ary[from_indices[i]]; + %else: + value_t val = input_ary[i]; + %endif // Optionally, noodle values through a lookup table. %if map_values: val = value_map[val]; %endif - output_ary[i] = val; + %if to_indices: + output_ary[to_indices[i]] = val; + %else: + output_ary[i] = val; + %endif """, strict_undefined=True) @@ -358,14 +369,20 @@ class GappyCopyAndMapKernel: self.context = context @memoize_method - def _get_kernel(self, dtype, src_index_dtype, map_values=False): + def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype, + have_src_indices, have_dst_indices, map_values): from pyopencl.tools import VectorArg args = [ VectorArg(dtype, "input_ary", with_offset=True), VectorArg(dtype, "output_ary", with_offset=True), - VectorArg(src_index_dtype, "from_indices", with_offset=True) - ] + ] + + if have_src_indices: + args.append(VectorArg(src_index_dtype, "from_indices", with_offset=True)) + + if have_dst_indices: + args.append(VectorArg(dst_index_dtype, "to_indices", with_offset=True)) if map_values: args.append(VectorArg(dtype, "value_map", with_offset=True)) @@ -374,34 +391,113 @@ class GappyCopyAndMapKernel: src = GAPPY_COPY_TPL.render( dtype=dtype, dtype_to_ctype=dtype_to_ctype, + from_dtype=src_index_dtype, + to_dtype=dst_index_dtype, + from_indices=have_src_indices, + to_indices=have_dst_indices, map_values=map_values) from pyopencl.elementwise import ElementwiseKernel return ElementwiseKernel(self.context, args, str(src), name="gappy_copy_and_map") - def __call__(self, queue, allocator, new_size, - src_indices, ary, map_values=None, wait_for=None): + # NOTE: Order of positional args should match realloc_array() + def __call__(self, queue, allocator, new_shape, ary, src_indices=None, + dst_indices=None, map_values=None, zero_fill=False, + wait_for=None, range=None, debug=False): """Compresses box info arrays after empty leaf pruning and, optionally, maps old box IDs to new box IDs (if the array being operated on contains box IDs). """ - assert len(ary) >= new_size + have_src_indices = src_indices is not None + have_dst_indices = dst_indices is not None + have_map_values = map_values is not None + + if not (have_src_indices or have_dst_indices): + raise ValueError("must specify at least one of src or dest indices") + + if range is None: + if have_src_indices and have_dst_indices: + raise ValueError( + "must supply range when passing both src and dest indices") + elif have_src_indices: + range = slice(src_indices.shape[0]) + if debug: + assert int(cl.array.max(src_indices).get()) < len(ary) + elif have_dst_indices: + range = slice(dst_indices.shape[0]) + if debug: + assert int(cl.array.max(dst_indices).get()) < new_shape + + if zero_fill: + array_maker = cl.array.zeros + else: + array_maker = cl.array.empty + + result = array_maker(queue, new_shape, ary.dtype, allocator=allocator) + + kernel = self._get_kernel(ary.dtype, + src_indices.dtype if have_src_indices else None, + dst_indices.dtype if have_dst_indices else None, + have_src_indices, + have_dst_indices, + have_map_values) + + args = (ary, result) + args += (src_indices,) if have_src_indices else () + args += (dst_indices,) if have_dst_indices else () + args += (map_values,) if have_map_values else () + + evt = kernel(*args, queue=queue, range=range, wait_for=wait_for) - result = cl.array.empty(queue, new_size, ary.dtype, allocator=allocator) + return result, evt - kernel = self._get_kernel(ary.dtype, src_indices.dtype, - # map_values: - map_values is not None) +# }}} - args = (ary, result, src_indices) - if map_values is not None: - args += (map_values,) - evt = kernel(*args, queue=queue, range=slice(new_size), wait_for=wait_for) +# {{{ map values through table - return result, evt +from pyopencl.elementwise import ElementwiseTemplate + + +MAP_VALUES_TPL = ElementwiseTemplate( + arguments="""//CL// + dst_value_t *dst, + src_value_t *src, + dst_value_t *map_values + """, + operation=r"""//CL// + dst[i] = map_values[src[i]]; + """, + name="map_values") + + +class MapValuesKernel(object): + + def __init__(self, context): + self.context = context + + @memoize_method + def _get_kernel(self, dst_dtype, src_dtype): + type_aliases = ( + ("src_value_t", src_dtype), + ("dst_value_t", dst_dtype) + ) + + return MAP_VALUES_TPL.build(self.context, type_aliases) + + def __call__(self, map_values, src, dst=None): + """ + Map the entries of the array `src` through the table `map_values`. + """ + if dst is None: + dst = src + + kernel = self._get_kernel(dst.dtype, src.dtype) + evt = kernel(dst, src, map_values) + + return dst, evt # }}} diff --git a/boxtree/traversal.py b/boxtree/traversal.py index 593e2c8..98cd590 100644 --- a/boxtree/traversal.py +++ b/boxtree/traversal.py @@ -36,36 +36,7 @@ logger = logging.getLogger(__name__) # {{{ preamble -TRAVERSAL_PREAMBLE_TEMPLATE = r"""//CL// -${box_flags_enum.get_c_defines()} -${box_flags_enum.get_c_typedef()} - -typedef ${dtype_to_ctype(box_id_dtype)} box_id_t; -%if particle_id_dtype is not None: - typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t; -%endif -typedef ${dtype_to_ctype(coord_dtype)} coord_t; -typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t; - -#define NLEVELS ${max_levels} -#define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor}) - -<%def name="load_center(name, box_id)"> - coord_vec_t ${name}; - %for i in range(dimensions): - ${name}.${AXIS_NAMES[i]} = box_centers[aligned_nboxes * ${i} + ${box_id}]; - %endfor - - -#define LEVEL_TO_RAD(level) \ - (root_extent * 1 / (coord_t) (1 << (level + 1))) - -%if 0: - #define dbg_printf(ARGS) printf ARGS -%else: - #define dbg_printf(ARGS) /* */ -%endif - +TRAVERSAL_PREAMBLE_MAKO_DEFS = r"""//CL:mako// <%def name="walk_init(start_box_id)"> box_id_t box_stack[NLEVELS]; int morton_nr_stack[NLEVELS]; @@ -127,6 +98,13 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t; walk_morton_nr = 0; +<%def name="load_center(name, box_id)"> + coord_vec_t ${name}; + %for i in range(dimensions): + ${name}.${AXIS_NAMES[i]} = box_centers[aligned_nboxes * ${i} + ${box_id}]; + %endfor + + <%def name="check_l_infty_ball_overlap( is_overlapping, box_id, ball_radius, ball_center)"> { @@ -144,9 +122,38 @@ typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t; ${is_overlapping} = max_dist <= size_sum; } +""" + + +TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES = r"""//CL// +${box_flags_enum.get_c_defines()} +${box_flags_enum.get_c_typedef()} + +typedef ${dtype_to_ctype(box_id_dtype)} box_id_t; +%if particle_id_dtype is not None: + typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t; +%endif +typedef ${dtype_to_ctype(coord_dtype)} coord_t; +typedef ${dtype_to_ctype(vec_types[coord_dtype, dimensions])} coord_vec_t; + +#define NLEVELS ${max_levels} +#define STICK_OUT_FACTOR ((coord_t) ${stick_out_factor}) + +#define LEVEL_TO_RAD(level) \ + (root_extent * 1 / (coord_t) (1 << (level + 1))) +%if 0: + #define dbg_printf(ARGS) printf ARGS +%else: + #define dbg_printf(ARGS) /* */ +%endif """ + +TRAVERSAL_PREAMBLE_TEMPLATE = ( + TRAVERSAL_PREAMBLE_MAKO_DEFS + + TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES) + # }}} # {{{ adjacency test diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index f84c9ed..ab0fd86 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -50,8 +50,9 @@ class TreeBuilder(object): # This is used to map box IDs and compress box lists in empty leaf # pruning. - from boxtree.tools import GappyCopyAndMapKernel + from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel self.gappy_copy_and_map = GappyCopyAndMapKernel(self.context) + self.map_values_kernel = MapValuesKernel(self.context) morton_nr_dtype = np.dtype(np.int8) box_level_dtype = np.dtype(np.uint8) @@ -60,26 +61,34 @@ class TreeBuilder(object): def get_kernel_info(self, dimensions, coord_dtype, particle_id_dtype, box_id_dtype, sources_are_targets, srcntgts_have_extent, - stick_out_factor, adaptive): + stick_out_factor, kind): from boxtree.tree_build_kernels import get_tree_build_kernel_info return get_tree_build_kernel_info(self.context, dimensions, coord_dtype, particle_id_dtype, box_id_dtype, sources_are_targets, srcntgts_have_extent, stick_out_factor, self.morton_nr_dtype, self.box_level_dtype, - adaptive=adaptive) + kind=kind) # {{{ run control - def __call__(self, queue, particles, max_particles_in_box=None, - allocator=None, debug=False, targets=None, - source_radii=None, target_radii=None, stick_out_factor=0.25, - refine_weights=None, max_leaf_refine_weight=None, - wait_for=None, non_adaptive=False, - **kwargs): + def __call__(self, queue, particles, kind="adaptive", + max_particles_in_box=None, allocator=None, debug=False, + targets=None, source_radii=None, target_radii=None, + stick_out_factor=0.25, refine_weights=None, + max_leaf_refine_weight=None, wait_for=None, **kwargs): """ :arg queue: a :class:`pyopencl.CommandQueue` instance :arg particles: an object array of (XYZ) point coordinate arrays. + :arg kind: One of the following strings: + + - 'adaptive' + - 'adaptive-level-restricted' + - 'non-adaptive' + + 'adaptive' requests an adaptive tree without level restriction. See + :ref:`tree-kinds` for further explanation. + :arg targets: an object array of (XYZ) point coordinate arrays or ``None``. If ``None``, *particles* act as targets, too. Must have the same (inner) dtype as *particles*. @@ -104,9 +113,6 @@ class TreeBuilder(object): :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` instances for whose completion this command waits before starting execution. - :arg non_adaptive: If *True*, return a tree in which all leaf boxes are - on the same (last) level. The tree is pruned, in the sense that empty - boxes have been eliminated. :arg kwargs: Used internally for debugging. :returns: a tuple ``(tree, event)``, where *tree* is an instance of @@ -116,6 +122,9 @@ class TreeBuilder(object): # {{{ input processing + if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]: + raise ValueError("unknown tree kind \"{0}\"".format(kind)) + # we'll modify this below, so copy it if wait_for is None: wait_for = [] @@ -169,15 +178,14 @@ class TreeBuilder(object): empty = partial(cl.array.empty, queue, allocator=allocator) def zeros(shape, dtype): - result = (cl.array.empty(queue, shape, dtype, allocator=allocator) - .fill(0, wait_for=wait_for)) + result = cl.array.zeros(queue, shape, dtype, allocator=allocator) event, = result.events return result, event knl_info = self.get_kernel_info(dimensions, coord_dtype, particle_id_dtype, box_id_dtype, sources_are_targets, srcntgts_have_extent, - stick_out_factor, adaptive=not non_adaptive) + stick_out_factor, kind=kind) # {{{ combine sources and targets into one array, if necessary @@ -315,8 +323,6 @@ class TreeBuilder(object): # }}} - from pytools import div_ceil - # {{{ allocate data logger.debug("allocating memory") @@ -337,16 +343,6 @@ class TreeBuilder(object): prep_events.append(evt) srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype) prep_events.append(evt) - split_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype) - prep_events.append(evt) - - # number of boxes total, and a guess - nboxes_dev = empty((), dtype=box_id_dtype) - nboxes_dev.fill(1) - - # /!\ If you're allocating an array here that depends on nboxes_guess, - # you *must* also write reallocation code down below for the case when - # nboxes_guess was too low. # Outside nboxes_guess feeding is solely for debugging purposes, # to test the reallocation code. @@ -358,9 +354,26 @@ class TreeBuilder(object): assert nboxes_guess > 0 + # /!\ IMPORTANT + # + # If you're allocating an array here that depends on nboxes_guess, or if + # your array contains box numbers, you have to write code for the + # following down below as well: + # + # * You *must* write reallocation code to handle box renumbering and + # reallocation triggered at the top of the level loop. + # + # * If your array persists after the level loop, you *must* write code + # to handle box renumbering and reallocation triggered by the box + # pruning step. + + split_box_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype) + prep_events.append(evt) + # per-box morton bin counts - box_morton_bin_counts = empty(nboxes_guess, - dtype=knl_info.morton_bin_count_dtype) + box_morton_bin_counts, evt = zeros(nboxes_guess, + dtype=knl_info.morton_bin_count_dtype) + prep_events.append(evt) # particle# at which each box starts box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype) @@ -370,9 +383,20 @@ class TreeBuilder(object): box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype) prep_events.append(evt) - # morton nr identifier {quadr,oct}ant of parent in which this box was created - box_morton_nrs, evt = zeros(nboxes_guess, dtype=self.morton_nr_dtype) - prep_events.append(evt) + # pointer to child box, by morton number + box_child_ids, evts = zip( + *(zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions))) + prep_events.extend(evts) + + # box centers, by dimension + box_centers, evts = zip( + *(zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions))) + prep_events.extend(evts) + + # Initialize box_centers[0] to contain the root box's center + for d, (ax, evt) in enumerate(zip(axis_names, evts)): + center_ax = bbox["min_"+ax] + (bbox["max_"+ax] - bbox["min_"+ax]) / 2 + box_centers[d][0].fill(center_ax, wait_for=[evt]) # box -> level map box_levels, evt = zeros(nboxes_guess, self.box_level_dtype) @@ -387,15 +411,30 @@ class TreeBuilder(object): box_srcntgt_counts_cumul[0].fill( nsrcntgts, queue=queue, wait_for=[evt]) - # box -> whether the box has a child + # box -> whether the box has a child. FIXME: use bools box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) prep_events.append(evt) + # box -> whether the box needs a splitting to enforce level restriction. + # FIXME: use bools + force_split_box, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) + prep_events.append(evt) + # set parent of root box to itself evt = cl.enqueue_copy( queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype)) prep_events.append(evt) + nlevels_max = np.iinfo(self.box_level_dtype).max + + # level -> starting box on level + level_start_box_nrs_dev, evt = zeros(nlevels_max, dtype=box_id_dtype) + prep_events.append(evt) + + # level -> number of used boxes on level + level_used_box_counts_dev, evt = zeros(nlevels_max, dtype=box_id_dtype) + prep_events.append(evt) + # }}} def fin_debug(s): @@ -408,13 +447,34 @@ class TreeBuilder(object): have_oversize_split_box, evt = zeros((), np.int32) prep_events.append(evt) + have_upper_level_split_box, evt = zeros((), np.int32) + prep_events.append(evt) + wait_for = prep_events + from pytools import div_ceil + # {{{ level loop # Level 0 starts at 0 and always contains box 0 and nothing else. # Level 1 therefore starts at 1. level_start_box_nrs = [0, 1] + level_start_box_nrs_dev[0].fill(0) + level_start_box_nrs_dev[1].fill(1) + wait_for.extend(level_start_box_nrs_dev.events) + + # This counts the number of boxes that have been used per level. Note + # that this could be fewer than the actual number of boxes allocated to + # the level (in the case of building a level restricted tree, more boxes + # are pre-allocated for a level than used since we may decide to split + # parent level boxes later). + level_used_box_counts = [1] + level_used_box_counts_dev[0].fill(1) + wait_for.extend(level_used_box_counts_dev.events) + + # level -> number of leaf boxes on level. Initially the root node is a + # leaf. + level_leaf_counts = np.array([1]) from time import time start_time = time() @@ -428,6 +488,8 @@ class TreeBuilder(object): # - level is the level being built. # - the last entry of level_start_box_nrs is the beginning of the level # to be built + # - the last entry of level_used_box_counts is the number of boxes that + # are used (not just allocated) at the previous level # This while condition prevents entering the loop in case there's just a # single box, by how 'level' is set above. Read this as 'while True' with @@ -435,22 +497,31 @@ class TreeBuilder(object): logger.debug("entering level loop with %s srcntgts" % nsrcntgts) + # When doing level restriction, the level loop may need to be entered + # one more time after creating all the levels (see fixme note below + # regarding this). This flag is set to True when that happens. + final_level_restrict_iteration = False + while level: if debug: # More invariants: assert level == len(level_start_box_nrs) - 1 + assert level == len(level_used_box_counts) + assert level == len(level_leaf_counts) if level > np.iinfo(self.box_level_dtype).max: raise RuntimeError("level count exceeded maximum") + cl.wait_for_events(wait_for) + common_args = ((morton_bin_counts, morton_nrs, - box_start_flags, srcntgt_box_ids, split_box_ids, + box_start_flags, + srcntgt_box_ids, split_box_ids, box_morton_bin_counts, refine_weights, max_leaf_refine_weight, box_srcntgt_starts, box_srcntgt_counts_cumul, - box_parent_ids, box_morton_nrs, - nboxes_dev, + box_parent_ids, box_levels, level, bbox, user_srcntgt_ids) + tuple(srcntgts) @@ -459,7 +530,7 @@ class TreeBuilder(object): fin_debug("morton count scan") - # writes: box_morton_bin_counts, morton_nrs + # writes: box_morton_bin_counts evt = knl_info.morton_count_scan( *common_args, queue=queue, size=nsrcntgts, wait_for=wait_for) @@ -467,77 +538,254 @@ class TreeBuilder(object): fin_debug("split box id scan") - # writes: nboxes_dev, split_box_ids + # writes: box_has_children, split_box_ids evt = knl_info.split_box_id_scan( srcntgt_box_ids, - box_srcntgt_starts, box_srcntgt_counts_cumul, box_morton_bin_counts, refine_weights, max_leaf_refine_weight, box_levels, + level_start_box_nrs_dev, + level_used_box_counts_dev, + force_split_box, level, - # input/output: - nboxes_dev, - # output: box_has_children, split_box_ids, - queue=queue, size=nsrcntgts, wait_for=wait_for) + have_oversize_split_box, + + queue=queue, + size=level_start_box_nrs[level], + wait_for=wait_for) wait_for = [evt] - nboxes_new = int(nboxes_dev.get()) + # {{{ compute new level_used_box_counts, level_leaf_counts + + # The last split_box_id on each level tells us how many boxes are + # needed at the next level. + new_level_used_box_counts = [1] + for level_start_box_id in level_start_box_nrs[1:]: + last_box_on_prev_level = level_start_box_id - 1 + new_level_used_box_counts.append( + # FIXME: Get this all at once. + int(split_box_ids[last_box_on_prev_level].get()) + - level_start_box_id) + + new_level_leaf_counts = np.empty(level + 1, dtype=int) + # New leaf count = + # old leaf count + # + nr. new boxes from splitting parent's leaves + # - nr. new boxes from splitting current level's leaves / 2**d + level_used_box_counts_diff = (new_level_used_box_counts + - np.append(level_used_box_counts, [0])) + new_level_leaf_counts[:-1] = (level_leaf_counts + + level_used_box_counts_diff[:-1] + - level_used_box_counts_diff[1:] // 2 ** dimensions) + new_level_leaf_counts[-1] = level_used_box_counts_diff[-1] + del level_used_box_counts_diff + + # }}} # Assumption: Everything between here and the top of the loop must # be repeatable, so that in an out-of-memory situation, we can just # rerun this bit of the code after reallocating and a minimal reset # procedure. - # {{{ reallocate and retry if nboxes_guess was too small + # The algorithm for deciding on level sizes is as follows: + # 1. Compute the minimal necessary size of each level. + # 2. If level restricting, add padding to the lower level. + # 3. Check if there is enough existing space for each level. + # 4. If any does not have space, reallocate all levels: + # 4a. Compute new sizes of upper levels + # 4b. If level restricting, add padding to all levels. + + curr_upper_level_lengths = np.diff(level_start_box_nrs) + minimal_upper_level_lengths = np.max( + [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0) + minimal_new_level_length = new_level_used_box_counts[-1] + + # Allocate extra space at the end of the current level for higher + # level leaves that may be split later. + # + # If there are no further levels to split (i.e. + # have_oversize_split_box = 0), then we do not need to allocate any + # extra space, since no new leaves can be created at the bottom + # level. + if knl_info.level_restrict and have_oversize_split_box.get(): + # Currently undocumented. + lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1) + minimal_new_level_length += sum( + 2**(l*dimensions) * new_level_leaf_counts[level - l] + for l in range(1, 1 + min(level, lr_lookbehind_levels))) + + nboxes_minimal = \ + sum(minimal_upper_level_lengths) + minimal_new_level_length + + needs_renumbering = \ + (curr_upper_level_lengths < minimal_upper_level_lengths).any() + + # {{{ prepare for reallocation/renumbering + + if (knl_info.level_restrict and (nboxes_guess < nboxes_minimal)) \ + or needs_renumbering: + # Represents the amount of padding needed for upper levels. + upper_level_padding = np.zeros(level, dtype=int) + + # Recompute the level padding. + for ulevel in range(level): + upper_level_padding[ulevel] = sum( + 2**(l*dimensions) * new_level_leaf_counts[ulevel - l] + for l in range( + 1, 1 + min(ulevel, lr_lookbehind_levels))) + + new_upper_level_unused_box_counts = np.max( + [upper_level_padding, + minimal_upper_level_lengths - new_level_used_box_counts[:-1]], + axis=0) + + new_level_start_box_nrs = np.empty(level + 1, dtype=int) + new_level_start_box_nrs[0] = 0 + new_level_start_box_nrs[1:] = np.cumsum( + new_level_used_box_counts[:-1] + + new_upper_level_unused_box_counts) + + old_box_count = level_start_box_nrs[-1] + # Where should I put this box? + dst_box_id = cl.array.empty(queue, + shape=old_box_count, dtype=box_id_dtype) + + for level_start, new_level_start, level_len in zip( + level_start_box_nrs, new_level_start_box_nrs, + curr_upper_level_lengths): + dst_box_id[level_start:level_start + level_len] = \ + cl.array.arange(queue, + new_level_start, + new_level_start + level_len, + dtype=box_id_dtype) + + wait_for.extend(dst_box_id.events) + + realloc_array = partial(self.gappy_copy_and_map, + dst_indices=dst_box_id, range=slice(old_box_count)) + realloc_and_renumber_array = partial(self.gappy_copy_and_map, + dst_indices=dst_box_id, map_values=dst_box_id, + range=slice(old_box_count)) + renumber_array = partial(self.map_values_kernel, dst_box_id) + + # Update level_start_box_nrs. This will be the + # level_start_box_nrs for the reallocated data. + + level_start_box_nrs = list(new_level_start_box_nrs) + level_start_box_nrs_dev[:level + 1] = \ + np.array(new_level_start_box_nrs, dtype=box_id_dtype) + level_start_box_nrs_updated = True + wait_for.extend(level_start_box_nrs_dev.events) + + nboxes_new = ( + level_start_box_nrs[-1] + + minimal_new_level_length) + + del new_level_start_box_nrs + else: + from boxtree.tools import realloc_array + realloc_and_renumber_array = realloc_array + renumber_array = None + level_start_box_nrs_updated = False + nboxes_new = nboxes_minimal - if nboxes_new > nboxes_guess: + del nboxes_minimal + + # }}} + + # {{{ reallocate and/or renumber boxes if necessary + + if level_start_box_nrs_updated or nboxes_new > nboxes_guess: fin_debug("starting nboxes_guess increase") while nboxes_guess < nboxes_new: nboxes_guess *= 2 - from boxtree.tools import realloc_array - my_realloc = partial(realloc_array, new_shape=nboxes_guess, - zero_fill=False, queue=queue, wait_for=wait_for) - my_realloc_zeros = partial(realloc_array, new_shape=nboxes_guess, - zero_fill=True, queue=queue, wait_for=wait_for) + def my_realloc_nocopy(ary): + return cl.array.empty(queue, allocator=allocator, + shape=nboxes_guess, dtype=ary.dtype) + + def my_realloc_zeros_nocopy(ary): + result = cl.array.zeros(queue, allocator=allocator, + shape=nboxes_guess, dtype=ary.dtype) + return result, result.events[0] + + my_realloc = partial(realloc_array, + queue, allocator, nboxes_guess, wait_for=wait_for) + my_realloc_zeros = partial(realloc_array, + queue, allocator, nboxes_guess, zero_fill=True, + wait_for=wait_for) + my_realloc_zeros_and_renumber = partial(realloc_and_renumber_array, + queue, allocator, nboxes_guess, zero_fill=True, + wait_for=wait_for) resize_events = [] - box_morton_bin_counts, evt = my_realloc(box_morton_bin_counts) - resize_events.append(evt) - box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts) - resize_events.append(evt) - box_parent_ids, evt = my_realloc_zeros(box_parent_ids) + split_box_ids = my_realloc_nocopy(split_box_ids) + + # FIXME: This can be reused across iterations, saving the cost + # of the particle scan. Right now there isn't a reallocator + # written for it. + box_morton_bin_counts, evt = my_realloc_zeros_nocopy( + box_morton_bin_counts) resize_events.append(evt) - box_morton_nrs, evt = my_realloc_zeros(box_morton_nrs) + + force_split_box, evt = my_realloc_zeros(force_split_box) resize_events.append(evt) - box_levels, evt = my_realloc_zeros(box_levels) + + box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts) resize_events.append(evt) + box_srcntgt_counts_cumul, evt = \ my_realloc_zeros(box_srcntgt_counts_cumul) resize_events.append(evt) + box_has_children, evt = my_realloc_zeros(box_has_children) resize_events.append(evt) - del my_realloc - del my_realloc_zeros + box_centers, evts = zip( + *(my_realloc(ary) for ary in box_centers)) + resize_events.extend(evts) - # reset nboxes_dev to previous value - nboxes_dev.fill(level_start_box_nrs[-1]) + box_child_ids, evts = zip( + *(my_realloc_zeros_and_renumber(ary) + for ary in box_child_ids)) + resize_events.extend(evts) + + box_parent_ids, evt = my_realloc_zeros_and_renumber(box_parent_ids) resize_events.append(evt) - wait_for = resize_events + if not level_start_box_nrs_updated: + box_levels, evt = my_realloc(box_levels) + resize_events.append(evt) + else: + box_levels, evt = my_realloc_zeros_nocopy(box_levels) + cl.wait_for_events([evt]) + for box_level, (level_start, level_end) in enumerate(zip( + level_start_box_nrs, level_start_box_nrs[1:])): + box_levels[level_start:level_end].fill(box_level) + resize_events.extend(box_levels.events) + + if level_start_box_nrs_updated: + srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids) + resize_events.append(evt) + + del my_realloc + del my_realloc_zeros + del my_realloc_zeros_and_renumber + del my_realloc_nocopy + del my_realloc_zeros_nocopy + del renumber_array # retry logger.info("nboxes_guess exceeded: " - "enlarged allocations, restarting level") + "enlarged allocations, restarting level") continue @@ -545,60 +793,219 @@ class TreeBuilder(object): logger.info("LEVEL %d -> %d boxes" % (level, nboxes_new)) - assert level_start_box_nrs[-1] != nboxes_new or srcntgts_have_extent + assert ( + level_start_box_nrs[-1] != nboxes_new or + srcntgts_have_extent or + final_level_restrict_iteration) if level_start_box_nrs[-1] == nboxes_new: - # We haven't created new boxes in this level loop trip. Unless - # srcntgts have extent, this should never happen. (I.e., we - # should've never entered this loop trip.) + # We haven't created new boxes in this level loop trip. # # If srcntgts have extent, this can happen if boxes were # in-principle overfull, but couldn't subdivide because of # extent restrictions. + if srcntgts_have_extent and not final_level_restrict_iteration: + level -= 1 + break + assert final_level_restrict_iteration - assert srcntgts_have_extent + # {{{ update level_start_box_nrs, level_used_box_counts - level -= 1 + level_start_box_nrs.append(nboxes_new) + level_start_box_nrs_dev[level + 1].fill(nboxes_new) + wait_for.extend(level_start_box_nrs_dev.events) - logger.debug("no new boxes created this loop trip") - break + level_used_box_counts = new_level_used_box_counts + level_used_box_counts_dev[:level + 1] = \ + np.array(level_used_box_counts, dtype=box_id_dtype) + wait_for.extend(level_used_box_counts_dev.events) - level_start_box_nrs.append(nboxes_new) + level_leaf_counts = new_level_leaf_counts + if debug: + for level_start, level_nboxes, leaf_count in zip( + level_start_box_nrs, + level_used_box_counts, + level_leaf_counts): + if level_nboxes == 0: + assert leaf_count == 0 + continue + nleaves_actual = level_nboxes - int( + cl.array.sum(box_has_children[ + level_start:level_start + level_nboxes]).get()) + assert leaf_count == nleaves_actual + + del new_level_leaf_counts + + # }}} + + # XXX del nboxes_new + del new_level_used_box_counts - new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids) - new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids) - split_and_sort_args = ( - common_args - + (new_user_srcntgt_ids, have_oversize_split_box, - new_srcntgt_box_ids, box_levels, - box_has_children)) - fin_debug("split and sort") - - evt = knl_info.split_and_sort_kernel(*split_and_sort_args, + # {{{ split boxes + + box_splitter_args = ( + common_args + + (box_has_children, force_split_box, root_extent) + + box_child_ids + + box_centers) + + evt = knl_info.box_splitter_kernel(*box_splitter_args, + range=range(level_start_box_nrs[-1]), wait_for=wait_for) + wait_for = [evt] + fin_debug("box splitter") + + # Mark the levels of boxes added for padding (these were not updated + # by the box splitter kernel). + last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1] + box_levels[last_used_box:level_start_box_nrs[-1]].fill(level) + + wait_for.extend(box_levels.events) + if debug: + box_levels.finish() level_bl_chunk = box_levels.get()[ level_start_box_nrs[-2]:level_start_box_nrs[-1]] - assert ((level_bl_chunk == level) | (level_bl_chunk == 0)).all() + assert (level_bl_chunk == level).all() del level_bl_chunk if debug: assert (box_srcntgt_starts.get() < nsrcntgts).all() + # }}} + + # {{{ renumber particles within split boxes + + new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids) + new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids) + + particle_renumberer_args = ( + common_args + + (box_has_children, force_split_box, + new_user_srcntgt_ids, new_srcntgt_box_ids)) + + evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args, + range=range(nsrcntgts), wait_for=wait_for) + + wait_for = [evt] + + fin_debug("particle renumbering") + user_srcntgt_ids = new_user_srcntgt_ids del new_user_srcntgt_ids srcntgt_box_ids = new_srcntgt_box_ids del new_srcntgt_box_ids + # }}} + + # {{{ enforce level restriction on upper levels + + if final_level_restrict_iteration: + # Roll back level update. + # + # FIXME: The extra iteration at the end to split boxes should + # not be necessary. Instead, all the work for the final box + # split should be done in the last iteration of the level + # loop. Currently the main issue that forces the extra iteration + # to be there is the need to use the box renumbering and + # reallocation code. In order to fix this issue, the box + # numbering and reallocation code needs to be accessible after + # the final level restriction is done. + assert int(have_oversize_split_box.get()) == 0 + assert level_used_box_counts[-1] == 0 + del level_used_box_counts[-1] + del level_start_box_nrs[-1] + level -= 1 + break + + if knl_info.level_restrict: + # Avoid generating too many kernels. + LEVEL_STEP = 10 + if level % LEVEL_STEP == 1: + level_restrict_kernel = knl_info.level_restrict_kernel_builder( + LEVEL_STEP * div_ceil(level, LEVEL_STEP)) + + # Upward pass - check if leaf boxes at higher levels need + # further splitting. + force_split_box.fill(0) + wait_for.extend(force_split_box.events) + + did_upper_level_split = False + + if debug: + boxes_split = [] + + for upper_level, upper_level_start, upper_level_box_count in zip( + # After the new level has been created, only only boxes + # at (level - 2) or above need to be rechecked for + # splitting. + range(level - 2, 0, -1), + # At this point, the last entry in level_start_box_nrs + # already refers to (level + 1). + level_start_box_nrs[-4::-1], + level_used_box_counts[-3::-1]): + + upper_level_slice = slice( + upper_level_start, upper_level_start + upper_level_box_count) + + have_upper_level_split_box.fill(0) + wait_for.extend(have_upper_level_split_box.events) + + # writes: force_split_box, have_upper_level_split_box + evt = level_restrict_kernel( + upper_level, + root_extent, + box_has_children, + force_split_box, + have_upper_level_split_box, + *(box_child_ids + box_centers), + slice=upper_level_slice, + wait_for=wait_for) + + wait_for = [evt] + + if debug: + force_split_box.finish() + boxes_split.append(int(cl.array.sum( + force_split_box[upper_level_slice]).get())) + + if int(have_upper_level_split_box.get()) == 0: + break + + did_upper_level_split = True + + if debug: + total_boxes_split = sum(boxes_split) + logger.debug("level restriction: {total_boxes_split} boxes split" + .format(total_boxes_split=total_boxes_split)) + from itertools import count + for level_, nboxes_split in zip( + count(level - 2, step=-1), boxes_split[:-1]): + logger.debug("level {level}: {nboxes_split} boxes split" + .format(level=level_, nboxes_split=nboxes_split)) + del boxes_split + + if int(have_oversize_split_box.get()) == 0 and did_upper_level_split: + # We are in the situation where there are boxes left to + # split on upper levels, and the level loop is done creating + # lower levels. + # + # We re-run the level loop one more time to finish creating + # the upper level boxes. + final_level_restrict_iteration = True + level += 1 + continue + + # }}} + if not int(have_oversize_split_box.get()): - logger.debug("no overfull boxes left") + logger.debug("no boxes left to split") break level += 1 - have_oversize_split_box.fill(0) end_time = time() @@ -608,7 +1015,7 @@ class TreeBuilder(object): elapsed, elapsed/(npasses*nsrcntgts))) del npasses - nboxes = int(nboxes_dev.get()) + nboxes = level_start_box_nrs[-1] # }}} @@ -644,36 +1051,70 @@ class TreeBuilder(object): del morton_nrs del box_morton_bin_counts - # {{{ prune empty leaf boxes + # {{{ prune empty/unused leaf boxes - is_pruned = not kwargs.get("skip_prune") - if is_pruned: + prune_empty_leaves = not kwargs.get("skip_prune") + if prune_empty_leaves: # What is the original index of this box? - from_box_id = empty(nboxes, box_id_dtype) + src_box_id = empty(nboxes, box_id_dtype) # Where should I put this box? - to_box_id = empty(nboxes, box_id_dtype) + # + # Initialize to all zeros, because pruned boxes should be mapped to + # zero (e.g. when pruning child_box_ids). + dst_box_id, evt = zeros(nboxes, box_id_dtype) + wait_for.append(evt) fin_debug("find prune indices") nboxes_post_prune_dev = empty((), dtype=box_id_dtype) evt = knl_info.find_prune_indices_kernel( box_srcntgt_counts_cumul, - to_box_id, from_box_id, nboxes_post_prune_dev, + src_box_id, dst_box_id, nboxes_post_prune_dev, size=nboxes, wait_for=wait_for) wait_for = [evt] - - fin_debug("prune copy") - nboxes_post_prune = int(nboxes_post_prune_dev.get()) + logger.info("{} empty leaves and/or unused boxes, {} boxes after pruning" + .format(nboxes - nboxes_post_prune, nboxes_post_prune)) + should_prune = True + elif knl_info.level_restrict: + # Remove unused boxes from the tree. + src_box_id = empty(nboxes, box_id_dtype) + dst_box_id = empty(nboxes, box_id_dtype) + + new_level_start_box_nrs = np.empty_like(level_start_box_nrs) + new_level_start_box_nrs[0] = 0 + new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts) + for level_start, new_level_start, level_used_box_count in zip( + level_start_box_nrs, new_level_start_box_nrs, + level_used_box_counts): + def make_slice(start): + return slice(start, start + level_used_box_count) + + def make_arange(start): + return cl.array.arange(queue, start, + start + level_used_box_count, dtype=box_id_dtype) + + src_box_id[make_slice(new_level_start)] = make_arange(level_start) + dst_box_id[make_slice(level_start)] = make_arange(new_level_start) + wait_for.extend(src_box_id.events + dst_box_id.events) + + nboxes_post_prune = new_level_start_box_nrs[-1] + + logger.info("{} unused boxes, {} boxes after pruning" + .format(nboxes - nboxes_post_prune, nboxes_post_prune)) + should_prune = True + else: + should_prune = False - logger.info("%d empty leaves" % (nboxes-nboxes_post_prune)) - + if should_prune: prune_events = [] prune_empty = partial(self.gappy_copy_and_map, - queue, allocator, nboxes_post_prune, from_box_id) + queue, allocator, nboxes_post_prune, + src_indices=src_box_id, + range=slice(nboxes_post_prune)) box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts) prune_events.append(evt) @@ -681,17 +1122,19 @@ class TreeBuilder(object): box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul) prune_events.append(evt) - if debug: + if debug and prune_empty_leaves: assert (box_srcntgt_counts_cumul.get() > 0).all() - srcntgt_box_ids = cl.array.take(to_box_id, srcntgt_box_ids) - - box_parent_ids, evt = prune_empty(box_parent_ids, map_values=to_box_id) + srcntgt_box_ids, evt = self.map_values_kernel( + dst_box_id, srcntgt_box_ids) prune_events.append(evt) - box_morton_nrs, evt = prune_empty(box_morton_nrs) + + box_parent_ids, evt = prune_empty(box_parent_ids, map_values=dst_box_id) prune_events.append(evt) + box_levels, evt = prune_empty(box_levels) prune_events.append(evt) + if srcntgts_have_extent: box_srcntgt_counts_nonchild, evt = prune_empty( box_srcntgt_counts_nonchild) @@ -700,12 +1143,31 @@ class TreeBuilder(object): box_has_children, evt = prune_empty(box_has_children) prune_events.append(evt) - # Remap level_start_box_nrs to new box IDs. - # FIXME: It would be better to do this on the device. - level_start_box_nrs = list( - to_box_id.get() - [np.array(level_start_box_nrs[:-1], box_id_dtype)]) - level_start_box_nrs = level_start_box_nrs + [nboxes_post_prune] + box_child_ids, evts = zip( + *(prune_empty(ary, map_values=dst_box_id) + for ary in box_child_ids)) + prune_events.extend(evts) + + box_centers, evts = zip( + *(prune_empty(ary) for ary in box_centers)) + prune_events.extend(evts) + + # Update box counts and level start box indices. + box_levels.finish() + + evt = knl_info.find_level_box_counts_kernel( + box_levels, level_used_box_counts_dev) + cl.wait_for_events([evt]) + + nlevels = len(level_used_box_counts) + level_used_box_counts = level_used_box_counts_dev[:nlevels].get() + + level_start_box_nrs = [0] + level_start_box_nrs.extend(np.cumsum(level_used_box_counts)) + + level_start_box_nrs_dev[:nlevels + 1] = np.array( + level_start_box_nrs, dtype=box_id_dtype) + prune_events.extend(level_start_box_nrs_dev.events) wait_for = prune_events else: @@ -883,22 +1345,44 @@ class TreeBuilder(object): del srcntgts nlevels = len(level_start_box_nrs) - 1 + + assert nlevels == len(level_used_box_counts) assert level + 1 == nlevels, (level+1, nlevels) if debug: max_level = np.max(box_levels.get()) - assert max_level + 1 == nlevels - # {{{ compute box info + # {{{ gather box child ids, box centers # A number of arrays below are nominally 2-dimensional and stored with # the box index as the fastest-moving index. To make sure that accesses # remain aligned, we round up the number of boxes used for indexing. aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32 - box_child_ids, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype) + box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype) wait_for.append(evt) - box_centers = empty((dimensions, aligned_nboxes), coord_dtype) + box_centers_new = empty((dimensions, aligned_nboxes), coord_dtype) + + for mnr, child_row in enumerate(box_child_ids): + box_child_ids_new[mnr, :nboxes_post_prune] = \ + child_row[:nboxes_post_prune] + wait_for.extend(box_child_ids_new.events) + + for dim, center_row in enumerate(box_centers): + box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune] + wait_for.extend(box_centers_new.events) + + cl.wait_for_events(wait_for) + + box_centers = box_centers_new + box_child_ids = box_child_ids_new + + del box_centers_new + del box_child_ids_new + + # }}} + + # {{{ compute box flags from boxtree.tree import box_flags_enum box_flags = empty(nboxes_post_prune, box_flags_enum.dtype) @@ -910,6 +1394,7 @@ class TreeBuilder(object): # the cumulative counts and setting them to zero for non-leaves. # {{{ make sure box_{source,target}_counts_nonchild are not defined + # (before we overwrite them) try: @@ -943,9 +1428,7 @@ class TreeBuilder(object): evt = knl_info.box_info_kernel( *( # input: - box_parent_ids, box_morton_nrs, bbox, aligned_nboxes, - - box_srcntgt_counts_cumul, + box_parent_ids, box_srcntgt_counts_cumul, box_source_counts_cumul, box_target_counts_cumul, box_has_children, box_levels, nlevels, @@ -953,7 +1436,7 @@ class TreeBuilder(object): box_source_counts_nonchild, box_target_counts_nonchild, # output: - box_child_ids, box_centers, box_flags, + box_flags, ), range=slice(nboxes_post_prune), wait_for=wait_for) @@ -991,9 +1474,7 @@ class TreeBuilder(object): bounding_box=(bbox_min, bbox_max), level_start_box_nrs=level_start_box_nrs, - level_start_box_nrs_dev=cl.array.to_device( - queue, level_start_box_nrs, - allocator=allocator), + level_start_box_nrs_dev=level_start_box_nrs_dev, sources=sources, targets=targets, @@ -1014,7 +1495,7 @@ class TreeBuilder(object): user_source_ids=user_source_ids, sorted_target_ids=sorted_target_ids, - _is_pruned=is_pruned, + _is_pruned=prune_empty_leaves, **extra_tree_attrs ).with_queue(None), evt diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 5a8dbc2..4e772c5 100644 --- a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -40,6 +40,10 @@ logger = logging.getLogger(__name__) # TODO: # - Add *restrict where applicable. +# - Split up the arrays so that there is one array per box level. This avoids +# having to reallocate the middle of an array. +# - Use level-relative box numbering in parent_box_ids, child_box_ids. This +# avoids having to renumber these arrays after reallocation. # ----------------------------------------------------------------------------- # CONTROL FLOW @@ -74,33 +78,43 @@ logger = logging.getLogger(__name__) # ------------------------------- # # This code sorts particles into an nD-tree of boxes. It does this by doing two -# succesive (parallel) scans over particles and a (local, i.e. independent for -# each particle) postprocessing step for each level. +# succesive (parallel) scans and a (local, i.e. independent for each particle) +# postprocessing step. # # The following information is being pushed around by the scans, which # proceed over particles: # # - a cumulative count ("pcnt") and weight ("pwt") of particles in each subbox -# ("morton_nr") at the current level, should the current box need to be -# subdivided. +# ("morton_nr") , should the current box need to be subdivided. # -# - the "split_box_id". This is the box number that the particle gets pushed -# into. The very first entry here gets initialized to the number of boxes -# present at the previous level. +# - the "split_box_id". This is an array that, for each box, answers the +# question, "After I am subdivided, what is the start of the range of boxes +# that my particles get pushed into?" The split_box_id is not meaningful +# unless the box is about to be subdivided. # # Using this data, the stages of the algorithm proceed as follows: # # 1. Count the number of particles in each subbox. This stage uses a segmented -# (per-box) scan to fill "pcnt" and "pwt". +# (per-box) scan to fill "pcnt" and "pwt". This information is kept +# per-particle ("morton_bin_counts") and per-box ("box_morton_bin_counts"). # -# 2. Using a global (non-segmented) scan over the particles, make a decision -# whether to refine each box, and compute the total number of new boxes -# needed. This stage also computes the split_box_id for each particle. If a -# box knows it needs to be subdivided, its first particle asks for 2**d new -# boxes. +# 2. Using a scan over the boxes, segmented by level, make a decision whether to +# refine each box, and compute the split_box_id. This stage also computes the +# total number of new boxes needed. If a box knows it needs to be subdivided, +# it asks for 2**d new boxes at the next level. # # 3. Realize the splitting determined in #2. This stage proceeds in an -# element-wise fashion over the particles. +# elementwise fashion over the particles. +# +# HOW DOES LEVEL RESTRICTION WORK? +# -------------------------------- +# +# This requires some post-processing in the level loop described above: as an +# additional step, the "level restrict" kernel gets run at the end of the level +# loop. The job of the level restrict kernel is to mark boxes on higher levels +# to be split based on looking at the levels of their neighbor boxes. The +# splitting is then realized by the next iteration of the level loop, +# simultaneously with the creation of the lower level. # # ----------------------------------------------------------------------------- @@ -160,6 +174,7 @@ TYPE_DECL_PREAMBLE_TPL = Template(r"""//CL// typedef ${dtype_to_ctype(coord_vec_dtype)} coord_vec_t; typedef ${dtype_to_ctype(box_id_dtype)} box_id_t; typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t; + typedef ${dtype_to_ctype(box_level_dtype)} box_level_t; // morton_nr == -1 is defined to mean that the srcntgt is // remaining at the present level and will not be sorted @@ -200,9 +215,9 @@ GENERIC_PREAMBLE_TPL = Template(r"""//CL// # BEGIN KERNELS IN THE LEVEL LOOP -# {{{ scan primitive code template +# {{{ morton scan -SCAN_PREAMBLE_TPL = Template(r"""//CL// +MORTON_NR_SCAN_PREAMBLE_TPL = Template(r"""//CL// // {{{ neutral element @@ -265,7 +280,7 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL// scan_t scan_t_from_particle( const int i, - const int level, + const int particle_level, bbox_t const *bbox, global morton_nr_t *morton_nrs, // output/side effect global particle_id_t *user_srcntgt_ids, @@ -280,10 +295,10 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL// { particle_id_t user_srcntgt_id = user_srcntgt_ids[i]; - // Recall that 'level' is the level currently being built, e.g. 1 at - // the root. This should be 0.5 at level 1. (Level 0 is the root.) + // The next level is 1 + the current level of the particle. + // This should be 0.5 when next level = 1. (Level 0 is the root.) coord_t next_level_box_size_factor = - ((coord_t) 1) / ((coord_t) (1U << level)); + ((coord_t) 1) / ((coord_t) (1U << (1 + particle_level))); %if srcntgts_have_extent: bool stop_srcntgt_descent = false; @@ -312,14 +327,15 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL// // level, and it isn't either by the fact that boxes are // [)-half-open in subsequent levels. - // So (1 << level) is 2 when building level 1. Because the - // floating point factor is strictly less than 1, 2 is never - // reached, so when building level 1, the result is either 0 or 1. + // So (1 << (1 + particle_level)) is 2 when building level 1. + // Because the floating point factor is strictly less than 1, 2 is + // never reached, so when building level 1, the result is either + // 0 or 1. // After that, we just add one (less significant) bit per level. unsigned ${ax}_bits = (unsigned) ( ((srcntgt_${ax} - global_min_${ax}) / global_extent_${ax}) - * (1U << level)); + * (1U << (1 + particle_level))); %if srcntgts_have_extent: // Need to compute center to compare excess with STICK_OUT_FACTOR. @@ -383,9 +399,9 @@ SCAN_PREAMBLE_TPL = Template(r"""//CL// # }}} -# {{{ scan output code template +# {{{ morton scan output -SCAN_OUTPUT_STMT_TPL = Template(r"""//CL// +MORTON_NR_SCAN_OUTPUT_STMT_TPL = Template(r"""//CL// { particle_id_t my_id_in_my_box = -1 %if srcntgts_have_extent: @@ -399,6 +415,7 @@ SCAN_OUTPUT_STMT_TPL = Template(r"""//CL// morton_bin_counts[i] = item; box_id_t current_box_id = srcntgt_box_ids[i]; + particle_id_t box_srcntgt_count = box_srcntgt_counts_cumul[current_box_id]; // Am I the last particle in my current box? @@ -422,43 +439,45 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( arguments=r"""//CL:mako// /* input */ box_id_t *srcntgt_box_ids, - particle_id_t *box_srcntgt_starts, particle_id_t *box_srcntgt_counts_cumul, morton_counts_t *box_morton_bin_counts, refine_weight_t *refine_weights, refine_weight_t max_leaf_refine_weight, box_level_t *box_levels, - box_level_t level, - - /* input/output */ - box_id_t *nboxes, + box_id_t *level_start_box_ids, + box_id_t *level_used_box_counts, + int *box_force_split, + box_level_t last_level, /* output */ int *box_has_children, box_id_t *split_box_ids, + int *have_oversize_split_box, """, preamble=r"""//CL:mako// scan_t count_new_boxes_needed( - particle_id_t i, box_id_t box_id, + box_level_t level, + box_level_t last_level, refine_weight_t max_leaf_refine_weight, - __global box_id_t *nboxes, - __global particle_id_t *box_srcntgt_starts, __global particle_id_t *box_srcntgt_counts_cumul, __global morton_counts_t *box_morton_bin_counts, - __global box_level_t *box_levels, - __global int *box_has_children, // output/side effect - box_level_t level + __global box_id_t *level_start_box_ids, + __global box_id_t *level_used_box_counts, + __global int *box_force_split, + __global int *have_oversize_split_box, // output/side effect + __global int *box_has_children // output/side effect ) { scan_t result = 0; - // First particle? Start counting at (the previous level's) nboxes. - if (i == 0) - result += *nboxes; - - particle_id_t first_particle_in_my_box = - box_srcntgt_starts[box_id]; + // First box at my level? Start counting at the number of boxes + // used at the child level. + if (box_id == level_start_box_ids[level]) + { + result += level_start_box_ids[level + 1]; + result += level_used_box_counts[level + 1]; + } %if srcntgts_have_extent: const particle_id_t nonchild_srcntgts_in_box = @@ -475,18 +494,9 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( %endfor // Add 2**d to make enough room for a split of the current box - // This will be the split_box_id for *all* particles in this box, - // including non-child srcntgts. - - if (i == first_particle_in_my_box - %if srcntgts_have_extent: - // Only last-level boxes get to produce new boxes. - // If srcntgts have extent, then prior-level boxes - // will keep asking for more boxes to be allocated. - // Prevent that. - && - box_levels[box_id] + 1 == level - %endif + + if (( + level + 1 == last_level && %if adaptive: /* box overfull? */ @@ -497,39 +507,120 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( /* Note: Refine weights are allowed to be 0, so check # of particles directly. */ box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box - > 0 + >= 0 %endif - ) + ) || + box_force_split[box_id]) { result += ${2**dimensions}; box_has_children[box_id] = 1; + + // Check if the box is oversized. This drives the level loop. + refine_weight_t max_subbox_refine_weight = 0; + %for mnr in range(2**dimensions): + max_subbox_refine_weight = max(max_subbox_refine_weight, + box_morton_bin_counts[box_id] + .pwt${padded_bin(mnr, dimensions)}); + %endfor + if (max_subbox_refine_weight > max_leaf_refine_weight) + { + *have_oversize_split_box = 1; + } } return result; } """, input_expr="""count_new_boxes_needed( - i, srcntgt_box_ids[i], max_leaf_refine_weight, nboxes, - box_srcntgt_starts, box_srcntgt_counts_cumul, box_morton_bin_counts, - box_levels, box_has_children, level + i, box_levels[i], last_level, max_leaf_refine_weight, + box_srcntgt_counts_cumul, box_morton_bin_counts, + level_start_box_ids, level_used_box_counts, + box_force_split, have_oversize_split_box, + box_has_children )""", - scan_expr="a + b", + scan_expr="across_seg_boundary ? b : a + b", neutral="0", - output_statement="""//CL// + is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i-1]", + output_statement=r"""//CL// dbg_assert(item >= 0); split_box_ids[i] = item; - // Am I the last particle overall? If so, write box count - if (i+1 == N) - *nboxes = item; """) # }}} -# {{{ split-and-sort kernel +# {{{ box splitter kernel + +BOX_SPLITTER_KERNEL_TPL = Template(r"""//CL// + box_id_t ibox = i; + + bool do_split_box = + (box_has_children[ibox] && box_levels[ibox] + 1 == level) || + box_force_split[ibox]; + + if (!do_split_box) + { + PYOPENCL_ELWISE_CONTINUE; + } + + // {{{ Set up child box data structure. + + morton_counts_t box_morton_bin_count = box_morton_bin_counts[ibox]; + + %for mnr in range(2**dimensions): + { + box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + ${mnr}; + + // Parent / child / level info + box_parent_ids[new_box_id] = ibox; + box_child_ids_mnr_${mnr}[ibox] = new_box_id; + box_level_t new_level = box_levels[ibox] + 1; + box_levels[new_box_id] = new_level; + + // Box particle counts / starting particle number + particle_id_t new_count = + box_morton_bin_count.pcnt${padded_bin(mnr, dimensions)}; + box_srcntgt_counts_cumul[new_box_id] = new_count; + + // Only set the starting particle number / start flags if + // the new box has particles to begin with. + if (new_count > 0) + { + particle_id_t new_box_start = box_srcntgt_starts[ibox] + %if srcntgts_have_extent: + + box_morton_bin_count.nonchild_srcntgts + %endif + %for sub_mnr in range(mnr): + + box_morton_bin_count.pcnt${padded_bin(sub_mnr, dimensions)} + %endfor + ; + + box_start_flags[new_box_start] = 1; + box_srcntgt_starts[new_box_id] = new_box_start; + } + + // Compute box center. + coord_t radius = (root_extent * 1 / (coord_t) (1 << (1 + new_level))); + + %for idim, ax in enumerate(axis_names): + { + <% has_bit = mnr & 2**(dimensions-1-idim) %> + box_centers_${ax}[new_box_id] = box_centers_${ax}[ibox] + ${"+" if has_bit else "-"} radius; + } + %endfor + } + %endfor + + // }}} +""", strict_undefined=True) + +# }}} + +# {{{ post-split particle renumbering -SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL// +PARTICLE_RENUMBERER_PREAMBLE_TPL = Template(r"""//CL// <% def get_count_for_branch(known_bits): if len(known_bits) == dimensions: @@ -556,141 +647,233 @@ SPLIT_AND_SORT_PREAMBLE_TPL = Template(r"""//CL// """, strict_undefined=True) -SPLIT_AND_SORT_KERNEL_TPL = Template(r"""//CL// +PARTICLE_RENUMBERER_KERNEL_TPL = Template(r"""//CL// box_id_t ibox = srcntgt_box_ids[i]; dbg_assert(ibox >= 0); - dbg_assert(ibox < nboxes); dbg_printf(("postproc %d:\n", i)); dbg_printf((" my box id: %d\n", ibox)); - bool do_split_box = box_has_children[ibox]; - %if srcntgts_have_extent: - ## Only do split-box processing for srcntgts that were touched - ## on the immediately preceding level. - ## - ## If srcntgts have no extent, then subsequent levels - ## will never decide to split boxes that were kept unsplit on prior - ## levels either. If srcntgts do - ## have an extent, this could happen. Prevent running the - ## split code for such particles. - - int box_level = box_levels[ibox]; - do_split_box = do_split_box && box_level + 1 == level; - %endif + bool do_split_box = (box_has_children[ibox] && box_levels[ibox] + 1 == level) || + box_force_split[ibox]; - if (do_split_box) + if (!do_split_box) { - morton_nr_t my_morton_nr = morton_nrs[i]; - dbg_printf((" my morton nr: %d\n", my_morton_nr)); + // Not splitting? Copy over existing particle info. + new_user_srcntgt_ids[i] = user_srcntgt_ids[i]; + new_srcntgt_box_ids[i] = ibox; - morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox]; + PYOPENCL_ELWISE_CONTINUE; + } - morton_counts_t my_morton_bin_counts = morton_bin_counts[i]; - particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr); + morton_nr_t my_morton_nr = morton_nrs[i]; + // printf(" my morton nr: %d\n", my_morton_nr); - // {{{ compute this srcntgt's new index + morton_counts_t my_box_morton_bin_counts = box_morton_bin_counts[ibox]; - particle_id_t my_box_start = box_srcntgt_starts[ibox]; - particle_id_t tgt_particle_idx = my_box_start + my_count-1; - %if srcntgts_have_extent: - tgt_particle_idx += - (my_morton_nr >= 0) - ? my_box_morton_bin_counts.nonchild_srcntgts - : 0; - %endif - %for mnr in range(2**dimensions): - <% bin_nmr = padded_bin(mnr, dimensions) %> - tgt_particle_idx += - (my_morton_nr > ${mnr}) - ? my_box_morton_bin_counts.pcnt${bin_nmr} - : 0; - %endfor + morton_counts_t my_morton_bin_counts = morton_bin_counts[i]; + particle_id_t my_count = get_count(my_morton_bin_counts, my_morton_nr); - dbg_assert(tgt_particle_idx < n); - dbg_printf((" moving %ld -> %d " - "(ibox %d, my_box_start %d, my_count %d)\n", - i, tgt_particle_idx, - ibox, my_box_start, my_count)); + // {{{ compute this srcntgt's new index - new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i]; + particle_id_t my_box_start = box_srcntgt_starts[ibox]; + particle_id_t tgt_particle_idx = my_box_start + my_count-1; + %if srcntgts_have_extent: + tgt_particle_idx += + (my_morton_nr >= 0) + ? my_box_morton_bin_counts.nonchild_srcntgts + : 0; + %endif + %for mnr in range(2**dimensions): + <% bin_nmr = padded_bin(mnr, dimensions) %> + tgt_particle_idx += + (my_morton_nr > ${mnr}) + ? my_box_morton_bin_counts.pcnt${bin_nmr} + : 0; + %endfor + + dbg_assert(tgt_particle_idx < n); + dbg_printf((" moving %ld -> %d " + "(ibox %d, my_box_start %d, my_count %d)\n", + i, tgt_particle_idx, + ibox, my_box_start, my_count)); + + new_user_srcntgt_ids[tgt_particle_idx] = user_srcntgt_ids[i]; - // }}} + // }}} - // {{{ compute this srcntgt's new box id + // {{{ compute this srcntgt's new box id - box_id_t new_box_id = split_box_ids[i] - ${2**dimensions} + my_morton_nr; + box_id_t new_box_id = split_box_ids[ibox] - ${2**dimensions} + my_morton_nr; - %if srcntgts_have_extent: - if (my_morton_nr == -1) - new_box_id = ibox; - %endif + %if srcntgts_have_extent: + if (my_morton_nr == -1) + { + new_box_id = ibox; + } + %endif - dbg_printf((" new_box_id: %d\n", new_box_id)); - dbg_assert(new_box_id >= 0); + dbg_printf((" new_box_id: %d\n", new_box_id)); + dbg_assert(new_box_id >= 0); - new_srcntgt_box_ids[tgt_particle_idx] = new_box_id; + new_srcntgt_box_ids[tgt_particle_idx] = new_box_id; - // }}} + // }}} +""", strict_undefined=True) - // {{{ set up child box data structure +# }}} - %for mnr in range(2**dimensions): - /* Am I the last particle in my Morton bin? */ - %if mnr > 0: - else - %endif - if (${mnr} == my_morton_nr - && my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)} - == my_count) - { - dbg_printf((" ## splitting\n")); +# {{{ level restrict kernel - particle_id_t new_box_start = my_box_start - %if srcntgts_have_extent: - + my_box_morton_bin_counts.nonchild_srcntgts - %endif - %for sub_mnr in range(mnr): - + my_box_morton_bin_counts.pcnt${padded_bin(sub_mnr, dimensions)} - %endfor - ; +from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS - dbg_printf((" new_box_start: %d\n", new_box_start)); +LEVEL_RESTRICT_TPL = Template( + TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako// + <%def name="my_load_center(name, box_id)"> + ## This differs from load_center() because in this kernel box centers + ## live in one array per axis. + coord_vec_t ${name}; + %for i in range(dimensions): + ${name}.${AXIS_NAMES[i]} = box_centers_${AXIS_NAMES[i]}[${box_id}]; + %endfor + - box_start_flags[new_box_start] = 1; - box_srcntgt_starts[new_box_id] = new_box_start; - box_parent_ids[new_box_id] = ibox; - box_morton_nrs[new_box_id] = my_morton_nr; + #define NLEVELS (${max_levels}) - particle_id_t new_count = - my_box_morton_bin_counts.pcnt${padded_bin(mnr, dimensions)}; - box_srcntgt_counts_cumul[new_box_id] = new_count; - box_levels[new_box_id] = level; + box_id_t box_id = i; - refine_weight_t new_weight = - my_box_morton_bin_counts.pwt${padded_bin(mnr, dimensions)}; + // Skip unless this box is a leaf. + if (box_has_children[box_id]) + { + PYOPENCL_ELWISE_CONTINUE; + } - // This drives the level loop. - if (new_weight > max_leaf_refine_weight) - { - *have_oversize_split_box = 1; - } + ${walk_init(0)} + + // Descend the tree searching for neighboring leaves. + while (continue_walk) + { + box_id_t child_box_id; + // Look for the child in the appropriate array. + %for morton_nr in range(2**dimensions): + if (walk_morton_nr == ${morton_nr}) + { + child_box_id = box_child_ids_mnr_${morton_nr}[walk_box_id]; + } + %endfor + + if (child_box_id) + { + int child_level = walk_level + 1; + + // Check adjacency. + bool is_adjacent; - dbg_printf((" box pcount: %d\n", - box_srcntgt_counts_cumul[new_box_id])); + if (child_box_id == box_id) + { + // Skip considering self. + is_adjacent = false; + } + else + { + ${my_load_center("box_center", "box_id")} + ${my_load_center("child_center", "child_box_id")} + is_adjacent = is_adjacent_or_overlapping( + root_extent, child_center, child_level, box_center, level, + false); } - %endfor - // }}} - } - else - { - // Not splitting? Copy over existing particle info. - new_user_srcntgt_ids[i] = user_srcntgt_ids[i]; - new_srcntgt_box_ids[i] = ibox; + if (is_adjacent) + { + // Invariant: When new leaves get added, + // they are never more than 2 levels deeper than + // all their adjacent leaves. + // + // Hence in we only need to look at boxes up to + // (level + 2) deep. + + if (box_has_children[child_box_id]) + { + if (child_level <= 1 + level) + { + ${walk_push("child_box_id")} + continue; + } + } + else + { + // We are looking at a neighboring leaf box. + // Check if my box must be split to enforce level + // restriction. + if (child_level == 2 + level || ( + child_level == 1 + level && + box_force_split[child_box_id])) + { + box_force_split[box_id] = 1; + *have_upper_level_split_box = 1; + continue_walk = false; + } + } + } + } + ${walk_advance()} } """, strict_undefined=True) + +def build_level_restrict_kernel(context, generic_preamble, + dimensions, axis_names, box_id_dtype, coord_dtype, + box_level_dtype, max_levels): + from pyopencl.tools import VectorArg, ScalarArg + + arguments = ( + [ + # input + ScalarArg(box_level_dtype, "level"), # [1] + ScalarArg(coord_dtype, "root_extent"), # [1] + VectorArg(np.int32, "box_has_children"), # [nboxes] + + # input/output + VectorArg(np.int32, "box_force_split"), # [nboxes] + + # output + VectorArg(np.int32, "have_upper_level_split_box"), # [1] + ] + # input, length depends on dim + + [VectorArg(box_id_dtype, "box_child_ids_mnr_{mnr}".format(mnr=mnr)) + for mnr in range(2**dimensions)] # [nboxes] + + [VectorArg(coord_dtype, "box_centers_{ax}".format(ax=ax)) + for ax in axis_names] # [nboxes] + ) + + render_vars = dict( + AXIS_NAMES=axis_names, + dimensions=dimensions, + max_levels=max_levels, + # Entries below are needed by HELPER_FUNCTION_TEMPLATE + # and/or TRAVERSAL_PREAMBLE_MAKO_DEFS: + debug=False, + targets_have_extent=False, + sources_have_extent=False + ) + + from boxtree.traversal import HELPER_FUNCTION_TEMPLATE + from pyopencl.elementwise import ElementwiseKernel + + return ElementwiseKernel( + context, + arguments=arguments, + operation=LEVEL_RESTRICT_TPL.render(**render_vars), + name="level_restrict", + preamble=( + str(generic_preamble) + + Template(r""" + #define LEVEL_TO_RAD(level) \ + (root_extent * 1 / (coord_t) (1 << (level + 1))) + """ + + HELPER_FUNCTION_TEMPLATE) + .render(**render_vars))) + # }}} # END KERNELS IN THE LEVEL LOOP @@ -911,16 +1094,12 @@ SRCNTGT_PERMUTER_TPL = ElementwiseTemplate( # }}} - # {{{ box info kernel BOX_INFO_KERNEL_TPL = ElementwiseTemplate( arguments="""//CL:mako// /* input */ box_id_t *box_parent_ids, - morton_nr_t *box_morton_nrs, - bbox_t bbox, - box_id_t aligned_nboxes, particle_id_t *box_srcntgt_counts_cumul, particle_id_t *box_source_counts_cumul, particle_id_t *box_target_counts_cumul, @@ -933,8 +1112,6 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate( particle_id_t *box_target_counts_nonchild, /* output */ - box_id_t *box_child_ids, /* [2**dimensions, aligned_nboxes] */ - coord_t *box_centers, /* [dimensions, aligned_nboxes] */ box_flags_t *box_flags, /* [nboxes] */ """, operation=r"""//CL:mako// @@ -947,10 +1124,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate( * * box_srcntgt_counts_cumul is zero (here) exactly for empty leaves * because it gets initialized to zero and never gets set to another - * value. If you check above, most box info is only ever initialized - * *if* there's a particle in the box, because the sort/build is a - * repeated scan over *particles* (not boxes). Thus, no particle -> no - * work done. + * value. */ particle_id_t particle_count = box_srcntgt_counts_cumul[box_id]; @@ -982,19 +1156,7 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate( dbg_assert(particle_count >= nonchild_srcntgt_count); - if (particle_count == 0) - { - // Empty leaf: Lots of stuff uninitialized, prevent - // damage by quitting now. - - // Also, those should have gotten pruned by this point, - // unless skip_prune is True. - - box_flags[box_id] = 0; // no children, no sources, no targets, bye. - - PYOPENCL_ELWISE_CONTINUE; - } - else if (box_has_children[box_id]) + if (box_has_children[box_id]) { // This box has children, it is not a leaf. @@ -1046,55 +1208,21 @@ BOX_INFO_KERNEL_TPL = ElementwiseTemplate( } box_flags[box_id] = my_box_flags; - - box_id_t parent_id = box_parent_ids[box_id]; - morton_nr_t morton_nr = box_morton_nrs[box_id]; - box_child_ids[parent_id + aligned_nboxes*morton_nr] = box_id; - - /* walk up to root to find center */ - %for idim in range(dimensions): - coord_t center_${idim} = 0; - %endfor - - box_id_t walk_parent_id = parent_id; - box_id_t current_box_id = box_id; - morton_nr_t walk_morton_nr = morton_nr; - while (walk_parent_id != current_box_id) - { - %for idim in range(dimensions): - { - bool has_bit = (walk_morton_nr & ${2**(dimensions-1-idim)}); - center_${idim} = one_half*( - center_${idim} - - one_half - + has_bit); - } - %endfor - - current_box_id = walk_parent_id; - walk_parent_id = box_parent_ids[walk_parent_id]; - walk_morton_nr = box_morton_nrs[current_box_id]; - } - - coord_t extent = bbox.max_x - bbox.min_x; - %for idim in range(dimensions): - { - box_centers[box_id + aligned_nboxes*${idim}] = - bbox.min_${AXIS_NAMES[idim]} + extent*(one_half+center_${idim}); - } - %endfor """) # }}} - # {{{ kernel creation top-level + def get_tree_build_kernel_info(context, dimensions, coord_dtype, particle_id_dtype, box_id_dtype, sources_are_targets, srcntgts_have_extent, stick_out_factor, morton_nr_dtype, box_level_dtype, - adaptive): + kind): + + level_restrict = (kind == "adaptive-level-restricted") + adaptive = not (kind == "non-adaptive") logger.info("start building tree build kernels") @@ -1138,6 +1266,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, morton_bin_count_dtype=morton_bin_count_dtype, morton_nr_dtype=morton_nr_dtype, box_id_dtype=box_id_dtype, + box_level_dtype=box_level_dtype, dtype_to_ctype=dtype_to_ctype, AXIS_NAMES=AXIS_NAMES, box_flags_enum=box_flags_enum, @@ -1170,7 +1299,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, scan_preamble = ( preamble_with_dtype_decls - + str(SCAN_PREAMBLE_TPL.render(**codegen_args)) + + str(MORTON_NR_SCAN_PREAMBLE_TPL.render(**codegen_args)) ) from pyopencl.tools import VectorArg, ScalarArg @@ -1189,14 +1318,14 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # segment flags # invariant to sorting once set # (particles are only reordered within a box) - VectorArg(np.uint8, "box_start_flags"), # [nsrcntgts] + VectorArg(np.uint8, "box_start_flags"), # [nsrcntgts] VectorArg(box_id_dtype, "srcntgt_box_ids"), # [nsrcntgts] - VectorArg(box_id_dtype, "split_box_ids"), # [nsrcntgts] + VectorArg(box_id_dtype, "split_box_ids"), # [nboxes] # per-box morton bin counts VectorArg(morton_bin_count_dtype, "box_morton_bin_counts"), - # [nsrcntgts] + # [nboxes] VectorArg(refine_weight_dtype, "refine_weights"), # [nsrcntgts] @@ -1212,12 +1341,8 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # pointer to parent box VectorArg(box_id_dtype, "box_parent_ids"), # [nboxes] - # morton nr identifier {quadr,oct}ant of parent in which this - # box was created - VectorArg(morton_nr_dtype, "box_morton_nrs"), # [nboxes] - - # number of boxes total - VectorArg(box_id_dtype, "nboxes"), # [1] + # level number + VectorArg(box_level_dtype, "box_levels"), # [nboxes] ScalarArg(np.int32, "level"), ScalarArg(bbox_dtype, "bbox"), @@ -1239,7 +1364,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, input_expr=( "scan_t_from_particle(%s)" % ", ".join([ - "i", "level", "&bbox", "morton_nrs", + "i", "box_levels[srcntgt_box_ids[i]]", "&bbox", "morton_nrs", "user_srcntgt_ids", "refine_weights", ] @@ -1248,7 +1373,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, scan_expr="scan_t_add(a, b, across_seg_boundary)", neutral="scan_t_neutral()", is_segment_start_expr="box_start_flags[i]", - output_statement=SCAN_OUTPUT_STMT_TPL.render(**codegen_args), + output_statement=MORTON_NR_SCAN_OUTPUT_STMT_TPL.render(**codegen_args), preamble=scan_preamble) # }}} @@ -1277,39 +1402,87 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # }}} - # {{{ split-and-sort + # {{{ box splitter + + # Work around a bug in Mako < 0.7.3 + # FIXME: Is this needed? + box_s_codegen_args = codegen_args.copy() + box_s_codegen_args.update( + dim=None, + boundary_morton_nr=None) + + box_splitter_kernel_source = BOX_SPLITTER_KERNEL_TPL.render(**box_s_codegen_args) + + from pyopencl.elementwise import ElementwiseKernel + box_splitter_kernel = ElementwiseKernel( + context, + common_arguments + + [ + VectorArg(np.int32, "box_has_children", with_offset=True), + VectorArg(np.int32, "box_force_split", with_offset=True), + ScalarArg(coord_dtype, "root_extent"), + ] + + [VectorArg(box_id_dtype, "box_child_ids_mnr_{mnr}".format(mnr=mnr)) + for mnr in range(2**dimensions)] + + [VectorArg(coord_dtype, "box_centers_{ax}".format(ax=ax)) + for ax in axis_names], + str(box_splitter_kernel_source), + name="box_splitter", + preamble=preamble_with_dtype_decls + ) + + # }}} + + # {{{ particle renumberer # Work around a bug in Mako < 0.7.3 - s_and_s_codegen_args = codegen_args.copy() - s_and_s_codegen_args.update( + # FIXME: Copied from above. It may not be necessary? + part_rn_codegen_args = codegen_args.copy() + part_rn_codegen_args.update( dim=None, boundary_morton_nr=None) - split_and_sort_preamble = \ - SPLIT_AND_SORT_PREAMBLE_TPL.render(**s_and_s_codegen_args) + particle_renumberer_preamble = \ + PARTICLE_RENUMBERER_PREAMBLE_TPL.render(**part_rn_codegen_args) - split_and_sort_kernel_source = SPLIT_AND_SORT_KERNEL_TPL.render(**codegen_args) + particle_renumberer_kernel_source = \ + PARTICLE_RENUMBERER_KERNEL_TPL.render(**codegen_args) from pyopencl.elementwise import ElementwiseKernel - split_and_sort_kernel = ElementwiseKernel( + particle_renumberer_kernel = ElementwiseKernel( context, common_arguments + [ + VectorArg(np.int32, "box_has_children", with_offset=True), + VectorArg(np.int32, "box_force_split", with_offset=True), VectorArg(particle_id_dtype, "new_user_srcntgt_ids", with_offset=True), - VectorArg(np.int32, "have_oversize_split_box", with_offset=True), VectorArg(box_id_dtype, "new_srcntgt_box_ids", with_offset=True), - VectorArg(box_level_dtype, "box_levels", with_offset=True), - VectorArg(np.int32, "box_has_children", with_offset=True), ], - str(split_and_sort_kernel_source), name="split_and_sort", + str(particle_renumberer_kernel_source), name="renumber_particles", preamble=( preamble_with_dtype_decls - + str(split_and_sort_preamble)) + + str(particle_renumberer_preamble)) ) # }}} + # {{{ level restrict propagator + + if level_restrict: + # At compile time the level restrict kernel requires fixing a + # "max_levels" constant for traversing the tree. This constant cannot be + # known at this point, hence we return a kernel builder. + + from functools import partial + level_restrict_kernel_builder = partial(build_level_restrict_kernel, + context, preamble_with_dtype_decls, dimensions, axis_names, box_id_dtype, + coord_dtype, box_level_dtype) + else: + level_restrict_kernel_builder = None + + # }}} + # END KERNELS IN LEVEL LOOP if srcntgts_have_extent: @@ -1338,22 +1511,51 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # input VectorArg(particle_id_dtype, "box_srcntgt_counts_cumul"), # output - VectorArg(box_id_dtype, "to_box_id"), - VectorArg(box_id_dtype, "from_box_id"), + VectorArg(box_id_dtype, "src_box_id"), + VectorArg(box_id_dtype, "dst_box_id"), VectorArg(box_id_dtype, "nboxes_post_prune"), ], - input_expr="box_srcntgt_counts_cumul[i] == 0 ? 1 : 0", + input_expr="box_srcntgt_counts_cumul[i] != 0", preamble=box_flags_enum.get_c_defines(), scan_expr="a+b", neutral="0", output_statement=""" - to_box_id[i] = i-prev_item; if (box_srcntgt_counts_cumul[i]) - from_box_id[i-prev_item] = i; - if (i+1 == N) *nboxes_post_prune = N-item; + { + dst_box_id[i] = item - 1; + src_box_id[item - 1] = i; + } + if (i+1 == N) *nboxes_post_prune = item; """) # }}} + # {{{ find new level box counts + + find_level_box_counts_kernel = GenericScanKernel( + context, box_id_dtype, + arguments=[ + # input + VectorArg(box_level_dtype, "box_levels"), # [nboxes] + # output + VectorArg(box_id_dtype, "level_box_counts"), # [nlevels] + ], + input_expr="1", + is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i - 1]", + scan_expr="across_seg_boundary ? b : a + b", + neutral="0", + output_statement=r"""//CL// + if (i + 1 == N) + { + level_box_counts[box_levels[i]] = item; + } + else if (box_levels[i] != box_levels[i + 1]) + { + level_box_counts[box_levels[i]] = item; + } + """) + + # }}} + # {{{ particle permuter # used if there is only one source/target array @@ -1441,11 +1643,15 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, morton_count_scan=morton_count_scan, split_box_id_scan=split_box_id_scan, - split_and_sort_kernel=split_and_sort_kernel, + box_splitter_kernel=box_splitter_kernel, + particle_renumberer_kernel=particle_renumberer_kernel, + level_restrict=level_restrict, + level_restrict_kernel_builder=level_restrict_kernel_builder, extract_nonchild_srcntgt_count_kernel=( extract_nonchild_srcntgt_count_kernel), find_prune_indices_kernel=find_prune_indices_kernel, + find_level_box_counts_kernel=find_level_box_counts_kernel, srcntgt_permuter=srcntgt_permuter, source_counter=source_counter, source_and_target_index_finder=source_and_target_index_finder, @@ -1454,7 +1660,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # }}} - # {{{ point source linking kernels # scan over (non-point) source ids in tree order @@ -1538,7 +1743,6 @@ POINT_SOURCE_LINKING_BOX_POINT_SOURCES = ElementwiseTemplate( # }}} - # {{{ target filtering TREE_ORDER_TARGET_FILTER_SCAN_TPL = ScanTemplate( diff --git a/doc/tree.rst b/doc/tree.rst index 328d96e..ab4af98 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -3,6 +3,24 @@ Tree building .. currentmodule:: boxtree +.. _tree-kinds: + +Supported tree kinds +-------------------- + +The following tree kinds are supported: + +- *Nonadaptive* trees have all leaves on the same (last) level. + +- *Adaptive* trees differ from nonadaptive trees in that they may have leaves on + more than one level. Adaptive trees have the option of being + *level-restricted*: in a level-restricted tree, neighboring leaves differ by + at most one level. + +All trees returned by the tree builder are pruned so that empty leaves have been +removed. If a level-restricted tree is requested, the tree gets constructed in +such a way that the version of the tree before pruning is also level-restricted. + Tree data structure ------------------- diff --git a/test/test_tree.py b/test/test_tree.py index c6d8cf3..7ad1336 100644 --- a/test/test_tree.py +++ b/test/test_tree.py @@ -329,7 +329,7 @@ def test_non_adaptive_particle_tree(ctx_getter, dtype, dims, do_plot=False): builder = TreeBuilder(ctx) run_build_test(builder, queue, dims, dtype, 10**4, - max_particles_in_box=30, do_plot=do_plot, non_adaptive=True) + max_particles_in_box=30, do_plot=do_plot, kind="non-adaptive") # }}} @@ -655,8 +655,6 @@ def test_geometry_query(ctx_getter, dims, do_plot=False): near_circles, = np.where(linf_circle_dists - ball_radii < box_rad) start, end = lbl.balls_near_box_starts[ibox:ibox+2] - #print sorted(lbl.balls_near_box_lists[start:end]) - #print sorted(near_circles) assert sorted(lbl.balls_near_box_lists[start:end]) == sorted(near_circles) # }}} @@ -676,7 +674,10 @@ def test_area_query(ctx_getter, dims, do_plot=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(queue, nparticles, dims, dtype) + from boxtree.tools import make_surface_particle_array + particles = make_surface_particle_array(queue, nparticles, dims, dtype, seed=15) + + #particles = make_normal_particle_array(queue, nparticles, dims, dtype) if do_plot: import matplotlib.pyplot as pt @@ -732,6 +733,85 @@ def test_area_query(ctx_getter, dims, do_plot=False): # }}} +# {{{ level restriction test + +@pytest.mark.opencl +@pytest.mark.parametrize("lookbehind", [0, 1]) +@pytest.mark.parametrize("skip_prune", [True, False]) +@pytest.mark.parametrize("dims", [2, 3]) +def test_level_restriction(ctx_getter, dims, skip_prune, lookbehind, do_plot=False): + ctx = ctx_getter() + queue = cl.CommandQueue(ctx) + + nparticles = 10**5 + dtype = np.float64 + + from boxtree.tools import make_surface_particle_array + particles = make_surface_particle_array(queue, nparticles, dims, dtype, seed=15) + + if do_plot: + import matplotlib.pyplot as pt + pt.plot(particles[0].get(), particles[1].get(), "x") + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + + queue.finish() + tree_dev, _ = tb(queue, particles, kind="adaptive-level-restricted", + max_particles_in_box=30, debug=True, + skip_prune=skip_prune, lr_lookbehind=lookbehind) + + def find_neighbors(leaf_box_centers, leaf_box_radii): + # We use an area query with a ball that is slightly larger than + # the size of a leaf box to find the neighboring leaves. + # + # Note that since this comes from an area query, the self box will be + # included in the neighbor list. + from boxtree.area_query import AreaQueryBuilder + aqb = AreaQueryBuilder(ctx) + + ball_radii = cl.array.to_device(queue, + np.min(leaf_box_radii) / 2 + leaf_box_radii) + leaf_box_centers = [ + cl.array.to_device(queue, axis) for axis in leaf_box_centers] + + area_query, _ = aqb(queue, tree_dev, leaf_box_centers, ball_radii) + area_query = area_query.get(queue=queue) + return (area_query.leaves_near_ball_starts, + area_query.leaves_near_ball_lists) + + # Get data to host for test. + tree = tree_dev.get(queue=queue) + + # Find leaf boxes. + from boxtree import box_flags_enum + leaf_boxes = np.array([ibox for ibox in range(tree.nboxes) + if not (tree.box_flags[ibox] & box_flags_enum.HAS_CHILDREN)]) + + leaf_box_radii = np.empty(len(leaf_boxes)) + leaf_box_centers = np.empty((dims, len(leaf_boxes))) + + for idx, leaf_box in enumerate(leaf_boxes): + box_center = tree.box_centers[:, leaf_box] + ext_l, ext_h = tree.get_box_extent(leaf_box) + leaf_box_radii[idx] = np.max(ext_h - ext_l) * 0.5 + leaf_box_centers[:, idx] = box_center + + neighbor_starts, neighbor_and_self_lists = find_neighbors( + leaf_box_centers, leaf_box_radii) + + # Check level restriction. + for leaf_idx, leaf in enumerate(leaf_boxes): + neighbors = neighbor_and_self_lists[ + neighbor_starts[leaf_idx]:neighbor_starts[leaf_idx+1]] + neighbor_levels = np.array(tree.box_levels[neighbors], dtype=int) + leaf_level = int(tree.box_levels[leaf]) + assert (np.abs(neighbor_levels - leaf_level) <= 1).all(), \ + (neighbor_levels, leaf_level) + +# }}} + + # You can test individual routines by typing # $ python test_tree.py 'test_routine(cl.create_some_context)' -- GitLab From cb734527396554b1e810d635786c96fef18ca3cc Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 16:24:21 -0500 Subject: [PATCH 02/13] Py2.7 fix --- boxtree/tree_build.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index ab0fd86..35e9325 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -776,13 +776,16 @@ class TreeBuilder(object): srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids) resize_events.append(evt) - del my_realloc del my_realloc_zeros - del my_realloc_zeros_and_renumber del my_realloc_nocopy del my_realloc_zeros_nocopy del renumber_array + # Can't del on Py2.7 - these are used in generator expressions + # above, which are nested scopes + my_realloc = None + my_realloc_zeros_and_renumber = None + # retry logger.info("nboxes_guess exceeded: " "enlarged allocations, restarting level") -- GitLab From 26e285ba37dd620e1ab45c0d8c4020448739c7c9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 16:30:54 -0500 Subject: [PATCH 03/13] More Py2.7 fixes --- boxtree/tree_build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index 35e9325..81c2d86 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -837,7 +837,8 @@ class TreeBuilder(object): level_start:level_start + level_nboxes]).get()) assert leaf_count == nleaves_actual - del new_level_leaf_counts + # Can't del in Py2.7 - see note below + new_level_leaf_counts = None # }}} -- GitLab From dba1582a9474313a3ec60548fbaa46e66382618e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 16:53:02 -0500 Subject: [PATCH 04/13] Still more Py2.7 fixes --- boxtree/tree_build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index 81c2d86..ed37c99 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -855,7 +855,7 @@ class TreeBuilder(object): + box_centers) evt = knl_info.box_splitter_kernel(*box_splitter_args, - range=range(level_start_box_nrs[-1]), + range=slice(level_start_box_nrs[-1]), wait_for=wait_for) wait_for = [evt] @@ -892,7 +892,7 @@ class TreeBuilder(object): new_user_srcntgt_ids, new_srcntgt_box_ids)) evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args, - range=range(nsrcntgts), wait_for=wait_for) + range=slice(nsrcntgts), wait_for=wait_for) wait_for = [evt] -- GitLab From 16f7c6bd28b18bbd9c65140f9d5e6d041efc5962 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 18:32:47 -0500 Subject: [PATCH 05/13] Revert unintended changes to area query test. --- test/test_tree.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/test_tree.py b/test/test_tree.py index 7ad1336..9f3957d 100644 --- a/test/test_tree.py +++ b/test/test_tree.py @@ -674,10 +674,7 @@ def test_area_query(ctx_getter, dims, do_plot=False): nparticles = 10**5 dtype = np.float64 - from boxtree.tools import make_surface_particle_array - particles = make_surface_particle_array(queue, nparticles, dims, dtype, seed=15) - - #particles = make_normal_particle_array(queue, nparticles, dims, dtype) + particles = make_normal_particle_array(queue, nparticles, dims, dtype) if do_plot: import matplotlib.pyplot as pt -- GitLab From dc2cf87e83370bc80a66cf398b4e7137e4ac37d4 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 25 Aug 2016 23:36:19 -0500 Subject: [PATCH 06/13] [ci skip] Fix typo. --- boxtree/tree_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index ed37c99..77c1d01 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -597,7 +597,7 @@ class TreeBuilder(object): # 1. Compute the minimal necessary size of each level. # 2. If level restricting, add padding to the lower level. # 3. Check if there is enough existing space for each level. - # 4. If any does not have space, reallocate all levels: + # 4. If any level does not have space, reallocate all levels: # 4a. Compute new sizes of upper levels # 4b. If level restricting, add padding to all levels. -- GitLab From 3551fbb98e7f6d94dc93c9b1c125330438a5d8ce Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 26 Aug 2016 11:28:56 -0500 Subject: [PATCH 07/13] Pass debug flag through to gappy copy and map. --- boxtree/tree_build.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index 77c1d01..9ef7dcb 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -668,10 +668,11 @@ class TreeBuilder(object): wait_for.extend(dst_box_id.events) realloc_array = partial(self.gappy_copy_and_map, - dst_indices=dst_box_id, range=slice(old_box_count)) + dst_indices=dst_box_id, range=slice(old_box_count), + debug=debug) realloc_and_renumber_array = partial(self.gappy_copy_and_map, dst_indices=dst_box_id, map_values=dst_box_id, - range=slice(old_box_count)) + range=slice(old_box_count), debug=debug) renumber_array = partial(self.map_values_kernel, dst_box_id) # Update level_start_box_nrs. This will be the @@ -1118,7 +1119,7 @@ class TreeBuilder(object): prune_empty = partial(self.gappy_copy_and_map, queue, allocator, nboxes_post_prune, src_indices=src_box_id, - range=slice(nboxes_post_prune)) + range=slice(nboxes_post_prune), debug=debug) box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts) prune_events.append(evt) -- GitLab From a4b260340ef4bcf5613d0b7822f7ab65d5ad72b2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 26 Aug 2016 12:00:27 -0500 Subject: [PATCH 08/13] Clean up some reallocation code. --- boxtree/tree_build.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index 9ef7dcb..b662494 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -628,8 +628,11 @@ class TreeBuilder(object): # {{{ prepare for reallocation/renumbering - if (knl_info.level_restrict and (nboxes_guess < nboxes_minimal)) \ - or needs_renumbering: + if needs_renumbering: + assert knl_info.level_restrict + + # {{{ compute new level_start_box_nrs + # Represents the amount of padding needed for upper levels. upper_level_padding = np.zeros(level, dtype=int) @@ -651,6 +654,12 @@ class TreeBuilder(object): new_level_used_box_counts[:-1] + new_upper_level_unused_box_counts) + assert not (level_start_box_nrs == new_level_start_box_nrs).all() + + # }}} + + # {{{ set up reallocators + old_box_count = level_start_box_nrs[-1] # Where should I put this box? dst_box_id = cl.array.empty(queue, @@ -675,6 +684,8 @@ class TreeBuilder(object): range=slice(old_box_count), debug=debug) renumber_array = partial(self.map_values_kernel, dst_box_id) + # }}} + # Update level_start_box_nrs. This will be the # level_start_box_nrs for the reallocated data. @@ -684,9 +695,7 @@ class TreeBuilder(object): level_start_box_nrs_updated = True wait_for.extend(level_start_box_nrs_dev.events) - nboxes_new = ( - level_start_box_nrs[-1] - + minimal_new_level_length) + nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length del new_level_start_box_nrs else: -- GitLab From 0e2fac430824cf43279cfd10e18fb7c5f4790fd0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 26 Aug 2016 12:14:13 -0500 Subject: [PATCH 09/13] Simplify box count finder. --- boxtree/tree_build_kernels.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 4e772c5..5f7bb99 100644 --- a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -1540,15 +1540,11 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, VectorArg(box_id_dtype, "level_box_counts"), # [nlevels] ], input_expr="1", - is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i - 1]", + is_segment_start_expr="i + 1 == N || box_levels[i] != box_levels[i + 1]", scan_expr="across_seg_boundary ? b : a + b", neutral="0", output_statement=r"""//CL// - if (i + 1 == N) - { - level_box_counts[box_levels[i]] = item; - } - else if (box_levels[i] != box_levels[i + 1]) + if (i + 1 == N || box_levels[i] != box_levels[i + 1]) { level_box_counts[box_levels[i]] = item; } -- GitLab From 301ffeef50d037008d437de2692609689f631440 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 26 Aug 2016 12:31:33 -0500 Subject: [PATCH 10/13] Fix is_segment_start_expr --- boxtree/tree_build_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 5f7bb99..3b3763b 100644 --- a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -1540,7 +1540,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, VectorArg(box_id_dtype, "level_box_counts"), # [nlevels] ], input_expr="1", - is_segment_start_expr="i + 1 == N || box_levels[i] != box_levels[i + 1]", + is_segment_start_expr="i == 0 || box_levels[i] != box_levels[i - 1]", scan_expr="across_seg_boundary ? b : a + b", neutral="0", output_statement=r"""//CL// -- GitLab From 8670d0a2dc19780d86381ff06a4523c05abfa7a4 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 28 Aug 2016 17:19:53 -0500 Subject: [PATCH 11/13] Address code comments. --- boxtree/tree_build.py | 44 ++++++++++++++++------------- boxtree/tree_build_kernels.py | 52 +++++++++++++++++++++++++---------- test/test_tree.py | 7 ++--- 3 files changed, 64 insertions(+), 39 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index b662494..f0754c2 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -411,13 +411,15 @@ class TreeBuilder(object): box_srcntgt_counts_cumul[0].fill( nsrcntgts, queue=queue, wait_for=[evt]) - # box -> whether the box has a child. FIXME: use bools + # box -> whether the box has a child. FIXME: use smaller integer type box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) prep_events.append(evt) # box -> whether the box needs a splitting to enforce level restriction. # FIXME: use bools - force_split_box, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) + force_split_box, evt = zeros(nboxes_guess + if knl_info.level_restrict + else 0, dtype=np.dtype(np.int32)) prep_events.append(evt) # set parent of root box to itself @@ -447,6 +449,8 @@ class TreeBuilder(object): have_oversize_split_box, evt = zeros((), np.int32) prep_events.append(evt) + # True if and only if the level restrict kernel found a box to split in + # order to enforce level restriction. have_upper_level_split_box, evt = zeros((), np.int32) prep_events.append(evt) @@ -459,8 +463,8 @@ class TreeBuilder(object): # Level 0 starts at 0 and always contains box 0 and nothing else. # Level 1 therefore starts at 1. level_start_box_nrs = [0, 1] - level_start_box_nrs_dev[0].fill(0) - level_start_box_nrs_dev[1].fill(1) + level_start_box_nrs_dev[0] = 0 + level_start_box_nrs_dev[1] = 1 wait_for.extend(level_start_box_nrs_dev.events) # This counts the number of boxes that have been used per level. Note @@ -469,7 +473,7 @@ class TreeBuilder(object): # are pre-allocated for a level than used since we may decide to split # parent level boxes later). level_used_box_counts = [1] - level_used_box_counts_dev[0].fill(1) + level_used_box_counts_dev[0] = 1 wait_for.extend(level_used_box_counts_dev.events) # level -> number of leaf boxes on level. Initially the root node is a @@ -512,8 +516,6 @@ class TreeBuilder(object): if level > np.iinfo(self.box_level_dtype).max: raise RuntimeError("level count exceeded maximum") - cl.wait_for_events(wait_for) - common_args = ((morton_bin_counts, morton_nrs, box_start_flags, srcntgt_box_ids, split_box_ids, @@ -573,17 +575,18 @@ class TreeBuilder(object): int(split_box_ids[last_box_on_prev_level].get()) - level_start_box_id) - new_level_leaf_counts = np.empty(level + 1, dtype=int) # New leaf count = # old leaf count # + nr. new boxes from splitting parent's leaves # - nr. new boxes from splitting current level's leaves / 2**d level_used_box_counts_diff = (new_level_used_box_counts - np.append(level_used_box_counts, [0])) - new_level_leaf_counts[:-1] = (level_leaf_counts + new_level_leaf_counts = (level_leaf_counts + level_used_box_counts_diff[:-1] - level_used_box_counts_diff[1:] // 2 ** dimensions) - new_level_leaf_counts[-1] = level_used_box_counts_diff[-1] + new_level_leaf_counts = np.append( + new_level_leaf_counts, + [level_used_box_counts_diff[-1]]) del level_used_box_counts_diff # }}} @@ -594,10 +597,11 @@ class TreeBuilder(object): # procedure. # The algorithm for deciding on level sizes is as follows: - # 1. Compute the minimal necessary size of each level. - # 2. If level restricting, add padding to the lower level. + # 1. Compute the minimal necessary size of each level, including the + # new level being created. + # 2. If level restricting, add padding to the new level being created. # 3. Check if there is enough existing space for each level. - # 4. If any level does not have space, reallocate all levels: + # 4. If any level does not have sufficient space, reallocate all levels: # 4a. Compute new sizes of upper levels # 4b. If level restricting, add padding to all levels. @@ -746,8 +750,9 @@ class TreeBuilder(object): box_morton_bin_counts) resize_events.append(evt) - force_split_box, evt = my_realloc_zeros(force_split_box) - resize_events.append(evt) + if force_split_box: + force_split_box, evt = my_realloc_zeros(force_split_box) + resize_events.append(evt) box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts) resize_events.append(evt) @@ -852,7 +857,6 @@ class TreeBuilder(object): # }}} - # XXX del nboxes_new del new_level_used_box_counts @@ -944,6 +948,7 @@ class TreeBuilder(object): # Upward pass - check if leaf boxes at higher levels need # further splitting. + assert len(force_split_box) > 0 force_split_box.fill(0) wait_for.extend(force_split_box.events) @@ -953,9 +958,10 @@ class TreeBuilder(object): boxes_split = [] for upper_level, upper_level_start, upper_level_box_count in zip( - # After the new level has been created, only only boxes - # at (level - 2) or above need to be rechecked for - # splitting. + # We just built level. Our parent level doesn't need to + # be rechecked for splitting because the smallest boxes + # in the tree (ours) already have a 2-to-1 ratio with + # that. Start checking at the level above our parent. range(level - 2, 0, -1), # At this point, the last entry in level_start_box_nrs # already refers to (level + 1). diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 3b3763b..97ef52e 100644 --- a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -464,7 +464,9 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( __global morton_counts_t *box_morton_bin_counts, __global box_id_t *level_start_box_ids, __global box_id_t *level_used_box_counts, - __global int *box_force_split, + %if level_restrict: + __global int *box_force_split, + %endif __global int *have_oversize_split_box, // output/side effect __global int *box_has_children // output/side effect ) @@ -509,8 +511,11 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( box_srcntgt_counts_cumul[box_id] - nonchild_srcntgts_in_box >= 0 %endif - ) || - box_force_split[box_id]) + ) + %if level_restrict: + || box_force_split[box_id] + %endif + ) { result += ${2**dimensions}; box_has_children[box_id] = 1; @@ -531,12 +536,21 @@ SPLIT_BOX_ID_SCAN_TPL = ScanTemplate( return result; } """, - input_expr="""count_new_boxes_needed( - i, box_levels[i], last_level, max_leaf_refine_weight, - box_srcntgt_counts_cumul, box_morton_bin_counts, - level_start_box_ids, level_used_box_counts, - box_force_split, have_oversize_split_box, - box_has_children + input_expr=r"""//CL:mako// + count_new_boxes_needed( + i, + box_levels[i], + last_level, + max_leaf_refine_weight, + box_srcntgt_counts_cumul, + box_morton_bin_counts, + level_start_box_ids, + level_used_box_counts, + %if level_restrict: + box_force_split, + %endif + have_oversize_split_box, + box_has_children )""", scan_expr="across_seg_boundary ? b : a + b", neutral="0", @@ -556,8 +570,11 @@ BOX_SPLITTER_KERNEL_TPL = Template(r"""//CL// box_id_t ibox = i; bool do_split_box = - (box_has_children[ibox] && box_levels[ibox] + 1 == level) || - box_force_split[ibox]; + (box_has_children[ibox] && box_levels[ibox] + 1 == level) + %if level_restrict: + || box_force_split[ibox] + %endif + ; if (!do_split_box) { @@ -654,8 +671,11 @@ PARTICLE_RENUMBERER_KERNEL_TPL = Template(r"""//CL// dbg_printf(("postproc %d:\n", i)); dbg_printf((" my box id: %d\n", ibox)); - bool do_split_box = (box_has_children[ibox] && box_levels[ibox] + 1 == level) || - box_force_split[ibox]; + bool do_split_box = (box_has_children[ibox] && box_levels[ibox] + 1 == level) + %if level_restrict: + || box_force_split[ibox] + %endif + ; if (!do_split_box) { @@ -821,7 +841,7 @@ LEVEL_RESTRICT_TPL = Template( """, strict_undefined=True) -def build_level_restrict_kernel(context, generic_preamble, +def build_level_restrict_kernel(context, preamble_with_dtype_decls, dimensions, axis_names, box_id_dtype, coord_dtype, box_level_dtype, max_levels): from pyopencl.tools import VectorArg, ScalarArg @@ -866,7 +886,7 @@ def build_level_restrict_kernel(context, generic_preamble, operation=LEVEL_RESTRICT_TPL.render(**render_vars), name="level_restrict", preamble=( - str(generic_preamble) + + str(preamble_with_dtype_decls) + Template(r""" #define LEVEL_TO_RAD(level) \ (root_extent * 1 / (coord_t) (1 << (level + 1))) @@ -1272,6 +1292,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, box_flags_enum=box_flags_enum, adaptive=adaptive, + level_restrict=level_restrict, sources_are_targets=sources_are_targets, srcntgts_have_extent=srcntgts_have_extent, @@ -1397,6 +1418,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, ("srcntgts_have_extent", srcntgts_have_extent), ("adaptive", adaptive), ("padded_bin", padded_bin), + ("level_restrict", level_restrict), ), more_preamble=generic_preamble) diff --git a/test/test_tree.py b/test/test_tree.py index 9f3957d..1540af0 100644 --- a/test/test_tree.py +++ b/test/test_tree.py @@ -702,9 +702,7 @@ def test_area_query(ctx_getter, dims, do_plot=False): ball_radii = ball_radii.get() from boxtree import box_flags_enum - - leaf_boxes = np.array([ibox for ibox in range(tree.nboxes) - if tree.box_flags[ibox] & ~box_flags_enum.HAS_CHILDREN]) + leaf_boxes, = (tree.box_flags & box_flags_enum.HAS_CHILDREN == 0).nonzero() leaf_box_radii = np.empty(len(leaf_boxes)) leaf_box_centers = np.empty((len(leaf_boxes), dims)) @@ -782,8 +780,7 @@ def test_level_restriction(ctx_getter, dims, skip_prune, lookbehind, do_plot=Fal # Find leaf boxes. from boxtree import box_flags_enum - leaf_boxes = np.array([ibox for ibox in range(tree.nboxes) - if not (tree.box_flags[ibox] & box_flags_enum.HAS_CHILDREN)]) + leaf_boxes, = (tree.box_flags & box_flags_enum.HAS_CHILDREN == 0).nonzero() leaf_box_radii = np.empty(len(leaf_boxes)) leaf_box_centers = np.empty((dims, len(leaf_boxes))) -- GitLab From 0622af075414aced6dca869dc7c5d9349db77f76 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 28 Aug 2016 21:11:13 -0500 Subject: [PATCH 12/13] Fix force_split_box reallocation condition. --- boxtree/tree_build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index f0754c2..a9eb13f 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -750,7 +750,8 @@ class TreeBuilder(object): box_morton_bin_counts) resize_events.append(evt) - if force_split_box: + # force_split_box is unused unless level restriction is enabled. + if knl_info.level_restrict: force_split_box, evt = my_realloc_zeros(force_split_box) resize_events.append(evt) -- GitLab From 161073b35bdd6cb609ccfb5b4ab96d806dc72cb5 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 28 Aug 2016 21:14:38 -0500 Subject: [PATCH 13/13] Fix comment. --- boxtree/tree_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index a9eb13f..1d3614c 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -416,7 +416,7 @@ class TreeBuilder(object): prep_events.append(evt) # box -> whether the box needs a splitting to enforce level restriction. - # FIXME: use bools + # FIXME: use smaller integer type force_split_box, evt = zeros(nboxes_guess if knl_info.level_restrict else 0, dtype=np.dtype(np.int32)) -- GitLab