From d4b22bb5eeb9b2c9282d7ae654e01ad19a7ba7ef Mon Sep 17 00:00:00 2001
From: Hao Gao
Date: Mon, 6 Nov 2017 23:11:16 -0600
Subject: [PATCH] Compress List 3

---
 boxtree/traversal.py | 96 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/boxtree/traversal.py b/boxtree/traversal.py
index a35c60d..e0fe14f 100644
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
@@ -28,6 +28,7 @@ import pyopencl as cl
 import pyopencl.array  # noqa
 import pyopencl.cltypes  # noqa
 from pyopencl.elementwise import ElementwiseTemplate
+from pyopencl.scan import GenericScanKernel
 from mako.template import Template
 from boxtree.tools import AXIS_NAMES, DeviceDataRecord
 
@@ -2017,6 +2018,101 @@ class FMMTraversalBuilder:
                     omit_lists=("from_sep_close_smaller",) if with_extent else (),
                     wait_for=wait_for)
 
+            fin_debug("compressing separated smaller ('list 3 level %d')" % ilevel)
+
+            # Compress away the entries of 'starts' whose list is empty, giving:
+            #   compressed_indices[k]: index in 'starts' of the k-th non-empty list
+            #   compressed_start[k]: the value of 'starts' at that index
+            #   compressed_start[compressed_count]: the last value of 'starts'
+
+            from pyopencl.tools import dtype_to_ctype
+
+            list3_level = result["from_sep_smaller"]
+            list3_starts = list3_level.starts
+            starts_dtype = list3_starts.dtype
+            starts_ctype = dtype_to_ctype(starts_dtype)
+
+            # NOTE(review): both kernels are rebuilt every time this code runs;
+            # if that is once per level, consider building them once up front.
+
+            # Running sum of "entry i-1 is non-empty" flags: starts_scan[i] is the
+            # number of non-empty lists before entry i (its output slot), and the
+            # last element is the total number of non-empty lists.
+            starts_scan_knl = GenericScanKernel(
+                queue.context, starts_dtype,
+                arguments=Template("""
+                    __global ${scan_t} *starts,
+                    __global ${scan_t} *starts_scan
+                    """).render(scan_t=starts_ctype),
+                input_fetch_exprs=[
+                    ("starts_im1", "starts", -1),
+                    ("starts_i", "starts", 0),
+                ],
+                input_expr="(i == 0) || (starts_im1 == starts_i) ? 0 : 1",
+                scan_expr="a+b", neutral="0",
+                output_statement="starts_scan[i] = item;")
+
+            starts_scan = cl.array.empty_like(list3_starts)
+            starts_scan_knl(list3_starts, starts_scan)
+
+            # Each work item walks its own contiguous chunk of entries, so the
+            # kernel covers all of them for any global size.
+            compressed_starts_prg = cl.Program(queue.context, Template("""
+                __kernel void compress(
+                    __global const ${scan_t} *starts,
+                    __global const ${scan_t} *starts_scan,
+                    const ${scan_t} ntarget_boxes,
+                    __global ${scan_t} *compressed_start,
+                    __global ${box_id_t} *compressed_indices)
+                {
+                    int gid = get_global_id(0);
+                    int gsize = get_global_size(0);
+                    ${scan_t} len_per_thread = (ntarget_boxes + gsize - 1) / gsize;
+
+                    for(${scan_t} i = len_per_thread * gid;
+                        i < len_per_thread * (gid + 1) && i < ntarget_boxes;
+                        i++) {
+                        if(starts[i] != starts[i + 1]) {
+                            compressed_start[starts_scan[i]] = starts[i];
+                            compressed_indices[starts_scan[i]] = i;
+                        }
+                    }
+                }
+                """).render(
+                    scan_t=starts_ctype,
+                    box_id_t=dtype_to_ctype(tree.box_id_dtype))
+            ).build()
+            compress_knl = compressed_starts_prg.compress
+            compress_knl.set_scalar_arg_dtypes(
+                [None, None, starts_dtype, None, None])
+
+            # Host read-back of the total: it determines the output array sizes.
+            compressed_count = int(starts_scan[-1:].get()[0])
+            list3_level.compressed_count = compressed_count
+
+            # Allocate unconditionally so that the same attributes are set whether
+            # or not any list is non-empty.
+            compressed_start = cl.array.empty(
+                queue, (compressed_count + 1,), starts_dtype)
+            compressed_indices = cl.array.empty(
+                queue, (compressed_count,), tree.box_id_dtype)
+
+            # Nothing to scatter (and possibly no usable output buffer) when every
+            # list is empty.
+            if compressed_count > 0:
+                compress_knl(queue, (2048,), None,
+                             list3_starts.data,
+                             starts_scan.data,
+                             list3_starts.shape[0] - 1,
+                             compressed_start.data,
+                             compressed_indices.data)
+
+            # End sentinel: copy over the last entry of 'starts'.
+            compressed_start[-1].set(list3_starts[-1].get())
+
+            list3_level.compressed_start = compressed_start
+            list3_level.compressed_indices = compressed_indices
+
             from_sep_smaller_by_level.append(result["from_sep_smaller"])
             from_sep_smaller_wait_for.append(evt)
 
-- 
GitLab