diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 82e767b7337058422f7a893e00b3d1620f20c139..1e4c240b09860ac2630ea9160f8ca487538eebbc 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -186,7 +186,7 @@ class E2EFromCSR(E2EBase): [ lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), lp.GlobalArg("src_box_starts, src_box_lists", - None, shape=None, strides=(1,)), + None, shape=None, strides=(1,), offset=lp.auto), lp.ValueArg("aligned_nboxes,nboxes", np.int32), lp.GlobalArg("src_expansions", None, shape=("nboxes", ncoeff_src)), @@ -196,7 +196,8 @@ class E2EFromCSR(E2EBase): ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), name=self.name, assumptions="ntgt_boxes>=1", - silenced_warnings="write_race(write_expn*)") + silenced_warnings="write_race(write_expn*)", + default_offset=lp.auto) loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim) @@ -230,7 +231,8 @@ class E2EFromChildren(E2EBase): def get_kernel(self): if self.src_expansion is not self.tgt_expansion: raise RuntimeError("%s requires that the source " - "and target expansion are the same object") + "and target expansion are the same object" + % type(self).__name__) ncoeffs = len(self.src_expansion) diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 13dae6bc3ebe46e5b49a62dcb25de77044a4686d..3705832b8414ac746025f60e716528de69d33f88 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -175,7 +175,8 @@ class E2PFromSingleBox(E2PBase): ] + [arg.loopy_arg for arg in self.expansion.get_args()], name=self.name, assumptions="ntgt_boxes>=1", - silenced_warnings="write_race(write_result*)") + silenced_warnings="write_race(write_result*)", + default_offset=lp.auto) loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim, @@ -269,12 +270,13 @@ class E2PFromCSR(E2PBase): lp.GlobalArg("result", None, shape="nresults,ntargets", dim_tags="sep,C"), lp.GlobalArg("source_box_starts, source_box_lists,", - None, shape=None), + None, shape=None, offset=lp.auto), "..." ] + [arg.loopy_arg for arg in self.expansion.get_args()], name=self.name, assumptions="ntgt_boxes>=1", - silenced_warnings="write_race(write_result*)") + silenced_warnings="write_race(write_result*)", + default_offset=lp.auto) loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim, diff --git a/sumpy/fmm.py b/sumpy/fmm.py index 51ab40a242702b22d8a5890b40ba87e785c38684..9508a9b8b6814d4ab00a5be7a482e2ea88ba2f0c 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -34,6 +34,14 @@ __doc__ = """Integrates :mod:`boxtree` with :mod:`sumpy`. import pyopencl as cl import pyopencl.array # noqa +from pytools import memoize_method + +from sumpy import ( + P2EFromSingleBox, P2EFromCSR, + E2PFromSingleBox, E2PFromCSR, + P2PFromCSR, + E2EFromCSR, E2EFromChildren, E2EFromParent) + # {{{ expansion wrangler code container @@ -46,33 +54,77 @@ class SumpyExpansionWranglerCodeContainer(object): """ def __init__(self, cl_context, - multipole_expansion, local_expansion, out_kernels): - m_expn = self.multipole_expansion = multipole_expansion - l_expn = self.local_expansion = local_expansion + multipole_expansion_factory, + local_expansion_factory, out_kernels): + """ + :arg multipole_expansion_factory: a callable of a single argument (order) + that returns a multipole expansion. + :arg local_expansion_factory: a callable of a single argument (order) + that returns a local expansion. + """ + self.multipole_expansion_factory = multipole_expansion_factory + self.local_expansion_factory = local_expansion_factory self.out_kernels = out_kernels self.cl_context = cl_context - from sumpy import ( - P2EFromSingleBox, P2EFromCSR, - E2PFromSingleBox, E2PFromCSR, - P2PFromCSR, - E2EFromCSR, E2EFromChildren, E2EFromParent) - self.p2m = P2EFromSingleBox(cl_context, m_expn) - self.p2l = P2EFromCSR(cl_context, l_expn) - self.m2m = E2EFromChildren(cl_context, m_expn, m_expn) - self.m2l = E2EFromCSR(cl_context, m_expn, l_expn) - self.l2l = E2EFromParent(cl_context, l_expn, l_expn) - self.m2p = E2PFromCSR(cl_context, m_expn, out_kernels) - self.l2p = E2PFromSingleBox(cl_context, l_expn, out_kernels) - + @memoize_method + def multipole_expansion(self, order): + return self.multipole_expansion_factory(order) + + @memoize_method + def local_expansion(self, order): + return self.local_expansion_factory(order) + + @memoize_method + def p2m(self, tgt_order): + return P2EFromSingleBox(self.cl_context, + self.multipole_expansion(tgt_order)) + + @memoize_method + def p2l(self, tgt_order): + return P2EFromCSR(self.cl_context, + self.local_expansion(tgt_order)) + + @memoize_method + def m2m(self, src_order, tgt_order): + return E2EFromChildren(self.cl_context, + self.multipole_expansion(src_order), + self.multipole_expansion(tgt_order)) + + @memoize_method + def m2l(self, src_order, tgt_order): + return E2EFromCSR(self.cl_context, + self.multipole_expansion(src_order), + self.local_expansion(tgt_order)) + + @memoize_method + def l2l(self, src_order, tgt_order): + return E2EFromParent(self.cl_context, + self.local_expansion(src_order), + self.local_expansion(tgt_order)) + + @memoize_method + def m2p(self, src_order): + return E2PFromCSR(self.cl_context, + self.multipole_expansion(src_order), + self.out_kernels) + + @memoize_method + def l2p(self, src_order): + return E2PFromSingleBox(self.cl_context, + self.local_expansion(src_order), + self.out_kernels) + + @memoize_method + def p2p(self): # FIXME figure out what to do about exclude_self - self.p2p = P2PFromCSR(cl_context, out_kernels, exclude_self=False) + return P2PFromCSR(self.cl_context, self.out_kernels, exclude_self=False) - def get_wrangler(self, queue, tree, dtype, + def get_wrangler(self, queue, tree, dtype, level_to_order, source_extra_kwargs={}, kernel_extra_kwargs=None): - return SumpyExpansionWrangler(self, queue, tree, dtype, + return SumpyExpansionWrangler(self, queue, tree, dtype, level_to_order, source_extra_kwargs, kernel_extra_kwargs) # }}} @@ -80,6 +132,15 @@ class SumpyExpansionWranglerCodeContainer(object): # {{{ expansion wrangler +def _enqueue_barrier(queue, wait_for): + if queue.device.platform.name == "Portable Computing Language": + # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList + queue.finish() + return cl.enqueue_marker() + else: + return cl.enqueue_barrier(queue, wait_for=wait_for) + + class SumpyExpansionWrangler(object): """Implements the :class:`boxtree.fmm.ExpansionWranglerInterface` by using :mod:`sumpy` expansions/translations. @@ -95,14 +156,19 @@ class SumpyExpansionWrangler(object): expansions, but not source particles. """ - def __init__(self, code_container, queue, tree, dtype, + def __init__(self, code_container, queue, tree, dtype, level_to_order, source_extra_kwargs, kernel_extra_kwargs=None): self.code = code_container self.queue = queue self.tree = tree + self.dtype = dtype + if not callable(level_to_order): + raise TypeError("level_to_order not passed") + self.level_to_order = level_to_order + if kernel_extra_kwargs is None: kernel_extra_kwargs = {} @@ -112,26 +178,26 @@ class SumpyExpansionWrangler(object): self.extra_kwargs = source_extra_kwargs.copy() self.extra_kwargs.update(self.kernel_extra_kwargs) - @property - def multipole_expansion(self): - return self.code.multipole_expansion - - @property - def local_expansion(self): - return self.code.local_expansion + self.level_queues = [ + cl.CommandQueue(self.code.cl_context) + for i in range(self.tree.nlevels)] # {{{ data vector utilities def multipole_expansion_zeros(self): + order = self.level_to_order(0) # AAARGH FIXME + m_expn = self.code.multipole_expansion_factory(order) return cl.array.zeros( self.queue, - (self.tree.nboxes, len(self.multipole_expansion)), + (self.tree.nboxes, len(m_expn)), dtype=self.dtype) def local_expansion_zeros(self): + order = self.level_to_order(0) # AAARGH FIXME + l_expn = self.code.local_expansion_factory(order) return cl.array.zeros( self.queue, - (self.tree.nboxes, len(self.local_expansion)), + (self.tree.nboxes, len(l_expn)), dtype=self.dtype) def potential_zeros(self): @@ -177,37 +243,75 @@ class SumpyExpansionWrangler(object): # }}} - def form_multipoles(self, source_boxes, src_weights): + def form_multipoles(self, + level_start_source_box_nrs, source_boxes, + src_weights): mpoles = self.multipole_expansion_zeros() kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) - evt, (mpoles_res,) = self.code.p2m(self.queue, - source_boxes=source_boxes, - centers=self.tree.box_centers, - strengths=src_weights, - expansions=mpoles, + events = [] + for lev in range(self.tree.nlevels): + p2m = self.code.p2m(self.level_to_order(lev)) + start, stop = level_start_source_box_nrs[lev:lev+2] + if start == stop: + continue - **kwargs) + evt, (mpoles_res,) = p2m( + self.level_queues[lev], + source_boxes=source_boxes[start:stop], + centers=self.tree.box_centers, + strengths=src_weights, + expansions=mpoles, + + wait_for=src_weights.events, + + **kwargs) + + assert mpoles_res is mpoles + + events.append(evt) - assert mpoles_res is mpoles + evt = _enqueue_barrier(self.queue, wait_for=events) + mpoles.add_event(evt) return mpoles - def coarsen_multipoles(self, parent_boxes, mpoles): - if not len(parent_boxes): - return mpoles + def coarsen_multipoles(self, + level_start_source_parent_box_nrs, + source_parent_boxes, + mpoles): + tree = self.tree - evt, (mpoles_res,) = self.code.m2m(self.queue, - expansions=mpoles, - target_boxes=parent_boxes, - box_child_ids=self.tree.box_child_ids, - centers=self.tree.box_centers, + # 2 is the last relevant source_level. + # 1 is the last relevant target_level. + # (Nobody needs a multipole on level 0, i.e. for the root box.) + for source_level in range(tree.nlevels-1, 1, -1): + start, stop = level_start_source_parent_box_nrs[ + source_level:source_level+2] + if start == stop: + continue - **self.kernel_extra_kwargs) + target_level = source_level - 1 + assert target_level > 0 + + m2m = self.code.m2m( + self.level_to_order(source_level), + self.level_to_order(target_level)) + + evt, (mpoles_res,) = m2m(self.queue, + expansions=mpoles, + target_boxes=source_parent_boxes[start:stop], + box_child_ids=self.tree.box_child_ids, + centers=self.tree.box_centers, + + **self.kernel_extra_kwargs) + + assert mpoles_res is mpoles + + mpoles.add_event(evt) - assert mpoles_res is mpoles return mpoles def eval_direct(self, target_boxes, source_box_starts, @@ -218,7 +322,7 @@ class SumpyExpansionWrangler(object): kwargs.update(self.box_source_list_kwargs()) kwargs.update(self.box_target_list_kwargs()) - evt, pot_res = self.code.p2p(self.queue, + evt, pot_res = self.code.p2p()(self.queue, target_boxes=target_boxes, source_box_starts=source_box_starts, source_box_lists=source_box_lists, @@ -229,100 +333,183 @@ class SumpyExpansionWrangler(object): for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i + pot_i.add_event(evt) return pot - def multipole_to_local(self, target_or_target_parent_boxes, - starts, lists, mpole_exps): + def multipole_to_local(self, + level_start_target_box_nrs, + target_boxes, src_box_starts, src_box_lists, + mpole_exps): local_exps = self.local_expansion_zeros() - evt, (local_exps_res,) = self.code.m2l(self.queue, - src_expansions=mpole_exps, - target_boxes=target_or_target_parent_boxes, - src_box_starts=starts, - src_box_lists=lists, - centers=self.tree.box_centers, - tgt_expansions=local_exps, + events = [] + for lev in range(self.tree.nlevels): + start, stop = level_start_target_box_nrs[lev:lev+2] + if start == stop: + continue - **self.kernel_extra_kwargs) + order = self.level_to_order(lev) + m2l = self.code.m2l(order, order) - assert local_exps_res is local_exps + evt, (local_exps_res,) = m2l( + self.level_queues[lev], + src_expansions=mpole_exps, + target_boxes=target_boxes[start:stop], + src_box_starts=src_box_starts[start:stop], + src_box_lists=src_box_lists, + centers=self.tree.box_centers, + tgt_expansions=local_exps, + + wait_for=mpole_exps.events, + + **self.kernel_extra_kwargs) + + assert local_exps_res is local_exps + events.append(evt) + + evt = _enqueue_barrier(self.queue, wait_for=events) + local_exps.add_event(evt) return local_exps - def eval_multipoles(self, target_boxes, source_box_starts, + def eval_multipoles(self, + level_start_target_box_nrs, + target_boxes, source_box_starts, source_box_lists, mpole_exps): pot = self.potential_zeros() kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) - evt, pot_res = self.code.m2p(self.queue, - expansions=mpole_exps, - target_boxes=target_boxes, - source_box_starts=source_box_starts, - source_box_lists=source_box_lists, - centers=self.tree.box_centers, - result=pot, + events = [] + for lev in range(self.tree.nlevels): + start, stop = level_start_target_box_nrs[lev:lev+2] + if start == stop: + continue - **kwargs) + m2p = self.code.m2p(self.level_to_order(lev)) - for pot_i, pot_res_i in zip(pot, pot_res): - assert pot_i is pot_res_i + evt, pot_res = m2p( + self.level_queues[lev], + expansions=mpole_exps, + target_boxes=target_boxes[start:stop], + source_box_starts=source_box_starts[start:stop], + source_box_lists=source_box_lists, + centers=self.tree.box_centers, + result=pot, + + wait_for=mpole_exps.events, + + **kwargs) + + for pot_i, pot_res_i in zip(pot, pot_res): + assert pot_i is pot_res_i + events.append(evt) + + evt = _enqueue_barrier(self.queue, wait_for=events) + for pot_i in pot: + pot_i.add_event(evt) return pot - def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights): + def form_locals(self, + level_start_target_or_target_parent_box_nrs, + target_or_target_parent_boxes, starts, lists, src_weights): local_exps = self.local_expansion_zeros() kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) - evt, (result,) = self.code.p2l(self.queue, - target_boxes=target_or_target_parent_boxes, - source_box_starts=starts, - source_box_lists=lists, - centers=self.tree.box_centers, - strengths=src_weights, - expansions=local_exps, + events = [] + for lev in range(self.tree.nlevels): + start, stop = \ + level_start_target_or_target_parent_box_nrs[lev:lev+2] + if start == stop: + continue - **kwargs) + p2l = self.code.p2l(self.level_to_order(lev)) + + evt, (result,) = p2l( + self.level_queues[lev], + target_boxes=target_or_target_parent_boxes[start:stop], + source_box_starts=starts[start:stop], + source_box_lists=lists, + centers=self.tree.box_centers, + strengths=src_weights, + expansions=local_exps, + + wait_for=src_weights.events, - assert local_exps is result + **kwargs) + + assert local_exps is result + events.append(evt) + + evt = _enqueue_barrier(self.queue, wait_for=events) + result.add_event(evt) return result - def refine_locals(self, child_boxes, local_exps): - if not len(child_boxes): - return local_exps + def refine_locals(self, + level_start_target_or_target_parent_box_nrs, + target_or_target_parent_boxes, + local_exps): + for target_lev in range(1, self.tree.nlevels): + start, stop = level_start_target_or_target_parent_box_nrs[ + target_lev:target_lev+2] + if start == stop: + continue + + source_lev = target_lev - 1 + l2l = self.code.l2l( + self.level_to_order(source_lev), + self.level_to_order(target_lev)) - evt, (local_exps_res,) = self.code.l2l(self.queue, - expansions=local_exps, - target_boxes=child_boxes, - box_parent_ids=self.tree.box_parent_ids, - centers=self.tree.box_centers, + evt, (local_exps_res,) = l2l(self.queue, + expansions=local_exps, + target_boxes=target_or_target_parent_boxes[start:stop], + box_parent_ids=self.tree.box_parent_ids, + centers=self.tree.box_centers, - **self.kernel_extra_kwargs) + **self.kernel_extra_kwargs) + + local_exps.add_event(evt) assert local_exps_res is local_exps return local_exps - def eval_locals(self, target_boxes, local_exps): + def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): pot = self.potential_zeros() kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) - evt, pot_res = self.code.l2p(self.queue, - expansions=local_exps, - target_boxes=target_boxes, - centers=self.tree.box_centers, - result=pot, + events = [] + for lev in range(self.tree.nlevels): + start, stop = level_start_target_box_nrs[lev:lev+2] + if start == stop: + continue - **kwargs) + l2p = self.code.l2p(self.level_to_order(lev)) + evt, pot_res = l2p( + self.level_queues[lev], + expansions=local_exps, + target_boxes=target_boxes[start:stop], + centers=self.tree.box_centers, + result=pot, - for pot_i, pot_res_i in zip(pot, pot_res): - assert pot_i is pot_res_i + wait_for=local_exps.events, + + **kwargs) + + for pot_i, pot_res_i in zip(pot, pot_res): + assert pot_i is pot_res_i + events.append(evt) + + evt = _enqueue_barrier(self.queue, wait_for=events) + for pot_i in pot: + pot_i.add_event(evt) return pot diff --git a/sumpy/p2e.py b/sumpy/p2e.py index c0b3f50df5801d7b202dda65288f634a7f42f3de..5026d61e56b4faf7b4fdd52c618071591efbeea6 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -153,7 +153,8 @@ class P2EFromSingleBox(P2EBase): ] + gather_loopy_source_arguments([self.expansion]), name=self.name, assumptions="nsrc_boxes>=1", - silenced_warnings="write_race(write_expn*)") + silenced_warnings="write_race(write_expn*)", + default_offset=lp.auto) loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim) @@ -201,7 +202,7 @@ class P2EFromCSR(P2EBase): dim_tags="sep,c"), lp.GlobalArg("strengths", None, shape="nsources"), lp.GlobalArg("source_box_starts,source_box_lists", - None, shape=None), + None, shape=None, offset=lp.auto), lp.GlobalArg("box_source_starts,box_source_counts_nonchild", None, shape=None), lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), @@ -251,7 +252,8 @@ class P2EFromCSR(P2EBase): arguments, name=self.name, assumptions="ntgt_boxes>=1", - silenced_warnings="write_race(write_expn*)") + silenced_warnings="write_race(write_expn*)", + default_offset=lp.auto) loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim) diff --git a/sumpy/version.py b/sumpy/version.py index 7d9527a8c91c126962dfa5ebf8504eb84fd8dc3b..5cc1934ddcfbfd8b5e3925d887d5f6f321aa4447 100644 --- a/sumpy/version.py +++ b/sumpy/version.py @@ -25,4 +25,4 @@ VERSION = (2016, 1) VERSION_STATUS = "beta1" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -KERNEL_VERSION = 6 +KERNEL_VERSION = 7 diff --git a/test/test_fmm.py b/test/test_fmm.py index 8144f7e3832333230d4ace30842b98e9aea66296..63582078d3e8202fbd019891c36509f9e3702c7a 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -142,15 +142,18 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class): elif knl.dim == 2 and issubclass(local_expn_class, H2DLocalExpansion): order_values = [10, 12] + from functools import partial for order in order_values: - mpole_expn = mpole_expn_class(knl, order) - local_expn = local_expn_class(knl, order) out_kernels = [knl] from sumpy.fmm import SumpyExpansionWranglerCodeContainer wcc = SumpyExpansionWranglerCodeContainer( - ctx, mpole_expn, local_expn, out_kernels) + ctx, + partial(mpole_expn_class, knl), + partial(local_expn_class, knl), + out_kernels) wrangler = wcc.get_wrangler(queue, tree, dtype, + level_to_order=lambda lev: order, kernel_extra_kwargs=extra_kwargs) from boxtree.fmm import drive_fmm