diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 3168c87a7f135346bd7cc36804916e754998dc20..61329a140f6b2a5468d2a1490bf60de5a2f8c91c 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -142,10 +142,10 @@ class E2EFromCSR(E2EBase): coeff${TGT_COEFFIDX} {id_prefix=write_expn} """], [ - lp.GlobalArg("centers", None, shape="dim, nboxes"), + lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), lp.GlobalArg("src_box_starts, src_box_lists", None, shape=None, strides=(1,)), - lp.ValueArg("nboxes", np.int32), + lp.ValueArg("aligned_nboxes,nboxes", np.int32), lp.GlobalArg("src_expansions", None, shape=("nboxes", ncoeff_src)), lp.GlobalArg("tgt_expansions", None, @@ -218,7 +218,7 @@ class E2EFromChildren(E2EBase): <> tgt_center[idim] = centers[idim, tgt_ibox] \ {id=fetch_tgt_center} - <> src_ibox = box_child_ids[isrc_box,itgt_box] + <> src_ibox = box_child_ids[isrc_box,tgt_ibox] <> is_src_box_valid = src_ibox != 0 <> src_center[idim] = centers[idim, src_ibox] \ @@ -312,11 +312,12 @@ class E2EFromParent(E2EBase): expansions[src_ibox, ${COEFFIDX}] expansions[tgt_ibox, ${COEFFIDX}] = \ expansions[tgt_ibox, ${COEFFIDX}] + coeff${COEFFIDX} \ - {id_prefix=write_expn,if=is_src_box_valid} + {id_prefix=write_expn} """], [ - lp.GlobalArg("centers", None, shape="dim, nboxes"), - lp.ValueArg("nboxes", np.int32), + lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), + lp.ValueArg("naligned_boxes,nboxes", np.int32), + lp.GlobalArg("box_parent_ids", None, shape="nboxes"), lp.GlobalArg("expansions", None, shape=("nboxes", ncoeffs)), "..." diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 18cc8f8ea764afc6ea2bc665a0519aed3b5bab3d..1dea621c89815d39fee4a2625e22c2d4de44a348 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -105,29 +105,11 @@ class E2PBase(object): # {{{ box-local E2P (L2P, likely) class E2PFromLocal(E2PBase): - @memoize_method def get_kernel(self): ncoeffs = len(self.expansion) loopy_insns, result_names = self.get_loopy_insns_and_result_names() - arguments = ( - [ - lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), - lp.GlobalArg("box_target_starts,box_target_counts_nonchild", - None, shape=None), - lp.GlobalArg("centers", None, shape="dim, nboxes"), - lp.GlobalArg("expansions", None, - shape=("nboxes", ncoeffs)), - lp.ValueArg("nboxes", np.int32), - lp.ValueArg("ntargets", np.int32), - "..." - ] + [ - lp.GlobalArg("result_%d" % i, None, shape="ntargets") - for i in range(len(result_names)) - ] + self.expansion.get_args() - ) - loopy_knl = lp.make_kernel(self.device, [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[itgt_box] - <> itgt_end = itgt_start+box_target_counts_nonchild[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] <> center[idim] = centers[idim, tgt_ibox] {id=fetch_center} - <> b[idim] = targets[idim, itgt] - center[idim] + <> b[idim] = targets[idim, itgt] - center[idim] \ + {id=compute_b} <> coeff${COEFFIDX} = expansions[tgt_ibox, ${COEFFIDX}] - result_${RESULTIDX}[itgt] = \ + result[${RESULTIDX},itgt] = \ kernel_scaling * result_${RESULTIDX}_p """], - arguments, + [ + lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"), + dim_tags="sep,C"), + lp.GlobalArg("box_target_starts,box_target_counts_nonchild", + None, shape=None), + lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), + lp.GlobalArg("result", None, shape="nresults, ntargets", + dim_tags="sep,C"), + lp.GlobalArg("expansions", None, shape=("nboxes", ncoeffs)), + lp.ValueArg("nboxes,naligned_boxes", np.int32), + lp.ValueArg("ntargets", np.int32), + "..." + ] + self.expansion.get_args(), name=self.name, assumptions="ntgt_boxes>=1", defines=dict( dim=self.dim, COEFFIDX=[str(i) for i in xrange(ncoeffs)], RESULTIDX=[str(i) for i in xrange(len(result_names))], + nresults=len(result_names), ) ) - loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center", + loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "compute_b", tags={"idim": "unr"}) loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) @@ -185,60 +181,59 @@ class E2PFromLocal(E2PBase): # {{{ E2P from CSR-like interaction list class E2PFromCSR(E2PBase): - @memoize_method def get_kernel(self): ncoeffs = len(self.expansion) loopy_insns, result_names = self.get_loopy_insns_and_result_names() - arguments = ( - [ - lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), - lp.GlobalArg("box_target_starts,box_target_counts_nonchild", - None, shape=None), - lp.GlobalArg("centers", None, shape="dim, nboxes"), - lp.GlobalArg("expansions", None, - shape=("nboxes", ncoeffs)), - lp.ValueArg("nboxes", np.int32), - lp.ValueArg("ntargets", np.int32), - "..." - ] + [ - lp.GlobalArg("result_%d" % i, None, shape="ntargets") - for i in range(len(result_names)) - ] + self.expansion.get_args() - ) - loopy_knl = lp.make_kernel(self.device, [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[itgt_box] - <> itgt_end = itgt_start+box_target_counts_nonchild[itgt_box] - <> tgt[idim] = targets[idim] {id=fetch_tgt} + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] + + <> tgt[idim] = targets[idim,itgt] {id=fetch_tgt} - <> isrc_box_start = source_box_starts[isrc_box] - <> isrc_box_end = source_box_starts[isrc_box+1] + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_end = source_box_starts[itgt_box+1] <> src_ibox = source_box_lists[isrc_box] <> coeff${COEFFIDX} = expansions[src_ibox, ${COEFFIDX}] <> center[idim] = centers[idim, src_ibox] {id=fetch_center} <> b[idim] = tgt[idim] - center[idim] - result[${RESULTIDX, itgt] = \ + result[${RESULTIDX}, itgt] = \ kernel_scaling * sum(isrc_box, result_${RESULTIDX}_p) """], - arguments, + [ + lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"), + dim_tags="sep,C"), + lp.GlobalArg("box_target_starts,box_target_counts_nonchild", + None, shape=None), + lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), + lp.GlobalArg("expansions", None, + shape=("nboxes", ncoeffs)), + lp.ValueArg("nboxes,aligned_nboxes", np.int32), + lp.ValueArg("ntargets", np.int32), + lp.GlobalArg("result", None, shape="nresults,ntargets", + dim_tags="sep,C"), + lp.GlobalArg("source_box_starts, source_box_lists,", + None, shape=None), + "..." + ] + self.expansion.get_args(), name=self.name, assumptions="ntgt_boxes>=1", defines=dict( dim=self.dim, COEFFIDX=[str(i) for i in xrange(ncoeffs)], RESULTIDX=[str(i) for i in xrange(len(result_names))], + nresults=len(result_names), ) ) @@ -246,6 +241,7 @@ class E2PFromCSR(E2PBase): tags={"idim": "unr"}) loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center", tags={"idim": "unr"}) + loopy_knl = lp.set_loop_priority(loopy_knl, "itgt_box,itgt,isrc_box") loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) return loopy_knl diff --git a/sumpy/fmm.py b/sumpy/fmm.py index 10de0de3745a65013255010e73de668ba543e8e3..2b2f3d2fb96b4271a988a9cf29bf451b4c89547a 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -43,6 +43,7 @@ class SumpyExpansionWranglerCodeContainer(object): self.tree = tree m_expn = self.multipole_expansion = multipole_expansion l_expn = self.local_expansion = local_expansion + self.out_kernels = out_kernels self.cl_context = cl_context @@ -100,18 +101,25 @@ class SumpyExpansionWrangler(object): dtype=self.dtype) def potential_zeros(self): - return tuple( + from pytools.obj_array import make_obj_array + return make_obj_array([ cl.array.zeros( self.queue, self.tree.ntargets, dtype=self.dtype) - for k in self.out_kernels) + for k in self.code.out_kernels]) def reorder_src_weights(self, src_weights): return src_weights.with_queue(self.queue)[self.tree.user_source_ids] def reorder_potentials(self, potentials): - return potentials.with_queue(self.queue)[self.tree.sorted_target_ids] + from pytools.obj_array import is_obj_array, with_object_array_or_scalar + assert is_obj_array(potentials) + + def reorder(x): + return x.with_queue(self.queue)[self.tree.sorted_target_ids] + + return with_object_array_or_scalar(reorder, potentials) def form_multipoles(self, source_boxes, src_weights): mpoles = self.multipole_expansion_zeros() @@ -132,6 +140,9 @@ class SumpyExpansionWrangler(object): return mpoles def coarsen_multipoles(self, parent_boxes, mpoles): + if not len(parent_boxes): + return mpoles + evt, (mpoles_res,) = self.code.m2m(self.queue, expansions=mpoles, target_boxes=parent_boxes, @@ -150,9 +161,11 @@ class SumpyExpansionWrangler(object): target_boxes=target_boxes, source_box_starts=source_box_starts, source_box_lists=source_box_lists, - strengths=(src_weights,), + strength=(src_weights,), box_target_starts=self.tree.box_target_starts, box_target_counts_nonchild=self.tree.box_target_counts_nonchild, + box_source_starts=self.tree.box_source_starts, + box_source_counts_nonchild=self.tree.box_source_counts_nonchild, sources=self.tree.sources, targets=self.tree.targets, result=pot, @@ -162,18 +175,18 @@ class SumpyExpansionWrangler(object): for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - return pot_res + return pot def multipole_to_local(self, target_or_target_parent_boxes, starts, lists, mpole_exps): local_exps = self.local_expansion_zeros() - evt, (local_exps_res,) = self.m2l(self.queue, + evt, (local_exps_res,) = self.code.m2l(self.queue, src_expansions=mpole_exps, target_boxes=target_or_target_parent_boxes, src_box_starts=starts, src_box_lists=lists, - centers=self.tree.centers, + centers=self.tree.box_centers, tgt_expansions=local_exps, **self.extra_kwargs) @@ -182,19 +195,20 @@ class SumpyExpansionWrangler(object): return local_exps - def eval_multipoles(self, target_boxes, sep_smaller_nonsiblings_starts, - sep_smaller_nonsiblings_lists, mpole_exps): + def eval_multipoles(self, target_boxes, source_box_starts, + source_box_lists, mpole_exps): pot = self.potential_zeros() - evt, pot_res = self.m2p(self.queue, + evt, pot_res = self.code.m2p(self.queue, expansions=mpole_exps, target_boxes=target_boxes, box_target_starts=self.tree.box_target_starts, box_target_counts_nonchild=self.tree.box_target_counts_nonchild, - source_box_starts=sep_smaller_nonsiblings_starts, - source_box_lists=sep_smaller_nonsiblings_lists, - centers=self.tree.centers, + source_box_starts=source_box_starts, + source_box_lists=source_box_lists, + centers=self.tree.box_centers, targets=self.tree.targets, + result=pot, **self.extra_kwargs) @@ -204,77 +218,54 @@ class SumpyExpansionWrangler(object): return pot def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights): - local_exps = self.expansion_zeros() + local_exps = self.local_expansion_zeros() - evt, (local_exps,) = self.p2l(self.queue, - source_boxes=source_boxes, - box_source_starts=box_source_starts, - box_source_counts_nonchild=box_source_counts_nonchild, - centers=self.tree.centers, + evt, (result,) = self.code.p2l(self.queue, + target_boxes=target_or_target_parent_boxes, + source_box_starts=starts, + source_box_lists=lists, + box_source_starts=self.tree.box_source_starts, + box_source_counts_nonchild=self.tree.box_source_counts_nonchild, + centers=self.tree.box_centers, sources=self.tree.sources, strengths=src_weights, + expansions=local_exps, **self.extra_kwargs) - from pyfmmlib import h2dformta - - for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes): - start, end = starts[itgt_box:itgt_box+2] - - contrib = 0 - - for src_ibox in lists[start:end]: - src_pslice = self._get_source_slice(src_ibox) - tgt_center = self.tree.box_centers[:, tgt_ibox] - if src_pslice.stop - src_pslice.start == 0: - continue + assert local_exps is result - ier, mpole = h2dformta( - self.helmholtz_k, rscale, - self._get_sources(src_pslice), src_weights[src_pslice], - tgt_center, self.nterms) - if ier: - raise RuntimeError("h2dformta failed") - - contrib = contrib + mpole - - local_exps[tgt_ibox] = contrib - - return local_exps + return result def refine_locals(self, child_boxes, local_exps): - from pyfmmlib import h2dlocloc_vec - - for tgt_ibox in child_boxes: - tgt_center = self.tree.box_centers[:, tgt_ibox] - src_ibox = self.tree.box_parent_ids[tgt_ibox] - src_center = self.tree.box_centers[:, src_ibox] + if not len(child_boxes): + return local_exps - tmp_loc_exp = h2dlocloc_vec( - self.helmholtz_k, - rscale, src_center, local_exps[src_ibox], - rscale, tgt_center, self.nterms)[:, 0] - - local_exps[tgt_ibox] += tmp_loc_exp + evt, (local_exps_res,) = self.code.l2l(self.queue, + expansions=local_exps, + target_boxes=child_boxes, + box_parent_ids=self.tree.box_parent_ids, + centers=self.tree.box_centers, + **self.extra_kwargs) + assert local_exps_res is local_exps return local_exps def eval_locals(self, target_boxes, local_exps): pot = self.potential_zeros() - rscale = 1 # FIXME - - from pyfmmlib import h2dtaeval_vec - for tgt_ibox in target_boxes: - tgt_pslice = self._get_target_slice(tgt_ibox) - - if tgt_pslice.stop - tgt_pslice.start == 0: - continue + evt, pot_res = self.code.l2p(self.queue, + expansions=local_exps, + target_boxes=target_boxes, + box_target_starts=self.tree.box_target_starts, + box_target_counts_nonchild=self.tree.box_target_counts_nonchild, + centers=self.tree.box_centers, + targets=self.tree.targets, + result=pot, - tmp_pot, _, _ = h2dtaeval_vec(self.helmholtz_k, rscale, - self.tree.box_centers[:, tgt_ibox], local_exps[tgt_ibox], - self._get_targets(tgt_pslice), ifgrad=False, ifhess=False) + **self.extra_kwargs) - pot[tgt_pslice] += tmp_pot + for pot_i, pot_res_i in zip(pot, pot_res): + assert pot_i is pot_res_i return pot diff --git a/sumpy/p2e.py b/sumpy/p2e.py index 9bd731d6a7e3bdab6beb8e4563d47d0abb277f94..80cbd770e5bd983d49132ee74c20871be57037ae 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -85,7 +85,6 @@ class P2EBase(object): # {{{ P2E from local boxes class P2EFromLocal(P2EBase): - @memoize_method def get_kernel(self): ncoeffs = len(self.expansion) @@ -98,8 +97,8 @@ class P2EFromLocal(P2EBase): self.get_looy_instructions() + [""" <> src_ibox = source_boxes[isrc_box] - <> isrc_start = box_source_starts[isrc_box] - <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] <> center[idim] = centers[idim, src_ibox] {id=fetch_center} <> a[idim] = center[idim] - sources[idim, isrc] \ {id=compute_a} @@ -109,7 +108,8 @@ class P2EFromLocal(P2EBase): {id_prefix=write_expn} """], [ - lp.GlobalArg("sources", None, shape=(self.dim, "nsources")), + lp.GlobalArg("sources", None, shape=(self.dim, "nsources"), + dim_tags="sep,c"), lp.GlobalArg("strengths", None, shape="nsources"), lp.GlobalArg("box_source_starts,box_source_counts_nonchild", None, shape=None), @@ -131,7 +131,6 @@ class P2EFromLocal(P2EBase): loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center", tags={"idim": "unr"}) loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr")) - loopy_knl = lp.tag_data_axes(loopy_knl, "sources", "sep,c") return loopy_knl @@ -162,22 +161,23 @@ class P2EFromLocal(P2EBase): # {{{ P2E from CSR-like interaction list class P2EFromCSR(P2EBase): - @memoize_method def get_kernel(self): ncoeffs = len(self.expansion) - # FIXXME from sumpy.tools import gather_source_arguments arguments = ( [ - lp.GlobalArg("sources", None, shape=(self.dim, "nsources")), + lp.GlobalArg("sources", None, shape=(self.dim, "nsources"), + dim_tags="sep,c"), lp.GlobalArg("strengths", None, shape="nsources"), + lp.GlobalArg("source_box_starts,source_box_lists", + None, shape=None), lp.GlobalArg("box_source_starts,box_source_counts_nonchild", None, shape=None), - lp.GlobalArg("centers", None, shape="dim, nboxes"), + lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), lp.GlobalArg("expansions", None, shape=("nboxes", ncoeffs)), - lp.ValueArg("nboxes", np.int32), + lp.ValueArg("naligned_boxes,nboxes", np.int32), lp.ValueArg("nsources", np.int32), "..." ] + gather_source_arguments([self.expansion])) @@ -185,14 +185,20 @@ class P2EFromCSR(P2EBase): loopy_knl = lp.make_kernel(self.device, [ "{[itgt_box]: 0<=itgt_box src_ibox = source_boxes[isrc_box] - <> isrc_start = box_source_starts[isrc_box] - <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box] + <> tgt_ibox = target_boxes[itgt_box] + + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_stop = source_box_starts[itgt_box+1] + + <> src_ibox = source_box_lists[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] + <> center[idim] = centers[idim, src_ibox] {id=fetch_center} <> a[idim] = center[idim] - sources[idim, isrc] {id=compute_a} <> strength = strengths[isrc] @@ -201,7 +207,7 @@ class P2EFromCSR(P2EBase): {id_prefix=write_expn} """], arguments, - name=self.name, assumptions="nsrc_boxes>=1", + name=self.name, assumptions="ntgt_boxes>=1", defines=dict( dim=self.dim, COEFFIDX=[str(i) for i in xrange(ncoeffs)] @@ -212,7 +218,6 @@ class P2EFromCSR(P2EBase): loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center", tags={"idim": "unr"}) loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr")) - loopy_knl = lp.tag_data_axes(loopy_knl, "sources", "sep,c") return loopy_knl @@ -220,7 +225,7 @@ class P2EFromCSR(P2EBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() - knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 20c133650388d5da639891c7d3e22ddab772bd94..a0431686c903ca04fdaca189bc3194d42c4b3525 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -86,33 +86,15 @@ class P2PBase(KernelComputation): # {{{ P2P with list of sources and list of targets class P2P(P2PBase): - @memoize_method def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() from pymbolic import var exprs = [ var(name) - * var("strength_%d" % self.strength_usage[i])[var("isrc")] + * var("strength")[self.strength_usage[i], var("isrc")] for i, name in enumerate(result_names)] - from sumpy.tools import gather_source_arguments - arguments = ( - [ - lp.GlobalArg("src", None, - shape=(self.dim, "nsources"), order="C"), - lp.GlobalArg("tgt", None, - shape=(self.dim, "ntargets"), order="C"), - lp.ValueArg("nsources", None), - lp.ValueArg("ntargets", np.int32), - ]+[ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in xrange(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets", order="C") - for i, dtype in enumerate(self.value_dtypes) - ] + gather_source_arguments(self.kernels)) - if self.exclude_self: from pymbolic.primitives import If, ComparisonOperator, Variable exprs = [ @@ -121,49 +103,76 @@ class P2P(P2PBase): expr, 0) for expr in exprs] + from sumpy.tools import gather_source_arguments loopy_knl = lp.make_kernel(self.device, - "{[isrc,itgt,idim]: 0<=itgt d[idim] = tgt[idim,itgt] - src[idim,isrc] {id=compute_d}", + "<> d[idim] = targets[idim,itgt] - sources[idim,isrc] {id=compute_d}", ]+[ lp.ExpressionInstruction(id=None, assignee="pair_result_%d" % i, expression=expr, temp_var_type=lp.auto) for i, expr in enumerate(exprs) ]+[ - "result_${KNLIDX}[itgt] = knl_${KNLIDX}_scaling \ + "result[${KNLIDX}, itgt] = knl_${KNLIDX}_scaling \ * sum(isrc, pair_result_${KNLIDX})" ], - arguments, + [ + lp.GlobalArg("sources", None, + shape=(self.dim, "nsources")), + lp.GlobalArg("targets", None, + shape=(self.dim, "ntargets")), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None), + lp.GlobalArg("strength", None, shape="nstrengths,nsources"), + lp.GlobalArg("result", self.value_dtypes[0], # FIXME + shape="nresults,ntargets", dim_tags="sep,C") + ] + gather_source_arguments(self.kernels), name=self.name, assumptions="nsources>=1 and ntargets>=1", defines=dict( KNLIDX=range(len(exprs)), dim=self.dim, + nstrengths=self.strength_count, + nresults=len(self.kernels), )) - for where in ["compute_a", "compute_b"]: - loopy_knl = lp.duplicate_inames(loopy_knl, "idim", where) + for where in ["compute_d"]: + loopy_knl = lp.duplicate_inames(loopy_knl, "idim", where, + tags=dict(idim="unr")) for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + loopy_knl = lp.tag_data_axes(loopy_knl, "strength", "sep,C") + return loopy_knl - def get_optimized_kernel(self): + @memoize_method + def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME knl = self.get_kernel() + + if sources_is_obj_array: + knl = lp.tag_data_axes(knl, "sources", "sep,C") + if targets_is_obj_array: + knl = lp.tag_data_axes(knl, "targets", "sep,C") + knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl - def __call__(self, queue, targets, sources, src_strengths, **kwargs): - knl = self.get_optimized_kernel() - - for i, sstr in enumerate(src_strengths): - kwargs["strength_%d" % i] = sstr + def __call__(self, queue, targets, sources, strength, **kwargs): + from pytools.obj_array import is_obj_array + knl = self.get_optimized_kernel( + targets_is_obj_array= + is_obj_array(targets) or isinstance(targets, (tuple, list)), + sources_is_obj_array= + is_obj_array(sources) or isinstance(sources, (tuple, list))) - return knl(queue, src=sources, tgt=targets, **kwargs) + return knl(queue, sources=sources, targets=targets, strength=strength, + **kwargs) # }}} @@ -180,27 +189,14 @@ class P2PFromCSR(P2PBase): from pymbolic import var exprs = [ var(name) - * var("strength_%d" % self.strength_usage[i])[var("isrc")] + * var("strength")[self.strength_usage[i], var("isrc")] for i, name in enumerate(result_names)] from sumpy.tools import gather_source_arguments - arguments = ( - [ - lp.GlobalArg("box_target_starts,box_target_counts_nonchild," - "box_source_starts,box_source_counts_nonchild", - None, shape=None) - ]+[ - lp.GlobalArg("strength_%d" % i, None, shape="nsources") - for i in xrange(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets") - for i, dtype in enumerate(self.value_dtypes) - ] + gather_source_arguments(self.kernels)) - loopy_knl = lp.make_kernel(self.device, [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[itgt_box] - <> itgt_end = itgt_start+box_target_counts_nonchild[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] <> isrc_box_start = source_box_starts[itgt_box] <> isrc_box_end = source_box_starts[itgt_box+1] <> src_ibox = source_box_lists[isrc_box] - <> isrc_start = box_source_starts[isrc_box] - <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] <> d[idim] = targets[idim,itgt] - sources[idim,isrc] \ {id=compute_d} @@ -232,17 +228,36 @@ class P2PFromCSR(P2PBase): assignee="pair_result_%d" % i, expression=expr, temp_var_type=lp.auto) for i, expr in enumerate(exprs) - ]+[ ], - arguments, - name=self.name, assumptions="nsources>=1 and ntargets>=1", + [ + lp.GlobalArg("box_target_starts,box_target_counts_nonchild," + "box_source_starts,box_source_counts_nonchild,", + None, shape=None), + lp.GlobalArg("source_box_starts, source_box_lists,", + None, shape=None), + lp.GlobalArg("strength", None, shape="nstrengths,nsources"), + lp.GlobalArg("result", self.value_dtypes[0], # FIXME + shape="nkernels,ntargets", dim_tags="sep,c"), + lp.GlobalArg("targets", None, + shape="dim,ntargets", dim_tags="sep,c"), + lp.GlobalArg("sources", None, + shape="dim,nsources", dim_tags="sep,c"), + lp.ValueArg("nsources", np.int32), + lp.ValueArg("ntargets", np.int32), + "...", + ] + gather_source_arguments(self.kernels), + name=self.name, assumptions="ntgt_boxes>=1", defines=dict( KNLIDX=range(len(exprs)), dim=self.dim, + nstrengths=self.strength_count, + nkernels=len(self.kernels), )) - for where in ["compute_a", "compute_b"]: - loopy_knl = lp.duplicate_inames(loopy_knl, "idim", where) + loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "compute_d", + tags=dict(idim="unr")) + + loopy_knl = lp.tag_data_axes(loopy_knl, "strength", "sep,C") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -252,16 +267,13 @@ class P2PFromCSR(P2PBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() - knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") + #knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") return knl - def __call__(self, queue, targets, sources, src_strengths, **kwargs): + def __call__(self, queue, **kwargs): knl = self.get_optimized_kernel() - for i, sstr in enumerate(src_strengths): - kwargs["strength_%d" % i] = sstr - - return knl(queue, src=sources, tgt=targets, **kwargs) + return knl(queue, **kwargs) # }}} diff --git a/test/test_fmm.py b/test/test_fmm.py index 800682af99f38b21558f7a3e7747617a72c78ae6..fb23350878444fe4dea617867f041c09bf85a5a1 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -33,14 +33,22 @@ import logging logger = logging.getLogger(__name__) +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + + def no_test_sumpy_fmm(ctx_getter): logging.basicConfig(level=logging.INFO) ctx = ctx_getter() queue = cl.CommandQueue(ctx) - nsources = 3000 - ntargets = 1000 + nsources = 500 + ntargets = 50 dim = 2 dtype = np.float64 @@ -76,7 +84,7 @@ def no_test_sumpy_fmm(ctx_getter): from sumpy.expansion.local import VolumeTaylorLocalExpansion from sumpy.kernel import LaplaceKernel - order = 3 + order = 2 knl = LaplaceKernel(dim) mpole_expn = VolumeTaylorMultipoleExpansion(knl, order) local_expn = VolumeTaylorLocalExpansion(knl, order) @@ -88,11 +96,11 @@ def no_test_sumpy_fmm(ctx_getter): wrangler = wcc.get_wrangler(queue, np.float64) from boxtree.fmm import drive_fmm - pot = drive_fmm(trav, wrangler, weights) + pot, = drive_fmm(trav, wrangler, weights) from sumpy import P2P p2p = P2P(ctx, out_kernels, exclude_self=False) - ref_pot = p2p(queue, sources, targets) + evt, (ref_pot,) = p2p(queue, targets, sources, (weights,)) pot = pot.get() ref_pot = ref_pot.get() diff --git a/test/test_kernels.py b/test/test_kernels.py index 6ab925436ba0374e77c1b9cc95aff3c8d34281e7..35dad42ba9664b7f48a0dd2a88c3ea65d7402718 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -382,7 +382,7 @@ def test_translations(ctx_getter, knl): # FIXME: Embarrassing--but we run out of memory for higher orders. orders = [2, 3] else: - orders = [2, 3, 4, 5] + orders = [2, 3, 4] nboxes = centers.shape[-1] for order in orders: @@ -403,8 +403,11 @@ def test_translations(ctx_getter, knl): # {{{ apply P2M p2m_source_boxes = np.array([0], dtype=np.int32) - p2m_box_source_starts = np.array([0], dtype=np.int32) - p2m_box_source_counts_nonchild = np.array([nsources], dtype=np.int32) + + # These are indexed by global box numbers. + p2m_box_source_starts = np.array([0, 0, 0, 0], dtype=np.int32) + p2m_box_source_counts_nonchild = np.array([nsources, 0, 0, 0], + dtype=np.int32) evt, (mpoles,) = p2m(queue, source_boxes=p2m_source_boxes, @@ -476,8 +479,11 @@ def test_translations(ctx_getter, knl): ntargets = targets.shape[-1] l2p_target_boxes = np.array([3], dtype=np.int32) - l2p_box_target_starts = np.array([0], dtype=np.int32) - l2p_box_target_counts_nonchild = np.array([ntargets], dtype=np.int32) + + # These are indexed by global box numbers. + l2p_box_target_starts = np.array([0, 0, 0, 0], dtype=np.int32) + l2p_box_target_counts_nonchild = np.array([0, 0, 0, ntargets], + dtype=np.int32) evt, (pot,) = l2p( queue,