diff --git a/sumpy/p2p.py b/sumpy/p2p.py index d8586cc228e47d6aa9f12d7c7c436726ad28bc2e..a35505ea418b0cb0d2d58cde1cabb53c521d3602 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -40,10 +40,10 @@ __doc__ = """ Particle-to-particle -------------------- -.. autoclass:: P2PComputationBase -.. autoclass:: SingleSrcTgtListP2PBase +.. autoclass:: P2PBase .. autoclass:: P2P .. autoclass:: P2PMatrixGenerator +.. autoclass:: P2PMatrixBlockGenerator .. autoclass:: P2PFromCSR """ @@ -54,7 +54,7 @@ Particle-to-particle # {{{ p2p base class -class P2PComputationBase(KernelComputation, KernelCacheWrapper): +class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, options=[], name=None, device=None): @@ -104,17 +104,27 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): return loopy_insns, result_names - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength").index((self.strength_usage[kernel_idx], isrc)) -# }}} + def get_exprs(self, results): + from pymbolic import var + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(results)] -# {{{ P2P with list of sources and list of targets + if self.exclude_self: + from pymbolic.primitives import If, Variable + exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] + + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] -class SingleSrcTgtListP2PBase(P2PComputationBase): - def get_src_tgt_arguments(self): + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments return [ lp.GlobalArg("sources", None, shape=(self.dim, "nsources")), @@ -122,75 +132,13 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): shape=(self.dim, "ntargets")), lp.ValueArg("nsources", np.int32), lp.ValueArg("ntargets", np.int32) - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" + ] + ([ + lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] + if self.exclude_self else []) + ( + gather_loopy_source_arguments(self.kernels)) def get_kernel(self): - loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"] - + ["<> is_self = (isrc == target_to_source[itgt])" - if self.exclude_self else ""] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - assumptions=self.get_assumptions(), - name=self.name, - fixed_parameters=dict( - dim=self.dim, - nstrengths=self.strength_count, - nresults=len(self.kernels))) - - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - - for knl in self.kernels: - loopy_knl = knl.prepare_loopy_kernel(loopy_knl) - - return loopy_knl + raise NotImplementedError def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -204,32 +152,60 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) # }}} # {{{ P2P point-interaction calculation -class P2P(SingleSrcTgtListP2PBase): +class P2P(P2PBase): default_name = "p2p_apply" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength").index((self.strength_usage[kernel_idx], isrc)) - - def get_input_and_output_arguments(self): - return [ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ lp.GlobalArg("strength", None, - shape="nstrengths,nsources", dim_tags="sep,C"), + shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults,ntargets", dim_tags="sep,C") - ] + shape="nresults, ntargets", dim_tags="sep,C")]) - def get_result_store_instructions(self): - return [""" - result[{i}, itgt] = knl_{i}_scaling \ - * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.kernels))] + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + + exprs + + [""" + result[{i}, itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict( + dim=self.dim, + nstrengths=self.strength_count, + nresults=len(self.kernels))) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, strength, **kwargs): from pytools.obj_array import is_obj_array @@ -247,26 +223,49 @@ class P2P(SingleSrcTgtListP2PBase): # {{{ P2P matrix writer -class P2PMatrixGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixGenerator(P2PBase): default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + exprs + + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim)) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, **kwargs): from pytools.obj_array import is_obj_array @@ -281,74 +280,73 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase): # }}} -# {{{ +# {{{ P2P matrix block writer -class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixBlockGenerator(P2PBase): default_name = "p2p_block" - def get_src_tgt_arguments(self): - return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), lp.ValueArg("nsrcindices", np.int32), lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None) - ] - - def get_domains(self): - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + lp.ValueArg("nranges", None)] + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" for irange <> tgtstart = tgtranges[irange] <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart + <> ntgtblock = tgtend - tgtstart <> srcstart = srcranges[irange] <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ + sources[idim, srcindices[srcstart + isrc]] + """] + [""" + <> is_self = (srcindices[srcstart + isrc] == + target_to_source[tgtindices[tgtstart + itgt]]) + """ if self.exclude_self else ""] + + loopy_insns + exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} {{inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + [""" end end - """ - ] - - def get_strength_or_not(self, isrc, kernel_idx): - return 1 + """], + arguments, + assumptions="nranges>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim)) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - def get_result_store_instructions(self): - return [ - """ - result_{i}[tgtstart + j, srcstart + k] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} - """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -380,39 +378,42 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): # {{{ P2P from CSR-like interaction list -class P2PFromCSR(P2PComputationBase): +class P2PFromCSR(P2PBase): default_name = "p2p_from_csr" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - from pymbolic import var - exprs = [ - var(name) - * var("strength").index((self.strength_usage[i], var("isrc"))) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ + lp.GlobalArg("box_target_starts", None, shape=None), + lp.GlobalArg("box_target_counts_nonchild", None, shape=None), + lp.GlobalArg("box_source_starts", None, shape=None), + lp.GlobalArg("box_source_counts_nonchild", None, shape=None), + lp.GlobalArg("source_box_starts", None, shape=None), + lp.GlobalArg("source_box_lists", None, shape=None), + lp.GlobalArg("strength", None, + shape="nstrengths, nsources", dim_tags="sep,C"), + lp.GlobalArg("result", None, + shape="nkernels, ntargets", dim_tags="sep,C"), + "..." + ]) from sumpy.tools import gather_loopy_source_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box<ntgt_boxes}", - "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}", - "{[itgt,isrc,idim]: \ - itgt_start<=itgt<itgt_end and \ - isrc_start<=isrc<isrc_end and \ - 0<=idim<dim }", - ], + loopy_knl = lp.make_kernel([ + "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", + "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}", + "{[itgt, isrc, idim]: \ + itgt_start <= itgt < itgt_end and \ + isrc_start <= isrc < isrc_end and \ + 0 <= idim < dim}", + ], self.get_kernel_scaling_assignments() - + [ - """ + + [""" for itgt_box <> tgt_ibox = target_boxes[itgt_box] <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] <> isrc_box_start = source_box_starts[itgt_box] <> isrc_box_end = source_box_starts[itgt_box+1] @@ -420,62 +421,38 @@ class P2PFromCSR(P2PComputationBase): for isrc_box <> src_ibox = source_box_lists[isrc_box] <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] for itgt - for isrc - <> d[idim] = \ - targets[idim,itgt] - sources[idim,isrc] \ - {dup=idim} - """] + [""" - <> is_self = (isrc == target_to_source[itgt]) - """ if self.exclude_self else ""] + [ - ] + loopy_insns + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] + [ - """ - end - """] + [""" - result[KNLIDX, itgt] = result[KNLIDX, itgt] + \ - knl_KNLIDX_scaling \ - * simul_reduce(sum, isrc, pair_result_KNLIDX) - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(exprs))] + [""" + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} + """] + [""" + <> is_self = (isrc == target_to_source[itgt]) + """ if self.exclude_self else ""] + + loopy_insns + exprs + + [""" + end + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) + """.format(i=iknl) + for iknl in range(len(exprs))] + + [""" end end end - """], - [ - lp.GlobalArg("box_target_starts,box_target_counts_nonchild," - "box_source_starts,box_source_counts_nonchild,", - None, shape=None), - lp.GlobalArg("source_box_starts, source_box_lists,", - None, shape=None), - lp.GlobalArg("strength", None, shape="nstrengths,nsources"), - lp.GlobalArg("result", None, - shape="nkernels,ntargets", dim_tags="sep,c"), - lp.GlobalArg("targets", None, - shape="dim,ntargets", dim_tags="sep,c"), - lp.GlobalArg("sources", None, - shape="dim,nsources", dim_tags="sep,c"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - "...", - ] + ( - [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else [] - ) + gather_loopy_source_arguments(self.kernels), - name=self.name, assumptions="ntgt_boxes>=1", + """], + arguments, + assumptions="ntgt_boxes>=1", + name=self.name, fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, nkernels=len(self.kernels))) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -485,12 +462,14 @@ class P2PFromCSR(P2PComputationBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() + import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") else: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") + return knl def __call__(self, queue, **kwargs):