From 045a07cc52ea741d60ebfcf6f114ebcf82c87833 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Sun, 25 Feb 2018 19:24:09 -0600 Subject: [PATCH 01/12] p2p: some cleanup and unification of the kernel evaluation classes --- sumpy/p2p.py | 421 ++++++++++++++++++++++++--------------------------- 1 file changed, 200 insertions(+), 221 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index d8586cc2..a35505ea 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -40,10 +40,10 @@ __doc__ = """ Particle-to-particle -------------------- -.. autoclass:: P2PComputationBase -.. autoclass:: SingleSrcTgtListP2PBase +.. autoclass:: P2PBase .. autoclass:: P2P .. autoclass:: P2PMatrixGenerator +.. autoclass:: P2PMatrixBlockGenerator .. autoclass:: P2PFromCSR """ @@ -54,7 +54,7 @@ Particle-to-particle # {{{ p2p base class -class P2PComputationBase(KernelComputation, KernelCacheWrapper): +class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, options=[], name=None, device=None): @@ -104,17 +104,27 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): return loopy_insns, result_names - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength").index((self.strength_usage[kernel_idx], isrc)) -# }}} + def get_exprs(self, results): + from pymbolic import var + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(results)] -# {{{ P2P with list of sources and list of targets + if self.exclude_self: + from pymbolic.primitives import If, Variable + exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] + + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] -class SingleSrcTgtListP2PBase(P2PComputationBase): - def get_src_tgt_arguments(self): + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments return [ lp.GlobalArg("sources", None, shape=(self.dim, "nsources")), @@ -122,75 +132,13 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): shape=(self.dim, "ntargets")), lp.ValueArg("nsources", np.int32), lp.ValueArg("ntargets", np.int32) - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" + ] + ([ + lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] + if self.exclude_self else []) + ( + gather_loopy_source_arguments(self.kernels)) def get_kernel(self): - loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"] - + ["<> is_self = (isrc == target_to_source[itgt])" - if self.exclude_self else ""] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - assumptions=self.get_assumptions(), - name=self.name, - fixed_parameters=dict( - dim=self.dim, - nstrengths=self.strength_count, - nresults=len(self.kernels))) - - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - - for knl in self.kernels: - loopy_knl = knl.prepare_loopy_kernel(loopy_knl) - - return loopy_knl + raise NotImplementedError def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -204,32 +152,60 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) # }}} # {{{ P2P point-interaction calculation -class P2P(SingleSrcTgtListP2PBase): +class P2P(P2PBase): default_name = "p2p_apply" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength").index((self.strength_usage[kernel_idx], isrc)) - - def get_input_and_output_arguments(self): - return [ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ lp.GlobalArg("strength", None, - shape="nstrengths,nsources", dim_tags="sep,C"), + shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults,ntargets", dim_tags="sep,C") - ] + shape="nresults, ntargets", dim_tags="sep,C")]) - def get_result_store_instructions(self): - return [""" - result[{i}, itgt] = knl_{i}_scaling \ - * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.kernels))] + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + + exprs + + [""" + result[{i}, itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict( + dim=self.dim, + nstrengths=self.strength_count, + nresults=len(self.kernels))) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, strength, **kwargs): from pytools.obj_array import is_obj_array @@ -247,26 +223,49 @@ class P2P(SingleSrcTgtListP2PBase): # {{{ P2P matrix writer -class P2PMatrixGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixGenerator(P2PBase): default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + exprs + + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim)) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, **kwargs): from pytools.obj_array import is_obj_array @@ -281,74 +280,73 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase): # }}} -# {{{ +# {{{ P2P matrix block writer -class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixBlockGenerator(P2PBase): default_name = "p2p_block" - def get_src_tgt_arguments(self): - return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), lp.ValueArg("nsrcindices", np.int32), lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None) - ] - - def get_domains(self): - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + lp.ValueArg("nranges", None)] + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" for irange <> tgtstart = tgtranges[irange] <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart + <> ntgtblock = tgtend - tgtstart <> srcstart = srcranges[irange] <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ + sources[idim, srcindices[srcstart + isrc]] + """] + [""" + <> is_self = (srcindices[srcstart + isrc] == + target_to_source[tgtindices[tgtstart + itgt]]) + """ if self.exclude_self else ""] + + loopy_insns + exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} {{inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + [""" end end - """ - ] - - def get_strength_or_not(self, isrc, kernel_idx): - return 1 + """], + arguments, + assumptions="nranges>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim)) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - def get_result_store_instructions(self): - return [ - """ - result_{i}[tgtstart + j, srcstart + k] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} - """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -380,39 +378,42 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): # {{{ P2P from CSR-like interaction list -class P2PFromCSR(P2PComputationBase): +class P2PFromCSR(P2PBase): default_name = "p2p_from_csr" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - from pymbolic import var - exprs = [ - var(name) - * var("strength").index((self.strength_usage[i], var("isrc"))) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] + exprs = self.get_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ + lp.GlobalArg("box_target_starts", None, shape=None), + lp.GlobalArg("box_target_counts_nonchild", None, shape=None), + lp.GlobalArg("box_source_starts", None, shape=None), + lp.GlobalArg("box_source_counts_nonchild", None, shape=None), + lp.GlobalArg("source_box_starts", None, shape=None), + lp.GlobalArg("source_box_lists", None, shape=None), + lp.GlobalArg("strength", None, + shape="nstrengths, nsources", dim_tags="sep,C"), + lp.GlobalArg("result", None, + shape="nkernels, ntargets", dim_tags="sep,C"), + "..." + ]) from sumpy.tools import gather_loopy_source_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] <> isrc_box_start = source_box_starts[itgt_box] <> isrc_box_end = source_box_starts[itgt_box+1] @@ -420,62 +421,38 @@ class P2PFromCSR(P2PComputationBase): for isrc_box <> src_ibox = source_box_lists[isrc_box] <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] for itgt - for isrc - <> d[idim] = \ - targets[idim,itgt] - sources[idim,isrc] \ - {dup=idim} - """] + [""" - <> is_self = (isrc == target_to_source[itgt]) - """ if self.exclude_self else ""] + [ - ] + loopy_insns + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] + [ - """ - end - """] + [""" - result[KNLIDX, itgt] = result[KNLIDX, itgt] + \ - knl_KNLIDX_scaling \ - * simul_reduce(sum, isrc, pair_result_KNLIDX) - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(exprs))] + [""" + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} + """] + [""" + <> is_self = (isrc == target_to_source[itgt]) + """ if self.exclude_self else ""] + + loopy_insns + exprs + + [""" + end + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) + """.format(i=iknl) + for iknl in range(len(exprs))] + + [""" end end end - """], - [ - lp.GlobalArg("box_target_starts,box_target_counts_nonchild," - "box_source_starts,box_source_counts_nonchild,", - None, shape=None), - lp.GlobalArg("source_box_starts, source_box_lists,", - None, shape=None), - lp.GlobalArg("strength", None, shape="nstrengths,nsources"), - lp.GlobalArg("result", None, - shape="nkernels,ntargets", dim_tags="sep,c"), - lp.GlobalArg("targets", None, - shape="dim,ntargets", dim_tags="sep,c"), - lp.GlobalArg("sources", None, - shape="dim,nsources", dim_tags="sep,c"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - "...", - ] + ( - [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else [] - ) + gather_loopy_source_arguments(self.kernels), - name=self.name, assumptions="ntgt_boxes>=1", + """], + arguments, + assumptions="ntgt_boxes>=1", + name=self.name, fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, nkernels=len(self.kernels))) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -485,12 +462,14 @@ class P2PFromCSR(P2PComputationBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() + import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") else: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") + return knl def __call__(self, queue, **kwargs): -- GitLab From 11425c50de42cd5e7d777c89adae8894e73cf688 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Wed, 28 Feb 2018 20:28:32 -0600 Subject: [PATCH 02/12] qbx: some cleanup --- sumpy/p2p.py | 181 +++++++++++++++------------ sumpy/qbx.py | 347 ++++++++++++++++++++++++++------------------------- 2 files changed, 279 insertions(+), 249 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index a35505ea..d92360e3 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -30,6 +30,7 @@ from six.moves import range import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -57,7 +58,7 @@ Particle-to-particle class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, - options=[], name=None, device=None): + options=[], name="p2p", device=None): """ :arg kernels: list of :class:`sumpy.kernel.Kernel` instances :arg strength_usage: A list of integers indicating which expression @@ -74,6 +75,10 @@ class P2PBase(KernelComputation, KernelCacheWrapper): from pytools import single_valued self.dim = single_valued(knl.dim for knl in self.kernels) + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) @@ -107,34 +112,33 @@ class P2PBase(KernelComputation, KernelCacheWrapper): def get_strength_or_not(self, isrc, kernel_idx): return var("strength").index((self.strength_usage[kernel_idx], isrc)) - def get_exprs(self, results): + def get_kernel_exprs(self, result_names): from pymbolic import var isrc_sym = var("isrc") exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(results)] + for i, name in enumerate(result_names)] if self.exclude_self: from pymbolic.primitives import If, Variable exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] return [lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) for i, expr in enumerate(exprs)] def get_default_src_tgt_arguments(self): from sumpy.tools import gather_loopy_source_arguments - return [ + return ([ lp.GlobalArg("sources", None, - shape=(self.dim, "nsources")), + shape=(self.dim, "nsources")), lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32) - ] + ([ - lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) + ( + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))] + if self.exclude_self else []) + gather_loopy_source_arguments(self.kernels)) def get_kernel(self): @@ -152,9 +156,6 @@ class P2PBase(KernelComputation, KernelCacheWrapper): knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) # }}} @@ -162,17 +163,21 @@ class P2PBase(KernelComputation, KernelCacheWrapper): # {{{ P2P point-interaction calculation class P2P(P2PBase): + """Direct applier for P2P interactions.""" + default_name = "p2p_apply" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("strength", None, shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults, ntargets", dim_tags="sep,C")]) + shape="nresults, ntargets", dim_tags="sep,C") + ]) loopy_knl = lp.make_kernel([ "{[itgt]: 0 <= itgt < ntargets}", @@ -184,8 +189,7 @@ class P2P(P2PBase): + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + ["<> is_self = (isrc == target_to_source[itgt])" if self.exclude_self else ""] - + loopy_insns - + exprs + + loopy_insns + kernel_exprs + [""" result[{i}, itgt] = knl_{i}_scaling * \ simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} @@ -198,7 +202,8 @@ class P2P(P2PBase): fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nresults=len(self.kernels))) + nresults=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -224,6 +229,8 @@ class P2P(P2PBase): # {{{ P2P matrix writer class P2PMatrixGenerator(P2PBase): + """Generator for P2P interaction matrix entries.""" + default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): @@ -231,12 +238,12 @@ class P2PMatrixGenerator(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes)]) + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ "{[itgt]: 0 <= itgt < ntargets}", @@ -248,7 +255,7 @@ class P2PMatrixGenerator(P2PBase): + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + ["<> is_self = (isrc == target_to_source[itgt])" if self.exclude_self else ""] - + loopy_insns + exprs + + loopy_insns + kernel_exprs + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} @@ -258,7 +265,8 @@ class P2PMatrixGenerator(P2PBase): arguments, assumptions="nsources>=1 and ntargets>=1", name=self.name, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -283,6 +291,8 @@ class P2PMatrixGenerator(P2PBase): # {{{ P2P matrix block writer class P2PMatrixBlockGenerator(P2PBase): + """Generator for a subset of P2P interaction matrix entries.""" + default_name = "p2p_block" def get_strength_or_not(self, isrc, kernel_idx): @@ -290,19 +300,21 @@ class P2PMatrixBlockGenerator(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None)] + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntgtindices, nsrcindices") - for i, dtype in enumerate(self.value_dtypes)]) + lp.ValueArg("nsrcindices", np.int64), + lp.ValueArg("ntgtindices", np.int64), + lp.ValueArg("nranges", None) + ] + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ "{[irange]: 0 <= irange < nranges}", @@ -322,14 +334,16 @@ class P2PMatrixBlockGenerator(P2PBase): for itgt, isrc <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ sources[idim, srcindices[srcstart + isrc]] - """] + [""" + """] + + [""" <> is_self = (srcindices[srcstart + isrc] == target_to_source[tgtindices[tgtstart + itgt]]) - """ if self.exclude_self else ""] - + loopy_insns + exprs + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + [""" - result_{i}[tgtstart + itgt, srcstart + isrc] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=irange}} """.format(i=iknl) for iknl in range(len(self.kernels))] + [""" @@ -339,10 +353,13 @@ class P2PMatrixBlockGenerator(P2PBase): arguments, assumptions="nranges>=1", name=self.name, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -383,23 +400,29 @@ class P2PFromCSR(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ - lp.GlobalArg("box_target_starts", None, shape=None), - lp.GlobalArg("box_target_counts_nonchild", None, shape=None), - lp.GlobalArg("box_source_starts", None, shape=None), - lp.GlobalArg("box_source_counts_nonchild", None, shape=None), - lp.GlobalArg("source_box_starts", None, shape=None), - lp.GlobalArg("source_box_lists", None, shape=None), + self.get_default_src_tgt_arguments() + + [ + lp.GlobalArg("box_target_starts", + None, shape=None), + lp.GlobalArg("box_target_counts_nonchild", + None, shape=None), + lp.GlobalArg("box_source_starts", + None, shape=None), + lp.GlobalArg("box_source_counts_nonchild", + None, shape=None), + lp.GlobalArg("source_box_starts", + None, shape=None), + lp.GlobalArg("source_box_lists", + None, shape=None), lp.GlobalArg("strength", None, shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, shape="nkernels, ntargets", dim_tags="sep,C"), "..." - ]) + ]) - from sumpy.tools import gather_loopy_source_arguments loopy_knl = lp.make_kernel([ "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}", @@ -411,36 +434,36 @@ class P2PFromCSR(P2PBase): self.get_kernel_scaling_assignments() + [""" for itgt_box - <> tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] - - for itgt - for isrc - <> d[idim] = \ - targets[idim, itgt] - sources[idim, isrc] {dup=idim} + <> tgt_ibox = target_boxes[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] + + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_end = source_box_starts[itgt_box+1] + + for isrc_box + <> src_ibox = source_box_lists[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for itgt + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} """] + [""" - <> is_self = (isrc == target_to_source[itgt]) + <> is_self = (isrc == target_to_source[itgt]) """ if self.exclude_self else ""] - + loopy_insns + exprs + + loopy_insns + kernel_exprs + [""" - end - result[{i}, itgt] = result[{i}, itgt] + \ - knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) + end + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) """.format(i=iknl) - for iknl in range(len(exprs))] + for iknl in range(len(self.kernels))] + [""" - end end end + end """], arguments, assumptions="ntgt_boxes>=1", @@ -448,7 +471,11 @@ class P2PFromCSR(P2PBase): fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nkernels=len(self.kernels))) + nkernels=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 0f617e62..6e8be491 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -27,7 +27,7 @@ THE SOFTWARE. import six -from six.moves import range, zip +from six.moves import range import numpy as np import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -103,46 +103,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def expansions(self): return self.kernels - def get_compute_a_and_b_vecs(self): - return """ - <> a[idim] = center[idim,itgt] - src[idim,isrc] {dup=idim} - <> b[idim] = tgt[idim,itgt] - center[idim,itgt] {dup=idim} - <> rscale = expansion_radii[itgt] - """ - - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("src", None, - shape=(self.dim, "nsources"), order="C"), - lp.GlobalArg("tgt", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("center", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("expansion_radii", None, shape="ntargets"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" - - @memoize_method - def get_kernel(self): + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector - avec = make_sym_vector("a", self.dim) bvec = make_sym_vector("b", self.dim) @@ -152,7 +114,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): logger.info("compute expansion expressions: start") result_names = [expand(i, sac, expn, avec, bvec) - for i, expn in enumerate(self.expansions)] + for i, expn in enumerate(self.expansions)] logger.info("compute expansion expressions: done") @@ -168,55 +130,43 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): complex_dtype=np.complex128 # FIXME ) - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] - - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + gather_loopy_source_arguments(self.kernels)) + return loopy_insns, result_names - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + [self.get_compute_a_and_b_vecs()] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes)) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - name=self.name, - assumptions=self.get_assumptions(), - default_offset=lp.auto, - silenced_warnings="write_race(write_lpot*)", - fixed_parameters=dict(dim=self.dim), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + def get_kernel_exprs(self, result_names): + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] - for expn in self.expansions: - loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("src", None, + shape=(self.dim, "nsources"), order="C"), + lp.GlobalArg("tgt", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("center", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("expansion_radii", + None, shape="ntargets"), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError - @memoize_method def get_optimized_kernel(self): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() - # FIXME: how to tune for blocks? import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: @@ -242,28 +192,49 @@ class LayerPotential(LayerPotentialBase): default_name = "qbx_apply" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("strength_%d" % i, + None, shape="nsources", order="C") + for i in range(self.strength_count)] + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_lpot,inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in range(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C") - for i in range(len(self.kernels)) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[itgt] = \ - knl_{i}_scaling*simul_reduce( - sum, isrc, pair_result_{i}) \ - {{id_prefix=write_lpot,inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, **kwargs): @@ -293,23 +264,48 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[itgt, isrc] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=isrc:itgt} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets, nsources", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt, isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=isrc:itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, **kwargs) @@ -320,76 +316,83 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): # {{{ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): + """Generator for a subset of the layer potential matrix entries.""" + default_name = "qbx_block" - def get_src_tgt_arguments(self): - return LayerPotentialBase.get_src_tgt_arguments(self) \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nsrcindices", None), + lp.ValueArg("ntgtindices", None), lp.ValueArg("nranges", None) - ] - - def get_domains(self): - # FIXME: this doesn't work when separating j and k - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + ] + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] + <> ntgtblock = tgtend - tgtstart + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \ + src[idim, srcindices[srcstart + isrc]] \ + {dup=idim} + <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \ + center[idim, tgtindices[tgtstart + itgt]] \ + {dup=idim} + <> rscale = expansion_radii[tgtindices[tgtstart + itgt]] + """] + + loopy_insns + kernel_exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + [""" end end - """ - ] - - def get_strength_or_not(self, isrc, kernel_idx): - return 1 + """], + arguments, + name=self.name, + assumptions="nranges>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[tgtstart + j, srcstart + k] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl - @memoize_method def get_optimized_kernel(self): - # FIXME loopy_knl = self.get_kernel() loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") @@ -397,7 +400,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def __call__(self, queue, targets, sources, centers, expansion_radii, tgtindices, srcindices, tgtranges, srcranges, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, -- GitLab From 243c4039d9dc87650eef41f6318ab72a9d4cb03d Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Mon, 5 Mar 2018 18:34:39 -0600 Subject: [PATCH 03/12] bump version --- .pytest_cache/v/cache/lastfailed | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .pytest_cache/v/cache/lastfailed diff --git a/.pytest_cache/v/cache/lastfailed b/.pytest_cache/v/cache/lastfailed new file mode 100644 index 00000000..bd70197b --- /dev/null +++ b/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,4 @@ +{ + "test/test_matrixgen.py::test_p2p_direct[ctx_getter=>-False]": true, + "test/test_matrixgen.py::test_p2p_direct[ctx_getter=>-True]": true +} \ No newline at end of file -- GitLab From 5978a409a67b948ea4edfb9e563abc559ff57396 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Mon, 5 Mar 2018 19:56:30 -0600 Subject: [PATCH 04/12] add type for some remaining indices --- sumpy/p2p.py | 11 +++++++---- sumpy/qbx.py | 7 +++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index d92360e3..9e26e8e6 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -308,8 +308,8 @@ class P2PMatrixBlockGenerator(P2PBase): lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int64), - lp.ValueArg("ntgtindices", np.int64), + lp.ValueArg("nsrcindices", None), + lp.ValueArg("ntgtindices", None), lp.ValueArg("nranges", None) ] + [lp.GlobalArg("result_%d" % i, dtype, @@ -356,8 +356,11 @@ class P2PMatrixBlockGenerator(P2PBase): fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.add_dtypes(loopy_knl, - dict(nsources=np.int64, ntargets=np.int64)) + loopy_knl = lp.add_dtypes(loopy_knl, dict( + nsources=np.int64, + ntargets=np.int64, + ntgtindices=np.int64, + nsrcindices=np.int64)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in self.kernels: diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 6e8be491..7697bdd9 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -383,8 +383,11 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.add_dtypes(loopy_knl, - dict(nsources=np.int64, ntargets=np.int64)) + loopy_knl = lp.add_dtypes(loopy_knl, dict( + nsources=np.int64, + ntargets=np.int64, + ntgtindices=np.int64, + nsrcindices=np.int64)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for expn in self.expansions: -- GitLab From f3d702b98b6fd571d8a886acba0057e50f06a142 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Mon, 5 Mar 2018 21:02:02 -0600 Subject: [PATCH 05/12] qbx: make centers sep,C --- sumpy/qbx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 7697bdd9..d1533cf8 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -153,7 +153,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): lp.GlobalArg("tgt", None, shape=(self.dim, "ntargets"), order="C"), lp.GlobalArg("center", None, - shape=(self.dim, "ntargets"), order="C"), + shape=(self.dim, "ntargets"), dim_tags="sep,C"), lp.GlobalArg("expansion_radii", None, shape="ntargets"), lp.ValueArg("nsources", None), -- GitLab From c75c5cd6982741f7c97e34cd13e2e6d405a15f1a Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Mon, 5 Mar 2018 22:00:05 -0600 Subject: [PATCH 06/12] bump kernel version --- sumpy/qbx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sumpy/qbx.py b/sumpy/qbx.py index d1533cf8..d91d3a11 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -49,6 +49,7 @@ QBX for Layer Potentials .. autoclass:: LayerPotentialBase .. autoclass:: LayerPotential .. autoclass:: LayerPotentialMatrixGenerator +.. autoclass:: LayerPotentialMatrixBlockGenerator """ -- GitLab From 9e1757be48d408800361f90e8480e593d5674a6b Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Wed, 7 Mar 2018 17:20:28 -0600 Subject: [PATCH 07/12] p2p: silence WriteRaceCondition warnings --- sumpy/p2p.py | 5 +++-- sumpy/qbx.py | 53 ++++++++++++++++++++++++++-------------------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 9e26e8e6..2c5d9189 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -58,7 +58,7 @@ Particle-to-particle class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, - options=[], name="p2p", device=None): + options=[], name=None, device=None): """ :arg kernels: list of :class:`sumpy.kernel.Kernel` instances :arg strength_usage: A list of integers indicating which expression @@ -343,7 +343,7 @@ class P2PMatrixBlockGenerator(P2PBase): + [""" result_{i}[tgtstart + itgt, srcstart + isrc] = \ knl_{i}_scaling * pair_result_{i} \ - {{inames=irange}} + {{id_prefix=write_p2p,inames=irange}} """.format(i=iknl) for iknl in range(len(self.kernels))] + [""" @@ -352,6 +352,7 @@ class P2PMatrixBlockGenerator(P2PBase): """], arguments, assumptions="nranges>=1", + silenced_warnings="write_race(write_p2p*)", name=self.name, fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) diff --git a/sumpy/qbx.py b/sumpy/qbx.py index d91d3a11..f0c35174 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -88,7 +88,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec): class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, expansions, strength_usage=None, value_dtypes=None, - options=[], name="layerpot", device=None): + options=[], name=None, device=None): KernelComputation.__init__(self, ctx, expansions, strength_usage, value_dtypes, name, options, device) @@ -206,30 +206,30 @@ class LayerPotential(LayerPotentialBase): for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ], - self.get_kernel_scaling_assignments() - + ["for itgt, isrc"] - + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] - + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] - + ["<> rscale = expansion_radii[itgt]"] - + loopy_insns + kernel_exprs - + [""" - result_{i}[itgt] = knl_{i}_scaling * \ - simul_reduce(sum, isrc, pair_result_{i}) \ - {{id_prefix=write_lpot,inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.expansions))] - + ["end"], - arguments, - name=self.name, - assumptions="ntargets>=1 and nsources>=1", - default_offset=lp.auto, - silenced_warnings="write_race(write_lpot*)", - fixed_parameters=dict(dim=self.dim), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_lpot,inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for expn in self.expansions: @@ -370,7 +370,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): + [""" result_{i}[tgtstart + itgt, srcstart + isrc] = \ knl_{i}_scaling * pair_result_{i} \ - {{inames=irange}} + {{id_prefix=write_lpot,inames=irange}} """.format(i=iknl) for iknl in range(len(self.expansions))] + [""" @@ -381,6 +381,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): name=self.name, assumptions="nranges>=1", default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", fixed_parameters=dict(dim=self.dim), lang_version=MOST_RECENT_LANGUAGE_VERSION) -- GitLab From 52bc14e83e770e5d9407923839309c59cce7a340 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Thu, 8 Mar 2018 16:22:23 -0600 Subject: [PATCH 08/12] p2p: fix misplaced end --- .gitignore | 1 + sumpy/p2p.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 71f03d3a..9bae8f40 100644 --- a/.gitignore +++ b/.gitignore @@ -16,5 +16,6 @@ doc/_build .cache .DS_Store .ipynb_checkpoints +.pytest_cache sumpy/_git_rev.py diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 2c5d9189..ce9fc2c8 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -458,8 +458,8 @@ class P2PFromCSR(P2PBase): <> is_self = (isrc == target_to_source[itgt]) """ if self.exclude_self else ""] + loopy_insns + kernel_exprs + + [" end"] + [""" - end result[{i}, itgt] = result[{i}, itgt] + \ knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) """.format(i=iknl) -- GitLab From be32e3aaa57a81479050609a36625149b2549c05 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Thu, 8 Mar 2018 21:32:53 -0600 Subject: [PATCH 09/12] qbx: remove dtype from result --- sumpy/p2p.py | 10 +++++----- sumpy/qbx.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index ce9fc2c8..84c8f390 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -358,10 +358,10 @@ class P2PMatrixBlockGenerator(P2PBase): lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.add_dtypes(loopy_knl, dict( - nsources=np.int64, - ntargets=np.int64, - ntgtindices=np.int64, - nsrcindices=np.int64)) + nsources=np.int32, + ntargets=np.int32, + ntgtindices=np.int32, + nsrcindices=np.int32)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in self.kernels: @@ -479,7 +479,7 @@ class P2PFromCSR(P2PBase): lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.add_dtypes(loopy_knl, - dict(nsources=np.int64, ntargets=np.int64)) + dict(nsources=np.int32, ntargets=np.int32)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") diff --git a/sumpy/qbx.py b/sumpy/qbx.py index f0c35174..5d4e1105 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -202,8 +202,8 @@ class LayerPotential(LayerPotentialBase): None, shape="nsources", order="C") for i in range(self.strength_count)] + [lp.GlobalArg("result_%d" % i, - dtype, shape="ntargets", order="C") - for i, dtype in enumerate(self.value_dtypes)]) + None, shape="ntargets", order="C") + for i, _ in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ "{[itgt]: 0 <= itgt < ntargets}", @@ -386,10 +386,10 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.add_dtypes(loopy_knl, dict( - nsources=np.int64, - ntargets=np.int64, - ntgtindices=np.int64, - nsrcindices=np.int64)) + nsources=np.int32, + ntargets=np.int32, + ntgtindices=np.int32, + nsrcindices=np.int32)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for expn in self.expansions: -- GitLab From 705c2304feb921da9f91385a37b12645bf0a112d Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Sun, 11 Mar 2018 15:42:01 -0500 Subject: [PATCH 10/12] merge some of the domains --- sumpy/p2p.py | 30 +++++++++++++++++------------- sumpy/qbx.py | 27 ++++++++++++++++----------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 84c8f390..efdbdcd4 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -132,7 +132,7 @@ class P2PBase(KernelComputation, KernelCacheWrapper): from sumpy.tools import gather_loopy_source_arguments return ([ lp.GlobalArg("sources", None, - shape=(self.dim, "nsources")), + shape=(self.dim, "nsources")), lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), lp.ValueArg("nsources", None), @@ -179,11 +179,12 @@ class P2P(P2PBase): shape="nresults, ntargets", dim_tags="sep,C") ]) - loopy_knl = lp.make_kernel([ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ], + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], self.get_kernel_scaling_assignments() + ["for itgt, isrc"] + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] @@ -245,11 +246,12 @@ class P2PMatrixGenerator(P2PBase): shape="ntargets,nsources") for i, dtype in enumerate(self.value_dtypes)]) - loopy_knl = lp.make_kernel([ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ], + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], self.get_kernel_scaling_assignments() + ["for itgt, isrc"] + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] @@ -318,8 +320,10 @@ class P2PMatrixBlockGenerator(P2PBase): loopy_knl = lp.make_kernel([ "{[irange]: 0 <= irange < nranges}", - "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", - "{[idim]: 0 <= idim < dim}" + "{[itgt, isrc, idim]: \ + 0 <= itgt < ntgtblock and \ + 0 <= isrc < nsrcblock and \ + 0 <= idim < dim}", ], self.get_kernel_scaling_assignments() + [""" diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 5d4e1105..f3b2d6f7 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -193,6 +193,7 @@ class LayerPotential(LayerPotentialBase): default_name = "qbx_apply" + @memoize_method def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() kernel_exprs = self.get_kernel_exprs(result_names) @@ -203,13 +204,14 @@ class LayerPotential(LayerPotentialBase): for i in range(self.strength_count)] + [lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C") - for i, _ in enumerate(self.value_dtypes)]) + for i in range(len(self.kernels))]) - loopy_knl = lp.make_kernel([ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ], + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], self.get_kernel_scaling_assignments() + ["for itgt, isrc"] + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] @@ -265,6 +267,7 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 + @memoize_method def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() kernel_exprs = self.get_kernel_exprs(result_names) @@ -274,11 +277,12 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): dtype, shape="ntargets, nsources", order="C") for i, dtype in enumerate(self.value_dtypes)]) - loopy_knl = lp.make_kernel([ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ], + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], self.get_kernel_scaling_assignments() + ["for itgt, isrc"] + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] @@ -324,6 +328,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 + @memoize_method def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() kernel_exprs = self.get_kernel_exprs(result_names) -- GitLab From 2adb951550f811c933595345cf2437f5099c4206 Mon Sep 17 00:00:00 2001 From: Alex Fikl Date: Sun, 11 Mar 2018 19:46:02 -0500 Subject: [PATCH 11/12] p2p: silence race condition warning --- sumpy/p2p.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sumpy/p2p.py b/sumpy/p2p.py index efdbdcd4..38bec39d 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -465,7 +465,8 @@ class P2PFromCSR(P2PBase): + [" end"] + [""" result[{i}, itgt] = result[{i}, itgt] + \ - knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_csr}} """.format(i=iknl) for iknl in range(len(self.kernels))] + [""" @@ -476,6 +477,7 @@ class P2PFromCSR(P2PBase): arguments, assumptions="ntgt_boxes>=1", name=self.name, + silenced_warnings="write_race(write_csr*)", fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, -- GitLab From e9fbbf0fa94982dd1041218a53d1bbbeebd33f25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Thu, 15 Mar 2018 15:40:40 -0400 Subject: [PATCH 12/12] Delete spurious pytest cache file --- .pytest_cache/v/cache/lastfailed | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .pytest_cache/v/cache/lastfailed diff --git a/.pytest_cache/v/cache/lastfailed b/.pytest_cache/v/cache/lastfailed deleted file mode 100644 index bd70197b..00000000 --- a/.pytest_cache/v/cache/lastfailed +++ /dev/null @@ -1,4 +0,0 @@ -{ - "test/test_matrixgen.py::test_p2p_direct[ctx_getter=>-False]": true, - "test/test_matrixgen.py::test_p2p_direct[ctx_getter=>-True]": true -} \ No newline at end of file -- GitLab