diff --git a/.gitignore b/.gitignore index 71f03d3ad65ed14d6e37cfb471ba601adf355b4b..9bae8f4083047f33b6d4306be469ffa8451f977b 100644 --- a/.gitignore +++ b/.gitignore @@ -16,5 +16,6 @@ doc/_build .cache .DS_Store .ipynb_checkpoints +.pytest_cache sumpy/_git_rev.py diff --git a/sumpy/p2p.py b/sumpy/p2p.py index d8586cc228e47d6aa9f12d7c7c436726ad28bc2e..38bec39d3b4dbd7efe6229c563f391c29fdd3efb 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -30,6 +30,7 @@ from six.moves import range import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -40,10 +41,10 @@ __doc__ = """ Particle-to-particle -------------------- -.. autoclass:: P2PComputationBase -.. autoclass:: SingleSrcTgtListP2PBase +.. autoclass:: P2PBase .. autoclass:: P2P .. autoclass:: P2PMatrixGenerator +.. autoclass:: P2PMatrixBlockGenerator .. autoclass:: P2PFromCSR """ @@ -54,7 +55,7 @@ Particle-to-particle # {{{ p2p base class -class P2PComputationBase(KernelComputation, KernelCacheWrapper): +class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, options=[], name=None, device=None): @@ -74,6 +75,10 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): from pytools import single_valued self.dim = single_valued(knl.dim for knl in self.kernels) + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) @@ -104,93 +109,40 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): return loopy_insns, result_names - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) - -# }}} - - -# {{{ P2P with list of sources and list of targets - -class SingleSrcTgtListP2PBase(P2PComputationBase): - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("sources", None, - shape=(self.dim, "nsources")), - lp.GlobalArg("targets", None, - shape=(self.dim, "ntargets")), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32) - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength").index((self.strength_usage[kernel_idx], isrc)) - def get_kernel(self): - loopy_insns, result_names = self.get_loopy_insns_and_result_names() + def get_kernel_exprs(self, result_names): + from pymbolic import var isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] if self.exclude_self: from pymbolic.primitives import If, Variable exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"] - + ["<> is_self = (isrc == target_to_source[itgt])" - if self.exclude_self else ""] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - assumptions=self.get_assumptions(), - name=self.name, - fixed_parameters=dict( - dim=self.dim, - nstrengths=self.strength_count, - nresults=len(self.kernels))) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - - for knl in self.kernels: - loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("sources", None, + shape=(self.dim, "nsources")), + lp.GlobalArg("targets", None, + shape=(self.dim, "ntargets")), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))] + if self.exclude_self else []) + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -210,26 +162,56 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): # {{{ P2P point-interaction calculation -class P2P(SingleSrcTgtListP2PBase): - default_name = "p2p_apply" +class P2P(P2PBase): + """Direct applier for P2P interactions.""" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength").index((self.strength_usage[kernel_idx], isrc)) + default_name = "p2p_apply" - def get_input_and_output_arguments(self): - return [ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("strength", None, - shape="nstrengths,nsources", dim_tags="sep,C"), + shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults,ntargets", dim_tags="sep,C") - ] + shape="nresults, ntargets", dim_tags="sep,C") + ]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" + result[{i}, itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict( + dim=self.dim, + nstrengths=self.strength_count, + nresults=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_result_store_instructions(self): - return [""" - result[{i}, itgt] = knl_{i}_scaling \ - * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.kernels))] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, strength, **kwargs): from pytools.obj_array import is_obj_array @@ -247,26 +229,53 @@ class P2P(SingleSrcTgtListP2PBase): # {{{ P2P matrix writer -class P2PMatrixGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixGenerator(P2PBase): + """Generator for P2P interaction matrix entries.""" + default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, **kwargs): from pytools.obj_array import is_obj_array @@ -281,74 +290,88 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase): # }}} -# {{{ +# {{{ P2P matrix block writer + +class P2PMatrixBlockGenerator(P2PBase): + """Generator for a subset of P2P interaction matrix entries.""" -class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): default_name = "p2p_block" - def get_src_tgt_arguments(self): - return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nsrcindices", None), + lp.ValueArg("ntgtindices", None), lp.ValueArg("nranges", None) - ] - - def get_domains(self): - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + ] + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc, idim]: \ + 0 <= itgt < ntgtblock and \ + 0 <= isrc < nsrcblock and \ + 0 <= idim < dim}", + ], + self.get_kernel_scaling_assignments() + + [""" for irange <> tgtstart = tgtranges[irange] <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart + <> ntgtblock = tgtend - tgtstart <> srcstart = srcranges[irange] <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ + sources[idim, srcindices[srcstart + isrc]] + """] + + [""" + <> is_self = (srcindices[srcstart + isrc] == + target_to_source[tgtindices[tgtstart + itgt]]) + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_p2p,inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + [""" end end - """ - ] + """], + arguments, + assumptions="nranges>=1", + silenced_warnings="write_race(write_p2p*)", + name=self.name, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, dict( + nsources=np.int32, + ntargets=np.int32, + ntgtindices=np.int32, + nsrcindices=np.int32)) - def get_strength_or_not(self, isrc, kernel_idx): - return 1 - - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[tgtstart + j, srcstart + k] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} - """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -380,102 +403,93 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): # {{{ P2P from CSR-like interaction list -class P2PFromCSR(P2PComputationBase): +class P2PFromCSR(P2PBase): default_name = "p2p_from_csr" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - from pymbolic import var - exprs = [ - var(name) - * var("strength").index((self.strength_usage[i], var("isrc"))) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - - from sumpy.tools import gather_loopy_source_arguments - loopy_knl = lp.make_kernel( + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] - - for itgt - for isrc - <> d[idim] = \ - targets[idim,itgt] - sources[idim,isrc] \ - {dup=idim} - """] + [""" - <> is_self = (isrc == target_to_source[itgt]) - """ if self.exclude_self else ""] + [ - ] + loopy_insns + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] + [ - """ - end - """] + [""" - result[KNLIDX, itgt] = result[KNLIDX, itgt] + \ - knl_KNLIDX_scaling \ - * simul_reduce(sum, isrc, pair_result_KNLIDX) - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(exprs))] + [""" - end + <> tgt_ibox = target_boxes[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] + + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_end = source_box_starts[itgt_box+1] + + for isrc_box + <> src_ibox = source_box_lists[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for itgt + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} + """] + [""" + <> is_self = (isrc == target_to_source[itgt]) + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [" end"] + + [""" + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_csr}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + [""" end end - """], - [ - lp.GlobalArg("box_target_starts,box_target_counts_nonchild," - "box_source_starts,box_source_counts_nonchild,", - None, shape=None), - lp.GlobalArg("source_box_starts, source_box_lists,", - None, shape=None), - lp.GlobalArg("strength", None, shape="nstrengths,nsources"), - lp.GlobalArg("result", None, - shape="nkernels,ntargets", dim_tags="sep,c"), - lp.GlobalArg("targets", None, - shape="dim,ntargets", dim_tags="sep,c"), - lp.GlobalArg("sources", None, - shape="dim,nsources", dim_tags="sep,c"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - "...", - ] + ( - [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else [] - ) + gather_loopy_source_arguments(self.kernels), - name=self.name, assumptions="ntgt_boxes>=1", + end + """], + arguments, + assumptions="ntgt_boxes>=1", + name=self.name, + silenced_warnings="write_race(write_csr*)", fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nkernels=len(self.kernels))) + nkernels=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -485,12 +499,14 @@ class P2PFromCSR(P2PComputationBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() + import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") else: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") + return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 0f617e62990c4aad8f187b03d22e2ac9c72b46fd..f3b2d6f78964d3d4f5a88248f7b9a08d32b722eb 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -27,7 +27,7 @@ THE SOFTWARE. import six -from six.moves import range, zip +from six.moves import range import numpy as np import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -49,6 +49,7 @@ QBX for Layer Potentials .. autoclass:: LayerPotentialBase .. autoclass:: LayerPotential .. autoclass:: LayerPotentialMatrixGenerator +.. autoclass:: LayerPotentialMatrixBlockGenerator """ @@ -87,7 +88,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec): class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, expansions, strength_usage=None, value_dtypes=None, - options=[], name="layerpot", device=None): + options=[], name=None, device=None): KernelComputation.__init__(self, ctx, expansions, strength_usage, value_dtypes, name, options, device) @@ -103,46 +104,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def expansions(self): return self.kernels - def get_compute_a_and_b_vecs(self): - return """ - <> a[idim] = center[idim,itgt] - src[idim,isrc] {dup=idim} - <> b[idim] = tgt[idim,itgt] - center[idim,itgt] {dup=idim} - <> rscale = expansion_radii[itgt] - """ - - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("src", None, - shape=(self.dim, "nsources"), order="C"), - lp.GlobalArg("tgt", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("center", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("expansion_radii", None, shape="ntargets"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" - - @memoize_method - def get_kernel(self): + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector - avec = make_sym_vector("a", self.dim) bvec = make_sym_vector("b", self.dim) @@ -152,7 +115,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): logger.info("compute expansion expressions: start") result_names = [expand(i, sac, expn, avec, bvec) - for i, expn in enumerate(self.expansions)] + for i, expn in enumerate(self.expansions)] logger.info("compute expansion expressions: done") @@ -168,55 +131,43 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): complex_dtype=np.complex128 # FIXME ) - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] + return loopy_insns, result_names - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + [self.get_compute_a_and_b_vecs()] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes)) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - name=self.name, - assumptions=self.get_assumptions(), - default_offset=lp.auto, - silenced_warnings="write_race(write_lpot*)", - fixed_parameters=dict(dim=self.dim), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + def get_kernel_exprs(self, result_names): + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] - for expn in self.expansions: - loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("src", None, + shape=(self.dim, "nsources"), order="C"), + lp.GlobalArg("tgt", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("center", None, + shape=(self.dim, "ntargets"), dim_tags="sep,C"), + lp.GlobalArg("expansion_radii", + None, shape="ntargets"), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError - @memoize_method def get_optimized_kernel(self): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() - # FIXME: how to tune for blocks? import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: @@ -242,28 +193,51 @@ class LayerPotential(LayerPotentialBase): default_name = "qbx_apply" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) - - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in range(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C") - for i in range(len(self.kernels)) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[itgt] = \ - knl_{i}_scaling*simul_reduce( - sum, isrc, pair_result_{i}) \ - {{id_prefix=write_lpot,inames=itgt}} + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("strength_%d" % i, + None, shape="nsources", order="C") + for i in range(self.strength_count)] + + [lp.GlobalArg("result_%d" % i, + None, shape="ntargets", order="C") + for i in range(len(self.kernels))]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_lpot,inames=itgt}} """.format(i=iknl) - for iknl in range(len(self.expansions)) - ] + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, **kwargs): @@ -293,23 +267,50 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[itgt, isrc] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=isrc:itgt} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets, nsources", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt, isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=isrc:itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, **kwargs) @@ -320,76 +321,88 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): # {{{ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): + """Generator for a subset of the layer potential matrix entries.""" + default_name = "qbx_block" - def get_src_tgt_arguments(self): - return LayerPotentialBase.get_src_tgt_arguments(self) \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nsrcindices", None), + lp.ValueArg("ntgtindices", None), lp.ValueArg("nranges", None) - ] - - def get_domains(self): - # FIXME: this doesn't work when separating j and k - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + ] + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] + <> ntgtblock = tgtend - tgtstart + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \ + src[idim, srcindices[srcstart + isrc]] \ + {dup=idim} + <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \ + center[idim, tgtindices[tgtstart + itgt]] \ + {dup=idim} + <> rscale = expansion_radii[tgtindices[tgtstart + itgt]] + """] + + loopy_insns + kernel_exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_lpot,inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + [""" end end - """ - ] - - def get_strength_or_not(self, isrc, kernel_idx): - return 1 - - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + """], + arguments, + name=self.name, + assumptions="nranges>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, dict( + nsources=np.int32, + ntargets=np.int32, + ntgtindices=np.int32, + nsrcindices=np.int32)) - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[tgtstart + j, srcstart + k] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl - @memoize_method def get_optimized_kernel(self): - # FIXME loopy_knl = self.get_kernel() loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") @@ -397,7 +410,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def __call__(self, queue, targets, sources, centers, expansion_radii, tgtindices, srcindices, tgtranges, srcranges, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii,