diff --git a/sumpy/p2p.py b/sumpy/p2p.py index a35505ea418b0cb0d2d58cde1cabb53c521d3602..d92360e3d6a364f1cfd058fa909af82eee089958 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -30,6 +30,7 @@ from six.moves import range import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -57,7 +58,7 @@ Particle-to-particle class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, - options=[], name=None, device=None): + options=[], name="p2p", device=None): """ :arg kernels: list of :class:`sumpy.kernel.Kernel` instances :arg strength_usage: A list of integers indicating which expression @@ -74,6 +75,10 @@ class P2PBase(KernelComputation, KernelCacheWrapper): from pytools import single_valued self.dim = single_valued(knl.dim for knl in self.kernels) + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) @@ -107,34 +112,33 @@ class P2PBase(KernelComputation, KernelCacheWrapper): def get_strength_or_not(self, isrc, kernel_idx): return var("strength").index((self.strength_usage[kernel_idx], isrc)) - def get_exprs(self, results): + def get_kernel_exprs(self, result_names): from pymbolic import var isrc_sym = var("isrc") exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(results)] + for i, name in enumerate(result_names)] if self.exclude_self: from pymbolic.primitives import If, Variable exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] return [lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) for i, expr in enumerate(exprs)] def get_default_src_tgt_arguments(self): from sumpy.tools import gather_loopy_source_arguments - return [ + return ([ lp.GlobalArg("sources", None, - shape=(self.dim, "nsources")), + shape=(self.dim, "nsources")), lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32) - ] + ([ - lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) + ( + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))] + if self.exclude_self else []) + gather_loopy_source_arguments(self.kernels)) def get_kernel(self): @@ -152,9 +156,6 @@ class P2PBase(KernelComputation, KernelCacheWrapper): knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) # }}} @@ -162,17 +163,21 @@ class P2PBase(KernelComputation, KernelCacheWrapper): # {{{ P2P point-interaction calculation class P2P(P2PBase): + """Direct applier for P2P interactions.""" + default_name = "p2p_apply" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("strength", None, shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults, ntargets", dim_tags="sep,C")]) + shape="nresults, ntargets", dim_tags="sep,C") + ]) loopy_knl = lp.make_kernel([ "{[itgt]: 0 <= itgt < ntargets}", @@ -184,8 +189,7 @@ class P2P(P2PBase): + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + ["<> is_self = (isrc == target_to_source[itgt])" if self.exclude_self else ""] - + loopy_insns - + exprs + + loopy_insns + kernel_exprs + [""" result[{i}, itgt] = knl_{i}_scaling * \ simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} @@ -198,7 +202,8 @@ class P2P(P2PBase): fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nresults=len(self.kernels))) + nresults=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -224,6 +229,8 @@ class P2P(P2PBase): # {{{ P2P matrix writer class P2PMatrixGenerator(P2PBase): + """Generator for P2P interaction matrix entries.""" + default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): @@ -231,12 +238,12 @@ class P2PMatrixGenerator(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes)]) + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ "{[itgt]: 0 <= itgt < ntargets}", @@ -248,7 +255,7 @@ class P2PMatrixGenerator(P2PBase): + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + ["<> is_self = (isrc == target_to_source[itgt])" if self.exclude_self else ""] - + loopy_insns + exprs + + loopy_insns + kernel_exprs + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} @@ -258,7 +265,8 @@ class P2PMatrixGenerator(P2PBase): arguments, assumptions="nsources>=1 and ntargets>=1", name=self.name, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -283,6 +291,8 @@ class P2PMatrixGenerator(P2PBase): # {{{ P2P matrix block writer class P2PMatrixBlockGenerator(P2PBase): + """Generator for a subset of P2P interaction matrix entries.""" + default_name = "p2p_block" def get_strength_or_not(self, isrc, kernel_idx): @@ -290,19 +300,21 @@ class P2PMatrixBlockGenerator(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None)] + - [lp.GlobalArg("result_%d" % i, dtype, - shape="ntgtindices, nsrcindices") - for i, dtype in enumerate(self.value_dtypes)]) + lp.ValueArg("nsrcindices", np.int64), + lp.ValueArg("ntgtindices", np.int64), + lp.ValueArg("nranges", None) + ] + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) loopy_knl = lp.make_kernel([ "{[irange]: 0 <= irange < nranges}", @@ -322,14 +334,16 @@ class P2PMatrixBlockGenerator(P2PBase): for itgt, isrc <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \ sources[idim, srcindices[srcstart + isrc]] - """] + [""" + """] + + [""" <> is_self = (srcindices[srcstart + isrc] == target_to_source[tgtindices[tgtstart + itgt]]) - """ if self.exclude_self else ""] - + loopy_insns + exprs + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + [""" - result_{i}[tgtstart + itgt, srcstart + isrc] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=irange}} """.format(i=iknl) for iknl in range(len(self.kernels))] + [""" @@ -339,10 +353,13 @@ class P2PMatrixBlockGenerator(P2PBase): arguments, assumptions="nranges>=1", name=self.name, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -383,23 +400,29 @@ class P2PFromCSR(P2PBase): def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - exprs = self.get_exprs(result_names) + kernel_exprs = self.get_kernel_exprs(result_names) arguments = ( - self.get_default_src_tgt_arguments() + [ - lp.GlobalArg("box_target_starts", None, shape=None), - lp.GlobalArg("box_target_counts_nonchild", None, shape=None), - lp.GlobalArg("box_source_starts", None, shape=None), - lp.GlobalArg("box_source_counts_nonchild", None, shape=None), - lp.GlobalArg("source_box_starts", None, shape=None), - lp.GlobalArg("source_box_lists", None, shape=None), + self.get_default_src_tgt_arguments() + + [ + lp.GlobalArg("box_target_starts", + None, shape=None), + lp.GlobalArg("box_target_counts_nonchild", + None, shape=None), + lp.GlobalArg("box_source_starts", + None, shape=None), + lp.GlobalArg("box_source_counts_nonchild", + None, shape=None), + lp.GlobalArg("source_box_starts", + None, shape=None), + lp.GlobalArg("source_box_lists", + None, shape=None), lp.GlobalArg("strength", None, shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, shape="nkernels, ntargets", dim_tags="sep,C"), "..." - ]) + ]) - from sumpy.tools import gather_loopy_source_arguments loopy_knl = lp.make_kernel([ "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}", @@ -411,36 +434,36 @@ class P2PFromCSR(P2PBase): self.get_kernel_scaling_assignments() + [""" for itgt_box - <> tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] - - for itgt - for isrc - <> d[idim] = \ - targets[idim, itgt] - sources[idim, isrc] {dup=idim} + <> tgt_ibox = target_boxes[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] + + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_end = source_box_starts[itgt_box+1] + + for isrc_box + <> src_ibox = source_box_lists[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for itgt + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} """] + [""" - <> is_self = (isrc == target_to_source[itgt]) + <> is_self = (isrc == target_to_source[itgt]) """ if self.exclude_self else ""] - + loopy_insns + exprs + + loopy_insns + kernel_exprs + [""" - end - result[{i}, itgt] = result[{i}, itgt] + \ - knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) + end + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) """.format(i=iknl) - for iknl in range(len(exprs))] + for iknl in range(len(self.kernels))] + [""" - end end end + end """], arguments, assumptions="ntgt_boxes>=1", @@ -448,7 +471,11 @@ class P2PFromCSR(P2PBase): fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nkernels=len(self.kernels))) + nkernels=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 0f617e62990c4aad8f187b03d22e2ac9c72b46fd..6e8be4911aec45224e1cbab99099699ca358d7ce 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -27,7 +27,7 @@ THE SOFTWARE. import six -from six.moves import range, zip +from six.moves import range import numpy as np import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -103,46 +103,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def expansions(self): return self.kernels - def get_compute_a_and_b_vecs(self): - return """ - <> a[idim] = center[idim,itgt] - src[idim,isrc] {dup=idim} - <> b[idim] = tgt[idim,itgt] - center[idim,itgt] {dup=idim} - <> rscale = expansion_radii[itgt] - """ - - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("src", None, - shape=(self.dim, "nsources"), order="C"), - lp.GlobalArg("tgt", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("center", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("expansion_radii", None, shape="ntargets"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" - - @memoize_method - def get_kernel(self): + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector - avec = make_sym_vector("a", self.dim) bvec = make_sym_vector("b", self.dim) @@ -152,7 +114,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): logger.info("compute expansion expressions: start") result_names = [expand(i, sac, expn, avec, bvec) - for i, expn in enumerate(self.expansions)] + for i, expn in enumerate(self.expansions)] logger.info("compute expansion expressions: done") @@ -168,55 +130,43 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): complex_dtype=np.complex128 # FIXME ) - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] - - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + gather_loopy_source_arguments(self.kernels)) + return loopy_insns, result_names - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + [self.get_compute_a_and_b_vecs()] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes)) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - name=self.name, - assumptions=self.get_assumptions(), - default_offset=lp.auto, - silenced_warnings="write_race(write_lpot*)", - fixed_parameters=dict(dim=self.dim), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + def get_kernel_exprs(self, result_names): + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] - for expn in self.expansions: - loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("src", None, + shape=(self.dim, "nsources"), order="C"), + lp.GlobalArg("tgt", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("center", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("expansion_radii", + None, shape="ntargets"), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError - @memoize_method def get_optimized_kernel(self): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() - # FIXME: how to tune for blocks? import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: @@ -242,28 +192,49 @@ class LayerPotential(LayerPotentialBase): default_name = "qbx_apply" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("strength_%d" % i, + None, shape="nsources", order="C") + for i in range(self.strength_count)] + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_lpot,inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in range(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C") - for i in range(len(self.kernels)) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[itgt] = \ - knl_{i}_scaling*simul_reduce( - sum, isrc, pair_result_{i}) \ - {{id_prefix=write_lpot,inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, **kwargs): @@ -293,23 +264,48 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[itgt, isrc] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=isrc:itgt} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets, nsources", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt, isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=isrc:itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, **kwargs) @@ -320,76 +316,83 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): # {{{ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): + """Generator for a subset of the layer potential matrix entries.""" + default_name = "qbx_block" - def get_src_tgt_arguments(self): - return LayerPotentialBase.get_src_tgt_arguments(self) \ - + [ + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("srcindices", None, shape="nsrcindices"), lp.GlobalArg("tgtindices", None, shape="ntgtindices"), lp.GlobalArg("srcranges", None, shape="nranges + 1"), lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nsrcindices", None), + lp.ValueArg("ntgtindices", None), lp.ValueArg("nranges", None) - ] - - def get_domains(self): - # FIXME: this doesn't work when separating j and k - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ + ] + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntgtindices, nsrcindices") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}", + "{[idim]: 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] + <> ntgtblock = tgtend - tgtstart + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] + <> nsrcblock = srcend - srcstart + + for itgt, isrc + <> a[idim] = center[idim, tgtindices[tgtstart + itgt]] - \ + src[idim, srcindices[srcstart + isrc]] \ + {dup=idim} + <> b[idim] = tgt[idim, tgtindices[tgtstart + itgt]] - \ + center[idim, tgtindices[tgtstart + itgt]] \ + {dup=idim} + <> rscale = expansion_radii[tgtindices[tgtstart + itgt]] + """] + + loopy_insns + kernel_exprs + + [""" + result_{i}[tgtstart + itgt, srcstart + isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + [""" end end - """ - ] - - def get_strength_or_not(self, isrc, kernel_idx): - return 1 + """], + arguments, + name=self.name, + assumptions="nranges>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int64, ntargets=np.int64)) - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[tgtstart + j, srcstart + k] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) - def get_assumptions(self): - return "nranges>=1" + return loopy_knl - @memoize_method def get_optimized_kernel(self): - # FIXME loopy_knl = self.get_kernel() loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") @@ -397,7 +400,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def __call__(self, queue, targets, sources, centers, expansion_radii, tgtindices, srcindices, tgtranges, srcranges, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii,