diff --git a/benchmarks/bench_translations.py b/benchmarks/bench_translations.py index ed97ed5456d00fcdc99daeb8e14841db8155b4dd..9e01c72ab7da80866376295202ef74605833c20c 100644 --- a/benchmarks/bench_translations.py +++ b/benchmarks/bench_translations.py @@ -64,9 +64,15 @@ class TranslationBenchmarkSuite: dvec = sym.make_sym_vector("d", knl.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") - result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, - dvec, tgt_rscale) sac = SymbolicAssignmentCollection() + try: + result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, + dvec, tgt_rscale, sac) + except TypeError: + # Support older interface to make it possible to compare + # in CI run + result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, + dvec, tgt_rscale) for i, expr in enumerate(result): sac.assign_unique("coeff%d" % i, expr) sac.run_global_cse() @@ -76,7 +82,7 @@ class TranslationBenchmarkSuite: return sum([counter.rec(insn.expression)+1 for insn in insns]) track_m2l_op_count.unit = "ops" - track_m2l_op_count.timeout = 200.0 + track_m2l_op_count.timeout = 300.0 class LaplaceVolumeTaylorTranslation(TranslationBenchmarkSuite): diff --git a/sumpy/__init__.py b/sumpy/__init__.py index 31699b03738b2b82bf61d1ab47c13d8197ae35a3..3351ae45639d2a29bf93d241da22b05d62f76146 100644 --- a/sumpy/__init__.py +++ b/sumpy/__init__.py @@ -25,7 +25,8 @@ from sumpy.p2p import P2P, P2PFromCSR from sumpy.p2e import P2EFromSingleBox, P2EFromCSR from sumpy.e2p import E2PFromSingleBox, E2PFromCSR from sumpy.e2e import (E2EFromCSR, E2EFromChildren, E2EFromParent, - E2EFromCSRTranslationInvariant, E2EFromCSRTranslationClassesPrecompute) + E2EFromCSRTranslationInvariant, E2EFromCSRTranslationClassesPrecompute, + E2EFromCSRWithFFTPreprocess) from sumpy.version import VERSION_TEXT from pytools.persistent_dict import WriteOncePersistentDict @@ -34,7 +35,8 @@ __all__ = [ "P2EFromSingleBox", "P2EFromCSR", "E2PFromSingleBox", "E2PFromCSR", "E2EFromCSR", "E2EFromChildren", "E2EFromParent", - "E2EFromCSRTranslationInvariant", "E2EFromCSRTranslationClassesPrecompute"] + "E2EFromCSRTranslationInvariant", "E2EFromCSRTranslationClassesPrecompute", + "E2EFromCSRWithFFTPreprocess"] code_cache = WriteOncePersistentDict("sumpy-code-cache-v6-"+VERSION_TEXT) diff --git a/sumpy/assignment_collection.py b/sumpy/assignment_collection.py index df4502aba08b7064c30a297465a02e991d927e73..54a825cb1a1eaae0310ae1395ffe1817a8ac3ecc 100644 --- a/sumpy/assignment_collection.py +++ b/sumpy/assignment_collection.py @@ -22,6 +22,7 @@ THE SOFTWARE. import sumpy.symbolic as sym +from collections import OrderedDict import logging logger = logging.getLogger(__name__) @@ -95,14 +96,9 @@ class SymbolicAssignmentCollection: by this class, but not expressions using names defined in this collection. """ - def __init__(self, assignments=None): - """ - :arg assignments: mapping from *var_name* to expression - """ - - if assignments is None: - assignments = {} + def __init__(self): + assignments = OrderedDict() self.assignments = assignments self.reversed_assignments = {v: k for (k, v) in assignments.items()} @@ -191,18 +187,21 @@ class SymbolicAssignmentCollection: #from sumpy.symbolic import checked_cse from sumpy.cse import cse - new_assignments, new_exprs = cse(assign_exprs + extra_exprs, + new_assignments_list, new_exprs = cse(assign_exprs + extra_exprs, symbols=self.symbol_generator) new_assign_exprs = new_exprs[:len(assign_exprs)] new_extra_exprs = new_exprs[len(assign_exprs):] + new_assignments = OrderedDict() + for name, value in new_assignments_list: + assert isinstance(name, sym.Symbol) + new_assignments[name.name] = value + for name, new_expr in zip(assign_names, new_assign_exprs): - self.assignments[name] = new_expr + new_assignments[name] = new_expr - for name, value in new_assignments: - assert isinstance(name, sym.Symbol) - self.add_assignment(name.name, value) + self.assignments = new_assignments for name, new_expr in zip(assign_names, new_assign_exprs): # We want the assignment collection to be ordered correctly diff --git a/sumpy/codegen.py b/sumpy/codegen.py index 5120b6e5481f3132d9ad016c95d9aefc634040b9..1da10d00f521dc82c3ce1fa17d3cd8ea6a948154 100644 --- a/sumpy/codegen.py +++ b/sumpy/codegen.py @@ -571,24 +571,30 @@ class BigIntegerKiller(CSECachingMapperMixin, IdentityMapper): # }}} -# {{{ convert 123000000j to 123000000 * 1j +# {{{ convert complex to np.complex class ComplexRewriter(CSECachingMapperMixin, IdentityMapper): - def __init__(self, float_type=np.float32): + def __init__(self, complex_dtype=None): IdentityMapper.__init__(self) - self.float_type = float_type + self.complex_dtype = complex_dtype def map_constant(self, expr): - """Convert complex values not within complex64 to a product for loopy + """Convert complex values to numpy types """ - if not isinstance(expr, complex): + if not isinstance(expr, (complex, np.complex64, np.complex128)): return IdentityMapper.map_constant(self, expr) - if complex(self.float_type(expr.imag)) == expr.imag: - return IdentityMapper.map_constant(self, expr) + complex_dtype = self.complex_dtype + if complex_dtype is None: + if complex(np.complex64(expr)) == expr: + return np.complex64(expr) + complex_dtype = np.complex128 - return expr.real + prim.Product((expr.imag, 1j)) + if isinstance(complex_dtype, np.dtype): + return complex_dtype.type(expr) + else: + return complex_dtype(expr) map_common_subexpression_uncached = IdentityMapper.map_common_subexpression @@ -698,7 +704,7 @@ def to_loopy_insns(assignments, vector_names=set(), pymbolic_expr_maps=[], ssg = SumSignGrouper() fck = FractionKiller() bik = BigIntegerKiller() - cmr = ComplexRewriter() + cmr = ComplexRewriter(complex_dtype) def convert_expr(name, expr): logger.debug("generate expression for: %s" % name) diff --git a/sumpy/cse.py b/sumpy/cse.py index 0da0eadc7f1c0afb4c29dafe1569f19b5c36000d..d93cb503d76b433ee0724b7054dc9ad2d84f501b 100644 --- a/sumpy/cse.py +++ b/sumpy/cse.py @@ -143,7 +143,6 @@ class FuncArgTracker: for func_i, func in enumerate(funcs): func_argset = set() - for func_arg in func.args: arg_number = self.get_or_add_value_number(func_arg) func_argset.add(arg_number) diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 236090f3481566f75e2f259d291401e13eb195ac..1865833f4caf228824fdec37ca57f9f8dee46d9a 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -25,7 +25,7 @@ import loopy as lp import sumpy.symbolic as sym from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from sumpy.tools import KernelCacheWrapper +from sumpy.tools import KernelCacheWrapper, to_complex_dtype import logging logger = logging.getLogger(__name__) @@ -81,6 +81,7 @@ class E2EBase(KernelCacheWrapper): self.tgt_expansion = tgt_expansion self.name = name or self.default_name self.device = device + self.use_fft = self.tgt_expansion.use_fft if src_expansion.dim != tgt_expansion.dim: raise ValueError("source and target expansions must have " @@ -115,14 +116,14 @@ class E2EBase(KernelCacheWrapper): vector_names={"d"}, pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], retain_names=tgt_coeff_names, - complex_dtype=np.complex128 # FIXME ) def get_cache_key(self): return ( type(self).__name__, self.src_expansion, - self.tgt_expansion) + self.tgt_expansion, + ) def get_optimized_kernel(self): # FIXME @@ -222,6 +223,7 @@ class E2EFromCSR(E2EBase): loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr")) + loopy_knl = lp.set_options(loopy_knl, enforce_variable_access_ordered="no_check") return loopy_knl @@ -503,6 +505,457 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): # }}} +# {{{ + +class E2EFromCSRTranslationInvariant(E2EFromCSR): + """Implements translation from a "compressed sparse row"-like source box + list for translation invariant Kernels. + """ + default_name = "e2e_from_csr_translation_invariant" + + def get_translation_loopy_insns(self, result_dtype): + from sumpy.symbolic import make_sym_vector + dvec = make_sym_vector("d", self.dim) + + src_rscale = sym.Symbol("src_rscale") + + tgt_rscale = sym.Symbol("tgt_rscale") + + nprecomputed_exprs = \ + self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) + + if self.use_fft: + ncoeff_src = nprecomputed_exprs + else: + ncoeff_src = len(self.src_expansion) + + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(ncoeff_src)] + + precomputed_exprs = [sym.Symbol("precomputed_expr%d" % i) + for i in range(nprecomputed_exprs)] + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + tgt_coeff_names = [ + sac.assign_unique("coeff%d" % i, coeff_i) + for i, coeff_i in enumerate( + self.tgt_expansion.translate_from( + self.src_expansion, src_coeff_exprs, src_rscale, + dvec, tgt_rscale, sac=sac, + precomputed_exprs=precomputed_exprs))] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + return to_loopy_insns( + six.iteritems(sac.assignments), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=tgt_coeff_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + + def get_postprocess_loopy_insns(self, result_dtype): + + ncoeff_tgt = len(self.tgt_expansion) + nprecomputed_exprs = \ + self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) + + if self.use_fft: + ncoeff_tgt = nprecomputed_exprs + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + + tgt_coeff_exprs = [ + sym.Symbol("coeff_sum%d" % i) for i in range(ncoeff_tgt) + ] + + src_rscale = sym.Symbol("src_rscale") + tgt_rscale = sym.Symbol("tgt_rscale") + + if self.use_fft: + tgt_coeff_post_exprs = self.tgt_expansion.m2l_postprocess_exprs( + self.src_expansion, tgt_coeff_exprs, src_rscale, tgt_rscale, + sac=sac, use_fft=self.use_fft) + if result_dtype in (np.float32, np.float64): + real_func = sym.Function("real") + tgt_coeff_post_exprs = [real_func(expr) for expr in \ + tgt_coeff_post_exprs] + else: + tgt_coeff_post_exprs = tgt_coeff_exprs + + tgt_coeff_post_names = [ + sac.assign_unique("coeff_post%d" % i, coeff) + for i, coeff in enumerate(tgt_coeff_post_exprs) + ] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + insns = to_loopy_insns( + six.iteritems(sac.assignments), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=tgt_coeff_post_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + return insns + + + def get_kernel(self, result_dtype): + ncoeff_src = len(self.src_expansion) + ncoeff_tgt = len(self.tgt_expansion) + ncoeff_tgt_post = len(self.tgt_expansion) + nprecomputed_exprs = \ + self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion, + use_fft=self.use_fft) + + if self.use_fft: + ncoeff_src = nprecomputed_exprs + ncoeff_tgt = nprecomputed_exprs + + # To clarify terminology: + # + # isrc_box -> The index in a list of (in this case, source) boxes + # src_ibox -> The (global) box number for the (in this case, source) box + # + # (same for itgt_box, tgt_ibox) + + from sumpy.tools import gather_loopy_arguments + loopy_knl = lp.make_kernel( + [ + "{[itgt_box]: 0<=itgt_box<ntgt_boxes}", + "{[isrc_box]: isrc_start<=isrc_box<isrc_stop}", + "{[idim]: 0<=idim<dim}", + ], + [""" + for itgt_box + <> tgt_ibox = target_boxes[itgt_box] + + <> tgt_center[idim] = centers[idim, tgt_ibox] \ + + <> isrc_start = src_box_starts[itgt_box] + <> isrc_stop = src_box_starts[itgt_box+1] + + for isrc_box + <> src_ibox = src_box_lists[isrc_box] \ + {id=read_src_ibox} + + <> src_center[idim] = centers[idim, src_ibox] {dup=idim} + <> d[idim] = tgt_center[idim] - src_center[idim] \ + {dup=idim} + <> translation_class = \ + m2l_translation_classes_lists[isrc_box] + <> translation_class_rel = translation_class - \ + translation_classes_level_start + """] + [""" + <> precomputed_expr{idx} = \ + m2l_precomputed_exprs[translation_class_rel, {idx}] + """.format(idx=idx) for idx in range( + nprecomputed_exprs)] + [""" + <> src_coeff{coeffidx} = \ + src_expansions[src_ibox - src_base_ibox, {coeffidx}] \ + {{dep=read_src_ibox}} + """.format(coeffidx=i) for i in range(ncoeff_src)] + [ + + ] + self.get_translation_loopy_insns(result_dtype) + [""" + end + """] + [""" + <> coeff_sum{coeffidx} = \ + simul_reduce(sum, isrc_box, coeff{coeffidx}) + """.format(coeffidx=i) for i in range(ncoeff_tgt)] + [ + ] + self.get_postprocess_loopy_insns(result_dtype) + [""" + tgt_expansions[tgt_ibox - tgt_base_ibox, {coeffidx}] = \ + coeff_post{coeffidx} {{id_prefix=write_expn}} + """.format(coeffidx=i) for i in range(ncoeff_tgt_post)] + [""" + end + """], + [ + lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), + lp.ValueArg("src_rscale,tgt_rscale", None), + lp.GlobalArg("src_box_starts, src_box_lists", + None, shape=None, strides=(1,), offset=lp.auto), + lp.ValueArg("aligned_nboxes,tgt_base_ibox,src_base_ibox", + np.int32), + lp.ValueArg("nsrc_level_boxes,ntgt_level_boxes", + np.int32), + lp.ValueArg("translation_classes_level_start", + np.int32), + lp.GlobalArg("src_expansions", None, + shape=("nsrc_level_boxes", ncoeff_src), offset=lp.auto), + lp.GlobalArg("tgt_expansions", None, offset=lp.auto, + shape=("ntgt_level_boxes", ncoeff_tgt_post)), + lp.ValueArg("ntranslation_classes, ntranslation_classes_lists", + np.int32), + lp.GlobalArg("m2l_translation_classes_lists", np.int32, + shape=("ntranslation_classes_lists"), strides=(1,), + offset=lp.auto), + lp.GlobalArg("m2l_precomputed_exprs", None, + shape=("ntranslation_classes", nprecomputed_exprs), + offset=lp.auto), + "..." + ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), + name=self.name, + assumptions="ntgt_boxes>=1", + silenced_warnings="write_race(write_expn*)", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim, + nprecomputed_exprs=nprecomputed_exprs), + lang_version=MOST_RECENT_LANGUAGE_VERSION + ) + + for expn in [self.src_expansion, self.tgt_expansion]: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr")) + loopy_knl = lp.set_options(loopy_knl, enforce_variable_access_ordered="no_check") + return loopy_knl + + def get_optimized_kernel(self, result_dtype): + # FIXME + knl = self.get_kernel(result_dtype) + knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") + + #print(knl) + import pickle + with open("complex_type_inference.lp.knl", "wb") as f: + pickle.dump(knl, f) + return knl + + + def __call__(self, queue, **kwargs): + """ + :arg src_expansions: + :arg tgt_expansions: + :arg src_box_starts: + :arg src_box_lists: + :arg src_rscale: + :arg tgt_rscale: + :arg centers: + """ + centers = kwargs.pop("centers") + # "1" may be passed for rscale, which won't have its type + # meaningfully inferred. Make the type of rscale explicit. + src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) + tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) + + tgt_expansions = kwargs.pop("tgt_expansions") + result_dtype = tgt_expansions.dtype + + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + + return knl(queue, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + tgt_expansions=tgt_expansions, + **kwargs) + +class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): + """Implements precomputing the translation classes dependent + derivatives. + """ + default_name = "e2e_from_csr_translation_classes_precompute" + + def get_translation_loopy_insns(self, result_dtype): + from sumpy.symbolic import make_sym_vector + dvec = make_sym_vector("d", self.dim) + + src_rscale = sym.Symbol("src_rscale") + tgt_rscale = sym.Symbol("tgt_rscale") + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + tgt_coeff_names = [ + sac.assign_unique("precomputed_expr%d" % i, coeff_i) + for i, coeff_i in enumerate( + self.tgt_expansion.m2l_global_precompute_exprs( + self.src_expansion, src_rscale, dvec, tgt_rscale, + sac=sac))] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + return to_loopy_insns( + six.iteritems(sac.assignments), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=tgt_coeff_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + + def get_kernel(self, result_dtype): + nprecomputed_exprs = \ + self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) + from sumpy.tools import gather_loopy_arguments + loopy_knl = lp.make_kernel( + [ + "{[itr_class]: 0<=itr_class<ntranslation_classes}", + "{[idim]: 0<=idim<dim}", + ], + [""" + for itr_class + <> d[idim] = m2l_translation_vectors[idim, \ + itr_class + translation_classes_level_start] {dup=idim} + + """] + self.get_translation_loopy_insns(result_dtype) + [""" + m2l_precomputed_exprs[itr_class, {idx}] = precomputed_expr{idx} + """.format(idx=i) for i in range(nprecomputed_exprs)] + [""" + end + """], + [ + lp.ValueArg("src_rscale", None), + lp.GlobalArg("m2l_precomputed_exprs", None, + shape=("ntranslation_classes", nprecomputed_exprs), + offset=lp.auto), + lp.GlobalArg("m2l_translation_vectors", None, + shape=("dim", "ntranslation_vectors")), + lp.ValueArg("ntranslation_classes", np.int32), + lp.ValueArg("ntranslation_vectors", np.int32), + lp.ValueArg("translation_classes_level_start", np.int32), + "..." + ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), + name=self.name, + assumptions="ntranslation_classes>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION + ) + + for expn in [self.src_expansion, self.tgt_expansion]: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr")) + loopy_knl = lp.set_options(loopy_knl, enforce_variable_access_ordered="no_check") + + return loopy_knl + + def get_optimized_kernel(self, result_dtype): + # FIXME + knl = self.get_kernel(result_dtype=result_dtype) + knl = lp.split_iname(knl, "itr_class", 16, outer_tag="g.0") + + return knl + + def __call__(self, queue, **kwargs): + """ + :arg src_rscale: + :arg translation_classes_level_start: + :arg ntranslation_classes: + :arg m2l_precomputed_exprs: + :arg m2l_translation_vectors: + """ + m2l_translation_vectors = kwargs.pop("m2l_translation_vectors") + # "1" may be passed for rscale, which won't have its type + # meaningfully inferred. Make the type of rscale explicit. + src_rscale = m2l_translation_vectors.dtype.type(kwargs.pop("src_rscale")) + + m2l_precomputed_exprs = kwargs.pop("m2l_precomputed_exprs") + result_dtype = m2l_precomputed_exprs.dtype + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + + return knl(queue, + src_rscale=src_rscale, + m2l_translation_vectors=m2l_translation_vectors, + m2l_precomputed_exprs=m2l_precomputed_exprs, + **kwargs) + + +class E2EFromCSRWithFFTPreprocess(E2EFromCSR): + """Implements preprocessing the multipole expansions + for M2L with FFT. + """ + default_name = "e2e_from_csr_with_fft_preprocess" + + def get_translation_loopy_insns(self, result_dtype): + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(len(self.src_expansion))] + + src_rscale = sym.Symbol("src_rscale") + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + + pp_src_coeff_names = [ + sac.assign_unique("pp_src_coeff%d" % i, coeff_i) + for i, coeff_i in enumerate( + self.tgt_expansion.m2l_preprocess_exprs( + self.src_expansion, src_coeff_exprs, + sac=sac, src_rscale=src_rscale))] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + return to_loopy_insns( + six.iteritems(sac.assignments), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=pp_src_coeff_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + + def get_kernel(self, result_dtype): + nsrc_coeffs = len(self.src_expansion) + npp_src_coeffs = \ + self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) + from sumpy.tools import gather_loopy_arguments + loopy_knl = lp.make_kernel( + [ + "{[isrc_box]: 0<=isrc_box<nsrc_boxes}", + ], + [""" + for isrc_box + """] + [""" + <> src_coeff{idx} = src_expansions[isrc_box, {idx}] + """.format(idx=i) for i in range(nsrc_coeffs)] + [ + ] + self.get_translation_loopy_insns(result_dtype) + [""" + pp_src_expansions[isrc_box, {idx}] = pp_src_coeff{idx} + """.format(idx=i) for i in range(npp_src_coeffs)] + [""" + end + """], + [ + lp.ValueArg("nsrc_boxes", np.int32), + lp.ValueArg("src_rscale", lp.auto), + lp.GlobalArg("src_expansions", None, + shape=("nsrc_boxes", nsrc_coeffs), offset=lp.auto), + lp.GlobalArg("pp_src_expansions", None, + shape=("nsrc_boxes", npp_src_coeffs), offset=lp.auto), + "..." + ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), + name=self.name, + assumptions="nsrc_boxes>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION + ) + + for expn in [self.src_expansion, self.tgt_expansion]: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.set_options(loopy_knl, enforce_variable_access_ordered="no_check") + return loopy_knl + + def get_optimized_kernel(self, result_dtype): + # FIXME + knl = self.get_kernel(result_dtype) + knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0") + return knl + + def __call__(self, queue, **kwargs): + """ + :arg src_expansions + :arg pp_src_expansions + """ + pp_src_expansions = kwargs.pop("pp_src_expansions") + result_dtype = pp_src_expansions.dtype + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + return knl(queue, pp_src_expansions=pp_src_expansions, **kwargs) +# }}} + + # {{{ translation from a box's children class E2EFromChildren(E2EBase): @@ -594,6 +1047,7 @@ class E2EFromChildren(E2EBase): loopy_knl = knl.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.set_options(loopy_knl, enforce_variable_access_ordered="no_check") return loopy_knl diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py index 79bc9cb8e79139f4b2470d4b88c69018992ca7f7..c8e4400b3d84a72b5c3da3865bf8583ca2ae40cc 100644 --- a/sumpy/expansion/__init__.py +++ b/sumpy/expansion/__init__.py @@ -53,6 +53,7 @@ class ExpansionBase: .. automethod:: translate_from .. automethod:: __eq__ .. automethod:: __ne__ + .. automethod:: get_kernel_derivative_taker """ init_arg_names = ("kernel", "order", "use_rscale") @@ -114,8 +115,8 @@ class ExpansionBase: :arg avec: vector from source to center. :arg bvec: vector from center to target. Not usually necessary, except for line-Taylor expansion. - :arg sac: a symbolic assignment collection where temporary - expressions are stored. + :arg sac: A SymbolicAssignmentCollction used for storing + temporary expressions. :returns: a list of :mod:`sympy` expressions representing the coefficients of the expansion. @@ -144,6 +145,9 @@ class ExpansionBase: def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): """ + :arg sac: A SymbolicAssignmentCollction used for storing + temporary expressions. + :return: a :mod:`sympy` expression corresponding to the evaluated expansion with the coefficients in *coeffs*. @@ -185,7 +189,6 @@ class ExpansionBase: return type(self)(**new_kwargs) - # }}} @@ -204,11 +207,11 @@ class ExpansionTermsWrangler: raise NotImplementedError def get_full_kernel_derivatives_from_stored(self, stored_kernel_derivatives, - rscale, sac=None): + rscale): raise NotImplementedError def get_stored_mpole_coefficients_from_full(self, full_mpole_coefficients, - rscale, sac=None): + rscale): raise NotImplementedError @memoize_method @@ -321,7 +324,7 @@ class FullExpansionTermsWrangler(ExpansionTermsWrangler): ExpansionTermsWrangler.get_full_coefficient_identifiers) def get_full_kernel_derivatives_from_stored(self, stored_kernel_derivatives, - rscale, sac=None): + rscale): return stored_kernel_derivatives get_stored_mpole_coefficients_from_full = ( @@ -687,6 +690,11 @@ class LaplaceConformingVolumeTaylorExpansion(VolumeTaylorExpansionBase): def __init__(self, kernel, order, use_rscale): self.expansion_terms_wrangler_key = (order, kernel.dim) + def get_kernel_derivative_taker(self, dvec, rscale, sac): + from sumpy.tools import LaplaceDerivativeTaker + return LaplaceDerivativeTaker(self.kernel.get_expression(dvec), dvec, + rscale, sac) + class HelmholtzConformingVolumeTaylorExpansion(VolumeTaylorExpansionBase): @@ -698,6 +706,11 @@ class HelmholtzConformingVolumeTaylorExpansion(VolumeTaylorExpansionBase): helmholtz_k_name = kernel.get_base_kernel().helmholtz_k_name self.expansion_terms_wrangler_key = (order, kernel.dim, helmholtz_k_name) + def get_kernel_derivative_taker(self, dvec, rscale, sac): + from sumpy.tools import HelmholtzYukawaDerivativeTaker + return HelmholtzYukawaDerivativeTaker(self.kernel.get_expression(dvec), dvec, + rscale, sac) + class BiharmonicConformingVolumeTaylorExpansion(VolumeTaylorExpansionBase): @@ -708,7 +721,6 @@ class BiharmonicConformingVolumeTaylorExpansion(VolumeTaylorExpansionBase): def __init__(self, kernel, order, use_rscale): self.expansion_terms_wrangler_key = (order, kernel.dim) - # }}} diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index ff970124b5b7b4a20cbfc60ce605411a29ac6d0f..bffeca35a214e4ce2b196e577ad79ced20dfffc9 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ import sumpy.symbolic as sym -from sumpy.tools import add_to_sac +from sumpy.tools import add_to_sac, fft from sumpy.expansion import ( ExpansionBase, VolumeTaylorExpansion, LaplaceConformingVolumeTaylorExpansion, @@ -187,9 +187,13 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): return kernel.postprocess_at_target(result, bvec) def m2l_global_precompute_nexpr(self, src_expansion): - return len(self.m2l_global_precompute_mis(src_expansion)[1]) + if self.use_fft: + nexpr = len(self._m2l_global_precompute_mis(src_expansion)[0]) + else: + nexpr = len(self._m2l_global_precompute_mis(src_expansion)[1]) + return nexpr - def m2l_global_precompute_mis(self, src_expansion): + def _m2l_global_precompute_mis(self, src_expansion): from pytools import generate_nonnegative_integer_tuples_below as gnitb from sumpy.tools import add_mi @@ -215,6 +219,7 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): def m2l_global_precompute_exprs(self, src_expansion, src_rscale, dvec, tgt_rscale, sac): + # We know the general form of the multipole expansion is: # # coeff0 * diff(kernel(src - c1), mi0) + @@ -234,7 +239,11 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): src_rscale = 1 toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ - self.m2l_global_precompute_mis(src_expansion) + self._m2l_global_precompute_mis(src_expansion) + + toeplitz_matrix_ident_to_index = dict((ident, i) for i, ident in + enumerate(toeplitz_matrix_coeffs)) +>>>>>>> 4b95905766d950f99b371222b93ff36b1b12358b # Create a expansion terms wrangler for derivatives up to order # (tgt order)+(src order) including a corresponding reduction matrix @@ -269,8 +278,57 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): for i, term in enumerate(needed_vector_terms): vector[i] = add_to_sac(sac, vector_full[srcplusderiv_ident_to_index[term]]) + + if self.use_fft: + # Add zero values needed to make the translation matrix toeplitz + derivatives_full = [0]*len(toeplitz_matrix_coeffs) + for expr, mi in zip(vector, needed_vector_terms): + derivatives_full[toeplitz_matrix_ident_to_index[mi]] = expr + return self._fft(list(reversed(derivatives_full)), sac=sac) + return vector + def m2l_preprocess_exprs(self, src_expansion, src_coeff_exprs, sac, + src_rscale): + toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ + self._m2l_global_precompute_mis(src_expansion) + toeplitz_matrix_ident_to_index = dict((ident, i) for i, ident in + enumerate(toeplitz_matrix_coeffs)) + + # Calculate the first row of the upper triangular Toeplitz matrix + toeplitz_first_row = [0] * len(toeplitz_matrix_coeffs) + for coeff, term in zip( + src_coeff_exprs, + src_expansion.get_coefficient_identifiers()): + toeplitz_first_row[toeplitz_matrix_ident_to_index[term]] = \ + add_to_sac(sac, coeff) + + if self.use_fft: + return self._fft(toeplitz_first_row, sac=sac) + else: + return toeplitz_first_row + + def m2l_postprocess_exprs(self, src_expansion, m2l_result, src_rscale, + tgt_rscale, sac): + toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ + self._m2l_global_precompute_mis(src_expansion) + toeplitz_matrix_ident_to_index = dict((ident, i) for i, ident in + enumerate(toeplitz_matrix_coeffs)) + + if self.use_fft: + n = len(toeplitz_matrix_coeffs) + m2l_result = fft(m2l_result, inverse=True, sac=sac) + m2l_result = list(reversed(m2l_result[:n])) + + # Filter out the dummy rows and scale them for target + result = [] + rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale) + for term in self.get_coefficient_identifiers(): + index = toeplitz_matrix_ident_to_index[term] + result.append(m2l_result[index]*rscale_ratio**sum(term)) + + return result + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, dvec, tgt_rscale, sac=None, _fast_version=True, precomputed_exprs=None): @@ -286,9 +344,10 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): tgt_rscale = 1 from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansionBase + if isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase): toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ - self.m2l_global_precompute_mis(src_expansion) + self._m2l_global_precompute_mis(src_expansion) toeplitz_matrix_ident_to_index = dict((ident, i) for i, ident in enumerate(toeplitz_matrix_coeffs)) @@ -298,32 +357,33 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): else: derivatives = precomputed_exprs + if self.use_fft: + assert precomputed_exprs is not None + assert len(src_coeff_exprs) == len(precomputed_exprs) + result = [] + for i in range(len(precomputed_exprs)): + a = precomputed_exprs[i] + b = src_coeff_exprs[i] + result.append(a*b) + return result + derivatives_full = [0]*len(toeplitz_matrix_coeffs) for expr, mi in zip(derivatives, needed_vector_terms): derivatives_full[toeplitz_matrix_ident_to_index[mi]] = expr - # Calculate the first row of the upper triangular Toeplitz matrix - toeplitz_first_row = [0] * len(toeplitz_matrix_coeffs) - for coeff, term in zip( - src_coeff_exprs, - src_expansion.get_coefficient_identifiers()): - toeplitz_first_row[toeplitz_matrix_ident_to_index[term]] = \ - add_to_sac(sac, coeff) + toeplitz_first_row = self.m2l_preprocess_exprs(src_expansion, + src_coeff_exprs, sac) # Do the matvec - if self.use_fft: + if 0: output = fft_toeplitz_upper_triangular(toeplitz_first_row, - derivatives_full) + derivatives_full, sac=sac) else: output = matvec_toeplitz_upper_triangular(toeplitz_first_row, derivatives_full) - # Filter out the dummy rows and scale them for target - result = [] - rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale) - for term in self.get_coefficient_identifiers(): - index = toeplitz_matrix_ident_to_index[term] - result.append(output[index]*rscale_ratio**sum(term)) + result = self.m2l_postprocess_exprs(src_expansion, output, src_rscale, + tgt_rscale, sac) logger.info("building translation operator: done") return result @@ -533,6 +593,8 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): if not self.use_rscale: rscale = 1 + if knl is None: + knl = self.kernel from sumpy.symbolic import sym_real_norm_2 bessel_j = sym.Function("bessel_j") @@ -548,8 +610,69 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): * sym.exp(sym.I * c * -target_angle_rel_center), bvec) for c in self.get_coefficient_identifiers()) + def m2l_global_precompute_nexpr(self, src_expansion): + nexpr = 4 * self.order + 1 + return nexpr + + def m2l_global_precompute_exprs(self, src_expansion, src_rscale, + dvec, tgt_rscale, sac): + + from sumpy.symbolic import sym_real_norm_2 + from sumpy.tools import fft + + dvec_len = sym_real_norm_2(dvec) + hankel_1 = sym.Function("hankel_1") + new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) + arg_scale = self.get_bessel_arg_scaling() + assert self.order == src_expansion.order + precomputed_exprs = [0] * (4*self.order + 1) + for j in self.get_coefficient_identifiers(): + for m in self.get_coefficient_identifiers(): + precomputed_exprs[m + j + 2 * self.order] = ( + hankel_1(m + j, arg_scale * dvec_len) + * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center)) + + if self.use_fft: + order = self.order + first, last = precomputed_exprs[:2*order], precomputed_exprs[2*order:] + return fft(list(last)+list(first), sac) + + return precomputed_exprs + + def m2l_preprocess_exprs(self, src_expansion, src_coeff_exprs, sac, + src_rscale): + + from sumpy.tools import fft + src_coeff_exprs = list(src_coeff_exprs) + for m in src_expansion.get_coefficient_identifiers(): + src_coeff_exprs[src_expansion.get_storage_index(m)] *= src_rscale**abs(m) + + if self.use_fft: + src_coeff_exprs = list(reversed(src_coeff_exprs)) + src_coeff_exprs += [0] * (len(src_coeff_exprs) - 1) + res = fft(src_coeff_exprs, sac=sac) + return res + else: + return src_coeff_exprs + + def m2l_postprocess_exprs(self, src_expansion, m2l_result, src_rscale, + tgt_rscale, sac): + + if self.use_fft: + n = 2 * self.order + 1 + m2l_result = fft(m2l_result, inverse=True, sac=sac) + m2l_result = m2l_result[:2*self.order+1] + + # Filter out the dummy rows and scale them for target + result = [] + for j in self.get_coefficient_identifiers(): + result.append(m2l_result[j + self.order] \ + * tgt_rscale**(abs(j)) * sym.Integer(-1)**j) + + return result + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, - dvec, tgt_rscale, sac=None): + dvec, tgt_rscale, sac=None, precomputed_exprs=None): from sumpy.symbolic import sym_real_norm_2 if not self.use_rscale: @@ -563,6 +686,7 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): bessel_j = sym.Function("bessel_j") new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) translated_coeffs = [] + for j in self.get_coefficient_identifiers(): translated_coeffs.append( sum(src_coeff_exprs[src_expansion.get_storage_index(m)] @@ -574,20 +698,31 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): return translated_coeffs if isinstance(src_expansion, self.mpole_expn_class): - dvec_len = sym_real_norm_2(dvec) - hankel_1 = sym.Function("hankel_1") - new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) + if precomputed_exprs is None: + derivatives = self.m2l_global_precompute_exprs(src_expansion, + src_rscale, dvec, tgt_rscale) + else: + derivatives = precomputed_exprs + translated_coeffs = [] + if self.use_fft: + assert precomputed_exprs is not None + assert len(derivatives) == len(src_coeff_exprs) + for a, b in zip(derivatives, src_coeff_exprs): + translated_coeffs.append(a * b) + return translated_coeffs + + src_coeff_exprs = self.m2l_preprocess_exprs(src_expansion, src_coeff_exprs, sac, + src_rscale) + for j in self.get_coefficient_identifiers(): - translated_coeffs.append( - sum( - sym.Integer(-1) ** j - * hankel_1(m + j, arg_scale * dvec_len) - * src_rscale ** abs(m) - * tgt_rscale ** abs(j) - * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center) + translated_coeff.append( + sum(derivatives[m + j + 2*self.order] * src_coeff_exprs[src_expansion.get_storage_index(m)] for m in src_expansion.get_coefficient_identifiers())) + + translated_coeffs = self.m2l_postprocess_exprs(src_expansion, translated_coeffs, src_rscale, + tgt_rscale, sac) return translated_coeffs raise RuntimeError("do not know how to translate %s to %s" @@ -601,7 +736,7 @@ class H2DLocalExpansion(_FourierBesselLocalExpansion): assert (isinstance(kernel.get_base_kernel(), HelmholtzKernel) and kernel.dim == 2) - super().__init__(kernel, order, use_rscale) + super().__init__(kernel, order, use_rscale, use_fft=use_fft) from sumpy.expansion.multipole import H2DMultipoleExpansion self.mpole_expn_class = H2DMultipoleExpansion @@ -616,7 +751,7 @@ class Y2DLocalExpansion(_FourierBesselLocalExpansion): assert (isinstance(kernel.get_base_kernel(), YukawaKernel) and kernel.dim == 2) - super().__init__(kernel, order, use_rscale) + super().__init__(kernel, order, use_rscale, use_fft=use_fft) from sumpy.expansion.multipole import Y2DMultipoleExpansion self.mpole_expn_class = Y2DMultipoleExpansion diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index b9b1a07700b1019d18ea06b2613cd9acca4f251c..240385e711cbae9ed228dd26f9e9e091ddf1764f 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -413,6 +413,8 @@ class _HankelBased2DMultipoleExpansion(MultipoleExpansionBase): def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): if not self.use_rscale: rscale = 1 + if knl is None: + knl = self.kernel from sumpy.symbolic import sym_real_norm_2 hankel_1 = sym.Function("hankel_1") diff --git a/sumpy/fmm.py b/sumpy/fmm.py index 900fdad2c102e719cd27629b3a81150bfa0047d9..4611814b2e1a599e3979e3019fe42d4e80d85791 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -30,14 +30,18 @@ __doc__ = """Integrates :mod:`boxtree` with :mod:`sumpy`. import pyopencl as cl import pyopencl.array # noqa +import numpy as np + from pytools import memoize_method from sumpy import ( P2EFromSingleBox, P2EFromCSR, E2PFromSingleBox, E2PFromCSR, P2PFromCSR, - E2EFromCSR, E2EFromChildren, E2EFromParent, - E2EFromCSRTranslationInvariant, E2EFromCSRTranslationClassesPrecompute) + E2EFromCSR, + E2EFromCSRTranslationInvariant, E2EFromCSRTranslationClassesPrecompute, + E2EFromCSRWithFFTPreprocess, + E2EFromChildren, E2EFromParent) def level_to_rscale(tree, level): @@ -128,13 +132,19 @@ class SumpyExpansionWranglerCodeContainer: def m2l_optimized(self, src_order, tgt_order): return E2EFromCSRTranslationInvariant(self.cl_context, self.multipole_expansion(src_order), - self.local_expansion(tgt_order)) + self.local_expansion(tgt_order), use_fft=self.use_fft) @memoize_method def m2l_optimized_precompute_kernel(self, src_order, tgt_order): return E2EFromCSRTranslationClassesPrecompute(self.cl_context, self.multipole_expansion(src_order), - self.local_expansion(tgt_order)) + self.local_expansion(tgt_order), use_fft=self.use_fft) + + @memoize_method + def m2l_preprocess_kernel(self, src_order, tgt_order): + return E2EFromCSRWithFFTPreprocess(self.cl_context, + self.multipole_expansion(src_order), + self.local_expansion(tgt_order), use_fft=self.use_fft) @memoize_method def l2l(self, src_order, tgt_order): @@ -168,8 +178,8 @@ class SumpyExpansionWranglerCodeContainer: translation_classes_data=None): return SumpyExpansionWrangler(self, queue, tree, dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, self_extra_kwargs, - translation_classes_data=translation_classes_data) - + translation_classes_data=translation_classes_data, + use_fft=self.use_fft) # }}} @@ -298,7 +308,8 @@ class SumpyExpansionWrangler: source_extra_kwargs, kernel_extra_kwargs=None, self_extra_kwargs=None, - translation_classes_data=None): + translation_classes_data=None, + complex_dtype=None): self.code = code_container self.queue = queue self.tree = tree @@ -306,6 +317,18 @@ class SumpyExpansionWrangler: self.dtype = dtype + if not self.code.use_fft: + # If not FFT, we don't need complex dtypes + self.complex_dtype = dtype + elif complex_dtype is not None: + self.complex_dtype = complex_dtype + elif self.dtype in (np.float32, np.complex64): + self.complex_dtype = np.complex64 + elif self.dtype in (np.float64, np.complex128): + self.complex_dtype = np.complex128 + else: + raise RuntimeError(f"Cannot compute complex type for {self.dtype}") + if kernel_extra_kwargs is None: kernel_extra_kwargs = {} @@ -369,7 +392,7 @@ class SumpyExpansionWrangler: @memoize_method def multipole_expansions_level_starts(self): return self._expansions_level_starts( - lambda order: len(self.code.multipole_expansion_factory(order)), + lambda order: len(self.code.multipole_expansion(order)), level_starts=self.tree.level_start_box_nrs) @memoize_method @@ -409,7 +432,7 @@ class SumpyExpansionWrangler: return cl.array.zeros( self.queue, self.m2l_precomputed_exprs_level_starts()[-1], - dtype=self.dtype) + dtype=self.complex_dtype) def multipole_expansions_view(self, mpole_exps, level): expn_start, expn_stop = \ @@ -437,6 +460,30 @@ class SumpyExpansionWrangler: return (translation_class_start, exprs_level.reshape( translation_class_stop - translation_class_start, -1)) + @memoize_method + def pp_multipole_expansions_level_starts(self): + def order_to_size(order): + mpole_expn = self.code.multipole_expansion_factory(order) + local_expn = self.code.local_expansion_factory(order) + return local_expn.m2l_global_precompute_nexpr(mpole_expn) + + return self._expansions_level_starts(order_to_size, + level_starts=self.tree.level_start_box_nrs) + + def pp_multipole_expansion_zeros(self): + return cl.array.zeros( + self.queue, + self.pp_multipole_expansions_level_starts()[-1], + dtype=self.complex_dtype) + + def pp_multipole_expansions_view(self, mpole_exps, level): + expn_start, expn_stop = \ + self.pp_multipole_expansions_level_starts()[level:level+2] + box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + + return (box_start, + mpole_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) + def output_zeros(self): from pytools.obj_array import make_obj_array return make_obj_array([ @@ -670,10 +717,39 @@ class SumpyExpansionWrangler: level_start_target_box_nrs, target_boxes, src_box_starts, src_box_lists, mpole_exps): + + if self.use_fft: + pp_mpole_exps = self.pp_multipole_expansion_zeros() + events = [] + for lev in range(self.tree.nlevels): + order = self.level_orders[lev] + preprocess_kernel = \ + self.code.m2l_preprocess_kernel(order, order) + + source_level_start_ibox, source_mpoles_view = \ + self.multipole_expansions_view(mpole_exps, lev) + + _, pp_source_mpoles_view = \ + self.pp_multipole_expansions_view(pp_mpole_exps, lev) + + tr_classes = self.m2l_translation_class_level_start_box_nrs() + # There's no M2L happening in this level + if tr_classes[lev] == tr_classes[lev + 1]: + continue + + evt, _ = preprocess_kernel( + self.queue, + src_expansions=source_mpoles_view, + pp_src_expansions=pp_source_mpoles_view, + src_rscale=level_to_rscale(self.tree, lev), + **self.kernel_extra_kwargs + ) + events.append(evt) + mpole_exps = pp_mpole_exps + local_exps = self.local_expansion_zeros() events = [] - for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -683,7 +759,7 @@ class SumpyExpansionWrangler: m2l = self.m2l_class(order, order) source_level_start_ibox, source_mpoles_view = \ - self.multipole_expansions_view(mpole_exps, lev) + self.m2l_mpole_expansions_view(mpole_exps, lev) target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, lev) @@ -703,6 +779,7 @@ class SumpyExpansionWrangler: **self.kernel_extra_kwargs) + # There's no m2ls needed in this level if not self.multipole_to_local_precompute(kwargs, lev): continue evt, _ = m2l(self.queue, **kwargs) @@ -722,6 +799,20 @@ class SumpyExpansionWrangler: wait_for = mpole_exps.events + for isrc_level, ssn in enumerate(source_boxes_by_level): + if len(target_boxes_by_source_level[isrc_level]) == 0: + continue + + m2p = self.code.m2p(self.level_orders[isrc_level]) + + source_level_start_ibox, source_mpoles_view = \ + self.multipole_expansions_view(mpole_exps, isrc_level) + kwargs.update(self.box_target_list_kwargs()) + + events = [] + + wait_for = mpole_exps.events + for isrc_level, ssn in enumerate(source_boxes_by_level): if len(target_boxes_by_source_level[isrc_level]) == 0: continue diff --git a/sumpy/kernel.py b/sumpy/kernel.py index 494a6107248004883cd70c9e79dc58474c8d74ea..00699ac13cddddeb77ab7b81e351b1ac28d6689c 100644 --- a/sumpy/kernel.py +++ b/sumpy/kernel.py @@ -20,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - import loopy as lp import numpy as np from pymbolic.mapper import IdentityMapper, CSECachingMapperMixin @@ -371,6 +370,8 @@ class ExpressionKernel(Kernel): key_builder.rec(key_hash, value) mapper_method = "map_expression_kernel" + # TODO: Allow kernels that are not translation invariant + is_translation_invariant = True def get_derivative_taker(self, dvec, rscale, sac): """Return a :class:`sumpy.tools.ExprDerivativeTaker` instance that supports @@ -423,6 +424,7 @@ class LaplaceKernel(ExpressionKernel): return "LapKnl%dD" % self.dim mapper_method = "map_laplace_kernel" + is_translation_invariant = True def get_derivative_taker(self, dvec, rscale, sac): """Return a :class:`sumpy.tools.ExprDerivativeTaker` instance that supports @@ -466,6 +468,7 @@ class BiharmonicKernel(ExpressionKernel): return "BiharmKnl%dD" % self.dim mapper_method = "map_biharmonic_kernel" + is_translation_invariant = True def get_derivative_taker(self, dvec, rscale, sac): """Return a :class:`sumpy.tools.ExprDerivativeTaker` instance that supports @@ -544,6 +547,7 @@ class HelmholtzKernel(ExpressionKernel): )] mapper_method = "map_helmholtz_kernel" + is_translation_invariant = True def get_derivative_taker(self, dvec, rscale, sac): """Return a :class:`sumpy.tools.ExprDerivativeTaker` instance that supports @@ -625,6 +629,7 @@ class YukawaKernel(ExpressionKernel): )] mapper_method = "map_yukawa_kernel" + is_translation_invariant = True def get_derivative_taker(self, dvec, rscale, sac): """Return a :class:`sumpy.tools.ExprDerivativeTaker` instance that supports @@ -699,6 +704,7 @@ class StokesletKernel(ExpressionKernel): )] mapper_method = "map_stokeslet_kernel" + is_translation_invariant = True class StressletKernel(ExpressionKernel): diff --git a/sumpy/p2e.py b/sumpy/p2e.py index 6b149c7f2b5e398e4bf3135255a7f26dccd55e4e..4f19477a9e46ea212491bc5b5fc81292518e5082 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -109,7 +109,6 @@ class P2EBase(KernelComputation, KernelCacheWrapper): vector_names={"a"}, pymbolic_expr_maps=code_transformers, retain_names=coeff_names, - complex_dtype=np.complex128 # FIXME ) def get_cache_key(self): diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 92a1e76612087a338a2bf4222148721c77a521d6..3de4dd8d340562d491ada23e7b0948f6bd61e3b1 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -139,7 +139,6 @@ class P2PBase(KernelComputation, KernelCacheWrapper): [knl.get_code_transformer() for knl in self.source_kernels] + [knl.get_code_transformer() for knl in self.target_kernels]), retain_names=result_names, - complex_dtype=np.complex128 # FIXME ) from pymbolic import var diff --git a/sumpy/tools.py b/sumpy/tools.py index 1959ee69b9416d0e10c25beca3e524cfa3d3b9f4..79eb333de04920583c260a49ab69bda56f312200 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -24,7 +24,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ============================================================================= Copyright (c) 2006-2019 SymPy Development Team @@ -77,6 +76,9 @@ import numbers import numpy as np import sumpy.symbolic as sym +import numbers +import math +from collections import namedtuple import pyopencl as cl import pyopencl.array # noqa @@ -1051,7 +1053,7 @@ def is_obj_array_like(ary): or (isinstance(ary, np.ndarray) and ary.dtype.char == "O")) -# {{{ matrix +# {{{ matrices def reduced_row_echelon_form(m): """Calculates a reduced row echelon form of a @@ -1263,4 +1265,77 @@ def fft_toeplitz_upper_triangular(first_row, x, sac=None): # }}} +# }}} + + +# {{{ FFT + +def fft(seq, inverse=False, sac=None): + """ + Return the discrete fourier transform of the sequence seq. + seq should be a python iterable with tuples of length 2 + corresponding to the real part and imaginary part. + """ + + from pymbolic.algorithm import fft as _fft, ifft as _ifft + + def wrap(expr): + if isinstance(expr, np.ndarray): + res = [wrap(a) for a in expr] + return np.array(res, dtype=object).reshape(expr.shape) + return add_to_sac(sac, expr, "temp_fft") + + if inverse: + return _ifft(list(seq), wrap_intermediate=wrap).tolist() + else: + return _fft(list(seq), wrap_intermediate=wrap).tolist() + + +def fft_toeplitz_upper_triangular(first_row, x, sac=None): + """ + Returns the matvec of the Toeplitz matrix given by + the first row and the vector x using a Fourier transform + """ + assert len(first_row) == len(x) + n = len(first_row) + v = list(first_row) + v += [0]*(n-1) + + x = list(reversed(x)) + x += [0]*(n-1) + + v_fft = fft(v, sac) + x_fft = fft(x, sac) + res_fft = [add_to_sac(sac, a * b) for a, b in zip(v_fft, x_fft)] + res = fft(res_fft, inverse=True, sac=sac) + return list(reversed(res[:n])) + + +def fft_toeplitz_upper_triangular_lwork(n): + return n + + +def matvec_toeplitz_upper_triangular(first_row, vector): + n = len(first_row) + assert len(vector) == n + output = [0]*n + for row in range(n): + terms = tuple(first_row[col-row]*vector[col] for col in range(row, n)) + output[row] = sym.Add(*terms) + return output + + +def to_complex_dtype(dtype): + if dtype == np.complex64: + return np.complex64 + if dtype == np.complex128: + return np.complex128 + if dtype == np.float32: + return np.complex64 + if dtype == np.float64: + return np.complex128 + raise RuntimeError(f"Unknown dtype: {dtype}") + +# }}} + # vim: fdm=marker diff --git a/test/test_fmm.py b/test/test_fmm.py index 5a081cc5b9875cb66e9ece9af2b43a19d9efcba7..a4f0d5ec01d67238c7e3d0239c92bc7b7c7e1107 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -128,10 +128,10 @@ def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class, # {{{ plot tree if 0: - host_tree = tree.get() - host_trav = trav.get() + host_tree = tree.get(queue) + host_trav = trav.get(queue) - if 1: + if 0: print("src_box", host_tree.find_box_nr_for_source(403)) print("tgt_box", host_tree.find_box_nr_for_target(28)) print(list(host_trav.target_or_target_parent_boxes).index(37)) diff --git a/test/test_kernels.py b/test/test_kernels.py index 7efa44f4ebcc038151b752b529f6518042c1fa6f..3aa01bfc0dd9b3f2fce3c87d8b10224d5e7de9b4 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -44,6 +44,7 @@ from sumpy.kernel import (LaplaceKernel, HelmholtzKernel, AxisTargetDerivative, DirectionalSourceDerivative, BiharmonicKernel, StokesletKernel) import sumpy.symbolic as sym from pytools.convergence import PConvergenceVerifier +import sumpy.symbolic as sym import logging logger = logging.getLogger(__name__) @@ -455,6 +456,72 @@ def test_p2e2p(ctx_factory, base_knl, expn_class, order, with_source_derivative) assert eoc_rec_grad_x.order_estimate() > tgt_order_grad - grad_slack +# {{{ test toeplitz + +def _m2l_translate_simple(tgt_expansion, src_expansion, src_coeff_exprs, src_rscale, + dvec, tgt_rscale): + + if not tgt_expansion.use_rscale: + src_rscale = 1 + tgt_rscale = 1 + + from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansionBase + if not isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase): + return 1 + + # We know the general form of the multipole expansion is: + # + # coeff0 * diff(kernel, mi0) + coeff1 * diff(kernel, mi1) + ... + # + # To get the local expansion coefficients, we take derivatives of + # the multipole expansion. + taker = src_expansion.get_kernel_derivative_taker(dvec, src_rscale, sac=None) + + from sumpy.tools import add_mi + + result = [] + for deriv in tgt_expansion.get_coefficient_identifiers(): + local_result = [] + for coeff, term in zip( + src_coeff_exprs, + src_expansion.get_coefficient_identifiers()): + + kernel_deriv = taker.diff(add_mi(deriv, term)) / src_rscale**sum(deriv) + + local_result.append( + coeff * kernel_deriv * tgt_rscale**sum(deriv)) + result.append(sym.Add(*local_result)) + return result + + +def test_m2l_toeplitz(ctx_getter): + dim = 3 + knl = LaplaceKernel(dim) + local_expn_class = LaplaceConformingVolumeTaylorLocalExpansion + mpole_expn_class = LaplaceConformingVolumeTaylorMultipoleExpansion + + local_expn = local_expn_class(knl, order=5) + mpole_expn = mpole_expn_class(knl, order=5) + + dvec = sym.make_sym_vector("d", dim) + src_coeff_exprs = list(1 + np.random.randn(len(mpole_expn))) + src_rscale = 2.0 + tgt_rscale = 1.0 + + expected_output = _m2l_translate_simple(local_expn, mpole_expn, src_coeff_exprs, + src_rscale, dvec, tgt_rscale) + actual_output = local_expn.translate_from(mpole_expn, src_coeff_exprs, + src_rscale, dvec, tgt_rscale, sac=None) + + replace_dict = dict((d, np.random.rand(1)[0]) for d in dvec) + for sym_a, sym_b in zip(expected_output, actual_output): + num_a = sym_a.xreplace(replace_dict) + num_b = sym_b.xreplace(replace_dict) + assert abs(num_a - num_b)/abs(num_a) < 1e-10 + + +# }}} + @pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class", [ (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion), (LaplaceKernel(2), LaplaceConformingVolumeTaylorLocalExpansion,