diff --git a/benchmarks/bench_translations.py b/benchmarks/bench_translations.py index 1c66ba747e2c61998e3106ddd9ff5becea39d7ca..a6a920d08ac8070daa050b5f9fe639dc4eb592dd 100644 --- a/benchmarks/bench_translations.py +++ b/benchmarks/bench_translations.py @@ -62,9 +62,15 @@ class TranslationBenchmarkSuite: dvec = sym.make_sym_vector("d", knl.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") - result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, - dvec, tgt_rscale) sac = SymbolicAssignmentCollection() + try: + result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, + dvec, tgt_rscale, sac) + except TypeError: + # Support older interface to make it possible to compare + # in CI run + result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, + dvec, tgt_rscale) for i, expr in enumerate(result): sac.assign_unique("coeff%d" % i, expr) sac.run_global_cse() @@ -74,7 +80,7 @@ class TranslationBenchmarkSuite: return sum([counter.rec(insn.expression)+1 for insn in insns]) track_m2l_op_count.unit = "ops" - track_m2l_op_count.timeout = 200.0 + track_m2l_op_count.timeout = 300.0 class LaplaceVolumeTaylorTranslation(TranslationBenchmarkSuite): diff --git a/sumpy/__init__.py b/sumpy/__init__.py index e9745d3c5d50f2f672075631da7953c6a1e8abed..30b0cd66975cba61acefc2457175b57665ff50b5 100644 --- a/sumpy/__init__.py +++ b/sumpy/__init__.py @@ -25,7 +25,8 @@ from sumpy.p2p import P2P, P2PFromCSR from sumpy.p2e import P2EFromSingleBox, P2EFromCSR from sumpy.e2p import E2PFromSingleBox, E2PFromCSR from sumpy.e2e import (E2EFromCSR, E2EFromChildren, E2EFromParent, - E2EFromCSRTranslationClassesPrecompute) + M2LUsingTranslationClassesDependentData, + M2LGenerateTranslationClassesDependentData, M2LPreprocessMultipole) from sumpy.version import VERSION_TEXT from pytools.persistent_dict import WriteOncePersistentDict @@ -34,7 +35,9 @@ __all__ = [ "P2EFromSingleBox", "P2EFromCSR", "E2PFromSingleBox", "E2PFromCSR", "E2EFromCSR", "E2EFromChildren", "E2EFromParent", - "E2EFromCSRTranslationClassesPrecompute"] + "M2LUsingTranslationClassesDependentData", + "M2LGenerateTranslationClassesDependentData", + "M2LPreprocessMultipole"] code_cache = WriteOncePersistentDict("sumpy-code-cache-v6-"+VERSION_TEXT) diff --git a/sumpy/codegen.py b/sumpy/codegen.py index 4da7b6377b929234179b33356edc594321dae9f7..40a5df57bcdd43b7d42db53fcb7da2001958ed97 100644 --- a/sumpy/codegen.py +++ b/sumpy/codegen.py @@ -477,30 +477,31 @@ class BigIntegerKiller(CSECachingIdentityMapper, CallExternalRecMapper): # }}} -# {{{ convert 123000000j to 123000000 * 1j +# {{{ convert complex to np.complex class ComplexRewriter(CSECachingIdentityMapper, CallExternalRecMapper): - def __init__(self, float_type=np.float32): + def __init__(self, complex_dtype=None): super().__init__() - self.float_type = float_type + self.complex_dtype = complex_dtype def map_constant(self, expr, rec_self=None, *args, **kwargs): - """Convert complex values not within complex64 to a product for loopy + """Convert complex values to numpy types """ - if not isinstance(expr, complex): - return CSECachingIdentityMapper.map_constant( - rec_self or self, expr) - - if complex(self.float_type(expr.imag)) == expr.imag: - return CSECachingIdentityMapper.map_constant( - rec_self or self, expr) - - # avoid cycles - if expr == 1j: - return expr - - return expr.real + prim.Product((expr.imag, 1j)) + if not isinstance(expr, (complex, np.complex64, np.complex128)): + return IdentityMapper.map_constant(rec_self or self, expr, + rec_self=rec_self, *args, **kwargs) + + complex_dtype = self.complex_dtype + if complex_dtype is None: + if complex(np.complex64(expr)) == expr: + return np.complex64(expr) + complex_dtype = np.complex128 + + if isinstance(complex_dtype, np.dtype): + return complex_dtype.type(expr) + else: + return complex_dtype(expr) map_common_subexpression_uncached = IdentityMapper.map_common_subexpression @@ -664,7 +665,7 @@ def to_loopy_insns(assignments, vector_names=frozenset(), pymbolic_expr_maps=(), pwr = PowerRewriter() ssg = SumSignGrouper() bik = BigIntegerKiller() - cmr = ComplexRewriter() + cmr = ComplexRewriter(complex_dtype) cmb_mapper = combine_mappers(bdr, btog, vcr, pwr, ssg, bik, cmr) diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 8e6c567b3c0970b15c3747927bae05cfcee5cf30..b5782077c1bbf88db57b1b291723853adbea4122 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -25,7 +25,7 @@ import loopy as lp import sumpy.symbolic as sym from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from sumpy.tools import KernelCacheWrapper +from sumpy.tools import KernelCacheWrapper, to_complex_dtype from pytools import memoize_method import logging @@ -84,6 +84,8 @@ class E2EBase(KernelCacheWrapper): self.tgt_expansion = tgt_expansion self.name = name or self.default_name self.device = device + self.use_preprocessing_for_m2l = getattr(self.tgt_expansion, + "use_preprocessing_for_m2l", False) if src_expansion.dim != tgt_expansion.dim: raise ValueError("source and target expansions must have " @@ -119,14 +121,14 @@ class E2EBase(KernelCacheWrapper): vector_names={"d"}, pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], retain_names=tgt_coeff_names, - complex_dtype=np.complex128 # FIXME ) def get_cache_key(self): return ( type(self).__name__, self.src_expansion, - self.tgt_expansion) + self.tgt_expansion, + ) def get_optimized_kernel(self): # FIXME @@ -148,30 +150,22 @@ class E2EFromCSR(E2EBase): default_name = "e2e_from_csr" def __init__(self, ctx, src_expansion, tgt_expansion, - name=None, device=None, use_precomputed_exprs=False): + name=None, device=None): super().__init__(ctx, src_expansion, tgt_expansion, name=name, device=device) - self.use_precomputed_exprs = use_precomputed_exprs def get_translation_loopy_insns(self): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) - src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) - for i in range(len(self.src_expansion))] src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") - extra_kwargs = dict() - if self.use_precomputed_exprs: - nprecomputed_exprs = \ - self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) - precomputed_exprs = [sym.Symbol("precomputed_expr%d" % i) - for i in range(nprecomputed_exprs)] - extra_kwargs["precomputed_exprs"] = precomputed_exprs - else: - nprecomputed_exprs = 0 + ncoeff_src = len(self.src_expansion) + + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(ncoeff_src)] from sumpy.assignment_collection import SymbolicAssignmentCollection sac = SymbolicAssignmentCollection() @@ -180,7 +174,7 @@ class E2EFromCSR(E2EBase): for i, coeff_i in enumerate( self.tgt_expansion.translate_from( self.src_expansion, src_coeff_exprs, src_rscale, - dvec, tgt_rscale, sac, *extra_kwargs))] + dvec, tgt_rscale, sac))] sac.run_global_cse() @@ -190,19 +184,12 @@ class E2EFromCSR(E2EBase): vector_names=set(["d"]), pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], retain_names=tgt_coeff_names, - complex_dtype=np.complex128 # FIXME ) def get_kernel(self): ncoeff_src = len(self.src_expansion) ncoeff_tgt = len(self.tgt_expansion) - if self.use_precomputed_exprs: - nprecomputed_exprs = \ - self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) - else: - nprecomputed_exprs = 0 - # To clarify terminology: # # isrc_box -> The index in a list of (in this case, source) boxes @@ -220,42 +207,283 @@ class E2EFromCSR(E2EBase): [""" for itgt_box <> tgt_ibox = target_boxes[itgt_box] + <> tgt_center[idim] = centers[idim, tgt_ibox] + <> isrc_start = src_box_starts[itgt_box] + <> isrc_stop = src_box_starts[itgt_box+1] - <> tgt_center[idim] = centers[idim, tgt_ibox] \ + for isrc_box + <> src_ibox = src_box_lists[isrc_box] \ + {id=read_src_ibox} + <> src_center[idim] = centers[idim, src_ibox] {dup=idim} + <> d[idim] = tgt_center[idim] - src_center[idim] \ + {dup=idim} + """] + [""" + <> src_coeff{coeffidx} = \ + src_expansions[src_ibox - src_base_ibox, {coeffidx}] \ + {{dep=read_src_ibox}} + """.format(coeffidx=i) for i in range(ncoeff_src)] + [ + ] + self.get_translation_loopy_insns() + [""" + end + + """] + [f""" + tgt_expansions[tgt_ibox - tgt_base_ibox, {coeffidx}] = \ + simul_reduce(sum, isrc_box, coeff{coeffidx}) \ + {{id_prefix=write_expn}} + """ for coeffidx in range(ncoeff_tgt)] + [""" + end + """], + [ + lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), + lp.ValueArg("src_rscale,tgt_rscale", None), + lp.GlobalArg("src_box_starts, src_box_lists", + None, shape=None, strides=(1,), offset=lp.auto), + lp.ValueArg("aligned_nboxes,tgt_base_ibox,src_base_ibox", + np.int32), + lp.ValueArg("nsrc_level_boxes,ntgt_level_boxes", + np.int32), + lp.GlobalArg("src_expansions", None, + shape=("nsrc_level_boxes", ncoeff_src), offset=lp.auto), + lp.GlobalArg("tgt_expansions", None, + shape=("ntgt_level_boxes", ncoeff_tgt), offset=lp.auto), + "..." + ] + gather_loopy_arguments([self.src_expansion, + self.tgt_expansion]), + name=self.name, + assumptions="ntgt_boxes>=1", + silenced_warnings="write_race(write_expn*)", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION + ) + + for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.set_options(loopy_knl, + enforce_variable_access_ordered="no_check") + + return loopy_knl + + def get_optimized_kernel(self): + # FIXME + knl = self.get_kernel() + knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") + return knl + + def get_cache_key(self): + return ( + type(self).__name__, + self.src_expansion, + self.tgt_expansion, + ) + + def __call__(self, queue, **kwargs): + """ + :arg src_expansions: + :arg src_box_starts: + :arg src_box_lists: + :arg src_rscale: + :arg tgt_rscale: + :arg centers: + """ + centers = kwargs.pop("centers") + # "1" may be passed for rscale, which won't have its type + # meaningfully inferred. Make the type of rscale explicit. + src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) + tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) + + knl = self.get_cached_optimized_kernel() + + return knl(queue, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + +class M2LUsingTranslationClassesDependentData(E2EFromCSR): + """Implements translation from a "compressed sparse row"-like source box + list using M2L translation classes dependent data + """ + + default_name = "m2l_using_translation_classes_dependent_data" + + def get_translation_loopy_insns(self, result_dtype): + from sumpy.symbolic import make_sym_vector + dvec = make_sym_vector("d", self.dim) + + src_rscale = sym.Symbol("src_rscale") + tgt_rscale = sym.Symbol("tgt_rscale") + + m2l_translation_classes_dependent_ndata = \ + self.tgt_expansion.m2l_translation_classes_dependent_ndata( + self.src_expansion) + m2l_translation_classes_dependent_data = \ + [sym.Symbol("m2l_translation_classes_dependent_expr%d" % i) + for i in range(m2l_translation_classes_dependent_ndata)] + + if self.use_preprocessing_for_m2l: + ncoeff_src = self.tgt_expansion.m2l_preprocess_multipole_nexprs( + self.src_expansion) + else: + ncoeff_src = len(self.src_expansion) + + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(ncoeff_src)] + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + tgt_coeff_names = [ + sac.assign_unique("coeff%d" % i, coeff_i) + for i, coeff_i in enumerate( + self.tgt_expansion.translate_from( + self.src_expansion, src_coeff_exprs, src_rscale, + dvec, tgt_rscale, sac, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data)))] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + return to_loopy_insns( + sac.assignments.items(), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=tgt_coeff_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + + def get_postprocess_loopy_insns(self, result_dtype): + """Loopy instructions that happen only once for each target box. + + :arg result_dtype: A numpy dtype for the result. This is important + because depending on the input to the M2L being real or not + the code needs to be different. If the input was real, the + result should be the real part of the complex values from the + inverse FFT. In the input was complex, the result matches the + values from the inverse FFT. + """ + + ncoeff_tgt = len(self.tgt_expansion) + m2l_translation_classes_dependent_ndata = \ + self.tgt_expansion.m2l_translation_classes_dependent_ndata( + self.src_expansion) + + if self.use_preprocessing_for_m2l: + ncoeff_tgt = m2l_translation_classes_dependent_ndata + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + + tgt_coeff_exprs = [ + sym.Symbol("coeff_sum%d" % i) for i in range(ncoeff_tgt) + ] + + src_rscale = sym.Symbol("src_rscale") + tgt_rscale = sym.Symbol("tgt_rscale") + + if self.use_preprocessing_for_m2l: + tgt_coeff_post_exprs = self.tgt_expansion.m2l_postprocess_local_exprs( + self.src_expansion, tgt_coeff_exprs, src_rscale, tgt_rscale, + sac=sac) + if result_dtype in (np.float32, np.float64): + real_func = sym.Function("real") + tgt_coeff_post_exprs = [real_func(expr) for expr in + tgt_coeff_post_exprs] + else: + tgt_coeff_post_exprs = tgt_coeff_exprs + + tgt_coeff_post_names = [ + sac.assign_unique("coeff_post%d" % i, coeff) + for i, coeff in enumerate(tgt_coeff_post_exprs) + ] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + insns = to_loopy_insns( + sac.assignments.items(), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=tgt_coeff_post_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + return insns + + def get_kernel(self, result_dtype): + m2l_translation_classes_dependent_ndata = \ + self.tgt_expansion.m2l_translation_classes_dependent_ndata( + self.src_expansion) + + if self.use_preprocessing_for_m2l: + # number of expressions given as input to M2L after preprocessing + ncoeff_src = self.tgt_expansion.m2l_preprocess_multipole_nexprs( + self.src_expansion) + # number of expressions given as input to postprocessing + ncoeff_tgt_before_postprocess = \ + self.tgt_expansion.m2l_postprocess_local_nexprs( + self.src_expansion) + else: + ncoeff_src = len(self.src_expansion) + ncoeff_tgt_before_postprocess = len(self.tgt_expansion) + + ncoeff_tgt = len(self.tgt_expansion) + + # To clarify terminology: + # + # isrc_box -> The index in a list of (in this case, source) boxes + # src_ibox -> The (global) box number for the (in this case, source) box + # + # (same for itgt_box, tgt_ibox) + + from sumpy.tools import gather_loopy_arguments + loopy_knl = lp.make_kernel( + [ + "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] + <> tgt_center[idim] = centers[idim, tgt_ibox] <> isrc_start = src_box_starts[itgt_box] <> isrc_stop = src_box_starts[itgt_box+1] for isrc_box <> src_ibox = src_box_lists[isrc_box] \ {id=read_src_ibox} - <> src_center[idim] = centers[idim, src_ibox] {dup=idim} <> d[idim] = tgt_center[idim] - src_center[idim] \ {dup=idim} - """] + ([""" <> translation_class = \ m2l_translation_classes_lists[isrc_box] <> translation_class_rel = translation_class - \ translation_classes_level_start - """] if nprecomputed_exprs != 0 else []) + [""" - <> precomputed_expr{idx} = \ - m2l_precomputed_exprs[translation_class_rel, {idx}] + """] + [""" + <> m2l_translation_classes_dependent_expr{idx} = \ + m2l_translation_classes_dependent_data[ \ + translation_class_rel, {idx}] """.format(idx=idx) for idx in range( - nprecomputed_exprs)] + [""" + m2l_translation_classes_dependent_ndata)] + [""" <> src_coeff{coeffidx} = \ src_expansions[src_ibox - src_base_ibox, {coeffidx}] \ {{dep=read_src_ibox}} """.format(coeffidx=i) for i in range(ncoeff_src)] + [ - ] + self.get_translation_loopy_insns() + [""" + ] + self.get_translation_loopy_insns(result_dtype) + [""" end """] + [""" + <> coeff_sum{coeffidx} = \ + simul_reduce(sum, isrc_box, coeff{coeffidx}) + """.format(coeffidx=i) for i in + range(ncoeff_tgt_before_postprocess)] + [ + ] + self.get_postprocess_loopy_insns(result_dtype) + [f""" tgt_expansions[tgt_ibox - tgt_base_ibox, {coeffidx}] = \ - simul_reduce(sum, isrc_box, coeff{coeffidx}) \ - {{id_prefix=write_expn}} - """.format(coeffidx=i) for i in range(ncoeff_tgt)] + [""" + coeff_post{coeffidx} {{id_prefix=write_expn}} + """ for coeffidx in range(ncoeff_tgt)] + [""" end """], [ @@ -271,26 +499,27 @@ class E2EFromCSR(E2EBase): shape=("nsrc_level_boxes", ncoeff_src), offset=lp.auto), lp.GlobalArg("tgt_expansions", None, shape=("ntgt_level_boxes", ncoeff_tgt), offset=lp.auto), - "..." - ] + ([ lp.ValueArg("translation_classes_level_start", np.int32), - lp.GlobalArg("m2l_precomputed_exprs", None, - shape=("ntranslation_classes", nprecomputed_exprs), + lp.GlobalArg("m2l_translation_classes_dependent_data", None, + shape=("ntranslation_classes", + m2l_translation_classes_dependent_ndata), offset=lp.auto), lp.GlobalArg("m2l_translation_classes_lists", np.int32, shape=("ntranslation_classes_lists"), strides=(1,), offset=lp.auto), lp.ValueArg("ntranslation_classes, ntranslation_classes_lists", np.int32), - ] if nprecomputed_exprs != 0 else []) - + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), + "..." + ] + gather_loopy_arguments([self.src_expansion, + self.tgt_expansion]), name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", default_offset=lp.auto, fixed_parameters=dict(dim=self.dim, - nprecomputed_exprs=nprecomputed_exprs), + m2l_translation_classes_dependent_ndata=( + m2l_translation_classes_dependent_ndata)), lang_version=MOST_RECENT_LANGUAGE_VERSION ) @@ -298,14 +527,17 @@ class E2EFromCSR(E2EBase): loopy_knl = knl.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.set_options(loopy_knl, + enforce_variable_access_ordered="no_check") return loopy_knl - def get_cache_key(self): - return ( - type(self).__name__, - self.src_expansion, - self.tgt_expansion, self.use_precomputed_exprs) + def get_optimized_kernel(self, result_dtype): + # FIXME + knl = self.get_kernel(result_dtype) + knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") + + return knl def __call__(self, queue, **kwargs): """ @@ -316,27 +548,34 @@ class E2EFromCSR(E2EBase): :arg tgt_rscale: :arg centers: """ - knl = self.get_cached_optimized_kernel() - centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) + if "tgt_expansions" in kwargs: + tgt_expansions = kwargs["tgt_expansions"] + result_dtype = tgt_expansions.dtype + else: + src_expansions = kwargs["src_expansions"] + result_dtype = src_expansions.dtype + + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + return knl(queue, centers=centers, src_rscale=src_rscale, tgt_rscale=tgt_rscale, **kwargs) -class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): - """Implements precomputing the translation classes dependent - derivatives. +class M2LGenerateTranslationClassesDependentData(E2EBase): + """Implements precomputing the M2L kernel dependent data which are + translation classes dependent derivatives. """ - default_name = "e2e_from_csr_translation_classes_precompute" + default_name = "m2l_generate_translation_classes_dependent_data" - def get_translation_loopy_insns(self): + def get_translation_loopy_insns(self, result_dtype): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) @@ -346,9 +585,10 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): from sumpy.assignment_collection import SymbolicAssignmentCollection sac = SymbolicAssignmentCollection() tgt_coeff_names = [ - sac.assign_unique("precomputed_expr%d" % i, coeff_i) + sac.assign_unique("m2l_translation_classes_dependent_expr%d" % i, + coeff_i) for i, coeff_i in enumerate( - self.tgt_expansion.m2l_global_precompute_exprs( + self.tgt_expansion.m2l_translation_classes_dependent_data( self.src_expansion, src_rscale, dvec, tgt_rscale, sac))] @@ -360,12 +600,13 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): vector_names=set(["d"]), pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], retain_names=tgt_coeff_names, - complex_dtype=np.complex128 # FIXME + complex_dtype=to_complex_dtype(result_dtype), ) - def get_kernel(self): - nprecomputed_exprs = \ - self.tgt_expansion.m2l_global_precompute_nexpr(self.src_expansion) + def get_kernel(self, result_dtype): + m2l_translation_classes_dependent_ndata = \ + self.tgt_expansion.m2l_translation_classes_dependent_ndata( + self.src_expansion) from sumpy.tools import gather_loopy_arguments loopy_knl = lp.make_kernel( [ @@ -377,15 +618,18 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): <> d[idim] = m2l_translation_vectors[idim, \ itr_class + translation_classes_level_start] {dup=idim} - """] + self.get_translation_loopy_insns() + [""" - m2l_precomputed_exprs[itr_class, {idx}] = precomputed_expr{idx} - """.format(idx=i) for i in range(nprecomputed_exprs)] + [""" + """] + self.get_translation_loopy_insns(result_dtype) + [""" + m2l_translation_classes_dependent_data[itr_class, {idx}] = \ + m2l_translation_classes_dependent_expr{idx} + """.format(idx=i) for i in range( + m2l_translation_classes_dependent_ndata)] + [""" end """], [ lp.ValueArg("src_rscale", None), - lp.GlobalArg("m2l_precomputed_exprs", None, - shape=("ntranslation_classes", nprecomputed_exprs), + lp.GlobalArg("m2l_translation_classes_dependent_data", None, + shape=("ntranslation_classes", + m2l_translation_classes_dependent_ndata), offset=lp.auto), lp.GlobalArg("m2l_translation_vectors", None, shape=("dim", "ntranslation_vectors")), @@ -409,9 +653,9 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): return loopy_knl - def get_optimized_kernel(self): + def get_optimized_kernel(self, result_dtype): # FIXME - knl = self.get_kernel() + knl = self.get_kernel(result_dtype) knl = lp.tag_inames(knl, {"itr_class": "g.0"}) return knl @@ -421,24 +665,128 @@ class E2EFromCSRTranslationClassesPrecompute(E2EFromCSR): :arg src_rscale: :arg translation_classes_level_start: :arg ntranslation_classes: - :arg m2l_precomputed_exprs: + :arg m2l_translation_classes_dependent_data: :arg m2l_translation_vectors: """ - knl = self.get_cached_optimized_kernel() - m2l_translation_vectors = kwargs.pop("m2l_translation_vectors") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. src_rscale = m2l_translation_vectors.dtype.type(kwargs.pop("src_rscale")) + m2l_translation_classes_dependent_data = kwargs.pop( + "m2l_translation_classes_dependent_data") + result_dtype = m2l_translation_classes_dependent_data.dtype + + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + return knl(queue, src_rscale=src_rscale, m2l_translation_vectors=m2l_translation_vectors, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data), **kwargs) # }}} +# {{{ M2LPreprocessMultipole + +class M2LPreprocessMultipole(E2EBase): + """Computes the preprocessed multipole expansion for accelerated M2L""" + + default_name = "m2l_preprocess_multipole" + + def get_loopy_insns(self, result_dtype): + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(len(self.src_expansion))] + + src_rscale = sym.Symbol("src_rscale") + + from sumpy.assignment_collection import SymbolicAssignmentCollection + sac = SymbolicAssignmentCollection() + + preprocessed_src_coeff_names = [ + sac.assign_unique("preprocessed_src_coeff%d" % i, coeff_i) + for i, coeff_i in enumerate( + self.tgt_expansion.m2l_preprocess_multipole_exprs( + self.src_expansion, src_coeff_exprs, + sac=sac, src_rscale=src_rscale))] + + sac.run_global_cse() + + from sumpy.codegen import to_loopy_insns + return to_loopy_insns( + sac.assignments.items(), + vector_names=set(["d"]), + pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()], + retain_names=preprocessed_src_coeff_names, + complex_dtype=to_complex_dtype(result_dtype), + ) + + def get_kernel(self, result_dtype): + nsrc_coeffs = len(self.src_expansion) + npreprocessed_src_coeffs = \ + self.tgt_expansion.m2l_preprocess_multipole_nexprs(self.src_expansion) + from sumpy.tools import gather_loopy_arguments + loopy_knl = lp.make_kernel( + [ + "{[isrc_box]: 0<=isrc_box src_coeff{idx} = src_expansions[isrc_box, {idx}] + """.format(idx=i) for i in range(nsrc_coeffs)] + [ + ] + self.get_loopy_insns(result_dtype) + [""" + preprocessed_src_expansions[isrc_box, {idx}] = \ + preprocessed_src_coeff{idx} + """.format(idx=i) for i in range( + npreprocessed_src_coeffs)] + [""" + end + """], + [ + lp.ValueArg("nsrc_boxes", np.int32), + lp.ValueArg("src_rscale", lp.auto), + lp.GlobalArg("src_expansions", None, + shape=("nsrc_boxes", nsrc_coeffs), offset=lp.auto), + lp.GlobalArg("preprocessed_src_expansions", None, + shape=("nsrc_boxes", npreprocessed_src_coeffs), + offset=lp.auto), + "..." + ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), + name=self.name, + assumptions="nsrc_boxes>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION + ) + + for expn in [self.src_expansion.kernel, self.tgt_expansion.kernel]: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.set_options(loopy_knl, + enforce_variable_access_ordered="no_check") + return loopy_knl + + def get_optimized_kernel(self, result_dtype): + # FIXME + knl = self.get_kernel(result_dtype) + knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0") + return knl + + def __call__(self, queue, **kwargs): + """ + :arg src_expansions + :arg preprocessed_src_expansions + """ + preprocessed_src_expansions = kwargs.pop("preprocessed_src_expansions") + result_dtype = preprocessed_src_expansions.dtype + knl = self.get_cached_optimized_kernel(result_dtype=result_dtype) + return knl(queue, + preprocessed_src_expansions=preprocessed_src_expansions, **kwargs) +# }}} + + # {{{ translation from a box's children class E2EFromChildren(E2EBase): @@ -530,6 +878,8 @@ class E2EFromChildren(E2EBase): loopy_knl = knl.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.set_options(loopy_knl, + enforce_variable_access_ordered="no_check") return loopy_knl diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index 8e68ccc1d6d9218f5e97b8829d5655300b844048..726ab0863c81a34c05333cf341c3b0197ee3518e 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ import sumpy.symbolic as sym -from sumpy.tools import add_to_sac +from sumpy.tools import add_to_sac, fft from sumpy.expansion import ( ExpansionBase, VolumeTaylorExpansion, LinearPDEConformingVolumeTaylorExpansion) @@ -47,11 +47,44 @@ __doc__ = """ class LocalExpansionBase(ExpansionBase): """Base class for local expansions. - .. automethod:: m2l_global_precompute_exprs - .. automethod:: m2l_global_precompute_nexpr + .. attribute:: kernel + .. attribute:: order + .. attribute:: use_rscale + .. attribute:: use_preprocessing_for_m2l + + .. automethod:: m2l_translation_classes_dependent_data + .. automethod:: m2l_translation_classes_dependent_ndata + .. automethod:: m2l_preprocess_multipole_exprs + .. automethod:: m2l_preprocess_multipole_nexprs + .. automethod:: m2l_postprocess_local_exprs + .. automethod:: m2l_postprocess_local_nexprs .. automethod:: translate_from """ - def m2l_global_precompute_exprs(self, src_expansion, src_rscale, + init_arg_names = ("kernel", "order", "use_rscale", "use_preprocessing_for_m2l") + + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): + super().__init__(kernel, order, use_rscale) + self.use_preprocessing_for_m2l = use_preprocessing_for_m2l + + def with_kernel(self, kernel): + return type(self)(kernel, self.order, self.use_rscale, + use_preprocessing_for_m2l=self.use_preprocessing_for_m2l) + + def update_persistent_hash(self, key_hash, key_builder): + super().update_persistent_hash(key_hash, key_builder) + key_builder.rec(key_hash, self.use_preprocessing_for_m2l) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self.kernel == other.kernel + and self.order == other.order + and self.use_rscale == other.use_rscale + and self.use_preprocessing_for_m2l == other.use_preprocessing_for_m2l + ) + + def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale, dvec, tgt_rscale, sac) -> Tuple[Any]: """Return an iterable of expressions that needs to be precomputed for multipole-to-local translations that depend only on the @@ -65,19 +98,67 @@ class LocalExpansionBase(ExpansionBase): """ return tuple() - def m2l_global_precompute_nexpr(self, src_expansion): + def m2l_translation_classes_dependent_ndata(self, src_expansion): """Return the number of expressions returned by - :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_global_precompute_exprs`. + :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data`. This method exists because calculating the number of expressions using the above method might be costly and - :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_global_precompute_exprs` + :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data` cannot be memoized due to it having side effects through the argument *sac*. """ return 0 + def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac, + src_rscale): + """Return the preprocessed multipole expansion for an optimized M2L. + Preprocessing happens once per source box before M2L translation is done. + + When FFT is turned on, the input expressions are transformed into Fourier + space. These expressions are used in a separate :mod:`loopy` kernel + to avoid having to transform for each target and source box pair. + When FFT is turned off, the expressions are equal to the multipole + expansion coefficients with zeros added + to make the M2L computation a circulant matvec. + """ + raise NotImplementedError + + def m2l_preprocess_multipole_nexprs(self, src_expansion): + """Return the number of expressions returned by + :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_preprocess_multipole_exprs`. + This method exists because calculating the number of expressions using + the above method might be costly and it cannot be memoized due to it having + side effects through the argument *sac*. + """ + # For all use-cases we have right now, this is equal to the number of + # translation classes dependent exprs. Use that as a default. + return self.m2l_translation_classes_dependent_ndata(src_expansion) + + def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale, + tgt_rscale, sac): + """Return postprocessed local expansion for an optimized M2L. + Postprocessing happens once per target box just after the M2L translation + is done and before storing the expansion coefficients for the local + expansion. + + When FFT is turned on, the output expressions are transformed from Fourier + space back to the original space. + """ + raise NotImplementedError + + def m2l_postprocess_local_nexprs(self, src_expansion): + """Return the number of expressions given as input to + :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_postprocess_local_exprs`. + This method exists because calculating the number of expressions using + the above method might be costly and it cannot be memoized due to it + having side effects through the argument *sac*. + """ + # For all use-cases we have right now, this is equal to the number of + # translation classes dependent exprs. Use that as a default. + return self.m2l_translation_classes_dependent_ndata(src_expansion) + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, - dvec, tgt_rscale, sac=None, precomputed_exprs=None): + dvec, tgt_rscale, sac=None, m2l_translation_classes_dependent_data=None): """Translate from a multipole or local expansion to a local expansion :arg src_expansion: The source expansion to translate from. @@ -90,9 +171,9 @@ class LocalExpansionBase(ExpansionBase): :arg sac: An object of type :class:`sumpy.assignment_collection.SymbolicAssignmentCollection` to collect common subexpressions or None. - :arg precomputed_exprs: An iterable of symbolic expressions representing the - expressions returned by - :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_global_precompute_exprs`. + :arg m2l_translation_classes_dependent_data: An iterable of symbolic + expressions representing the expressions returned by + :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data`. """ raise NotImplementedError @@ -224,14 +305,20 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): return kernel.postprocess_at_target(result, bvec) - def m2l_global_precompute_nexpr(self, src_expansion): + def m2l_translation_classes_dependent_ndata(self, src_expansion): """Returns number of expressions in M2L global precomputation step. """ - return len(self._m2l_global_precompute_mis(src_expansion)[1]) + mis_with_dummy_rows, mis_without_dummy_rows, _ = \ + self._m2l_translation_classes_dependent_data_mis(src_expansion) + + if self.use_preprocessing_for_m2l: + return len(mis_with_dummy_rows) + else: + return len(mis_without_dummy_rows) - def _m2l_global_precompute_mis(self, src_expansion): - """We would like to compute the M2L by way of a Toeplitz matrix below. - To get the matrix representing the M2L into Toeplitz form, a certain + def _m2l_translation_classes_dependent_data_mis(self, src_expansion): + """We would like to compute the M2L by way of a circulant matrix below. + To get the matrix representing the M2L into circulant form, a certain numbering of rows and columns (as identified by multi-indices) is required. This routine returns that numbering. @@ -243,7 +330,7 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): dropped from the computed result. This method returns the multi-indices representing the rows - of the Toeplitz matrix, the multi-indices representing the rows + of the circulant matrix, the multi-indices representing the rows of the M2L translation matrix and the maximum multi-index of the latter. """ @@ -261,17 +348,17 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): self.get_coefficient_identifiers()) # These are the multi-indices representing the rows - # in the Toeplitz matrix. Note that to get the Toeplitz - # matrix structure some multi-indices that is not in the + # in the circulant matrix. Note that to get the circulant + # matrix structure some multi-indices that are not in the # M2L translation matrix are added. - # This corresponds to adding $\mathcal{O}(p^{d-1})$ + # This corresponds to adding O(p^(d-1)) # additional rows and columns in the case of some PDEs - # like Laplace and $\mathcal{O}(p^d)$ in other cases. - toeplitz_matrix_coeffs = list(gnitb([m + 1 for m in max_mi])) + # like Laplace and O(p^d) in other cases. + circulant_matrix_mis = list(gnitb([m + 1 for m in max_mi])) # These are the multi-indices representing the rows # in the M2L translation matrix without the additional - # multi-indices in the Toeplitz matrix + # multi-indices in the circulant matrix needed_vector_terms = set() # For eg: 2D full Taylor Laplace, we only need kernel derivatives # (n1+n2, m1+m2), n1+m1<=p, n2+m2<=p @@ -281,10 +368,11 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): if needed not in needed_vector_terms: needed_vector_terms.add(needed) - return toeplitz_matrix_coeffs, tuple(needed_vector_terms), max_mi + return circulant_matrix_mis, tuple(needed_vector_terms), max_mi - def m2l_global_precompute_exprs(self, src_expansion, src_rscale, + def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale, dvec, tgt_rscale, sac): + # We know the general form of the multipole expansion is: # # coeff0 * diff(kernel(src - c1), mi0) + @@ -303,8 +391,11 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): if not self.use_rscale: src_rscale = 1 - toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ - self._m2l_global_precompute_mis(src_expansion) + circulant_matrix_mis, needed_vector_terms, max_mi = \ + self._m2l_translation_classes_dependent_data_mis(src_expansion) + + circulant_matrix_ident_to_index = {ident: i for i, ident in + enumerate(circulant_matrix_mis)} # Create a expansion terms wrangler for derivatives up to order # (tgt order)+(src order) including a corresponding reduction matrix @@ -339,11 +430,72 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): for i, term in enumerate(needed_vector_terms): vector[i] = add_to_sac(sac, vector_full[srcplusderiv_ident_to_index[term]]) + + if self.use_preprocessing_for_m2l: + # Add zero values needed to make the translation matrix circulant + derivatives_full = [0]*len(circulant_matrix_mis) + for expr, mi in zip(vector, needed_vector_terms): + derivatives_full[circulant_matrix_ident_to_index[mi]] = expr + + # Note that the matrix we have now is a mirror image of a + # circulant matrix. We reverse the first column to get the + # first column for the circulant matrix and then finally + # use the FFT for convolution represented by the circulant + # matrix. + return fft(list(reversed(derivatives_full)), sac=sac) + return vector + def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac, + src_rscale): + circulant_matrix_mis, needed_vector_terms, max_mi = \ + self._m2l_translation_classes_dependent_data_mis(src_expansion) + circulant_matrix_ident_to_index = dict((ident, i) for i, ident in + enumerate(circulant_matrix_mis)) + + # Calculate the input vector for the circulant matrix + input_vector = [0] * len(circulant_matrix_mis) + for coeff, term in zip( + src_coeff_exprs, + src_expansion.get_coefficient_identifiers()): + input_vector[circulant_matrix_ident_to_index[term]] = \ + add_to_sac(sac, coeff) + + if self.use_preprocessing_for_m2l: + return fft(input_vector, sac=sac) + else: + # When FFT is turned off, there is no preprocessing needed + # Therefore no copying is done and the multipole expansion is sent to + # the main M2L routine as it is. This method is used internally in the + # the main M2l routine to avoid code duplication. + return input_vector + + def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale, + tgt_rscale, sac): + circulant_matrix_mis, needed_vector_terms, max_mi = \ + self._m2l_translation_classes_dependent_data_mis(src_expansion) + circulant_matrix_ident_to_index = dict((ident, i) for i, ident in + enumerate(circulant_matrix_mis)) + + if self.use_preprocessing_for_m2l: + n = len(circulant_matrix_mis) + m2l_result = fft(m2l_result, inverse=True, sac=sac) + # since we reversed the M2L matrix, we reverse the result + # to get the correct result + m2l_result = list(reversed(m2l_result[:n])) + + # Filter out the dummy rows and scale them for target + rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale) + result = [ + m2l_result[circulant_matrix_ident_to_index[term]] + * rscale_ratio**sum(term) + for term in self.get_coefficient_identifiers()] + + return result + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, dvec, tgt_rscale, sac=None, _fast_version=True, - precomputed_exprs=None): + m2l_translation_classes_dependent_data=None): logger.info("building translation operator for %s: %s(%d) -> %s(%d): start", src_expansion.kernel, type(src_expansion).__name__, @@ -360,39 +512,37 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase): # {{{ M2L if isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase): - toeplitz_matrix_coeffs, needed_vector_terms, max_mi = \ - self._m2l_global_precompute_mis(src_expansion) - toeplitz_matrix_ident_to_index = {ident: i for i, ident in - enumerate(toeplitz_matrix_coeffs)} - - if not precomputed_exprs: - derivatives = self.m2l_global_precompute_exprs(src_expansion, - src_rscale, dvec, tgt_rscale, sac) + circulant_matrix_mis, needed_vector_terms, max_mi = \ + self._m2l_translation_classes_dependent_data_mis(src_expansion) + circulant_matrix_ident_to_index = {ident: i for i, ident in + enumerate(circulant_matrix_mis)} + + if not m2l_translation_classes_dependent_data: + derivatives = self.m2l_translation_classes_dependent_data( + src_expansion, src_rscale, dvec, tgt_rscale, sac) else: - derivatives = precomputed_exprs - - derivatives_full = [0]*len(toeplitz_matrix_coeffs) - for expr, mi in zip(derivatives, needed_vector_terms): - derivatives_full[toeplitz_matrix_ident_to_index[mi]] = expr - - # Calculate the first row of the upper triangular Toeplitz matrix - toeplitz_first_row = [0] * len(toeplitz_matrix_coeffs) - for coeff, term in zip( - src_coeff_exprs, - src_expansion.get_coefficient_identifiers()): - toeplitz_first_row[toeplitz_matrix_ident_to_index[term]] = \ - add_to_sac(sac, coeff) - - # Do the matvec - output = matvec_toeplitz_upper_triangular(toeplitz_first_row, - derivatives_full) - - # Filter out the dummy rows and scale them for target - result = [] - rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale) - for term in self.get_coefficient_identifiers(): - index = toeplitz_matrix_ident_to_index[term] - result.append(output[index]*rscale_ratio**sum(term)) + derivatives = m2l_translation_classes_dependent_data + + if self.use_preprocessing_for_m2l: + assert m2l_translation_classes_dependent_data is not None + assert len(src_coeff_exprs) == len( + m2l_translation_classes_dependent_data) + result = [a*b for a, b in zip(m2l_translation_classes_dependent_data, + src_coeff_exprs)] + else: + derivatives_full = [0]*len(circulant_matrix_mis) + for expr, mi in zip(derivatives, needed_vector_terms): + derivatives_full[circulant_matrix_ident_to_index[mi]] = expr + + input_vector = self.m2l_preprocess_multipole_exprs(src_expansion, + src_coeff_exprs, sac, src_rscale) + + # Do the matvec + output = matvec_toeplitz_upper_triangular(input_vector, + derivatives_full) + + result = self.m2l_postprocess_local_exprs(src_expansion, output, + src_rscale, tgt_rscale, sac) logger.info("building translation operator: done") return result @@ -537,8 +687,10 @@ class VolumeTaylorLocalExpansion( VolumeTaylorExpansion, VolumeTaylorLocalExpansionBase): - def __init__(self, kernel, order, use_rscale=None): - VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale) + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): + VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale, + use_preprocessing_for_m2l) VolumeTaylorExpansion.__init__(self, kernel, order, use_rscale) @@ -546,8 +698,10 @@ class LinearPDEConformingVolumeTaylorLocalExpansion( LinearPDEConformingVolumeTaylorExpansion, VolumeTaylorLocalExpansionBase): - def __init__(self, kernel, order, use_rscale=None): - VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale) + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): + VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale, + use_preprocessing_for_m2l) LinearPDEConformingVolumeTaylorExpansion.__init__( self, kernel, order, use_rscale) @@ -590,6 +744,22 @@ class BiharmonicConformingVolumeTaylorLocalExpansion( # {{{ 2D Bessel-based-expansion class _FourierBesselLocalExpansion(LocalExpansionBase): + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): + + if use_preprocessing_for_m2l: + # FIXME: expansion with FFT is correct symbolically and can be verified + # with sympy. However there are numerical issues that we have to deal + # with. Greengard and Rokhlin 1988 attributes this to numerical + # instability but gives rscale as a possible solution. Sumpy's rscale + # choice is slightly different from Greengard and Rokhlin and that + # might be the reason for this numerical issue. + raise ValueError("Bessel based expansions with FFT is not fully " + "supported yet.") + + super().__init__(kernel, order, use_rscale, + use_preprocessing_for_m2l=use_preprocessing_for_m2l) + def get_storage_index(self, k): return self.order+k @@ -630,9 +800,92 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): * sym.exp(sym.I * c * -target_angle_rel_center), bvec) for c in self.get_coefficient_identifiers()) + def m2l_translation_classes_dependent_ndata(self, src_expansion): + nexpr = 2 * self.order + 2 * src_expansion.order + 1 + return nexpr + + def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale, + dvec, tgt_rscale, sac): + + from sumpy.symbolic import sym_real_norm_2, Hankel1 + from sumpy.tools import fft + + dvec_len = sym_real_norm_2(dvec) + new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) + arg_scale = self.get_bessel_arg_scaling() + # [-(src_order+tgt_order), ..., 0, ..., (src_order + tgt_order)] + m2l_translation_classes_dependent_data = \ + [0] * (2*self.order + 2 * src_expansion.order + 1) + + # The M2L is a mirror image of a Toeplitz matvec with Hankel function + # evaluations. https://dlmf.nist.gov/10.23.F1 + # This loop computes the first row and the last column vector sufficient + # to specify the matrix entries. + for j in self.get_coefficient_identifiers(): + idx_j = self.get_storage_index(j) + for m in src_expansion.get_coefficient_identifiers(): + idx_m = src_expansion.get_storage_index(m) + m2l_translation_classes_dependent_data[idx_j + idx_m] = ( + Hankel1(m + j, arg_scale * dvec_len, 0) + * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center)) + + if self.use_preprocessing_for_m2l: + order = src_expansion.order + # For this expansion, we have a mirror image of a Toeplitz matrix. + # First, we have to take the mirror image of the M2L matrix. + # + # After that the Toeplitz matrix has to be embedded in a circulant + # matrix. In this cicrcular matrix the first part of the first + # column is the first column of the Toeplitz matrix which is + # the last column of the M2L matrix. The second part is the + # reverse of the first row of the Toeplitz matrix which + # is the reverse of the first row of the M2L matrix. + first_row_m2l, last_column_m2l = \ + m2l_translation_classes_dependent_data[:2*order], \ + m2l_translation_classes_dependent_data[2*order:] + first_column_toeplitz = last_column_m2l + first_row_toeplitz = list(reversed(first_row_m2l)) + + first_column_circulant = list(first_column_toeplitz) + \ + list(reversed(first_row_toeplitz)) + return fft(first_column_circulant, sac) + else: + return m2l_translation_classes_dependent_data + + def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac, + src_rscale): + + from sumpy.tools import fft + src_coeff_exprs = list(src_coeff_exprs) + for m in src_expansion.get_coefficient_identifiers(): + src_coeff_exprs[src_expansion.get_storage_index(m)] *= src_rscale**abs(m) + + if self.use_preprocessing_for_m2l: + src_coeff_exprs = list(reversed(src_coeff_exprs)) + src_coeff_exprs += [0] * (len(src_coeff_exprs) - 1) + res = fft(src_coeff_exprs, sac=sac) + return res + else: + return src_coeff_exprs + + def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale, + tgt_rscale, sac): + + if self.use_preprocessing_for_m2l: + m2l_result = fft(m2l_result, inverse=True, sac=sac) + m2l_result = m2l_result[:2*self.order+1] + + # Filter out the dummy rows and scale them for target + result = [] + for j in self.get_coefficient_identifiers(): + result.append(m2l_result[self.get_storage_index(j)] + * tgt_rscale**(abs(j)) * sym.Integer(-1)**j) + + return result + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, - dvec, tgt_rscale, sac=None, precomputed_exprs=None): - from sumpy.symbolic import sym_real_norm_2, BesselJ, Hankel1 + dvec, tgt_rscale, sac=None, m2l_translation_classes_dependent_data=None): + from sumpy.symbolic import sym_real_norm_2, BesselJ if not self.use_rscale: src_rscale = 1 @@ -644,6 +897,7 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): dvec_len = sym_real_norm_2(dvec) new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) translated_coeffs = [] + for j in self.get_coefficient_identifiers(): translated_coeffs.append( sum(src_coeff_exprs[src_expansion.get_storage_index(m)] @@ -655,19 +909,31 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): return translated_coeffs if isinstance(src_expansion, self.mpole_expn_class): - dvec_len = sym_real_norm_2(dvec) - new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0]) + if m2l_translation_classes_dependent_data is None: + derivatives = self.m2l_translation_classes_dependent_data( + src_expansion, src_rscale, dvec, tgt_rscale, sac=sac) + else: + derivatives = m2l_translation_classes_dependent_data + translated_coeffs = [] + if self.use_preprocessing_for_m2l: + assert m2l_translation_classes_dependent_data is not None + assert len(derivatives) == len(src_coeff_exprs) + for a, b in zip(derivatives, src_coeff_exprs): + translated_coeffs.append(a * b) + return translated_coeffs + + src_coeff_exprs = self.m2l_preprocess_multipole_exprs(src_expansion, + src_coeff_exprs, sac, src_rscale) + for j in self.get_coefficient_identifiers(): translated_coeffs.append( - sum( - sym.Integer(-1) ** j - * Hankel1(m + j, arg_scale * dvec_len, 0) - * src_rscale ** abs(m) - * tgt_rscale ** abs(j) - * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center) + sum(derivatives[m + j + self.order + src_expansion.order] * src_coeff_exprs[src_expansion.get_storage_index(m)] for m in src_expansion.get_coefficient_identifiers())) + + translated_coeffs = self.m2l_postprocess_local_exprs(src_expansion, + translated_coeffs, src_rscale, tgt_rscale, sac) return translated_coeffs raise RuntimeError("do not know how to translate %s to %s" @@ -676,12 +942,14 @@ class _FourierBesselLocalExpansion(LocalExpansionBase): class H2DLocalExpansion(_FourierBesselLocalExpansion): - def __init__(self, kernel, order, use_rscale=None): + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): from sumpy.kernel import HelmholtzKernel assert (isinstance(kernel.get_base_kernel(), HelmholtzKernel) and kernel.dim == 2) - super().__init__(kernel, order, use_rscale) + super().__init__(kernel, order, use_rscale, + use_preprocessing_for_m2l=use_preprocessing_for_m2l) from sumpy.expansion.multipole import H2DMultipoleExpansion self.mpole_expn_class = H2DMultipoleExpansion @@ -691,12 +959,14 @@ class H2DLocalExpansion(_FourierBesselLocalExpansion): class Y2DLocalExpansion(_FourierBesselLocalExpansion): - def __init__(self, kernel, order, use_rscale=None): + def __init__(self, kernel, order, use_rscale=None, + use_preprocessing_for_m2l=False): from sumpy.kernel import YukawaKernel assert (isinstance(kernel.get_base_kernel(), YukawaKernel) and kernel.dim == 2) - super().__init__(kernel, order, use_rscale) + super().__init__(kernel, order, use_rscale, + use_preprocessing_for_m2l=use_preprocessing_for_m2l) from sumpy.expansion.multipole import Y2DMultipoleExpansion self.mpole_expn_class = Y2DMultipoleExpansion diff --git a/sumpy/fmm.py b/sumpy/fmm.py index d0471f8659175d9b9150c27b63d6183dfa468d25..26acf71389d7ee5cb70ce91466d48e0365161012 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -36,8 +36,11 @@ from sumpy import ( P2EFromSingleBox, P2EFromCSR, E2PFromSingleBox, E2PFromCSR, P2PFromCSR, - E2EFromCSR, E2EFromChildren, E2EFromParent, - E2EFromCSRTranslationClassesPrecompute) + E2EFromCSR, M2LUsingTranslationClassesDependentData, + E2EFromChildren, E2EFromParent, + M2LGenerateTranslationClassesDependentData, + M2LPreprocessMultipole) +from sumpy.tools import to_complex_dtype def level_to_rscale(tree, level): @@ -62,7 +65,8 @@ class SumpyExpansionWranglerCodeContainer: multipole_expansion_factory, local_expansion_factory, target_kernels, exclude_self=False, use_rscale=None, - strength_usage=None, source_kernels=None): + strength_usage=None, source_kernels=None, + use_preprocessing_for_m2l=False): """ :arg multipole_expansion_factory: a callable of a single argument (order) that returns a multipole expansion. @@ -80,6 +84,7 @@ class SumpyExpansionWranglerCodeContainer: self.exclude_self = exclude_self self.use_rscale = use_rscale self.strength_usage = strength_usage + self.use_preprocessing_for_m2l = use_preprocessing_for_m2l self.cl_context = cl_context @@ -94,7 +99,8 @@ class SumpyExpansionWranglerCodeContainer: @memoize_method def local_expansion(self, order): - return self.local_expansion_factory(order, self.use_rscale) + return self.local_expansion_factory(order, self.use_rscale, + use_preprocessing_for_m2l=self.use_preprocessing_for_m2l) @memoize_method def p2m(self, tgt_order): @@ -117,15 +123,25 @@ class SumpyExpansionWranglerCodeContainer: self.multipole_expansion(tgt_order)) @memoize_method - def m2l(self, src_order, tgt_order, use_precomputed_exprs=False): - return E2EFromCSR(self.cl_context, + def m2l(self, src_order, tgt_order, + m2l_use_translation_classes_dependent_data=False): + if m2l_use_translation_classes_dependent_data: + m2l_class = M2LUsingTranslationClassesDependentData + else: + m2l_class = E2EFromCSR + return m2l_class(self.cl_context, + self.multipole_expansion(src_order), + self.local_expansion(tgt_order)) + + @memoize_method + def m2l_translation_class_dependent_data_kernel(self, src_order, tgt_order): + return M2LGenerateTranslationClassesDependentData(self.cl_context, self.multipole_expansion(src_order), - self.local_expansion(tgt_order), - use_precomputed_exprs=use_precomputed_exprs) + self.local_expansion(tgt_order)) @memoize_method - def m2l_optimized_precompute_kernel(self, src_order, tgt_order): - return E2EFromCSRTranslationClassesPrecompute(self.cl_context, + def m2l_preprocess_mpole_kernel(self, src_order, tgt_order): + return M2LPreprocessMultipole(self.cl_context, self.multipole_expansion(src_order), self.local_expansion(tgt_order)) @@ -166,7 +182,6 @@ class SumpyExpansionWranglerCodeContainer: return SumpyExpansionWrangler(self, queue, tree, dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, self_extra_kwargs, translation_classes_data=translation_classes_data) - # }}} @@ -287,13 +302,18 @@ class SumpyExpansionWrangler: Keyword arguments to be passed for handling self interactions (source and target particles are the same), provided special handling is needed + + .. attribute:: preprocessed_mpole_dtype + + Type for the preprocessed multipole expansion if used for M2L. """ def __init__(self, code_container, queue, tree, dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs=None, self_extra_kwargs=None, - translation_classes_data=None): + translation_classes_data=None, + preprocessed_mpole_dtype=None): self.code = code_container self.queue = queue self.tree = tree @@ -301,6 +321,15 @@ class SumpyExpansionWrangler: self.dtype = dtype + if not self.code.use_preprocessing_for_m2l: + # If not FFT, we don't need complex dtypes + self.preprocessed_mpole_dtype = dtype + elif preprocessed_mpole_dtype is not None: + self.preprocessed_mpole_dtype = preprocessed_mpole_dtype + else: + # FIXME: It is weird that the wrangler has to compute this. + self.preprocessed_mpole_dtype = to_complex_dtype(dtype) + if kernel_extra_kwargs is None: kernel_extra_kwargs = {} @@ -326,13 +355,19 @@ class SumpyExpansionWrangler: if base_kernel.is_translation_invariant: if translation_classes_data is None: from warnings import warn - warn( - "List 2 (multipole-to-local) translations will be " - "unoptimized. Supply a translation_classes_data argument to " - "the wrangler for optimized List 2.", - SumpyTranslationClassesDataNotSuppliedWarning, - - stacklevel=2) + if self.code.use_preprocessing_for_m2l: + raise NotImplementedError( + "FFT based List 2 (multipole-to-local) translations " + "without translation_classes_data argument is not " + "implemented. Supply a translation_classes_data argument " + "to the wrangler for optimized List 2.") + else: + warn( + "List 2 (multipole-to-local) translations will be " + "unoptimized. Supply a translation_classes_data argument " + "to the wrangler for optimized List 2.", + SumpyTranslationClassesDataNotSuppliedWarning, + stacklevel=2) self.supports_optimized_m2l = False else: self.supports_optimized_m2l = True @@ -340,6 +375,7 @@ class SumpyExpansionWrangler: self.supports_optimized_m2l = False self.translation_classes_data = translation_classes_data + self.use_preprocessing_for_m2l = self.code.use_preprocessing_for_m2l # {{{ data vector utilities def _expansions_level_starts(self, order_to_size): @@ -349,12 +385,12 @@ class SumpyExpansionWrangler: @memoize_method def multipole_expansions_level_starts(self): return self._expansions_level_starts( - lambda order: len(self.code.multipole_expansion_factory(order))) + lambda order: len(self.code.multipole_expansion(order))) @memoize_method def local_expansions_level_starts(self): return self._expansions_level_starts( - lambda order: len(self.code.local_expansion_factory(order))) + lambda order: len(self.code.local_expansion(order))) @memoize_method def m2l_translation_class_level_start_box_nrs(self): @@ -362,11 +398,11 @@ class SumpyExpansionWrangler: return data.m2l_translation_classes_level_starts().get(self.queue) @memoize_method - def m2l_precomputed_exprs_level_starts(self): + def m2l_translation_classes_dependent_data_level_starts(self): def order_to_size(order): - mpole_expn = self.code.multipole_expansion_factory(order) - local_expn = self.code.local_expansion_factory(order) - return local_expn.m2l_global_precompute_nexpr(mpole_expn) + mpole_expn = self.code.multipole_expansion(order) + local_expn = self.code.local_expansion(order) + return local_expn.m2l_translation_classes_dependent_ndata(mpole_expn) return build_csr_level_starts(self.level_orders, order_to_size, level_starts=self.m2l_translation_class_level_start_box_nrs()) @@ -383,11 +419,11 @@ class SumpyExpansionWrangler: self.local_expansions_level_starts()[-1], dtype=self.dtype) - def m2l_precomputed_exprs_zeros(self): + def m2l_translation_classes_dependent_data_zeros(self): return cl.array.zeros( self.queue, - self.m2l_precomputed_exprs_level_starts()[-1], - dtype=self.dtype) + self.m2l_translation_classes_dependent_data_level_starts()[-1], + dtype=self.preprocessed_mpole_dtype) def multipole_expansions_view(self, mpole_exps, level): expn_start, expn_stop = \ @@ -405,16 +441,42 @@ class SumpyExpansionWrangler: return (box_start, local_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) - def m2l_precomputed_exprs_view(self, m2l_precomputed_exprs, level): + def m2l_translation_classes_dependent_data_view(self, + m2l_translation_classes_dependent_data, level): expn_start, expn_stop = \ - self.m2l_precomputed_exprs_level_starts()[level:level+2] + self.m2l_translation_classes_dependent_data_level_starts()[level:level+2] translation_class_start, translation_class_stop = \ - self.m2l_translation_class_level_start_box_nrs()[level:level+2] + self.m2l_translation_class_level_start_box_nrs()[level:level+2] - exprs_level = m2l_precomputed_exprs[expn_start:expn_stop] + exprs_level = m2l_translation_classes_dependent_data[expn_start:expn_stop] return (translation_class_start, exprs_level.reshape( translation_class_stop - translation_class_start, -1)) + @memoize_method + def m2l_preproc_mpole_expansions_level_starts(self): + def order_to_size(order): + mpole_expn = self.code.multipole_expansion(order) + local_expn = self.code.local_expansion(order) + res = local_expn.m2l_translation_classes_dependent_ndata(mpole_expn) + return res + + return build_csr_level_starts(self.level_orders, order_to_size, + level_starts=self.tree.level_start_box_nrs) + + def m2l_preproc_mpole_expansion_zeros(self): + return cl.array.zeros( + self.queue, + self.m2l_preproc_mpole_expansions_level_starts()[-1], + dtype=self.preprocessed_mpole_dtype) + + def m2l_preproc_mpole_expansions_view(self, mpole_exps, level): + expn_start, expn_stop = \ + self.m2l_preproc_mpole_expansions_level_starts()[level:level+2] + box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + + return (box_start, + mpole_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) + def output_zeros(self): from pytools.obj_array import make_obj_array return make_obj_array([ @@ -583,16 +645,20 @@ class SumpyExpansionWrangler: @memoize_method def multipole_to_local_precompute(self, src_rscale): - m2l_precomputed_exprs = self.m2l_precomputed_exprs_zeros() + m2l_translation_classes_dependent_data = \ + self.m2l_translation_classes_dependent_data_zeros() for lev in range(self.tree.nlevels): order = self.level_orders[lev] precompute_kernel = \ - self.code.m2l_optimized_precompute_kernel(order, order) + self.code.m2l_translation_class_dependent_data_kernel(order, order) - translation_classes_level_start, precomputed_exprs_view = \ - self.m2l_precomputed_exprs_view(m2l_precomputed_exprs, lev) + translation_classes_level_start, \ + m2l_translation_classes_dependent_data_view = \ + self.m2l_translation_classes_dependent_data_view( + m2l_translation_classes_dependent_data, lev) - ntranslation_classes = precomputed_exprs_view.shape[0] + ntranslation_classes = \ + m2l_translation_classes_dependent_data_view.shape[0] if ntranslation_classes == 0: continue @@ -605,17 +671,19 @@ class SumpyExpansionWrangler: src_rscale=src_rscale, translation_classes_level_start=translation_classes_level_start, ntranslation_classes=ntranslation_classes, - m2l_precomputed_exprs=precomputed_exprs_view, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data_view), m2l_translation_vectors=m2l_translation_vectors, ntranslation_vectors=m2l_translation_vectors.shape[1], **self.kernel_extra_kwargs ) - m2l_precomputed_exprs.add_event(evt) + m2l_translation_classes_dependent_data.add_event(evt) - m2l_precomputed_exprs.finish() + m2l_translation_classes_dependent_data.finish() - return (m2l_precomputed_exprs, SumpyTimingFuture(self.queue, - m2l_precomputed_exprs.events[:])) + return (m2l_translation_classes_dependent_data, + SumpyTimingFuture(self.queue, + m2l_translation_classes_dependent_data.events[:])) def _add_m2l_precompute_kwargs(self, kwargs_for_m2l, lev): @@ -626,10 +694,14 @@ class SumpyExpansionWrangler: if not self.supports_optimized_m2l: return src_rscale = kwargs_for_m2l["src_rscale"] - m2l_precomputed_exprs, _ = self.multipole_to_local_precompute(src_rscale) - translation_classes_level_start, precomputed_exprs_view = \ - self.m2l_precomputed_exprs_view(m2l_precomputed_exprs, lev) - kwargs_for_m2l["m2l_precomputed_exprs"] = precomputed_exprs_view + m2l_translation_classes_dependent_data, _ = \ + self.multipole_to_local_precompute(src_rscale) + translation_classes_level_start, \ + m2l_translation_classes_dependent_data_view = \ + self.m2l_translation_classes_dependent_data_view( + m2l_translation_classes_dependent_data, lev) + kwargs_for_m2l["m2l_translation_classes_dependent_data"] = \ + m2l_translation_classes_dependent_data_view kwargs_for_m2l["translation_classes_level_start"] = \ translation_classes_level_start kwargs_for_m2l["m2l_translation_classes_lists"] = \ @@ -639,9 +711,43 @@ class SumpyExpansionWrangler: level_start_target_box_nrs, target_boxes, src_box_starts, src_box_lists, mpole_exps): - local_exps = self.local_expansion_zeros() + + precompute_evts = [] + + if self.use_preprocessing_for_m2l: + preprocessed_mpole_exps = self.m2l_preproc_mpole_expansion_zeros() + for lev in range(self.tree.nlevels): + order = self.level_orders[lev] + preprocess_mpole_kernel = \ + self.code.m2l_preprocess_mpole_kernel(order, order) + + source_level_start_ibox, source_mpoles_view = \ + self.multipole_expansions_view(mpole_exps, lev) + + _, preprocessed_source_mpoles_view = \ + self.m2l_preproc_mpole_expansions_view( + preprocessed_mpole_exps, lev) + + tr_classes = self.m2l_translation_class_level_start_box_nrs() + if tr_classes[lev] == tr_classes[lev + 1]: + # There is no M2L happening in this level + continue + + evt, _ = preprocess_mpole_kernel( + self.queue, + src_expansions=source_mpoles_view, + preprocessed_src_expansions=preprocessed_source_mpoles_view, + src_rscale=level_to_rscale(self.tree, lev), + **self.kernel_extra_kwargs + ) + precompute_evts.append(evt) + mpole_exps = preprocessed_mpole_exps + mpole_exps_view_func = self.m2l_preproc_mpole_expansions_view + else: + mpole_exps_view_func = self.multipole_expansions_view events = [] + local_exps = self.local_expansion_zeros() for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] @@ -652,7 +758,7 @@ class SumpyExpansionWrangler: m2l = self.code.m2l(order, order, self.supports_optimized_m2l) source_level_start_ibox, source_mpoles_view = \ - self.multipole_expansions_view(mpole_exps, lev) + mpole_exps_view_func(mpole_exps, lev) target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, lev) @@ -673,11 +779,11 @@ class SumpyExpansionWrangler: **self.kernel_extra_kwargs) self._add_m2l_precompute_kwargs(kwargs, lev) - if "m2l_precomputed_exprs" in kwargs and \ - kwargs["m2l_precomputed_exprs"].size == 0: - # There's nothing to do for this level + if "m2l_translation_classes_dependent_data" in kwargs and \ + kwargs["m2l_translation_classes_dependent_data"].size == 0: + # There is nothing to do for this level continue - evt, _ = m2l(self.queue, **kwargs) + evt, _ = m2l(self.queue, **kwargs, wait_for=precompute_evts) events.append(evt) diff --git a/sumpy/p2e.py b/sumpy/p2e.py index 1fdf8ab2008eea111d43863838a055bfa80bb629..cb4b6616dca435db2c77350f5aaf5952dacdf6d3 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -116,7 +116,6 @@ class P2EBase(KernelComputation, KernelCacheWrapper): vector_names={"a"}, pymbolic_expr_maps=code_transformers, retain_names=coeff_names, - complex_dtype=np.complex128 # FIXME ) def get_cache_key(self): diff --git a/sumpy/p2p.py b/sumpy/p2p.py index b1d07f8fd9ea88d357903b3720602e2582290089..c2aa895d5dec0c250149fe5166ede72cf8534e68 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -139,7 +139,6 @@ class P2PBase(KernelComputation, KernelCacheWrapper): [knl.get_code_transformer() for knl in self.source_kernels] + [knl.get_code_transformer() for knl in self.target_kernels]), retain_names=result_names, - complex_dtype=np.complex128 # FIXME ) from pymbolic import var diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py index eca0f0001dd706e0409d47d56ae02dd97785c464..37e2710630a1f8a8315160a7baf2199d858fb9ad 100644 --- a/sumpy/symbolic.py +++ b/sumpy/symbolic.py @@ -71,7 +71,7 @@ _find_symbolic_backend() # Before adding a function here, make sure it's present in both modules. SYMBOLIC_API = """ Add Basic Mul Pow exp sqrt log symbols sympify cos sin atan2 Function Symbol -Derivative Integer Matrix Subs I pi functions Number""".split() +Derivative Integer Matrix Subs I pi functions Number Float""".split() if USE_SYMENGINE: import symengine as sym diff --git a/sumpy/tools.py b/sumpy/tools.py index 4226a40b097796379db15416b74000d225b0a3f1..5b3e31308e72e28a8e6491d6cb83d1a3824b71be 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -1,6 +1,7 @@ __copyright__ = """ Copyright (C) 2012 Andreas Kloeckner Copyright (C) 2018 Alexandru Fikl +Copyright (C) 2020 Isuru Fernando """ __license__ = """ @@ -37,6 +38,7 @@ __doc__ = """ from pytools import memoize_method from pytools.tag import Tag, tag_dataclass +import numbers from pymbolic.mapper import WalkMapper import numpy as np @@ -90,7 +92,8 @@ def add_to_sac(sac, expr): if sac is None: return expr - if expr.is_Number or expr.is_Symbol: + if isinstance(expr, (numbers.Number, sym.Number, int, + float, complex, sym.Symbol)): return expr name = sac.assign_temp("temp", expr) @@ -726,7 +729,6 @@ class KernelCacheWrapper: @staticmethod def _allow_redundant_execution_of_knl_scaling(knl): from loopy.match import ObjTagged - from sumpy.tools import ScalingAssignmentTag return lp.add_inames_for_unused_hw_axes( knl, within=ObjTagged(ScalingAssignmentTag())) @@ -737,6 +739,8 @@ def is_obj_array_like(ary): or (isinstance(ary, np.ndarray) and ary.dtype.char == "O")) +# {{{ matrices + def reduced_row_echelon_form(m, atol=0): """Calculates a reduced row echelon form of a matrix `m`. @@ -853,6 +857,51 @@ def find_linear_relationship(matrix): return {} +# }}} + +# {{{ FFT + +def fft(seq, inverse=False, sac=None): + """ + Return the discrete fourier transform of the sequence seq. + seq should be a python iterable with tuples of length 2 + corresponding to the real part and imaginary part. + """ + + from pymbolic.algorithm import fft as _fft, ifft as _ifft + + def wrap(expr): + if isinstance(expr, np.ndarray): + res = [wrap(a) for a in expr] + return np.array(res, dtype=object).reshape(expr.shape) + return add_to_sac(sac, expr) + + if inverse: + return _ifft(list(seq), wrap_intermediate=wrap).tolist() + else: + return _fft(list(seq), wrap_intermediate=wrap).tolist() + + +def fft_toeplitz_upper_triangular(first_row, x, sac=None): + """ + Returns the matvec of the Toeplitz matrix given by + the first row and the vector x using a Fourier transform + """ + assert len(first_row) == len(x) + n = len(first_row) + v = list(first_row) + v += [0]*(n-1) + + x = list(reversed(x)) + x += [0]*(n-1) + + v_fft = fft(v, sac) + x_fft = fft(x, sac) + res_fft = [add_to_sac(sac, a * b) for a, b in zip(v_fft, x_fft)] + res = fft(res_fft, inverse=True, sac=sac) + return list(reversed(res[:n])) + + def matvec_toeplitz_upper_triangular(first_row, vector): n = len(first_row) assert len(vector) == n @@ -862,4 +911,22 @@ def matvec_toeplitz_upper_triangular(first_row, vector): output[row] = sym.Add(*terms) return output + +to_complex_type_dict = { + np.complex64: np.complex64, + np.complex128: np.complex128, + np.float32: np.complex64, + np.float64: np.complex128, +} + + +def to_complex_dtype(dtype): + np_type = np.dtype(dtype).type + try: + return to_complex_type_dict[np_type] + except KeyError: + raise RuntimeError(f"Unknown dtype: {dtype}") + +# }}} + # vim: fdm=marker diff --git a/test/test_fmm.py b/test/test_fmm.py index f587c5e3c7e501c0bf2ef5f224a0a58386d417dd..b50217cbcdeb23f4237457c0a788515eff71e3c0 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -51,38 +51,35 @@ else: faulthandler.enable() -@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class, optimized_m2l", [ - (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - False), - (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - True), +@pytest.mark.parametrize("optimized_m2l, use_fft", + [(False, False), (True, False), (True, True)]) +@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class", +[ + (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion), (LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, True), - (LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, False), - (LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - False), - (LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - True), - (LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, True), + LinearPDEConformingVolumeTaylorMultipoleExpansion), + (LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion), (LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, False), - (HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - False), + LinearPDEConformingVolumeTaylorMultipoleExpansion), + (HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion), (HelmholtzKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, False), - (HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion, False), - (HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, - False), + LinearPDEConformingVolumeTaylorMultipoleExpansion), + (HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion), + (HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion), (HelmholtzKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion, - LinearPDEConformingVolumeTaylorMultipoleExpansion, False), - (YukawaKernel(2), Y2DLocalExpansion, Y2DMultipoleExpansion, False), - ]) + LinearPDEConformingVolumeTaylorMultipoleExpansion), + (YukawaKernel(2), Y2DLocalExpansion, Y2DMultipoleExpansion), +]) def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class, - optimized_m2l): + optimized_m2l, use_fft): logging.basicConfig(level=logging.INFO) + if local_expn_class == VolumeTaylorLocalExpansion and use_fft: + pytest.skip("VolumeTaylorExpansion with FFT takes a lot of resources.") + + if local_expn_class in [H2DLocalExpansion, Y2DLocalExpansion] and use_fft: + pytest.skip("Fourier/Bessel based expansions with FFT is not supported yet.") + ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -123,10 +120,10 @@ def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class, # {{{ plot tree if 0: - host_tree = tree.get() - host_trav = trav.get() + host_tree = tree.get(queue) + host_trav = trav.get(queue) - if 1: + if 0: print("src_box", host_tree.find_box_nr_for_source(403)) print("tgt_box", host_tree.find_box_nr_for_target(28)) print(list(host_trav.target_or_target_parent_boxes).index(37)) @@ -163,7 +160,7 @@ def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class, if knl.dim == 3: order_values = [1, 2] elif knl.dim == 2 and issubclass(local_expn_class, H2DLocalExpansion): - order_values = [10, 12] + order_values = [4, 5] elif isinstance(knl, YukawaKernel): extra_kwargs["lam"] = 2 @@ -190,7 +187,7 @@ def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class, ctx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), - target_kernels) + target_kernels, use_preprocessing_for_m2l=use_fft) wrangler = wcc.get_wrangler(queue, tree, dtype, fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order, @@ -574,7 +571,8 @@ def test_sumpy_target_point_multiplier(ctx_factory): # You can test individual routines by typing -# $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context)' +# $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context, LaplaceKernel(2), +# VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion, False, False)' if __name__ == "__main__": if len(sys.argv) > 1: diff --git a/test/test_tools.py b/test/test_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..018a95ed9e727e806da34f47d45fd7d22c88f222 --- /dev/null +++ b/test/test_tools.py @@ -0,0 +1,56 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2020 Isuru Fernando" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import logging +logger = logging.getLogger(__name__) + +import sumpy.symbolic as sym +from sumpy.tools import (fft_toeplitz_upper_triangular, + matvec_toeplitz_upper_triangular) +import numpy as np + + +def test_fft(): + k = 5 + v = np.random.rand(k) + x = np.random.rand(k) + + fft = fft_toeplitz_upper_triangular(v, x) + matvec = matvec_toeplitz_upper_triangular(v, x) + + for i in range(k): + assert abs(fft[i] - matvec[i]) < 1e-14 + + +def test_fft_small_floats(): + k = 5 + v = sym.make_sym_vector("v", k) + x = sym.make_sym_vector("x", k) + + fft = fft_toeplitz_upper_triangular(v, x) + for expr in fft: + for f in expr.atoms(sym.Float): + if f == 0: + continue + assert abs(f) > 1e-10