diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9526444ebafca74473618978b184ea0ebad235e9..3792baafd56c996124259764322d0f000a82117b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,12 +1,19 @@ -Python 3.5 POCL: +# Environment variables +# +# * PYTEST_ADDOPTS is used to filter test runs. The default value is "-k-slowtest", +# which skips the slow running tests. +# * SKIP_EXAMPLES, if non-empty, can be used to skip the examples job. + +Python 2.7 POCL: script: - - export PY_EXE=python3.5 + - export PY_EXE=python2.7 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy scipy mako" + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - - python3.5 + - python2.7 - pocl - large-node except: @@ -16,7 +23,8 @@ Python 3.6 POCL: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy scipy mako" + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -28,9 +36,10 @@ Python 3.6 POCL: Python 3.6 POCL Examples: script: + - test -n "$SKIP_EXAMPLES" && exit - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib" + - export EXTRA_INSTALL="Cython pybind11 numpy mako pyvisfile matplotlib" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh - ". ./build-py-project-and-run-examples.sh" tags: @@ -43,8 +52,9 @@ Python 3.6 POCL Examples: Python 3.6 Conda: script: - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine - - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CONDA_ENVIRONMENT=.test-conda-env-py3.yml + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: @@ -53,27 +63,15 @@ Python 3.6 Conda: except: - tags -Python 2.7 POCL: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 scipy numpy mako" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - pocl - - large-node - except: - - tags - Python 3.6 Conda Apple: script: - export LC_ALL=en_US.UTF-8 - export LANG=en_US.UTF-8 - - export PYTEST_ADDOPTS=-k-slowtest - - CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml - - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CC=gcc + - set -o xtrace - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". 
./build-and-test-py-project-within-miniconda.sh" @@ -88,7 +86,7 @@ Python 3.6 Conda Apple: Documentation: script: - - EXTRA_INSTALL="pybind11 numpy mako" + - EXTRA_INSTALL="Cython pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh - ". ./build-docs.sh" tags: diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index 7c0601cb2bf81ce546a590c71a8ea43fb87f9004..454ee30fa34af1eb333bc534b0357e5e2b4fae34 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -13,8 +13,8 @@ dependencies: - python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 -- pyfmmlib - osx-pocl-opencl -# for OpenMP support in pyfmmlib -- libgfortran>=3.0.1 +- pyfmmlib +- gcc +- cython # things not in here: loopy boxtree pymbolic meshmode sumpy diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 16e053730d0b14a6374f60d48cb28c4a9308d598..34b92354d2d36c66f81c30b4483a2f2c71f93b6e 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -14,4 +14,5 @@ dependencies: - symengine=0.3.0 - python-symengine=0.3.0 - pyfmmlib +- cython # things not in here: loopy boxtree pymbolic meshmode sumpy diff --git a/README.rst b/README.rst index dfc5fe49b46f4a9ea13debdc136e9fbf3848c79d..ad3f79057516a8347148e9e9451aada8a73f553b 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ It relies on * `boxtree `_ for FMM tree building * `sumpy `_ for expansions and analytical routines * `modepy `_ for modes and nodes on simplices -* `meshmode `_ for modes and nodes on simplices +* `meshmode `_ for high order discretizations * `loopy `_ for fast array operations * `pytest `_ for automated testing diff --git a/doc/misc.rst b/doc/misc.rst index 198b87da15c8c033552fe8ee881082991a07bc03..da204eb0d5bfc490311121eaf7eecbece76ce06c 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -33,11 +33,12 @@ This set of instructions is intended for 64-bit Linux and macOS computers. #. (*macOS only*) ``conda install osx-pocl-opencl pocl pyopencl`` -#. ``conda install git pip pocl islpy pyopencl sympy pyfmmlib pytest`` +#. ``conda install gcc cython git pip pocl islpy pyopencl sympy pyfmmlib pytest`` -#. Type the following command:: +#. Type the following commands:: - hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode pytential; do python -m pip install git+https://github.com/inducer/$i; done + hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode; do python -m pip install git+https://github.com/inducer/$i; done + CC=gcc python -m pip install git+https://gitlab.tiker.net/inducer/pytential@tsqbx Next time you want to use :mod:`pytential`, just run the following command:: diff --git a/doc/qbx.rst b/doc/qbx.rst index b49698663b9de46e643f571ee46f3148f603c4d0..b640ddcde5dcddd7cfa069f909d8d7fb09a91f4c 100644 --- a/doc/qbx.rst +++ b/doc/qbx.rst @@ -28,5 +28,10 @@ Fast multipole driver .. automodule:: pytential.qbx.fmm +Performance model +----------------- + +.. automodule:: pytential.qbx.performance + .. 
vim: sw=4:tw=75 diff --git a/examples/performance.py b/examples/performance.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2dc61b49349698bebfab694b5aac4cd9456c36 --- /dev/null +++ b/examples/performance.py @@ -0,0 +1,168 @@ +"""Trains a performance model and reports on the accuracy.""" + +import pyopencl as cl +import numpy as np + +from pytential import sym, bind +from pytools import one + + +# {{{ global params + +TARGET_ORDER = 8 +OVSMP_FACTOR = 5 +TCF = 0.9 +QBX_ORDER = 5 +FMM_ORDER = 10 +RUNS = 3 + +DEFAULT_LPOT_KWARGS = { + "_box_extent_norm": "l2", + "_from_sep_smaller_crit": "static_l2", + } + +PANELS_PER_ARM = 30 +TRAINING_ARMS = (10, 15, 25) +TESTING_ARMS = (20,) + + +def starfish_lpot_source(queue, n_arms): + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) + + from meshmode.mesh.generation import make_curve_mesh, NArmedStarfish + + mesh = make_curve_mesh( + NArmedStarfish(n_arms, 0.8), + np.linspace(0, 1, 1 + PANELS_PER_ARM * n_arms), + TARGET_ORDER) + + pre_density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(TARGET_ORDER)) + + lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() + lpot_kwargs.update( + target_association_tolerance=0.025, + _expansion_stick_out_factor=TCF, + fmm_order=FMM_ORDER, qbx_order=QBX_ORDER, + fmm_backend="fmmlib" + ) + + from pytential.qbx import QBXLayerPotentialSource + lpot_source = QBXLayerPotentialSource( + pre_density_discr, OVSMP_FACTOR * TARGET_ORDER, + **lpot_kwargs) + + lpot_source, _ = lpot_source.with_refinement() + + return lpot_source + +# }}} + + +def training_geometries(queue): + for n_arms in TRAINING_ARMS: + yield starfish_lpot_source(queue, n_arms) + + +def test_geometries(queue): + for n_arms in TESTING_ARMS: + yield starfish_lpot_source(queue, n_arms) + + +def get_bound_op(lpot_source): + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + return bind(lpot_source, op) + + +def get_test_density(queue, lpot_source): + density_discr = lpot_source.density_discr + nodes = density_discr.nodes().with_queue(queue) + sigma = cl.clmath.sin(10 * nodes[0]) + + return sigma + + +def train_performance_model(ctx): + queue = cl.CommandQueue(ctx) + + from pytential.qbx.performance import ( + PerformanceModel, estimate_calibration_params) + + perf_model = PerformanceModel() + + model_results = [] + timing_results = [] + + for lpot_source in training_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + + # Warm-up run. 
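+        # (Not timed: the first evaluation may include one-time costs such as kernel compilation.)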
+ bound_op.eval(queue, {"sigma": sigma}) + + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + + model_results.append(one(perf_S.values())) + timing_results.append(one(timing_data.values())) + + calibration_params = ( + estimate_calibration_params(model_results, timing_results)) + + return perf_model.with_calibration_params(calibration_params) + + +def test_performance_model(ctx, perf_model): + queue = cl.CommandQueue(ctx) + + for lpot_source in test_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + model_result = ( + one(perf_S.values()) + .get_predicted_times(merge_close_lists=True)) + + # Warm-up run. + bound_op.eval(queue, {"sigma": sigma}) + + temp_timing_results = [] + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + temp_timing_results.append(one(timing_data.values())) + + timing_result = {} + for param in model_result: + timing_result[param] = ( + sum(temp_timing_result[param]["process_elapsed"] + for temp_timing_result in temp_timing_results)) / RUNS + + print("=" * 20) + for stage in model_result: + print("stage: ", stage) + print("actual: ", timing_result[stage]) + print("predicted: ", model_result[stage]) + print("=" * 20) + + +def predict_performance(ctx): + model = train_performance_model(ctx) + test_performance_model(ctx, model) + + +if __name__ == "__main__": + predict_performance(cl.create_some_context(0)) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 188660f72b642ca67d0882a9e92b25e211e717fa..11ddae821c07d7d36f11a8a05e493dab18c68db3 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,7 +83,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", + _use_target_specific_qbx=False, geometry_data_inspector=None, + performance_model=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): """ @@ -202,7 +204,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind + self._use_target_specific_qbx = _use_target_specific_qbx self.geometry_data_inspector = geometry_data_inspector + self.performance_model = performance_model # /!\ *All* parameters set here must also be set by copy() below, # otherwise they will be reset to their default values behind your @@ -224,7 +228,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, + _use_target_specific_qbx=_not_provided, geometry_data_inspector=None, + performance_model=_not_provided, fmm_backend=None, debug=_not_provided, @@ -306,8 +312,16 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, + _use_target_specific_qbx=(_use_target_specific_qbx + if _use_target_specific_qbx is not _not_provided + else self._use_target_specific_qbx), geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), + performance_model=( + # None is a valid value here + performance_model + if performance_model is not _not_provided + else self.performance_model), 
fmm_backend=fmm_backend or self.fmm_backend, **kwargs) @@ -616,9 +630,35 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ internal functionality for execution - def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate, + return_timing_data): + if self.fmm_level_to_order is False: + func = self.exec_compute_potential_insn_direct + else: + func = self.exec_compute_potential_insn_fmm + + extra_args = {"return_timing_data": return_timing_data} + + return self._dispatch_compute_potential_insn( + queue, insn, bound_expr, evaluate, func, extra_args) + + def perf_model_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + if self.fmm_level_to_order is False: + raise NotImplementedError("perf modeling direct evaluations") + return self._dispatch_compute_potential_insn( + queue, insn, bound_expr, evaluate, + self.perf_model_compute_potential_insn_fmm) + + def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, + evaluate, func, extra_args=None): from pytools.obj_array import with_object_array_or_scalar + if not self._refined_for_global_qbx: + from warnings import warn + warn( + "Executing global QBX without refinement. " + "This is unlikely to work.") + def oversample_nonscalars(vec): from numbers import Number if isinstance(vec, Number): @@ -626,22 +666,14 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: return self.resampler(queue, vec) - if not self._refined_for_global_qbx: - from warnings import warn - warn( - "Executing global QBX without refinement. " - "This is unlikely to work.") - def evaluate_wrapper(expr): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - if self.fmm_level_to_order is False: - func = self.exec_compute_potential_insn_direct - else: - func = self.exec_compute_potential_insn_fmm + if extra_args is None: + extra_args = {} - return func(queue, insn, bound_expr, evaluate_wrapper) + return func(queue, insn, bound_expr, evaluate_wrapper, **extra_args) @property @memoize_method @@ -685,18 +717,19 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: raise ValueError("invalid FMM backend: %s" % self.fmm_backend) - def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate): - # {{{ build list of unique target discretizations used - + def get_target_discrs_and_qbx_sides(self, insn, bound_expr): + """Build the list of unique target discretizations used by the + provided instruction. 
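+
+        :returns: a tuple *(target_name_and_side_to_number,
+            target_discrs_and_qbx_sides)*, where the first entry maps
+            *(target_name, qbx_side)* pairs to indices into the second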
+ """ # map (name, qbx_side) to number in list - tgt_name_and_side_to_number = {} + target_name_and_side_to_number = {} # list of tuples (discr, qbx_side) target_discrs_and_qbx_sides = [] for o in insn.outputs: key = (o.target_name, o.qbx_forced_limit) - if key not in tgt_name_and_side_to_number: - tgt_name_and_side_to_number[key] = \ + if key not in target_name_and_side_to_number: + target_name_and_side_to_number[key] = \ len(target_discrs_and_qbx_sides) target_discr = bound_expr.places[o.target_name] @@ -710,13 +743,68 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): target_discrs_and_qbx_sides.append( (target_discr, qbx_forced_limit)) - target_discrs_and_qbx_sides = tuple(target_discrs_and_qbx_sides) + return target_name_and_side_to_number, tuple(target_discrs_and_qbx_sides) - # }}} + # {{{ execute fmm performance model + + def perf_model_compute_potential_insn_fmm(self, queue, insn, bound_expr, + evaluate): + target_name_and_side_to_number, target_discrs_and_qbx_sides = ( + self.get_target_discrs_and_qbx_sides(insn, bound_expr)) geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides) - # geo_data.plot() + if self.performance_model is None: + from pytential.qbx.performance import PerformanceModel + performance_model = PerformanceModel() + else: + performance_model = self.performance_model + + kernel_args = {} + for arg_name, arg_expr in six.iteritems(insn.kernel_arguments): + kernel_args[arg_name] = evaluate(arg_expr) + + performance_model_result = ( + performance_model(geo_data, insn.base_kernel, kernel_args)) + + # {{{ construct dummy outputs + + strengths = (evaluate(insn.density).with_queue(queue) + * self.weights_and_area_elements()) + out_kernels = tuple(knl for knl in insn.kernels) + fmm_kernel = self.get_fmm_kernel(out_kernels) + output_and_expansion_dtype = ( + self.get_fmm_output_and_expansion_dtype(fmm_kernel, strengths)) + + result = [] + + for o in insn.outputs: + target_side_number = target_name_and_side_to_number[ + o.target_name, o.qbx_forced_limit] + start, end = geo_data.target_info().target_discr_starts[ + target_side_number:target_side_number+2] + + output_array = cl.array.zeros( + queue, + end - start, + dtype=output_and_expansion_dtype) + + result.append((o.name, output_array)) + + return result, performance_model_result + + # }}} + + # }}} + + # {{{ execute fmm + + def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate, + return_timing_data): + target_name_and_side_to_number, target_discrs_and_qbx_sides = ( + self.get_target_discrs_and_qbx_sides(insn, bound_expr)) + + geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides) # FIXME Exert more positive control over geo_data attribute lifetimes using # geo_data..clear_cache(geo_data). 
@@ -729,7 +817,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): strengths = (evaluate(insn.density).with_queue(queue) * self.weights_and_area_elements()) - out_kernels = tuple(knl for knl in insn.kernels) fmm_kernel = self.get_fmm_kernel(out_kernels) output_and_expansion_dtype = ( @@ -745,14 +832,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.qbx_order, self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, - kernel_extra_kwargs=kernel_extra_kwargs) + kernel_extra_kwargs=kernel_extra_kwargs, + _use_target_specific_qbx=self._use_target_specific_qbx) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) == target_state.FAILED).any().get(): raise RuntimeError("geometry has failed targets") - # {{{ performance data hook + # {{{ geometry data inspection hook if self.geometry_data_inspector is not None: perform_fmm = self.geometry_data_inspector(insn, bound_expr, geo_data) @@ -764,23 +852,23 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX from pytential.qbx.fmm import drive_fmm - all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) + timing_data = {} if return_timing_data else None + all_potentials_on_every_target = drive_fmm(wrangler, strengths, timing_data) # }}} result = [] for o in insn.outputs: - tgt_side_number = tgt_name_and_side_to_number[ + target_side_number = target_name_and_side_to_number[ o.target_name, o.qbx_forced_limit] - tgt_slice = slice(*geo_data.target_info().target_discr_starts[ - tgt_side_number:tgt_side_number+2]) + target_slice = slice(*geo_data.target_info().target_discr_starts[ + target_side_number:target_side_number+2]) - result.append( - (o.name, - all_potentials_on_every_tgt[o.kernel_index][tgt_slice])) + result.append((o.name, + all_potentials_on_every_target[o.kernel_index][target_slice])) - return result + return result, timing_data # }}} @@ -838,7 +926,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): *count = item; """) - def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate, + return_timing_data): + if return_timing_data: + from pytential.source import UnableToCollectTimingData + from warnings import warn + warn( + "Timing data collection not supported.", + category=UnableToCollectTimingData) + lpot_applier = self.get_lpot_applier(insn.kernels) p2p = None lpot_applier_on_tgt_subset = None @@ -941,7 +1037,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result + timing_data = {} + return result, timing_data # }}} diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 46d7ae880384f7dfbe44c21b7aea836ffdd4f6d3..2e6f476a80c0c6452c37d44c7813dc2a225c7905 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -91,12 +91,14 @@ class QBXSumpyExpansionWranglerCodeContainer(SumpyExpansionWranglerCodeContainer def get_wrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs={}, - kernel_extra_kwargs=None): + kernel_extra_kwargs=None, + _use_target_specific_qbx=False): return QBXExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs) + kernel_extra_kwargs, + _use_target_specific_qbx) class QBXExpansionWrangler(SumpyExpansionWrangler): @@ -120,7 +122,11 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), def __init__(self, 
code_container, queue, geo_data, dtype, qbx_order, fmm_level_to_order, - source_extra_kwargs, kernel_extra_kwargs): + source_extra_kwargs, kernel_extra_kwargs, + _use_target_specific_qbx=False): + if _use_target_specific_qbx: + raise ValueError("TSQBX is not implemented in sumpy") + SumpyExpansionWrangler.__init__(self, code_container, queue, geo_data.tree(), dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs) @@ -367,6 +373,10 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), return (pot, SumpyTimingFuture(self.queue, [evt])) + @log_process(logger) + def eval_target_specific_qbx_locals(self, src_weights): + raise NotImplementedError() + # }}} # }}} @@ -374,7 +384,8 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level -def drive_fmm(expansion_wrangler, src_weights, timing_data=None): +def drive_fmm(expansion_wrangler, src_weights, timing_data=None, + traversal=None): """Top-level driver routine for the QBX fast multipole calculation. :arg geo_data: A :class:`QBXFMMGeometryData` instance. @@ -392,8 +403,14 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): wrangler = expansion_wrangler geo_data = wrangler.geo_data - traversal = geo_data.traversal() + + use_tsqbx = geo_data.lpot_source._use_target_specific_qbx + + if traversal is None: + traversal = geo_data.traversal() + tree = traversal.tree + recorder = TimingRecorder() # Interface guidelines: Attributes of the tree are assumed to be known @@ -513,9 +530,12 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): # {{{ wrangle qbx expansions - qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights) + if not use_tsqbx: + qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights) - recorder.add("form_global_qbx_locals", timing_future) + recorder.add("form_global_qbx_locals", timing_future) + else: + qbx_expansions = wrangler.qbx_local_expansion_zeros() local_result, timing_future = ( wrangler.translate_box_multipoles_to_qbx_local(mpole_exps)) @@ -535,6 +555,14 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): recorder.add("eval_qbx_expansions", timing_future) + if use_tsqbx: + tsqbx_result, timing_future = ( + wrangler.eval_target_specific_qbx_locals(src_weights)) + + recorder.add("eval_target_specific_qbx_locals", timing_future) + + qbx_potentials = qbx_potentials + tsqbx_result + # }}} # {{{ reorder potentials @@ -563,402 +591,6 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): if timing_data is not None: timing_data.update(recorder.summarize()) - - return result - -# }}} - - -# {{{ performance data - -def assemble_performance_data(geo_data, uses_pde_expansions, - translation_source_power=None, translation_target_power=None, - translation_max_power=None, - summarize_parallel=None, merge_close_lists=True): - """ - :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM - uses translation operators that make use of the knowledge that the - potential satisfies a PDE. - :arg summarize_parallel: a function of two arguments - *(parallel_array, sym_multipliers)* used to process an array of - workloads of 'parallelizable units'. By default, all workloads are - summed into one number encompassing the total workload. - :arg merge_close_lists: A :class:`bool` indicating whether or not all - boxes requiring direct evaluation should be merged into a single - interaction list. 
If *False*, *part_direct* and *p2qbxl* will be - suffixed with the originating list as follows: - - * *_neighbor* (List 1) - * *_sep_smaller* (List 3 close) - * *_sep_bigger* (List 4 close). - """ - - # FIXME: This should suport target filtering. - - if summarize_parallel is None: - def summarize_parallel(parallel_array, sym_multipliers): - return np.sum(parallel_array) * sym_multipliers - - from collections import OrderedDict - result = OrderedDict() - - from pymbolic import var - p_fmm = var("p_fmm") - p_qbx = var("p_qbx") - - nqbtl = geo_data.non_qbx_box_target_lists() - - with cl.CommandQueue(geo_data.cl_context) as queue: - tree = geo_data.tree().get(queue=queue) - traversal = geo_data.traversal(merge_close_lists).get(queue=queue) - box_target_counts_nonchild = ( - nqbtl.box_target_counts_nonchild.get(queue=queue)) - - d = tree.dimensions - if uses_pde_expansions: - ncoeffs_fmm = p_fmm ** (d-1) - ncoeffs_qbx = p_qbx ** (d-1) - - if d == 2: - default_translation_source_power = 1 - default_translation_target_power = 1 - default_translation_max_power = 0 - - elif d == 3: - # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. - default_translation_source_power = 0 - default_translation_target_power = 0 - default_translation_max_power = 3 - - else: - raise ValueError("Don't know how to estimate expansion complexities " - "for dimension %d" % d) - - else: - ncoeffs_fmm = p_fmm ** d - ncoeffs_qbx = p_qbx ** d - default_translation_source_power = d - default_translation_target_power = d - - if translation_source_power is None: - translation_source_power = default_translation_source_power - if translation_target_power is None: - translation_target_power = default_translation_target_power - if translation_max_power is None: - translation_max_power = default_translation_max_power - - def xlat_cost(p_source, p_target): - from pymbolic.primitives import Max - return ( - p_source ** translation_source_power - * p_target ** translation_target_power - * Max((p_source, p_target)) ** translation_max_power - ) - - result.update( - nlevels=tree.nlevels, - nboxes=tree.nboxes, - nsources=tree.nsources, - ntargets=tree.ntargets) - - # {{{ construct local multipoles - - result["form_mp"] = tree.nsources*ncoeffs_fmm - - # }}} - - # {{{ propagate multipoles upward - - result["prop_upward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm) - - # }}} - - # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) - - def process_direct(): - # box -> nsources * ntargets - npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] - - npart_direct_list1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list1_srcs += nsources - - npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs - - if merge_close_lists: - continue - - npart_direct_list3_srcs = 0 - - # Could be None, if not using targets with extent. 
- if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list3_srcs += nsources - - npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs - - npart_direct_list4_srcs = 0 - - # Could be None, if not using targets with extent. - if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list4_srcs += nsources - - npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs - - if merge_close_lists: - result["part_direct"] = summarize_parallel(npart_direct_list1, 1) - else: - result["part_direct_neighbor"] = ( - summarize_parallel(npart_direct_list1, 1)) - result["part_direct_sep_smaller"] = ( - summarize_parallel(npart_direct_list3, 1)) - result["part_direct_sep_bigger"] = ( - summarize_parallel(npart_direct_list4, 1)) - - process_direct() - - # }}} - - # {{{ translate separated siblings' ("list 2") mpoles to local - - def process_list2(): - nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): - start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] - - nm2l[itgt_box] += end-start - - result["m2l"] = summarize_parallel(nm2l, xlat_cost(p_fmm, p_fmm)) - - process_list2() - - # }}} - - # {{{ evaluate sep. smaller mpoles ("list 3") at particles - - def process_list3(): - nmp_eval = np.zeros( - (tree.nlevels, len(traversal.target_boxes)), - dtype=np.intp) - - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) - - for ilevel, sep_smaller_list in enumerate( - traversal.from_sep_smaller_by_level): - for itgt_box, tgt_ibox in enumerate( - traversal.target_boxes_sep_smaller_by_source_level[ilevel]): - ntargets = box_target_counts_nonchild[tgt_ibox] - start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] - nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = ( - ntargets * (end-start) - ) - - result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm) - - process_list3() - - # }}} - - # {{{ form locals for separated bigger source boxes ("list 4") - - def process_list4(): - nform_local = np.zeros( - len(traversal.target_or_target_parent_boxes), - dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): - start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] - - nform_local_box = 0 - for src_ibox in traversal.from_sep_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - nform_local_box += nsources - - nform_local[itgt_box] = nform_local_box - - result["form_local"] = summarize_parallel(nform_local, ncoeffs_fmm) - - process_list4() - - # }}} - - # {{{ propagate local_exps downward - - result["prop_downward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm) - - # }}} - - # {{{ evaluate locals - - non_qbx_box_targets = geo_data.non_qbx_box_target_lists() - result["eval_part"] = non_qbx_box_targets.nfiltered_targets * ncoeffs_fmm - - # }}} - - # {{{ form global qbx locals - - global_qbx_centers = geo_data.global_qbx_centers() - - # If merge_close_lists is False above, then this builds another traversal - # (which is OK). 
- qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - center_to_targets_starts = geo_data.center_to_tree_targets().starts - qbx_center_to_target_box_source_level = np.empty( - (tree.nlevels,), dtype=object - ) - - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - geo_data.qbx_center_to_target_box_source_level(src_level) - ) - - with cl.CommandQueue(geo_data.cl_context) as queue: - global_qbx_centers = global_qbx_centers.get( - queue=queue) - qbx_center_to_target_box = qbx_center_to_target_box.get( - queue=queue) - center_to_targets_starts = center_to_targets_starts.get( - queue=queue) - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - qbx_center_to_target_box_source_level[src_level].get(queue=queue) - ) - - def process_form_qbxl(): - ncenters = geo_data.ncenters - - result["ncenters"] = ncenters - - # center -> nsources - np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - itgt_box = qbx_center_to_target_box[tgt_icenter] - - np2qbxl_list1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list1_srcs += nsources - - np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs - - if merge_close_lists: - continue - - np2qbxl_list3_srcs = 0 - - # Could be None, if not using targets with extent. - if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list3_srcs += nsources - - np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs - - np2qbxl_list4_srcs = 0 - - # Could be None, if not using targets with extent. 
- if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list4_srcs += nsources - - np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs - - if merge_close_lists: - result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx) - else: - result["p2qbxl_neighbor"] = ( - summarize_parallel(np2qbxl_list1, ncoeffs_qbx)) - result["p2qbxl_sep_smaller"] = ( - summarize_parallel(np2qbxl_list3, ncoeffs_qbx)) - result["p2qbxl_sep_bigger"] = ( - summarize_parallel(np2qbxl_list4, ncoeffs_qbx)) - - process_form_qbxl() - - # }}} - - # {{{ translate from list 3 multipoles to qbx local expansions - - def process_m2qbxl(): - nm2qbxl = np.zeros( - (tree.nlevels, len(global_qbx_centers)), - dtype=np.intp) - - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) - - for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - icontaining_tgt_box = qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] - - if icontaining_tgt_box == -1: - continue - - start, stop = ( - ssn.starts[icontaining_tgt_box], - ssn.starts[icontaining_tgt_box+1]) - - nm2qbxl[isrc_level, itgt_center] += stop-start - - result["m2qbxl"] = summarize_parallel(nm2qbxl, xlat_cost(p_fmm, p_qbx)) - - process_m2qbxl() - - # }}} - - # {{{ translate from box local expansions to qbx local expansions - - result["l2qbxl"] = geo_data.ncenters * xlat_cost(p_fmm, p_qbx) - - # }}} - - # {{{ evaluate qbx local expansions - - def process_eval_qbxl(): - nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp) - - for isrc_center, src_icenter in enumerate(global_qbx_centers): - start, end = center_to_targets_starts[src_icenter:src_icenter+2] - nqbx_eval[isrc_center] += end-start - - result["qbxl2p"] = summarize_parallel(nqbx_eval, ncoeffs_qbx) - - process_eval_qbxl() - - # }}} - return result # }}} diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 2d21a3d2ac6865d0d8d3c3d3f41c388b0fe160b0..beff36d0dfb50331c4345e59852a54af2fec35d0 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -23,11 +23,14 @@ THE SOFTWARE. 
""" import numpy as np -from pytools import memoize_method, Record +from pytools import Record, memoize_method import pyopencl as cl # noqa import pyopencl.array # noqa: F401 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler -from sumpy.kernel import LaplaceKernel, HelmholtzKernel +from sumpy.kernel import ( + LaplaceKernel, HelmholtzKernel, AxisTargetDerivative, + DirectionalSourceDerivative) +import pytential.qbx.target_specific as ts from boxtree.tools import return_timing_data @@ -55,70 +58,14 @@ class QBXFMMLibExpansionWranglerCodeContainer(object): def get_wrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs={}, - kernel_extra_kwargs=None): + kernel_extra_kwargs=None, + _use_target_specific_qbx=False): return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs) - -# }}} - - -# {{{ host geo data wrapper - -class ToHostTransferredGeoDataWrapper(object): - def __init__(self, queue, geo_data): - self.queue = queue - self.geo_data = geo_data - - @memoize_method - def tree(self): - return self.traversal().tree - - @memoize_method - def traversal(self): - return self.geo_data.traversal().get(queue=self.queue) - - @property - def ncenters(self): - return self.geo_data.ncenters - - @memoize_method - def centers(self): - return np.array([ - ci.get(queue=self.queue) - for ci in self.geo_data.centers()]) - - @memoize_method - def expansion_radii(self): - return self.geo_data.expansion_radii().get(queue=self.queue) - - @memoize_method - def global_qbx_centers(self): - return self.geo_data.global_qbx_centers().get(queue=self.queue) - - @memoize_method - def qbx_center_to_target_box(self): - return self.geo_data.qbx_center_to_target_box().get(queue=self.queue) - - @memoize_method - def qbx_center_to_target_box_source_level(self, source_level): - return self.geo_data.qbx_center_to_target_box_source_level( - source_level).get(queue=self.queue) - - @memoize_method - def non_qbx_box_target_lists(self): - return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue) - - @memoize_method - def center_to_tree_targets(self): - return self.geo_data.center_to_tree_targets().get(queue=self.queue) - - @memoize_method - def all_targets(self): - """All (not just non-QBX) targets packaged into a single array.""" - return np.array(list(self.tree().targets)) + kernel_extra_kwargs, + _use_target_specific_qbx) # }}} @@ -129,73 +76,80 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): def __init__(self, code, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs): + kernel_extra_kwargs, + _use_target_specific_qbx=False): self.code = code self.queue = queue + self._use_target_specific_qbx = _use_target_specific_qbx # FMMLib is CPU-only. This wrapper gets the geometry out of # OpenCL-land. 
+ from pytential.qbx.utils import ToHostTransferredGeoDataWrapper self.geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) self.qbx_order = qbx_order # {{{ digest out_kernels - from sumpy.kernel import AxisTargetDerivative, DirectionalSourceDerivative - - k_names = [] - source_deriv_names = [] - - def is_supported_helmknl(knl): - if isinstance(knl, DirectionalSourceDerivative): - source_deriv_name = knl.dir_vec_name - knl = knl.inner_kernel - else: - source_deriv_name = None - - if isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3]: - k_names.append(knl.helmholtz_k_name) - source_deriv_names.append(source_deriv_name) - return True - elif isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]: - k_names.append(None) - source_deriv_names.append(source_deriv_name) - return True - - return False - ifgrad = False outputs = [] + source_deriv_names = [] + k_names = [] + for out_knl in self.code.out_kernels: - if is_supported_helmknl(out_knl): + if ( + _use_target_specific_qbx + and not self.is_supported_helmknl_for_tsqbx(out_knl)): + raise ValueError("not all kernels passed support TSQBX") + + if self.is_supported_helmknl(out_knl): outputs.append(()) + no_target_deriv_knl = out_knl + elif (isinstance(out_knl, AxisTargetDerivative) - and is_supported_helmknl(out_knl.inner_kernel)): + and self.is_supported_helmknl(out_knl.inner_kernel)): outputs.append((out_knl.axis,)) ifgrad = True + no_target_deriv_knl = out_knl.inner_kernel + else: - raise NotImplementedError( + raise ValueError( "only the 2/3D Laplace and Helmholtz kernel " "and their derivatives are supported") + source_deriv_names.append(no_target_deriv_knl.dir_vec_name + if isinstance(no_target_deriv_knl, DirectionalSourceDerivative) + else None) + + base_knl = out_knl.get_base_kernel() + k_names.append(base_knl.helmholtz_k_name + if isinstance(base_knl, HelmholtzKernel) + else None) + + self.outputs = outputs + from pytools import is_single_valued + if not is_single_valued(source_deriv_names): raise ValueError("not all kernels passed are the same in " "whether they represent a source derivative") source_deriv_name = source_deriv_names[0] - self.outputs = outputs - # }}} + if not is_single_valued(k_names): + raise ValueError("not all kernels passed have the same " + "Helmholtz parameter") + + k_name = k_names[0] - from pytools import single_valued - k_name = single_valued(k_names) if k_name is None: helmholtz_k = 0 else: helmholtz_k = kernel_extra_kwargs[k_name] + # }}} + dipole_vec = None if source_deriv_name is not None: dipole_vec = np.array([ @@ -204,7 +158,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): order="F") def inner_fmm_level_to_nterms(tree, level): - from sumpy.kernel import LaplaceKernel, HelmholtzKernel if helmholtz_k == 0: return fmm_level_to_order( LaplaceKernel(tree.dimensions), @@ -225,6 +178,23 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad=ifgrad) + @staticmethod + def is_supported_helmknl_for_tsqbx(knl): + # Supports at most one derivative. 
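+        # (e.g. AxisTargetDerivative(LaplaceKernel(3)) is accepted, but nesting a source derivative and a target derivative is not.)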
+ if isinstance(knl, (DirectionalSourceDerivative, AxisTargetDerivative)): + knl = knl.inner_kernel + + return (isinstance(knl, (LaplaceKernel, HelmholtzKernel)) + and knl.dim == 3) + + @staticmethod + def is_supported_helmknl(knl): + if isinstance(knl, DirectionalSourceDerivative): + knl = knl.inner_kernel + + return (isinstance(knl, (LaplaceKernel, HelmholtzKernel)) + and knl.dim in (2, 3)) + # {{{ data vector helpers def output_zeros(self): @@ -276,6 +246,13 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): raise ValueError("element '%s' of outputs array not " "understood" % out) + @memoize_method + def _get_single_centers_array(self): + return np.array([ + self.geo_data.centers()[idim] + for idim in range(self.dim) + ], order="F") + # }}} # {{{ override target lists to only hit non-QBX targets @@ -304,6 +281,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) @return_timing_data def form_global_qbx_locals(self, src_weights): + if self._use_target_specific_qbx: + return self.qbx_local_expansion_zeros() + geo_data = self.geo_data trav = geo_data.traversal() @@ -572,7 +552,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): taeval = self.get_expn_eval_routine("ta") - for isrc_center, src_icenter in enumerate(global_qbx_centers): + for src_icenter in global_qbx_centers: for icenter_tgt in range( ctt.starts[src_icenter], ctt.starts[src_icenter+1]): @@ -592,6 +572,60 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): return output + @log_process(logger) + @return_timing_data + def eval_target_specific_qbx_locals(self, src_weights): + if not self._use_target_specific_qbx: + return self.full_output_zeros() + + geo_data = self.geo_data + trav = geo_data.traversal() + + ctt = geo_data.center_to_tree_targets() + + src_weights = src_weights.astype(np.complex128) + + ifcharge = self.dipole_vec is None + ifdipole = self.dipole_vec is not None + + ifpot = any(not output for output in self.outputs) + ifgrad = self.ifgrad + + # Create temporary output arrays for potential / gradient. 
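+        # (Dense over all tree targets; added onto the per-kernel outputs below via add_potgrad_onto_output.)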
+ pot = np.zeros(self.tree.ntargets, np.complex) if ifpot else None + grad = ( + np.zeros((self.dim, self.tree.ntargets), np.complex) + if ifgrad else None) + + ts.eval_target_specific_qbx_locals( + ifpot=ifpot, + ifgrad=ifgrad, + ifcharge=ifcharge, + ifdipole=ifdipole, + order=self.qbx_order, + sources=self._get_single_sources_array(), + targets=geo_data.all_targets(), + centers=self._get_single_centers_array(), + qbx_centers=geo_data.global_qbx_centers(), + qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), + center_to_target_starts=ctt.starts, + center_to_target_lists=ctt.lists, + source_box_starts=trav.neighbor_source_boxes_starts, + source_box_lists=trav.neighbor_source_boxes_lists, + box_source_starts=self.tree.box_source_starts, + box_source_counts_nonchild=self.tree.box_source_counts_nonchild, + helmholtz_k=self.kernel_kwargs.get("zk", 0), + charge=src_weights, + dipstr=src_weights, + dipvec=self.dipole_vec, + pot=pot, + grad=grad) + + output = self.full_output_zeros() + self.add_potgrad_onto_output(output, slice(None), pot, grad) + + return output + def finalize_potentials(self, potential): potential = super(QBXFMMLibExpansionWrangler, self).finalize_potentials( potential) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py new file mode 100644 index 0000000000000000000000000000000000000000..f3c1ed649304e781fe74918e9d5af02a1da52e02 --- /dev/null +++ b/pytential/qbx/performance.py @@ -0,0 +1,926 @@ +from __future__ import division, absolute_import + +__copyright__ = """ +Copyright (C) 2013 Andreas Kloeckner +Copyright (C) 2018 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from collections import OrderedDict + +import logging + +import numpy as np +import pyopencl as cl +from six.moves import range +import sympy as sp + +from pytools import log_process +from pymbolic import var + +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. autoclass:: PerformanceModel +.. autoclass:: ParametrizedCosts + +.. autoclass:: TranslationCostModel + +.. autofunction:: pde_aware_translation_cost_model +.. autofunction:: taylor_translation_cost_model + +.. 
autofunction:: estimate_calibration_params +""" + + +# {{{ translation cost model + +class TranslationCostModel(object): + """Provides modeled costs for individual translations or evaluations.""" + + def __init__(self, ncoeffs_qbx, ncoeffs_fmm_by_level, uses_point_and_shoot): + self.ncoeffs_qbx = ncoeffs_qbx + self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level + self.uses_point_and_shoot = uses_point_and_shoot + + @staticmethod + def direct(): + return var("c_p2p") + + def p2qbxl(self): + return var("c_p2qbxl") * self.ncoeffs_qbx + + def p2p_tsqbx(self): + # This term should be linear in the QBX order, which is the + # square root of the number of QBX coefficients. + return var("c_p2p_tsqbx") * self.ncoeffs_qbx ** (1/2) + + def qbxl2p(self): + return var("c_qbxl2p") * self.ncoeffs_qbx + + def p2l(self, level): + return var("c_p2l") * self.ncoeffs_fmm_by_level[level] + + def l2p(self, level): + return var("c_l2p") * self.ncoeffs_fmm_by_level[level] + + def p2m(self, level): + return var("c_p2m") * self.ncoeffs_fmm_by_level[level] + + def m2p(self, level): + return var("c_m2p") * self.ncoeffs_fmm_by_level[level] + + def m2m(self, src_level, tgt_level): + return var("c_m2m") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def l2l(self, src_level, tgt_level): + return var("c_l2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def m2l(self, src_level, tgt_level): + return var("c_m2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def m2qbxl(self, level): + return var("c_m2qbxl") * self.e2e_cost( + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) + + def l2qbxl(self, level): + return var("c_l2qbxl") * self.e2e_cost( + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) + + def e2e_cost(self, nsource_coeffs, ntarget_coeffs): + if self.uses_point_and_shoot: + return ( + # Rotate the coordinate system to be z axis aligned. + nsource_coeffs ** (3 / 2) + # Translate the expansion along the z axis. + + nsource_coeffs ** (1 / 2) * ntarget_coeffs + # Rotate the coordinate system back. + + ntarget_coeffs ** (3 / 2)) + + return nsource_coeffs * ntarget_coeffs + +# }}} + + +# {{{ translation cost model factories + +def pde_aware_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation operators that make use of the + knowledge that the potential satisfies a PDE. + """ + p_qbx = var("p_qbx") + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + + uses_point_and_shoot = False + + ncoeffs_fmm = (p_fmm + 1) ** (dim - 1) + ncoeffs_qbx = (p_qbx + 1) ** (dim - 1) + + if dim == 3: + uses_point_and_shoot = True + + return TranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=uses_point_and_shoot) + + +def taylor_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation based on Taylor expansions + in Cartesian coordinates. + """ + p_qbx = var("p_qbx") + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + + ncoeffs_fmm = (p_fmm + 1) ** dim + ncoeffs_qbx = (p_qbx + 1) ** dim + + return TranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=False) + +# }}} + + +# {{{ parameterized costs returned by performance model + +class ParametrizedCosts(object): + """A container for data returned by the performance model. 
+ + This holds both symbolic costs as well as parameter values. To obtain a + prediction of the running time, use :meth:`get_predicted_times`. + + .. attribute:: raw_costs + + A dictionary mapping algorithmic stage names to symbolic costs. + + .. attribute:: params + + A dictionary mapping names of symbolic parameters to values. Parameters + appear in *raw_costs* and may include values such as QBX or FMM order + as well as calibration constants. + + .. automethod:: copy + .. automethod:: with_params + .. automethod:: get_predicted_times + """ + + def __init__(self, raw_costs, params): + self.raw_costs = OrderedDict(raw_costs) + self.params = params + + def with_params(self, params): + """Return a copy of *self* with parameters updated to include *params*.""" + new_params = self.params.copy() + new_params.update(params) + return type(self)( + raw_costs=self.raw_costs.copy(), + params=new_params) + + def copy(self): + return self.with_params({}) + + def __str__(self): + return "".join([ + type(self).__name__, + "(raw_costs=", + str(self.raw_costs), + ", params=", + str(self.params), + ")"]) + + def __repr__(self): + return "".join([ + type(self).__name__, + "(raw_costs=", + repr(self.raw_costs), + ", params=", + repr(self.params), + ")"]) + + def get_predicted_times(self, merge_close_lists=False): + """Return a dictionary mapping stage names to predicted time in seconds. + + :arg merge_close_lists: If *True*, the returned estimate combines + the cost of "close" lists (Lists 1, 3 close, and 4 close). If + *False*, the time of each "close" list is reported separately. + """ + from pymbolic import evaluate + from functools import partial + + get_time = partial(evaluate, context=self.params) + + result = OrderedDict() + + for name, val in self.raw_costs.items(): + if merge_close_lists: + for suffix in ("_list1", "_list3", "_list4"): + if name.endswith(suffix): + name = name[:-len(suffix)] + break + + result[name] = get_time(val) + result.get(name, 0) + + return result + +# }}} + + +# {{{ performance model + +class PerformanceModel(object): + """ + .. automethod:: with_calibration_params + .. automethod:: __call__ + + The performance model relies on a translation cost model. See + :class:`TranslationCostModel` for the translation cost model interface. + """ + + def __init__(self, + translation_cost_model_factory=pde_aware_translation_cost_model, + calibration_params=None): + """ + :arg translation_cost_model_factory: A callable which, given arguments + (*dim*, *nlevels*), returns a translation cost model. 
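+        :arg calibration_params: A dictionary mapping names of calibration
+            constants to values, e.g. as obtained from
+            *estimate_calibration_params*.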
+ """ + self.translation_cost_model_factory = translation_cost_model_factory + if calibration_params is None: + calibration_params = dict() + self.calibration_params = calibration_params + + def with_calibration_params(self, calibration_params): + """Return a copy of *self* with a new set of calibration parameters.""" + return type(self)( + translation_cost_model_factory=self.translation_cost_model_factory, + calibration_params=calibration_params) + + # {{{ form multipoles + + def process_form_multipoles(self, xlat_cost, traversal, tree): + result = 0 + + for level in range(tree.nlevels): + src_count = 0 + start, stop = traversal.level_start_source_box_nrs[level:level + 2] + for src_ibox in traversal.source_boxes[start:stop]: + nsrcs = tree.box_source_counts_nonchild[src_ibox] + src_count += nsrcs + result += src_count * xlat_cost.p2m(level) + + return dict(form_multipoles=result) + + # }}} + + # {{{ propagate multipoles upward + + def process_coarsen_multipoles(self, xlat_cost, traversal, tree): + result = 0 + + # nlevels-1 is the last valid level index + # nlevels-2 is the last valid level that could have children + # + # 3 is the last relevant source_level. + # 2 is the last relevant target_level. + # (because no level 1 box will be well-separated from another) + for source_level in range(tree.nlevels-1, 2, -1): + target_level = source_level - 1 + cost = xlat_cost.m2m(source_level, target_level) + + nmultipoles = 0 + start, stop = traversal.level_start_source_parent_box_nrs[ + target_level:target_level+2] + for ibox in traversal.source_parent_boxes[start:stop]: + for child in tree.box_child_ids[:, ibox]: + if child: + nmultipoles += 1 + + result += cost * nmultipoles + + return dict(coarsen_multipoles=result) + + # }}} + + # {{{ collect direct interaction data + + @staticmethod + def _collect_direction_interaction_data(traversal, tree): + ntarget_boxes = len(traversal.target_boxes) + + # target box index -> nsources + nlist1_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + nlist3close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + nlist4close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + + for itgt_box in range(ntarget_boxes): + nlist1_srcs = 0 + start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nlist1_srcs += tree.box_source_counts_nonchild[src_ibox] + + nlist1_srcs_by_itgt_box[itgt_box] = nlist1_srcs + + nlist3close_srcs = 0 + # Could be None, if not using targets with extent. + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nlist3close_srcs += tree.box_source_counts_nonchild[src_ibox] + + nlist3close_srcs_by_itgt_box[itgt_box] = nlist3close_srcs + + nlist4close_srcs = 0 + # Could be None, if not using targets with extent. 
+ if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nlist4close_srcs += tree.box_source_counts_nonchild[src_ibox] + + nlist4close_srcs_by_itgt_box[itgt_box] = nlist4close_srcs + + result = {} + result["nlist1_srcs_by_itgt_box"] = nlist1_srcs_by_itgt_box + result["nlist3close_srcs_by_itgt_box"] = nlist3close_srcs_by_itgt_box + result["nlist4close_srcs_by_itgt_box"] = nlist4close_srcs_by_itgt_box + + return result + + # }}} + + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + def process_direct(self, xlat_cost, traversal, direct_interaction_data, + box_target_counts_nonchild): + nlist1_srcs_by_itgt_box = ( + direct_interaction_data["nlist1_srcs_by_itgt_box"]) + nlist3close_srcs_by_itgt_box = ( + direct_interaction_data["nlist3close_srcs_by_itgt_box"]) + nlist4close_srcs_by_itgt_box = ( + direct_interaction_data["nlist4close_srcs_by_itgt_box"]) + + # list -> number of source-target interactions + npart_direct_list1 = 0 + npart_direct_list3 = 0 + npart_direct_list4 = 0 + + for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): + ntargets = box_target_counts_nonchild[tgt_ibox] + + npart_direct_list1 += ntargets * nlist1_srcs_by_itgt_box[itgt_box] + npart_direct_list3 += ntargets * nlist3close_srcs_by_itgt_box[itgt_box] + npart_direct_list4 += ntargets * nlist4close_srcs_by_itgt_box[itgt_box] + + result = {} + result["eval_direct_list1"] = npart_direct_list1 * xlat_cost.direct() + result["eval_direct_list3"] = npart_direct_list3 * xlat_cost.direct() + result["eval_direct_list4"] = npart_direct_list4 * xlat_cost.direct() + + return result + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + def process_list2(self, xlat_cost, traversal, tree): + nm2l_by_level = np.zeros(tree.nlevels, dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] + + level = tree.box_levels[tgt_ibox] + nm2l_by_level[level] += end-start + + result = sum( + cost * xlat_cost.m2l(ilevel, ilevel) + for ilevel, cost in enumerate(nm2l_by_level)) + + return dict(multipole_to_local=result) + + # }}} + + # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles + + def process_list3(self, xlat_cost, traversal, tree, box_target_counts_nonchild): + nmp_eval_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + + assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + + for ilevel, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): + for itgt_box, tgt_ibox in enumerate( + traversal.target_boxes_sep_smaller_by_source_level[ilevel]): + ntargets = box_target_counts_nonchild[tgt_ibox] + start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] + nmp_eval_by_source_level[ilevel] += ntargets * (end-start) + + result = sum( + cost * xlat_cost.m2p(ilevel) + for ilevel, cost in enumerate(nmp_eval_by_source_level)) + + return dict(eval_multipoles=result) + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + def process_list4(self, xlat_cost, traversal, tree): + nform_local_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + + for itgt_box in range(len(traversal.target_or_target_parent_boxes)): + start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.from_sep_bigger_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + level = tree.box_levels[src_ibox] + nform_local_by_source_level[level] += nsources + + result = sum( + cost * xlat_cost.p2l(ilevel) + for ilevel, cost in enumerate(nform_local_by_source_level)) + + return dict(form_locals=result) + + # }}} + + # {{{ propogate locals downward + + def process_refine_locals(self, xlat_cost, traversal, tree): + result = 0 + + for target_lev in range(1, tree.nlevels): + start, stop = traversal.level_start_target_or_target_parent_box_nrs[ + target_lev:target_lev+2] + source_lev = target_lev - 1 + result += (stop-start) * xlat_cost.l2l(source_lev, target_lev) + + return dict(refine_locals=result) + + # }}} + + # {{{ evaluate local expansions at non-qbx targets + + def process_eval_locals(self, xlat_cost, traversal, tree, nqbtl): + ntargets_by_level = np.zeros(tree.nlevels, dtype=np.intp) + + for target_lev in range(tree.nlevels): + start, stop = traversal.level_start_target_box_nrs[ + target_lev:target_lev+2] + for tgt_ibox in traversal.target_boxes[start:stop]: + ntargets_by_level[target_lev] += ( + nqbtl.box_target_counts_nonchild[tgt_ibox]) + + result = sum( + cost * xlat_cost.l2p(ilevel) + for ilevel, cost in enumerate(ntargets_by_level)) + + return dict(eval_locals=result) + + # }}} + + # {{{ collect data about direct interactions with qbx centers + + @staticmethod + def _collect_qbxl_direct_interaction_data(direct_interaction_data, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): + nlist1_srcs_by_itgt_box = ( + direct_interaction_data["nlist1_srcs_by_itgt_box"]) + nlist3close_srcs_by_itgt_box = ( + direct_interaction_data["nlist3close_srcs_by_itgt_box"]) + nlist4close_srcs_by_itgt_box = ( + direct_interaction_data["nlist4close_srcs_by_itgt_box"]) + + # center -> nsources + np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + + # center -> number of associated targets + nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] + nqbxl2p_by_center[itgt_center] = end - start + + itgt_box = 
qbx_center_to_target_box[tgt_icenter] + np2qbxl_list1_by_center[itgt_center] = ( + nlist1_srcs_by_itgt_box[itgt_box]) + np2qbxl_list3_by_center[itgt_center] = ( + nlist3close_srcs_by_itgt_box[itgt_box]) + np2qbxl_list4_by_center[itgt_center] = ( + nlist4close_srcs_by_itgt_box[itgt_box]) + + result = {} + result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center + result["np2qbxl_list3_by_center"] = np2qbxl_list3_by_center + result["np2qbxl_list4_by_center"] = np2qbxl_list4_by_center + result["nqbxl2p_by_center"] = nqbxl2p_by_center + + return result + + # }}} + + # {{{ eval target specific qbx expansions + + def process_eval_target_specific_qbxl(self, xlat_cost, direct_interaction_data, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): + + counts = self._collect_qbxl_direct_interaction_data( + direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts) + + result = {} + result["eval_target_specific_qbx_locals_list1"] = ( + sum(counts["np2qbxl_list1_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) + result["eval_target_specific_qbx_locals_list3"] = ( + sum(counts["np2qbxl_list3_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) + result["eval_target_specific_qbx_locals_list4"] = ( + sum(counts["np2qbxl_list4_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) + + return result + + # }}} + + # {{{ form global qbx locals + + def process_form_qbxl(self, xlat_cost, direct_interaction_data, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): + + counts = self._collect_qbxl_direct_interaction_data( + direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts) + + result = {} + result["form_global_qbx_locals_list1"] = ( + sum(counts["np2qbxl_list1_by_center"]) * xlat_cost.p2qbxl()) + result["form_global_qbx_locals_list3"] = ( + sum(counts["np2qbxl_list3_by_center"]) * xlat_cost.p2qbxl()) + result["form_global_qbx_locals_list4"] = ( + sum(counts["np2qbxl_list4_by_center"]) * xlat_cost.p2qbxl()) + + return result + + # }}} + + # {{{ translate from list 3 multipoles to qbx local expansions + + def process_m2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box_source_level): + nm2qbxl_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + + assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + + for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): + for tgt_icenter in global_qbx_centers: + icontaining_tgt_box = qbx_center_to_target_box_source_level[ + isrc_level][tgt_icenter] + + if icontaining_tgt_box == -1: + continue + + start, stop = ( + ssn.starts[icontaining_tgt_box], + ssn.starts[icontaining_tgt_box+1]) + + nm2qbxl_by_source_level[isrc_level] += stop-start + + result = sum( + cost * xlat_cost.m2qbxl(ilevel) + for ilevel, cost in enumerate(nm2qbxl_by_source_level)) + + return dict(translate_box_multipoles_to_qbx_local=result) + + # }}} + + # {{{ translate from box locals to qbx local expansions + + def process_l2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box): + nl2qbxl_by_level = np.zeros(tree.nlevels, dtype=np.intp) + + for tgt_icenter in global_qbx_centers: + itgt_box = qbx_center_to_target_box[tgt_icenter] + tgt_ibox = traversal.target_boxes[itgt_box] + level = tree.box_levels[tgt_ibox] + nl2qbxl_by_level[level] += 1 + + result = sum( + cost * xlat_cost.l2qbxl(ilevel) + for ilevel, cost in 
enumerate(nl2qbxl_by_level)) + + return dict(translate_box_local_to_qbx_local=result) + + # }}} + + # {{{ evaluate qbx local expansions + + def process_eval_qbxl(self, xlat_cost, global_qbx_centers, + center_to_targets_starts): + result = 0 + + for src_icenter in global_qbx_centers: + start, end = center_to_targets_starts[src_icenter:src_icenter+2] + result += (end - start) + + result *= xlat_cost.qbxl2p() + + return dict(eval_qbx_expansions=result) + + # }}} + + @log_process(logger, "model performance") + def __call__(self, geo_data, kernel, kernel_arguments): + """Analyze the given geometry and return performance data. + + :returns: An instance of :class:`ParametrizedCosts`. + """ + # FIXME: This should support target filtering. + + result = OrderedDict() + + lpot_source = geo_data.lpot_source + + use_tsqbx = lpot_source._use_target_specific_qbx + + with cl.CommandQueue(geo_data.cl_context) as queue: + tree = geo_data.tree().get(queue) + traversal = geo_data.traversal(merge_close_lists=False).get(queue) + nqbtl = geo_data.non_qbx_box_target_lists().get(queue) + + box_target_counts_nonchild = nqbtl.box_target_counts_nonchild + + params = dict( + nlevels=tree.nlevels, + nboxes=tree.nboxes, + nsources=tree.nsources, + ntargets=tree.ntargets, + ncenters=geo_data.ncenters, + p_qbx=lpot_source.qbx_order, + ) + + for ilevel in range(tree.nlevels): + params["p_fmm_lev%d" % ilevel] = ( + lpot_source.fmm_level_to_order( + kernel.get_base_kernel(), kernel_arguments, tree, ilevel)) + + params.update(self.calibration_params) + + xlat_cost = ( + self.translation_cost_model_factory(tree.dimensions, tree.nlevels)) + + # {{{ construct local multipoles + + result.update(self.process_form_multipoles(xlat_cost, traversal, tree)) + + # }}} + + # {{{ propagate multipoles upward + + result.update(self.process_coarsen_multipoles(xlat_cost, traversal, tree)) + + # }}} + + direct_interaction_data = ( + self._collect_direct_interaction_data(traversal, tree)) + + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + result.update(self.process_direct( + xlat_cost, traversal, direct_interaction_data, + box_target_counts_nonchild)) + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + result.update(self.process_list2(xlat_cost, traversal, tree)) + + # }}} + + # {{{ evaluate sep.
smaller mpoles ("list 3") at particles + + result.update(self.process_list3( + xlat_cost, traversal, tree, box_target_counts_nonchild)) + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + result.update(self.process_list4(xlat_cost, traversal, tree)) + + # }}} + + # {{{ propagate local_exps downward + + result.update(self.process_refine_locals(xlat_cost, traversal, tree)) + + # }}} + + # {{{ evaluate locals + + result.update(self.process_eval_locals(xlat_cost, traversal, tree, nqbtl)) + + # }}} + + global_qbx_centers = geo_data.global_qbx_centers() + + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + center_to_targets_starts = geo_data.center_to_tree_targets().starts + qbx_center_to_target_box_source_level = np.empty( + (tree.nlevels,), dtype=object) + + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + geo_data.qbx_center_to_target_box_source_level(src_level)) + + with cl.CommandQueue(geo_data.cl_context) as queue: + global_qbx_centers = global_qbx_centers.get( + queue=queue) + qbx_center_to_target_box = qbx_center_to_target_box.get( + queue=queue) + center_to_targets_starts = center_to_targets_starts.get( + queue=queue) + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + qbx_center_to_target_box_source_level[src_level] + .get(queue=queue)) + + # {{{ form global qbx locals or evaluate target specific qbx expansions + + if use_tsqbx: + result.update(self.process_eval_target_specific_qbxl( + xlat_cost, direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts)) + else: + result.update(self.process_form_qbxl( + xlat_cost, direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts)) + + # }}} + + # {{{ translate from list 3 multipoles to qbx local expansions + + result.update(self.process_m2qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box_source_level)) + + # }}} + + # {{{ translate from box local expansions to qbx local expansions + + result.update(self.process_l2qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box)) + + # }}} + + # {{{ evaluate qbx local expansions + + result.update(self.process_eval_qbxl( + xlat_cost, global_qbx_centers, center_to_targets_starts)) + + # }}} + + return ParametrizedCosts(result, params) + +# }}} + + +# {{{ calibrate performance model + +def _collect(expr, variables): + """Collect terms with respect to a list of variables. + + This applies :func:`sympy.simplify.collect` to the a :mod:`pymbolic` expression + with respect to the iterable of names in *variables*. + + Returns a dictionary mapping variable names to terms. 
+ """ + from pymbolic.interop.sympy import PymbolicToSympyMapper, SympyToPymbolicMapper + p2s = PymbolicToSympyMapper() + s2p = SympyToPymbolicMapper() + + from sympy.simplify import collect + sympy_variables = [sp.var(v) for v in variables] + collect_result = collect(p2s(expr), sympy_variables, evaluate=False) + + result = {} + for v in variables: + try: + result[v] = s2p(collect_result[sp.var(v)]) + except KeyError: + continue + + return result + + +_FMM_STAGE_TO_CALIBRATION_PARAMETER = { + "form_multipoles": "c_p2m", + "coarsen_multipoles": "c_m2m", + "eval_direct": "c_p2p", + "multipole_to_local": "c_m2l", + "eval_multipoles": "c_m2p", + "form_locals": "c_p2l", + "refine_locals": "c_l2l", + "eval_locals": "c_l2p", + "form_global_qbx_locals": "c_p2qbxl", + "translate_box_multipoles_to_qbx_local": "c_m2qbxl", + "translate_box_local_to_qbx_local": "c_l2qbxl", + "eval_qbx_expansions": "c_qbxl2p", + "eval_target_specific_qbx_locals": "c_p2p_tsqbx", + } + + +def estimate_calibration_params(model_results, timing_results): + """Given a set of model results and matching timing results, estimate the best + calibration parameters for the model. + """ + + params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values()) + + nresults = len(model_results) + + if nresults != len(timing_results): + raise ValueError("must have same number of model and timing results") + + uncalibrated_times = {} + actual_times = {} + + for param in params: + uncalibrated_times[param] = np.zeros(nresults) + actual_times[param] = np.zeros(nresults) + + from pymbolic import evaluate + + for i, model_result in enumerate(model_results): + context = model_result.params.copy() + for param in params: + context[param] = var(param) + + # Represents the total modeled cost, but leaves the calibration + # parameters symbolic. + total_modeled_cost = evaluate( + sum(model_result.raw_costs.values()), + context=context) + + collected_times = _collect(total_modeled_cost, params) + + for param, time in collected_times.items(): + uncalibrated_times[param][i] = time + + for i, timing_result in enumerate(timing_results): + for param, time in timing_result.items(): + calibration_param = ( + _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) + actual_times[calibration_param][i] = time["process_elapsed"] + + result = {} + + for param in params: + uncalibrated = uncalibrated_times[param] + actual = actual_times[param] + + if np.allclose(uncalibrated, 0): + result[param] = float("NaN") + continue + + result[param] = ( + actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated)) + + return result + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/qbx/target_assoc.py b/pytential/qbx/target_assoc.py index 11b70c4b2f1cee8db85fd2c00a159a28682d7fa1..5273b72b6fad7927245ede5195ae79ee7d4ea187 100644 --- a/pytential/qbx/target_assoc.py +++ b/pytential/qbx/target_assoc.py @@ -528,9 +528,9 @@ class TargetAssociationWrangler(TreeWranglerBase): return (found_target_close_to_panel == 1).all().get() @log_process(logger) - def try_find_centers(self, tree, peer_lists, lpot_source, - target_status, target_flags, target_assoc, - target_association_tolerance, debug, wait_for=None): + def find_centers(self, tree, peer_lists, lpot_source, + target_status, target_flags, target_assoc, + target_association_tolerance, debug, wait_for=None): # Round up level count--this gets included in the kernel as # a stack bound. Rounding avoids too many kernel versions. 
from pytools import div_ceil @@ -750,7 +750,7 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler, target_flags = wrangler.make_target_flags(target_discrs_and_qbx_sides) - wrangler.try_find_centers(tree, peer_lists, lpot_source, target_status, + wrangler.find_centers(tree, peer_lists, lpot_source, target_status, target_flags, target_assoc, target_association_tolerance, debug) center_not_found = ( diff --git a/pytential/qbx/target_specific/__init__.py b/pytential/qbx/target_specific/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fb612799e9581ab4f37bdc66b0675f70a9617459 --- /dev/null +++ b/pytential/qbx/target_specific/__init__.py @@ -0,0 +1,24 @@ +__copyright__ = "Copyright (C) 2018 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from ._internal import * # noqa diff --git a/pytential/qbx/target_specific/_helmholtz_utils.c b/pytential/qbx/target_specific/_helmholtz_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..76c6b3888d5914da9c719f2193d664beb669f5a1 --- /dev/null +++ b/pytential/qbx/target_specific/_helmholtz_utils.c @@ -0,0 +1,516 @@ +/* This file contains routines for evaluating spherical Bessel and Hankel + functions. + + This is based on cdjseval3d.f and helmrouts3d.f from fmmlib3d, translated + with a hacked version of f2c and manually postprocessed. */ + +/* Original copyright notice: */ + +/* ********************************************************************** + +Copyright (c) 2009-2012, Leslie Greengard, Zydrunas Gimbutas +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +********************************************************************** */ + +#include <complex.h> + +/* declarations for f2c generated things */ + +typedef int integer; +typedef double doublereal; +typedef double complex doublecomplex; + +static inline double z_abs(doublecomplex *z) { + return cabs(*z); +} + +static inline void z_exp(doublecomplex *out, doublecomplex *z) { + *out = cexp(*z); +} + +static inline void z_sin(doublecomplex *out, doublecomplex *z) { + *out = csin(*z); +} + +static inline void z_cos(doublecomplex *out, doublecomplex *z) { + *out = ccos(*z); +} + +/* Start of functions borrowed from cdjseval3d.f */ + +/* Computation of spherical Bessel functions via recurrence */ + +/* ********************************************************************** */ +/* Subroutine */ int jfuns3d_(integer *ier, integer *nterms, doublecomplex * + z__, doublereal *scale, doublecomplex *fjs, integer *ifder, + doublecomplex *fjder, integer *lwfjs, integer *iscale, integer *ntop) +{ + /* Initialized data */ + + static doublereal upbound2 = 1e40; + static doublereal upbound2inv = 1e-40; + static doublereal tiny = 1e-200; + static doublereal done = 1.; + static doublereal zero = 0.; + + /* System generated locals */ + integer i__1; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer i__; + doublereal d0, d1, dd, dc1, dc2; + doublecomplex fj0, fj1, zinv, ztmp; + doublereal dcoef; + doublereal sctot; + doublecomplex zscale; + doublereal scalinv; + +/* ********************************************************************** */ + +/* PURPOSE: */ + +/* This subroutine evaluates the first NTERMS spherical Bessel */ +/* functions and if required, their derivatives. */ +/* It incorporates a scaling parameter SCALE so that */ + +/* fjs_n(z)=j_n(z)/SCALE^n */ +/* fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} */ + +/* NOTE: The scaling parameter SCALE is meant to be used when */ +/* abs(z) < 1, in which case we recommend setting */ +/* SCALE = abs(z). This prevents the fjs_n from */ +/* underflowing too rapidly. */ +/* Otherwise, set SCALE=1. */ +/* Do not set SCALE = abs(z) if z could take on the */ +/* value zero. */ +/* In an FMM, when forming an expansion from a collection of */ +/* sources, set SCALE = min( abs(k*r), 1) */ +/* where k is the Helmholtz parameter and r is the box dimension */ +/* at the relevant level.
*/ + +/* INPUT: */ + +/* nterms (integer): order of expansion of output array fjs */ +/* z (complex *16): argument of the spherical Bessel functions */ +/* scale (real *8) : scaling factor (discussed above) */ +/* ifder (integer): flag indicating whether to calculate "fjder" */ +/* 0 NO */ +/* 1 YES */ +/* lwfjs (integer): upper limit of input arrays */ +/* fjs(0:lwfjs) and iscale(0:lwfjs) */ +/* iscale (integer): integer workspace used to keep track of */ +/* internal scaling */ + +/* OUTPUT: */ + +/* ier (integer): error return code */ +/* ier=0 normal return; */ +/* ier=8 insufficient array dimension lwfjs */ +/* fjs (complex *16): array of scaled Bessel functions. */ +/* fjder (complex *16): array of derivs of scaled Bessel functions. */ +/* ntop (integer) : highest index in arrays fjs that is nonzero */ + +/* NOTE, that fjs and fjder arrays must be at least (nterms+2) */ +/* complex *16 elements long. */ + + + + +/* ... Initializing ... */ + + *ier = 0; + +/* set to asymptotic values if argument is sufficiently small */ + + if (z_abs(z__) < tiny) { + fjs[0] = done; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + fjs[i__] = zero; + } + + if (*ifder == 1) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjder[i__] = zero; + } + fjder[1] = done / (*scale * 3); + } + + return 0; + } + +/* ... Step 1: recursion up to find ntop, starting from nterms */ + + *ntop = 0; + zinv = done / *z__; + fjs[*nterms] = done; + fjs[*nterms - 1] = zero; + + i__1 = *lwfjs; + for (i__ = *nterms; i__ <= i__1; ++i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ - 1]; + fjs[i__ + 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + *ntop = i__ + 1; + break; + } + } + if (*ntop == 0) { + *ier = 8; + return 0; + } + +/* ... Step 2: Recursion back down to generate the unscaled jfuns: */ +/* if magnitude exceeds UPBOUND2, rescale and continue the */ +/* recursion (saving the order at which rescaling occurred */ +/* in array iscale. */ + + i__1 = *ntop; + for (i__ = 0; i__ <= i__1; ++i__) { + iscale[i__] = 0; + } + + fjs[*ntop] = zero; + fjs[*ntop - 1] = done; + for (i__ = *ntop - 1; i__ >= 1; --i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ + 1]; + fjs[i__ - 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + fjs[i__] *= upbound2inv; + fjs[i__ - 1] *= upbound2inv; + iscale[i__] = 1; + } +/* L2200: */ + } + +/* ... Step 3: go back up to the top and make sure that all */ +/* Bessel functions are scaled by the same factor */ +/* (i.e. the net total of times rescaling was invoked */ +/* on the way down in the previous loop). */ +/* At the same time, add scaling to fjs array. */ + + scalinv = done / *scale; + sctot = 1.; + i__1 = *ntop; + for (i__ = 1; i__ <= i__1; ++i__) { + sctot *= scalinv; + if (iscale[i__ - 1] == 1) { + sctot *= upbound2inv; + } + fjs[i__] *= sctot; + } + +/* ... Determine the normalization parameter: */ + + z_sin(&z__1, z__); + fj0 = z__1 * zinv; + z_cos(&z__1, z__); + fj1 = fj0 * zinv - z__1 * zinv; + + d0 = z_abs(&fj0); + d1 = z_abs(&fj1); + if (d1 > d0) { + zscale = fj1 / (fjs[1] * *scale); + } else { + zscale = fj0 / fjs[0]; + } + +/* ... Scale the jfuns by zscale: */ + + ztmp = zscale; + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjs[i__] *= ztmp; + } + +/* ... 
Finally, calculate the derivatives if desired: */ + + if (*ifder == 1) { + fjs[*nterms + 1] *= ztmp; + + fjder[0] = -fjs[1] * *scale; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dc1 = i__ / ((i__ << 1) + done); + dc2 = done - dc1; + dc1 *= scalinv; + dc2 *= *scale; + fjder[i__] = dc1 * fjs[i__ - 1] - dc2 * fjs[i__ + 1]; + } + } + return 0; +} /* jfuns3d_ */ + +/* Start of functions borrowed from helmrouts3d.f */ + +/* This file contains the basic subroutines for */ +/* forming and evaluating multipole (partial wave) expansions. */ + +/* Documentation is incomplete and ongoing... */ + + +/* Remarks on scaling conventions. */ + +/* 1) Hankel and Bessel functions are consistently scaled as */ +/* hvec(n)= h_n(z)*scale^(n) */ +/* jvec(n)= j_n(z)/scale^(n) */ + +/* In some earlier FMM implementations, the convention */ +/* hvec(n)= h_n(z)*scale^(n+1) */ +/* was sometimes used, leading to various obscure rescaling */ +/* steps. */ + +/* scale should be of the order of |z| if |z| < 1. Otherwise, */ +/* scale should be set to 1. */ + + +/* 2) There are many definitions of the spherical harmonics, */ +/* which differ in terms of normalization constants. We */ +/* adopt the following convention: */ + +/* For m>0, we define Y_n^m according to */ + +/* Y_n^m = \sqrt{2n+1} \sqrt{\frac{ (n-m)!}{(n+m)!}} \cdot */ +/* P_n^m(\cos \theta) e^{i m phi} */ +/* and */ + +/* Y_n^-m = dconjg( Y_n^m ) */ + +/* We omit the Condon-Shortley phase factor (-1)^m in the */ +/* definition of Y_n^m for m<0. (This is standard in several */ +/* communities.) */ + +/* We also omit the factor \sqrt{\frac{1}{4 \pi}}, so that */ +/* the Y_n^m are orthogonal on the unit sphere but not */ +/* orthonormal. (This is also standard in several communities.) */ +/* More precisely, */ + +/* \int_S Y_n^m Y_n^m d\Omega = 4 \pi. */ + +/* Using our standard definition, the addition theorem takes */ +/* the simple form */ + +/* e^{i k r}/(ikr) = */ +/* \sum_n \sum_m j_n(k|S|) Ylm*(S) h_n(k|T|) Ylm(T) */ + + +/* ----------------------------------------------------------------------- */ +/* h3d01: computes h0, h1 (first two spherical Hankel fns.) */ +/* h3dall: computes Hankel functions of all orders and scales them */ +/* ********************************************************************** */ +/* Subroutine */ int h3d01_(doublecomplex *z__, doublecomplex *h0, + doublecomplex *h1) +{ + /* Initialized data */ + + static doublecomplex eye = I; + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* System generated locals */ + doublecomplex z__1; + + /* Local variables */ + doublecomplex cd; + +/* ********************************************************************** */ + +/* Compute spherical Hankel functions of order 0 and 1 */ + +/* h0(z) = exp(i*z)/(i*z), */ +/* h1(z) = - h0' = -h0*(i-1/z) = h0*(1/z-i) */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* z : argument of Hankel functions */ +/* if abs(z)<1.0d-15, returns zero. */ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* h0 : h0(z) (spherical Hankel function of order 0). */ +/* h1 : -h0'(z) (spherical Hankel function of order 1).
*/ + +/* ----------------------------------------------------------------------- */ + + if (z_abs(z__) < thresh) { + *h0 = 0.; + *h1 = 0.; + return 0; + } + +/* Otherwise, use formula */ + + cd = eye * *z__; + z_exp(&z__1, &cd); + *h0 = z__1 / cd; + *h1 = *h0 * (done / *z__ - eye); + + return 0; +} /* h3d01_ */ + + + + +/* ********************************************************************** */ +/* Subroutine */ int h3dall_(integer *nterms, doublecomplex *z__, doublereal * + scale, doublecomplex *hvec, integer *ifder, doublecomplex *hder) +{ + /* Initialized data */ + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* Builtin functions */ + double z_abs(doublecomplex *); + + /* Local variables */ + integer i__; + integer i__1; + doublereal dtmp; + doublecomplex zinv, ztmp; + doublereal scal2; + +/* ********************************************************************** */ + +/* This subroutine computes scaled versions of the spherical Hankel */ +/* functions h_n of orders 0 to nterms. */ + +/* hvec(n)= h_n(z)*scale^(n) */ + +/* The parameter SCALE is useful when |z| < 1, in which case */ +/* it damps out the rapid growth of h_n as n increases. In such */ +/* cases, we recommend setting */ + +/* scale = |z| */ + +/* or something close. If |z| > 1, set scale = 1. */ + +/* If the flag IFDER is set to one, it also computes the */ +/* derivatives of h_n. */ + +/* hder(n)= h_n'(z)*scale^(n) */ + +/* NOTE: If |z| < 1.0d-15, the subroutine returns zero. */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* nterms : highest order of the Hankel functions to be computed. */ +/* z : argument of the Hankel functions. */ +/* scale : scaling parameter discussed above */ +/* ifder : flag indicating whether derivatives should be computed. */ +/* ifder = 1 ==> compute */ +/* ifder = 0 ==> do not compute */ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* hvec : the vector of spherical Hankel functions */ +/* hder : the derivatives of the spherical Hankel functions */ + +/* ----------------------------------------------------------------------- */ + + +/* If |z| < thresh, return zeros. */ + + if (z_abs(z__) < thresh) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + hvec[i__] = 0; + hder[i__] = 0; + } + return 0; + } + +/* Otherwise, get h_0 and h_1 analytically and the rest via */ +/* recursion. */ + + h3d01_(z__, hvec, &hvec[1]); + hvec[0] = hvec[0]; + hvec[1] *= *scale; + +/* From Abramowitz and Stegun (10.1.19) */ + +/* h_{n+1}(z)=(2n+1)/z * h_n(z) - h_{n-1}(z) */ + +/* With scaling: */ + +/* hvec(n+1)=scale*(2n+1)/z * hvec(n) -(scale**2) hvec(n-1) */ + + scal2 = *scale * *scale; + zinv = *scale / *z__; + i__1 = *nterms - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = (i__ << 1) + done; + ztmp = zinv * dtmp; + hvec[i__ + 1] = ztmp * hvec[i__] - scal2 * hvec[i__ - 1]; + } + +/* From Abramowitz and Stegun (10.1.21) */ + +/* h_{n}'(z)= h_{n-1}(z) - (n+1)/z * h_n(z) */ + +/* With scaling: */ + +/* hder(n)=scale* hvec(n-1) - (n+1)/z * hvec(n) */ + + + if (*ifder == 1) { + + hder[0] = -hvec[1] / *scale; + zinv = 1.
/ *z__; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = i__ + done; + ztmp = zinv * dtmp; + hder[i__] = *scale * hvec[i__ - 1] - ztmp * hvec[i__]; + } + } + + return 0; +} /* h3dall_ */ + diff --git a/pytential/qbx/target_specific/_helmholtz_utils.h b/pytential/qbx/target_specific/_helmholtz_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1e605a4df2c576be7aaa1d2dcb8981781f2a7d28 --- /dev/null +++ b/pytential/qbx/target_specific/_helmholtz_utils.h @@ -0,0 +1,14 @@ +#ifndef HELMHOLTZ_UTILS_H +#define HELMHOLTZ_UTILS_H + +#include <complex.h> + +extern int jfuns3d_(int *ier, int *nterms, double complex *z, + double *scale, double complex *fjs, int *ifder, + double complex *fjder, int *lwfjs, int *iscale, + int *ntop); + +extern int h3dall_(int *nterms, double complex *z, double *scale, + double complex *hvec, int *ifder, double complex *hder); + +#endif /* HELMHOLTZ_UTILS_H */ diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..272302d8cbc29254974ac45a374ffbd1f4f5a301 --- /dev/null +++ b/pytential/qbx/target_specific/_internal.h @@ -0,0 +1,10 @@ +#ifndef INTERNAL_H +#define INTERNAL_H + +// Temporary buffer size for holding e.g. Legendre polynomial values +#define BUFSIZE 64 + +// Padding for false sharing prevention +#define PADDING 65 + +#endif diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx new file mode 100644 index 0000000000000000000000000000000000000000..1175e37d9db2c0e9ea48aa1955d924425d2672f7 --- /dev/null +++ b/pytential/qbx/target_specific/_internal.pyx @@ -0,0 +1,764 @@ +#!python +#cython: warn.unused=True, warn.unused_arg=True, warn.unreachable=True, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True, language_level=3 + +import numpy as np +import cython +import cython.parallel + +from libc.math cimport sqrt +from libc.stdio cimport printf, fprintf, stderr +from libc.stdlib cimport abort + +cimport openmp + + +cdef extern from "complex.h" nogil: + double cabs(double complex) + + +cdef extern from "_helmholtz_utils.h" nogil: + int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale, + double complex *fjs, int *ifder, double complex *fjder, + int *lwfjs, int *iscale, int *ntop); + int h3dall_(int *nterms, double complex *z, double *scale, + double complex *hvec, int *ifder, double complex *hder); + + +cdef extern from "_internal.h" nogil: + const int BUFSIZE + const int PADDING + + +# {{{ (externally visible) wrappers for bessel / hankel functions + +def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): + """Evaluate spherical Bessel functions.
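+ + A small usage sketch (values illustrative; per the note in the C source, + the output arrays should hold at least nterms + 2 entries): + + fjs = np.empty(10, dtype=np.complex128) + jfuns3d_wrapper(8, 1.0 + 0.5j, 1.0, fjs, None)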
+ + Arguments: + nterms: Highest order to be computed + z: Argument + scale: Output scaling factor (recommended: min(abs(z), 1)) + fjs: Output array of complex doubles + fjder: *None*, or output array of complex double derivatives + """ + cdef: + double complex[BUFSIZE] fjstemp + double complex[BUFSIZE] fjdertmp + int[BUFSIZE] iscale + int ier, ifder, lwfjs, ntop, i, nterms_ + double scale_ + double complex z_ + + if nterms <= 0: + raise ValueError("nterms should be positive") + + nterms_ = nterms + z_ = z + scale_ = scale + ifder = fjder is not None + lwfjs = BUFSIZE + + jfuns3d_(&ier, &nterms_, &z_, &scale_, fjstemp, &ifder, fjdertmp, &lwfjs, + iscale, &ntop) + + if ier: + raise ValueError("jfuns3d_ returned error code %d" % ier) + + for i in range(1 + nterms): + fjs[i] = fjstemp[i] + if ifder: + fjder[i] = fjdertmp[i] + + +def h3dall_wrapper(nterms, z, scale, hs, hders): + """Evaluate spherical Hankel functions. + + Arguments: + nterms: Highest order to be computed + z: Argument + scale: Output scaling factor (recommended: min(abs(z), 1)) + hs: Output array of complex doubles + hders: *None*, or output array of complex double derivatives + """ + cdef: + int nterms_, ifder + double scale_ + double complex z_ + double complex[:] hvec = np.empty(1 + nterms, np.complex) + double complex[:] hdervec = np.empty(1 + nterms, np.complex) + + ifder = hders is not None + + if nterms <= 0: + raise ValueError("nterms should be positive") + + z_ = z + scale_ = scale + nterms_ = nterms + + h3dall_(&nterms_, &z_, &scale_, &hvec[0], &ifder, &hdervec[0]) + + hs[:1 + nterms] = hvec[:] + if ifder: + hders[:1 + nterms] = hdervec[:] + +# }}} + + +# {{{ helpers + +cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: + """Compute the values of the Legendre polynomial up to order n at x. + Optionally, if derivs is non-NULL, compute the values of the derivative too. + + Borrowed from fmmlib. + """ + cdef: + double pj, derj, pjm2, pjm1, derjm2, derjm1 + int j + + pjm2 = 1 + pjm1 = x + + vals[0] = 1 + if derivs != NULL: + derivs[0] = 0 + derjm2 = 0 + derjm1 = 1 + + if n == 0: + return + + vals[1] = x + if derivs != NULL: + derivs[1] = 1 + + if n == 1: + return + + for j in range(2, n + 1): + pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j + vals[j] = pj + + if derivs != NULL: + derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2 + derj = derj / j + derivs[j] = derj + derjm2 = derjm1 + derjm1 = derj + + pjm2 = pjm1 + pjm1 = pj + + +cdef double dist(double[3] a, double[3] b) nogil: + """Calculate the Euclidean distance between a and b.""" + return sqrt( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2])) + + +cdef void ts_helmholtz_precompute( + double[3] center, + double[3] target, + int order, + int ifder, + double complex k, + double complex[] jvals, + double complex[] jderivs, + double *jscale) nogil: + """Evaluate the source-invariant Bessel terms of the Helmholtz target-specific + expansion.""" + + cdef: + double complex z + double tc_d + int ier, ntop, lwfjs + int[BUFSIZE] iscale + + tc_d = dist(target, center) + jscale[0] = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 + + # Evaluate the spherical Bessel terms. + z = k * tc_d + lwfjs = BUFSIZE + # jfuns3d_ only supports order > 0 (goes out of bounds if order = 0) + order = max(1, order) + jfuns3d_(&ier, &order, &z, jscale, jvals, &ifder, jderivs, &lwfjs, iscale, + &ntop) + if ier: + # This could in theory fail. 
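+ # (jfuns3d_ sets ier == 8 exactly when the workspace bound lwfjs, here + # BUFSIZE, is too small for its upward recurrence to terminate, so any + # nonzero ier is treated as fatal.)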
+ fprintf(stderr, "array passed to jfuns3d_ was too small\n") + abort() + +# }}} + + +# {{{ Laplace S + +cdef double complex ts_laplace_s( + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order) nogil: + """Evaluate the target-specific expansion of the Laplace single-layer kernel.""" + + cdef: + double j + double result, r, sc_d, tc_d, cos_angle + # Legendre recurrence values + double pj, pjm1, pjm2 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + / (tc_d * sc_d)) + + if order == 0: + return charge / sc_d + + pjm2 = 1 + pjm1 = cos_angle + + result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) + + r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + + # Invariant: j matches loop counter. Using a double-precision version of the + # loop counter avoids an int-to-double conversion inside the loop. + j = 2. + + for _ in range(2, order + 1): + pj = ( (2.*j-1.)*cos_angle*pjm1-(j-1.)*pjm2 ) / j + result += pj * r + + r *= (tc_d / sc_d) + j += 1 + pjm2 = pjm1 + pjm1 = pj + + return charge * result + +# }}} + + +# {{{ Laplace grad(S) + +cdef void ts_laplace_sp( + double complex[3] grad, + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order) nogil: + """Evaluate the target-specific expansion of the gradient of the Laplace + single-layer kernel.""" + + cdef: + double[3] grad_tmp + double sc_d, tc_d, cos_angle, Rn + double[BUFSIZE] lvals, lderivs + double[3] smc, tmc + int n + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad_tmp[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Invariant: Rn = tc_d ** (n - 1) / sc_d ** (n + 1) + Rn = 1 / (sc_d * sc_d) + + for n in range(1, 1 + order): + for m in range(3): + grad_tmp[m] += Rn * ( + n * (tmc[m] / tc_d) * lvals[n] + + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d) * lderivs[n]) + Rn *= tc_d / sc_d + + for m in range(3): + grad[m] += charge * grad_tmp[m] + +# }}} + + +# {{{ Laplace D + +cdef double complex ts_laplace_d( + double[3] source, + double[3] center, + double[3] target, + double[3] dipole, + double complex dipstr, + int order) nogil: + """Evaluate the target-specific expansion of the Laplace double-layer kernel.""" + + cdef: + int n, m + double sc_d, tc_d, cos_angle, Rn + double[BUFSIZE] lvals, lderivs + double[3] smc, tmc, grad + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Invariant: Rn = (tc_d ** n / sc_d ** (n + 2)) + Rn = 1 / (sc_d * sc_d) + + for n in range(0, order + 1): + for m in range(3): + grad[m] += Rn * ( + -(n + 1) * (smc[m] / sc_d) * lvals[n] + + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d) * lderivs[n]) + Rn *= (tc_d / sc_d) + + return dipstr * ( + dipole[0] * grad[0] + dipole[1] * grad[1] + dipole[2] * grad[2]) + +# }}} + + +# {{{ Helmholtz S + +cdef double complex ts_helmholtz_s( + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order, + double complex k, + double complex[] jvals, 
+ double jscale) nogil: + """Evaluate the target-specific expansion of the Helmholtz single-layer + kernel.""" + + cdef: + int n, ifder + double sc_d, tc_d, cos_angle + double[BUFSIZE] lvals + double complex[BUFSIZE] hvals + double hscale, unscale + double complex z, result + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + / (tc_d * sc_d)) + + # Evaluate the Legendre terms. + legvals(cos_angle, order, lvals, NULL) + + # Scaling magic for Hankel terms. + # These values are taken from the fmmlib documentation. + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. + unscale = 1 + + # Evaluate the spherical Hankel terms. + z = k * sc_d + ifder = 0 + h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) + + result = 0 + + for n in range(1 + order): + result += (2 * n + 1) * unscale * (jvals[n] * hvals[n] * lvals[n]) + unscale *= jscale / hscale + + return 1j * k * charge * result + +# }}} + + +# {{{ Helmholtz grad(S) + +cdef void ts_helmholtz_sp( + double complex[3] grad, + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order, + double complex k, + double complex[] jvals, + double complex[] jderivs, + double jscale) nogil: + """Evaluate the target-specific expansion of the gradient of the Helmholtz + single-layer kernel.""" + + cdef: + int n, m + int ifder + double sc_d, tc_d, cos_angle + double[3] smc, tmc + double complex[3] grad_tmp + double[BUFSIZE] lvals, lderivs + double complex z + double complex [BUFSIZE] hvals + double hscale, unscale + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad_tmp[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + # Evaluate the Legendre terms. + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Scaling magic for Hankel terms. + # These values are taken from the fmmlib documentation. + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. + unscale = 1 + + # Evaluate the spherical Hankel terms. 
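+ # Only the Bessel factors vary with the target point, so the target + # gradient below needs jderivs but no Hankel derivatives; hence + # ifder = 0 in the call that follows.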
+ z = k * sc_d + ifder = 0 + h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) + + # + # This is a mess, but amounts to the t-gradient of: + # + # __ order + # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) + # /__ n = 0 n n n + # + # + for n in range(0, order + 1): + for m in range(3): + grad_tmp[m] += (2 * n + 1) * unscale * hvals[n] / tc_d * ( + k * jderivs[n] * lvals[n] * tmc[m] + + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d) + * jvals[n] * lderivs[n]) + unscale *= jscale / hscale + + for m in range(3): + grad[m] += 1j * k * charge * grad_tmp[m] + +# }}} + + +# {{{ Helmholtz D + +cdef double complex ts_helmholtz_d( + double[3] source, + double[3] center, + double[3] target, + double[3] dipole, + double complex dipstr, + int order, + double complex k, + double complex[] jvals, + double jscale) nogil: + """Evaluate the target-specific expansion of the Helmholtz double-layer + kernel.""" + + cdef: + int n, m + int ifder + double sc_d, tc_d, cos_angle + double[3] smc, tmc + double complex[3] grad + double[BUFSIZE] lvals, lderivs + double complex z + double complex [BUFSIZE] hvals, hderivs + double hscale, unscale + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + + # Evaluate the Legendre terms. + legvals(cos_angle, order, lvals, lderivs) + + # Scaling magic for Hankel terms. + # These values are taken from the fmmlib documentation. + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. + unscale = 1 + + # Evaluate the spherical Hankel terms. + z = k * sc_d + ifder = 1 + h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) + + # + # This is a mess, but amounts to the s-gradient of: + # + # __ order + # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) + # /__ n = 0 n n n + # + # + for n in range(0, order + 1): + for m in range(3): + grad[m] += (2 * n + 1) * unscale * jvals[n] / sc_d * ( + k * smc[m] * hderivs[n] * lvals[n] + + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d) + * hvals[n] * lderivs[n]) + unscale *= jscale / hscale + + return 1j * k * dipstr * ( + grad[0] * dipole[0] + grad[1] * dipole[1] + grad[2] * dipole[2]) + +# }}} + + +def eval_target_specific_qbx_locals( + int ifpot, + int ifgrad, + int ifcharge, + int ifdipole, + int order, + double[:,:] sources, + double[:,:] targets, + double[:,:] centers, + int[:] qbx_centers, + int[:] qbx_center_to_target_box, + int[:] center_to_target_starts, int[:] center_to_target_lists, + int[:] source_box_starts, int[:] source_box_lists, + int[:] box_source_starts, int[:] box_source_counts_nonchild, + double complex helmholtz_k, + double complex[:] charge, + double complex[:] dipstr, + double[:,:] dipvec, + double complex[:] pot, + double complex[:,:] grad): + """TSQBX entry point. 
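+ + For each QBX center c, the targets t associated with it, and the + near-field sources s, this accumulates target-specific expansions; in the + Laplace single-layer case (cf. ts_laplace_s above) the contribution is + + pot(t) += charge(s) * sum_n |t-c|**n / |s-c|**(n+1) * P_n(cos theta), + + where theta is the angle between t - c and s - c (a sketch of the math, + not of the call sequence).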
+ + Arguments: + ifpot: Flag indicating whether to evaluate the potential + ifgrad: Flag indicating whether to evaluate the gradient of the potential + ifcharge: Flag indicating whether to include monopole sources + ifdipole: Flag indicating whether to include dipole sources + order: Expansion order + sources: Array of sources of shape (3, *nsrcs*) + targets: Array of targets of shape (3, *ntgts*) + centers: Array of centers of shape (3, *nctrs*) + qbx_centers: Array of indices of the QBX centers within *centers* + qbx_center_to_target_box: Array mapping centers to target box numbers + center_to_target_starts: "Start" indices for center-to-target CSR list + center_to_target_lists: Center-to-target CSR list + source_box_starts: "Start" indices for target-box-to-source-box CSR list + source_box_lists: Target-box-to-source-box CSR list + box_source_starts: "Start" indices for sources for each box + box_source_counts_nonchild: Number of sources per box + helmholtz_k: Helmholtz parameter (Pass 0 for Laplace) + charge: (Complex) Source strengths, shape (*nsrcs*,), or *None* + dipstr: (Complex) Dipole source strengths, shape (*nsrcs*,) or *None* + dipvec: (Real) Dipole source orientations, shape (3, *nsrcs*), or *None* + pot: (Complex) Output potential, shape (*ntgts*,), or *None* + grad: (Complex) Output gradient, shape (3, *ntgts*), or *None* + """ + + cdef: + int tgt, ictr, ctr + int itgt, itgt_start, itgt_end + int tgt_box, src_ibox + int isrc_box, isrc_box_start, isrc_box_end + int isrc, isrc_start, isrc_end + int tid, m + double jscale + double complex result + double[:,:] source, center, target, dipole + double complex[:,:] result_grad, jvals, jderivs + int laplace_s, helmholtz_s, laplace_sp, helmholtz_sp, laplace_d, helmholtz_d + + # {{{ process arguments + + if ifcharge: + if charge is None: + raise ValueError("Missing charge") + + if ifdipole: + if dipstr is None: + raise ValueError("Missing dipstr") + if dipvec is None: + raise ValueError("Missing dipvec") + + if ifdipole and ifgrad: + raise ValueError("Does not support computing gradient of dipole sources") + + # cdef locals are not zero-initialized, so clear all kernel selection + # flags up front; the branches below only set the ones they need. + laplace_s = laplace_sp = laplace_d = 0 + helmholtz_s = helmholtz_sp = helmholtz_d = 0 + + if helmholtz_k == 0: + helmholtz_s = helmholtz_sp = helmholtz_d = 0 + + if ifpot: + laplace_s = ifcharge + laplace_d = ifdipole + + if ifgrad: + laplace_sp = ifcharge + + else: + laplace_s = laplace_sp = laplace_d = 0 + + if ifpot: + helmholtz_s = ifcharge + helmholtz_d = ifdipole + + if ifgrad: + helmholtz_sp = ifcharge + + # }}} + + if not any([ + laplace_s, laplace_sp, laplace_d, helmholtz_s, helmholtz_sp, + helmholtz_d]): + return + + if qbx_centers.shape[0] == 0: + return + + # {{{ set up thread-local storage + + # Hack to obtain thread-local storage + maxthreads = openmp.omp_get_max_threads() + + # Prevent false sharing by padding the thread-local buffers + source = np.zeros((maxthreads, PADDING)) + target = np.zeros((maxthreads, PADDING)) + center = np.zeros((maxthreads, PADDING)) + dipole = np.zeros((maxthreads, PADDING)) + result_grad = np.zeros((maxthreads, PADDING), dtype=np.complex) + jvals = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex) + jderivs = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex) + + # TODO: Check that the order is not too high, since temporary + # arrays in this module that are limited by BUFSIZE may overflow + # if that is the case + + # }}} + + for ictr in cython.parallel.prange(0, qbx_centers.shape[0], + nogil=True, schedule="static", + chunksize=128): + # Assign to jscale so Cython marks it as private + jscale = 0 + ctr = qbx_centers[ictr] + itgt_start = 
center_to_target_starts[ctr] + itgt_end = center_to_target_starts[ctr + 1] + tgt_box = qbx_center_to_target_box[ctr] + tid = cython.parallel.threadid() + + for m in range(3): + center[tid, m] = centers[m, ctr] + + for itgt in range(itgt_start, itgt_end): + result = 0 + tgt = center_to_target_lists[itgt] + + for m in range(3): + target[tid, m] = targets[m, tgt] + if ifgrad: + result_grad[tid, m] = 0 + + if helmholtz_s or helmholtz_sp or helmholtz_d: + # Precompute source-invariant Helmholtz terms. + ts_helmholtz_precompute( + &center[tid, 0], &target[tid, 0], + order, ifgrad, helmholtz_k, &jvals[tid, 0], + &jderivs[tid, 0], &jscale) + + isrc_box_start = source_box_starts[tgt_box] + isrc_box_end = source_box_starts[tgt_box + 1] + + for isrc_box in range(isrc_box_start, isrc_box_end): + src_ibox = source_box_lists[isrc_box] + isrc_start = box_source_starts[src_ibox] + isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for isrc in range(isrc_start, isrc_end): + + for m in range(3): + source[tid, m] = sources[m, isrc] + if ifdipole: + dipole[tid, m] = dipvec[m, isrc] + + # NOTE: Don't use +=, since that makes Cython think we are + # doing an OpenMP reduction. + + # {{{ evaluate potentials + + if laplace_s: + result = result + ( + ts_laplace_s( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order)) + + if laplace_sp: + ts_laplace_sp( + &result_grad[tid, 0], + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order) + + if laplace_d: + result = result + ( + ts_laplace_d( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + &dipole[tid, 0], dipstr[isrc], order)) + + if helmholtz_s: + result = result + ( + ts_helmholtz_s(&source[tid, 0], &center[tid, 0], + &target[tid, 0], charge[isrc], order, helmholtz_k, + &jvals[tid, 0], jscale)) + + if helmholtz_sp: + ts_helmholtz_sp( + &result_grad[tid, 0], + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order, helmholtz_k, + &jvals[tid, 0], &jderivs[tid, 0], jscale) + + if helmholtz_d: + result = result + ( + ts_helmholtz_d( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + &dipole[tid, 0], dipstr[isrc], order, helmholtz_k, + &jvals[tid, 0], jscale)) + + # }}} + + if ifpot: + pot[tgt] = result + + if ifgrad: + for m in range(3): + grad[m, tgt] = result_grad[tid, m] diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index b0ffc066035c0c8f1aea690ecc5dcb2618367275..ca8b08df88a3fd10364a4c317bf779b1b134c7b5 100644 --- a/pytential/qbx/utils.py +++ b/pytential/qbx/utils.py @@ -601,4 +601,70 @@ def build_tree_with_qbx_metadata( # }}} + +# {{{ host geo data wrapper + +class ToHostTransferredGeoDataWrapper(object): + """Wraps an instance of :class:`pytential.qbx.geometry.QBXFMMGeometryData`, + automatically converting returned OpenCL arrays to host data.
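+ + Usage sketch (names illustrative): + + host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) + tree = host_geo_data.tree() # transferred from the device once, then memoized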
+ """ + + def __init__(self, queue, geo_data): + self.queue = queue + self.geo_data = geo_data + + @memoize_method + def tree(self): + return self.geo_data.tree().get(queue=self.queue) + + @memoize_method + def traversal(self): + return self.geo_data.traversal().get(queue=self.queue) + + @property + def lpot_source(self): + return self.geo_data.lpot_source + + @property + def ncenters(self): + return self.geo_data.ncenters + + @memoize_method + def centers(self): + return np.array([ + ci.get(queue=self.queue) + for ci in self.geo_data.centers()]) + + @memoize_method + def expansion_radii(self): + return self.geo_data.expansion_radii().get(queue=self.queue) + + @memoize_method + def global_qbx_centers(self): + return self.geo_data.global_qbx_centers().get(queue=self.queue) + + @memoize_method + def qbx_center_to_target_box(self): + return self.geo_data.qbx_center_to_target_box().get(queue=self.queue) + + @memoize_method + def qbx_center_to_target_box_source_level(self, source_level): + return self.geo_data.qbx_center_to_target_box_source_level( + source_level).get(queue=self.queue) + + @memoize_method + def non_qbx_box_target_lists(self): + return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue) + + @memoize_method + def center_to_tree_targets(self): + return self.geo_data.center_to_tree_targets().get(queue=self.queue) + + @memoize_method + def all_targets(self): + """All (not just non-QBX) targets packaged into a single array.""" + return np.array(list(self.tree().targets)) + +# }}} + # vim: foldmethod=marker:filetype=pyopencl diff --git a/pytential/source.py b/pytential/source.py index 9334dff7e1785cf7d1b36c991effcd6382d2a52e..93d8050ef41fc5cf8c032790f1bc1b6674dbe547 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -27,6 +27,7 @@ import numpy as np # noqa: F401 import pyopencl as cl # noqa: F401 import six from pytools import memoize_method +from sumpy.fmm import UnableToCollectTimingData __doc__ = """ @@ -123,7 +124,18 @@ class PointPotentialSource(PotentialSource): return p2p - def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def perf_model_compute_potential_insn(self, queue, insn, bound_expr, + evaluate, costs): + raise NotImplementedError + + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate, + return_timing_data): + if return_timing_data: + from warnings import warn + warn( + "Timing data collection not supported.", + category=UnableToCollectTimingData) + p2p = None kernel_args = {} @@ -148,7 +160,8 @@ class PointPotentialSource(PotentialSource): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result + timing_data = {} + return result, timing_data @memoize_method def weights_and_area_elements(self): @@ -188,6 +201,7 @@ class LayerPotentialSourceBase(PotentialSource): .. rubric:: Execution .. method:: weights_and_area_elements + .. method:: perf_model_compute_potential_insn .. 
method:: exec_compute_potential_insn """ diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index 456044e75a3903b08ffdfa6537f1fac0cd1f6e09..5959b598fa6b20c8b4d3f95a518174c7e600bc28 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -47,9 +47,6 @@ class Instruction(Record): def __str__(self): raise NotImplementedError - def get_exec_function(self, exec_mapper): - raise NotImplementedError - class Assign(Instruction): # attributes: names, exprs, do_not_return, priority @@ -111,9 +108,6 @@ class Assign(Instruction): lines.append("}") return "\n".join(lines) - def get_exec_function(self, exec_mapper): - return exec_mapper.exec_assign - def __hash__(self): return id(self) @@ -223,10 +217,6 @@ class ComputePotentialInstruction(Instruction): return "{ /* Pot(%s) */\n %s\n}" % ( ", ".join(args), "\n ".join(lines)) - def get_exec_function(self, exec_mapper): - source = exec_mapper.bound_expr.places[self.source] - return source.exec_compute_potential_insn - def __hash__(self): return id(self) @@ -362,6 +352,14 @@ class Code(object): return argmax2(available_insns), discardable_vars + @staticmethod + def get_exec_function(insn, exec_mapper): + if isinstance(insn, Assign): + return exec_mapper.exec_assign + if isinstance(insn, ComputePotentialInstruction): + return exec_mapper.exec_compute_potential_insn + raise ValueError("unknown instruction class: %s" % type(insn)) + def execute(self, exec_mapper, pre_assign_check=None): """Execute the instruction stream, make all scheduling decisions dynamically. @@ -389,7 +387,7 @@ class Code(object): done_insns.add(insn) assignments = ( - insn.get_exec_function(exec_mapper) + self.get_exec_function(insn, exec_mapper) (exec_mapper.queue, insn, exec_mapper.bound_expr, exec_mapper)) diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 976e352f48c6a03c61429830758eb24517a47407..bbd7308b772714e0632616f0a52b48bbe80737a3 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -29,7 +29,7 @@ import six from six.moves import zip from pymbolic.mapper.evaluator import ( - EvaluationMapper as EvaluationMapperBase) + EvaluationMapper as PymbolicEvaluationMapper) import numpy as np import pyopencl as cl @@ -48,11 +48,13 @@ from pytential.symbolic.primitives import ( # {{{ evaluation mapper -class EvaluationMapper(EvaluationMapperBase): - def __init__(self, bound_expr, queue, context={}, +class EvaluationMapperBase(PymbolicEvaluationMapper): + def __init__(self, bound_expr, queue, context=None, target_geometry=None, target_points=None, target_normals=None, target_tangents=None): - EvaluationMapperBase.__init__(self, context) + if context is None: + context = {} + PymbolicEvaluationMapper.__init__(self, context) self.bound_expr = bound_expr self.queue = queue @@ -181,6 +183,9 @@ class EvaluationMapper(EvaluationMapperBase): return [(name, evaluate(expr)) for name, expr in zip(insn.names, insn.exprs)] + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + raise NotImplementedError + # {{{ functions def apply_real(self, args): @@ -227,6 +232,69 @@ class EvaluationMapper(EvaluationMapperBase): # }}} +# {{{ evaluation mapper + +class EvaluationMapper(EvaluationMapperBase): + + def __init__(self, bound_expr, queue, context=None, + timing_data=None): + EvaluationMapperBase.__init__(self, bound_expr, queue, context) + self.timing_data = timing_data + + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + source = 
+        source = bound_expr.places[insn.source]
+
+        return_timing_data = self.timing_data is not None
+
+        result, timing_data = (
+                source.exec_compute_potential_insn(
+                    queue, insn, bound_expr, evaluate, return_timing_data))
+
+        if return_timing_data:
+            self.timing_data[insn] = timing_data
+
+        return result
+
+# }}}
+
+
+# {{{ performance model mapper
+
+class PerformanceModelMapper(EvaluationMapperBase):
+    """Mapper for evaluating performance models.
+
+    This executes everything *except* the layer potential operator. Instead of
+    executing the operator, the performance model gets run and the performance
+    data is collected.
+    """
+
+    def __init__(self, bound_expr, queue, context=None,
+            target_geometry=None,
+            target_points=None, target_normals=None, target_tangents=None):
+        if context is None:
+            context = {}
+        EvaluationMapperBase.__init__(
+                self, bound_expr, queue, context,
+                target_geometry,
+                target_points,
+                target_normals,
+                target_tangents)
+        self.modeled_performance = {}
+
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+        source = bound_expr.places[insn.source]
+        result, perf_model_result = (
+                source.perf_model_compute_potential_insn(
+                    queue, insn, bound_expr, evaluate))
+        self.modeled_performance[insn] = perf_model_result
+        return result
+
+    def get_modeled_performance(self):
+        return self.modeled_performance
+
+# }}}
+
+
 # {{{ scipy-like mat-vec op
 
 class MatVecOp:
@@ -473,6 +541,11 @@ class BoundExpression(object):
     def get_discretization(self, where):
         return self.places.get_discretization(where)
 
+    def get_modeled_performance(self, queue, **args):
+        perf_model_mapper = PerformanceModelMapper(self, queue, args)
+        self.code.execute(perf_model_mapper)
+        return perf_model_mapper.get_modeled_performance()
+
     def scipy_op(self, queue, arg_name, dtype, domains=None, **extra_args):
         """
         :arg domains: a list of discretization identifiers or
@@ -511,10 +584,16 @@ class BoundExpression(object):
         return MatVecOp(self, queue,
                 arg_name, dtype, total_dofs, starts_and_ends, extra_args)
 
-    def __call__(self, queue, **args):
-        exec_mapper = EvaluationMapper(self, queue, args)
+    def eval(self, queue, context=None, timing_data=None):
+        if context is None:
+            context = {}
+        exec_mapper = EvaluationMapper(
+                self, queue, context, timing_data=timing_data)
         return self.code.execute(exec_mapper)
 
+    def __call__(self, queue, **args):
+        return self.eval(queue, args)
+
 
 def bind(places, expr, auto_where=None):
     """
diff --git a/pytential/unregularized.py b/pytential/unregularized.py
index e7fe1b3e73bfe17e14ab9aa6d57a86f6c9c92375..cad85f99631020d404b4b8a9ced7f64a1fc2fc4a 100644
--- a/pytential/unregularized.py
+++ b/pytential/unregularized.py
@@ -131,7 +131,15 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
             density_discr=density_discr or self.density_discr,
             debug=debug if debug is not None else self.debug)
 
-    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate):
+    def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate,
+            return_timing_data):
+        if return_timing_data:
+            from warnings import warn
+            from pytential.source import UnableToCollectTimingData
+            warn(
+                "Timing data collection not supported.",
+                category=UnableToCollectTimingData)
+
         from pytools.obj_array import with_object_array_or_scalar
 
         def evaluate_wrapper(expr):
@@ -187,7 +195,8 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
 
             result.append((o.name, output_for_each_kernel[o.kernel_index]))
 
-        return result
+        timing_data = {}
+        return result, timing_data
 
     # {{{ fmm-based execution
@@ -270,8 +279,9 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
         # }}}
 
         from boxtree.fmm import drive_fmm
+        timing_data = {}
         all_potentials_on_every_tgt = drive_fmm(
-                geo_data.traversal(), wrangler, strengths)
+                geo_data.traversal(), wrangler, strengths, timing_data)
 
         # {{{ postprocess fmm
 
@@ -288,7 +298,7 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase):
 
         # }}}
 
-        return result
+        return result, timing_data
 
     # }}}
diff --git a/setup.py b/setup.py
index e2a1ae90b75867829508b789fbe340e15b29c426..bf1a6b7c8c1a25331f78ad4abbd9588b97a7779f 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,8 @@
 import os
 
 from setuptools import setup, find_packages
+from setuptools.extension import Extension
+from Cython.Build import cythonize
 
 # {{{ capture git revision at install time
@@ -54,6 +56,21 @@ write_git_revision("pytential")
 
 # }}}
 
+ext_modules = [
+    Extension(
+        "pytential.qbx.target_specific._internal",
+        sources=[
+            "pytential/qbx/target_specific/_internal.pyx",
+            "pytential/qbx/target_specific/_helmholtz_utils.c"],
+        depends=[
+            "pytential/qbx/target_specific/_internal.h",
+            "pytential/qbx/target_specific/_helmholtz_utils.h"],
+        extra_compile_args=["-Wall", "-fopenmp", "-Ofast"],
+        extra_link_args=["-fopenmp"]
+    ),
+]
+
+
 version_dict = {}
 init_filename = "pytential/version.py"
 os.environ["AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY"] = "1"
@@ -91,6 +108,8 @@ setup(name="pytential",
 
       packages=find_packages(),
 
+      ext_modules=cythonize(ext_modules),
+
       install_requires=[
           "pytest>=2.3",
           # FIXME leave out for now
diff --git a/test/test_layer_pot.py b/test/test_layer_pot.py
index 4b5d70b675871031023d3f5cf40c049c008d0f5b..b0edf93310b263ec02e3d694a7bb3e4b2a41cc77 100644
--- a/test/test_layer_pot.py
+++ b/test/test_layer_pot.py
@@ -183,7 +183,7 @@ def test_off_surface_eval_vs_direct(ctx_getter, do_plot=False):
             target_association_tolerance=0.05,
             ).with_refinement()
 
-    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000)
+    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
     from pytential.target import PointsTarget
     ptarget = PointsTarget(fplot.points)
     from sumpy.kernel import LaplaceKernel
@@ -315,77 +315,6 @@ def test_unregularized_off_surface_fmm_vs_direct(ctx_getter):
 
 # }}}
 
 
-# {{{ test performance data gathering
-
-def test_perf_data_gathering(ctx_getter, n_arms=5):
-    cl_ctx = ctx_getter()
-    queue = cl.CommandQueue(cl_ctx)
-
-    # prevent cache 'splosion
-    from sympy.core.cache import clear_cache
-    clear_cache()
-
-    target_order = 8
-
-    starfish_func = NArmedStarfish(n_arms, 0.8)
-    mesh = make_curve_mesh(
-            starfish_func,
-            np.linspace(0, 1, n_arms * 30),
-            target_order)
-
-    sigma_sym = sym.var("sigma")
-
-    # The kernel doesn't really matter here
-    from sumpy.kernel import LaplaceKernel
-    k_sym = LaplaceKernel(mesh.ambient_dim)
-
-    sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
-
-    from meshmode.discretization import Discretization
-    from meshmode.discretization.poly_element import (
-            InterpolatoryQuadratureSimplexGroupFactory)
-    pre_density_discr = Discretization(
-            queue.context, mesh,
-            InterpolatoryQuadratureSimplexGroupFactory(target_order))
-
-    results = []
-
-    def inspect_geo_data(insn, bound_expr, geo_data):
-        from pytential.qbx.fmm import assemble_performance_data
-        perf_data = assemble_performance_data(geo_data, uses_pde_expansions=True)
-        results.append(perf_data)
-
-        return False  # no need to do the actual FMM
-
-    from pytential.qbx import QBXLayerPotentialSource
-    lpot_source = QBXLayerPotentialSource(
-            pre_density_discr, 4*target_order,
-            # qbx order and fmm order don't really matter
-            10, fmm_order=10,
-            _expansions_in_tree_have_extent=True,
-            _expansion_stick_out_factor=0.5,
-            geometry_data_inspector=inspect_geo_data,
-            target_association_tolerance=1e-10,
-            )
-
-    lpot_source, _ = lpot_source.with_refinement()
-
-    density_discr = lpot_source.density_discr
-
-    if 0:
-        from meshmode.discretization.visualization import draw_curve
-        draw_curve(density_discr)
-        import matplotlib.pyplot as plt
-        plt.show()
-
-    nodes = density_discr.nodes().with_queue(queue)
-    sigma = cl.clmath.sin(10 * nodes[0])
-
-    bind(lpot_source, sym_op)(queue, sigma=sigma)
-
-# }}}
-
-
 # {{{ test 3D jump relations
 
 @pytest.mark.parametrize("relation", ["sp", "nxcurls", "div_s"])
diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py
index c376fdf3e5072aba8604147c70a4f5d918c15f33..a43d2caf78b066218d622f39e250e1226d285891 100644
--- a/test/test_layer_pot_identity.py
+++ b/test/test_layer_pot_identity.py
@@ -233,32 +233,38 @@ class DynamicTestCase(object):
         if (self.geometry.mesh_name == "sphere"
                 and self.k != 0
                 and self.fmm_backend == "sumpy"):
-            pytest.skip("both direct eval and generating the FMM kernels "
+            raise ValueError("both direct eval and generating the FMM kernels "
                     "are too slow")
 
         if (self.geometry.mesh_name == "sphere"
                 and self.expr.zero_op_name == "green_grad"):
-            pytest.skip("does not achieve sufficient precision")
+            raise ValueError("does not achieve sufficient precision")
 
 
 # {{{ integral identity tester
 
+
+@pytest.mark.slowtest
+@pytest.mark.parametrize("case", [
+    DynamicTestCase(SphereGeometry(), GreenExpr(), 0),
+])
+def test_identity_convergence_slow(ctx_getter, case):
+    test_identity_convergence(ctx_getter, case)
+
+
 @pytest.mark.parametrize("case", [
-    tc
-    for geom in [
-        StarfishGeometry(),
-        SphereGeometry(),
-        ]
-    for tc in [
-        DynamicTestCase(geom, GreenExpr(), 0),
-        DynamicTestCase(geom, GreenExpr(), 1.2),
-        DynamicTestCase(geom, GradGreenExpr(), 0),
-        DynamicTestCase(geom, GradGreenExpr(), 1.2),
-        DynamicTestCase(geom, ZeroCalderonExpr(), 0),
-
-        DynamicTestCase(geom, GreenExpr(), 0, fmm_backend="fmmlib"),
-        DynamicTestCase(geom, GreenExpr(), 1.2, fmm_backend="fmmlib"),
-        ]])
+    # 2d
+    DynamicTestCase(StarfishGeometry(), GreenExpr(), 0),
+    DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2),
+    DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 0),
+    DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 1.2),
+    DynamicTestCase(StarfishGeometry(), ZeroCalderonExpr(), 0),
+    DynamicTestCase(StarfishGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+    DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib"),
+    # 3d
+    DynamicTestCase(SphereGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"),
+    DynamicTestCase(SphereGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib")
+])
 def test_identity_convergence(ctx_getter, case, visualize=False):
     logging.basicConfig(level=logging.INFO)
diff --git a/test/test_performance_model.py b/test/test_performance_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e096e7468f1970cb6e01ad3d12f1490c0d81b19b
--- /dev/null
+++ b/test/test_performance_model.py
@@ -0,0 +1,546 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2018 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la  # noqa
+
+from boxtree.tools import ConstantOneExpansionWrangler
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+from pytools import one
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+
+from pytential import bind, sym, norm  # noqa
+from pytential.qbx.performance import PerformanceModel
+
+
+# {{{ global params
+
+TARGET_ORDER = 8
+OVSMP_FACTOR = 5
+TCF = 0.9
+QBX_ORDER = 5
+FMM_ORDER = 10
+
+DEFAULT_LPOT_KWARGS = {
+        "_box_extent_norm": "l2",
+        "_from_sep_smaller_crit": "static_l2",
+        }
+
+
+def get_lpot_source(queue, dim):
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import (
+            InterpolatoryQuadratureSimplexGroupFactory)
+
+    target_order = TARGET_ORDER
+
+    if dim == 2:
+        from meshmode.mesh.generation import starfish, make_curve_mesh
+        mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order)
+    elif dim == 3:
+        from meshmode.mesh.generation import generate_torus
+        mesh = generate_torus(2, 1, order=target_order)
+    else:
+        raise ValueError("unsupported dimension: %d" % dim)
+
+    pre_density_discr = Discretization(
+            queue.context, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    lpot_kwargs = DEFAULT_LPOT_KWARGS.copy()
+    lpot_kwargs.update(
+            _expansion_stick_out_factor=TCF,
+            fmm_order=FMM_ORDER, qbx_order=QBX_ORDER
+            )
+
+    from pytential.qbx import QBXLayerPotentialSource
+    lpot_source = QBXLayerPotentialSource(
+            pre_density_discr, OVSMP_FACTOR*target_order,
+            **lpot_kwargs)
+
+    lpot_source, _ = lpot_source.with_refinement()
+
+    return lpot_source
+
+
+def get_density(queue, lpot_source):
+    density_discr = lpot_source.density_discr
+    nodes = density_discr.nodes().with_queue(queue)
+    return cl.clmath.sin(10 * nodes[0])
+
+# }}}
+
+
+# {{{ test timing data gathering
+
+def test_timing_data_gathering(ctx_getter):
+    pytest.importorskip("pyfmmlib")
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    lpot_source = get_lpot_source(queue, 2)
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    op_S = bind(lpot_source, sym_op_S)
+
+    timing_data = {}
+    op_S.eval(queue, dict(sigma=sigma), timing_data=timing_data)
+    assert timing_data
+    print(timing_data)
+
+# }}}
+
+
+# {{{ test performance model
+
+@pytest.mark.parametrize("dim", (2, 3))
+def test_performance_model(ctx_getter, dim):
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    lpot_source = get_lpot_source(queue, dim)
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+    op_S = bind(lpot_source, sym_op_S)
+    perf_S = op_S.get_modeled_performance(queue, sigma=sigma)
+    assert len(perf_S) == 1
+
+    sym_op_S_plus_D = (
+            sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+            + sym.D(k_sym, sigma_sym))
+    op_S_plus_D = bind(lpot_source, sym_op_S_plus_D)
+    perf_S_plus_D = op_S_plus_D.get_modeled_performance(queue, sigma=sigma)
+    assert len(perf_S_plus_D) == 2
+
+# }}}
+
+
+# {{{ test performance model parameter gathering
+
+def test_performance_model_parameter_gathering(ctx_getter):
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
+
+    fmm_level_to_order = SimpleExpansionOrderFinder(tol=1e-5)
+
+    lpot_source = get_lpot_source(queue, 2).copy(
+            fmm_level_to_order=fmm_level_to_order)
+
+    sigma = get_density(queue, lpot_source)
+
+    sigma_sym = sym.var("sigma")
+    k_sym = HelmholtzKernel(2, "k")
+    k = 2
+
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1, k=sym.var("k"))
+    op_S = bind(lpot_source, sym_op_S)
+
+    perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma, k=k).values())
+
+    geo_data = lpot_source.qbx_fmm_geometry_data(
+            target_discrs_and_qbx_sides=((lpot_source.density_discr, 1),))
+
+    tree = geo_data.tree()
+
+    assert perf_S.params["p_qbx"] == QBX_ORDER
+    assert perf_S.params["nlevels"] == tree.nlevels
+    assert perf_S.params["nsources"] == tree.nsources
+    assert perf_S.params["ntargets"] == tree.ntargets
+    assert perf_S.params["ncenters"] == geo_data.ncenters
+
+    for level in range(tree.nlevels):
+        assert (
+                perf_S.params["p_fmm_lev%d" % level]
+                == fmm_level_to_order(k_sym, {"k": 2}, tree, level))
+
+# }}}
+
+
+# {{{ constant one wrangler
+
+class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler):
+
+    def __init__(self, queue, geo_data, use_target_specific_qbx):
+        from pytential.qbx.utils import ToHostTransferredGeoDataWrapper
+        geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data)
+
+        self.geo_data = geo_data
+        self.trav = geo_data.traversal()
+        self.use_target_specific_qbx = use_target_specific_qbx
+
+        ConstantOneExpansionWrangler.__init__(self, geo_data.tree())
+
+    def _get_target_slice(self, ibox):
+        non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists()
+        pstart = non_qbx_box_target_lists.box_target_starts[ibox]
+        return slice(
+                pstart, pstart
+                + non_qbx_box_target_lists.box_target_counts_nonchild[ibox])
+
+    def output_zeros(self):
+        non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists()
+        return np.zeros(non_qbx_box_target_lists.nfiltered_targets)
+
+    def full_output_zeros(self):
+        from pytools.obj_array import make_obj_array
+        return make_obj_array([np.zeros(self.tree.ntargets)])
+
+    def qbx_local_expansion_zeros(self):
+        return np.zeros(self.geo_data.ncenters)
+
+    def reorder_potentials(self, potentials):
+        raise NotImplementedError("reorder_potentials should not "
+                "be called on a QBXExpansionWrangler")
+
+    def form_global_qbx_locals(self, src_weights):
+        local_exps = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        if self.use_target_specific_qbx:
+            return local_exps, self.timing_future(ops)
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for tgt_icenter in global_qbx_centers:
+            itgt_box = qbx_center_to_target_box[tgt_icenter]
+
+            start, end = (
+                    self.trav.neighbor_source_boxes_starts[itgt_box:itgt_box + 2])
+
+            src_sum = 0
+            for src_ibox in self.trav.neighbor_source_boxes_lists[start:end]:
+                src_pslice = self._get_source_slice(src_ibox)
+                ops += src_pslice.stop - src_pslice.start
+                src_sum += np.sum(src_weights[src_pslice])
+
+            local_exps[tgt_icenter] = src_sum
+
+        return local_exps, self.timing_future(ops)
+
+    def translate_box_multipoles_to_qbx_local(self, multipole_exps):
+        local_exps = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+
+        for isrc_level, ssn in enumerate(self.trav.from_sep_smaller_by_level):
+            for tgt_icenter in global_qbx_centers:
+                icontaining_tgt_box = (
+                        self.geo_data
+                        .qbx_center_to_target_box_source_level(isrc_level)
+                        [tgt_icenter])
+
+                if icontaining_tgt_box == -1:
+                    continue
+
+                start, stop = (
+                        ssn.starts[icontaining_tgt_box],
+                        ssn.starts[icontaining_tgt_box+1])
+
+                for src_ibox in ssn.lists[start:stop]:
+                    local_exps[tgt_icenter] += multipole_exps[src_ibox]
+                    ops += 1
+
+        return local_exps, self.timing_future(ops)
+
+    def translate_box_local_to_qbx_local(self, local_exps):
+        qbx_expansions = self.qbx_local_expansion_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for tgt_icenter in global_qbx_centers:
+            isrc_box = qbx_center_to_target_box[tgt_icenter]
+            src_ibox = self.trav.target_boxes[isrc_box]
+            qbx_expansions[tgt_icenter] += local_exps[src_ibox]
+            ops += 1
+
+        return qbx_expansions, self.timing_future(ops)
+
+    def eval_qbx_expansions(self, qbx_expansions):
+        output = self.full_output_zeros()
+        ops = 0
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        center_to_tree_targets = self.geo_data.center_to_tree_targets()
+
+        for src_icenter in global_qbx_centers:
+            start, end = (
+                    center_to_tree_targets.starts[src_icenter:src_icenter+2])
+            for icenter_tgt in range(start, end):
+                center_itgt = center_to_tree_targets.lists[icenter_tgt]
+                output[0][center_itgt] += qbx_expansions[src_icenter]
+                ops += 1
+
+        return output, self.timing_future(ops)
+
+    def eval_target_specific_qbx_locals(self, src_weights):
+        pot = self.full_output_zeros()
+        ops = 0
+
+        if not self.use_target_specific_qbx:
+            return pot, self.timing_future(ops)
+
+        global_qbx_centers = self.geo_data.global_qbx_centers()
+        center_to_tree_targets = self.geo_data.center_to_tree_targets()
+        qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box()
+
+        for ictr in global_qbx_centers:
+            tgt_ibox = qbx_center_to_target_box[ictr]
+
+            ictr_tgt_start, ictr_tgt_end = center_to_tree_targets.starts[ictr:ictr+2]
+
+            for ictr_tgt in range(ictr_tgt_start, ictr_tgt_end):
+                ctr_itgt = center_to_tree_targets.lists[ictr_tgt]
+
+                isrc_box_start, isrc_box_end = (
+                        self.trav.neighbor_source_boxes_starts[tgt_ibox:tgt_ibox+2])
+
+                for isrc_box in range(isrc_box_start, isrc_box_end):
+                    src_ibox = self.trav.neighbor_source_boxes_lists[isrc_box]
+
+                    isrc_start = self.tree.box_source_starts[src_ibox]
+                    isrc_end = (isrc_start
+                            + self.tree.box_source_counts_nonchild[src_ibox])
+
+                    pot[0][ctr_itgt] += sum(src_weights[isrc_start:isrc_end])
+                    ops += isrc_end - isrc_start
+
+        return pot, self.timing_future(ops)
+
+# }}}
+
+
+# {{{ verify performance model
+
+class OpCountingTranslationCostModel(object):
+    """A translation cost model which assigns a cost of 1 to each operation."""
+
+    def __init__(self, dim, nlevels):
+        pass
+
+    @staticmethod
+    def direct():
+        return 1
+
+    p2qbxl = direct
+    p2p_tsqbx = direct
+    qbxl2p = direct
+
+    @staticmethod
+    def p2l(level):
+        return 1
+
+    l2p = p2l
+    p2m = p2l
+    m2p = p2l
+    m2qbxl = p2l
+    l2qbxl = p2l
+
+    @staticmethod
+    def m2m(src_level, tgt_level):
+        return 1
+
+    l2l = m2m
+    m2l = m2m
+
+
+@pytest.mark.parametrize("dim, off_surface, use_target_specific_qbx", (
+        (2, False, False),
+        (2, True, False),
+        (3, False, False),
+        (3, False, True),
+        (3, True, False),
+        (3, True, True)))
+def test_performance_model_correctness(ctx_getter, dim, off_surface,
+        use_target_specific_qbx):
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    perf_model = (
+            PerformanceModel(
+                translation_cost_model_factory=OpCountingTranslationCostModel))
+
+    lpot_source = get_lpot_source(queue, dim).copy(
+            performance_model=perf_model,
+            _use_target_specific_qbx=use_target_specific_qbx)
+
+    # Construct targets.
+    if off_surface:
+        from pytential.target import PointsTarget
+        from boxtree.tools import make_uniform_particle_array
+        ntargets = 10 ** 3
+        targets = PointsTarget(
+                make_uniform_particle_array(queue, ntargets, dim, np.float))
+        target_discrs_and_qbx_sides = ((targets, 0),)
+        qbx_forced_limit = None
+    else:
+        targets = lpot_source.density_discr
+        target_discrs_and_qbx_sides = ((targets, 1),)
+        qbx_forced_limit = 1
+
+    # Construct bound op, run performance model.
+    sigma_sym = sym.var("sigma")
+    k_sym = LaplaceKernel(lpot_source.ambient_dim)
+    sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=qbx_forced_limit)
+
+    op_S = bind((lpot_source, targets), sym_op_S)
+    sigma = get_density(queue, lpot_source)
+
+    from pytools import one
+    perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values())
+
+    # Run FMM with ConstantOneWrangler. This can't be done with pytential's
+    # high-level interface, so call the FMM driver directly.
+    from pytential.qbx.fmm import drive_fmm
+    geo_data = lpot_source.qbx_fmm_geometry_data(
+            target_discrs_and_qbx_sides=target_discrs_and_qbx_sides)
+
+    wrangler = ConstantOneQBXExpansionWrangler(
+            queue, geo_data, use_target_specific_qbx)
+    nnodes = lpot_source.quad_stage2_density_discr.nnodes
+    src_weights = np.ones(nnodes)
+
+    timing_data = {}
+    potential = drive_fmm(wrangler, src_weights, timing_data,
+            traversal=wrangler.trav)[0][geo_data.ncenters:]
+
+    # Check constant one wrangler for correctness.
+    assert (potential == nnodes).all()
+
+    modeled_time = perf_S.get_predicted_times(merge_close_lists=True)
+
+    # Check that the performance model matches the timing data returned by the
+    # constant one wrangler.
+    mismatches = []
+    for stage in timing_data:
+        if timing_data[stage]["ops_elapsed"] != modeled_time[stage]:
+            mismatches.append(
+                    (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage]))
+
+    assert not mismatches, "\n".join(str(s) for s in mismatches)
+
+# }}}
+
+
+# {{{ test order varying by level
+
+CONSTANT_ONE_PARAMS = dict(
+        c_l2l=1,
+        c_l2p=1,
+        c_l2qbxl=1,
+        c_m2l=1,
+        c_m2m=1,
+        c_m2p=1,
+        c_m2qbxl=1,
+        c_p2l=1,
+        c_p2m=1,
+        c_p2p=1,
+        c_p2qbxl=1,
+        c_qbxl2p=1,
+        c_p2p_tsqbx=1,
+        )
+
+
+def test_performance_model_order_varying_by_level(ctx_getter):
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    # {{{ constant level to order
+
+    def level_to_order_constant(kernel, kernel_args, tree, level):
+        return 1
+
+    lpot_source = get_lpot_source(queue, 2).copy(
+            performance_model=PerformanceModel(
+                calibration_params=CONSTANT_ONE_PARAMS),
+            fmm_level_to_order=level_to_order_constant)
+
+    sigma_sym = sym.var("sigma")
+
+    k_sym = LaplaceKernel(2)
+    sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1)
+
+    sigma = get_density(queue, lpot_source)
+
+    perf_constant = one(
+            bind(lpot_source, sym_op)
+            .get_modeled_performance(queue, sigma=sigma).values())
+
+    # }}}
+
+    # {{{ varying level to order
+
+    varying_order_params = perf_constant.params.copy()
+
+    nlevels = perf_constant.params["nlevels"]
+    for level in range(nlevels):
+        varying_order_params["p_fmm_lev%d" % level] = nlevels - level
+
+    perf_varying = perf_constant.with_params(varying_order_params)
+
+    # }}}
+
+    # This only checks to ensure that the costs are different. The varying-level
+    # case should have larger cost.
+
+    assert (
+            sum(perf_varying.get_predicted_times().values())
+            > sum(perf_constant.get_predicted_times().values()))
+
+# }}}
+
+
+# You can test individual routines by typing
+# $ python test_performance_model.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+
+# vim: foldmethod=marker
diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py
new file mode 100644
index 0000000000000000000000000000000000000000..d711372dae82728ca398be59ef252508cefcb7af
--- /dev/null
+++ b/test/test_target_specific_qbx.py
@@ -0,0 +1,219 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2013-2017 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+""" + + +import numpy as np +import numpy.linalg as la # noqa +import pyopencl as cl +import pyopencl.clmath as clmath +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from functools import partial # noqa +from meshmode.mesh.generation import ( # noqa + ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, + NArmedStarfish, + make_curve_mesh) + +from pytential import bind, sym, norm # noqa +from sumpy.kernel import LaplaceKernel, HelmholtzKernel + +import logging +logger = logging.getLogger(__name__) + + +def test_spherical_bessel_functions(): + import pytential.qbx.target_specific as ts + + nterms = 9 + z = 3j + scale = 1 + j = np.zeros(1 + nterms, dtype=np.complex) + jder = np.zeros(1 + nterms, dtype=np.complex) + ts.jfuns3d_wrapper(nterms, z, scale, j, jder) + + # Reference solution computed using scipy.special.spherical_jn + + j_expected = np.array([ + +3.33929164246994992e+00 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +2.24279011776926884e+00j, + -1.09650152470070195e+00 + +0.00000000000000000e+00j, + +2.77555756156289135e-17 + -4.15287576601431119e-01j, + +1.27497179297362317e-01 + +0.00000000000000000e+00j, + -3.46944695195361419e-18 + +3.27960387093445271e-02j, + -7.24503736309898075e-03 + -4.33680868994201774e-19j, + +0.00000000000000000e+00 + -1.40087680258226812e-03j, + +2.40653350187633002e-04 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +3.71744848523478122e-05j, + ]) + + assert np.allclose(j, j_expected) + + jder_expected = np.array([ + -0.00000000000000000e+00 + -2.24279011776926884e+00j, + +1.84409823062377076e+00 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +1.14628859306856690e+00j, + -5.42784755898793825e-01 + +3.70074341541718826e-17j, + +2.77555756156289135e-17 + -2.02792277772493951e-01j, + +6.19051018786732632e-02 + -6.93889390390722838e-18j, + -2.45752492430047685e-18 + +1.58909515287802387e-02j, + -3.50936588954626604e-03 + -4.33680868994201774e-19j, + +0.00000000000000000e+00 + -6.78916752019369197e-04j, + +1.16738400679806980e-04 + +0.00000000000000000e+00j, + ]) + + assert np.allclose(jder, jder_expected) + + +def test_spherical_hankel_functions(): + import pytential.qbx.target_specific as ts + + nterms = 9 + z = 2 + 3j + scale = 1 + h = np.zeros(1 + nterms, dtype=np.complex) + hder = np.zeros(1 + nterms, dtype=np.complex) + ts.h3dall_wrapper(nterms, z, scale, h, hder) + + # Reference solution computed using + # scipy.special.spherical_jn + 1j * scipy.special.spherical_yn + h_expected = np.array([ + +1.17460537937623677e-02 + -7.25971518952217565e-03j, + -7.12794888037171503e-03 + -1.55735608522498126e-02j, + -2.58175723285687941e-02 + +5.00665171335734627e-03j, + -6.95481631849959037e-03 + +4.92143379339500253e-02j, + +9.78278544942576822e-02 + +5.92281078069348405e-02j, + +2.65420992601874961e-01 + -1.70387117227806167e-01j, + -8.11750107462848453e-02 + -1.02133651818182791e+00j, + -3.49178056863992792e+00 + -1.62876088689699405e+00j, + -1.36147986022969878e+01 + +9.34959028601928743e+00j, + +4.56300765393887087e+00 + +7.94934376901125432e+01j, + ]) + + assert np.allclose(h, h_expected) + + hder_expected = np.array([ + +7.12794888037171503e-03 + +1.55735608522498126e-02j, + +2.11270661502996893e-02 + -5.75767287207851197e-03j, + +1.32171023895111261e-03 + -3.57580271012700734e-02j, + -6.69663049946767064e-02 + -3.16989251553807527e-02j, + -1.50547136475930293e-01 + +1.16532548652759055e-01j, + +8.87444851771816839e-02 + 
+            +2.00269153354544205e+00 + +7.98384884993240895e-01j,
+            +7.22334424954346144e+00 + -5.46307186102847187e+00j,
+            -4.05890079026877615e+00 + -4.28512368415405192e+01j,
+            -2.04081205047078043e+02 + -1.02417988497371837e+02j,
+            ])
+
+    assert np.allclose(hder, hder_expected)
+
+
+@pytest.mark.parametrize("op", ["S", "D", "Sp"])
+@pytest.mark.parametrize("helmholtz_k", [0, 1.2, 12 + 1.2j])
+@pytest.mark.parametrize("qbx_order", [0, 1, 5])
+def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order):
+    logging.basicConfig(level=logging.INFO)
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    target_order = 4
+    fmm_tol = 1e-3
+
+    from meshmode.mesh.generation import generate_icosphere
+    mesh = generate_icosphere(1, target_order)
+
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+            InterpolatoryQuadratureSimplexGroupFactory
+    from pytential.qbx import QBXLayerPotentialSource
+    pre_density_discr = Discretization(
+            cl_ctx, mesh,
+            InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder
+
+    refiner_extra_kwargs = {}
+
+    if helmholtz_k != 0:
+        refiner_extra_kwargs["kernel_length_scale"] = 5 / abs(helmholtz_k)
+
+    qbx, _ = QBXLayerPotentialSource(
+            pre_density_discr, 4*target_order,
+            qbx_order=qbx_order,
+            fmm_level_to_order=SimpleExpansionOrderFinder(fmm_tol),
+            fmm_backend="fmmlib",
+            _expansions_in_tree_have_extent=True,
+            _expansion_stick_out_factor=0.9,
+            _use_target_specific_qbx=False,
+            ).with_refinement(**refiner_extra_kwargs)
+
+    density_discr = qbx.density_discr
+
+    nodes = density_discr.nodes().with_queue(queue)
+    u_dev = clmath.sin(nodes[0])
+
+    if helmholtz_k == 0:
+        kernel = LaplaceKernel(3)
+        kernel_kwargs = {}
+    else:
+        kernel = HelmholtzKernel(3, allow_evanescent=True)
+        kernel_kwargs = {"k": sym.var("k")}
+
+    u_sym = sym.var("u")
+
+    if op == "S":
+        op = sym.S
+    elif op == "D":
+        op = sym.D
+    elif op == "Sp":
+        op = sym.Sp
+    else:
+        raise ValueError("unknown operator: '%s'" % op)
+
+    expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs)
+
+    bound_op = bind(qbx, expr)
+    pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k).get()
+
+    qbx = qbx.copy(_use_target_specific_qbx=True)
+    bound_op = bind(qbx, expr)
+    pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k).get()
+
+    assert np.allclose(pot_tsqbx, pot_ref, atol=1e-13, rtol=1e-13)
+
+
+# You can test individual routines by typing
+# $ python test_target_specific_qbx.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker
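
A minimal usage sketch of the two entry points this patch introduces,
BoundExpression.eval with a timing_data argument and
BoundExpression.get_modeled_performance, follows. It is illustrative only:
cl_ctx and lpot_source are assumed to already exist (for instance, built as in
get_lpot_source in test/test_performance_model.py above).

    # Usage sketch (illustrative): gathering timing data and modeled
    # performance for a single-layer potential. Assumes an existing pyopencl
    # context `cl_ctx` and a QBXLayerPotentialSource `lpot_source`.
    import pyopencl as cl
    import pyopencl.clmath

    from pytential import bind, sym
    from sumpy.kernel import LaplaceKernel

    queue = cl.CommandQueue(cl_ctx)

    # Bind a single-layer potential operator to the geometry.
    sigma_sym = sym.var("sigma")
    k_sym = LaplaceKernel(lpot_source.ambient_dim)
    op = bind(lpot_source, sym.S(k_sym, sigma_sym, qbx_forced_limit=+1))

    # A sample density on the source discretization.
    nodes = lpot_source.density_discr.nodes().with_queue(queue)
    sigma = cl.clmath.sin(10 * nodes[0])

    # Gather actual timing data: one entry per potential-computing instruction.
    timing_data = {}
    op.eval(queue, dict(sigma=sigma), timing_data=timing_data)

    # Evaluate the performance model instead of executing the layer potential.
    modeled_perf = op.get_modeled_performance(queue, sigma=sigma)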