From 0abf04e52a8f980544c8d14f89e5a2f1409134d1 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 26 Apr 2018 20:22:05 -0500 Subject: [PATCH 001/139] Add tsqbx module. --- pytential/qbx/target_specific.pyx | 138 ++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 pytential/qbx/target_specific.pyx diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx new file mode 100644 index 00000000..184ec4d5 --- /dev/null +++ b/pytential/qbx/target_specific.pyx @@ -0,0 +1,138 @@ +#!python +#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + +import numpy as np +import cython +import cython.parallel + +from libc.math cimport sqrt +cimport openmp + + +cdef double legendre(double x, int n, double[] coeffs) nogil: + """Evaluate the Legendre series of order n at x. + + Taken from SciPy. + """ + cdef: + double c0, c1, tmp + int nd, i + + if n == 0: + c0 = coeffs[0] + c1 = 0 + elif n == 1: + c0 = coeffs[0] + c1 = coeffs[1] + else: + nd = n + 1 + c0 = coeffs[n - 1] + c1 = coeffs[n] + + for i in range(3, n + 2): + tmp = c0 + nd = nd - 1 + c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd + c1 = tmp + (c1*x*(2*nd - 1))/n + return c0 + c1*x + + +cdef double dist(double[3] a, double[3] b) nogil: + return sqrt( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2])) + + +cdef double tsqbx_from_source( + double[3] source, + double[3] center, + double[3] target, + int order, + double[] tmp) nogil: + cdef: + int i + double r, sc_d, tc_d + double cos_angle + + tc_d = dist(target, center) + sc_d = dist(source, center) + r = sc_d / tc_d + tmp[0] = 1 / tc_d + + for i in range(1, order + 1): + tmp[i] = tmp[i - 1] * r + + cos_angle = (( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + / (tc_d * sc_d)) + + return legendre(cos_angle, order, tmp) + + +def 
form_target_specific_qbx_contributions( + double[:,:] sources, + double[:,:] targets, + double[:,:] global_qbx_centers, + int[:] target_to_center, + int order, + int[:] qbx_center_to_target_box, + int[:] source_box_starts, int[:] source_box_lists, + int[:] box_source_starts, int[:] box_source_counts_nonchild, + double[:] source_weights, + double[:] pot): + + cdef: + int itgt, icenter + int tgt_box, src_ibox + int isrc_box, isrc_box_start, isrc_box_end + int isrc, isrc_start, isrc_end + int i, tid + double result + double[:,:] source, center, target, tmp + + # Yucky thread-local hack + maxthreads = openmp.omp_get_max_threads() + + source = np.zeros((maxthreads, 3)) + target = np.zeros((maxthreads, 3)) + center = np.zeros((maxthreads, 3)) + tmp = np.zeros((maxthreads, 256)) + + # TODO: Check if order > 256 + + for itgt in cython.parallel.prange(0, targets.shape[1], nogil=True, + schedule="dynamic", chunksize=10): + icenter = target_to_center[itgt] + if icenter == -1: + continue + + result = 0 + + tgt_box = qbx_center_to_target_box[icenter] + + tid = cython.parallel.threadid() + + for i in range(3): + target[tid, i] = targets[itgt, i] + center[tid, i] = global_qbx_centers[icenter, i] + + isrc_box_start = source_box_starts[tgt_box] + isrc_box_end = source_box_starts[tgt_box + 1] + + for isrc_box in range(isrc_box_start, isrc_box_end): + src_ibox = source_box_lists[isrc_box] + isrc_start = box_source_starts[src_ibox] + isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for isrc in range(isrc_start, isrc_end): + for i in range(3): + source[tid, i] = sources[i, isrc] + + result = result + source_weights[isrc] * ( + tsqbx_from_source(&source[tid, 0], &center[tid, 0], + &target[tid, 0], order, &tmp[tid, 0])) + + pot[itgt] = pot[itgt] + result -- GitLab From fe82084331cbbc92b27c60d80584ace981da11d7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 26 Apr 2018 20:27:49 -0500 Subject: [PATCH 002/139] Add extension to setup.py --- setup.py | 14 ++++++++++++++ 1 file
changed, 14 insertions(+) diff --git a/setup.py b/setup.py index 84438494..89dc02d4 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,8 @@ import os from setuptools import setup, find_packages +from setuptools.extension import Extension +from Cython.Build import cythonize # {{{ capture git revision at install time @@ -54,6 +56,16 @@ write_git_revision("pytential") # }}} +ext_modules = [ + Extension( + "pytential.qbx.target_specific", + ["pytential/qbx/target_specific.pyx"], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp'] + ) +] + + version_dict = {} init_filename = "pytential/version.py" os.environ["AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY"] = "1" @@ -91,6 +103,8 @@ setup(name="pytential", packages=find_packages(), + ext_modules = cythonize(ext_modules), + install_requires=[ "pytest>=2.3", # FIXME leave out for now -- GitLab From ad70bb9d714d0fa4a0077c45c3dc5ca503b840bb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 26 Apr 2018 22:41:05 -0500 Subject: [PATCH 003/139] TSQBX working for SLP --- pytential/qbx/__init__.py | 8 ++- pytential/qbx/fmm.py | 20 ++++++- pytential/qbx/fmmlib.py | 53 ++++++++++++++++- pytential/qbx/target_specific.pyx | 99 +++++++++++++++++-------------- setup.py | 2 +- 5 files changed, 128 insertions(+), 54 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 4e03ceca..81900a9b 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,6 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", + _use_tsqbx_list1=False, geometry_data_inspector=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): @@ -179,6 +180,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind + self._use_tsqbx_list1 = _use_tsqbx_list1 self.geometry_data_inspector = 
geometry_data_inspector # /!\ *All* parameters set here must also be set by copy() below, @@ -198,6 +200,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _expansions_in_tree_have_extent=_not_provided, _expansion_stick_out_factor=_not_provided, _tree_kind=None, + _use_tsqbx_list1=_not_provided, geometry_data_inspector=None, debug=_not_provided, @@ -277,6 +280,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, + _use_tsqbx_list1=_use_tsqbx_list1 if _use_tsqbx_list1 is not _not_provided + else self._use_tsqbx_list1, geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), fmm_backend=self.fmm_backend, @@ -694,7 +699,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.qbx_order, self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, - kernel_extra_kwargs=kernel_extra_kwargs) + kernel_extra_kwargs=kernel_extra_kwargs, + _use_target_specific_list1=self._use_tsqbx_list1) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index d3622e59..9c39c9b4 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -90,12 +90,14 @@ class QBXSumpyExpansionWranglerCodeContainer(SumpyExpansionWranglerCodeContainer def get_wrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs={}, - kernel_extra_kwargs=None): + kernel_extra_kwargs=None, + _use_target_specific_list1=False): return QBXExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs) + kernel_extra_kwargs, + _use_target_specific_list1) class QBXExpansionWrangler(SumpyExpansionWrangler): @@ -119,7 +121,11 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), def __init__(self, code_container, queue, geo_data, dtype, qbx_order, 
fmm_level_to_order, - source_extra_kwargs, kernel_extra_kwargs): + source_extra_kwargs, kernel_extra_kwargs, + _use_target_specific_list1=False): + if _use_target_specific_list1: + raise NotImplementedError("Cannot use TSQBX with sumpy yet") + SumpyExpansionWrangler.__init__(self, code_container, queue, geo_data.tree(), dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs) @@ -358,6 +364,11 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), return pot + @log_process(logger) + def eval_target_specific_global_qbx_locals(self, src_weights): + # Not implemented + pass + # }}} # }}} @@ -490,6 +501,9 @@ def drive_fmm(expansion_wrangler, src_weights): qbx_potentials = wrangler.eval_qbx_expansions( qbx_expansions) + qbx_potentials = qbx_potentials + \ + wrangler.eval_target_specific_global_qbx_locals(src_weights) + # }}} # {{{ reorder potentials diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 2b6cbf88..f19d23e6 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -28,6 +28,7 @@ import pyopencl as cl # noqa import pyopencl.array # noqa: F401 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler from sumpy.kernel import LaplaceKernel, HelmholtzKernel +import pytential.qbx.target_specific as target_specific from pytools import log_process @@ -54,12 +55,14 @@ class QBXFMMLibExpansionWranglerCodeContainer(object): def get_wrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs={}, - kernel_extra_kwargs=None): + kernel_extra_kwargs=None, + _use_tsqbx_list1=False): return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs) + kernel_extra_kwargs, + _use_tsqbx_list1) # }}} @@ -128,10 +131,12 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): def __init__(self, code, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, - kernel_extra_kwargs): + kernel_extra_kwargs, + 
_use_target_specific_list1=False): self.code = code self.queue = queue + self._use_target_specific_list1 = _use_target_specific_list1 # FMMLib is CPU-only. This wrapper gets the geometry out of # OpenCL-land. @@ -275,6 +280,13 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): raise ValueError("element '%s' of outputs array not " "understood" % out) + @memoize_method + def _get_single_centers_array(self): + return np.array([ + self.geo_data.centers()[idim] + for idim in range(self.dim) + ], order="F") + # }}} # {{{ override target lists to only hit non-QBX targets @@ -302,6 +314,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) def form_global_qbx_locals(self, src_weights): + if self._use_target_specific_list1: + return self.qbx_local_expansion_zeros() + geo_data = self.geo_data trav = geo_data.traversal() @@ -557,6 +572,38 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): return output + @log_process(logger) + def eval_target_specific_global_qbx_locals(self, src_weights): + if not self._use_target_specific_list1: + return self.full_output_zeros() + + pot = self.full_output_zeros() + geo_data = self.geo_data + trav = geo_data.traversal() + + ctt = geo_data.center_to_tree_targets() + + # TODO: assert this is the Laplace single or double layer kernel + + for output in pot: + target_specific.eval_target_specific_global_qbx_locals( + order=self.qbx_order, + sources=self._get_single_sources_array(), + targets=geo_data.all_targets(), + centers=self._get_single_centers_array(), + global_qbx_centers=geo_data.global_qbx_centers(), + qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), + center_to_target_starts=ctt.starts, + center_to_target_lists=ctt.lists, + source_box_starts=trav.neighbor_source_boxes_starts, + source_box_lists=trav.neighbor_source_boxes_lists, + box_source_starts=self.tree.box_source_starts, + box_source_counts_nonchild=self.tree.box_source_counts_nonchild, + src_weights=src_weights, + 
pot=output) + + return pot + def finalize_potentials(self, potential): potential = super(QBXFMMLibExpansionWrangler, self).finalize_potentials( potential) diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index 184ec4d5..c668703a 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -6,6 +6,8 @@ import cython import cython.parallel from libc.math cimport sqrt +from libc.stdio cimport printf + cimport openmp @@ -17,7 +19,7 @@ cdef double legendre(double x, int n, double[] coeffs) nogil: cdef: double c0, c1, tmp int nd, i - + if n == 0: c0 = coeffs[0] c1 = 0 @@ -28,12 +30,12 @@ cdef double legendre(double x, int n, double[] coeffs) nogil: nd = n + 1 c0 = coeffs[n - 1] c1 = coeffs[n] - + for i in range(3, n + 2): tmp = c0 nd = nd - 1 c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd - c1 = tmp + (c1*x*(2*nd - 1))/n + c1 = tmp + (c1*x*(2*nd - 1))/nd return c0 + c1*x @@ -57,8 +59,8 @@ cdef double tsqbx_from_source( tc_d = dist(target, center) sc_d = dist(source, center) - r = sc_d / tc_d - tmp[0] = 1 / tc_d + r = tc_d / sc_d + tmp[0] = 1 / sc_d for i in range(1, order + 1): tmp[i] = tmp[i - 1] * r @@ -72,67 +74,72 @@ cdef double tsqbx_from_source( return legendre(cos_angle, order, tmp) -def form_target_specific_qbx_contributions( +def eval_target_specific_global_qbx_locals( + int order, double[:,:] sources, double[:,:] targets, - double[:,:] global_qbx_centers, - int[:] target_to_center, - int order, + double[:,:] centers, + int[:] global_qbx_centers, int[:] qbx_center_to_target_box, + int[:] center_to_target_starts, int[:] center_to_target_lists, int[:] source_box_starts, int[:] source_box_lists, int[:] box_source_starts, int[:] box_source_counts_nonchild, - double[:] source_weights, - double[:] pot): + double[:] src_weights, + double complex[:] pot): cdef: - int itgt, icenter + int tgt, ictr, ctr + int itgt, itgt_start, itgt_end int tgt_box, src_ibox int isrc_box, isrc_box_start, isrc_box_end int isrc, 
isrc_start, isrc_end int i, tid - double result + double result double[:,:] source, center, target, tmp # Yucky thread-local hack maxthreads = openmp.omp_get_max_threads() - source = np.zeros((maxthreads, 3)) - target = np.zeros((maxthreads, 3)) - center = np.zeros((maxthreads, 3)) - tmp = np.zeros((maxthreads, 256)) + source = np.zeros((1 + maxthreads, 3)) + target = np.zeros((1 + maxthreads, 3)) + center = np.zeros((1 + maxthreads, 3)) + tmp = np.zeros((1 + maxthreads, 256)) # TODO: Check if order > 256 - for itgt in cython.parallel.prange(0, targets.shape[1], nogil=True, - schedule="dynamic", chunksize=10): - icenter = target_to_center[itgt] - if icenter == -1: - continue + for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0], + nogil=True, schedule="dynamic", + chunksize=10): + ctr = global_qbx_centers[ictr] + itgt_start = center_to_target_starts[ctr] + itgt_end = center_to_target_starts[ctr + 1] + tgt_box = qbx_center_to_target_box[ctr] + tid = cython.parallel.threadid() - result = 0 + for i in range(3): + center[tid, i] = centers[i, ctr] - tgt_box = qbx_center_to_target_box[icenter] + for itgt in range(itgt_start, itgt_end): + result = 0 + tgt = center_to_target_lists[itgt] - tid = cython.parallel.threadid() + for i in range(3): + target[tid, i] = targets[i, tgt] - for i in range(3): - target[tid, i] = targets[itgt, i] - center[tid, i] = global_qbx_centers[icenter, i] - - isrc_box_start = source_box_starts[tgt_box] - isrc_box_end = source_box_starts[tgt_box + 1] - - for isrc_box in range(isrc_box_start, isrc_box_end): - src_ibox = source_box_lists[isrc_box] - isrc_start = box_source_starts[src_ibox] - isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] - - for isrc in range(isrc_start, isrc_end): - for i in range(3): - source[tid, i] = sources[i, isrc] - - result = result + source_weights[isrc] * ( - tsqbx_from_source(&source[tid, 0], &center[tid, 0], - &target[tid, 0], order, &tmp[tid, 0])) - - pot[itgt] = pot[itgt] + result +
isrc_box_start = source_box_starts[tgt_box] + isrc_box_end = source_box_starts[tgt_box + 1] + + for isrc_box in range(isrc_box_start, isrc_box_end): + src_ibox = source_box_lists[isrc_box] + isrc_start = box_source_starts[src_ibox] + isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for isrc in range(isrc_start, isrc_end): + for i in range(3): + source[tid, i] = sources[i, isrc] + + result = result + src_weights[isrc] * ( + tsqbx_from_source(&source[tid, 0], &center[tid, 0], + &target[tid, 0], order, &tmp[tid, 0])) + + pot[tgt] = pot[tgt] + result diff --git a/setup.py b/setup.py index 89dc02d4..a39b21bb 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ setup(name="pytential", packages=find_packages(), - ext_modules = cythonize(ext_modules), + ext_modules = cythonize(ext_modules) install_requires=[ "pytest>=2.3", -- GitLab From ff09893c559d340e12cc28e8627c8754eaa78d4b Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 26 Apr 2018 23:32:41 -0500 Subject: [PATCH 004/139] Checkpoint --- pytential/qbx/fmmlib.py | 4 +- pytential/qbx/target_specific.pyx | 114 ++++++++++++++++++++++-------- setup.py | 2 +- 3 files changed, 88 insertions(+), 32 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index f19d23e6..ae323a24 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -56,13 +56,13 @@ class QBXFMMLibExpansionWranglerCodeContainer(object): qbx_order, fmm_level_to_order, source_extra_kwargs={}, kernel_extra_kwargs=None, - _use_tsqbx_list1=False): + _use_target_specific_list1=False): return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, - _use_tsqbx_list1) + _use_target_specific_list1) # }}} diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index c668703a..c349bb56 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -11,32 +11,47 @@ from libc.stdio cimport
printf cimport openmp -cdef double legendre(double x, int n, double[] coeffs) nogil: - """Evaluate the Legendre series of order n at x. +cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: + """Compute the values of the Legendre polynomial up to order n at x. + Optionally, if derivs is non-NULL, compute the values of the derivative too. - Taken from SciPy. + Borrowed from fmmlib. """ cdef: - double c0, c1, tmp - int nd, i + double pj, derj, pjm2, pjm1, derjm2, derjm1 + int j + + pjm2 = 1 + pjm1 = x + + vals[0] = 1 + if derivs != NULL: + derivs[0] = 0 + derjm2 = 0 + derjm1 = 1 if n == 0: - c0 = coeffs[0] - c1 = 0 - elif n == 1: - c0 = coeffs[0] - c1 = coeffs[1] - else: - nd = n + 1 - c0 = coeffs[n - 1] - c1 = coeffs[n] - - for i in range(3, n + 2): - tmp = c0 - nd = nd - 1 - c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd - c1 = tmp + (c1*x*(2*nd - 1))/nd - return c0 + c1*x + return + + vals[1] = x + if derivs != NULL: + derivs[1] = 1 + + if n == 1: + return + + for j in range(2, n + 1): + pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j + vals[j] = pj + pjm2 = pjm1 + pjm1 = pj + + if derivs != NULL: + derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2 + derj = derj / j + derivs[j] = derj + derjm2 = derjm1 + derjm1 = derj cdef double dist(double[3] a, double[3] b) nogil: @@ -46,24 +61,56 @@ cdef double dist(double[3] a, double[3] b) nogil: (a[2] - b[2]) * (a[2] - b[2])) -cdef double tsqbx_from_source( +""" +cdef void tsqbx_grad_from_source( double[3] source, double[3] center, double[3] target, + double[3] grad, int order, double[] tmp) nogil: cdef: int i - double r, sc_d, tc_d - double cos_angle + double result, r, sc_d, tc_d, cos_angle, alpha + double *derivs + + derivs = &tmp[order + 1] tc_d = dist(target, center) sc_d = dist(source, center) - r = tc_d / sc_d - tmp[0] = 1 / sc_d - for i in range(1, order + 1): - tmp[i] = tmp[i - 1] * r + alpha = ( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - 
center[2]) * (source[2] - center[2]) + + cos_angle = alpha / (tc_d * sc_d) + + legvals(cos_angle, order, tmp, derivs) + + result = 0 + r = 1 / sc_d + + for i in range(0, order + 1): + result = result + tmp[i] * r + r = r * (tc_d / sc_d) + + return result +""" + + +cdef double tsqbx_from_source( + double[3] source, + double[3] center, + double[3] target, + int order, + double[] tmp) nogil: + cdef: + int i + double result, r, sc_d, tc_d, cos_angle + + tc_d = dist(target, center) + sc_d = dist(source, center) cos_angle = (( (target[0] - center[0]) * (source[0] - center[0]) + @@ -71,7 +118,16 @@ cdef double tsqbx_from_source( (target[2] - center[2]) * (source[2] - center[2])) / (tc_d * sc_d)) - return legendre(cos_angle, order, tmp) + legvals(cos_angle, order, tmp, NULL) + + result = 0 + r = 1 / sc_d + + for i in range(0, order + 1): + result = result + tmp[i] * r + r = r * (tc_d / sc_d) + + return result def eval_target_specific_global_qbx_locals( diff --git a/setup.py b/setup.py index a39b21bb..89dc02d4 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ setup(name="pytential", packages=find_packages(), - ext_modules = cythonize(ext_modules) + ext_modules = cythonize(ext_modules), install_requires=[ "pytest>=2.3", -- GitLab From c62e9a3b81e7add42e6cdf829564488b8661ec13 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 27 Apr 2018 01:40:51 -0500 Subject: [PATCH 005/139] Get DLP working. 
--- pytential/qbx/fmmlib.py | 3 +- pytential/qbx/target_specific.pyx | 92 ++++++++++++++-------- test/test_target_specific_qbx.py | 123 ++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 33 deletions(-) create mode 100644 test/test_target_specific_qbx.py diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index ae323a24..43fcbb13 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -599,7 +599,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): source_box_lists=trav.neighbor_source_boxes_lists, box_source_starts=self.tree.box_source_starts, box_source_counts_nonchild=self.tree.box_source_counts_nonchild, - src_weights=src_weights, + dipstr=src_weights, + dipvec=self.dipole_vec, pot=output) return pot diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index c349bb56..cbb5d73b 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -43,8 +43,6 @@ cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: for j in range(2, n + 1): pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j vals[j] = pj - pjm2 = pjm1 - pjm1 = pj if derivs != NULL: derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2 @@ -53,6 +51,9 @@ cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: derjm2 = derjm1 derjm1 = derj + pjm2 = pjm1 + pjm1 = pj + cdef double dist(double[3] a, double[3] b) nogil: return sqrt( @@ -61,20 +62,24 @@ cdef double dist(double[3] a, double[3] b) nogil: (a[2] - b[2]) * (a[2] - b[2])) -""" cdef void tsqbx_grad_from_source( double[3] source, double[3] center, double[3] target, - double[3] grad, - int order, - double[] tmp) nogil: + double[3] grad, + int order) nogil: cdef: - int i - double result, r, sc_d, tc_d, cos_angle, alpha - double *derivs - - derivs = &tmp[order + 1] + int i, j + double result, sc_d, tc_d, cos_angle, alpha, R + double[128] tmp + double[128] derivs + double[3] cms + double[3] tmc + + for j in range(3): + cms[j] = 
center[j] - source[j] + tmc[j] = target[j] - center[j] + grad[j] = 0 tc_d = dist(target, center) sc_d = dist(source, center) @@ -82,32 +87,37 @@ cdef void tsqbx_grad_from_source( alpha = ( (target[0] - center[0]) * (source[0] - center[0]) + (target[1] - center[1]) * (source[1] - center[1]) + - (target[2] - center[2]) * (source[2] - center[2]) + (target[2] - center[2]) * (source[2] - center[2])) cos_angle = alpha / (tc_d * sc_d) legvals(cos_angle, order, tmp, derivs) - result = 0 - r = 1 / sc_d + R = 1 / sc_d for i in range(0, order + 1): - result = result + tmp[i] * r - r = r * (tc_d / sc_d) + # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) + for j in range(3): + grad[j] += (i + 1) * cms[j] / (sc_d ** 2) * R * tmp[i] + for j in range(3): + # Siegel and Tornberg has a sign flip here :( + grad[j] += ( + tmc[j] / (tc_d * sc_d) + + alpha * cms[j] / (tc_d * sc_d ** 3)) * R * derivs[i] + R *= (tc_d / sc_d) - return result -""" + return cdef double tsqbx_from_source( double[3] source, double[3] center, double[3] target, - int order, - double[] tmp) nogil: + int order) nogil: cdef: int i double result, r, sc_d, tc_d, cos_angle + double tmp[128] tc_d = dist(target, center) sc_d = dist(source, center) @@ -124,8 +134,8 @@ cdef double tsqbx_from_source( r = 1 / sc_d for i in range(0, order + 1): - result = result + tmp[i] * r - r = r * (tc_d / sc_d) + result += tmp[i] * r + r *= (tc_d / sc_d) return result @@ -140,7 +150,8 @@ def eval_target_specific_global_qbx_locals( int[:] center_to_target_starts, int[:] center_to_target_lists, int[:] source_box_starts, int[:] source_box_lists, int[:] box_source_starts, int[:] box_source_counts_nonchild, - double[:] src_weights, + double[:] dipstr, + double[:,:] dipvec, double complex[:] pot): cdef: @@ -151,15 +162,22 @@ def eval_target_specific_global_qbx_locals( int isrc, isrc_start, isrc_end int i, tid double result - double[:,:] source, center, target, tmp + double[:,:] source, center, target, grad + int slp, dlp + + slp = (dipstr is not 
None) and (dipvec is None) + dlp = (dipstr is not None) and (dipvec is not None) + + if not (slp or dlp): + raise ValueError("should specify exactly one of src_weights or dipvec") - # Yucky thread-local hack + # Hack to obtain thread-local storage maxthreads = openmp.omp_get_max_threads() - source = np.zeros((1 + maxthreads, 3)) - target = np.zeros((1 + maxthreads, 3)) - center = np.zeros((1 + maxthreads, 3)) - tmp = np.zeros((1 + maxthreads, 256)) + source = np.zeros((maxthreads, 3)) + target = np.zeros((maxthreads, 3)) + center = np.zeros((maxthreads, 3)) + grad = np.zeros((maxthreads, 3)) # TODO: Check if order > 256 @@ -194,8 +212,18 @@ def eval_target_specific_global_qbx_locals( for i in range(3): source[tid, i] = sources[i, isrc] - result = result + src_weights[isrc] * ( - tsqbx_from_source(&source[tid, 0], &center[tid, 0], - &target[tid, 0], order, &tmp[tid, 0])) + if slp: + # Don't replace with +=, since that makes Cython think + # it is a reduction. + result = result + dipstr[isrc] * ( + tsqbx_from_source(&source[tid, 0], &center[tid, 0], + &target[tid, 0], order)) + elif dlp: + tsqbx_grad_from_source(&source[tid, 0], &center[tid, 0], + &target[tid, 0], &grad[tid, 0], order) + result = result + dipstr[isrc] * ( + grad[tid, 0] * dipvec[0, isrc] + + grad[tid, 1] * dipvec[1, isrc] + + grad[tid, 2] * dipvec[2, isrc]) pot[tgt] = pot[tgt] + result diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py new file mode 100644 index 00000000..60a801d4 --- /dev/null +++ b/test/test_target_specific_qbx.py @@ -0,0 +1,123 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2013-2017 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.clmath # noqa +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from functools import partial +from meshmode.mesh.generation import ( # noqa + ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, + NArmedStarfish, + make_curve_mesh) +# from sumpy.visualization import FieldPlotter +from pytential import bind, sym, norm +from sumpy.kernel import LaplaceKernel, HelmholtzKernel + +import logging +logger = logging.getLogger(__name__) + +try: + import matplotlib.pyplot as pt +except ImportError: + pass + + +@pytest.mark.parametrize("op", ["S", "D"]) +def test_target_specific_qbx(ctx_getter, op): + logging.basicConfig(level=logging.INFO) + + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx) + + target_order = 4 + + from meshmode.mesh.generation import generate_icosphere + mesh = generate_icosphere(1, target_order) + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + from pytential.qbx import QBXLayerPotentialSource + pre_density_discr = 
Discretization( + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + qbx, _ = QBXLayerPotentialSource( + pre_density_discr, 4*target_order, + qbx_order=5, + fmm_order=10, + fmm_backend="fmmlib", + _expansions_in_tree_have_extent=True, + _expansion_stick_out_factor=0.9, + ).with_refinement() + + density_discr = qbx.density_discr + + nodes_host = density_discr.nodes().get(queue) + center = np.array([3, 1, 2]) + diff = nodes_host - center[:, np.newaxis] + + dist_squared = np.sum(diff**2, axis=0) + dist = np.sqrt(dist_squared) + u = 1/dist + + u_dev = cl.array.to_device(queue, u) + + kernel = LaplaceKernel(3) + u_sym = sym.var("u") + + if op == "S": + op = sym.S + elif op == "D": + op = sym.D + expr = op(kernel, u_sym, qbx_forced_limit=-1) + + bound_op = bind(qbx, expr) + slp_ref = bound_op(queue, u=u_dev) + + qbx = qbx.copy(_use_tsqbx_list1=True) + bound_op = bind(qbx, expr) + slp_tsqbx = bound_op(queue, u=u_dev) + + assert (np.max(np.abs(slp_ref.get() - slp_tsqbx.get()))) < 1e-13 + + +# You can test individual routines by typing +# $ python test_layer_pot_identity.py 'test_routine()' + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker -- GitLab From 29356b4154d5be017a24c899e8ee341259d9d1db Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 27 Apr 2018 18:53:18 -0500 Subject: [PATCH 006/139] [ci skip] Performance tweaking. 
--- pytential/qbx/target_specific.pyx | 19 +++++++++++-------- setup.py | 4 ++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index cbb5d73b..7cde3a57 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -98,12 +98,12 @@ cdef void tsqbx_grad_from_source( for i in range(0, order + 1): # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) for j in range(3): - grad[j] += (i + 1) * cms[j] / (sc_d ** 2) * R * tmp[i] + grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i] for j in range(3): # Siegel and Tornberg has a sign flip here :( grad[j] += ( tmc[j] / (tc_d * sc_d) + - alpha * cms[j] / (tc_d * sc_d ** 3)) * R * derivs[i] + alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] R *= (tc_d / sc_d) return @@ -168,22 +168,25 @@ def eval_target_specific_global_qbx_locals( slp = (dipstr is not None) and (dipvec is None) dlp = (dipstr is not None) and (dipvec is not None) + print("Hi from Cython") + if not (slp or dlp): raise ValueError("should specify exactly one of src_weights or dipvec") # Hack to obtain thread-local storage maxthreads = openmp.omp_get_max_threads() - source = np.zeros((maxthreads, 3)) - target = np.zeros((maxthreads, 3)) - center = np.zeros((maxthreads, 3)) - grad = np.zeros((maxthreads, 3)) + # Prevent false sharing by over-allocating the buffers + source = np.zeros((maxthreads, 65)) + target = np.zeros((maxthreads, 65)) + center = np.zeros((maxthreads, 65)) + grad = np.zeros((maxthreads, 65)) # TODO: Check if order > 256 for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0], - nogil=True, schedule="dynamic", - chunksize=10): + nogil=True, schedule="static", + chunksize=128): ctr = global_qbx_centers[ictr] itgt_start = center_to_target_starts[ctr] itgt_end = center_to_target_starts[ctr + 1] diff --git a/setup.py b/setup.py index 89dc02d4..59414845 100644 --- a/setup.py +++ b/setup.py @@ -60,8 +60,8 @@ 
ext_modules = [ Extension( "pytential.qbx.target_specific", ["pytential/qbx/target_specific.pyx"], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp'] + extra_compile_args=["-fopenmp"], + extra_link_args=["-fopenmp"] ) ] -- GitLab From 572b9fdc19bf4560b0000f2578ff94fc3065907e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 17 May 2018 17:42:48 -0500 Subject: [PATCH 007/139] Update cost model for TSQBX. --- pytential/qbx/fmm.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 9e4d2402..f199c3ca 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -540,7 +540,8 @@ def drive_fmm(expansion_wrangler, src_weights): def assemble_performance_data(geo_data, uses_pde_expansions, translation_source_power=None, translation_target_power=None, translation_max_power=None, - summarize_parallel=None, merge_close_lists=True): + summarize_parallel=None, merge_close_lists=True, + use_tsqbx=False): """ :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM uses translation operators that make use of the knowledge that the @@ -557,6 +558,9 @@ def assemble_performance_data(geo_data, uses_pde_expansions, * *_neighbor* (List 1) * *_sep_smaller* (List 3 close) * *_sep_bigger* (List 4 close). + :arg use_tsqbx: A :class:`bool` indicating whether to model + List 1/3close/4close interactions that involve QBX centers + using TSQBX. This affects the cost of the *p2qbxl* stage. """ # FIXME: This should suport target filtering. 
@@ -818,7 +822,13 @@ def assemble_performance_data(geo_data, uses_pde_expansions, np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) + # center -> number of associated targets + ntgts = np.zeros(len(global_qbx_centers), dtype=np.intp) + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] + ntgts[itgt_center] = end - start + itgt_box = qbx_center_to_target_box[tgt_icenter] np2qbxl_list1_srcs = 0 @@ -859,15 +869,20 @@ def assemble_performance_data(geo_data, uses_pde_expansions, np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs + if use_tsqbx: + mult = p_qbx + np2qbxl_list1 *= ntgts + np2qbxl_list3 *= ntgts + np2qbxl_list4 *= ntgts + else: + mult = ncoeffs_qbx + if merge_close_lists: - result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx) + result["p2qbxl"] = summarize_parallel(np2qbxl_list1, mult) else: - result["p2qbxl_neighbor"] = ( - summarize_parallel(np2qbxl_list1, ncoeffs_qbx)) - result["p2qbxl_sep_smaller"] = ( - summarize_parallel(np2qbxl_list3, ncoeffs_qbx)) - result["p2qbxl_sep_bigger"] = ( - summarize_parallel(np2qbxl_list4, ncoeffs_qbx)) + result["p2qbxl_neighbor"] = summarize_parallel(np2qbxl_list1, mult) + result["p2qbxl_sep_smaller"] = summarize_parallel(np2qbxl_list3, mult) + result["p2qbxl_sep_bigger"] = summarize_parallel(np2qbxl_list4, mult) process_form_qbxl() -- GitLab From 9ae6ed5c368d83a43138a798e23efde0f5c7dc51 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 25 May 2018 17:51:20 -0500 Subject: [PATCH 008/139] Change the TSQBX API. 
--- pytential/qbx/__init__.py | 12 ++++++------ pytential/qbx/fmm.py | 6 +++--- test/test_target_specific_qbx.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 24d548b9..5dc8a0de 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,7 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", - _use_tsqbx_list1=False, + _tsqbx_kind="", geometry_data_inspector=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): @@ -197,7 +197,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind - self._use_tsqbx_list1 = _use_tsqbx_list1 + self._tsqbx_kind = _tsqbx_kind self.geometry_data_inspector = geometry_data_inspector # /!\ *All* parameters set here must also be set by copy() below, @@ -220,7 +220,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, - _use_tsqbx_list1=_not_provided, + _tsqbx_kind=_not_provided, geometry_data_inspector=None, fmm_backend=None, @@ -303,8 +303,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, - _use_tsqbx_list1=_use_tsqbx_list1 if _use_tsqbx_list1 is not _not_provided - else self._use_tsqbx_list1, + _tsqbx_kind=_tsqbx_kind if _tsqbx_kind is not _not_provided + else self._tsqbx_kind, geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), fmm_backend=fmm_backend or self.fmm_backend, @@ -723,7 +723,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, kernel_extra_kwargs=kernel_extra_kwargs, - 
_use_target_specific_list1=self._use_tsqbx_list1) + _use_target_specific_list1="1" in self._tsqbx_kind) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index f199c3ca..3b452e72 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -541,7 +541,7 @@ def assemble_performance_data(geo_data, uses_pde_expansions, translation_source_power=None, translation_target_power=None, translation_max_power=None, summarize_parallel=None, merge_close_lists=True, - use_tsqbx=False): + use_tsqbx_list1=False): """ :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM uses translation operators that make use of the knowledge that the @@ -558,7 +558,7 @@ def assemble_performance_data(geo_data, uses_pde_expansions, * *_neighbor* (List 1) * *_sep_smaller* (List 3 close) * *_sep_bigger* (List 4 close). - :arg use_tsqbx: A :class:`bool` indicating whether to model + :arg use_tsqbx_list1: A :class:`bool` indicating whether to model List 1/3close/4close interactions that involve QBX centers using TSQBX. This affects the cost of the *p2qbxl* stage. 
""" @@ -869,7 +869,7 @@ def assemble_performance_data(geo_data, uses_pde_expansions, np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs - if use_tsqbx: + if use_tsqbx_list1: mult = p_qbx np2qbxl_list1 *= ntgts np2qbxl_list3 *= ntgts diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 60a801d4..e5f9ea38 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -102,7 +102,7 @@ def test_target_specific_qbx(ctx_getter, op): bound_op = bind(qbx, expr) slp_ref = bound_op(queue, u=u_dev) - qbx = qbx.copy(_use_tsqbx_list1=True) + qbx = qbx.copy(_tsqbx_kind="1") bound_op = bind(qbx, expr) slp_tsqbx = bound_op(queue, u=u_dev) -- GitLab From c374ffbb5ee3ef638d8a47154e15e4e5d92b8440 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 11 Jun 2018 18:53:30 -0500 Subject: [PATCH 009/139] New API for collecting performance model data. This adds a method *get_modeled_performance* to the `BoundExpression` class, which does a performance model evaluation over the expression. This lets us write code such as: op = bind(lpot_source, sym.S(knl, sigma_sym)) perf_model = op.get_modeled_performance(queue, sigma=sigma) A dictionary that is returned maps layer potential instructions to their corresponding modeled costs. The QBXLayerPotentialSource is extended to include a *performance_model* attribute, which is an instance of PerformanceModel. This lets us customize the performance model. Currently, all the non layer potential instructions in the bound expression are still evaluated to make sure that the shapes/data types all are as expected. In principle, we could do symbolic execution of those instructions instead, though that would require a bit of analysis of the expression. 
--- pytential/qbx/__init__.py | 135 ++++++++++++++++++++++++-------- pytential/symbolic/compiler.py | 19 +++-- pytential/symbolic/execution.py | 66 +++++++++++++++- test/test_performance_model.py | 23 +++--- 4 files changed, 186 insertions(+), 57 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index ff4c1ade..672ed56c 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -84,6 +84,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", geometry_data_inspector=None, + performance_model=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): """ @@ -197,6 +198,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind self.geometry_data_inspector = geometry_data_inspector + self.performance_model = performance_model # /!\ *All* parameters set here must also be set by copy() below, # otherwise they will be reset to their default values behind your @@ -219,6 +221,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _tree_kind=None, geometry_data_inspector=None, + performance_model=_not_provided, fmm_backend=None, debug=_not_provided, @@ -302,6 +305,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _tree_kind=_tree_kind or self._tree_kind, geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), + performance_model=( + # None is a valid value here + performance_model + if performance_model is not _not_provided + else self.performance_model), fmm_backend=fmm_backend or self.fmm_backend, **kwargs) @@ -589,8 +597,32 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ internal functionality for execution def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + if self.fmm_level_to_order is False: + func = self.exec_compute_potential_insn + else: + func = 
self.exec_compute_potential_insn_fmm + return self._dispatch_compute_potential_insn( + queue, insn, bound_expr, evaluate, func) + + def perf_model_compute_potential_insn(self, queue, insn, bound_expr, + evaluate, costs): + if self.fmm_level_to_order is False: + raise NotImplementedError("perf modeling direct evaluations") + return self._dispatch_compute_potential_insn( + queue, insn, bound_expr, evaluate, + self.perf_model_compute_potential_insn_fmm, + costs=costs) + + def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, + evaluate, func, **extra_args): from pytools.obj_array import with_object_array_or_scalar + if not self._refined_for_global_qbx: + from warnings import warn + warn( + "Executing global QBX without refinement. " + "This is unlikely to work.") + def oversample_nonscalars(vec): from numbers import Number if isinstance(vec, Number): @@ -598,22 +630,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: return self.resampler(queue, vec) - if not self._refined_for_global_qbx: - from warnings import warn - warn( - "Executing global QBX without refinement. " - "This is unlikely to work.") - def evaluate_wrapper(expr): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - if self.fmm_level_to_order is False: - func = self.exec_compute_potential_insn_direct - else: - func = self.exec_compute_potential_insn_fmm - - return func(queue, insn, bound_expr, evaluate_wrapper) + return func(queue, insn, bound_expr, evaluate_wrapper, **extra_args) @property @memoize_method @@ -657,18 +678,19 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: raise ValueError("invalid FMM backend: %s" % self.fmm_backend) - def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate): - # {{{ build list of unique target discretizations used - + def get_target_discrs_and_qbx_sides(self, insn, bound_expr): + """Build the list of unique target discretizations used by the + provided instruction. 
+ """ # map (name, qbx_side) to number in list - tgt_name_and_side_to_number = {} + target_name_and_side_to_number = {} # list of tuples (discr, qbx_side) target_discrs_and_qbx_sides = [] for o in insn.outputs: key = (o.target_name, o.qbx_forced_limit) - if key not in tgt_name_and_side_to_number: - tgt_name_and_side_to_number[key] = \ + if key not in target_name_and_side_to_number: + target_name_and_side_to_number[key] = \ len(target_discrs_and_qbx_sides) target_discr = bound_expr.places[o.target_name] @@ -682,13 +704,63 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): target_discrs_and_qbx_sides.append( (target_discr, qbx_forced_limit)) - target_discrs_and_qbx_sides = tuple(target_discrs_and_qbx_sides) + return target_name_and_side_to_number, tuple(target_discrs_and_qbx_sides) - # }}} + # {{{ execute fmm performance model + + def perf_model_compute_potential_insn_fmm(self, queue, insn, bound_expr, + evaluate, costs): + target_name_and_side_to_number, target_discrs_and_qbx_sides = ( + self.get_target_discrs_and_qbx_sides(insn, bound_expr)) geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides) - # geo_data.plot() + if self.performance_model is None: + from pytential.qbx.performance import PerformanceModel + performance_model = PerformanceModel() + else: + performance_model = self.performance_model + + costs.update(performance_model(geo_data)) + + # {{{ construct dummy outputs + + strengths = (evaluate(insn.density).with_queue(queue) + * self.weights_and_area_elements()) + out_kernels = tuple(knl for knl in insn.kernels) + fmm_kernel = self.get_fmm_kernel(out_kernels) + output_and_expansion_dtype = ( + self.get_fmm_output_and_expansion_dtype(fmm_kernel, strengths)) + + result = [] + + for o in insn.outputs: + target_side_number = target_name_and_side_to_number[ + o.target_name, o.qbx_forced_limit] + start, end = geo_data.target_info().target_discr_starts[ + target_side_number:target_side_number+2] + + output_array = cl.array.zeros( + queue, + 
end - start, + dtype=output_and_expansion_dtype) + + result.append((o.name, output_array)) + + new_futures = [] + return result, new_futures + + # }}} + + # }}} + + # {{{ execute fmm + + def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate): + target_name_and_side_to_number, target_discrs_and_qbx_sides = ( + self.get_target_discrs_and_qbx_sides(insn, bound_expr)) + + geo_data = self.qbx_fmm_geometry_data(target_discrs_and_qbx_sides) # FIXME Exert more positive control over geo_data attribute lifetimes using # geo_data..clear_cache(geo_data). @@ -701,7 +773,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): strengths = (evaluate(insn.density).with_queue(queue) * self.weights_and_area_elements()) - out_kernels = tuple(knl for knl in insn.kernels) fmm_kernel = self.get_fmm_kernel(out_kernels) output_and_expansion_dtype = ( @@ -724,7 +795,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): == target_state.FAILED).any().get(): raise RuntimeError("geometry has failed targets") - # {{{ performance data hook + # {{{ geometry data inspection hook if self.geometry_data_inspector is not None: perform_fmm = self.geometry_data_inspector(insn, bound_expr, geo_data) @@ -736,23 +807,25 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX from pytential.qbx.fmm import drive_fmm - all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) + all_potentials_on_every_target = drive_fmm(wrangler, strengths) # }}} result = [] for o in insn.outputs: - tgt_side_number = tgt_name_and_side_to_number[ + target_side_number = target_name_and_side_to_number[ o.target_name, o.qbx_forced_limit] - tgt_slice = slice(*geo_data.target_info().target_discr_starts[ - tgt_side_number:tgt_side_number+2]) + target_slice = slice(*geo_data.target_info().target_discr_starts[ + target_side_number:target_side_number+2]) - result.append( - (o.name, - all_potentials_on_every_tgt[o.kernel_index][tgt_slice])) + result.append((o.name, + 
all_potentials_on_every_target[o.kernel_index][target_slice])) - return result, [] + new_futures = [] + return result, new_futures + + # }}} # }}} diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index 17d23ebf..ddc46538 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -47,9 +47,6 @@ class Instruction(Record): def __str__(self): raise NotImplementedError - def get_exec_function(self, exec_mapper): - raise NotImplementedError - class Assign(Instruction): # attributes: names, exprs, do_not_return, priority @@ -111,9 +108,6 @@ class Assign(Instruction): lines.append("}") return "\n".join(lines) - def get_exec_function(self, exec_mapper): - return exec_mapper.exec_assign - def __hash__(self): return id(self) @@ -223,9 +217,6 @@ class ComputePotentialInstruction(Instruction): return "{ /* Pot(%s) */\n %s\n}" % ( ", ".join(args), "\n ".join(lines)) - def get_exec_function(self, exec_mapper): - source = exec_mapper.bound_expr.places[self.source] - return source.exec_compute_potential_insn def __hash__(self): return id(self) @@ -362,6 +353,14 @@ class Code(object): return argmax2(available_insns), discardable_vars + @staticmethod + def get_exec_function(insn, exec_mapper): + if isinstance(insn, Assign): + return exec_mapper.exec_assign + if isinstance(insn, ComputePotentialInstruction): + return exec_mapper.exec_compute_potential_insn + raise ValueError("unknown instruction class: %s" % type(insn)) + def execute(self, exec_mapper, pre_assign_check=None): """Execute the instruction stream, make all scheduling decisions dynamically. 
@@ -391,7 +390,7 @@ class Code(object): done_insns.add(insn) assignments, new_futures = ( - insn.get_exec_function(exec_mapper) + self.get_exec_function(insn, exec_mapper) (exec_mapper.queue, insn, exec_mapper.bound_expr, exec_mapper)) diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index f658a6aa..4b1dbccb 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -26,7 +26,7 @@ import six from six.moves import zip from pymbolic.mapper.evaluator import ( - EvaluationMapper as EvaluationMapperBase) + EvaluationMapper as PymbolicEvaluationMapper) import numpy as np import pyopencl as cl @@ -42,11 +42,13 @@ from pytools import memoize_in # {{{ evaluation mapper -class EvaluationMapper(EvaluationMapperBase): - def __init__(self, bound_expr, queue, context={}, +class EvaluationMapperBase(PymbolicEvaluationMapper): + def __init__(self, bound_expr, queue, context=None, target_geometry=None, target_points=None, target_normals=None, target_tangents=None): - EvaluationMapperBase.__init__(self, context) + if context is None: + context = {} + PymbolicEvaluationMapper.__init__(self, context) self.bound_expr = bound_expr self.queue = queue @@ -175,6 +177,9 @@ class EvaluationMapper(EvaluationMapperBase): return [(name, evaluate(expr)) for name, expr in zip(insn.names, insn.exprs)], [] + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + raise NotImplementedError + # {{{ functions def apply_real(self, args): @@ -221,6 +226,54 @@ class EvaluationMapper(EvaluationMapperBase): # }}} +# {{{ evaluation mapper + +class EvaluationMapper(EvaluationMapperBase): + + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + source = bound_expr.places[insn.source] + return source.exec_compute_potential_insn(queue, insn, bound_expr, evaluate) + +# }}} + + +# {{{ performance model mapper + +class PerformanceModelMapper(EvaluationMapperBase): + """Mapper for evaluating performance models. 
+ + This executes everything *except* the layer potential operator. Instead of + executing the operator, the performance model gets run and the performance + data is collected. + """ + + def __init__(self, bound_expr, queue, context=None, + target_geometry=None, + target_points=None, target_normals=None, target_tangents=None): + if context is None: + context = {} + EvaluationMapperBase.__init__( + self, bound_expr, queue, context, + target_geometry, + target_points, + target_normals, + target_tangents) + self.modeled_performance = {} + + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + source = bound_expr.places[insn.source] + costs = {} + result = source.perf_model_compute_potential_insn( + queue, insn, bound_expr, evaluate, costs) + self.modeled_performance[insn] = costs + return result + + def get_modeled_performance(self): + return self.modeled_performance + +# }}} + + # {{{ scipy-like mat-vec op class MatVecOp: @@ -327,6 +380,11 @@ class BoundExpression: return discr + def get_modeled_performance(self, queue, **args): + perf_model_mapper = PerformanceModelMapper(self, queue, args) + self.code.execute(perf_model_mapper) + return perf_model_mapper.get_modeled_performance() + def scipy_op(self, queue, arg_name, dtype, domains=None, **extra_args): """ :arg domains: a list of discretization identifiers or diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 90a87d1c..d1b2fc3d 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -92,15 +92,6 @@ def test_performance_model(ctx_getter, dim): # {{{ run performance model - costs = {} - - def inspect_geo_data(insn, bound_expr, geo_data): - from pytential.qbx.performance import assemble_performance_data - costs["costs"] = assemble_performance_data( - geo_data, uses_pde_expansions=True, merge_close_lists=False) - return False - - lpot_source = lpot_source.copy(geometry_data_inspector=inspect_geo_data) density_discr = 
lpot_source.density_discr nodes = density_discr.nodes().with_queue(queue) sigma = cl.clmath.sin(10 * nodes[0]) @@ -108,10 +99,18 @@ def test_performance_model(ctx_getter, dim): from sumpy.kernel import LaplaceKernel sigma_sym = sym.var("sigma") k_sym = LaplaceKernel(lpot_source.ambient_dim) - sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - bound_op = bind(lpot_source, sym_op) - bound_op(queue, sigma=sigma) + sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + op_S = bind(lpot_source, sym_op_S) + perf_S = op_S.get_modeled_performance(queue, sigma=sigma) + assert len(perf_S) == 1 + + sym_op_S_plus_D = ( + sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + sym.D(k_sym, sigma_sym)) + op_S_plus_D = bind(lpot_source, sym_op_S_plus_D) + perf_S_plus_D = op_S_plus_D.get_modeled_performance(queue, sigma=sigma) + assert len(perf_S_plus_D) == 2 # }}} -- GitLab From acddba6a8a0c258c9e38ae6273ff14a985ba8742 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 11 Jun 2018 19:01:16 -0500 Subject: [PATCH 010/139] Add perf_model_compute_potential_insn as an abstract method to PotentialSource. --- pytential/source.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytential/source.py b/pytential/source.py index 6420494e..edb737ba 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -109,6 +109,10 @@ class PointPotentialSource(PotentialSource): return p2p + def perf_model_compute_potential_insn(self, queue, insn, bound_expr, + evaluate, costs): + raise NotImplementedError + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): p2p = None @@ -174,6 +178,7 @@ class LayerPotentialSourceBase(PotentialSource): .. rubric:: Execution .. method:: weights_and_area_elements + .. method:: perf_model_compute_potential_insn .. 
method:: exec_compute_potential_insn """ -- GitLab From c4fe38ab26f05da155f8354887bc3331ab9c45c4 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 11 Jun 2018 19:48:58 -0500 Subject: [PATCH 011/139] Fix wrong function name for doing direct eval --- pytential/qbx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 672ed56c..741e564c 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -598,7 +598,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): if self.fmm_level_to_order is False: - func = self.exec_compute_potential_insn + func = self.exec_compute_potential_insn_direct else: func = self.exec_compute_potential_insn_fmm return self._dispatch_compute_potential_insn( -- GitLab From 58af2e59bc26e5c079a85764e2005753fe8dd0b0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 11 Jun 2018 19:49:45 -0500 Subject: [PATCH 012/139] flake8 fix --- pytential/symbolic/compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index ddc46538..34d7bc6f 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -217,7 +217,6 @@ class ComputePotentialInstruction(Instruction): return "{ /* Pot(%s) */\n %s\n}" % ( ", ".join(args), "\n ".join(lines)) - def __hash__(self): return id(self) -- GitLab From 74d27274295f467f6b5d37964e334450492f5e35 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 27 Jun 2018 20:40:01 -0500 Subject: [PATCH 013/139] Revert "Revert "Merge branch 'master' of https://gitlab.tiker.net/inducer/pytential"" This reverts commit 7db97cd548c3d9929064d1bd0a80e7cf3e1d8d22. 
--- .gitlab-ci.yml | 40 +-- README.rst | 2 +- pytential/qbx/fmm.py | 393 --------------------- pytential/qbx/performance.py | 585 ++++++++++++++++++++++++++++++++ test/test_layer_pot.py | 73 +--- test/test_layer_pot_identity.py | 40 ++- test/test_performance_model.py | 131 +++++++ 7 files changed, 759 insertions(+), 505 deletions(-) create mode 100644 pytential/qbx/performance.py create mode 100644 test/test_performance_model.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 750bf6f4..6b6cf7a7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,12 +1,19 @@ -Python 3.5 POCL: +# Environment variables +# +# * PYTEST_ADDOPTS is used to filter test runs. The default value is "-k-slowtest", +# which skips the slow running tests. +# * SKIP_EXAMPLES, if non-empty, can be used to skip the examples job. + +Python 2.7 POCL: script: - - export PY_EXE=python3.5 + - export PY_EXE=python2.7 - export PYOPENCL_TEST=portable + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export EXTRA_INSTALL="numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - - python3.5 + - python2.7 - pocl - large-node except: @@ -16,6 +23,7 @@ Python 3.6 POCL: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export EXTRA_INSTALL="numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". 
./build-and-test-py-project.sh" @@ -28,6 +36,7 @@ Python 3.6 POCL: Python 3.6 POCL Examples: script: + - test -n "$SKIP_EXAMPLES" && exit - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako pyvisfile matplotlib" @@ -43,8 +52,9 @@ Python 3.6 POCL Examples: Python 3.5 Conda: script: - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine - - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CONDA_ENVIRONMENT=.test-conda-env-py3.yml + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: @@ -53,27 +63,13 @@ Python 3.5 Conda: except: - tags -Python 2.7 POCL: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - pocl - - large-node - except: - - tags - Python 3.5 Conda Apple: script: - export LC_ALL=en_US.UTF-8 - export LANG=en_US.UTF-8 - - export PYTEST_ADDOPTS=-k-slowtest - - CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml - - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml + - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} + - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". 
./build-and-test-py-project-within-miniconda.sh" tags: diff --git a/README.rst b/README.rst index bababc0a..8b354a93 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ It relies on * `boxtree `_ for FMM tree building * `sumpy `_ for expansions and analytical routines * `modepy `_ for modes and nodes on simplices -* `meshmode `_ for modes and nodes on simplices +* `meshmode `_ for high order discretizations * `loopy `_ for fast array operations * `pytest `_ for automated testing diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 4be9e5ca..4cfd0c81 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -521,397 +521,4 @@ def drive_fmm(expansion_wrangler, src_weights): # }}} -# {{{ performance data - -def assemble_performance_data(geo_data, uses_pde_expansions, - translation_source_power=None, translation_target_power=None, - translation_max_power=None, - summarize_parallel=None, merge_close_lists=True): - """ - :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM - uses translation operators that make use of the knowledge that the - potential satisfies a PDE. - :arg summarize_parallel: a function of two arguments - *(parallel_array, sym_multipliers)* used to process an array of - workloads of 'parallelizable units'. By default, all workloads are - summed into one number encompassing the total workload. - :arg merge_close_lists: A :class:`bool` indicating whether or not all - boxes requiring direct evaluation should be merged into a single - interaction list. If *False*, *part_direct* and *p2qbxl* will be - suffixed with the originating list as follows: - - * *_neighbor* (List 1) - * *_sep_smaller* (List 3 close) - * *_sep_bigger* (List 4 close). - """ - - # FIXME: This should suport target filtering. 
- - if summarize_parallel is None: - def summarize_parallel(parallel_array, sym_multipliers): - return np.sum(parallel_array) * sym_multipliers - - from collections import OrderedDict - result = OrderedDict() - - from pymbolic import var - p_fmm = var("p_fmm") - p_qbx = var("p_qbx") - - nqbtl = geo_data.non_qbx_box_target_lists() - - with cl.CommandQueue(geo_data.cl_context) as queue: - tree = geo_data.tree().get(queue=queue) - traversal = geo_data.traversal(merge_close_lists).get(queue=queue) - box_target_counts_nonchild = ( - nqbtl.box_target_counts_nonchild.get(queue=queue)) - - d = tree.dimensions - if uses_pde_expansions: - ncoeffs_fmm = p_fmm ** (d-1) - ncoeffs_qbx = p_qbx ** (d-1) - - if d == 2: - default_translation_source_power = 1 - default_translation_target_power = 1 - default_translation_max_power = 0 - - elif d == 3: - # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. - default_translation_source_power = 0 - default_translation_target_power = 0 - default_translation_max_power = 3 - - else: - raise ValueError("Don't know how to estimate expansion complexities " - "for dimension %d" % d) - - else: - ncoeffs_fmm = p_fmm ** d - ncoeffs_qbx = p_qbx ** d - default_translation_source_power = d - default_translation_target_power = d - - if translation_source_power is None: - translation_source_power = default_translation_source_power - if translation_target_power is None: - translation_target_power = default_translation_target_power - if translation_max_power is None: - translation_max_power = default_translation_max_power - - def xlat_cost(p_source, p_target): - from pymbolic.primitives import Max - return ( - p_source ** translation_source_power - * p_target ** translation_target_power - * Max((p_source, p_target)) ** translation_max_power - ) - - result.update( - nlevels=tree.nlevels, - nboxes=tree.nboxes, - nsources=tree.nsources, - ntargets=tree.ntargets) - - # {{{ construct local multipoles - - result["form_mp"] = tree.nsources*ncoeffs_fmm - - 
# }}} - - # {{{ propagate multipoles upward - - result["prop_upward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm) - - # }}} - - # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) - - def process_direct(): - # box -> nsources * ntargets - npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] - - npart_direct_list1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list1_srcs += nsources - - npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs - - if merge_close_lists: - continue - - npart_direct_list3_srcs = 0 - - # Could be None, if not using targets with extent. - if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list3_srcs += nsources - - npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs - - npart_direct_list4_srcs = 0 - - # Could be None, if not using targets with extent. 
- if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list4_srcs += nsources - - npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs - - if merge_close_lists: - result["part_direct"] = summarize_parallel(npart_direct_list1, 1) - else: - result["part_direct_neighbor"] = ( - summarize_parallel(npart_direct_list1, 1)) - result["part_direct_sep_smaller"] = ( - summarize_parallel(npart_direct_list3, 1)) - result["part_direct_sep_bigger"] = ( - summarize_parallel(npart_direct_list4, 1)) - - process_direct() - - # }}} - - # {{{ translate separated siblings' ("list 2") mpoles to local - - def process_list2(): - nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): - start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] - - nm2l[itgt_box] += end-start - - result["m2l"] = summarize_parallel(nm2l, xlat_cost(p_fmm, p_fmm)) - - process_list2() - - # }}} - - # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles - - def process_list3(): - nmp_eval = np.zeros( - (tree.nlevels, len(traversal.target_boxes)), - dtype=np.intp) - - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) - - for ilevel, sep_smaller_list in enumerate( - traversal.from_sep_smaller_by_level): - for itgt_box, tgt_ibox in enumerate( - traversal.target_boxes_sep_smaller_by_source_level[ilevel]): - ntargets = box_target_counts_nonchild[tgt_ibox] - start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] - nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = ( - ntargets * (end-start) - ) - - result["mp_eval"] = summarize_parallel(nmp_eval, ncoeffs_fmm) - - process_list3() - - # }}} - - # {{{ form locals for separated bigger source boxes ("list 4") - - def process_list4(): - nform_local = np.zeros( - len(traversal.target_or_target_parent_boxes), - dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): - start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] - - nform_local_box = 0 - for src_ibox in traversal.from_sep_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - nform_local_box += nsources - - nform_local[itgt_box] = nform_local_box - - result["form_local"] = summarize_parallel(nform_local, ncoeffs_fmm) - - process_list4() - - # }}} - - # {{{ propagate local_exps downward - - result["prop_downward"] = tree.nboxes * xlat_cost(p_fmm, p_fmm) - - # }}} - - # {{{ evaluate locals - - result["eval_part"] = tree.ntargets * ncoeffs_fmm - - # }}} - - # {{{ form global qbx locals - - global_qbx_centers = geo_data.global_qbx_centers() - - # If merge_close_lists is False above, then this builds another traversal - # (which is OK). 
- qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - center_to_targets_starts = geo_data.center_to_tree_targets().starts - qbx_center_to_target_box_source_level = np.empty( - (tree.nlevels,), dtype=object - ) - - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - geo_data.qbx_center_to_target_box_source_level(src_level) - ) - - with cl.CommandQueue(geo_data.cl_context) as queue: - global_qbx_centers = global_qbx_centers.get( - queue=queue) - qbx_center_to_target_box = qbx_center_to_target_box.get( - queue=queue) - center_to_targets_starts = center_to_targets_starts.get( - queue=queue) - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - qbx_center_to_target_box_source_level[src_level].get(queue=queue) - ) - - def process_form_qbxl(): - ncenters = geo_data.ncenters - - result["ncenters"] = ncenters - - # center -> nsources - np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - itgt_box = qbx_center_to_target_box[tgt_icenter] - - np2qbxl_list1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list1_srcs += nsources - - np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs - - if merge_close_lists: - continue - - np2qbxl_list3_srcs = 0 - - # Could be None, if not using targets with extent. 
- if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list3_srcs += nsources - - np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs - - np2qbxl_list4_srcs = 0 - - # Could be None, if not using targets with extent. - if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list4_srcs += nsources - - np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs - - if merge_close_lists: - result["p2qbxl"] = summarize_parallel(np2qbxl_list1, ncoeffs_qbx) - else: - result["p2qbxl_neighbor"] = ( - summarize_parallel(np2qbxl_list1, ncoeffs_qbx)) - result["p2qbxl_sep_smaller"] = ( - summarize_parallel(np2qbxl_list3, ncoeffs_qbx)) - result["p2qbxl_sep_bigger"] = ( - summarize_parallel(np2qbxl_list4, ncoeffs_qbx)) - - process_form_qbxl() - - # }}} - - # {{{ translate from list 3 multipoles to qbx local expansions - - def process_m2qbxl(): - nm2qbxl = np.zeros( - (tree.nlevels, len(global_qbx_centers)), - dtype=np.intp) - - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) - - for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - icontaining_tgt_box = qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] - - if icontaining_tgt_box == -1: - continue - - start, stop = ( - ssn.starts[icontaining_tgt_box], - ssn.starts[icontaining_tgt_box+1]) - - nm2qbxl[isrc_level, itgt_center] += stop-start - - result["m2qbxl"] = summarize_parallel(nm2qbxl, xlat_cost(p_fmm, p_qbx)) - - process_m2qbxl() - - # }}} - - # {{{ translate from box local expansions to qbx local 
expansions - - result["l2qbxl"] = geo_data.ncenters * xlat_cost(p_fmm, p_qbx) - - # }}} - - # {{{ evaluate qbx local expansions - - def process_eval_qbxl(): - nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp) - - for isrc_center, src_icenter in enumerate(global_qbx_centers): - start, end = center_to_targets_starts[src_icenter:src_icenter+2] - nqbx_eval[isrc_center] += end-start - - result["qbxl2p"] = summarize_parallel(nqbx_eval, ncoeffs_qbx) - - process_eval_qbxl() - - # }}} - - return result - -# }}} - # vim: foldmethod=marker diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py new file mode 100644 index 00000000..424fbed5 --- /dev/null +++ b/pytential/qbx/performance.py @@ -0,0 +1,585 @@ +from __future__ import division, absolute_import + +__copyright__ = """ +Copyright (C) 2013 Andreas Kloeckner +Copyright (C) 2018 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from six.moves import range +import numpy as np # noqa +import pyopencl as cl # noqa +import pyopencl.array # noqa + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. autoclass:: PerformanceModel +.. autofunction:: assemble_performance_data +""" + + +# {{{ translation cost model + +class TranslationCostModel(object): + """Provides modeled costs for individual translations or evaluations.""" + + def __init__(self, p_qbx, p_fmm, ncoeffs_qbx, ncoeffs_fmm, + translation_source_power, translation_target_power, + translation_max_power): + self.p_qbx = p_qbx + self.p_fmm = p_fmm + self.ncoeffs_qbx = ncoeffs_qbx + self.ncoeffs_fmm = ncoeffs_fmm + self.translation_source_power = translation_source_power + self.translation_target_power = translation_target_power + self.translation_max_power = translation_max_power + + def direct(self): + return 1 + + def p2qbxl(self): + return self.ncoeffs_qbx + + qbxl2p = p2qbxl + + def p2l(self): + return self.ncoeffs_fmm + + l2p = p2l + p2m = p2l + m2p = p2l + + def m2m(self): + return self.e2e_cost(self.p_fmm, self.p_fmm) + + l2l = m2m + m2l = m2m + + def m2qbxl(self): + return self.e2e_cost(self.p_fmm, self.p_qbx) + + l2qbxl = m2qbxl + + def e2e_cost(self, p_source, p_target): + from pymbolic.primitives import Max + return ( + p_source ** self.translation_source_power + * p_target ** self.translation_target_power + * Max((p_source, p_target)) ** self.translation_max_power) + +# }}} + + +# {{{ performance model + +class PerformanceModel(object): + + def __init__(self, + uses_pde_expansions=True, + translation_source_power=None, + translation_target_power=None, + translation_max_power=None, + summarize_parallel=None, + merge_close_lists=True): + """ + :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM + uses translation operators that make use of the knowledge that the + potential satisfies a PDE. 
+ :arg summarize_parallel: a function of two arguments + *(parallel_array, sym_multipliers)* used to process an array of + workloads of 'parallelizable units'. By default, all workloads are + summed into one number encompassing the total workload. + :arg merge_close_lists: A :class:`bool` indicating whether or not all + boxes requiring direct evaluation should be merged into a single + interaction list. If *False*, *part_direct* and *p2qbxl* will be + suffixed with the originating list as follows: + + * *_neighbor* (List 1) + * *_sep_smaller* (List 3 close) + * *_sep_bigger* (List 4 close). + """ + self.uses_pde_expansions = uses_pde_expansions + self.translation_source_power = translation_source_power + self.translation_target_power = translation_target_power + self.translation_max_power = translation_max_power + if summarize_parallel is None: + summarize_parallel = self.summarize_parallel_default + self.summarize_parallel = summarize_parallel + self.merge_close_lists = merge_close_lists + + @staticmethod + def summarize_parallel_default(parallel_array, sym_multipliers): + return np.sum(parallel_array) * sym_multipliers + + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + def process_direct(self, xlat_cost, traversal, tree, box_target_counts_nonchild): + # box -> nsources * ntargets + npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp) + npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp) + npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): + ntargets = box_target_counts_nonchild[tgt_ibox] + + npart_direct_list1_srcs = 0 + start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + npart_direct_list1_srcs += nsources + + npart_direct_list1[itgt_box] = ntargets * 
npart_direct_list1_srcs + + if self.merge_close_lists: + continue + + npart_direct_list3_srcs = 0 + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + npart_direct_list3_srcs += nsources + + npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs + + npart_direct_list4_srcs = 0 + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + npart_direct_list4_srcs += nsources + + npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs + + result = {} + if self.merge_close_lists: + result["part_direct"] = ( + self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) + else: + result["part_direct_neighbor"] = ( + self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) + result["part_direct_sep_smaller"] = ( + self.summarize_parallel(npart_direct_list3, xlat_cost.direct())) + result["part_direct_sep_bigger"] = ( + self.summarize_parallel(npart_direct_list4, xlat_cost.direct())) + + return result + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + def process_list2(self, xlat_cost, traversal): + nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] + + nm2l[itgt_box] += end-start + + return dict(m2l=self.summarize_parallel(nm2l, xlat_cost.m2l())) + + # }}} + + # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles + + def process_list3(self, xlat_cost, traversal, tree, box_target_counts_nonchild): + nmp_eval = np.zeros( + (tree.nlevels, len(traversal.target_boxes)), + dtype=np.intp) + + assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + + for ilevel, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): + for itgt_box, tgt_ibox in enumerate( + traversal.target_boxes_sep_smaller_by_source_level[ilevel]): + ntargets = box_target_counts_nonchild[tgt_ibox] + start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] + nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = ( + ntargets * (end-start) + ) + + return dict( + mp_eval=self.summarize_parallel(nmp_eval, xlat_cost.m2p())) + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + def process_list4(self, xlat_cost, traversal, tree): + nform_local = np.zeros( + len(traversal.target_or_target_parent_boxes), + dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] + + nform_local_box = 0 + for src_ibox in traversal.from_sep_bigger_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + nform_local_box += nsources + + nform_local[itgt_box] = nform_local_box + + return dict(form_local=( + self.summarize_parallel(nform_local, xlat_cost.p2l()))) + + # }}} + + # {{{ form global qbx locals + + def process_form_qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box): + + # center -> nsources + np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) + + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + itgt_box = qbx_center_to_target_box[tgt_icenter] + + np2qbxl_list1_srcs = 0 + start, end = 
traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + np2qbxl_list1_srcs += nsources + + np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs + + if self.merge_close_lists: + continue + + np2qbxl_list3_srcs = 0 + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + np2qbxl_list3_srcs += nsources + + np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs + + np2qbxl_list4_srcs = 0 + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + + np2qbxl_list4_srcs += nsources + + np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs + + result = {} + if self.merge_close_lists: + result["p2qbxl"] = ( + self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) + else: + result["p2qbxl_neighbor"] = ( + self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) + result["p2qbxl_sep_smaller"] = ( + self.summarize_parallel(np2qbxl_list3, xlat_cost.p2qbxl())) + result["p2qbxl_sep_bigger"] = ( + self.summarize_parallel(np2qbxl_list4, xlat_cost.p2qbxl())) + + return result + + # }}} + + # {{{ translate from list 3 multipoles to qbx local expansions + + def process_m2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box_source_level): + nm2qbxl = np.zeros( + (tree.nlevels, len(global_qbx_centers)), + dtype=np.intp) + + assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + + for isrc_level, ssn in 
enumerate(traversal.from_sep_smaller_by_level): + + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + icontaining_tgt_box = qbx_center_to_target_box_source_level[ + isrc_level][tgt_icenter] + + if icontaining_tgt_box == -1: + continue + + start, stop = ( + ssn.starts[icontaining_tgt_box], + ssn.starts[icontaining_tgt_box+1]) + + nm2qbxl[isrc_level, itgt_center] += stop-start + + return dict(m2qbxl=self.summarize_parallel(nm2qbxl, xlat_cost.m2qbxl())) + + # }}} + + # {{{ evaluate qbx local expansions + + def process_eval_qbxl(self, xlat_cost, global_qbx_centers, + center_to_targets_starts): + nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp) + + for isrc_center, src_icenter in enumerate(global_qbx_centers): + start, end = center_to_targets_starts[src_icenter:src_icenter+2] + nqbx_eval[isrc_center] += end-start + + return dict(qbxl2p=self.summarize_parallel(nqbx_eval, xlat_cost.qbxl2p())) + + # }}} + + # {{{ set up translation cost model + + def get_translation_cost_model(self, d): + from pymbolic import var + p_qbx = var("p_qbx") + p_fmm = var("p_fmm") + + if self.uses_pde_expansions: + ncoeffs_fmm = p_fmm ** (d-1) + ncoeffs_qbx = p_qbx ** (d-1) + + if d == 2: + default_translation_source_power = 1 + default_translation_target_power = 1 + default_translation_max_power = 0 + + elif d == 3: + # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. 
+ default_translation_source_power = 0 + default_translation_target_power = 0 + default_translation_max_power = 3 + + else: + raise ValueError("Don't know how to estimate expansion complexities " + "for dimension %d" % d) + + else: + ncoeffs_fmm = p_fmm ** d + ncoeffs_qbx = p_qbx ** d + default_translation_source_power = d + default_translation_target_power = d + + translation_source_power = ( + default_translation_source_power + if self.translation_source_power is None + else self.translation_source_power) + + translation_target_power = ( + default_translation_target_power + if self.translation_target_power is None + else self.translation_target_power) + + translation_max_power = ( + default_translation_max_power + if self.translation_max_power is None + else self.translation_max_power) + + return TranslationCostModel( + p_qbx=p_qbx, + p_fmm=p_fmm, + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm=ncoeffs_fmm, + translation_source_power=translation_source_power, + translation_target_power=translation_target_power, + translation_max_power=translation_max_power) + + # }}} + + def __call__(self, geo_data): + # FIXME: This should suport target filtering. 
+ + from collections import OrderedDict + result = OrderedDict() + + nqbtl = geo_data.non_qbx_box_target_lists() + + with cl.CommandQueue(geo_data.cl_context) as queue: + tree = geo_data.tree().get(queue=queue) + traversal = geo_data.traversal(self.merge_close_lists).get(queue=queue) + box_target_counts_nonchild = ( + nqbtl.box_target_counts_nonchild.get(queue=queue)) + + result.update( + nlevels=tree.nlevels, + nboxes=tree.nboxes, + nsources=tree.nsources, + ntargets=tree.ntargets, + ncenters=geo_data.ncenters) + + xlat_cost = self.get_translation_cost_model(tree.dimensions) + + # {{{ construct local multipoles + + result["form_mp"] = tree.nsources * xlat_cost.p2m() + + # }}} + + # {{{ propagate multipoles upward + + result["prop_upward"] = tree.nboxes * xlat_cost.m2m() + + # }}} + + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + result.update(self.process_direct( + xlat_cost, traversal, tree, box_target_counts_nonchild)) + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + result.update(self.process_list2(xlat_cost, traversal)) + + # }}} + + # {{{ evaluate sep. smaller mpoles ("list 3") at particles + + result.update(self.process_list3( + xlat_cost, traversal, tree, box_target_counts_nonchild)) + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + result.update(self.process_list4(xlat_cost, traversal, tree)) + + # }}} + + # {{{ propagate local_exps downward + + result["prop_downward"] = tree.nboxes * xlat_cost.l2l() + + # }}} + + # {{{ evaluate locals + + result["eval_part"] = tree.ntargets * xlat_cost.l2p() + + # }}} + + global_qbx_centers = geo_data.global_qbx_centers() + + # If self.merge_close_lists is False, then this builds another traversal + # (which is OK). 
+ qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + center_to_targets_starts = geo_data.center_to_tree_targets().starts + qbx_center_to_target_box_source_level = np.empty( + (tree.nlevels,), dtype=object) + + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + geo_data.qbx_center_to_target_box_source_level(src_level)) + + with cl.CommandQueue(geo_data.cl_context) as queue: + global_qbx_centers = global_qbx_centers.get( + queue=queue) + qbx_center_to_target_box = qbx_center_to_target_box.get( + queue=queue) + center_to_targets_starts = center_to_targets_starts.get( + queue=queue) + for src_level in range(tree.nlevels): + qbx_center_to_target_box_source_level[src_level] = ( + qbx_center_to_target_box_source_level[src_level] + .get(queue=queue)) + + # {{{ form global qbx locals + + result.update(self.process_form_qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box)) + + # }}} + + # {{{ translate from list 3 multipoles to qbx local expansions + + result.update(self.process_m2qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box_source_level)) + + # }}} + + # {{{ translate from box local expansions to qbx local expansions + + result["l2qbxl"] = geo_data.ncenters * xlat_cost.l2qbxl() + + # }}} + + # {{{ evaluate qbx local expansions + + result.update(self.process_eval_qbxl( + xlat_cost, global_qbx_centers, center_to_targets_starts)) + + # }}} + + return result + +# }}} + + +# {{{ assemble_performance_data + +def assemble_performance_data(geo_data, uses_pde_expansions, + translation_source_power=None, translation_target_power=None, + translation_max_power=None, + summarize_parallel=None, merge_close_lists=True): + """Compute modeled performance using :class:`PerformanceModel`. + + See :class:`PerformanceModel` for parameter documentation. 
+ """ + + return PerformanceModel( + uses_pde_expansions, + translation_source_power, + translation_target_power, + translation_max_power, + summarize_parallel, + merge_close_lists)(geo_data) + +# }}} + +# vim: foldmethod=marker diff --git a/test/test_layer_pot.py b/test/test_layer_pot.py index 66445fee..810a50ed 100644 --- a/test/test_layer_pot.py +++ b/test/test_layer_pot.py @@ -183,7 +183,7 @@ def test_off_surface_eval_vs_direct(ctx_getter, do_plot=False): target_association_tolerance=0.05, ).with_refinement() - fplot = FieldPlotter(np.zeros(2), extent=5, npoints=1000) + fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500) from pytential.target import PointsTarget ptarget = PointsTarget(fplot.points) from sumpy.kernel import LaplaceKernel @@ -315,77 +315,6 @@ def test_unregularized_off_surface_fmm_vs_direct(ctx_getter): # }}} -# {{{ test performance data gathering - -def test_perf_data_gathering(ctx_getter, n_arms=5): - cl_ctx = ctx_getter() - queue = cl.CommandQueue(cl_ctx) - - # prevent cache 'splosion - from sympy.core.cache import clear_cache - clear_cache() - - target_order = 8 - - starfish_func = NArmedStarfish(n_arms, 0.8) - mesh = make_curve_mesh( - starfish_func, - np.linspace(0, 1, n_arms * 30), - target_order) - - sigma_sym = sym.var("sigma") - - # The kernel doesn't really matter here - from sumpy.kernel import LaplaceKernel - k_sym = LaplaceKernel(mesh.ambient_dim) - - sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - - from meshmode.discretization import Discretization - from meshmode.discretization.poly_element import ( - InterpolatoryQuadratureSimplexGroupFactory) - pre_density_discr = Discretization( - queue.context, mesh, - InterpolatoryQuadratureSimplexGroupFactory(target_order)) - - results = [] - - def inspect_geo_data(insn, bound_expr, geo_data): - from pytential.qbx.fmm import assemble_performance_data - perf_data = assemble_performance_data(geo_data, uses_pde_expansions=True) - results.append(perf_data) - - return False # no 
need to do the actual FMM - - from pytential.qbx import QBXLayerPotentialSource - lpot_source = QBXLayerPotentialSource( - pre_density_discr, 4*target_order, - # qbx order and fmm order don't really matter - 10, fmm_order=10, - _expansions_in_tree_have_extent=True, - _expansion_stick_out_factor=0.5, - geometry_data_inspector=inspect_geo_data, - target_association_tolerance=1e-10, - ) - - lpot_source, _ = lpot_source.with_refinement() - - density_discr = lpot_source.density_discr - - if 0: - from meshmode.discretization.visualization import draw_curve - draw_curve(density_discr) - import matplotlib.pyplot as plt - plt.show() - - nodes = density_discr.nodes().with_queue(queue) - sigma = cl.clmath.sin(10 * nodes[0]) - - bind(lpot_source, sym_op)(queue, sigma=sigma) - -# }}} - - # {{{ test 3D jump relations @pytest.mark.parametrize("relation", ["sp", "nxcurls", "div_s"]) diff --git a/test/test_layer_pot_identity.py b/test/test_layer_pot_identity.py index 942b6ed2..49369e2f 100644 --- a/test/test_layer_pot_identity.py +++ b/test/test_layer_pot_identity.py @@ -233,12 +233,12 @@ class DynamicTestCase(object): if (self.geometry.mesh_name == "sphere" and self.k != 0 and self.fmm_backend == "sumpy"): - pytest.skip("both direct eval and generating the FMM kernels " + raise ValueError("both direct eval and generating the FMM kernels " "are too slow") if (self.geometry.mesh_name == "sphere" and self.expr.zero_op_name == "green_grad"): - pytest.skip("does not achieve sufficient precision") + raise ValueError("does not achieve sufficient precision") if self.fmm_backend == "fmmlib": pytest.importorskip("pyfmmlib") @@ -246,22 +246,28 @@ class DynamicTestCase(object): # {{{ integral identity tester + +@pytest.mark.slowtest +@pytest.mark.parametrize("case", [ + DynamicTestCase(SphereGeometry(), GreenExpr(), 0), +]) +def test_identity_convergence_slow(ctx_getter, case): + test_identity_convergence(ctx_getter, case) + + @pytest.mark.parametrize("case", [ - tc - for geom in [ - 
StarfishGeometry(), - SphereGeometry(), - ] - for tc in [ - DynamicTestCase(geom, GreenExpr(), 0), - DynamicTestCase(geom, GreenExpr(), 1.2), - DynamicTestCase(geom, GradGreenExpr(), 0), - DynamicTestCase(geom, GradGreenExpr(), 1.2), - DynamicTestCase(geom, ZeroCalderonExpr(), 0), - - DynamicTestCase(geom, GreenExpr(), 0, fmm_backend="fmmlib"), - DynamicTestCase(geom, GreenExpr(), 1.2, fmm_backend="fmmlib"), - ]]) + # 2d + DynamicTestCase(StarfishGeometry(), GreenExpr(), 0), + DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2), + DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 0), + DynamicTestCase(StarfishGeometry(), GradGreenExpr(), 1.2), + DynamicTestCase(StarfishGeometry(), ZeroCalderonExpr(), 0), + DynamicTestCase(StarfishGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"), + DynamicTestCase(StarfishGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib"), + # 3d + DynamicTestCase(SphereGeometry(), GreenExpr(), 0, fmm_backend="fmmlib"), + DynamicTestCase(SphereGeometry(), GreenExpr(), 1.2, fmm_backend="fmmlib") +]) def test_identity_convergence(ctx_getter, case, visualize=False): logging.basicConfig(level=logging.INFO) diff --git a/test/test_performance_model.py b/test/test_performance_model.py new file mode 100644 index 00000000..90a87d1c --- /dev/null +++ b/test/test_performance_model.py @@ -0,0 +1,131 @@ +from __future__ import division, print_function + +__copyright__ = "Copyright (C) 2018 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial 
portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import numpy.linalg as la # noqa +import pyopencl as cl +import pyopencl.clmath # noqa +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from pytential import bind, sym, norm # noqa + + +# {{{ global params + +TARGET_ORDER = 8 +OVSMP_FACTOR = 5 +TCF = 0.9 +QBX_ORDER = 5 +FMM_ORDER = 10 + +DEFAULT_LPOT_KWARGS = { + "_box_extent_norm": "l2", + "_from_sep_smaller_crit": "static_l2", + } + +# }}} + + +@pytest.mark.parametrize("dim", (2, 3)) +def test_performance_model(ctx_getter, dim): + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx) + + # {{{ get lpot source + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) + + target_order = TARGET_ORDER + + if dim == 2: + from meshmode.mesh.generation import starfish, make_curve_mesh + mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order) + elif dim == 3: + from meshmode.mesh.generation import generate_icosphere + mesh = generate_icosphere(r=1, order=target_order) + else: + raise ValueError("unknown dimension: %d" % dim) + + pre_density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() + lpot_kwargs.update( + _expansion_stick_out_factor=TCF, + fmm_order=FMM_ORDER, qbx_order=QBX_ORDER + ) + + from pytential.qbx import 
QBXLayerPotentialSource + lpot_source = QBXLayerPotentialSource( + pre_density_discr, OVSMP_FACTOR*target_order, + **lpot_kwargs) + + lpot_source, _ = lpot_source.with_refinement() + + # }}} + + # {{{ run performance model + + costs = {} + + def inspect_geo_data(insn, bound_expr, geo_data): + from pytential.qbx.performance import assemble_performance_data + costs["costs"] = assemble_performance_data( + geo_data, uses_pde_expansions=True, merge_close_lists=False) + return False + + lpot_source = lpot_source.copy(geometry_data_inspector=inspect_geo_data) + density_discr = lpot_source.density_discr + nodes = density_discr.nodes().with_queue(queue) + sigma = cl.clmath.sin(10 * nodes[0]) + + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + bound_op = bind(lpot_source, sym_op) + bound_op(queue, sigma=sigma) + + # }}} + + +# You can test individual routines by typing +# $ python test_performance_model.py 'test_routine()' + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + + +# vim: foldmethod=marker -- GitLab From c5e86c0582d17d2192dc18f744528463db8ccf25 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 27 Jun 2018 20:41:14 -0500 Subject: [PATCH 014/139] s/try_find_centers/find_centers --- pytential/qbx/target_assoc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/target_assoc.py b/pytential/qbx/target_assoc.py index 11b70c4b..5273b72b 100644 --- a/pytential/qbx/target_assoc.py +++ b/pytential/qbx/target_assoc.py @@ -528,9 +528,9 @@ class TargetAssociationWrangler(TreeWranglerBase): return (found_target_close_to_panel == 1).all().get() @log_process(logger) - def try_find_centers(self, tree, peer_lists, lpot_source, - target_status, target_flags, target_assoc, - target_association_tolerance, debug, wait_for=None): 
+ def find_centers(self, tree, peer_lists, lpot_source, + target_status, target_flags, target_assoc, + target_association_tolerance, debug, wait_for=None): # Round up level count--this gets included in the kernel as # a stack bound. Rounding avoids too many kernel versions. from pytools import div_ceil @@ -750,7 +750,7 @@ def associate_targets_to_qbx_centers(lpot_source, wrangler, target_flags = wrangler.make_target_flags(target_discrs_and_qbx_sides) - wrangler.try_find_centers(tree, peer_lists, lpot_source, target_status, + wrangler.find_centers(tree, peer_lists, lpot_source, target_status, target_flags, target_assoc, target_association_tolerance, debug) center_not_found = ( -- GitLab From b269e84c5555982f1f94fe17e653011f923fae5c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 12 Jul 2018 12:19:42 -0500 Subject: [PATCH 015/139] Expose timing data gathering to the user via BoundExpression.exec(). This depends on !4 --- pytential/qbx/__init__.py | 9 ++++-- pytential/source.py | 4 ++- pytential/symbolic/execution.py | 25 +++++++++++++-- pytential/unregularized.py | 10 ++++-- test/test_performance_model.py | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 10 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index ec583733..e9bf6772 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -835,7 +835,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX from pytential.qbx.fmm import drive_fmm - all_potentials_on_every_target = drive_fmm(wrangler, strengths) + timing_data = {} + all_potentials_on_every_target = drive_fmm(wrangler, strengths, timing_data) # }}} @@ -851,7 +852,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): all_potentials_on_every_target[o.kernel_index][target_slice])) new_futures = [] - return result, new_futures + return result, new_futures, timing_data # }}} @@ -1014,7 +1015,9 @@ class 
QBXLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + timing_data = {} + new_futures = [] + return result, new_futures, timing_data # }}} diff --git a/pytential/source.py b/pytential/source.py index edb737ba..9ffa66c9 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -138,7 +138,9 @@ class PointPotentialSource(PotentialSource): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + timing_data = {} + new_futures = [] + return result, timing_data, [] @memoize_method def weights_and_area_elements(self): diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 4b1dbccb..a74d2cf8 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -230,9 +230,22 @@ class EvaluationMapperBase(PymbolicEvaluationMapper): class EvaluationMapper(EvaluationMapperBase): + def __init__(self, bound_expr, queue, context=None, + timing_data=None): + EvaluationMapperBase.__init__(self, bound_expr, queue, context) + self.timing_data = timing_data + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): source = bound_expr.places[insn.source] - return source.exec_compute_potential_insn(queue, insn, bound_expr, evaluate) + + result, futures, timing_data = ( + source.exec_compute_potential_insn( + queue, insn, bound_expr, evaluate)) + + if self.timing_data is not None: + self.timing_data[insn] = timing_data + + return (result, futures) # }}} @@ -424,10 +437,16 @@ class BoundExpression: return MatVecOp(self, queue, arg_name, dtype, total_dofs, starts_and_ends, extra_args) - def __call__(self, queue, **args): - exec_mapper = EvaluationMapper(self, queue, args) + def exec(self, queue, context=None, timing_data=None): + if context is None: + context = {} + exec_mapper = EvaluationMapper( + self, queue, context, timing_data=timing_data) return self.code.execute(exec_mapper) + def __call__(self, queue, 
**args): + return self.exec(queue, args) + # }}} diff --git a/pytential/unregularized.py b/pytential/unregularized.py index a2fe4ad0..c7e97545 100644 --- a/pytential/unregularized.py +++ b/pytential/unregularized.py @@ -187,7 +187,9 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_for_each_kernel[o.kernel_index])) - return result, [] + timing_data = {} + new_futures = [] + return result, new_futures, timing_data # {{{ fmm-based execution @@ -270,8 +272,9 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): # }}} from boxtree.fmm import drive_fmm + timing_data = {} all_potentials_on_every_tgt = drive_fmm( - geo_data.traversal(), wrangler, strengths) + geo_data.traversal(), wrangler, strengths, timing_data) # {{{ postprocess fmm @@ -288,7 +291,8 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): # }}} - return result, [] + new_futures = [] + return result, new_futures, timing_data # }}} diff --git a/test/test_performance_model.py b/test/test_performance_model.py index d1b2fc3d..1968fc8e 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -49,6 +49,61 @@ DEFAULT_LPOT_KWARGS = { # }}} +# {{{ test_timing_data_gathering + +def test_timing_data_gathering(ctx_getter): + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) + + target_order = TARGET_ORDER + + from meshmode.mesh.generation import starfish, make_curve_mesh + mesh = make_curve_mesh(starfish, np.linspace(0, 1, 1000), order=target_order) + + pre_density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() + lpot_kwargs.update( + _expansion_stick_out_factor=TCF, + fmm_order=FMM_ORDER, 
qbx_order=QBX_ORDER, + fmm_backend="fmmlib", + ) + + from pytential.qbx import QBXLayerPotentialSource + lpot_source = QBXLayerPotentialSource( + pre_density_discr, OVSMP_FACTOR*target_order, + **lpot_kwargs) + + lpot_source, _ = lpot_source.with_refinement() + + density_discr = lpot_source.density_discr + nodes = density_discr.nodes().with_queue(queue) + sigma = cl.clmath.sin(10 * nodes[0]) + + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + + sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + op_S = bind(lpot_source, sym_op_S) + + timing_data = {} + op_S.exec(queue, dict(sigma=sigma), timing_data=timing_data) + assert timing_data + print(timing_data) + +# }}} + + +# {{{ test_performance_model + @pytest.mark.parametrize("dim", (2, 3)) def test_performance_model(ctx_getter, dim): cl_ctx = ctx_getter() @@ -114,6 +169,8 @@ def test_performance_model(ctx_getter, dim): # }}} +# }}} + # You can test individual routines by typing # $ python test_performance_model.py 'test_routine()' -- GitLab From 3f69adf7966fe53061db90732c8e5400d718cdf2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 12 Jul 2018 12:24:31 -0500 Subject: [PATCH 016/139] flake8 fix --- pytential/source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/source.py b/pytential/source.py index 9ffa66c9..487c73e1 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -140,7 +140,7 @@ class PointPotentialSource(PotentialSource): timing_data = {} new_futures = [] - return result, timing_data, [] + return result, new_futures, timing_data @memoize_method def weights_and_area_elements(self): -- GitLab From f2bc5633233f83c65b67352089758b4facd9d387 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 12 Jul 2018 14:14:17 -0500 Subject: [PATCH 017/139] Change exec() to eval() for Py2.7 compatibility --- pytential/symbolic/execution.py | 4 ++-- test/test_performance_model.py | 2 +- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index a74d2cf8..9a5711a4 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -437,7 +437,7 @@ class BoundExpression: return MatVecOp(self, queue, arg_name, dtype, total_dofs, starts_and_ends, extra_args) - def exec(self, queue, context=None, timing_data=None): + def eval(self, queue, context=None, timing_data=None): if context is None: context = {} exec_mapper = EvaluationMapper( @@ -445,7 +445,7 @@ class BoundExpression: return self.code.execute(exec_mapper) def __call__(self, queue, **args): - return self.exec(queue, args) + return self.eval(queue, args) # }}} diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 1968fc8e..7243972a 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -95,7 +95,7 @@ def test_timing_data_gathering(ctx_getter): op_S = bind(lpot_source, sym_op_S) timing_data = {} - op_S.exec(queue, dict(sigma=sigma), timing_data=timing_data) + op_S.eval(queue, dict(sigma=sigma), timing_data=timing_data) assert timing_data print(timing_data) -- GitLab From 9e1a1ef2e1eb6f0e8b950d73261d6c1e96d8ef7e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 13 Jul 2018 13:28:39 -0500 Subject: [PATCH 018/139] Skip if fmmlib import fails --- test/test_performance_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 7243972a..f6f9f53d 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -52,6 +52,8 @@ DEFAULT_LPOT_KWARGS = { # {{{ test_timing_data_gathering def test_timing_data_gathering(ctx_getter): + pytest.importorskip("pyfmmlib") + cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) -- GitLab From 57ab9cc29e77477f679031eefaa1eae6bd0a2ca8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 13 Jul 
2018 19:15:48 -0500 Subject: [PATCH 019/139] Initial version of performance calibration code. --- examples/performance.py | 166 +++++++++++++++++ pytential/qbx/__init__.py | 15 +- pytential/qbx/performance.py | 313 +++++++++++++++++++++++--------- pytential/symbolic/execution.py | 10 +- 4 files changed, 408 insertions(+), 96 deletions(-) create mode 100644 examples/performance.py diff --git a/examples/performance.py b/examples/performance.py new file mode 100644 index 00000000..2a2f2ea7 --- /dev/null +++ b/examples/performance.py @@ -0,0 +1,166 @@ +"""Trains a performance model and reports on the accuracy.""" + +import pyopencl as cl +import numpy as np + +from pytential import sym, bind + +# {{{ global params + +TARGET_ORDER = 8 +OVSMP_FACTOR = 5 +TCF = 0.9 +QBX_ORDER = 5 +FMM_ORDER = 10 +RUNS = 3 + +DEFAULT_LPOT_KWARGS = { + "_box_extent_norm": "l2", + "_from_sep_smaller_crit": "static_l2", + } + +PANELS_PER_ARM = 30 +TRAINING_ARMS = (10, 15, 25) +TESTING_ARMS = (20,) + + +def starfish_lpot_source(queue, n_arms): + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) + + from meshmode.mesh.generation import make_curve_mesh, NArmedStarfish + + mesh = make_curve_mesh( + NArmedStarfish(n_arms, 0.8), + np.linspace(0, 1, 1 + PANELS_PER_ARM * n_arms), + TARGET_ORDER) + + pre_density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(TARGET_ORDER)) + + lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() + lpot_kwargs.update( + target_association_tolerance=0.025, + _expansion_stick_out_factor=TCF, + fmm_order=FMM_ORDER, qbx_order=QBX_ORDER, + fmm_backend="fmmlib" + ) + + from pytential.qbx import QBXLayerPotentialSource + lpot_source = QBXLayerPotentialSource( + pre_density_discr, OVSMP_FACTOR * TARGET_ORDER, + **lpot_kwargs) + + lpot_source, _ = lpot_source.with_refinement() + + return lpot_source + +# }}} + + +def training_geometries(queue): + 
for n_arms in TRAINING_ARMS: + yield starfish_lpot_source(queue, n_arms) + + +def test_geometries(queue): + for n_arms in TESTING_ARMS: + yield starfish_lpot_source(queue, n_arms) + + +def get_bound_op(lpot_source): + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + return bind(lpot_source, op) + + +def get_test_density(queue, lpot_source): + density_discr = lpot_source.density_discr + nodes = density_discr.nodes().with_queue(queue) + sigma = cl.clmath.sin(10 * nodes[0]) + + return sigma + + +def train_performance_model(ctx): + queue = cl.CommandQueue(ctx) + + from pytential.qbx.performance import ( + PerformanceModel, estimate_calibration_params) + + perf_model = PerformanceModel() + + model_results = [] + timing_results = [] + + for lpot_source in training_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + + # Warm-up run. 
+ bound_op.eval(queue, {"sigma": sigma}) + + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + + model_results.append(next(iter(perf_S.values()))) + timing_results.append(next(iter(timing_data.values()))) + + calibration_params = ( + estimate_calibration_params(model_results, timing_results)) + + return perf_model.with_calibration_params(calibration_params) + + +def test_performance_model(ctx, perf_model): + queue = cl.CommandQueue(ctx) + + for lpot_source in test_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + model_result = ( + next(iter(perf_S.values())) + .get_predicted_times(merge_close_lists=True)) + + # Warm-up run. + bound_op.eval(queue, {"sigma": sigma}) + + temp_timing_results = [] + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + temp_timing_results.append(next(iter(timing_data.values()))) + + timing_result = {} + for param in model_result: + timing_result[param] = ( + sum(temp_timing_result[param].process_elapsed + for temp_timing_result in temp_timing_results)) / RUNS + + print("=" * 20) + for stage in model_result: + print("stage: ", stage) + print("actual: ", timing_result[stage]) + print("predicated: ", model_result[stage]) + print("=" * 20) + + +def predict_performance(ctx): + model = train_performance_model(ctx) + test_performance_model(ctx, model) + + +if __name__ == "__main__": + predict_performance(cl.create_some_context(0)) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index e9bf6772..9553c9c2 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -633,16 +633,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): queue, insn, bound_expr, evaluate, func) def 
perf_model_compute_potential_insn(self, queue, insn, bound_expr, - evaluate, costs): + evaluate): if self.fmm_level_to_order is False: raise NotImplementedError("perf modeling direct evaluations") return self._dispatch_compute_potential_insn( queue, insn, bound_expr, evaluate, - self.perf_model_compute_potential_insn_fmm, - costs=costs) + self.perf_model_compute_potential_insn_fmm) def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, - evaluate, func, **extra_args): + evaluate, func): from pytools.obj_array import with_object_array_or_scalar if not self._refined_for_global_qbx: @@ -662,7 +661,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - return func(queue, insn, bound_expr, evaluate_wrapper, **extra_args) + return func(queue, insn, bound_expr, evaluate_wrapper) @property @memoize_method @@ -737,7 +736,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute fmm performance model def perf_model_compute_potential_insn_fmm(self, queue, insn, bound_expr, - evaluate, costs): + evaluate): target_name_and_side_to_number, target_discrs_and_qbx_sides = ( self.get_target_discrs_and_qbx_sides(insn, bound_expr)) @@ -749,7 +748,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: performance_model = self.performance_model - costs.update(performance_model(geo_data)) + performance_model_result = performance_model(self, geo_data) # {{{ construct dummy outputs @@ -776,7 +775,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_array)) new_futures = [] - return result, new_futures + return result, new_futures, performance_model_result # }}} diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 424fbed5..cb0b66eb 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -30,6 +30,11 @@ from six.moves import range import numpy as np # noqa import 
pyopencl as cl # noqa import pyopencl.array # noqa +import sympy as sp + +from collections import OrderedDict +from collections.abc import MutableMapping +from pymbolic import var import logging @@ -38,7 +43,9 @@ logger = logging.getLogger(__name__) __doc__ = """ .. autoclass:: PerformanceModel -.. autofunction:: assemble_performance_data +.. autoclass:: PerformanceModelResult + +.. autofunction:: estimate_calibration_params """ @@ -59,30 +66,40 @@ class TranslationCostModel(object): self.translation_max_power = translation_max_power def direct(self): - return 1 + return var("c_p2p") def p2qbxl(self): - return self.ncoeffs_qbx + return var("c_p2qbxl") * self.ncoeffs_qbx - qbxl2p = p2qbxl + def qbxl2p(self): + return var("c_qbxl2p") * self.ncoeffs_qbx def p2l(self): - return self.ncoeffs_fmm + return var("c_p2l") * self.ncoeffs_fmm + + def l2p(self): + return var("c_l2p") * self.ncoeffs_fmm + + def p2m(self): + return var("c_p2m") * self.ncoeffs_fmm - l2p = p2l - p2m = p2l - m2p = p2l + def m2p(self): + return var("c_m2p") * self.ncoeffs_fmm def m2m(self): - return self.e2e_cost(self.p_fmm, self.p_fmm) + return var("c_m2m") * self.e2e_cost(self.p_fmm, self.p_fmm) - l2l = m2m - m2l = m2m + def l2l(self): + return var("c_l2l") * self.e2e_cost(self.p_fmm, self.p_fmm) + + def m2l(self): + return var("c_m2l") * self.e2e_cost(self.p_fmm, self.p_fmm) def m2qbxl(self): - return self.e2e_cost(self.p_fmm, self.p_qbx) + return var("c_m2qbxl") * self.e2e_cost(self.p_fmm, self.p_qbx) - l2qbxl = m2qbxl + def l2qbxl(self): + return var("c_l2qbxl") * self.e2e_cost(self.p_fmm, self.p_qbx) def e2e_cost(self, p_source, p_target): from pymbolic.primitives import Max @@ -94,6 +111,69 @@ class TranslationCostModel(object): # }}} +# {{{ performance model result + +class PerformanceModelResult(MutableMapping): + """A container for holding performance model results. 
+ """ + + def __init__(self, perf_model_result, params): + self.perf_model_result = OrderedDict(perf_model_result) + self.params = params + + def with_params(self, params): + new_params = self.params.copy() + new_params.update(params) + return type(self)( + perf_model_result=self.perf_model_result.copy(), + params=new_params) + + def copy(self): + return self.with_params({}) + + def __getitem__(self, val): + return self.perf_model_result.__getitem__(val) + + def __setitem__(self, key, val): + return self.perf_model_result.__setitem__(key, val) + + def __delitem__(self, key): + return self.perf_model_result.__delitem__(key) + + def __iter__(self): + return self.perf_model_result.__iter__() + + def __len__(self): + return self.perf_model_result.__len__() + + def __str__(self): + return self.perf_model_result.__str__() + + def __repr__(self): + return self.perf_model_result.__repr__() + + def get_predicted_times(self, merge_close_lists=False): + from pymbolic import evaluate + from functools import partial + + get_time = partial(evaluate, context=self.params) + + result = OrderedDict() + + for name, val in self.perf_model_result.items(): + if merge_close_lists: + for suffix in ("_list1", "_list3", "_list4"): + if name.endswith(suffix): + name = name[:-len(suffix)] + break + + result[name] = get_time(val) + result.get(name, 0) + + return result + +# }}} + + # {{{ performance model class PerformanceModel(object): @@ -104,7 +184,7 @@ class PerformanceModel(object): translation_target_power=None, translation_max_power=None, summarize_parallel=None, - merge_close_lists=True): + calibration_params=None): """ :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM uses translation operators that make use of the knowledge that the @@ -113,14 +193,6 @@ class PerformanceModel(object): *(parallel_array, sym_multipliers)* used to process an array of workloads of 'parallelizable units'. 
By default, all workloads are summed into one number encompassing the total workload. - :arg merge_close_lists: A :class:`bool` indicating whether or not all - boxes requiring direct evaluation should be merged into a single - interaction list. If *False*, *part_direct* and *p2qbxl* will be - suffixed with the originating list as follows: - - * *_neighbor* (List 1) - * *_sep_smaller* (List 3 close) - * *_sep_bigger* (List 4 close). """ self.uses_pde_expansions = uses_pde_expansions self.translation_source_power = translation_source_power @@ -129,7 +201,18 @@ class PerformanceModel(object): if summarize_parallel is None: summarize_parallel = self.summarize_parallel_default self.summarize_parallel = summarize_parallel - self.merge_close_lists = merge_close_lists + if calibration_params is None: + calibration_params = dict() + self.calibration_params = calibration_params + + def with_calibration_params(self, calibration_params): + return type(self)( + uses_pde_expansions=self.uses_pde_expansions, + translation_source_power=self.translation_source_power, + translation_target_power=self.translation_target_power, + translation_max_power=self.translation_max_power, + summarize_parallel=self.summarize_parallel, + calibration_params=calibration_params) @staticmethod def summarize_parallel_default(parallel_array, sym_multipliers): @@ -155,9 +238,6 @@ class PerformanceModel(object): npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs - if self.merge_close_lists: - continue - npart_direct_list3_srcs = 0 # Could be None, if not using targets with extent. 
@@ -185,16 +265,12 @@ class PerformanceModel(object): npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs result = {} - if self.merge_close_lists: - result["part_direct"] = ( - self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) - else: - result["part_direct_neighbor"] = ( - self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) - result["part_direct_sep_smaller"] = ( - self.summarize_parallel(npart_direct_list3, xlat_cost.direct())) - result["part_direct_sep_bigger"] = ( - self.summarize_parallel(npart_direct_list4, xlat_cost.direct())) + result["eval_direct_list1"] = ( + self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) + result["eval_direct_list3"] = ( + self.summarize_parallel(npart_direct_list3, xlat_cost.direct())) + result["eval_direct_list4"] = ( + self.summarize_parallel(npart_direct_list4, xlat_cost.direct())) return result @@ -205,12 +281,13 @@ class PerformanceModel(object): def process_list2(self, xlat_cost, traversal): nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp) - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + for itgt_box in range(len(traversal.target_or_target_parent_boxes)): start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] nm2l[itgt_box] += end-start - return dict(m2l=self.summarize_parallel(nm2l, xlat_cost.m2l())) + return dict(multipole_to_local=( + self.summarize_parallel(nm2l, xlat_cost.m2l()))) # }}} @@ -233,8 +310,8 @@ class PerformanceModel(object): ntargets * (end-start) ) - return dict( - mp_eval=self.summarize_parallel(nmp_eval, xlat_cost.m2p())) + return dict(eval_multipoles=( + self.summarize_parallel(nmp_eval, xlat_cost.m2p()))) # }}} @@ -256,7 +333,7 @@ class PerformanceModel(object): nform_local[itgt_box] = nform_local_box - return dict(form_local=( + return dict(form_locals=( self.summarize_parallel(nform_local, xlat_cost.p2l()))) # }}} @@ -283,9 +360,6 @@ class PerformanceModel(object): 
np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs - if self.merge_close_lists: - continue - np2qbxl_list3_srcs = 0 # Could be None, if not using targets with extent. @@ -313,16 +387,12 @@ class PerformanceModel(object): np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs result = {} - if self.merge_close_lists: - result["p2qbxl"] = ( - self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) - else: - result["p2qbxl_neighbor"] = ( - self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) - result["p2qbxl_sep_smaller"] = ( - self.summarize_parallel(np2qbxl_list3, xlat_cost.p2qbxl())) - result["p2qbxl_sep_bigger"] = ( - self.summarize_parallel(np2qbxl_list4, xlat_cost.p2qbxl())) + result["form_global_qbx_locals_list1"] = ( + self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) + result["form_global_qbx_locals_list3"] = ( + self.summarize_parallel(np2qbxl_list3, xlat_cost.p2qbxl())) + result["form_global_qbx_locals_list4"] = ( + self.summarize_parallel(np2qbxl_list4, xlat_cost.p2qbxl())) return result @@ -353,7 +423,8 @@ class PerformanceModel(object): nm2qbxl[isrc_level, itgt_center] += stop-start - return dict(m2qbxl=self.summarize_parallel(nm2qbxl, xlat_cost.m2qbxl())) + return dict(translate_box_multipoles_to_qbx_local=( + self.summarize_parallel(nm2qbxl, xlat_cost.m2qbxl()))) # }}} @@ -367,7 +438,8 @@ class PerformanceModel(object): start, end = center_to_targets_starts[src_icenter:src_icenter+2] nqbx_eval[isrc_center] += end-start - return dict(qbxl2p=self.summarize_parallel(nqbx_eval, xlat_cost.qbxl2p())) + return dict(eval_qbx_expansions=( + self.summarize_parallel(nqbx_eval, xlat_cost.qbxl2p()))) # }}} @@ -429,38 +501,43 @@ class PerformanceModel(object): # }}} - def __call__(self, geo_data): + def __call__(self, lpot_source, geo_data): # FIXME: This should suport target filtering. 
- from collections import OrderedDict result = OrderedDict() nqbtl = geo_data.non_qbx_box_target_lists() with cl.CommandQueue(geo_data.cl_context) as queue: tree = geo_data.tree().get(queue=queue) - traversal = geo_data.traversal(self.merge_close_lists).get(queue=queue) + traversal = geo_data.traversal(merge_close_lists=False).get(queue=queue) box_target_counts_nonchild = ( nqbtl.box_target_counts_nonchild.get(queue=queue)) - result.update( + params = dict( nlevels=tree.nlevels, nboxes=tree.nboxes, nsources=tree.nsources, ntargets=tree.ntargets, - ncenters=geo_data.ncenters) + ncenters=geo_data.ncenters, + p_qbx=lpot_source.qbx_order, + # FIXME: Assumes this is a constant + p_fmm=lpot_source.fmm_level_to_order(None, None, None, None), + ) + + params.update(self.calibration_params) xlat_cost = self.get_translation_cost_model(tree.dimensions) # {{{ construct local multipoles - result["form_mp"] = tree.nsources * xlat_cost.p2m() + result["form_multipoles"] = tree.nsources * xlat_cost.p2m() # }}} # {{{ propagate multipoles upward - result["prop_upward"] = tree.nboxes * xlat_cost.m2m() + result["coarsen_multipoles"] = tree.nboxes * xlat_cost.m2m() # }}} @@ -492,20 +569,18 @@ class PerformanceModel(object): # {{{ propagate local_exps downward - result["prop_downward"] = tree.nboxes * xlat_cost.l2l() + result["refine_locals"] = tree.nboxes * xlat_cost.l2l() # }}} # {{{ evaluate locals - result["eval_part"] = tree.ntargets * xlat_cost.l2p() + result["eval_locals"] = tree.ntargets * xlat_cost.l2p() # }}} global_qbx_centers = geo_data.global_qbx_centers() - # If self.merge_close_lists is False, then this builds another traversal - # (which is OK). 
qbx_center_to_target_box = geo_data.qbx_center_to_target_box() center_to_targets_starts = geo_data.center_to_tree_targets().starts qbx_center_to_target_box_source_level = np.empty( @@ -545,7 +620,8 @@ class PerformanceModel(object): # {{{ translate from box local expansions to qbx local expansions - result["l2qbxl"] = geo_data.ncenters * xlat_cost.l2qbxl() + result["translate_box_local_to_qbx_local"] = ( + geo_data.ncenters * xlat_cost.l2qbxl()) # }}} @@ -556,29 +632,100 @@ class PerformanceModel(object): # }}} - return result + return PerformanceModelResult(result, params) # }}} -# {{{ assemble_performance_data +# {{{ calibrate performance model + +def _collect(expr, variables): + from pymbolic.interop.sympy import PymbolicToSympyMapper, SympyToPymbolicMapper + p2s = PymbolicToSympyMapper() + s2p = SympyToPymbolicMapper() + + from sympy.simplify import collect + sympy_variables = [sp.var(v) for v in variables] + collect_result = collect(p2s(expr), sympy_variables, evaluate=False) + + result = {} + for v in variables: + try: + result[v] = s2p(collect_result[sp.var(v)]) + except KeyError: + continue + + return result + + +_FMM_STAGE_TO_CALIBRATION_PARAMETER = { + "form_multipoles": "c_p2m", + "coarsen_multipoles": "c_m2m", + "eval_direct": "c_p2p", + "multipole_to_local": "c_m2l", + "eval_multipoles": "c_m2p", + "form_locals": "c_p2l", + "refine_locals": "c_l2l", + "eval_locals": "c_l2p", + "form_global_qbx_locals": "c_p2qbxl", + "translate_box_multipoles_to_qbx_local": "c_m2qbxl", + "translate_box_local_to_qbx_local": "c_l2qbxl", + "eval_qbx_expansions": "c_qbxl2p", + } -def assemble_performance_data(geo_data, uses_pde_expansions, - translation_source_power=None, translation_target_power=None, - translation_max_power=None, - summarize_parallel=None, merge_close_lists=True): - """Compute modeled performance using :class:`PerformanceModel`. - See :class:`PerformanceModel` for parameter documentation. 
+def estimate_calibration_params(model_results, timing_results): + """Given a set of model results and matching timing results, estimate the best + calibration parameters for the model. """ - return PerformanceModel( - uses_pde_expansions, - translation_source_power, - translation_target_power, - translation_max_power, - summarize_parallel, - merge_close_lists)(geo_data) + params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values()) + + nresults = len(model_results) + + if nresults != len(timing_results): + raise ValueError("must have same number of model and timing results") + + uncalibrated_times = {} + actual_times = {} + + for param in params: + uncalibrated_times[param] = np.zeros(nresults) + actual_times[param] = np.zeros(nresults) + + from pymbolic import evaluate + + for i, model_result in enumerate(model_results): + context = model_result.params.copy() + for param in params: + context[param] = var(param) + + total_modeled_cost = evaluate(sum(model_result.values()), context=context) + collected_times = _collect(total_modeled_cost, params) + + for param, time in collected_times.items(): + uncalibrated_times[param][i] = time + + for i, timing_result in enumerate(timing_results): + for param, time in timing_result.items(): + calibration_param = ( + _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) + actual_times[calibration_param][i] = time.process_elapsed + + result = {} + + for param in params: + uncalibrated = uncalibrated_times[param] + actual = actual_times[param] + + if np.allclose(uncalibrated, 0): + result[param] = float("NaN") + continue + + result[param] = ( + actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated)) + + return result # }}} diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 9a5711a4..9469fb19 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -275,11 +275,11 @@ class PerformanceModelMapper(EvaluationMapperBase): def exec_compute_potential_insn(self, queue, insn, 
bound_expr, evaluate): source = bound_expr.places[insn.source] - costs = {} - result = source.perf_model_compute_potential_insn( - queue, insn, bound_expr, evaluate, costs) - self.modeled_performance[insn] = costs - return result + result, futures, perf_model_result = ( + source.perf_model_compute_potential_insn( + queue, insn, bound_expr, evaluate)) + self.modeled_performance[insn] = perf_model_result + return result, futures def get_modeled_performance(self): return self.modeled_performance -- GitLab From 257d87b7f3425b1d77ed4f9b67a13f393f03e648 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 13 Jul 2018 19:28:42 -0500 Subject: [PATCH 020/139] [ci skip] Fix a spelling error caused by lack of coffee --- examples/performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performance.py b/examples/performance.py index 2a2f2ea7..97e61864 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ -153,7 +153,7 @@ def test_performance_model(ctx, perf_model): for stage in model_result: print("stage: ", stage) print("actual: ", timing_result[stage]) - print("predicated: ", model_result[stage]) + print("predicted: ", model_result[stage]) print("=" * 20) -- GitLab From 84df50e82535cbb5df6c509f92c7663865ca6bab Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 13 Jul 2018 19:35:37 -0500 Subject: [PATCH 021/139] Py2.7 fix --- pytential/qbx/performance.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index cb0b66eb..b1e88d33 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -32,10 +32,14 @@ import pyopencl as cl # noqa import pyopencl.array # noqa import sympy as sp -from collections import OrderedDict -from collections.abc import MutableMapping -from pymbolic import var +from pymbolic import var +from collections import OrderedDict +try: + from collections.abc import MutableMapping +except ModuleNotFoundError: 
+ # Py 2.7 + from collections import MutableMapping import logging logger = logging.getLogger(__name__) -- GitLab From 62c59a5f7de1a3303b819bff53066966ac670e4e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 13 Jul 2018 19:36:51 -0500 Subject: [PATCH 022/139] Fix name of exception --- pytential/qbx/performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index b1e88d33..17283e13 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -37,7 +37,7 @@ from pymbolic import var from collections import OrderedDict try: from collections.abc import MutableMapping -except ModuleNotFoundError: +except ImportError: # Py 2.7 from collections import MutableMapping -- GitLab From eb5cef4d427f0f578a0f1c895a0b67c1345c975c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 14 Jul 2018 04:23:56 -0500 Subject: [PATCH 023/139] Improve idiom for getting the only value in an iterable. --- examples/performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/performance.py b/examples/performance.py index 97e61864..cc0c01db 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ -4,6 +4,8 @@ import pyopencl as cl import numpy as np from pytential import sym, bind +from pytools import one + # {{{ global params @@ -112,8 +114,8 @@ def train_performance_model(ctx): timing_data = {} bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) - model_results.append(next(iter(perf_S.values()))) - timing_results.append(next(iter(timing_data.values()))) + model_results.append(one(perf_S.values())) + timing_results.append(one(timing_data.values())) calibration_params = ( estimate_calibration_params(model_results, timing_results)) @@ -131,7 +133,7 @@ def test_performance_model(ctx, perf_model): perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) model_result = ( - next(iter(perf_S.values())) + one(perf_S.values()) 
.get_predicted_times(merge_close_lists=True)) # Warm-up run. @@ -141,7 +143,7 @@ def test_performance_model(ctx, perf_model): for _ in range(RUNS): timing_data = {} bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) - temp_timing_results.append(next(iter(timing_data.values()))) + temp_timing_results.append(one(timing_data.values())) timing_result = {} for param in model_result: -- GitLab From 48cb27f5a5720886c16a246270bd40c6a4c519ca Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 17 Jul 2018 14:09:36 -0500 Subject: [PATCH 024/139] Revert "Change the TSQBX API." This reverts commit 9ae6ed5c368d83a43138a798e23efde0f5c7dc51. --- pytential/qbx/__init__.py | 12 ++++++------ test/test_target_specific_qbx.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 292d7994..646dff59 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,7 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", - _tsqbx_kind="", + _use_tsqbx_list1=False, geometry_data_inspector=None, performance_model=None, fmm_backend="sumpy", @@ -204,7 +204,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind - self._tsqbx_kind = _tsqbx_kind + self._use_tsqbx_list1 = _use_tsqbx_list1 self.geometry_data_inspector = geometry_data_inspector self.performance_model = performance_model @@ -228,7 +228,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, - _tsqbx_kind=_not_provided, + _use_tsqbx_list1=_not_provided, geometry_data_inspector=None, performance_model=_not_provided, fmm_backend=None, @@ -312,8 +312,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): 
_from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, - _tsqbx_kind=_tsqbx_kind if _tsqbx_kind is not _not_provided - else self._tsqbx_kind, + _use_tsqbx_list1=_use_tsqbx_list1 if _use_tsqbx_list1 is not _not_provided + else self._use_tsqbx_list1, geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), performance_model=( @@ -821,7 +821,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, kernel_extra_kwargs=kernel_extra_kwargs, - _use_target_specific_list1="1" in self._tsqbx_kind) + _use_target_specific_list1=self._use_tsqbx_list1) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index e5f9ea38..60a801d4 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -102,7 +102,7 @@ def test_target_specific_qbx(ctx_getter, op): bound_op = bind(qbx, expr) slp_ref = bound_op(queue, u=u_dev) - qbx = qbx.copy(_tsqbx_kind="1") + qbx = qbx.copy(_use_tsqbx_list1=True) bound_op = bind(qbx, expr) slp_tsqbx = bound_op(queue, u=u_dev) -- GitLab From 03e4e82312fc45a76b76b5871626f56b3ab7bf9d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 17 Jul 2018 18:34:38 -0500 Subject: [PATCH 025/139] [ci skip] Hack towards performance data gathering for TSQBX. 
--- pytential/qbx/__init__.py | 12 +++--- pytential/qbx/fmm.py | 19 ++++++--- pytential/qbx/fmmlib.py | 3 +- pytential/qbx/performance.py | 82 +++++++++++++++++++++++++++++++----- 4 files changed, 94 insertions(+), 22 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 646dff59..51ca47b0 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,7 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", - _use_tsqbx_list1=False, + _use_tsqbx=False, geometry_data_inspector=None, performance_model=None, fmm_backend="sumpy", @@ -204,7 +204,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind - self._use_tsqbx_list1 = _use_tsqbx_list1 + self._use_tsqbx = _use_tsqbx self.geometry_data_inspector = geometry_data_inspector self.performance_model = performance_model @@ -228,7 +228,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, - _use_tsqbx_list1=_not_provided, + _use_tsqbx=_not_provided, geometry_data_inspector=None, performance_model=_not_provided, fmm_backend=None, @@ -312,8 +312,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, - _use_tsqbx_list1=_use_tsqbx_list1 if _use_tsqbx_list1 is not _not_provided - else self._use_tsqbx_list1, + _use_tsqbx=(_use_tsqbx + if _use_tsqbx is not _not_provided else self._use_tsqbx), geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), performance_model=( @@ -821,7 +821,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, 
kernel_extra_kwargs=kernel_extra_kwargs, - _use_target_specific_list1=self._use_tsqbx_list1) + _use_target_specific_list1=self._use_tsqbx) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 7d95065c..be85a7f5 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -385,7 +385,8 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level -def drive_fmm(expansion_wrangler, src_weights, timing_data=None): +def drive_fmm(expansion_wrangler, src_weights, timing_data=None, + use_tsqbx=False): """Top-level driver routine for the QBX fast multipole calculation. :arg geo_data: A :class:`QBXFMMGeometryData` instance. @@ -524,9 +525,12 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): # {{{ wrangle qbx expansions - qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights) + if not use_tsqbx: + qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights) - recorder.add("form_global_qbx_locals", timing_future) + recorder.add("form_global_qbx_locals", timing_future) + else: + qbx_expansions = wrangler.qbx_local_expansion_zeros() local_result, timing_future = ( wrangler.translate_box_multipoles_to_qbx_local(mpole_exps)) @@ -546,8 +550,13 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): recorder.add("eval_qbx_expansions", timing_future) - qbx_potentials = qbx_potentials + \ - wrangler.eval_target_specific_global_qbx_locals(src_weights) + if use_tsqbx: + tsqbx_result, timing_future = ( + wrangler.eval_target_specific_qbx_locals(src_weights)) + + recorder.add("eval_target_specific_qbx_locals", timing_future) + + qbx_potentials = qbx_potentials + tsqbx_result # }}} diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index ed16f5f8..c4ba6637 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -578,7 +578,8 @@ class 
QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): return output @log_process(logger) - def eval_target_specific_global_qbx_locals(self, src_weights): + @return_timing_data + def eval_target_specific_qbx_locals(self, src_weights): if not self._use_target_specific_list1: return self.full_output_zeros() diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 17283e13..007e03ef 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -75,6 +75,9 @@ class TranslationCostModel(object): def p2qbxl(self): return var("c_p2qbxl") * self.ncoeffs_qbx + def p2p_tsqbx(self): + return var("c_p2p_tsqbx") * self.ncoeffs_qbx + def qbxl2p(self): return var("c_qbxl2p") * self.ncoeffs_qbx @@ -342,17 +345,23 @@ class PerformanceModel(object): # }}} - # {{{ form global qbx locals + # {{{ collect data about direct interactions with qbx centers - def process_form_qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box): + def _collect_qbxl_direct_interaction_data(self, xlat_cost, traversal, tree, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): # center -> nsources np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp) np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) + # center -> number of associated targets + nqbxl2p = np.zeros(len(global_qbx_centers), dtype=np.intp) + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] + nqbxl2p[itgt_center] = end - start + itgt_box = qbx_center_to_target_box[tgt_icenter] np2qbxl_list1_srcs = 0 @@ -390,13 +399,59 @@ class PerformanceModel(object): np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs + result = {} + result["np2qbxl_list1"] = np2qbxl_list1 + result["np2qbxl_list3"] = np2qbxl_list3 + result["np2qbxl_list4"] = np2qbxl_list4 + result["nqbxl2p"] = nqbxl2p + + return result + + # 
}}} + + # {{{ eval target specific qbx expansions + + def process_eval_target_specific_qbxl(self, xlat_cost, traversal, tree, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): + + counts = self._collect_qbxl_direct_interaction_data( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box) + + result = {} + result["eval_target_specific_qbx_locals_list1"] = ( + self.summarize_parallel( + counts["np2qbxl_list1"] * counts["nqbxl2p"], + xlat_cost.p2p_tsqbx())) + result["eval_target_specific_qbx_locals_list3"] = ( + self.summarize_parallel( + counts["np2qbxl_list3"] * counts["nqbxl2p"], + xlat_cost.p2p_tsqbx())) + result["eval_target_specific_qbx_locals_list4"] = ( + self.summarize_parallel( + counts["np2qbxl_list4"] * counts["nqbxl2p"], + xlat_cost.p2p_tsqbx())) + + return result + + # }}} + + # {{{ form global qbx locals + + def process_form_qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts): + + counts = self._collect_qbxl_direct_interaction_data( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts) + result = {} result["form_global_qbx_locals_list1"] = ( - self.summarize_parallel(np2qbxl_list1, xlat_cost.p2qbxl())) + self.summarize_parallel(counts["np2qbxl_list1"], xlat_cost.p2qbxl())) result["form_global_qbx_locals_list3"] = ( - self.summarize_parallel(np2qbxl_list3, xlat_cost.p2qbxl())) + self.summarize_parallel(counts["np2qbxl_list3"], xlat_cost.p2qbxl())) result["form_global_qbx_locals_list4"] = ( - self.summarize_parallel(np2qbxl_list4, xlat_cost.p2qbxl())) + self.summarize_parallel(counts["np2qbxl_list4"], xlat_cost.p2qbxl())) return result @@ -511,6 +566,7 @@ class PerformanceModel(object): result = OrderedDict() nqbtl = geo_data.non_qbx_box_target_lists() + use_tsqbx = lpot_source._use_tsqbx with cl.CommandQueue(geo_data.cl_context) as queue: tree = geo_data.tree().get(queue=queue) @@ -606,11 +662,16 @@ 
class PerformanceModel(object): qbx_center_to_target_box_source_level[src_level] .get(queue=queue)) - # {{{ form global qbx locals + # {{{ form global qbx locals or evaluate target specific qbx expansions - result.update(self.process_form_qbxl( - xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box)) + if use_tsqbx: + result.update(self.process_eval_target_specific_qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts)) + else: + result.update(self.process_form_qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts)) # }}} @@ -675,6 +736,7 @@ _FMM_STAGE_TO_CALIBRATION_PARAMETER = { "translate_box_multipoles_to_qbx_local": "c_m2qbxl", "translate_box_local_to_qbx_local": "c_l2qbxl", "eval_qbx_expansions": "c_qbxl2p", + "eval_target_specific_qbx_locals": "c_p2p_tsqbx", } -- GitLab From 675b0d82013dfa0f83279ac474fd880fb24f1366 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 18 Jul 2018 22:58:46 -0500 Subject: [PATCH 026/139] Fix missing argument --- pytential/qbx/performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 007e03ef..34c1f306 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -416,7 +416,7 @@ class PerformanceModel(object): counts = self._collect_qbxl_direct_interaction_data( xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box) + qbx_center_to_target_box, center_to_targets_starts) result = {} result["eval_target_specific_qbx_locals_list1"] = ( -- GitLab From 5e5afe91bdfd677f763175c5e287c3463c2b8831 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 18 Jul 2018 22:59:02 -0500 Subject: [PATCH 027/139] [ci skip] Add 3D perf example --- examples/performance-3d.py | 187 +++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 examples/performance-3d.py 
diff --git a/examples/performance-3d.py b/examples/performance-3d.py new file mode 100644 index 00000000..433f70ae --- /dev/null +++ b/examples/performance-3d.py @@ -0,0 +1,187 @@ +"""Trains a performance model and reports on the accuracy.""" + +import pyopencl as cl +import numpy as np + +from pytential import sym, bind +from pytools import one + + +# {{{ global params + +TARGET_ORDER = 8 +OVSMP_FACTOR = 5 +TCF = 0.9 +QBX_ORDER = 5 +FMM_ORDER = 10 +MESH_TOL = 1e-10 +FORCE_STAGE2_UNIFORM_REFINEMENT_ROUNDS = 1 +SCALED_MAX_CURVATURE_THRESHOLD = 0.8 +MAX_LEAF_REFINE_WEIGHT = 512 +RUNS = 3 + +DEFAULT_LPOT_KWARGS = { + "_box_extent_norm": "l2", + "_from_sep_smaller_crit": "static_l2", + } + +TRAINING_ARMS = (2, 3, 6) +TESTING_ARMS = (5,) + + +def urchin_lpot_source(queue, sph_harm_tuple, + from_sep_smaller_threshold=None, use_tsqbx=True): + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) + + target_order = TARGET_ORDER + + sph_m, sph_n = sph_harm_tuple + + from meshmode.mesh.generation import generate_urchin as get_urchin + mesh = get_urchin( + order=target_order, m=sph_m, n=sph_n, + est_rel_interp_tolerance=MESH_TOL) + + pre_density_discr = Discretization( + queue.context, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + refiner_extra_kwargs = { + #"visualize": True, + "_force_stage2_uniform_refinement_rounds": ( + FORCE_STAGE2_UNIFORM_REFINEMENT_ROUNDS), + "_scaled_max_curvature_threshold": ( + SCALED_MAX_CURVATURE_THRESHOLD), + } + + lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() + lpot_kwargs.update( + fmm_backend="fmmlib", + _well_sep_is_n_away=2, + _expansions_in_tree_have_extent=True, + _expansion_stick_out_factor=TCF, + _max_leaf_refine_weight=MAX_LEAF_REFINE_WEIGHT, + target_association_tolerance=1e-3, + fmm_order=FMM_ORDER, qbx_order=QBX_ORDER, + _from_sep_smaller_min_nsources_cumul=from_sep_smaller_threshold, + _use_tsqbx=use_tsqbx, + ) + + 
from pytential.qbx import QBXLayerPotentialSource + lpot_source = QBXLayerPotentialSource( + pre_density_discr, OVSMP_FACTOR*target_order, + **lpot_kwargs,) + + lpot_source, _ = lpot_source.with_refinement(**refiner_extra_kwargs) + + return lpot_source + +# }}} + + +def training_geometries(queue): + for n_arms in TRAINING_ARMS: + yield urchin_lpot_source(queue, (n_arms // 2, n_arms), 100) + + +def test_geometries(queue): + for n_arms in TESTING_ARMS: + yield urchin_lpot_source(queue, (n_arms // 2, n_arms), 100) + + +def get_bound_op(lpot_source): + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + return bind(lpot_source, op) + + +def get_test_density(queue, lpot_source): + density_discr = lpot_source.density_discr + nodes = density_discr.nodes().with_queue(queue) + sigma = cl.clmath.sin(10 * nodes[0]) + + return sigma + + +def train_performance_model(ctx): + queue = cl.CommandQueue(ctx) + + from pytential.qbx.performance import ( + PerformanceModel, estimate_calibration_params) + + perf_model = PerformanceModel() + + model_results = [] + timing_results = [] + + for lpot_source in training_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + + # Warm-up run. 
+ bound_op.eval(queue, {"sigma": sigma}) + + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + + model_results.append(one(perf_S.values())) + timing_results.append(one(timing_data.values())) + + calibration_params = ( + estimate_calibration_params(model_results, timing_results)) + + return perf_model.with_calibration_params(calibration_params) + + +def test_performance_model(ctx, perf_model): + queue = cl.CommandQueue(ctx) + + for lpot_source in test_geometries(queue): + lpot_source = lpot_source.copy(performance_model=perf_model) + bound_op = get_bound_op(lpot_source) + sigma = get_test_density(queue, lpot_source) + + perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) + model_result = ( + one(perf_S.values()) + .get_predicted_times(merge_close_lists=True)) + + # Warm-up run. + bound_op.eval(queue, {"sigma": sigma}) + + temp_timing_results = [] + for _ in range(RUNS): + timing_data = {} + bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) + temp_timing_results.append(one(timing_data.values())) + + timing_result = {} + for param in model_result: + timing_result[param] = ( + sum(temp_timing_result[param].process_elapsed + for temp_timing_result in temp_timing_results)) / RUNS + + print("=" * 20) + for stage in model_result: + print("stage: ", stage) + print("actual: ", timing_result[stage]) + print("predicted: ", model_result[stage]) + print("=" * 20) + + +def predict_performance(ctx): + model = train_performance_model(ctx) + test_performance_model(ctx, model) + + +if __name__ == "__main__": + predict_performance(cl.create_some_context(0)) -- GitLab From ae19468430e5f7835498a16e7a596870fd1d61e9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 18 Jul 2018 23:04:08 -0500 Subject: [PATCH 028/139] [ci skip] Fix test --- pytential/qbx/__init__.py | 3 ++- test/test_target_specific_qbx.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/__init__.py 
b/pytential/qbx/__init__.py index 51ca47b0..1fbc633d 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -841,7 +841,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): from pytential.qbx.fmm import drive_fmm timing_data = {} - all_potentials_on_every_target = drive_fmm(wrangler, strengths, timing_data) + all_potentials_on_every_target = drive_fmm( + wrangler, strengths, timing_data, use_tsqbx=self._use_tsqbx) # }}} diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 60a801d4..e3273757 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -102,7 +102,7 @@ def test_target_specific_qbx(ctx_getter, op): bound_op = bind(qbx, expr) slp_ref = bound_op(queue, u=u_dev) - qbx = qbx.copy(_use_tsqbx_list1=True) + qbx = qbx.copy(_use_tsqbx=True) bound_op = bind(qbx, expr) slp_tsqbx = bound_op(queue, u=u_dev) -- GitLab From ed0b3fd7f8e1ba8f8198648f5b5d28fdb9cb695d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 19 Jul 2018 02:14:35 -0500 Subject: [PATCH 029/139] [ci skip] Add log_process decorator --- pytential/qbx/performance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 34c1f306..340c6d1e 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -34,6 +34,7 @@ import sympy as sp from pymbolic import var +from pytools import log_process from collections import OrderedDict try: from collections.abc import MutableMapping @@ -560,6 +561,7 @@ class PerformanceModel(object): # }}} + @log_process(logger, "gather performance model data") def __call__(self, lpot_source, geo_data): # FIXME: This should suport target filtering. 
-- GitLab From 616e8cd471c1c0616b5653baad72115117c8cc74 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 20 Jul 2018 17:17:58 -0500 Subject: [PATCH 030/139] [ci skip] Add -ffast-math to compile flags --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index aa8a32ac..4024fe65 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ ext_modules = [ Extension( "pytential.qbx.target_specific", ["pytential/qbx/target_specific.pyx"], - extra_compile_args=["-fopenmp"], + extra_compile_args=["-fopenmp", "-ffast-math"], extra_link_args=["-fopenmp"] ) ] -- GitLab From 3da0d7c5337f418eb506a5a9de37fa89456cdf1d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 16:45:48 -0500 Subject: [PATCH 031/139] Add Cython to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6d1e4cce..91e07ccf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy git+https://github.com/inducer/pyfmmlib +Cython -- GitLab From 221929f28f06ebc09a60bdeeddf5d043024f94eb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 18:00:11 -0500 Subject: [PATCH 032/139] Revert "Add Cython to requirements.txt" This reverts commit 3da0d7c5337f418eb506a5a9de37fa89456cdf1d. 
--- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 91e07ccf..6d1e4cce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,3 @@ git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy git+https://github.com/inducer/pyfmmlib -Cython -- GitLab From 681bdd3b0468bcb47568fed1a53a4c9d41a42cc7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 18:06:27 -0500 Subject: [PATCH 033/139] [ci skip] Flake8 fixes; disable 3D example because it is slow. --- examples/performance-3d.py | 6 ++++-- test/test_target_specific_qbx.py | 19 +++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/examples/performance-3d.py b/examples/performance-3d.py index 433f70ae..72ad59af 100644 --- a/examples/performance-3d.py +++ b/examples/performance-3d.py @@ -1,7 +1,7 @@ """Trains a performance model and reports on the accuracy.""" import pyopencl as cl -import numpy as np +import numpy as np # noqa from pytential import sym, bind from pytools import one @@ -184,4 +184,6 @@ def predict_performance(ctx): if __name__ == "__main__": - predict_performance(cl.create_some_context(0)) + if 0: + # Disabled - this is slow. + predict_performance(cl.create_some_context(0)) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index e3273757..f2b39aa3 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -24,30 +24,25 @@ THE SOFTWARE. 
import numpy as np -import numpy.linalg as la +import numpy.linalg as la # noqa import pyopencl as cl import pyopencl.clmath # noqa import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) -from functools import partial +from functools import partial # noqa from meshmode.mesh.generation import ( # noqa ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, NArmedStarfish, make_curve_mesh) # from sumpy.visualization import FieldPlotter -from pytential import bind, sym, norm -from sumpy.kernel import LaplaceKernel, HelmholtzKernel +from pytential import bind, sym, norm # noqa +from sumpy.kernel import LaplaceKernel, HelmholtzKernel # noqa import logging logger = logging.getLogger(__name__) -try: - import matplotlib.pyplot as pt -except ImportError: - pass - @pytest.mark.parametrize("op", ["S", "D"]) def test_target_specific_qbx(ctx_getter, op): @@ -83,16 +78,16 @@ def test_target_specific_qbx(ctx_getter, op): nodes_host = density_discr.nodes().get(queue) center = np.array([3, 1, 2]) diff = nodes_host - center[:, np.newaxis] - + dist_squared = np.sum(diff**2, axis=0) dist = np.sqrt(dist_squared) u = 1/dist - + u_dev = cl.array.to_device(queue, u) kernel = LaplaceKernel(3) u_sym = sym.var("u") - + if op == "S": op = sym.S elif op == "D": -- GitLab From 3aeaf0786ff3a17a3284233863f59cbf82a2a266 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 18:28:31 -0500 Subject: [PATCH 034/139] Try to get the build working again --- .gitlab-ci.yml | 6 +++--- .test-conda-env-py3-macos.yml | 6 ++++-- .test-conda-env-py3-requirements.txt | 1 + .test-conda-env-py3.yml | 3 +-- setup.py | 1 + 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6c1c852b..daa3acca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ Python 2.7 POCL: - export PY_EXE=python2.7 - export PYOPENCL_TEST=portable - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - - 
export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="Cython numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -24,7 +24,7 @@ Python 3.6 POCL: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="Cython numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -39,7 +39,7 @@ Python 3.6 POCL Examples: - test -n "$SKIP_EXAMPLES" && exit - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako pyvisfile matplotlib" + - export EXTRA_INSTALL="Cython numpy mako pyvisfile matplotlib" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh - ". ./build-py-project-and-run-examples.sh" tags: diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index 9d36c780..ed104c10 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -12,6 +12,8 @@ dependencies: - python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 -- pyfmmlib - osx-pocl-opencl -# things not in here: loopy boxtree pymbolic meshmode sumpy +- gcc +- gfortran-osx_64 +- cython +# things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib diff --git a/.test-conda-env-py3-requirements.txt b/.test-conda-env-py3-requirements.txt index fa6c0426..7c053825 100644 --- a/.test-conda-env-py3-requirements.txt +++ b/.test-conda-env-py3-requirements.txt @@ -1,3 +1,4 @@ +git+https://gitlab.tiker.net/inducer/pyfmmlib git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/pymbolic git+https://github.com/inducer/loopy diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 8d60a1f0..62803d9c 100644 --- a/.test-conda-env-py3.yml 
+++ b/.test-conda-env-py3.yml @@ -12,5 +12,4 @@ dependencies: - python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 -- pyfmmlib -# things not in here: loopy boxtree pymbolic meshmode sumpy +# things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib diff --git a/setup.py b/setup.py index 4024fe65..d5c5423d 100644 --- a/setup.py +++ b/setup.py @@ -106,6 +106,7 @@ setup(name="pytential", ext_modules = cythonize(ext_modules), install_requires=[ + "Cython", "pytest>=2.3", # FIXME leave out for now # https://code.google.com/p/sympy/issues/detail?id=3874 -- GitLab From 197a58c7090bc1d756e1980d9b8bfbf0f65bd151 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 18:33:57 -0500 Subject: [PATCH 035/139] Fix name of gfortran package --- .test-conda-env-py3-macos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index ed104c10..f30cd9d6 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -14,6 +14,6 @@ dependencies: - python-symengine=0.3.0 - osx-pocl-opencl - gcc -- gfortran-osx_64 +- gfortran_osx-64 - cython # things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib -- GitLab From be6a41f36b7ae58b080ccf47f094950978eb3296 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 18:45:39 -0500 Subject: [PATCH 036/139] Add Cython to conda environment for non-mac build --- .test-conda-env-py3.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 62803d9c..cd28e93c 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -12,4 +12,5 @@ dependencies: - python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 +- cython # things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib -- GitLab From 647b36e056325a0128e75413d1e2401ef40a94b1 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 19:57:14 -0500 Subject: [PATCH 037/139] Echo command back in 
macOS build to see what is happening --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index daa3acca..f14be164 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -70,6 +70,7 @@ Python 3.5 Conda Apple: - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - set -o xtrace - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: -- GitLab From 555520ecbc4c3d9b8161f77f0dcee5dbdfba3077 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 20:30:23 -0500 Subject: [PATCH 038/139] Try without gcc / gfortran --- .test-conda-env-py3-macos.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index f30cd9d6..63051638 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -13,7 +13,5 @@ dependencies: - symengine=0.3.0 - python-symengine=0.3.0 - osx-pocl-opencl -- gcc -- gfortran_osx-64 - cython # things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib -- GitLab From fce1d149be811b80f3d53a9b66a14dbc1797580a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 20:33:58 -0500 Subject: [PATCH 039/139] Also set CC on macOS to point to real GCC --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f14be164..25cc1f99 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -71,6 +71,7 @@ Python 3.5 Conda Apple: - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt - set -o xtrace + - export CC=gcc-8 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". 
./build-and-test-py-project-within-miniconda.sh" tags: -- GitLab From 39e34fcea1bfa500ddb55f3daed1c8b3174b1bcd Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 21:06:03 -0500 Subject: [PATCH 040/139] Try specifying gcc alone --- .test-conda-env-py3-macos.yml | 2 ++ .test-conda-env-py3-requirements.txt | 1 - .test-conda-env-py3.yml | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index 63051638..941b00c3 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -13,5 +13,7 @@ dependencies: - symengine=0.3.0 - python-symengine=0.3.0 - osx-pocl-opencl +- pyfmmlib +- gcc - cython # things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib diff --git a/.test-conda-env-py3-requirements.txt b/.test-conda-env-py3-requirements.txt index 7c053825..fa6c0426 100644 --- a/.test-conda-env-py3-requirements.txt +++ b/.test-conda-env-py3-requirements.txt @@ -1,4 +1,3 @@ -git+https://gitlab.tiker.net/inducer/pyfmmlib git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/pymbolic git+https://github.com/inducer/loopy diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index cd28e93c..8d60a1f0 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -12,5 +12,5 @@ dependencies: - python=3.6 - symengine=0.3.0 - python-symengine=0.3.0 -- cython -# things not in here: loopy boxtree pymbolic meshmode sumpy pyfmmlib +- pyfmmlib +# things not in here: loopy boxtree pymbolic meshmode sumpy -- GitLab From ac45b7783c84d55353dd434966f8d15b4ff11533 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 21:07:31 -0500 Subject: [PATCH 041/139] Unset CC --- .gitlab-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 25cc1f99..f14be164 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -71,7 +71,6 @@ Python 3.5 Conda Apple: - export 
PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt - set -o xtrace - - export CC=gcc-8 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: -- GitLab From fe90d5eee2d23aaf12f742e578ee58b6c21794ae Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 21:20:32 -0500 Subject: [PATCH 042/139] Remove Cython from setup.py --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index d5c5423d..4024fe65 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,6 @@ setup(name="pytential", ext_modules = cythonize(ext_modules), install_requires=[ - "Cython", "pytest>=2.3", # FIXME leave out for now # https://code.google.com/p/sympy/issues/detail?id=3874 -- GitLab From 8f5ce9f7f3798dd9da265659eb40727b633d3aaf Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Jul 2018 21:24:58 -0500 Subject: [PATCH 043/139] Add cython to the Conda 3.6 env --- .test-conda-env-py3.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 8d60a1f0..08ed2fcf 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -13,4 +13,5 @@ dependencies: - symengine=0.3.0 - python-symengine=0.3.0 - pyfmmlib +- cython # things not in here: loopy boxtree pymbolic meshmode sumpy -- GitLab From 34d5ba4d9bf93ceece7ed3ff6ae0be84cd95fc53 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 22 Jul 2018 15:56:00 -0500 Subject: [PATCH 044/139] [ci skip] Fix name of file in comment --- test/test_target_specific_qbx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index f2b39aa3..a2b9317a 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -105,7 +105,7 @@ def test_target_specific_qbx(ctx_getter, op): # You can test 
individual routines by typing -# $ python test_layer_pot_identity.py 'test_routine()' +# $ python test_target_specific_qbx.py 'test_routine()' if __name__ == "__main__": import sys -- GitLab From 12197c0663a4c12d2a5111c65db03bad7d1aea3a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 24 Jul 2018 19:11:27 -0500 Subject: [PATCH 045/139] More detailed modeling of point and shoot in the translation cost model --- pytential/qbx/performance.py | 84 +++++++++--------------------------- 1 file changed, 20 insertions(+), 64 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 340c6d1e..a1a7e550 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -59,16 +59,10 @@ __doc__ = """ class TranslationCostModel(object): """Provides modeled costs for individual translations or evaluations.""" - def __init__(self, p_qbx, p_fmm, ncoeffs_qbx, ncoeffs_fmm, - translation_source_power, translation_target_power, - translation_max_power): - self.p_qbx = p_qbx - self.p_fmm = p_fmm + def __init__(self, ncoeffs_qbx, ncoeffs_fmm, uses_point_and_shoot): self.ncoeffs_qbx = ncoeffs_qbx self.ncoeffs_fmm = ncoeffs_fmm - self.translation_source_power = translation_source_power - self.translation_target_power = translation_target_power - self.translation_max_power = translation_max_power + self.uses_point_and_shoot = uses_point_and_shoot def direct(self): return var("c_p2p") @@ -95,26 +89,28 @@ class TranslationCostModel(object): return var("c_m2p") * self.ncoeffs_fmm def m2m(self): - return var("c_m2m") * self.e2e_cost(self.p_fmm, self.p_fmm) + return var("c_m2m") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) def l2l(self): - return var("c_l2l") * self.e2e_cost(self.p_fmm, self.p_fmm) + return var("c_l2l") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) def m2l(self): - return var("c_m2l") * self.e2e_cost(self.p_fmm, self.p_fmm) + return var("c_m2l") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) def m2qbxl(self): - 
return var("c_m2qbxl") * self.e2e_cost(self.p_fmm, self.p_qbx) + return var("c_m2qbxl") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_qbx) def l2qbxl(self): - return var("c_l2qbxl") * self.e2e_cost(self.p_fmm, self.p_qbx) + return var("c_l2qbxl") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_qbx) - def e2e_cost(self, p_source, p_target): - from pymbolic.primitives import Max - return ( - p_source ** self.translation_source_power - * p_target ** self.translation_target_power - * Max((p_source, p_target)) ** self.translation_max_power) + def e2e_cost(self, nsource_coeffs, ntarget_coeffs): + if self.uses_point_and_shoot: + return ( + nsource_coeffs ** (3 / 2) + + nsource_coeffs ** (1 / 2) * ntarget_coeffs + + ntarget_coeffs ** (3 / 2)) + + return nsource_coeffs * ntarget_coeffs # }}} @@ -188,9 +184,6 @@ class PerformanceModel(object): def __init__(self, uses_pde_expansions=True, - translation_source_power=None, - translation_target_power=None, - translation_max_power=None, summarize_parallel=None, calibration_params=None): """ @@ -203,9 +196,6 @@ class PerformanceModel(object): summed into one number encompassing the total workload. 
""" self.uses_pde_expansions = uses_pde_expansions - self.translation_source_power = translation_source_power - self.translation_target_power = translation_target_power - self.translation_max_power = translation_max_power if summarize_parallel is None: summarize_parallel = self.summarize_parallel_default self.summarize_parallel = summarize_parallel @@ -216,9 +206,6 @@ class PerformanceModel(object): def with_calibration_params(self, calibration_params): return type(self)( uses_pde_expansions=self.uses_pde_expansions, - translation_source_power=self.translation_source_power, - translation_target_power=self.translation_target_power, - translation_max_power=self.translation_max_power, summarize_parallel=self.summarize_parallel, calibration_params=calibration_params) @@ -510,54 +497,23 @@ class PerformanceModel(object): p_qbx = var("p_qbx") p_fmm = var("p_fmm") + uses_point_and_shoot = False + if self.uses_pde_expansions: ncoeffs_fmm = p_fmm ** (d-1) ncoeffs_qbx = p_qbx ** (d-1) - if d == 2: - default_translation_source_power = 1 - default_translation_target_power = 1 - default_translation_max_power = 0 - - elif d == 3: - # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. 
- default_translation_source_power = 0 - default_translation_target_power = 0 - default_translation_max_power = 3 - - else: - raise ValueError("Don't know how to estimate expansion complexities " - "for dimension %d" % d) + if d == 3: + uses_point_and_shoot = True else: ncoeffs_fmm = p_fmm ** d ncoeffs_qbx = p_qbx ** d - default_translation_source_power = d - default_translation_target_power = d - - translation_source_power = ( - default_translation_source_power - if self.translation_source_power is None - else self.translation_source_power) - - translation_target_power = ( - default_translation_target_power - if self.translation_target_power is None - else self.translation_target_power) - - translation_max_power = ( - default_translation_max_power - if self.translation_max_power is None - else self.translation_max_power) return TranslationCostModel( - p_qbx=p_qbx, - p_fmm=p_fmm, ncoeffs_qbx=ncoeffs_qbx, ncoeffs_fmm=ncoeffs_fmm, - translation_source_power=translation_source_power, - translation_target_power=translation_target_power, - translation_max_power=translation_max_power) + uses_point_and_shoot=uses_point_and_shoot) # }}} -- GitLab From d7ceb7dcb082cbb97cafa5dd095993606ca06864 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 25 Jul 2018 13:58:14 -0500 Subject: [PATCH 046/139] Try inlining the Legendre recurrence for the SLP --- pytential/qbx/target_specific.pyx | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index 7cde3a57..c04cfcfc 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -115,9 +115,10 @@ cdef double tsqbx_from_source( double[3] target, int order) nogil: cdef: - int i + int j double result, r, sc_d, tc_d, cos_angle - double tmp[128] + # Legendre recurrence values + double pj, pjm1, pjm2 tc_d = dist(target, center) sc_d = dist(source, center) @@ -128,14 +129,23 @@ cdef double tsqbx_from_source( 
(target[2] - center[2]) * (source[2] - center[2])) / (tc_d * sc_d)) - legvals(cos_angle, order, tmp, NULL) + if order == 0: + return 1 / sc_d + + pjm2 = 1 + pjm1 = cos_angle - result = 0 - r = 1 / sc_d + result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) + + r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + + for j in range(2, order + 1): + pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j + result += pj * r - for i in range(0, order + 1): - result += tmp[i] * r r *= (tc_d / sc_d) + pjm2 = pjm1 + pjm1 = pj return result -- GitLab From 97e4efe2318181f51d15d5eea7ce89c798974d04 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 26 Jul 2018 02:05:02 -0500 Subject: [PATCH 047/139] Add description of point and shoot costs --- pytential/qbx/performance.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index a1a7e550..16f7522c 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -106,8 +106,11 @@ class TranslationCostModel(object): def e2e_cost(self, nsource_coeffs, ntarget_coeffs): if self.uses_point_and_shoot: return ( + # Rotate the coordinate system to be z axis aligned. nsource_coeffs ** (3 / 2) + + # Translate the expansion along the z axis. nsource_coeffs ** (1 / 2) * ntarget_coeffs + + # Rotate the coordinate system back. 
ntarget_coeffs ** (3 / 2)) return nsource_coeffs * ntarget_coeffs -- GitLab From 10c0b49e34c1def319572f6c4e92b7167930f1ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Fri, 27 Jul 2018 18:44:19 -0400 Subject: [PATCH 048/139] Fix inducer/pytential#103 --- pytential/qbx/performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 16f7522c..221b86fb 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -596,7 +596,7 @@ class PerformanceModel(object): # {{{ evaluate locals - result["eval_locals"] = tree.ntargets * xlat_cost.l2p() + result["eval_locals"] = (tree.ntargets - geo_data.ncenters) * xlat_cost.l2p() # }}} -- GitLab From d19925ef792d254c8312de9fcfa156b183a661a8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 28 Jul 2018 13:33:42 -0500 Subject: [PATCH 049/139] Set the compiler to GCC --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f14be164..d984ba3c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -70,6 +70,7 @@ Python 3.5 Conda Apple: - export CONDA_ENVIRONMENT=.test-conda-env-py3-macos.yml - export PYTEST_ADDOPTS=${PYTEST_ADDOPTS:--k-slowtest} - export REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - export CC=gcc - set -o xtrace - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". 
./build-and-test-py-project-within-miniconda.sh" -- GitLab From bb7129aba5e9cdb4b350f5121cc749356aed2572 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 30 Jul 2018 13:47:09 -0500 Subject: [PATCH 050/139] pylint fixes --- pytential/qbx/performance.py | 37 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 221b86fb..c4245104 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -25,17 +25,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -from six.moves import range -import numpy as np # noqa -import pyopencl as cl # noqa -import pyopencl.array # noqa -import sympy as sp - - -from pymbolic import var -from pytools import log_process from collections import OrderedDict + try: from collections.abc import MutableMapping except ImportError: @@ -43,6 +34,15 @@ except ImportError: from collections import MutableMapping import logging + +import numpy as np +import pyopencl as cl +from six.moves import range +import sympy as sp + +from pytools import log_process +from pymbolic import var + logger = logging.getLogger(__name__) @@ -64,7 +64,8 @@ class TranslationCostModel(object): self.ncoeffs_fmm = ncoeffs_fmm self.uses_point_and_shoot = uses_point_and_shoot - def direct(self): + @staticmethod + def direct(): return var("c_p2p") def p2qbxl(self): @@ -320,7 +321,7 @@ class PerformanceModel(object): len(traversal.target_or_target_parent_boxes), dtype=np.intp) - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + for itgt_box in range(len(traversal.target_or_target_parent_boxes)): start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] nform_local_box = 0 @@ -338,7 +339,8 @@ class PerformanceModel(object): # {{{ collect data about direct interactions with qbx centers - def _collect_qbxl_direct_interaction_data(self, xlat_cost, traversal, tree, + 
@staticmethod + def _collect_qbxl_direct_interaction_data(traversal, tree, global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): # center -> nsources @@ -406,8 +408,8 @@ class PerformanceModel(object): global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): counts = self._collect_qbxl_direct_interaction_data( - xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts) + traversal, tree, global_qbx_centers, qbx_center_to_target_box, + center_to_targets_starts) result = {} result["eval_target_specific_qbx_locals_list1"] = ( @@ -433,8 +435,8 @@ class PerformanceModel(object): qbx_center_to_target_box, center_to_targets_starts): counts = self._collect_qbxl_direct_interaction_data( - xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts) + traversal, tree, global_qbx_centers, qbx_center_to_target_box, + center_to_targets_starts) result = {} result["form_global_qbx_locals_list1"] = ( @@ -496,7 +498,6 @@ class PerformanceModel(object): # {{{ set up translation cost model def get_translation_cost_model(self, d): - from pymbolic import var p_qbx = var("p_qbx") p_fmm = var("p_fmm") -- GitLab From 5fe5766f6f7b4696c68000a3155dff6d311d4730 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 2 Aug 2018 16:01:58 -0500 Subject: [PATCH 051/139] Add a check that the kernel can be used with TSQBX. 
--- pytential/qbx/fmmlib.py | 67 ++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index c4ba6637..9d52719b 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -27,7 +27,9 @@ from pytools import memoize_method, Record import pyopencl as cl # noqa import pyopencl.array # noqa: F401 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler -from sumpy.kernel import LaplaceKernel, HelmholtzKernel +from sumpy.kernel import ( + LaplaceKernel, HelmholtzKernel, AxisTargetDerivative, + DirectionalSourceDerivative) import pytential.qbx.target_specific as target_specific @@ -57,13 +59,13 @@ class QBXFMMLibExpansionWranglerCodeContainer(object): qbx_order, fmm_level_to_order, source_extra_kwargs={}, kernel_extra_kwargs=None, - _use_target_specific_list1=False): + _use_target_specific_qbx=False): return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, - _use_target_specific_list1) + _use_target_specific_qbx) # }}} @@ -133,11 +135,11 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): qbx_order, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, - _use_target_specific_list1=False): + _use_target_specific_qbx=False): self.code = code self.queue = queue - self._use_target_specific_list1 = _use_target_specific_list1 + self._use_target_specific_qbx = _use_target_specific_qbx # FMMLib is CPU-only. This wrapper gets the geometry out of # OpenCL-land. 
@@ -147,36 +149,25 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # {{{ digest out_kernels - from sumpy.kernel import AxisTargetDerivative, DirectionalSourceDerivative + may_use_tsqbx = all( + self.is_supported_helmknl_for_tsqbx(out_knl) + for out_knl in self.code.out_kernels) + + if _use_target_specific_qbx and not may_use_tsqbx: + raise ValueError("TSQBX not supported for supplied kernels") k_names = [] source_deriv_names = [] - def is_supported_helmknl(knl): - if isinstance(knl, DirectionalSourceDerivative): - source_deriv_name = knl.dir_vec_name - knl = knl.inner_kernel - else: - source_deriv_name = None - - if isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3]: - k_names.append(knl.helmholtz_k_name) - source_deriv_names.append(source_deriv_name) - return True - elif isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]: - k_names.append(None) - source_deriv_names.append(source_deriv_name) - return True - - return False - ifgrad = False outputs = [] for out_knl in self.code.out_kernels: - if is_supported_helmknl(out_knl): + if isinstance(knl, DirectionalSourceDerivative): + source_deriv_names.append(knl.dir_vec_name) + if self.is_supported_helmknl(out_knl): outputs.append(()) elif (isinstance(out_knl, AxisTargetDerivative) - and is_supported_helmknl(out_knl.inner_kernel)): + and self.is_supported_helmknl(out_knl.inner_kernel)): outputs.append((out_knl.axis,)) ifgrad = True else: @@ -209,7 +200,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): order="F") def inner_fmm_level_to_nterms(tree, level): - from sumpy.kernel import LaplaceKernel, HelmholtzKernel if helmholtz_k == 0: return fmm_level_to_order( LaplaceKernel(tree.dimensions), @@ -230,6 +220,23 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad=ifgrad) + @staticmethod + def is_supported_helmknl_for_tsqbx(knl): + if isinstance(knl, DirectionalSourceDerivative): + knl = knl.inner_kernel + + return isinstance(knl, LaplaceKernel) and knl.dim == 3 + + 
@staticmethod + def is_supported_helmknl(knl): + if isinstance(knl, DirectionalSourceDerivative): + knl = knl.inner_kernel + + return ( + isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3] + or isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]) + + # {{{ data vector helpers def output_zeros(self): @@ -316,7 +323,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) @return_timing_data def form_global_qbx_locals(self, src_weights): - if self._use_target_specific_list1: + if self._use_target_specific_qbx: return self.qbx_local_expansion_zeros() geo_data = self.geo_data @@ -580,7 +587,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) @return_timing_data def eval_target_specific_qbx_locals(self, src_weights): - if not self._use_target_specific_list1: + if not self._use_target_specific_qbx: return self.full_output_zeros() pot = self.full_output_zeros() @@ -589,8 +596,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ctt = geo_data.center_to_tree_targets() - # TODO: assert this is the Laplace single or double layer kernel - for output in pot: target_specific.eval_target_specific_global_qbx_locals( order=self.qbx_order, -- GitLab From a20e7cc6958e2ed123e9b05bcd1caf770e6c6fca Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 2 Aug 2018 16:02:53 -0500 Subject: [PATCH 052/139] Fix PerformanceModelResult.__repr__() --- pytential/qbx/performance.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index c4245104..e9a83dc9 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -158,7 +158,13 @@ class PerformanceModelResult(MutableMapping): return self.perf_model_result.__str__() def __repr__(self): - return self.perf_model_result.__repr__() + return "".join([ + type(self).__name__, + "(", + repr(self.perf_model_result), + ",", + repr(self.params), + ")"]) def get_predicted_times(self, 
merge_close_lists=False): from pymbolic import evaluate -- GitLab From 2c6257d7738a67c890eb5ecf1209066184eb0c7d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 2 Aug 2018 16:05:44 -0500 Subject: [PATCH 053/139] flake8 fixes --- pytential/qbx/fmmlib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 9d52719b..e7fcf880 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -162,8 +162,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad = False outputs = [] for out_knl in self.code.out_kernels: - if isinstance(knl, DirectionalSourceDerivative): - source_deriv_names.append(knl.dir_vec_name) + if isinstance(out_knl, DirectionalSourceDerivative): + source_deriv_names.append(out_knl.dir_vec_name) if self.is_supported_helmknl(out_knl): outputs.append(()) elif (isinstance(out_knl, AxisTargetDerivative) @@ -236,7 +236,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3] or isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]) - # {{{ data vector helpers def output_zeros(self): -- GitLab From 104a2964c4a55d7c9375cf118b715b04df148d27 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 00:17:37 -0500 Subject: [PATCH 054/139] Address review comments. --- doc/qbx.rst | 5 ++ pytential/qbx/__init__.py | 2 +- pytential/qbx/performance.py | 107 +++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 36 deletions(-) diff --git a/doc/qbx.rst b/doc/qbx.rst index b4969866..3837469f 100644 --- a/doc/qbx.rst +++ b/doc/qbx.rst @@ -28,5 +28,10 @@ Fast multipole driver .. automodule:: pytential.qbx.fmm +Performance Model +----------------- + +.. automodule:: pytential.qbx.performance + .. 
vim: sw=4:tw=75 diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 9553c9c2..6a48dc5a 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -748,7 +748,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: performance_model = self.performance_model - performance_model_result = performance_model(self, geo_data) + performance_model_result = performance_model(geo_data) # {{{ construct dummy outputs diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 17283e13..a3b10205 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -35,11 +35,6 @@ import sympy as sp from pymbolic import var from collections import OrderedDict -try: - from collections.abc import MutableMapping -except ImportError: - # Py 2.7 - from collections import MutableMapping import logging logger = logging.getLogger(__name__) @@ -47,7 +42,7 @@ logger = logging.getLogger(__name__) __doc__ = """ .. autoclass:: PerformanceModel -.. autoclass:: PerformanceModelResult +.. autoclass:: ParametrizedCosts .. autofunction:: estimate_calibration_params """ @@ -115,48 +110,68 @@ class TranslationCostModel(object): # }}} -# {{{ performance model result +# {{{ parameterized costs returned by performance model -class PerformanceModelResult(MutableMapping): - """A container for holding performance model results. +class ParametrizedCosts(object): + """A container for data returned by the performance model. + + This holds both symbolic costs as well as parameter values. To obtain a + prediction of the running time, use :meth:`get_predicted_times`. + + .. attribute:: raw_costs + + A dictionary mapping algorithmic stage names to symbolic costs. + + .. attribute:: params + + A dictionary mapping names of symbolic parameters (such as FMM or QBX + order) to values. + + .. automethod:: copy + .. automethod:: with_params + .. 
automethod:: get_predicted_times """ - def __init__(self, perf_model_result, params): - self.perf_model_result = OrderedDict(perf_model_result) + def __init__(self, raw_costs, params): + self.raw_costs = OrderedDict(raw_costs) self.params = params def with_params(self, params): + """Return a copy of *self* with parameters updated to include *params*.""" new_params = self.params.copy() new_params.update(params) return type(self)( - perf_model_result=self.perf_model_result.copy(), + raw_costs=self.raw_costs.copy(), params=new_params) def copy(self): return self.with_params({}) - def __getitem__(self, val): - return self.perf_model_result.__getitem__(val) - - def __setitem__(self, key, val): - return self.perf_model_result.__setitem__(key, val) - - def __delitem__(self, key): - return self.perf_model_result.__delitem__(key) - - def __iter__(self): - return self.perf_model_result.__iter__() - - def __len__(self): - return self.perf_model_result.__len__() - def __str__(self): - return self.perf_model_result.__str__() + return "".join([ + type(self).__name__, + "(raw_costs=", + str(self.raw_costs), + ", params=", + str(self.params), + ")"]) def __repr__(self): - return self.perf_model_result.__repr__() + return "".join([ + type(self).__name__, + "(raw_costs=", + repr(self.raw_costs), + ", params=", + repr(self.params), + ")"]) def get_predicted_times(self, merge_close_lists=False): + """Return a dictionary mapping stage names to predicted time in seconds. + + :arg merge_close_lists: If *True*, the returned estimate combines + the cost of "close" lists (Lists 1, 3 close, and 4 close). If + *False*, the time of each "close" list is reported separately. 
+ """ from pymbolic import evaluate from functools import partial @@ -164,7 +179,7 @@ class PerformanceModelResult(MutableMapping): result = OrderedDict() - for name, val in self.perf_model_result.items(): + for name, val in self.raw_costs.items(): if merge_close_lists: for suffix in ("_list1", "_list3", "_list4"): if name.endswith(suffix): @@ -181,6 +196,10 @@ class PerformanceModelResult(MutableMapping): # {{{ performance model class PerformanceModel(object): + """ + .. automethod:: with_calibration_params + .. automethod:: __call__ + """ def __init__(self, uses_pde_expansions=True, @@ -193,10 +212,14 @@ class PerformanceModel(object): :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM uses translation operators that make use of the knowledge that the potential satisfies a PDE. + :arg summarize_parallel: a function of two arguments - *(parallel_array, sym_multipliers)* used to process an array of - workloads of 'parallelizable units'. By default, all workloads are - summed into one number encompassing the total workload. + *(parallel_array, sym_multipliers)* used to model the cost after + taking into account parallelization. *parallel_array* represents a + partitioning of the work into elementary (typically box-based) tasks, + each with a given number of operations. *sym_multipliers* is a symbolic + value representing time per modeled operation. By default, all tasks + are summed into one number encompassing the total cost. 
""" self.uses_pde_expansions = uses_pde_expansions self.translation_source_power = translation_source_power @@ -210,6 +233,7 @@ class PerformanceModel(object): self.calibration_params = calibration_params def with_calibration_params(self, calibration_params): + """Return a copy of *self* with a new set of calibration parameters.""" return type(self)( uses_pde_expansions=self.uses_pde_expansions, translation_source_power=self.translation_source_power, @@ -505,11 +529,17 @@ class PerformanceModel(object): # }}} - def __call__(self, lpot_source, geo_data): + def __call__(self, geo_data): + """Analyze the given geometry and return performance data. + + :returns: An instance of :class:`ParametrizedCosts`. + """ # FIXME: This should support target filtering. result = OrderedDict() + lpot_source = geo_data.lpot_source + nqbtl = geo_data.non_qbx_box_target_lists() with cl.CommandQueue(geo_data.cl_context) as queue: @@ -636,7 +666,7 @@ class PerformanceModel(object): # }}} - return PerformanceModelResult(result, params) + return ParametrizedCosts(result, params) # }}} @@ -644,6 +674,13 @@ class PerformanceModel(object): # {{{ calibrate performance model def _collect(expr, variables): + """Collect terms with respect to a list of variables. + + This applies :func:`sympy.simplify.collect` to a :mod:`pymbolic` expression + with respect to the iterable of names in *variables*. + + Returns a dictionary mapping variable names to terms. 
+ """ from pymbolic.interop.sympy import PymbolicToSympyMapper, SympyToPymbolicMapper p2s = PymbolicToSympyMapper() s2p = SympyToPymbolicMapper() -- GitLab From ce1218b6c9a0fbcda629bffadf063bd08f1ae1d6 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 00:22:29 -0500 Subject: [PATCH 055/139] Clarify what parameters mean --- pytential/qbx/performance.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index a3b10205..9d784012 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -124,8 +124,9 @@ class ParametrizedCosts(object): .. attribute:: params - A dictionary mapping names of symbolic parameters (such as FMM or QBX - order) to values. + A dictionary mapping names of symbolic parameters to values. Parameters + appear in *raw_costs* and may include values such as QBX or FMM order + as well as calibration constants. .. automethod:: copy .. automethod:: with_params -- GitLab From d43dc134eb58663e3dfdbd39caa66cfb3b200f7b Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 00:23:17 -0500 Subject: [PATCH 056/139] Model -> model --- doc/qbx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/qbx.rst b/doc/qbx.rst index 3837469f..b640ddcd 100644 --- a/doc/qbx.rst +++ b/doc/qbx.rst @@ -28,7 +28,7 @@ Fast multipole driver .. automodule:: pytential.qbx.fmm -Performance Model +Performance model ----------------- .. automodule:: pytential.qbx.performance -- GitLab From 73de50804e150d1e92e4e9dc45ddc47a2ad87adb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 01:09:16 -0500 Subject: [PATCH 057/139] Fix use of ParametrizedCosts after dropping mapping interface. 
--- pytential/qbx/performance.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 9d784012..2e42203d 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -742,7 +742,12 @@ def estimate_calibration_params(model_results, timing_results): for param in params: context[param] = var(param) - total_modeled_cost = evaluate(sum(model_result.values()), context=context) + # Represents the total modeled cost, but leaves the calibration + # parameters symbolic. + total_modeled_cost = evaluate( + sum(model_result.raw_costs.values()), + context=context) + collected_times = _collect(total_modeled_cost, params) for param, time in collected_times.items(): -- GitLab From 941d0a784db09e09179b6f00e31c8eb3ae8467ff Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 16:07:11 -0500 Subject: [PATCH 058/139] Fix parameter renaming --- pytential/qbx/__init__.py | 2 +- pytential/qbx/fmm.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index c892e132..1b87fc5c 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -821,7 +821,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, kernel_extra_kwargs=kernel_extra_kwargs, - _use_target_specific_list1=self._use_tsqbx) + _use_target_specific_qbx=self._use_tsqbx) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index be85a7f5..635c26ad 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -92,13 +92,13 @@ class QBXSumpyExpansionWranglerCodeContainer(SumpyExpansionWranglerCodeContainer qbx_order, fmm_level_to_order, source_extra_kwargs={}, kernel_extra_kwargs=None, - _use_target_specific_list1=False): + 
_use_target_specific_qbx=False): return QBXExpansionWrangler(self, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, - _use_target_specific_list1) + _use_target_specific_qbx) class QBXExpansionWrangler(SumpyExpansionWrangler): @@ -123,8 +123,8 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), def __init__(self, code_container, queue, geo_data, dtype, qbx_order, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs, - _use_target_specific_list1=False): - if _use_target_specific_list1: + _use_target_specific_qbx=False): + if _use_target_specific_qbx: raise NotImplementedError("Cannot use TSQBX with sumpy yet") SumpyExpansionWrangler.__init__(self, -- GitLab From 545c693f09d65d163615f8a2cedd862db6cbd1cb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 17:00:09 -0500 Subject: [PATCH 059/139] Actually fix out_kernels processing logic in pytential.qbx.fmmlib --- pytential/qbx/fmmlib.py | 43 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index e7fcf880..7f86e1d1 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -149,21 +149,17 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # {{{ digest out_kernels - may_use_tsqbx = all( - self.is_supported_helmknl_for_tsqbx(out_knl) - for out_knl in self.code.out_kernels) - - if _use_target_specific_qbx and not may_use_tsqbx: - raise ValueError("TSQBX not supported for supplied kernels") - - k_names = [] - source_deriv_names = [] - ifgrad = False outputs = [] + source_deriv_names = [] + k_names = [] + for out_knl in self.code.out_kernels: - if isinstance(out_knl, DirectionalSourceDerivative): - source_deriv_names.append(out_knl.dir_vec_name) + if ( + _use_target_specific_qbx + and not self.is_supported_helmknl_for_tsqbx(out_knl)): + raise ValueError("not all kernels passed support TSQBX") + if self.is_supported_helmknl(out_knl): 
outputs.append(()) elif (isinstance(out_knl, AxisTargetDerivative) @@ -171,27 +167,40 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): outputs.append((out_knl.axis,)) ifgrad = True else: - raise NotImplementedError( + raise ValueError( "only the 2/3D Laplace and Helmholtz kernel " "and their derivatives are supported") + source_deriv_names.append(out_knl.dir_vec_name + if isinstance(out_knl, DirectionalSourceDerivative) + else None) + k_names.append(out_knl.helmholtz_k_name + if isinstance(out_knl, HelmholtzKernel) + else None) + + self.outputs = outputs + from pytools import is_single_valued + if not is_single_valued(source_deriv_names): raise ValueError("not all kernels passed are the same in " "whether they represent a source derivative") source_deriv_name = source_deriv_names[0] - self.outputs = outputs - # }}} + if not is_single_valued(k_names): + raise ValueError("not all kernels passed have the same " + "Helmholtz parameter") + + k_name = k_names[0] - from pytools import single_valued - k_name = single_valued(k_names) if k_name is None: helmholtz_k = 0 else: helmholtz_k = kernel_extra_kwargs[k_name] + # }}} + dipole_vec = None if source_deriv_name is not None: dipole_vec = np.array([ -- GitLab From e41e8ca25d4f860332558598aab807d4132314cc Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 3 Aug 2018 17:28:12 -0500 Subject: [PATCH 060/139] Fix helmholtz k name getting --- pytential/qbx/fmmlib.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 7f86e1d1..5a258caa 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -174,8 +174,10 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): source_deriv_names.append(out_knl.dir_vec_name if isinstance(out_knl, DirectionalSourceDerivative) else None) - k_names.append(out_knl.helmholtz_k_name - if isinstance(out_knl, HelmholtzKernel) + + base_knl = out_knl.get_base_kernel() + 
k_names.append(base_knl.helmholtz_k_name + if isinstance(base_knl, HelmholtzKernel) else None) self.outputs = outputs -- GitLab From a2d1d40321b913d230aade2a82d342e9ef0cdf38 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 01:40:43 -0500 Subject: [PATCH 061/139] WIP: Verify performance model with counts returned by ConstantOneExpansionWrangler. --- pytential/qbx/fmm.py | 8 +- pytential/qbx/performance.py | 1 + requirements.txt | 2 +- test/test_performance_model.py | 252 +++++++++++++++++++++++++-------- 4 files changed, 200 insertions(+), 63 deletions(-) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index b1ce86cc..e41aa350 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -374,7 +374,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level -def drive_fmm(expansion_wrangler, src_weights, timing_data=None): +def drive_fmm(expansion_wrangler, src_weights, timing_data=None, traversal=None): """Top-level driver routine for the QBX fast multipole calculation. :arg geo_data: A :class:`QBXFMMGeometryData` instance. 
@@ -392,8 +392,12 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): wrangler = expansion_wrangler geo_data = wrangler.geo_data - traversal = geo_data.traversal() + + if traversal is None: + traversal = geo_data.traversal() + tree = traversal.tree + recorder = TimingRecorder() # Interface guidelines: Attributes of the tree are assumed to be known diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 2e42203d..8f428f4a 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -503,6 +503,7 @@ class PerformanceModel(object): ncoeffs_qbx = p_qbx ** d default_translation_source_power = d default_translation_target_power = d + default_translation_max_power = 0 translation_source_power = ( default_translation_source_power diff --git a/requirements.txt b/requirements.txt index 6d1e4cce..9e58527f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/inducer/modepy git+https://github.com/inducer/pyopencl git+https://github.com/inducer/islpy git+https://github.com/inducer/loopy -git+https://gitlab.tiker.net/inducer/boxtree +git+https://gitlab.tiker.net/inducer/boxtree@move-constant-one-wrangler-to-tools git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy git+https://github.com/inducer/pyfmmlib diff --git a/test/test_performance_model.py b/test/test_performance_model.py index f6f9f53d..cb782466 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -24,6 +24,8 @@ THE SOFTWARE. 
import numpy as np import numpy.linalg as la # noqa + +from boxtree.tools import ConstantOneExpansionWrangler import pyopencl as cl import pyopencl.clmath # noqa import pytest @@ -46,26 +48,22 @@ DEFAULT_LPOT_KWARGS = { "_from_sep_smaller_crit": "static_l2", } -# }}} - - -# {{{ test_timing_data_gathering - -def test_timing_data_gathering(ctx_getter): - pytest.importorskip("pyfmmlib") - - cl_ctx = ctx_getter() - queue = cl.CommandQueue(cl_ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) +def get_lpot_source(queue, dim): from meshmode.discretization import Discretization from meshmode.discretization.poly_element import ( InterpolatoryQuadratureSimplexGroupFactory) target_order = TARGET_ORDER - from meshmode.mesh.generation import starfish, make_curve_mesh - mesh = make_curve_mesh(starfish, np.linspace(0, 1, 1000), order=target_order) + if dim == 2: + from meshmode.mesh.generation import starfish, make_curve_mesh + mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order) + elif dim == 3: + from meshmode.mesh.generation import generate_icosphere + mesh = generate_icosphere(r=1, order=target_order) + else: + raise ValueError("unknown dimension: %d" % dim) pre_density_discr = Discretization( queue.context, mesh, @@ -74,8 +72,7 @@ def test_timing_data_gathering(ctx_getter): lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() lpot_kwargs.update( _expansion_stick_out_factor=TCF, - fmm_order=FMM_ORDER, qbx_order=QBX_ORDER, - fmm_backend="fmmlib", + fmm_order=FMM_ORDER, qbx_order=QBX_ORDER ) from pytential.qbx import QBXLayerPotentialSource @@ -85,16 +82,38 @@ def test_timing_data_gathering(ctx_getter): lpot_source, _ = lpot_source.with_refinement() + return lpot_source + + +def get_density(queue, lpot_source): density_discr = lpot_source.density_discr nodes = density_discr.nodes().with_queue(queue) - sigma = cl.clmath.sin(10 * nodes[0]) + return cl.clmath.sin(10 * nodes[0]) + +def get_bound_slp_op(lpot_source): from sumpy.kernel import LaplaceKernel 
sigma_sym = sym.var("sigma") k_sym = LaplaceKernel(lpot_source.ambient_dim) sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - op_S = bind(lpot_source, sym_op_S) + return bind(lpot_source, sym_op_S) + +# }}} + + +# {{{ test timing data gathering + +def test_timing_data_gathering(ctx_getter): + pytest.importorskip("pyfmmlib") + + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + lpot_source = get_lpot_source(queue, 2) + sigma = get_density(queue, lpot_source) + op_S = get_bound_slp_op(lpot_source) timing_data = {} op_S.eval(queue, dict(sigma=sigma), timing_data=timing_data) @@ -104,54 +123,15 @@ def test_timing_data_gathering(ctx_getter): # }}} -# {{{ test_performance_model +# {{{ test performance model @pytest.mark.parametrize("dim", (2, 3)) def test_performance_model(ctx_getter, dim): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - # {{{ get lpot source - - from meshmode.discretization import Discretization - from meshmode.discretization.poly_element import ( - InterpolatoryQuadratureSimplexGroupFactory) - - target_order = TARGET_ORDER - - if dim == 2: - from meshmode.mesh.generation import starfish, make_curve_mesh - mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order) - elif dim == 3: - from meshmode.mesh.generation import generate_icosphere - mesh = generate_icosphere(r=1, order=target_order) - else: - raise ValueError("unknown dimension: %d" % dim) - - pre_density_discr = Discretization( - queue.context, mesh, - InterpolatoryQuadratureSimplexGroupFactory(target_order)) - - lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() - lpot_kwargs.update( - _expansion_stick_out_factor=TCF, - fmm_order=FMM_ORDER, qbx_order=QBX_ORDER - ) - - from pytential.qbx import QBXLayerPotentialSource - lpot_source = QBXLayerPotentialSource( - pre_density_discr, OVSMP_FACTOR*target_order, - **lpot_kwargs) - - lpot_source, _ = lpot_source.with_refinement() - - # }}} - - # {{{ run 
performance model - - density_discr = lpot_source.density_discr - nodes = density_discr.nodes().with_queue(queue) - sigma = cl.clmath.sin(10 * nodes[0]) + lpot_source = get_lpot_source(queue, dim) + sigma = get_density(queue, lpot_source) from sumpy.kernel import LaplaceKernel sigma_sym = sym.var("sigma") @@ -169,7 +149,159 @@ def test_performance_model(ctx_getter, dim): perf_S_plus_D = op_S_plus_D.get_modeled_performance(queue, sigma=sigma) assert len(perf_S_plus_D) == 2 - # }}} +# }}} + + +# {{{ constant one wrangler + +class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): + + def __init__(self, queue, geo_data): + host_tree = geo_data.tree().get(queue) + ConstantOneExpansionWrangler.__init__(self, host_tree) + + self.geo_data = geo_data + + self.qbx_center_to_target_box = ( + geo_data.qbx_center_to_target_box().get(queue)) + self.qbx_center_to_target_box_source_level = [ + geo_data.qbx_center_to_target_box_source_level(lev).get(queue) + for lev in range(host_tree.nlevels)] + self.global_qbx_centers = geo_data.global_qbx_centers().get(queue) + self.trav = geo_data.traversal().get(queue) + self.center_to_tree_targets = geo_data.center_to_tree_targets().get(queue) + + def output_zeros(self): + non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists() + return np.zeros(non_qbx_box_target_lists.nfiltered_targets) + + def full_output_zeros(self): + from pytools.obj_array import make_obj_array + return make_obj_array([np.zeros(self.tree.ntargets)]) + + def qbx_local_expansion_zeros(self): + return np.zeros(self.geo_data.ncenters) + + def reorder_potentials(self, potentials): + raise NotImplementedError("reorder_potentials should not " + "be called on a QBXExpansionWrangler") + + def form_global_qbx_locals(self, src_weights): + local_exps = self.qbx_local_expansion_zeros() + ops = 0 + + for itgt_center, tgt_icenter in enumerate(self.global_qbx_centers): + itgt_box = self.qbx_center_to_target_box[tgt_icenter] + + start, end = ( + 
self.trav.neighbor_source_boxes_starts[itgt_box:itgt_box + 2]) + + src_sum = 0 + for src_ibox in self.trav.neighbor_source_boxes_lists[start:end]: + src_pslice = self._get_source_slice(src_ibox) + ops += src_pslice.stop - src_pslice.start + src_sum += np.sum(src_weights[src_pslice]) + + local_exps[tgt_icenter] = src_sum + + return local_exps, self.timing_future(ops) + + def translate_box_multipoles_to_qbx_local(self, multipole_exps): + local_exps = self.qbx_local_expansion_zeros() + ops = 0 + for isrc_level, ssn in enumerate(self.trav.from_sep_smaller_by_level): + for tgt_icenter in self.global_qbx_centers: + icontaining_tgt_box = self.qbx_center_to_target_box_source_level[ + isrc_level][tgt_icenter] + + if icontaining_tgt_box == -1: + continue + + start, stop = ( + ssn.starts[icontaining_tgt_box], + ssn.starts[icontaining_tgt_box+1]) + + for src_ibox in ssn.lists[start:stop]: + local_exps[tgt_icenter] += multipole_exps[src_ibox] + ops += 1 + + return local_exps, self.timing_future(ops) + + def translate_box_local_to_qbx_local(self, local_exps): + qbx_expansions = self.qbx_local_expansion_zeros() + ops = 0 + + for tgt_icenter in self.global_qbx_centers: + isrc_box = self.qbx_center_to_target_box[tgt_icenter] + src_ibox = self.trav.target_boxes[isrc_box] + qbx_expansions[tgt_icenter] += local_exps[src_ibox] + ops += 1 + + return qbx_expansions, self.timing_future(ops) + + def eval_qbx_expansions(self, qbx_expansions): + output = self.full_output_zeros() + ops = 0 + for src_icenter in self.global_qbx_centers: + start, end = ( + self.center_to_tree_targets.starts[src_icenter:src_icenter+2]) + for icenter_tgt in range(start, end): + center_itgt = self.center_to_tree_targets.lists[icenter_tgt] + output[0][center_itgt] += qbx_expansions[src_icenter] + ops += 1 + + return output, self.timing_future(ops) + +# }}} + + +# {{{ verify performance model + +@pytest.mark.parametrize("dim", (2, 3)) +def test_performance_model_correctness(ctx_getter, dim): + cl_ctx = ctx_getter() + 
queue = cl.CommandQueue(cl_ctx) + + from pytential.qbx.performance import PerformanceModel + + # We set uses_pde_expansions=False, so that a translation is modeled as + # simply costing nsrc_coeffs * ntgt_coeffs. By adjusting the symbolic + # parameters to equal 1 (done below), this provides a straightforward way + # to obtain the raw operation count for each FMM stage. + lpot_source = get_lpot_source(queue, dim).copy( + performance_model=PerformanceModel(uses_pde_expansions=False)) + + sigma = get_density(queue, lpot_source) + op_S = get_bound_slp_op(lpot_source) + + from pytools import one + perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values()) + # Set all parameters equal to 1, to obtain raw op counts. + perf_S = perf_S.with_params(dict((param, 1) for param in perf_S.params)) + + from pytential.qbx.fmm import drive_fmm + geo_data = lpot_source.qbx_fmm_geometry_data( + target_discrs_and_qbx_sides=((lpot_source.density_discr, +1),)) + + wrangler = ConstantOneQBXExpansionWrangler(queue, geo_data) + + nnodes = lpot_source.quad_stage2_density_discr.nnodes + + src_weights = np.ones(nnodes) + + timing_data = {} + potential = drive_fmm(wrangler, src_weights, timing_data, + traversal=wrangler.trav)[0][geo_data.ncenters:] + + print("potential is", potential) + + # Check constant one wrangler for correctness. + assert (potential == nnodes).all() + + # Check that the performance model matches the timing data returned by the + # constant one wrangler. 
+ print("timing_data", timing_data) + print("perf model", perf_S.raw_costs) # }}} -- GitLab From b061bff8e182a989914dbd569a4bd8226ac32d71 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 02:04:45 -0500 Subject: [PATCH 062/139] Use a torus, which is less boring --- test/test_performance_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index cb782466..28a914cf 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -60,8 +60,8 @@ def get_lpot_source(queue, dim): from meshmode.mesh.generation import starfish, make_curve_mesh mesh = make_curve_mesh(starfish, np.linspace(0, 1, 50), order=target_order) elif dim == 3: - from meshmode.mesh.generation import generate_icosphere - mesh = generate_icosphere(r=1, order=target_order) + from meshmode.mesh.generation import generate_torus + mesh = generate_torus(2, 1, order=target_order) else: raise ValueError("unknown dimension: %d" % dim) -- GitLab From b293fb9cf9434d93b231f1342d83d1344d969181 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 02:05:06 -0500 Subject: [PATCH 063/139] Check for op count equality. 
--- test/test_performance_model.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 28a914cf..35021c8f 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -257,6 +257,24 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): # {{{ verify performance model +CONSTANT_ONE_PARAMS = dict( + p_qbx=1, + p_fmm=1, + c_l2l=1, + c_l2p=1, + c_l2qbxl=1, + c_m2l=1, + c_m2m=1, + c_m2p=1, + c_m2qbxl=1, + c_p2l=1, + c_p2m=1, + c_p2p=1, + c_p2qbxl=1, + c_qbxl2p=1, + ) + + @pytest.mark.parametrize("dim", (2, 3)) def test_performance_model_correctness(ctx_getter, dim): cl_ctx = ctx_getter() @@ -277,7 +295,7 @@ def test_performance_model_correctness(ctx_getter, dim): from pytools import one perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values()) # Set all parameters equal to 1, to obtain raw op counts. - perf_S = perf_S.with_params(dict((param, 1) for param in perf_S.params)) + perf_S = perf_S.with_params(CONSTANT_ONE_PARAMS) from pytential.qbx.fmm import drive_fmm geo_data = lpot_source.qbx_fmm_geometry_data( @@ -293,15 +311,20 @@ def test_performance_model_correctness(ctx_getter, dim): potential = drive_fmm(wrangler, src_weights, timing_data, traversal=wrangler.trav)[0][geo_data.ncenters:] - print("potential is", potential) - # Check constant one wrangler for correctness. assert (potential == nnodes).all() + modeled_time = perf_S.get_predicted_times(merge_close_lists=True) + # Check that the performance model matches the timing data returned by the # constant one wrangler. 
- print("timing_data", timing_data) - print("perf model", perf_S.raw_costs) + mismatches = [] + for stage in timing_data: + if timing_data[stage].process_elapsed != modeled_time[stage]: + mismatches.append( + (stage, timing_data[stage].process_elapsed, modeled_time[stage])) + + assert not mismatches, str("\n".join(str(s) for s in mismatches)) # }}} -- GitLab From c2ee6505d3d2a7c38f7be4607bf9603537c23768 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 15:38:03 -0500 Subject: [PATCH 064/139] Implement _get_target_slice --- test/test_performance_model.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 35021c8f..bf399e4b 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -170,6 +170,14 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): self.global_qbx_centers = geo_data.global_qbx_centers().get(queue) self.trav = geo_data.traversal().get(queue) self.center_to_tree_targets = geo_data.center_to_tree_targets().get(queue) + self.non_qbx_box_target_lists = ( + geo_data.non_qbx_box_target_lists().get(queue)) + + def _get_target_slice(self, ibox): + pstart = self.non_qbx_box_target_lists.box_target_starts[ibox] + return slice( + pstart, pstart + + self.non_qbx_box_target_lists.box_target_counts_nonchild[ibox]) def output_zeros(self): non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists() @@ -324,7 +332,7 @@ def test_performance_model_correctness(ctx_getter, dim): mismatches.append( (stage, timing_data[stage].process_elapsed, modeled_time[stage])) - assert not mismatches, str("\n".join(str(s) for s in mismatches)) + assert not mismatches, "\n".join(str(s) for s in mismatches) # }}} -- GitLab From 9eeb7040dc66c9e8826f53212f1eadee1ef8d52c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 17:42:58 -0500 Subject: [PATCH 065/139] Fix performance model --- pytential/qbx/performance.py | 34 
++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 8f428f4a..66af69db 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -247,6 +247,31 @@ class PerformanceModel(object): def summarize_parallel_default(parallel_array, sym_multipliers): return np.sum(parallel_array) * sym_multipliers + # {{{ propagate multipoles upward + + def process_coarsen_multipoles(self, xlat_cost, tree, traversal): + nmultipoles = 0 + + # nlevels-1 is the last valid level index + # nlevels-2 is the last valid level that could have children + # + # 3 is the last relevant source_level. + # 2 is the last relevant target_level. + # (because no level 1 box will be well-separated from another) + for source_level in range(tree.nlevels-1, 2, -1): + target_level = source_level - 1 + start, stop = traversal.level_start_source_parent_box_nrs[ + target_level:target_level+2] + for ibox in traversal.source_parent_boxes[start:stop]: + for child in tree.box_child_ids[:, ibox]: + if child: + nmultipoles += 1 + + return dict(coarsen_multipoles=( + self.summarize_parallel(nmultipoles, xlat_cost.m2m()))) + + # }}} + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) def process_direct(self, xlat_cost, traversal, tree, box_target_counts_nonchild): @@ -573,7 +598,7 @@ class PerformanceModel(object): # {{{ propagate multipoles upward - result["coarsen_multipoles"] = tree.nboxes * xlat_cost.m2m() + result.update(self.process_coarsen_multipoles(xlat_cost, tree, traversal)) # }}} @@ -605,13 +630,14 @@ class PerformanceModel(object): # {{{ propagate local_exps downward - result["refine_locals"] = tree.nboxes * xlat_cost.l2l() + result["refine_locals"] = ( + traversal.ntarget_or_target_parent_boxes * xlat_cost.l2l()) # }}} # {{{ evaluate locals - result["eval_locals"] = tree.ntargets * xlat_cost.l2p() + result["eval_locals"] = nqbtl.nfiltered_targets * 
xlat_cost.l2p() # }}} @@ -657,7 +683,7 @@ class PerformanceModel(object): # {{{ translate from box local expansions to qbx local expansions result["translate_box_local_to_qbx_local"] = ( - geo_data.ncenters * xlat_cost.l2qbxl()) + len(global_qbx_centers) * xlat_cost.l2qbxl()) # }}} -- GitLab From ecfc80916a1ff95828586af068489dc73b86959c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 17:46:01 -0500 Subject: [PATCH 066/139] Point conda requirements.txt to correct boxtree branch --- .test-conda-env-py3-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test-conda-env-py3-requirements.txt b/.test-conda-env-py3-requirements.txt index fa6c0426..aa1f7e42 100644 --- a/.test-conda-env-py3-requirements.txt +++ b/.test-conda-env-py3-requirements.txt @@ -1,4 +1,4 @@ -git+https://gitlab.tiker.net/inducer/boxtree +git+https://gitlab.tiker.net/inducer/boxtree@move-constant-one-wrangler-to-tools git+https://github.com/inducer/pymbolic git+https://github.com/inducer/loopy git+https://gitlab.tiker.net/inducer/sumpy -- GitLab From a4aa38163963ecae6eef6e1f2bfc1680ab2439a0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 7 Aug 2018 23:34:46 -0500 Subject: [PATCH 067/139] Slightly improve performance of single-layer slp. --- pytential/qbx/target_specific.pyx | 9 +++++++-- setup.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx index c04cfcfc..a32c0a86 100644 --- a/pytential/qbx/target_specific.pyx +++ b/pytential/qbx/target_specific.pyx @@ -118,7 +118,7 @@ cdef double tsqbx_from_source( int j double result, r, sc_d, tc_d, cos_angle # Legendre recurrence values - double pj, pjm1, pjm2 + double pj, pjm1, pjm2, jj tc_d = dist(target, center) sc_d = dist(source, center) @@ -139,11 +139,16 @@ cdef double tsqbx_from_source( r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + # Invariant: jj == j. 
Using a double-precision copy of j avoids an + # int-to-double conversion inside the loop. + jj = 2 + for j in range(2, order + 1): - pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j + pj = ( (2.*jj-1.)*cos_angle*pjm1-(jj-1.)*pjm2 ) / jj result += pj * r r *= (tc_d / sc_d) + jj += 1 pjm2 = pjm1 pjm1 = pj diff --git a/setup.py b/setup.py index c9e0d161..86108abc 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ ext_modules = [ Extension( "pytential.qbx.target_specific", ["pytential/qbx/target_specific.pyx"], - extra_compile_args=["-fopenmp", "-ffast-math"], + extra_compile_args=["-fopenmp", "-Ofast", "-march=native"], extra_link_args=["-fopenmp"] ) ] -- GitLab From a2d28250e90d0e393a2a39628771e75270b56c24 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 9 Aug 2018 17:21:35 -0500 Subject: [PATCH 068/139] [ci skip] Add test for Helmholtz TSQBX, single and double layer --- test/test_target_specific_qbx.py | 53 +++++++++++++++++++------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index a2b9317a..b19d9fb3 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -26,7 +26,7 @@ THE SOFTWARE. 
import numpy as np import numpy.linalg as la # noqa import pyopencl as cl -import pyopencl.clmath # noqa +import pyopencl.clmath as clmath import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) @@ -36,22 +36,24 @@ from meshmode.mesh.generation import ( # noqa ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, NArmedStarfish, make_curve_mesh) -# from sumpy.visualization import FieldPlotter + from pytential import bind, sym, norm # noqa -from sumpy.kernel import LaplaceKernel, HelmholtzKernel # noqa +from sumpy.kernel import LaplaceKernel, HelmholtzKernel import logging logger = logging.getLogger(__name__) @pytest.mark.parametrize("op", ["S", "D"]) -def test_target_specific_qbx(ctx_getter, op): +@pytest.mark.parametriz("helmholtz_k", [0, 1.2]) +def test_target_specific_qbx(ctx_getter, op, helmholtz_k): logging.basicConfig(level=logging.INFO) cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - target_order = 4 + target_order = 8 + fmm_tol = 1e-5 from meshmode.mesh.generation import generate_icosphere mesh = generate_icosphere(1, target_order) @@ -61,47 +63,54 @@ def test_target_specific_qbx(ctx_getter, op): InterpolatoryQuadratureSimplexGroupFactory from pytential.qbx import QBXLayerPotentialSource pre_density_discr = Discretization( - cl_ctx, mesh, - InterpolatoryQuadratureSimplexGroupFactory(target_order)) + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder + + refiner_extra_kwargs = {} + + if helmholtz_k != 0: + refiner_extra_kwargs["kernel_length_scale"] = 5 / helmholtz_k qbx, _ = QBXLayerPotentialSource( pre_density_discr, 4*target_order, qbx_order=5, - fmm_order=10, + fmm_level_to_order=SimpleExpansionOrderFinder(fmm_tol), fmm_backend="fmmlib", _expansions_in_tree_have_extent=True, _expansion_stick_out_factor=0.9, - ).with_refinement() + ).with_refinement(**refiner_extra_kwargs) 
density_discr = qbx.density_discr - nodes_host = density_discr.nodes().get(queue) - center = np.array([3, 1, 2]) - diff = nodes_host - center[:, np.newaxis] + nodes = density_discr.nodes().with_queue(queue) + u_dev = clmath.sin(nodes[0]) - dist_squared = np.sum(diff**2, axis=0) - dist = np.sqrt(dist_squared) - u = 1/dist - - u_dev = cl.array.to_device(queue, u) + if helmholtz_k == 0: + kernel = LaplaceKernel(3) + kernel_kwargs = {} + else: + kernel = HelmholtzKernel(3) + kernel_kwargs = {"k": sym.var("k")} - kernel = LaplaceKernel(3) u_sym = sym.var("u") if op == "S": op = sym.S elif op == "D": op = sym.D - expr = op(kernel, u_sym, qbx_forced_limit=-1) + + expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs) bound_op = bind(qbx, expr) - slp_ref = bound_op(queue, u=u_dev) + pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k) qbx = qbx.copy(_use_tsqbx=True) bound_op = bind(qbx, expr) - slp_tsqbx = bound_op(queue, u=u_dev) + pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k) - assert (np.max(np.abs(slp_ref.get() - slp_tsqbx.get()))) < 1e-13 + assert (np.max(np.abs(pot_ref.get() - pot_tsqbx.get()))) < 1e-13 # You can test individual routines by typing -- GitLab From 704d4202a5ec25da272bba48df978e72fa8f18a7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 9 Aug 2018 22:37:42 -0500 Subject: [PATCH 069/139] [WIP] Add Fortran subroutines for modified spherical Bessel functions. --- pytential/qbx/jfuns3d.f90 | 226 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 pytential/qbx/jfuns3d.f90 diff --git a/pytential/qbx/jfuns3d.f90 b/pytential/qbx/jfuns3d.f90 new file mode 100644 index 00000000..03fa6ef8 --- /dev/null +++ b/pytential/qbx/jfuns3d.f90 @@ -0,0 +1,226 @@ +!! Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas +!! Contact: greengard@cims.nyu.edu +!! +!! This software is being released under a modified FreeBSD license +!! (see COPYING in home directory). 
+!********************************************************************** + +! $Date: 2011-07-15 16:28:31 -0400 (Fri, 15 Jul 2011) $ +! $Revision: 2253 $ + + +! Computation of spherical Bessel functions via recurrence + +!********************************************************************** + subroutine jfuns3d(ier,nterms,z,scale,fjs,ifder,fjder, & + lwfjs,iscale,ntop) + implicit none + integer :: ier,nterms,ifder,lwfjs,ntop,i,ncntr + real *8 :: scale,d0,d1,dc1,dc2,dcoef,dd,done,tiny,zero + real *8 :: scalinv,sctot,upbound,upbound2,upbound2inv +!********************************************************************** + +! PURPOSE: + +! This subroutine evaluates the first NTERMS spherical Bessel +! functions and if required, their derivatives. +! It incorporates a scaling parameter SCALE so that + +! fjs_n(z)=j_n(z)/SCALE^n +! fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} + +! NOTE: The scaling parameter SCALE is meant to be used when +! abs(z) < 1, in which case we recommend setting +! SCALE = abs(z). This prevents the fjs_n from +! underflowing too rapidly. +! Otherwise, set SCALE=1. +! Do not set SCALE = abs(z) if z could take on the +! value zero. +! In an FMM, when forming an expansion from a collection of +! sources, set SCALE = min( abs(k*r), 1) +! where k is the Helmholtz parameter and r is the box dimension +! at the relevant level. + +! INPUT: + +! nterms (integer): order of expansion of output array fjs +! z (complex *16): argument of the spherical Bessel functions +! scale (real *8) : scaling factor (discussed above) +! ifder (integer): flag indicating whether to calculate "fjder" +! 0 NO +! 1 YES +! lwfjs (integer): upper limit of input arrays +! fjs(0:lwfjs) and iscale(0:lwfjs) +! iscale (integer): integer workspace used to keep track of +! internal scaling + +! OUTPUT: + +! ier (integer): error return code +! ier=0 normal return; +! ier=8 insufficient array dimension lwfjs +! fjs (complex *16): array of scaled Bessel functions. +! 
fjder (complex *16): array of derivs of scaled Bessel functions. +! ntop (integer) : highest index in arrays fjs that is nonzero + +! NOTE, that fjs and fjder arrays must be at least (nterms+2) +! complex *16 elements long. + + + integer :: iscale(0:lwfjs) + complex *16 :: wavek,fjs(0:lwfjs),fjder(0:*) + complex *16 :: z,zinv,com,fj0,fj1,zscale,ztmp + + data upbound/1.0d+32/, upbound2/1.0d+40/, upbound2inv/1.0d-40/ + data tiny/1.0d-200/,done/1.0d0/,zero/0.0d0/ + +! ... Initializing ... + + ier=0 + +! set to asymptotic values if argument is sufficiently small + + if (abs(z) < tiny) then + fjs(0) = done + do i = 1, nterms + fjs(i) = zero + enddo + + if (ifder == 1) then + do i=0,nterms + fjder(i)=zero + enddo + fjder(1)=done/(3*scale) + endif + + RETURN + endif + +! ... Step 1: recursion up to find ntop, starting from nterms + + ntop=0 + zinv=done/z + fjs(nterms)=done + fjs(nterms-1)=zero + + do i=nterms,lwfjs + dcoef=2*i+done + ztmp=dcoef*zinv*fjs(i)-fjs(i-1) + fjs(i+1)=ztmp + + dd = dreal(ztmp)**2 + dimag(ztmp)**2 + if (dd > upbound2) then + ntop=i+1 + exit + endif + enddo + if (ntop == 0) then + ier=8 + return + endif + +! ... Step 2: Recursion back down to generate the unscaled jfuns: +! if magnitude exceeds UPBOUND2, rescale and continue the +! recursion (saving the order at which rescaling occurred +! in array iscale. + + do i=0,ntop + iscale(i)=0 + enddo + + fjs(ntop)=zero + fjs(ntop-1)=done + do i=ntop-1,1,-1 + dcoef=2*i+done + ztmp=dcoef*zinv*fjs(i)-fjs(i+1) + fjs(i-1)=ztmp + + dd = dreal(ztmp)**2 + dimag(ztmp)**2 + if (dd > UPBOUND2) then + fjs(i) = fjs(i)*UPBOUND2inv + fjs(i-1) = fjs(i-1)*UPBOUND2inv + iscale(i) = 1 + endif + enddo + +! ... Step 3: go back up to the top and make sure that all +! Bessel functions are scaled by the same factor +! (i.e. the net total of times rescaling was invoked +! on the way down in the previous loop). +! At the same time, add scaling to fjs array. 
+ + ncntr=0 + scalinv=done/scale + sctot = 1.0d0 + do i=1,ntop + sctot = sctot*scalinv + if(iscale(i-1) == 1) sctot=sctot*UPBOUND2inv + fjs(i)=fjs(i)*sctot + enddo + +! ... Determine the normalization parameter: + + fj0=sin(z)*zinv + fj1=fj0*zinv-cos(z)*zinv + + d0=abs(fj0) + d1=abs(fj1) + if (d1 > d0) then + zscale=fj1/(fjs(1)*scale) + else + zscale=fj0/fjs(0) + endif + +! ... Scale the jfuns by zscale: + + ztmp=zscale + do i=0,nterms + fjs(i)=fjs(i)*ztmp + enddo + +! ... Finally, calculate the derivatives if desired: + + if (ifder == 1) then + fjs(nterms+1)=fjs(nterms+1)*ztmp + + fjder(0)=-fjs(1)*scale + do i=1,nterms + dc1=i/(2*i+done) + dc2=done-dc1 + dc1=dc1*scalinv + dc2=dc2*scale + fjder(i)=dc1*fjs(i-1)-dc2*fjs(i+1) + enddo + endif + return + end subroutine jfuns3d + + ! void c_jfuns3( + ! int *ier, + ! int nterms, + ! complex double z, + ! double scale, + ! complex double *fjs, + ! int ifder, + ! complex double *fjder, + ! int lwfjs, + ! int *iscale, + ! int *ntop) + subroutine c_jfuns3d(ier, nterms, z, scale, fjs, ifder, fjder, lwfjs,& + & iscale, ntop) bind(c) + use iso_c_binding + implicit none + integer (c_int) :: ier + integer (c_int), value :: nterms + complex (c_double_complex), value :: z + real (c_double), value :: scale + complex (c_double_complex) :: fjs(0:lwfjs) + integer (c_int), value :: ifder + complex (c_double_complex) :: fjder(0:*) + integer (c_int), value :: lwfjs + integer (c_int) :: iscale(0:lwfjs) + integer (c_int) :: ntop + + call jfuns3d(ier, nterms, z, scale, fjs, ifder, fjder, lwfjs, iscale,& + & ntop) + end subroutine c_jfuns3d -- GitLab From bd16f0033fc68917359964f17d0858916ee293f2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Aug 2018 14:47:54 -0500 Subject: [PATCH 070/139] Use an F2C-based approach for porting FMMLIB code. 
--- pytential/qbx/jfuns3d.f90 | 226 ---------------- pytential/qbx/target_specific/_internal.pyx | 284 ++++++++++++++++++++ pytential/qbx/target_specific/cdjseval3d.c | 237 ++++++++++++++++ 3 files changed, 521 insertions(+), 226 deletions(-) delete mode 100644 pytential/qbx/jfuns3d.f90 create mode 100644 pytential/qbx/target_specific/_internal.pyx create mode 100644 pytential/qbx/target_specific/cdjseval3d.c diff --git a/pytential/qbx/jfuns3d.f90 b/pytential/qbx/jfuns3d.f90 deleted file mode 100644 index 03fa6ef8..00000000 --- a/pytential/qbx/jfuns3d.f90 +++ /dev/null @@ -1,226 +0,0 @@ -!! Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas -!! Contact: greengard@cims.nyu.edu -!! -!! This software is being released under a modified FreeBSD license -!! (see COPYING in home directory). -!********************************************************************** - -! $Date: 2011-07-15 16:28:31 -0400 (Fri, 15 Jul 2011) $ -! $Revision: 2253 $ - - -! Computation of spherical Bessel functions via recurrence - -!********************************************************************** - subroutine jfuns3d(ier,nterms,z,scale,fjs,ifder,fjder, & - lwfjs,iscale,ntop) - implicit none - integer :: ier,nterms,ifder,lwfjs,ntop,i,ncntr - real *8 :: scale,d0,d1,dc1,dc2,dcoef,dd,done,tiny,zero - real *8 :: scalinv,sctot,upbound,upbound2,upbound2inv -!********************************************************************** - -! PURPOSE: - -! This subroutine evaluates the first NTERMS spherical Bessel -! functions and if required, their derivatives. -! It incorporates a scaling parameter SCALE so that - -! fjs_n(z)=j_n(z)/SCALE^n -! fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} - -! NOTE: The scaling parameter SCALE is meant to be used when -! abs(z) < 1, in which case we recommend setting -! SCALE = abs(z). This prevents the fjs_n from -! underflowing too rapidly. -! Otherwise, set SCALE=1. -! Do not set SCALE = abs(z) if z could take on the -! value zero. -! 
In an FMM, when forming an expansion from a collection of -! sources, set SCALE = min( abs(k*r), 1) -! where k is the Helmholtz parameter and r is the box dimension -! at the relevant level. - -! INPUT: - -! nterms (integer): order of expansion of output array fjs -! z (complex *16): argument of the spherical Bessel functions -! scale (real *8) : scaling factor (discussed above) -! ifder (integer): flag indicating whether to calculate "fjder" -! 0 NO -! 1 YES -! lwfjs (integer): upper limit of input arrays -! fjs(0:lwfjs) and iscale(0:lwfjs) -! iscale (integer): integer workspace used to keep track of -! internal scaling - -! OUTPUT: - -! ier (integer): error return code -! ier=0 normal return; -! ier=8 insufficient array dimension lwfjs -! fjs (complex *16): array of scaled Bessel functions. -! fjder (complex *16): array of derivs of scaled Bessel functions. -! ntop (integer) : highest index in arrays fjs that is nonzero - -! NOTE, that fjs and fjder arrays must be at least (nterms+2) -! complex *16 elements long. - - - integer :: iscale(0:lwfjs) - complex *16 :: wavek,fjs(0:lwfjs),fjder(0:*) - complex *16 :: z,zinv,com,fj0,fj1,zscale,ztmp - - data upbound/1.0d+32/, upbound2/1.0d+40/, upbound2inv/1.0d-40/ - data tiny/1.0d-200/,done/1.0d0/,zero/0.0d0/ - -! ... Initializing ... - - ier=0 - -! set to asymptotic values if argument is sufficiently small - - if (abs(z) < tiny) then - fjs(0) = done - do i = 1, nterms - fjs(i) = zero - enddo - - if (ifder == 1) then - do i=0,nterms - fjder(i)=zero - enddo - fjder(1)=done/(3*scale) - endif - - RETURN - endif - -! ... Step 1: recursion up to find ntop, starting from nterms - - ntop=0 - zinv=done/z - fjs(nterms)=done - fjs(nterms-1)=zero - - do i=nterms,lwfjs - dcoef=2*i+done - ztmp=dcoef*zinv*fjs(i)-fjs(i-1) - fjs(i+1)=ztmp - - dd = dreal(ztmp)**2 + dimag(ztmp)**2 - if (dd > upbound2) then - ntop=i+1 - exit - endif - enddo - if (ntop == 0) then - ier=8 - return - endif - -! ... 
Step 2: Recursion back down to generate the unscaled jfuns: -! if magnitude exceeds UPBOUND2, rescale and continue the -! recursion (saving the order at which rescaling occurred -! in array iscale. - - do i=0,ntop - iscale(i)=0 - enddo - - fjs(ntop)=zero - fjs(ntop-1)=done - do i=ntop-1,1,-1 - dcoef=2*i+done - ztmp=dcoef*zinv*fjs(i)-fjs(i+1) - fjs(i-1)=ztmp - - dd = dreal(ztmp)**2 + dimag(ztmp)**2 - if (dd > UPBOUND2) then - fjs(i) = fjs(i)*UPBOUND2inv - fjs(i-1) = fjs(i-1)*UPBOUND2inv - iscale(i) = 1 - endif - enddo - -! ... Step 3: go back up to the top and make sure that all -! Bessel functions are scaled by the same factor -! (i.e. the net total of times rescaling was invoked -! on the way down in the previous loop). -! At the same time, add scaling to fjs array. - - ncntr=0 - scalinv=done/scale - sctot = 1.0d0 - do i=1,ntop - sctot = sctot*scalinv - if(iscale(i-1) == 1) sctot=sctot*UPBOUND2inv - fjs(i)=fjs(i)*sctot - enddo - -! ... Determine the normalization parameter: - - fj0=sin(z)*zinv - fj1=fj0*zinv-cos(z)*zinv - - d0=abs(fj0) - d1=abs(fj1) - if (d1 > d0) then - zscale=fj1/(fjs(1)*scale) - else - zscale=fj0/fjs(0) - endif - -! ... Scale the jfuns by zscale: - - ztmp=zscale - do i=0,nterms - fjs(i)=fjs(i)*ztmp - enddo - -! ... Finally, calculate the derivatives if desired: - - if (ifder == 1) then - fjs(nterms+1)=fjs(nterms+1)*ztmp - - fjder(0)=-fjs(1)*scale - do i=1,nterms - dc1=i/(2*i+done) - dc2=done-dc1 - dc1=dc1*scalinv - dc2=dc2*scale - fjder(i)=dc1*fjs(i-1)-dc2*fjs(i+1) - enddo - endif - return - end subroutine jfuns3d - - ! void c_jfuns3( - ! int *ier, - ! int nterms, - ! complex double z, - ! double scale, - ! complex double *fjs, - ! int ifder, - ! complex double *fjder, - ! int lwfjs, - ! int *iscale, - ! 
int *ntop) - subroutine c_jfuns3d(ier, nterms, z, scale, fjs, ifder, fjder, lwfjs,& - & iscale, ntop) bind(c) - use iso_c_binding - implicit none - integer (c_int) :: ier - integer (c_int), value :: nterms - complex (c_double_complex), value :: z - real (c_double), value :: scale - complex (c_double_complex) :: fjs(0:lwfjs) - integer (c_int), value :: ifder - complex (c_double_complex) :: fjder(0:*) - integer (c_int), value :: lwfjs - integer (c_int) :: iscale(0:lwfjs) - integer (c_int) :: ntop - - call jfuns3d(ier, nterms, z, scale, fjs, ifder, fjder, lwfjs, iscale,& - & ntop) - end subroutine c_jfuns3d diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx new file mode 100644 index 00000000..68696f32 --- /dev/null +++ b/pytential/qbx/target_specific/_internal.pyx @@ -0,0 +1,284 @@ +#!python +#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + +import numpy as np +import cython +import cython.parallel + +from libc.math cimport sqrt +from libc.stdio cimport printf + +cimport openmp + + +cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: + """Compute the values of the Legendre polynomial up to order n at x. + Optionally, if derivs is non-NULL, compute the values of the derivative too. + + Borrowed from fmmlib. 
+ """ + cdef: + double pj, derj, pjm2, pjm1, derjm2, derjm1 + int j + + pjm2 = 1 + pjm1 = x + + vals[0] = 1 + if derivs != NULL: + derivs[0] = 0 + derjm2 = 0 + derjm1 = 1 + + if n == 0: + return + + vals[1] = x + if derivs != NULL: + derivs[1] = 1 + + if n == 1: + return + + for j in range(2, n + 1): + pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j + vals[j] = pj + + if derivs != NULL: + derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2 + derj = derj / j + derivs[j] = derj + derjm2 = derjm1 + derjm1 = derj + + pjm2 = pjm1 + pjm1 = pj + + +cdef double dist(double[3] a, double[3] b) nogil: + return sqrt( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2])) + + +cdef void tsqbx_grad_from_source( + double[3] source, + double[3] center, + double[3] target, + double[3] grad, + int order) nogil: + cdef: + int i, j + double result, sc_d, tc_d, cos_angle, alpha, R + double[128] tmp + double[128] derivs + double[3] cms + double[3] tmc + + for j in range(3): + cms[j] = center[j] - source[j] + tmc[j] = target[j] - center[j] + grad[j] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + alpha = ( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + + cos_angle = alpha / (tc_d * sc_d) + + legvals(cos_angle, order, tmp, derivs) + + R = 1 / sc_d + + for i in range(0, order + 1): + # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) + for j in range(3): + grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i] + for j in range(3): + # Siegel and Tornberg has a sign flip here :( + grad[j] += ( + tmc[j] / (tc_d * sc_d) + + alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] + R *= (tc_d / sc_d) + + return + + +cdef double tsqbx_from_source( + double[3] source, + double[3] center, + double[3] target, + int order) nogil: + cdef: + int j + double result, r, sc_d, tc_d, cos_angle + # Legendre recurrence values + double pj, 
pjm1, pjm2 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + / (tc_d * sc_d)) + + if order == 0: + return 1 / sc_d + + pjm2 = 1 + pjm1 = cos_angle + + result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) + + r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + + for j in range(2, order + 1): + pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j + result += pj * r + + r *= (tc_d / sc_d) + pjm2 = pjm1 + pjm1 = pj + + return result + + +cdef double tsqbx_helmholtz_from_source( + double[3] source, + double[3] center, + double[3] target, + double k, + int order) nogil: + cdef: + int j + double result, r, sc_d, tc_d, cos_angle + # Legendre recurrence values + double pj, pjm1, pjm2 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + / (tc_d * sc_d)) + + if order == 0: + return 1 / sc_d + + pjm2 = 1 + pjm1 = cos_angle + + result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) + + r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + + for j in range(2, order + 1): + pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j + result += pj * r + + r *= (tc_d / sc_d) + pjm2 = pjm1 + pjm1 = pj + + return result + + +def eval_target_specific_global_qbx_locals( + int order, + double[:,:] sources, + double[:,:] targets, + double[:,:] centers, + int[:] global_qbx_centers, + int[:] qbx_center_to_target_box, + int[:] center_to_target_starts, int[:] center_to_target_lists, + int[:] source_box_starts, int[:] source_box_lists, + int[:] box_source_starts, int[:] box_source_counts_nonchild, + double[:] dipstr, + double[:,:] dipvec, + double complex[:] pot): + + cdef: + int tgt, ictr, ctr + int itgt, itgt_start, itgt_end + int tgt_box, src_ibox 
+ int isrc_box, isrc_box_start, isrc_box_end + int isrc, isrc_start, isrc_end + int i, tid + double result + double[:,:] source, center, target, grad + int slp, dlp + + slp = (dipstr is not None) and (dipvec is None) + dlp = (dipstr is not None) and (dipvec is not None) + + print("Hi from Cython") + + if not (slp or dlp): + raise ValueError("should specify exactly one of src_weights or dipvec") + + # Hack to obtain thread-local storage + maxthreads = openmp.omp_get_max_threads() + + # Prevent false sharing by over-allocating the buffers + source = np.zeros((maxthreads, 65)) + target = np.zeros((maxthreads, 65)) + center = np.zeros((maxthreads, 65)) + grad = np.zeros((maxthreads, 65)) + + # TODO: Check if order > 256 + + for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0], + nogil=True, schedule="static", + chunksize=128): + ctr = global_qbx_centers[ictr] + itgt_start = center_to_target_starts[ctr] + itgt_end = center_to_target_starts[ctr + 1] + tgt_box = qbx_center_to_target_box[ctr] + tid = cython.parallel.threadid() + + for i in range(3): + center[tid, i] = centers[i, ctr] + + for itgt in range(itgt_start, itgt_end): + result = 0 + tgt = center_to_target_lists[itgt] + + for i in range(3): + target[tid, i] = targets[i, tgt] + + isrc_box_start = source_box_starts[tgt_box] + isrc_box_end = source_box_starts[tgt_box + 1] + + for isrc_box in range(isrc_box_start, isrc_box_end): + src_ibox = source_box_lists[isrc_box] + isrc_start = box_source_starts[src_ibox] + isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for isrc in range(isrc_start, isrc_end): + for i in range(3): + source[tid, i] = sources[i, isrc] + + if slp: + # Don't replace with +=, since that makes Cython think + # it is a reduction. 
+ result = result + dipstr[isrc] * ( + tsqbx_from_source(&source[tid, 0], ¢er[tid, 0], + &target[tid, 0], order)) + elif dlp: + tsqbx_grad_from_source(&source[tid, 0], ¢er[tid, 0], + &target[tid, 0], &grad[tid, 0], order) + result = result + dipstr[isrc] * ( + grad[tid, 0] * dipvec[0, isrc] + + grad[tid, 1] * dipvec[1, isrc] + + grad[tid, 2] * dipvec[2, isrc]) + + pot[tgt] = pot[tgt] + result diff --git a/pytential/qbx/target_specific/cdjseval3d.c b/pytential/qbx/target_specific/cdjseval3d.c new file mode 100644 index 00000000..cba25fc8 --- /dev/null +++ b/pytential/qbx/target_specific/cdjseval3d.c @@ -0,0 +1,237 @@ +/* cdjseval3d.f -- translated by f2c (version 20160102). +New version +*/ + +#include "f2c.h" + +/* c Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ +/* c Contact: greengard@cims.nyu.edu */ +/* c */ +/* c This software is being released under a modified FreeBSD license */ +/* c (see COPYING in home directory). */ +/* cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ + +/* $Date: 2011-07-15 16:28:31 -0400 (Fri, 15 Jul 2011) $ */ +/* $Revision: 2253 $ */ + + +/* Computation of spherical Bessel functions via recurrence */ + +/* ********************************************************************** */ +/* Subroutine */ int jfuns3d_(integer *ier, integer *nterms, doublecomplex * + z__, doublereal *scale, doublecomplex *fjs, integer *ifder, + doublecomplex *fjder, integer *lwfjs, integer *iscale, integer *ntop) +{ + /* Initialized data */ + + static doublereal upbound2 = 1e40; + static doublereal upbound2inv = 1e-40; + static doublereal tiny = 1e-200; + static doublereal done = 1.; + static doublereal zero = 0.; + + /* System generated locals */ + integer i__1; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer i__; + doublereal d0, d1, dd, dc1, dc2; + doublecomplex fj0, fj1, zinv, ztmp; + doublereal dcoef; + doublereal sctot; + doublecomplex zscale; + doublereal scalinv; + +/* 
********************************************************************** */ + +/* PURPOSE: */ + +/* This subroutine evaluates the first NTERMS spherical Bessel */ +/* functions and if required, their derivatives. */ +/* It incorporates a scaling parameter SCALE so that */ + +/* fjs_n(z)=j_n(z)/SCALE^n */ +/* fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} */ + +/* NOTE: The scaling parameter SCALE is meant to be used when */ +/* abs(z) < 1, in which case we recommend setting */ +/* SCALE = abs(z). This prevents the fjs_n from */ +/* underflowing too rapidly. */ +/* Otherwise, set SCALE=1. */ +/* Do not set SCALE = abs(z) if z could take on the */ +/* value zero. */ +/* In an FMM, when forming an expansion from a collection of */ +/* sources, set SCALE = min( abs(k*r), 1) */ +/* where k is the Helmholtz parameter and r is the box dimension */ +/* at the relevant level. */ + +/* INPUT: */ + +/* nterms (integer): order of expansion of output array fjs */ +/* z (complex *16): argument of the spherical Bessel functions */ +/* scale (real *8) : scaling factor (discussed above) */ +/* ifder (integer): flag indicating whether to calculate "fjder" */ +/* 0 NO */ +/* 1 YES */ +/* lwfjs (integer): upper limit of input arrays */ +/* fjs(0:lwfjs) and iscale(0:lwfjs) */ +/* iscale (integer): integer workspace used to keep track of */ +/* internal scaling */ + +/* OUTPUT: */ + +/* ier (integer): error return code */ +/* ier=0 normal return; */ +/* ier=8 insufficient array dimension lwfjs */ +/* fjs (complex *16): array of scaled Bessel functions. */ +/* fjder (complex *16): array of derivs of scaled Bessel functions. */ +/* ntop (integer) : highest index in arrays fjs that is nonzero */ + +/* NOTE, that fjs and fjder arrays must be at least (nterms+2) */ +/* complex *16 elements long. */ + + + + +/* ... Initializing ... 
*/ + + *ier = 0; + +/* set to asymptotic values if argument is sufficiently small */ + + if (z_abs(z__) < tiny) { + fjs[0] = done; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + fjs[i__] = zero; + } + + if (*ifder == 1) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjder[i__] = zero; + } + fjder[1] = done / (*scale * 3); + } + + return 0; + } + +/* ... Step 1: recursion up to find ntop, starting from nterms */ + + *ntop = 0; + zinv = done / *z__; + fjs[*nterms] = done; + fjs[*nterms - 1] = zero; + + i__1 = *lwfjs; + for (i__ = *nterms; i__ <= i__1; ++i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ - 1]; + fjs[i__ + 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + *ntop = i__ + 1; + break; + } + } + if (*ntop == 0) { + *ier = 8; + return 0; + } + +/* ... Step 2: Recursion back down to generate the unscaled jfuns: */ +/* if magnitude exceeds UPBOUND2, rescale and continue the */ +/* recursion (saving the order at which rescaling occurred */ +/* in array iscale. */ + + i__1 = *ntop; + for (i__ = 0; i__ <= i__1; ++i__) { + iscale[i__] = 0; + } + + fjs[*ntop] = zero; + fjs[*ntop - 1] = done; + for (i__ = *ntop - 1; i__ >= 1; --i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ + 1]; + fjs[i__ - 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + fjs[i__] *= upbound2inv; + fjs[i__ - 1] *= upbound2inv; + iscale[i__] = 1; + } +/* L2200: */ + } + +/* ... Step 3: go back up to the top and make sure that all */ +/* Bessel functions are scaled by the same factor */ +/* (i.e. the net total of times rescaling was invoked */ +/* on the way down in the previous loop). */ +/* At the same time, add scaling to fjs array. 
*/ + + scalinv = done / *scale; + sctot = 1.; + i__1 = *ntop; + for (i__ = 1; i__ <= i__1; ++i__) { + sctot *= scalinv; + if (iscale[i__ - 1] == 1) { + sctot *= upbound2inv; + } + fjs[i__] *= sctot; + } + +/* ... Determine the normalization parameter: */ + + z_sin(&z__1, z__); + fj0 = z__1 * zinv; + z_cos(&z__1, z__); + fj1 = fj0 * zinv - z__1 * zinv; + + d0 = z_abs(&fj0); + d1 = z_abs(&fj1); + if (d1 > d0) { + zscale = fj1 / (fjs[1] * *scale); + } else { + zscale = fj0 / fjs[0]; + } + +/* ... Scale the jfuns by zscale: */ + + ztmp = zscale; + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjs[i__] *= ztmp; + } + +/* ... Finally, calculate the derivatives if desired: */ + + if (*ifder == 1) { + fjs[*nterms + 1] *= ztmp; + + fjder[0] = -fjs[1] * *scale; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dc1 = i__ / ((i__ << 1) + done); + dc2 = done - dc1; + dc1 *= scalinv; + dc2 *= *scale; + fjder[i__] = dc1 * fjs[i__ - 1] - dc2 * fjs[i__ + 1]; + } + } + return 0; +} /* jfuns3d_ */ + -- GitLab From 5871b914781ea3737c9169b97e3ae75798bd75b4 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Aug 2018 14:50:46 -0500 Subject: [PATCH 071/139] Fixes --- pytential/qbx/target_specific.pyx | 242 --------------------- pytential/qbx/target_specific/cdjseval3d.c | 19 +- 2 files changed, 10 insertions(+), 251 deletions(-) delete mode 100644 pytential/qbx/target_specific.pyx diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx deleted file mode 100644 index c04cfcfc..00000000 --- a/pytential/qbx/target_specific.pyx +++ /dev/null @@ -1,242 +0,0 @@ -#!python -#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True - -import numpy as np -import cython -import cython.parallel - -from libc.math cimport sqrt -from libc.stdio cimport printf - -cimport openmp - - -cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: - """Compute the values of the Legendre polynomial up to order n at x. 
- Optionally, if derivs is non-NULL, compute the values of the derivative too. - - Borrowed from fmmlib. - """ - cdef: - double pj, derj, pjm2, pjm1, derjm2, derjm1 - int j - - pjm2 = 1 - pjm1 = x - - vals[0] = 1 - if derivs != NULL: - derivs[0] = 0 - derjm2 = 0 - derjm1 = 1 - - if n == 0: - return - - vals[1] = x - if derivs != NULL: - derivs[1] = 1 - - if n == 1: - return - - for j in range(2, n + 1): - pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j - vals[j] = pj - - if derivs != NULL: - derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2 - derj = derj / j - derivs[j] = derj - derjm2 = derjm1 - derjm1 = derj - - pjm2 = pjm1 - pjm1 = pj - - -cdef double dist(double[3] a, double[3] b) nogil: - return sqrt( - (a[0] - b[0]) * (a[0] - b[0]) + - (a[1] - b[1]) * (a[1] - b[1]) + - (a[2] - b[2]) * (a[2] - b[2])) - - -cdef void tsqbx_grad_from_source( - double[3] source, - double[3] center, - double[3] target, - double[3] grad, - int order) nogil: - cdef: - int i, j - double result, sc_d, tc_d, cos_angle, alpha, R - double[128] tmp - double[128] derivs - double[3] cms - double[3] tmc - - for j in range(3): - cms[j] = center[j] - source[j] - tmc[j] = target[j] - center[j] - grad[j] = 0 - - tc_d = dist(target, center) - sc_d = dist(source, center) - - alpha = ( - (target[0] - center[0]) * (source[0] - center[0]) + - (target[1] - center[1]) * (source[1] - center[1]) + - (target[2] - center[2]) * (source[2] - center[2])) - - cos_angle = alpha / (tc_d * sc_d) - - legvals(cos_angle, order, tmp, derivs) - - R = 1 / sc_d - - for i in range(0, order + 1): - # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) - for j in range(3): - grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i] - for j in range(3): - # Siegel and Tornberg has a sign flip here :( - grad[j] += ( - tmc[j] / (tc_d * sc_d) + - alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] - R *= (tc_d / sc_d) - - return - - -cdef double tsqbx_from_source( - double[3] source, - double[3] center, - double[3] target, - int order) 
nogil: - cdef: - int j - double result, r, sc_d, tc_d, cos_angle - # Legendre recurrence values - double pj, pjm1, pjm2 - - tc_d = dist(target, center) - sc_d = dist(source, center) - - cos_angle = (( - (target[0] - center[0]) * (source[0] - center[0]) + - (target[1] - center[1]) * (source[1] - center[1]) + - (target[2] - center[2]) * (source[2] - center[2])) - / (tc_d * sc_d)) - - if order == 0: - return 1 / sc_d - - pjm2 = 1 - pjm1 = cos_angle - - result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) - - r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) - - for j in range(2, order + 1): - pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j - result += pj * r - - r *= (tc_d / sc_d) - pjm2 = pjm1 - pjm1 = pj - - return result - - -def eval_target_specific_global_qbx_locals( - int order, - double[:,:] sources, - double[:,:] targets, - double[:,:] centers, - int[:] global_qbx_centers, - int[:] qbx_center_to_target_box, - int[:] center_to_target_starts, int[:] center_to_target_lists, - int[:] source_box_starts, int[:] source_box_lists, - int[:] box_source_starts, int[:] box_source_counts_nonchild, - double[:] dipstr, - double[:,:] dipvec, - double complex[:] pot): - - cdef: - int tgt, ictr, ctr - int itgt, itgt_start, itgt_end - int tgt_box, src_ibox - int isrc_box, isrc_box_start, isrc_box_end - int isrc, isrc_start, isrc_end - int i, tid - double result - double[:,:] source, center, target, grad - int slp, dlp - - slp = (dipstr is not None) and (dipvec is None) - dlp = (dipstr is not None) and (dipvec is not None) - - print("Hi from Cython") - - if not (slp or dlp): - raise ValueError("should specify exactly one of src_weights or dipvec") - - # Hack to obtain thread-local storage - maxthreads = openmp.omp_get_max_threads() - - # Prevent false sharing by over-allocating the buffers - source = np.zeros((maxthreads, 65)) - target = np.zeros((maxthreads, 65)) - center = np.zeros((maxthreads, 65)) - grad = np.zeros((maxthreads, 65)) - - # TODO: Check if order > 256 - - for ictr in 
cython.parallel.prange(0, global_qbx_centers.shape[0], - nogil=True, schedule="static", - chunksize=128): - ctr = global_qbx_centers[ictr] - itgt_start = center_to_target_starts[ctr] - itgt_end = center_to_target_starts[ctr + 1] - tgt_box = qbx_center_to_target_box[ctr] - tid = cython.parallel.threadid() - - for i in range(3): - center[tid, i] = centers[i, ctr] - - for itgt in range(itgt_start, itgt_end): - result = 0 - tgt = center_to_target_lists[itgt] - - for i in range(3): - target[tid, i] = targets[i, tgt] - - isrc_box_start = source_box_starts[tgt_box] - isrc_box_end = source_box_starts[tgt_box + 1] - - for isrc_box in range(isrc_box_start, isrc_box_end): - src_ibox = source_box_lists[isrc_box] - isrc_start = box_source_starts[src_ibox] - isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] - - for isrc in range(isrc_start, isrc_end): - for i in range(3): - source[tid, i] = sources[i, isrc] - - if slp: - # Don't replace with +=, since that makes Cython think - # it is a reduction. - result = result + dipstr[isrc] * ( - tsqbx_from_source(&source[tid, 0], &center[tid, 0], - &target[tid, 0], order)) - elif dlp: - tsqbx_grad_from_source(&source[tid, 0], &center[tid, 0], - &target[tid, 0], &grad[tid, 0], order) - result = result + dipstr[isrc] * ( - grad[tid, 0] * dipvec[0, isrc] + - grad[tid, 1] * dipvec[1, isrc] + - grad[tid, 2] * dipvec[2, isrc]) - - pot[tgt] = pot[tgt] + result diff --git a/pytential/qbx/target_specific/cdjseval3d.c b/pytential/qbx/target_specific/cdjseval3d.c index cba25fc8..40cd5d37 100644 --- a/pytential/qbx/target_specific/cdjseval3d.c +++ b/pytential/qbx/target_specific/cdjseval3d.c @@ -1,15 +1,16 @@ -/* cdjseval3d.f -- translated by f2c (version 20160102). 
-New version -*/ +/* Based on cdjseval3d.f from fmmlib3d, translated with modified f2c */ #include "f2c.h" -/* c Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ -/* c Contact: greengard@cims.nyu.edu */ -/* c */ -/* c This software is being released under a modified FreeBSD license */ -/* c (see COPYING in home directory). */ -/* cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ +/* Original copyright notice: */ + +/* ********************************************************************** */ +/* Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ +/* Contact: greengard@cims.nyu.edu */ +/* +/* This software is being released under a modified FreeBSD license */ +/* (see COPYING in home directory). */ +/* ********************************************************************** */ /* $Date: 2011-07-15 16:28:31 -0400 (Fri, 15 Jul 2011) $ */ /* $Revision: 2253 $ */ -- GitLab From 042d7c8c2f457c6e2055e8ec6f0b7f3548a90b69 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Aug 2018 16:33:28 -0500 Subject: [PATCH 072/139] Add Bessel eval routines --- pytential/qbx/target_specific/__init__.py | 24 +++++++++++ pytential/qbx/target_specific/_internal.h | 11 +++++ pytential/qbx/target_specific/_internal.pyx | 42 ++++++++++++++++++ pytential/qbx/target_specific/f2c.h | 29 +++++++++++++ setup.py | 7 +-- test/test_target_specific_qbx.py | 48 ++++++++++++++++++++- 6 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 pytential/qbx/target_specific/__init__.py create mode 100644 pytential/qbx/target_specific/_internal.h create mode 100644 pytential/qbx/target_specific/f2c.h diff --git a/pytential/qbx/target_specific/__init__.py b/pytential/qbx/target_specific/__init__.py new file mode 100644 index 00000000..fb612799 --- /dev/null +++ b/pytential/qbx/target_specific/__init__.py @@ -0,0 +1,24 @@ +__copyright__ = "Copyright (C) 2018 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, 
to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from ._internal import * # noqa diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h new file mode 100644 index 00000000..04f991bb --- /dev/null +++ b/pytential/qbx/target_specific/_internal.h @@ -0,0 +1,11 @@ +#ifndef UTILS_H +#define UTILS_H + +#include <complex.h> + +extern int jfuns3d_(int *ier, int *nterms, double complex *z, + double *scale, double complex *fjs, int *ifder, + double complex *fjder, int *lwfjs, int *iscale, + int *ntop); + +#endif // UTILS_H diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 68696f32..5f91655f 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -11,6 +11,48 @@ from libc.stdio cimport printf cimport openmp +cdef extern from "_internal.h" nogil: + int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale, + double complex *fjs, int *ifder, double complex *fjder, + int *lwfjs, int 
*iscale, int *ntop); + + +def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): + """Evaluate spherical Bessel functions. + + Arguments: + nterms: Number of terms to evaluate + z: Argument + scale: Output scaling factor (recommended: min(abs(z), 1)) + fjs: Output array of complex doubles + fjder: *None*, or output array of complex double derivatives + """ + cdef: + double complex[1024] fjstemp + double complex[1024] fjdertmp + int[1024] iscale + int ier, ifder, lwfjs, ntop, i, nterms_ + double scale_ + double complex z_ + + nterms_ = nterms + z_ = z + scale_ = scale + ifder = fjder is not None + lwfjs = 1024 + + jfuns3d_(&ier, &nterms_, &z_, &scale_, fjstemp, &ifder, fjdertmp, &lwfjs, + iscale, &ntop) + + if ier: + raise ValueError("jfuns3d_ returned error code %d" % ier) + + for i in range(nterms): + fjs[i] = fjstemp[i] + if ifder: + fjder[i] = fjdertmp[i] + + cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: """Compute the values of the Legendre polynomial up to order n at x. Optionally, if derivs is non-NULL, compute the values of the derivative too. diff --git a/pytential/qbx/target_specific/f2c.h b/pytential/qbx/target_specific/f2c.h new file mode 100644 index 00000000..0005f876 --- /dev/null +++ b/pytential/qbx/target_specific/f2c.h @@ -0,0 +1,29 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include <complex.h> + +typedef int integer; +typedef unsigned int uinteger; +typedef double doublereal; +typedef double complex doublecomplex; + +static inline double z_abs(doublecomplex *z) { + return cabs(*z); +} + +static inline void z_sin(doublecomplex *out, doublecomplex *z) { + *out = csin(*z); +} + +static inline void z_cos(doublecomplex *out, doublecomplex *z) { + *out = ccos(*z); +} + +#endif diff --git a/setup.py b/setup.py index c9e0d161..76fa7f80 100644 --- a/setup.py +++ b/setup.py @@ -58,11 +58,12 @@ write_git_revision("pytential") ext_modules = [ Extension( - "pytential.qbx.target_specific", - ["pytential/qbx/target_specific.pyx"], + "pytential.qbx.target_specific._internal", + ["pytential/qbx/target_specific/_internal.pyx", + "pytential/qbx/target_specific/cdjseval3d.c"], extra_compile_args=["-fopenmp", "-ffast-math"], extra_link_args=["-fopenmp"] - ) + ), ] diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index b19d9fb3..522cce5d 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -44,11 +44,57 @@ import logging logger = logging.getLogger(__name__) +def test_spherical_bessel_functions(): + import pytential.qbx.target_specific as ts + + nterms = 10 + z = 3j + scale = 1 + j = np.zeros(nterms, dtype=np.complex) + jder = np.zeros(nterms, dtype=np.complex) + ts.jfuns3d_wrapper(nterms, z, scale, j, jder) + + # Reference solution from scipy.special.spherical_jn + + j_expected = np.array([ + +3.33929164246994992e+00 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +2.24279011776926884e+00j, + -1.09650152470070195e+00 + +0.00000000000000000e+00j, + +2.77555756156289135e-17 + -4.15287576601431119e-01j, + +1.27497179297362317e-01 + +0.00000000000000000e+00j, + -3.46944695195361419e-18 + +3.27960387093445271e-02j, + -7.24503736309898075e-03 + -4.33680868994201774e-19j, + 
+0.00000000000000000e+00 + -1.40087680258226812e-03j, + +2.40653350187633002e-04 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +3.71744848523478122e-05j, + ]) + + assert np.allclose(j, j_expected, rtol=1e-13, atol=0) + + jder_expected = np.array([ + -0.00000000000000000e+00 + -2.24279011776926884e+00j, + +1.84409823062377076e+00 + +0.00000000000000000e+00j, + +0.00000000000000000e+00 + +1.14628859306856690e+00j, + -5.42784755898793825e-01 + +3.70074341541718826e-17j, + +2.77555756156289135e-17 + -2.02792277772493951e-01j, + +6.19051018786732632e-02 + -6.93889390390722838e-18j, + -2.45752492430047685e-18 + +1.58909515287802387e-02j, + -3.50936588954626604e-03 + -4.33680868994201774e-19j, + +0.00000000000000000e+00 + -6.78916752019369197e-04j, + +1.16738400679806980e-04 + +0.00000000000000000e+00j, + ]) + + assert np.allclose(jder, jder_expected, rtol=1e-13, atol=0) + + @pytest.mark.parametrize("op", ["S", "D"]) -@pytest.mark.parametriz("helmholtz_k", [0, 1.2]) +@pytest.mark.parametrize("helmholtz_k", [0, 1.2]) def test_target_specific_qbx(ctx_getter, op, helmholtz_k): logging.basicConfig(level=logging.INFO) + if helmholtz_k != 0: + pytest.xfail("not implemented yet") + cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) -- GitLab From 53cf770843a3eaa6c109cf9bd102061179fa3265 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Aug 2018 17:15:21 -0500 Subject: [PATCH 073/139] Add spherical Hankel functions --- pytential/qbx/target_specific/_internal.h | 3 + pytential/qbx/target_specific/_internal.pyx | 35 +++ pytential/qbx/target_specific/f2c.h | 4 + pytential/qbx/target_specific/helmrouts3d.c | 242 ++++++++++++++++++++ setup.py | 3 +- test/test_target_specific_qbx.py | 61 ++++- 6 files changed, 338 insertions(+), 10 deletions(-) create mode 100644 pytential/qbx/target_specific/helmrouts3d.c diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h index 04f991bb..b97a6a79 100644 --- 
a/pytential/qbx/target_specific/_internal.h +++ b/pytential/qbx/target_specific/_internal.h @@ -8,4 +8,7 @@ extern int jfuns3d_(int *ier, int *nterms, double complex *z, double complex *fjder, int *lwfjs, int *iscale, int *ntop); +extern int h3dall_(int *nterms, double complex *z, double *scale, + double complex *hvec, int *ifder, double complex *hder); + #endif // UTILS_H diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 5f91655f..31b27c07 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -15,6 +15,8 @@ cdef extern from "_internal.h" nogil: int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale, double complex *fjs, int *ifder, double complex *fjder, int *lwfjs, int *iscale, int *ntop); + int h3dall_(int *nterms, double complex *z, double *scale, + double complex *hvec, int *ifder, double complex *hder); def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): @@ -53,6 +55,39 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): fjder[i] = fjdertmp[i] +def h3dall_wrapper(nterms, z, scale, hs, hders): + """Evaluate spherical Hankel functions. + + Arguments: + nterms: Number of terms to evaluate + z: Argument + scale: Output scaling factor (recommended: min(abs(z), 1)) + hs: Output array of complex doubles + hders: *None*, or output array of complex double derivatives + """ + cdef: + int nterms_, ifder + double scale_ + double complex z_ + double complex[:] hvec = np.empty(nterms, np.complex) + double complex[:] hdervec = np.empty(nterms, np.complex) + + ifder = hders is not None + + if nterms == 0: + return + + nterms_ = nterms - 1 + z_ = z + scale_ = scale + + h3dall_(&nterms_, &z_, &scale_, &hvec[0], &ifder, &hdervec[0]) + + hs[:nterms] = hvec[:] + if ifder: + hders[:nterms] = hdervec[:] + + cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: """Compute the values of the Legendre polynomial up to order n at x. 
Optionally, if derivs is non-NULL, compute the values of the derivative too. diff --git a/pytential/qbx/target_specific/f2c.h b/pytential/qbx/target_specific/f2c.h index 0005f876..897b5bd0 100644 --- a/pytential/qbx/target_specific/f2c.h +++ b/pytential/qbx/target_specific/f2c.h @@ -18,6 +18,10 @@ static inline double z_abs(doublecomplex *z) { return cabs(*z); } +static inline void z_exp(doublecomplex *out, doublecomplex *z) { + *out = cexp(*z); +} + static inline void z_sin(doublecomplex *out, doublecomplex *z) { *out = csin(*z); } diff --git a/pytential/qbx/target_specific/helmrouts3d.c b/pytential/qbx/target_specific/helmrouts3d.c new file mode 100644 index 00000000..f4ab60e1 --- /dev/null +++ b/pytential/qbx/target_specific/helmrouts3d.c @@ -0,0 +1,242 @@ +/* Based on helmrouts3d.f from fmmlib3d, translated with modified f2c */ + +#include "f2c.h" + +/* Original copyright notice: */ + +/* ************************************************************************** */ +/* Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ +/* Contact: greengard@cims.nyu.edu */ +/* */ +/* This software is being released under a modified FreeBSD license */ +/* (see COPYING in home directory). */ +/* ************************************************************************** */ + +/* This file contains the basic subroutines for */ +/* forming and evaluating multipole (partial wave) expansions. */ + +/* Documentation is incomplete and ongoing... */ + + +/* Remarks on scaling conventions. */ + +/* 1) Hankel and Bessel functions are consistently scaled as */ +/* hvec(n)= h_n(z)*scale^(n) */ +/* jvec(n)= j_n(z)/scale^(n) */ + +/* In some earlier FMM implementations, the convention */ +/* hvec(n)= h_n(z)*scale^(n+1) */ +/* was sometimes used, leading to various obscure rescaling */ +/* steps. */ + +/* scale should be of the order of |z| if |z| < 1. Otherwise, */ +/* scale should be set to 1. 
*/ + + +/* 2) There are many definitions of the spherical harmonics, */ +/* which differ in terms of normalization constants. We */ +/* adopt the following convention: */ + +/* For m>0, we define Y_n^m according to */ + +/* Y_n^m = \sqrt{2n+1} \sqrt{\frac{ (n-m)!}{(n+m)!}} \cdot */ +/* P_n^m(\cos \theta) e^{i m phi} */ +/* and */ + +/* Y_n^-m = dconjg( Y_n^m ) */ + +/* We omit the Condon-Shortley phase factor (-1)^m in the */ +/* definition of Y_n^m for m<0. (This is standard in several */ +/* communities.) */ + +/* We also omit the factor \sqrt{\frac{1}{4 \pi}}, so that */ +/* the Y_n^m are orthogonal on the unit sphere but not */ +/* orthonormal. (This is also standard in several communities.) */ +/* More precisely, */ + +/* \int_S Y_n^m Y_n^m d\Omega = 4 \pi. */ + +/* Using our standard definition, the addition theorem takes */ +/* the simple form */ + +/* e^( i k r}/(ikr) = */ +/* \sum_n \sum_m j_n(k|S|) Ylm*(S) h_n(k|T|) Ylm(T) */ + + +/* ----------------------------------------------------------------------- */ +/* h3d01: computes h0, h1 (first two spherical Hankel fns.) */ +/* h3dall: computes Hankel functions of all orders and scales them */ +/* ********************************************************************** */ +/* Subroutine */ int h3d01_(doublecomplex *z__, doublecomplex *h0, + doublecomplex *h1) +{ + /* Initialized data */ + + static doublecomplex eye = I; + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* System generated locals */ + doublecomplex z__1; + + /* Local variables */ + doublecomplex cd; + +/* ********************************************************************** */ + +/* Compute spherical Hankel functions of order 0 and 1 */ + +/* h0(z) = exp(i*z)/(i*z), */ +/* h1(z) = - h0' = -h0*(i-1/z) = h0*(1/z-i) */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* z : argument of Hankel functions */ +/* if abs(z)<1.0d-15, returns zero. 
*/ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* h0 : h0(z) (spherical Hankel function of order 0). */ +/* h1 : -h0'(z) (spherical Hankel function of order 1). */ + +/* ----------------------------------------------------------------------- */ + + if (z_abs(z__) < thresh) { + *h0 = 0.; + *h1 = 0.; + return 0; + } + +/* Otherwise, use formula */ + + cd = eye * *z__; + z_exp(&z__1, &cd); + *h0 = z__1 / cd; + *h1 = *h0 * (done / *z__ - eye); + + return 0; +} /* h3d01_ */ + + + + +/* ********************************************************************** */ +/* Subroutine */ int h3dall_(integer *nterms, doublecomplex *z__, doublereal * + scale, doublecomplex *hvec, integer *ifder, doublecomplex *hder) +{ + /* Initialized data */ + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* Builtin functions */ + double z_abs(doublecomplex *); + + /* Local variables */ + integer i__; + integer i__1; + doublereal dtmp; + doublecomplex zinv, ztmp; + doublereal scal2; + +/* ********************************************************************** */ + +/* This subroutine computes scaled versions of the spherical Hankel */ +/* functions h_n of orders 0 to nterms. */ + +/* hvec(n)= h_n(z)*scale^(n) */ + +/* The parameter SCALE is useful when |z| < 1, in which case */ +/* it damps out the rapid growth of h_n as n increases. In such */ +/* cases, we recommend setting */ + +/* scale = |z| */ + +/* or something close. If |z| > 1, set scale = 1. */ + +/* If the flag IFDER is set to one, it also computes the */ +/* derivatives of h_n. */ + +/* hder(n)= h_n'(z)*scale^(n) */ + +/* NOTE: If |z| < 1.0d-15, the subroutine returns zero. */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* nterms : highest order of the Hankel functions to be computed. */ +/* z : argument of the Hankel functions. 
*/ +/* scale : scaling parameter discussed above */ +/* ifder : flag indcating whether derivatives should be computed. */ +/* ifder = 1 ==> compute */ +/* ifder = 0 ==> do not compute */ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* hvec : the vector of spherical Hankel functions */ +/* hder : the derivatives of the spherical Hankel functions */ + +/* ----------------------------------------------------------------------- */ + + +/* If |z| < thresh, return zeros. */ + + if (z_abs(z__) < thresh) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + hvec[i__] = 0; + hder[i__] = 0; + } + return 0; + } + +/* Otherwise, get h_0 and h_1 analytically and the rest via */ +/* recursion. */ + + h3d01_(z__, hvec, &hvec[1]); + hvec[0] = hvec[0]; + hvec[1] *= *scale; + +/* From Abramowitz and Stegun (10.1.19) */ + +/* h_{n+1}(z)=(2n+1)/z * h_n(z) - h_{n-1}(z) */ + +/* With scaling: */ + +/* hvec(n+1)=scale*(2n+1)/z * hvec(n) -(scale**2) hvec(n-1) */ + + scal2 = *scale * *scale; + zinv = *scale / *z__; + i__1 = *nterms - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = (i__ << 1) + done; + ztmp = zinv * dtmp; + hvec[i__ + 1] = ztmp * hvec[i__] - scal2 * hvec[i__ - 1]; + } + +/* From Abramowitz and Stegun (10.1.21) */ + +/* h_{n}'(z)= h_{n-1}(z) - (n+1)/z * h_n(z) */ + +/* With scaling: */ + +/* hder(n)=scale* hvec(n-1) - (n+1)/z * hvec(n) */ + + + if (*ifder == 1) { + + hder[0] = -hvec[1] / *scale; + zinv = 1. 
/ *z__; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = i__ + done; + ztmp = zinv * dtmp; + hder[i__] = *scale * hvec[i__ - 1] - ztmp * hvec[i__]; + } + } + + return 0; +} /* h3dall_ */ + diff --git a/setup.py b/setup.py index 76fa7f80..e81a6c01 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,8 @@ ext_modules = [ Extension( "pytential.qbx.target_specific._internal", ["pytential/qbx/target_specific/_internal.pyx", - "pytential/qbx/target_specific/cdjseval3d.c"], + "pytential/qbx/target_specific/cdjseval3d.c", + "pytential/qbx/target_specific/helmrouts3d.c"], extra_compile_args=["-fopenmp", "-ffast-math"], extra_link_args=["-fopenmp"] ), diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 522cce5d..31bf4da4 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) def test_spherical_bessel_functions(): import pytential.qbx.target_specific as ts - + nterms = 10 z = 3j scale = 1 @@ -54,8 +54,8 @@ def test_spherical_bessel_functions(): jder = np.zeros(nterms, dtype=np.complex) ts.jfuns3d_wrapper(nterms, z, scale, j, jder) - # Reference solution from scipy.special.spherical_jn - + # Reference solution computed using scipy.special.spherical_jn + j_expected = np.array([ +3.33929164246994992e+00 + +0.00000000000000000e+00j, +0.00000000000000000e+00 + +2.24279011776926884e+00j, @@ -67,9 +67,9 @@ def test_spherical_bessel_functions(): +0.00000000000000000e+00 + -1.40087680258226812e-03j, +2.40653350187633002e-04 + +0.00000000000000000e+00j, +0.00000000000000000e+00 + +3.71744848523478122e-05j, - ]) - - assert np.allclose(j, j_expected, rtol=1e-13, atol=0) + ]) + + assert np.allclose(j, j_expected) jder_expected = np.array([ -0.00000000000000000e+00 + -2.24279011776926884e+00j, @@ -82,10 +82,53 @@ def test_spherical_bessel_functions(): -3.50936588954626604e-03 + -4.33680868994201774e-19j, +0.00000000000000000e+00 + -6.78916752019369197e-04j, 
+1.16738400679806980e-04 + +0.00000000000000000e+00j, - ]) + ]) + + assert np.allclose(jder, jder_expected) + + +def test_spherical_hankel_functions(): + import pytential.qbx.target_specific as ts + + nterms = 10 + z = 2 + 3j + scale = 1 + h = np.zeros(nterms, dtype=np.complex) + hder = np.zeros(nterms, dtype=np.complex) + ts.h3dall_wrapper(nterms, z, scale, h, hder) + + # Reference solution computed using + # scipy.special.spherical_jn + 1j * scipy.special.spherical_yn + h_expected = np.array([ + +1.17460537937623677e-02 + -7.25971518952217565e-03j, + -7.12794888037171503e-03 + -1.55735608522498126e-02j, + -2.58175723285687941e-02 + +5.00665171335734627e-03j, + -6.95481631849959037e-03 + +4.92143379339500253e-02j, + +9.78278544942576822e-02 + +5.92281078069348405e-02j, + +2.65420992601874961e-01 + -1.70387117227806167e-01j, + -8.11750107462848453e-02 + -1.02133651818182791e+00j, + -3.49178056863992792e+00 + -1.62876088689699405e+00j, + -1.36147986022969878e+01 + +9.34959028601928743e+00j, + +4.56300765393887087e+00 + +7.94934376901125432e+01j, + ]) + + assert np.allclose(h, h_expected) + + hder_expected = np.array([ + +7.12794888037171503e-03 + +1.55735608522498126e-02j, + +2.11270661502996893e-02 + -5.75767287207851197e-03j, + +1.32171023895111261e-03 + -3.57580271012700734e-02j, + -6.69663049946767064e-02 + -3.16989251553807527e-02j, + -1.50547136475930293e-01 + +1.16532548652759055e-01j, + +8.87444851771816839e-02 + +5.84014513465967444e-01j, + +2.00269153354544205e+00 + +7.98384884993240895e-01j, + +7.22334424954346144e+00 + -5.46307186102847187e+00j, + -4.05890079026877615e+00 + -4.28512368415405192e+01j, + -2.04081205047078043e+02 + -1.02417988497371837e+02j, + ]) + + assert np.allclose(hder, hder_expected) - assert np.allclose(jder, jder_expected, rtol=1e-13, atol=0) - @pytest.mark.parametrize("op", ["S", "D"]) @pytest.mark.parametrize("helmholtz_k", [0, 1.2]) -- GitLab From b250cbbeea36fa373c27227c6f88949bad5a308e Mon Sep 17 00:00:00 2001 From: Matt Wala 
Date: Fri, 10 Aug 2018 23:55:40 -0500 Subject: [PATCH 074/139] Get Helmholtz SLP working. --- pytential/qbx/__init__.py | 1 - pytential/qbx/fmmlib.py | 11 +- pytential/qbx/target_specific/_internal.pyx | 146 +++++++++++++------- test/test_target_specific_qbx.py | 11 +- 4 files changed, 112 insertions(+), 57 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 1b87fc5c..b791a8d8 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -585,7 +585,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): return maxstretch - @memoize_method def qbx_fmm_geometry_data(self, target_discrs_and_qbx_sides): """ :arg target_discrs_and_qbx_sides: diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 5a258caa..c52cb7f5 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -30,7 +30,7 @@ from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler from sumpy.kernel import ( LaplaceKernel, HelmholtzKernel, AxisTargetDerivative, DirectionalSourceDerivative) -import pytential.qbx.target_specific as target_specific +import pytential.qbx.target_specific as ts from boxtree.tools import return_timing_data @@ -236,6 +236,10 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): if isinstance(knl, DirectionalSourceDerivative): knl = knl.inner_kernel + else: + if isinstance(knl, HelmholtzKernel) and knl.dim == 3: + return True + return isinstance(knl, LaplaceKernel) and knl.dim == 3 @staticmethod @@ -607,12 +611,12 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ctt = geo_data.center_to_tree_targets() for output in pot: - target_specific.eval_target_specific_global_qbx_locals( + ts.eval_target_specific_qbx_locals( order=self.qbx_order, sources=self._get_single_sources_array(), targets=geo_data.all_targets(), centers=self._get_single_centers_array(), - global_qbx_centers=geo_data.global_qbx_centers(), + qbx_centers=geo_data.global_qbx_centers(), 
qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), center_to_target_starts=ctt.starts, center_to_target_lists=ctt.lists, @@ -620,6 +624,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): source_box_lists=trav.neighbor_source_boxes_lists, box_source_starts=self.tree.box_source_starts, box_source_counts_nonchild=self.tree.box_source_counts_nonchild, + helmholtz_k=self.kernel_kwargs.get("zk", 0), dipstr=src_weights, dipvec=self.dipole_vec, pot=output) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 31b27c07..9b55465a 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -6,7 +6,8 @@ import cython import cython.parallel from libc.math cimport sqrt -from libc.stdio cimport printf +from libc.stdio cimport printf, fprintf, stderr +from libc.stdlib cimport abort cimport openmp @@ -18,6 +19,9 @@ cdef extern from "_internal.h" nogil: int h3dall_(int *nterms, double complex *z, double *scale, double complex *hvec, int *ifder, double complex *hder); +cdef extern from "complex.h" nogil: + double cabs(double complex) + def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): """Evaluate spherical Bessel functions. 
@@ -30,9 +34,9 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): fjder: *None*, or output array of complex double derivatives """ cdef: - double complex[1024] fjstemp - double complex[1024] fjdertmp - int[1024] iscale + double complex[128] fjstemp + double complex[128] fjdertmp + int[128] iscale int ier, ifder, lwfjs, ntop, i, nterms_ double scale_ double complex z_ @@ -139,7 +143,7 @@ cdef double dist(double[3] a, double[3] b) nogil: (a[2] - b[2]) * (a[2] - b[2])) -cdef void tsqbx_grad_from_source( +cdef void tsqbx_laplace_dlp( double[3] source, double[3] center, double[3] target, @@ -186,7 +190,7 @@ cdef void tsqbx_grad_from_source( return -cdef double tsqbx_from_source( +cdef double tsqbx_laplace_slp( double[3] source, double[3] center, double[3] target, @@ -227,17 +231,20 @@ cdef double tsqbx_from_source( return result -cdef double tsqbx_helmholtz_from_source( +cdef double complex tsqbx_helmholtz_slp( double[3] source, double[3] center, double[3] target, - double k, - int order) nogil: + int order, + double complex k) nogil: cdef: - int j - double result, r, sc_d, tc_d, cos_angle - # Legendre recurrence values - double pj, pjm1, pjm2 + int j, ntop, ier, ifder, lwfjs + double r, sc_d, tc_d, cos_angle + double[128] lvals + double complex[128] jvals, hvals + int[128] iscale + double jscale, hscale, unscale + double complex z, result tc_d = dist(target, center) sc_d = dist(source, center) @@ -248,40 +255,74 @@ cdef double tsqbx_helmholtz_from_source( (target[2] - center[2]) * (source[2] - center[2])) / (tc_d * sc_d)) - if order == 0: - return 1 / sc_d - - pjm2 = 1 - pjm1 = cos_angle - - result = 1 / sc_d + (cos_angle * tc_d) / (sc_d * sc_d) + # Evaluate the Legendre terms. + legvals(cos_angle, order, lvals, NULL) + + # Scaling magic for Bessel and Hankel terms. + # These values are taken from the fmmlib documentation. 
+ jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # Multiply against unscale to remove the scaling. + unscale = jscale / hscale + + # Evaluate the spherical Bessel terms. + z = k * tc_d + ifder = 0 + lwfjs = 128 + jfuns3d_(&ier, &order, &z, &jscale, jvals, &ifder, NULL, &lwfjs, iscale, + &ntop) + if ier: + # This could in theory fail. + fprintf(stderr, "array passed to jfuns3d was too small\n") + abort() - r = (tc_d * tc_d) / (sc_d * sc_d * sc_d) + # Evaluate the spherical Hankel terms. + z = k * sc_d + h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) - for j in range(2, order + 1): - pj = ( (2*j-1)*cos_angle*pjm1-(j-1)*pjm2 ) / j - result += pj * r + result = jvals[0] * hvals[0] * lvals[0] - r *= (tc_d / sc_d) - pjm2 = pjm1 - pjm1 = pj + for j in range(1, 1 + order): + result += (2 * j + 1) * unscale * (jvals[j] * hvals[j] * lvals[j]) + unscale *= jscale / hscale - return result + return result * 1j * k -def eval_target_specific_global_qbx_locals( +def eval_target_specific_qbx_locals( int order, double[:,:] sources, double[:,:] targets, double[:,:] centers, - int[:] global_qbx_centers, + int[:] qbx_centers, int[:] qbx_center_to_target_box, int[:] center_to_target_starts, int[:] center_to_target_lists, int[:] source_box_starts, int[:] source_box_lists, int[:] box_source_starts, int[:] box_source_counts_nonchild, + double complex helmholtz_k, double[:] dipstr, double[:,:] dipvec, double complex[:] pot): + """TSQBX entry point. 
+ + Arguments: + order: Expansion order + sources: Array of sources of shape (3, *nsrcs*) + targets: Array of targets of shape (3, *ntgts*) + centers: Array of centers of shape (3, *nctrs*) + qbx_centers: Array of subset of indices into *centers* which are QBX centers + qbx_center_to_target_box: Array mapping centers to target box numbers + center_to_target_starts: "Start" indices for center-to-target CSR list + center_to_target_lists: Center-to-target CSR list + source_box_starts: "Start" indices for target-box-to-source-box CSR list + source_box_lists: Target-box-to-source-box CSR list + box_source_starts: "Start" indices for sources for each box + box_source_counts_nonchild: Number of sources per box + helmholtz_k: Helmholtz parameter (Pass 0 for Laplace) + dipstr: Source weights, shape (*nsrcs*,) + dipvec: Source gradient weights, shape (3, *nsrcs*), or *None* + pot: Output potential, shape (*ngts*,) + """ cdef: int tgt, ictr, ctr @@ -290,17 +331,16 @@ def eval_target_specific_global_qbx_locals( int isrc_box, isrc_box_start, isrc_box_end int isrc, isrc_start, isrc_end int i, tid - double result + double complex result double[:,:] source, center, target, grad - int slp, dlp + int laplace_slp, helmholtz_slp, laplace_dlp - slp = (dipstr is not None) and (dipvec is None) - dlp = (dipstr is not None) and (dipvec is not None) + laplace_slp = (helmholtz_k == 0) and (dipstr is not None) and (dipvec is None) + laplace_dlp = (helmholtz_k == 0) and (dipstr is not None) and (dipvec is not None) + helmholtz_slp = (helmholtz_k != 0) and (dipstr is not None) and (dipvec is None) - print("Hi from Cython") - - if not (slp or dlp): - raise ValueError("should specify exactly one of src_weights or dipvec") + if not (laplace_slp or laplace_dlp or helmholtz_slp): + raise ValueError("unknown kernel") # Hack to obtain thread-local storage maxthreads = openmp.omp_get_max_threads() @@ -311,12 +351,13 @@ def eval_target_specific_global_qbx_locals( center = np.zeros((maxthreads, 65)) grad 
= np.zeros((maxthreads, 65)) - # TODO: Check if order > 256 + # TODO: Check that the order is not too high, since some temporary arrays + # used above might overflow if that is the case. - for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0], + for ictr in cython.parallel.prange(0, qbx_centers.shape[0], nogil=True, schedule="static", chunksize=128): - ctr = global_qbx_centers[ictr] + ctr = qbx_centers[ictr] itgt_start = center_to_target_starts[ctr] itgt_end = center_to_target_starts[ctr + 1] tgt_box = qbx_center_to_target_box[ctr] @@ -344,15 +385,24 @@ def eval_target_specific_global_qbx_locals( for i in range(3): source[tid, i] = sources[i, isrc] - if slp: - # Don't replace with +=, since that makes Cython think - # it is a reduction. + # NOTE: Don't replace with +=, since that makes Cython think + # we are doing an OpenMP reduction. + + if laplace_slp: result = result + dipstr[isrc] * ( - tsqbx_from_source(&source[tid, 0], ¢er[tid, 0], + tsqbx_laplace_slp(&source[tid, 0], ¢er[tid, 0], &target[tid, 0], order)) - elif dlp: - tsqbx_grad_from_source(&source[tid, 0], ¢er[tid, 0], - &target[tid, 0], &grad[tid, 0], order) + + elif helmholtz_slp: + result = result + dipstr[isrc] * ( + tsqbx_helmholtz_slp(&source[tid, 0], ¢er[tid, 0], + &target[tid, 0], order, + helmholtz_k)) + + elif laplace_dlp: + tsqbx_laplace_dlp(&source[tid, 0], ¢er[tid, 0], + &target[tid, 0], &grad[tid, 0], order) + result = result + dipstr[isrc] * ( grad[tid, 0] * dipvec[0, isrc] + grad[tid, 1] * dipvec[1, isrc] + diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 31bf4da4..0cc531dd 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -132,17 +132,18 @@ def test_spherical_hankel_functions(): @pytest.mark.parametrize("op", ["S", "D"]) @pytest.mark.parametrize("helmholtz_k", [0, 1.2]) -def test_target_specific_qbx(ctx_getter, op, helmholtz_k): +@pytest.mark.parametrize("qbx_order", [0, 1, 5]) +def 
test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): logging.basicConfig(level=logging.INFO) - if helmholtz_k != 0: + if helmholtz_k != 0 and op == "D": pytest.xfail("not implemented yet") cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - target_order = 8 - fmm_tol = 1e-5 + target_order = 4 + fmm_tol = 1e-3 from meshmode.mesh.generation import generate_icosphere mesh = generate_icosphere(1, target_order) @@ -164,7 +165,7 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k): qbx, _ = QBXLayerPotentialSource( pre_density_discr, 4*target_order, - qbx_order=5, + qbx_order=qbx_order, fmm_level_to_order=SimpleExpansionOrderFinder(fmm_tol), fmm_backend="fmmlib", _expansions_in_tree_have_extent=True, -- GitLab From 1d1bcd98148a9c6e4c6db3956ae4a85ce69dc7f5 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 11 Aug 2018 00:06:32 -0500 Subject: [PATCH 075/139] Refactor Helmholtz utility files. --- .../qbx/target_specific/_helmholtz_utils.c | 516 ++++++++++++++++++ .../{_internal.h => _helmholtz_utils.h} | 6 +- pytential/qbx/target_specific/_internal.pyx | 3 +- pytential/qbx/target_specific/cdjseval3d.c | 238 -------- pytential/qbx/target_specific/f2c.h | 33 -- pytential/qbx/target_specific/helmrouts3d.c | 242 -------- setup.py | 3 +- 7 files changed, 522 insertions(+), 519 deletions(-) create mode 100644 pytential/qbx/target_specific/_helmholtz_utils.c rename pytential/qbx/target_specific/{_internal.h => _helmholtz_utils.h} (82%) delete mode 100644 pytential/qbx/target_specific/cdjseval3d.c delete mode 100644 pytential/qbx/target_specific/f2c.h delete mode 100644 pytential/qbx/target_specific/helmrouts3d.c diff --git a/pytential/qbx/target_specific/_helmholtz_utils.c b/pytential/qbx/target_specific/_helmholtz_utils.c new file mode 100644 index 00000000..76c6b388 --- /dev/null +++ b/pytential/qbx/target_specific/_helmholtz_utils.c @@ -0,0 +1,516 @@ +/* This file contains routines for evaluating spherical Bessel and Hankel + functions. 
+ + This is based on cdjseval3d.f and helmrouts3d.f from fmmlib3d, translated + with a hacked version of f2c and manually postprocessed. */ + +/* Original copyright notice: */ + +/* ********************************************************************** + +Copyright (c) 2009-2012, Leslie Greengard, Zydrunas Gimbutas +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +********************************************************************** */ + +#include + +/* declarations for f2c generated things */ + +typedef int integer; +typedef double doublereal; +typedef double complex doublecomplex; + +static inline double z_abs(doublecomplex *z) { + return cabs(*z); +} + +static inline void z_exp(doublecomplex *out, doublecomplex *z) { + *out = cexp(*z); +} + +static inline void z_sin(doublecomplex *out, doublecomplex *z) { + *out = csin(*z); +} + +static inline void z_cos(doublecomplex *out, doublecomplex *z) { + *out = ccos(*z); +} + +/* Start of functions borrowed from cdjseval3d.f */ + +/* Computation of spherical Bessel functions via recurrence */ + +/* ********************************************************************** */ +/* Subroutine */ int jfuns3d_(integer *ier, integer *nterms, doublecomplex * + z__, doublereal *scale, doublecomplex *fjs, integer *ifder, + doublecomplex *fjder, integer *lwfjs, integer *iscale, integer *ntop) +{ + /* Initialized data */ + + static doublereal upbound2 = 1e40; + static doublereal upbound2inv = 1e-40; + static doublereal tiny = 1e-200; + static doublereal done = 1.; + static doublereal zero = 0.; + + /* System generated locals */ + integer i__1; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer i__; + doublereal d0, d1, dd, dc1, dc2; + doublecomplex fj0, fj1, zinv, ztmp; + doublereal dcoef; + doublereal sctot; + doublecomplex zscale; + doublereal scalinv; + +/* ********************************************************************** */ + +/* PURPOSE: */ + +/* This subroutine evaluates the first NTERMS spherical Bessel */ +/* functions and if required, their derivatives. */ +/* It incorporates a scaling parameter SCALE so that */ + +/* fjs_n(z)=j_n(z)/SCALE^n */ +/* fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} */ + +/* NOTE: The scaling parameter SCALE is meant to be used when */ +/* abs(z) < 1, in which case we recommend setting */ +/* SCALE = abs(z). 
This prevents the fjs_n from */ +/* underflowing too rapidly. */ +/* Otherwise, set SCALE=1. */ +/* Do not set SCALE = abs(z) if z could take on the */ +/* value zero. */ +/* In an FMM, when forming an expansion from a collection of */ +/* sources, set SCALE = min( abs(k*r), 1) */ +/* where k is the Helmholtz parameter and r is the box dimension */ +/* at the relevant level. */ + +/* INPUT: */ + +/* nterms (integer): order of expansion of output array fjs */ +/* z (complex *16): argument of the spherical Bessel functions */ +/* scale (real *8) : scaling factor (discussed above) */ +/* ifder (integer): flag indicating whether to calculate "fjder" */ +/* 0 NO */ +/* 1 YES */ +/* lwfjs (integer): upper limit of input arrays */ +/* fjs(0:lwfjs) and iscale(0:lwfjs) */ +/* iscale (integer): integer workspace used to keep track of */ +/* internal scaling */ + +/* OUTPUT: */ + +/* ier (integer): error return code */ +/* ier=0 normal return; */ +/* ier=8 insufficient array dimension lwfjs */ +/* fjs (complex *16): array of scaled Bessel functions. */ +/* fjder (complex *16): array of derivs of scaled Bessel functions. */ +/* ntop (integer) : highest index in arrays fjs that is nonzero */ + +/* NOTE, that fjs and fjder arrays must be at least (nterms+2) */ +/* complex *16 elements long. */ + + + + +/* ... Initializing ... */ + + *ier = 0; + +/* set to asymptotic values if argument is sufficiently small */ + + if (z_abs(z__) < tiny) { + fjs[0] = done; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + fjs[i__] = zero; + } + + if (*ifder == 1) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjder[i__] = zero; + } + fjder[1] = done / (*scale * 3); + } + + return 0; + } + +/* ... 
Step 1: recursion up to find ntop, starting from nterms */ + + *ntop = 0; + zinv = done / *z__; + fjs[*nterms] = done; + fjs[*nterms - 1] = zero; + + i__1 = *lwfjs; + for (i__ = *nterms; i__ <= i__1; ++i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ - 1]; + fjs[i__ + 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + *ntop = i__ + 1; + break; + } + } + if (*ntop == 0) { + *ier = 8; + return 0; + } + +/* ... Step 2: Recursion back down to generate the unscaled jfuns: */ +/* if magnitude exceeds UPBOUND2, rescale and continue the */ +/* recursion (saving the order at which rescaling occurred */ +/* in array iscale. */ + + i__1 = *ntop; + for (i__ = 0; i__ <= i__1; ++i__) { + iscale[i__] = 0; + } + + fjs[*ntop] = zero; + fjs[*ntop - 1] = done; + for (i__ = *ntop - 1; i__ >= 1; --i__) { + dcoef = (i__ << 1) + done; + ztmp = dcoef * zinv * fjs[i__] - fjs[i__ + 1]; + fjs[i__ - 1] = ztmp; + +/* Computing 2nd power */ + d__1 = creal(ztmp); +/* Computing 2nd power */ + d__2 = cimag(ztmp); + dd = d__1 * d__1 + d__2 * d__2; + if (dd > upbound2) { + fjs[i__] *= upbound2inv; + fjs[i__ - 1] *= upbound2inv; + iscale[i__] = 1; + } +/* L2200: */ + } + +/* ... Step 3: go back up to the top and make sure that all */ +/* Bessel functions are scaled by the same factor */ +/* (i.e. the net total of times rescaling was invoked */ +/* on the way down in the previous loop). */ +/* At the same time, add scaling to fjs array. */ + + scalinv = done / *scale; + sctot = 1.; + i__1 = *ntop; + for (i__ = 1; i__ <= i__1; ++i__) { + sctot *= scalinv; + if (iscale[i__ - 1] == 1) { + sctot *= upbound2inv; + } + fjs[i__] *= sctot; + } + +/* ... 
Determine the normalization parameter: */ + + z_sin(&z__1, z__); + fj0 = z__1 * zinv; + z_cos(&z__1, z__); + fj1 = fj0 * zinv - z__1 * zinv; + + d0 = z_abs(&fj0); + d1 = z_abs(&fj1); + if (d1 > d0) { + zscale = fj1 / (fjs[1] * *scale); + } else { + zscale = fj0 / fjs[0]; + } + +/* ... Scale the jfuns by zscale: */ + + ztmp = zscale; + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + fjs[i__] *= ztmp; + } + +/* ... Finally, calculate the derivatives if desired: */ + + if (*ifder == 1) { + fjs[*nterms + 1] *= ztmp; + + fjder[0] = -fjs[1] * *scale; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dc1 = i__ / ((i__ << 1) + done); + dc2 = done - dc1; + dc1 *= scalinv; + dc2 *= *scale; + fjder[i__] = dc1 * fjs[i__ - 1] - dc2 * fjs[i__ + 1]; + } + } + return 0; +} /* jfuns3d_ */ + +/* Start of functions borrowed from helmrouts3d.f */ + +/* This file contains the basic subroutines for */ +/* forming and evaluating multipole (partial wave) expansions. */ + +/* Documentation is incomplete and ongoing... */ + + +/* Remarks on scaling conventions. */ + +/* 1) Hankel and Bessel functions are consistently scaled as */ +/* hvec(n)= h_n(z)*scale^(n) */ +/* jvec(n)= j_n(z)/scale^(n) */ + +/* In some earlier FMM implementations, the convention */ +/* hvec(n)= h_n(z)*scale^(n+1) */ +/* was sometimes used, leading to various obscure rescaling */ +/* steps. */ + +/* scale should be of the order of |z| if |z| < 1. Otherwise, */ +/* scale should be set to 1. */ + + +/* 2) There are many definitions of the spherical harmonics, */ +/* which differ in terms of normalization constants. We */ +/* adopt the following convention: */ + +/* For m>0, we define Y_n^m according to */ + +/* Y_n^m = \sqrt{2n+1} \sqrt{\frac{ (n-m)!}{(n+m)!}} \cdot */ +/* P_n^m(\cos \theta) e^{i m phi} */ +/* and */ + +/* Y_n^-m = dconjg( Y_n^m ) */ + +/* We omit the Condon-Shortley phase factor (-1)^m in the */ +/* definition of Y_n^m for m<0. (This is standard in several */ +/* communities.) 
*/ + +/* We also omit the factor \sqrt{\frac{1}{4 \pi}}, so that */ +/* the Y_n^m are orthogonal on the unit sphere but not */ +/* orthonormal. (This is also standard in several communities.) */ +/* More precisely, */ + +/* \int_S Y_n^m Y_n^m d\Omega = 4 \pi. */ + +/* Using our standard definition, the addition theorem takes */ +/* the simple form */ + +/* e^( i k r}/(ikr) = */ +/* \sum_n \sum_m j_n(k|S|) Ylm*(S) h_n(k|T|) Ylm(T) */ + + +/* ----------------------------------------------------------------------- */ +/* h3d01: computes h0, h1 (first two spherical Hankel fns.) */ +/* h3dall: computes Hankel functions of all orders and scales them */ +/* ********************************************************************** */ +/* Subroutine */ int h3d01_(doublecomplex *z__, doublecomplex *h0, + doublecomplex *h1) +{ + /* Initialized data */ + + static doublecomplex eye = I; + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* System generated locals */ + doublecomplex z__1; + + /* Local variables */ + doublecomplex cd; + +/* ********************************************************************** */ + +/* Compute spherical Hankel functions of order 0 and 1 */ + +/* h0(z) = exp(i*z)/(i*z), */ +/* h1(z) = - h0' = -h0*(i-1/z) = h0*(1/z-i) */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* z : argument of Hankel functions */ +/* if abs(z)<1.0d-15, returns zero. */ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* h0 : h0(z) (spherical Hankel function of order 0). */ +/* h1 : -h0'(z) (spherical Hankel function of order 1). 
*/ + +/* ----------------------------------------------------------------------- */ + + if (z_abs(z__) < thresh) { + *h0 = 0.; + *h1 = 0.; + return 0; + } + +/* Otherwise, use formula */ + + cd = eye * *z__; + z_exp(&z__1, &cd); + *h0 = z__1 / cd; + *h1 = *h0 * (done / *z__ - eye); + + return 0; +} /* h3d01_ */ + + + + +/* ********************************************************************** */ +/* Subroutine */ int h3dall_(integer *nterms, doublecomplex *z__, doublereal * + scale, doublecomplex *hvec, integer *ifder, doublecomplex *hder) +{ + /* Initialized data */ + static doublereal thresh = 1e-15; + static doublereal done = 1.; + + /* Builtin functions */ + double z_abs(doublecomplex *); + + /* Local variables */ + integer i__; + integer i__1; + doublereal dtmp; + doublecomplex zinv, ztmp; + doublereal scal2; + +/* ********************************************************************** */ + +/* This subroutine computes scaled versions of the spherical Hankel */ +/* functions h_n of orders 0 to nterms. */ + +/* hvec(n)= h_n(z)*scale^(n) */ + +/* The parameter SCALE is useful when |z| < 1, in which case */ +/* it damps out the rapid growth of h_n as n increases. In such */ +/* cases, we recommend setting */ + +/* scale = |z| */ + +/* or something close. If |z| > 1, set scale = 1. */ + +/* If the flag IFDER is set to one, it also computes the */ +/* derivatives of h_n. */ + +/* hder(n)= h_n'(z)*scale^(n) */ + +/* NOTE: If |z| < 1.0d-15, the subroutine returns zero. */ + +/* ----------------------------------------------------------------------- */ +/* INPUT: */ + +/* nterms : highest order of the Hankel functions to be computed. */ +/* z : argument of the Hankel functions. */ +/* scale : scaling parameter discussed above */ +/* ifder : flag indcating whether derivatives should be computed. 
*/ +/* ifder = 1 ==> compute */ +/* ifder = 0 ==> do not compute */ + +/* ----------------------------------------------------------------------- */ +/* OUTPUT: */ + +/* hvec : the vector of spherical Hankel functions */ +/* hder : the derivatives of the spherical Hankel functions */ + +/* ----------------------------------------------------------------------- */ + + +/* If |z| < thresh, return zeros. */ + + if (z_abs(z__) < thresh) { + i__1 = *nterms; + for (i__ = 0; i__ <= i__1; ++i__) { + hvec[i__] = 0; + hder[i__] = 0; + } + return 0; + } + +/* Otherwise, get h_0 and h_1 analytically and the rest via */ +/* recursion. */ + + h3d01_(z__, hvec, &hvec[1]); + hvec[0] = hvec[0]; + hvec[1] *= *scale; + +/* From Abramowitz and Stegun (10.1.19) */ + +/* h_{n+1}(z)=(2n+1)/z * h_n(z) - h_{n-1}(z) */ + +/* With scaling: */ + +/* hvec(n+1)=scale*(2n+1)/z * hvec(n) -(scale**2) hvec(n-1) */ + + scal2 = *scale * *scale; + zinv = *scale / *z__; + i__1 = *nterms - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = (i__ << 1) + done; + ztmp = zinv * dtmp; + hvec[i__ + 1] = ztmp * hvec[i__] - scal2 * hvec[i__ - 1]; + } + +/* From Abramowitz and Stegun (10.1.21) */ + +/* h_{n}'(z)= h_{n-1}(z) - (n+1)/z * h_n(z) */ + +/* With scaling: */ + +/* hder(n)=scale* hvec(n-1) - (n+1)/z * hvec(n) */ + + + if (*ifder == 1) { + + hder[0] = -hvec[1] / *scale; + zinv = 1. 
/ *z__; + i__1 = *nterms; + for (i__ = 1; i__ <= i__1; ++i__) { + dtmp = i__ + done; + ztmp = zinv * dtmp; + hder[i__] = *scale * hvec[i__ - 1] - ztmp * hvec[i__]; + } + } + + return 0; +} /* h3dall_ */ + diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_helmholtz_utils.h similarity index 82% rename from pytential/qbx/target_specific/_internal.h rename to pytential/qbx/target_specific/_helmholtz_utils.h index b97a6a79..1e605a4d 100644 --- a/pytential/qbx/target_specific/_internal.h +++ b/pytential/qbx/target_specific/_helmholtz_utils.h @@ -1,5 +1,5 @@ -#ifndef UTILS_H -#define UTILS_H +#ifndef HELMHOLTZ_UTILS_H +#define HELMHOLTZ_UTILS_H #include @@ -11,4 +11,4 @@ extern int jfuns3d_(int *ier, int *nterms, double complex *z, extern int h3dall_(int *nterms, double complex *z, double *scale, double complex *hvec, int *ifder, double complex *hder); -#endif // UTILS_H +#endif /* HELMHOLTZ_UTILS_H */ diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 9b55465a..051cfa08 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -12,13 +12,14 @@ from libc.stdlib cimport abort cimport openmp -cdef extern from "_internal.h" nogil: +cdef extern from "_helmholtz_utils.h" nogil: int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale, double complex *fjs, int *ifder, double complex *fjder, int *lwfjs, int *iscale, int *ntop); int h3dall_(int *nterms, double complex *z, double *scale, double complex *hvec, int *ifder, double complex *hder); + cdef extern from "complex.h" nogil: double cabs(double complex) diff --git a/pytential/qbx/target_specific/cdjseval3d.c b/pytential/qbx/target_specific/cdjseval3d.c deleted file mode 100644 index 40cd5d37..00000000 --- a/pytential/qbx/target_specific/cdjseval3d.c +++ /dev/null @@ -1,238 +0,0 @@ -/* Based on cdjseval3d.f from fmmlib3d, translated with modified f2c */ - -#include 
"f2c.h" - -/* Original copyright notice: */ - -/* ********************************************************************** */ -/* Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ -/* Contact: greengard@cims.nyu.edu */ -/* -/* This software is being released under a modified FreeBSD license */ -/* (see COPYING in home directory). */ -/* ********************************************************************** */ - -/* $Date: 2011-07-15 16:28:31 -0400 (Fri, 15 Jul 2011) $ */ -/* $Revision: 2253 $ */ - - -/* Computation of spherical Bessel functions via recurrence */ - -/* ********************************************************************** */ -/* Subroutine */ int jfuns3d_(integer *ier, integer *nterms, doublecomplex * - z__, doublereal *scale, doublecomplex *fjs, integer *ifder, - doublecomplex *fjder, integer *lwfjs, integer *iscale, integer *ntop) -{ - /* Initialized data */ - - static doublereal upbound2 = 1e40; - static doublereal upbound2inv = 1e-40; - static doublereal tiny = 1e-200; - static doublereal done = 1.; - static doublereal zero = 0.; - - /* System generated locals */ - integer i__1; - doublereal d__1, d__2; - doublecomplex z__1; - - /* Local variables */ - integer i__; - doublereal d0, d1, dd, dc1, dc2; - doublecomplex fj0, fj1, zinv, ztmp; - doublereal dcoef; - doublereal sctot; - doublecomplex zscale; - doublereal scalinv; - -/* ********************************************************************** */ - -/* PURPOSE: */ - -/* This subroutine evaluates the first NTERMS spherical Bessel */ -/* functions and if required, their derivatives. */ -/* It incorporates a scaling parameter SCALE so that */ - -/* fjs_n(z)=j_n(z)/SCALE^n */ -/* fjder_n(z)=\frac{\partial fjs_n(z)}{\partial z} */ - -/* NOTE: The scaling parameter SCALE is meant to be used when */ -/* abs(z) < 1, in which case we recommend setting */ -/* SCALE = abs(z). This prevents the fjs_n from */ -/* underflowing too rapidly. */ -/* Otherwise, set SCALE=1. 
*/ -/* Do not set SCALE = abs(z) if z could take on the */ -/* value zero. */ -/* In an FMM, when forming an expansion from a collection of */ -/* sources, set SCALE = min( abs(k*r), 1) */ -/* where k is the Helmholtz parameter and r is the box dimension */ -/* at the relevant level. */ - -/* INPUT: */ - -/* nterms (integer): order of expansion of output array fjs */ -/* z (complex *16): argument of the spherical Bessel functions */ -/* scale (real *8) : scaling factor (discussed above) */ -/* ifder (integer): flag indicating whether to calculate "fjder" */ -/* 0 NO */ -/* 1 YES */ -/* lwfjs (integer): upper limit of input arrays */ -/* fjs(0:lwfjs) and iscale(0:lwfjs) */ -/* iscale (integer): integer workspace used to keep track of */ -/* internal scaling */ - -/* OUTPUT: */ - -/* ier (integer): error return code */ -/* ier=0 normal return; */ -/* ier=8 insufficient array dimension lwfjs */ -/* fjs (complex *16): array of scaled Bessel functions. */ -/* fjder (complex *16): array of derivs of scaled Bessel functions. */ -/* ntop (integer) : highest index in arrays fjs that is nonzero */ - -/* NOTE, that fjs and fjder arrays must be at least (nterms+2) */ -/* complex *16 elements long. */ - - - - -/* ... Initializing ... */ - - *ier = 0; - -/* set to asymptotic values if argument is sufficiently small */ - - if (z_abs(z__) < tiny) { - fjs[0] = done; - i__1 = *nterms; - for (i__ = 1; i__ <= i__1; ++i__) { - fjs[i__] = zero; - } - - if (*ifder == 1) { - i__1 = *nterms; - for (i__ = 0; i__ <= i__1; ++i__) { - fjder[i__] = zero; - } - fjder[1] = done / (*scale * 3); - } - - return 0; - } - -/* ... 
Step 1: recursion up to find ntop, starting from nterms */ - - *ntop = 0; - zinv = done / *z__; - fjs[*nterms] = done; - fjs[*nterms - 1] = zero; - - i__1 = *lwfjs; - for (i__ = *nterms; i__ <= i__1; ++i__) { - dcoef = (i__ << 1) + done; - ztmp = dcoef * zinv * fjs[i__] - fjs[i__ - 1]; - fjs[i__ + 1] = ztmp; - -/* Computing 2nd power */ - d__1 = creal(ztmp); -/* Computing 2nd power */ - d__2 = cimag(ztmp); - dd = d__1 * d__1 + d__2 * d__2; - if (dd > upbound2) { - *ntop = i__ + 1; - break; - } - } - if (*ntop == 0) { - *ier = 8; - return 0; - } - -/* ... Step 2: Recursion back down to generate the unscaled jfuns: */ -/* if magnitude exceeds UPBOUND2, rescale and continue the */ -/* recursion (saving the order at which rescaling occurred */ -/* in array iscale. */ - - i__1 = *ntop; - for (i__ = 0; i__ <= i__1; ++i__) { - iscale[i__] = 0; - } - - fjs[*ntop] = zero; - fjs[*ntop - 1] = done; - for (i__ = *ntop - 1; i__ >= 1; --i__) { - dcoef = (i__ << 1) + done; - ztmp = dcoef * zinv * fjs[i__] - fjs[i__ + 1]; - fjs[i__ - 1] = ztmp; - -/* Computing 2nd power */ - d__1 = creal(ztmp); -/* Computing 2nd power */ - d__2 = cimag(ztmp); - dd = d__1 * d__1 + d__2 * d__2; - if (dd > upbound2) { - fjs[i__] *= upbound2inv; - fjs[i__ - 1] *= upbound2inv; - iscale[i__] = 1; - } -/* L2200: */ - } - -/* ... Step 3: go back up to the top and make sure that all */ -/* Bessel functions are scaled by the same factor */ -/* (i.e. the net total of times rescaling was invoked */ -/* on the way down in the previous loop). */ -/* At the same time, add scaling to fjs array. */ - - scalinv = done / *scale; - sctot = 1.; - i__1 = *ntop; - for (i__ = 1; i__ <= i__1; ++i__) { - sctot *= scalinv; - if (iscale[i__ - 1] == 1) { - sctot *= upbound2inv; - } - fjs[i__] *= sctot; - } - -/* ... 
Determine the normalization parameter: */ - - z_sin(&z__1, z__); - fj0 = z__1 * zinv; - z_cos(&z__1, z__); - fj1 = fj0 * zinv - z__1 * zinv; - - d0 = z_abs(&fj0); - d1 = z_abs(&fj1); - if (d1 > d0) { - zscale = fj1 / (fjs[1] * *scale); - } else { - zscale = fj0 / fjs[0]; - } - -/* ... Scale the jfuns by zscale: */ - - ztmp = zscale; - i__1 = *nterms; - for (i__ = 0; i__ <= i__1; ++i__) { - fjs[i__] *= ztmp; - } - -/* ... Finally, calculate the derivatives if desired: */ - - if (*ifder == 1) { - fjs[*nterms + 1] *= ztmp; - - fjder[0] = -fjs[1] * *scale; - i__1 = *nterms; - for (i__ = 1; i__ <= i__1; ++i__) { - dc1 = i__ / ((i__ << 1) + done); - dc2 = done - dc1; - dc1 *= scalinv; - dc2 *= *scale; - fjder[i__] = dc1 * fjs[i__ - 1] - dc2 * fjs[i__ + 1]; - } - } - return 0; -} /* jfuns3d_ */ - diff --git a/pytential/qbx/target_specific/f2c.h b/pytential/qbx/target_specific/f2c.h deleted file mode 100644 index 897b5bd0..00000000 --- a/pytential/qbx/target_specific/f2c.h +++ /dev/null @@ -1,33 +0,0 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
- - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - -#include - -typedef int integer; -typedef unsigned int uinteger; -typedef double doublereal; -typedef double complex doublecomplex; - -static inline double z_abs(doublecomplex *z) { - return cabs(*z); -} - -static inline void z_exp(doublecomplex *out, doublecomplex *z) { - *out = cexp(*z); -} - -static inline void z_sin(doublecomplex *out, doublecomplex *z) { - *out = csin(*z); -} - -static inline void z_cos(doublecomplex *out, doublecomplex *z) { - *out = ccos(*z); -} - -#endif diff --git a/pytential/qbx/target_specific/helmrouts3d.c b/pytential/qbx/target_specific/helmrouts3d.c deleted file mode 100644 index f4ab60e1..00000000 --- a/pytential/qbx/target_specific/helmrouts3d.c +++ /dev/null @@ -1,242 +0,0 @@ -/* Based on helmrouts3d.f from fmmlib3d, translated with modified f2c */ - -#include "f2c.h" - -/* Original copyright notice: */ - -/* ************************************************************************** */ -/* Copyright (C) 2009-2012: Leslie Greengard and Zydrunas Gimbutas */ -/* Contact: greengard@cims.nyu.edu */ -/* */ -/* This software is being released under a modified FreeBSD license */ -/* (see COPYING in home directory). */ -/* ************************************************************************** */ - -/* This file contains the basic subroutines for */ -/* forming and evaluating multipole (partial wave) expansions. */ - -/* Documentation is incomplete and ongoing... */ - - -/* Remarks on scaling conventions. */ - -/* 1) Hankel and Bessel functions are consistently scaled as */ -/* hvec(n)= h_n(z)*scale^(n) */ -/* jvec(n)= j_n(z)/scale^(n) */ - -/* In some earlier FMM implementations, the convention */ -/* hvec(n)= h_n(z)*scale^(n+1) */ -/* was sometimes used, leading to various obscure rescaling */ -/* steps. */ - -/* scale should be of the order of |z| if |z| < 1. Otherwise, */ -/* scale should be set to 1. 
*/ - - -/* 2) There are many definitions of the spherical harmonics, */ -/* which differ in terms of normalization constants. We */ -/* adopt the following convention: */ - -/* For m>0, we define Y_n^m according to */ - -/* Y_n^m = \sqrt{2n+1} \sqrt{\frac{ (n-m)!}{(n+m)!}} \cdot */ -/* P_n^m(\cos \theta) e^{i m phi} */ -/* and */ - -/* Y_n^-m = dconjg( Y_n^m ) */ - -/* We omit the Condon-Shortley phase factor (-1)^m in the */ -/* definition of Y_n^m for m<0. (This is standard in several */ -/* communities.) */ - -/* We also omit the factor \sqrt{\frac{1}{4 \pi}}, so that */ -/* the Y_n^m are orthogonal on the unit sphere but not */ -/* orthonormal. (This is also standard in several communities.) */ -/* More precisely, */ - -/* \int_S Y_n^m Y_n^m d\Omega = 4 \pi. */ - -/* Using our standard definition, the addition theorem takes */ -/* the simple form */ - -/* e^( i k r}/(ikr) = */ -/* \sum_n \sum_m j_n(k|S|) Ylm*(S) h_n(k|T|) Ylm(T) */ - - -/* ----------------------------------------------------------------------- */ -/* h3d01: computes h0, h1 (first two spherical Hankel fns.) */ -/* h3dall: computes Hankel functions of all orders and scales them */ -/* ********************************************************************** */ -/* Subroutine */ int h3d01_(doublecomplex *z__, doublecomplex *h0, - doublecomplex *h1) -{ - /* Initialized data */ - - static doublecomplex eye = I; - static doublereal thresh = 1e-15; - static doublereal done = 1.; - - /* System generated locals */ - doublecomplex z__1; - - /* Local variables */ - doublecomplex cd; - -/* ********************************************************************** */ - -/* Compute spherical Hankel functions of order 0 and 1 */ - -/* h0(z) = exp(i*z)/(i*z), */ -/* h1(z) = - h0' = -h0*(i-1/z) = h0*(1/z-i) */ - -/* ----------------------------------------------------------------------- */ -/* INPUT: */ - -/* z : argument of Hankel functions */ -/* if abs(z)<1.0d-15, returns zero. 
*/ - -/* ----------------------------------------------------------------------- */ -/* OUTPUT: */ - -/* h0 : h0(z) (spherical Hankel function of order 0). */ -/* h1 : -h0'(z) (spherical Hankel function of order 1). */ - -/* ----------------------------------------------------------------------- */ - - if (z_abs(z__) < thresh) { - *h0 = 0.; - *h1 = 0.; - return 0; - } - -/* Otherwise, use formula */ - - cd = eye * *z__; - z_exp(&z__1, &cd); - *h0 = z__1 / cd; - *h1 = *h0 * (done / *z__ - eye); - - return 0; -} /* h3d01_ */ - - - - -/* ********************************************************************** */ -/* Subroutine */ int h3dall_(integer *nterms, doublecomplex *z__, doublereal * - scale, doublecomplex *hvec, integer *ifder, doublecomplex *hder) -{ - /* Initialized data */ - static doublereal thresh = 1e-15; - static doublereal done = 1.; - - /* Builtin functions */ - double z_abs(doublecomplex *); - - /* Local variables */ - integer i__; - integer i__1; - doublereal dtmp; - doublecomplex zinv, ztmp; - doublereal scal2; - -/* ********************************************************************** */ - -/* This subroutine computes scaled versions of the spherical Hankel */ -/* functions h_n of orders 0 to nterms. */ - -/* hvec(n)= h_n(z)*scale^(n) */ - -/* The parameter SCALE is useful when |z| < 1, in which case */ -/* it damps out the rapid growth of h_n as n increases. In such */ -/* cases, we recommend setting */ - -/* scale = |z| */ - -/* or something close. If |z| > 1, set scale = 1. */ - -/* If the flag IFDER is set to one, it also computes the */ -/* derivatives of h_n. */ - -/* hder(n)= h_n'(z)*scale^(n) */ - -/* NOTE: If |z| < 1.0d-15, the subroutine returns zero. */ - -/* ----------------------------------------------------------------------- */ -/* INPUT: */ - -/* nterms : highest order of the Hankel functions to be computed. */ -/* z : argument of the Hankel functions. 
*/ -/* scale : scaling parameter discussed above */ -/* ifder : flag indcating whether derivatives should be computed. */ -/* ifder = 1 ==> compute */ -/* ifder = 0 ==> do not compute */ - -/* ----------------------------------------------------------------------- */ -/* OUTPUT: */ - -/* hvec : the vector of spherical Hankel functions */ -/* hder : the derivatives of the spherical Hankel functions */ - -/* ----------------------------------------------------------------------- */ - - -/* If |z| < thresh, return zeros. */ - - if (z_abs(z__) < thresh) { - i__1 = *nterms; - for (i__ = 0; i__ <= i__1; ++i__) { - hvec[i__] = 0; - hder[i__] = 0; - } - return 0; - } - -/* Otherwise, get h_0 and h_1 analytically and the rest via */ -/* recursion. */ - - h3d01_(z__, hvec, &hvec[1]); - hvec[0] = hvec[0]; - hvec[1] *= *scale; - -/* From Abramowitz and Stegun (10.1.19) */ - -/* h_{n+1}(z)=(2n+1)/z * h_n(z) - h_{n-1}(z) */ - -/* With scaling: */ - -/* hvec(n+1)=scale*(2n+1)/z * hvec(n) -(scale**2) hvec(n-1) */ - - scal2 = *scale * *scale; - zinv = *scale / *z__; - i__1 = *nterms - 1; - for (i__ = 1; i__ <= i__1; ++i__) { - dtmp = (i__ << 1) + done; - ztmp = zinv * dtmp; - hvec[i__ + 1] = ztmp * hvec[i__] - scal2 * hvec[i__ - 1]; - } - -/* From Abramowitz and Stegun (10.1.21) */ - -/* h_{n}'(z)= h_{n-1}(z) - (n+1)/z * h_n(z) */ - -/* With scaling: */ - -/* hder(n)=scale* hvec(n-1) - (n+1)/z * hvec(n) */ - - - if (*ifder == 1) { - - hder[0] = -hvec[1] / *scale; - zinv = 1. 
/ *z__; - i__1 = *nterms; - for (i__ = 1; i__ <= i__1; ++i__) { - dtmp = i__ + done; - ztmp = zinv * dtmp; - hder[i__] = *scale * hvec[i__ - 1] - ztmp * hvec[i__]; - } - } - - return 0; -} /* h3dall_ */ - diff --git a/setup.py b/setup.py index e81a6c01..4d68287c 100644 --- a/setup.py +++ b/setup.py @@ -60,8 +60,7 @@ ext_modules = [ Extension( "pytential.qbx.target_specific._internal", ["pytential/qbx/target_specific/_internal.pyx", - "pytential/qbx/target_specific/cdjseval3d.c", - "pytential/qbx/target_specific/helmrouts3d.c"], + "pytential/qbx/target_specific/_helmholtz_utils.c"], extra_compile_args=["-fopenmp", "-ffast-math"], extra_link_args=["-fopenmp"] ), -- GitLab From 809d2979007b7ab2f0e41e79c23b995c5be12048 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 11 Aug 2018 00:07:58 -0500 Subject: [PATCH 076/139] Add back memoize_method --- pytential/qbx/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index b791a8d8..1b87fc5c 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -585,6 +585,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): return maxstretch + @memoize_method def qbx_fmm_geometry_data(self, target_discrs_and_qbx_sides): """ :arg target_discrs_and_qbx_sides: -- GitLab From 11f682c1b66660e253555079e9dd0b07782826ad Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 14:01:48 -0500 Subject: [PATCH 077/139] Suppress maybe-uninitialized warnings. 
--- pytential/qbx/target_specific/_internal.h | 6 ++++++ pytential/qbx/target_specific/_internal.pyx | 9 ++++++++- setup.py | 12 ++++++++---- 3 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 pytential/qbx/target_specific/_internal.h diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h new file mode 100644 index 00000000..cef63d8d --- /dev/null +++ b/pytential/qbx/target_specific/_internal.h @@ -0,0 +1,6 @@ +#ifndef INTERNAL_H +#define INTERNAL_H + +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + +#endif diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 051cfa08..a9117ce8 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -1,5 +1,5 @@ #!python -#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True +#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True import numpy as np import cython @@ -24,6 +24,10 @@ cdef extern from "complex.h" nogil: double cabs(double complex) +cdef extern from "_internal.h" nogil: + pass + + def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): """Evaluate spherical Bessel functions. 
@@ -343,6 +347,9 @@ def eval_target_specific_qbx_locals( if not (laplace_slp or laplace_dlp or helmholtz_slp): raise ValueError("unknown kernel") + if qbx_centers.shape[0] == 0: + return + # Hack to obtain thread-local storage maxthreads = openmp.omp_get_max_threads() diff --git a/setup.py b/setup.py index 4d68287c..a0a7ef68 100644 --- a/setup.py +++ b/setup.py @@ -59,9 +59,13 @@ write_git_revision("pytential") ext_modules = [ Extension( "pytential.qbx.target_specific._internal", - ["pytential/qbx/target_specific/_internal.pyx", - "pytential/qbx/target_specific/_helmholtz_utils.c"], - extra_compile_args=["-fopenmp", "-ffast-math"], + sources=[ + "pytential/qbx/target_specific/_internal.pyx", + "pytential/qbx/target_specific/_helmholtz_utils.c"], + depends=[ + "pytential/qbx/target_specific/_internal.h", + "pytential/qbx/target_specific/_helmholtz_utils.h"], + extra_compile_args=["-Wall", "-fopenmp", "-ffast-math"], extra_link_args=["-fopenmp"] ), ] @@ -104,7 +108,7 @@ setup(name="pytential", packages=find_packages(), - ext_modules = cythonize(ext_modules), + ext_modules=cythonize(ext_modules), install_requires=[ "pytest>=2.3", -- GitLab From 506098b7d7adfbd1ee9a9443f27b7374a077175f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 15:20:25 -0500 Subject: [PATCH 078/139] Towards supporting Helmholtz DLP. 
--- pytential/qbx/fmmlib.py | 12 +--- pytential/qbx/target_specific/_internal.pyx | 79 ++++++++++++++++++--- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index c52cb7f5..83bc4fe9 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -231,16 +231,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad=ifgrad) - @staticmethod - def is_supported_helmknl_for_tsqbx(knl): - if isinstance(knl, DirectionalSourceDerivative): - knl = knl.inner_kernel - - else: - if isinstance(knl, HelmholtzKernel) and knl.dim == 3: - return True - - return isinstance(knl, LaplaceKernel) and knl.dim == 3 + def is_supported_helmknl_for_tsqbx(self, knl): + return self.is_supported_helmknl(knl) @staticmethod def is_supported_helmknl(knl): diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index a9117ce8..ac816614 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -195,6 +195,56 @@ cdef void tsqbx_laplace_dlp( return +cdef void tsqbx_helmholtz_dlp( + double[3] source, + double[3] center, + double[3] target, + double[3] grad, + int order, + double complex k) nogil: + cdef: + int i, j + double result, sc_d, tc_d, cos_angle, alpha, R + double[128] tmp + double[128] derivs + double[3] cms + double[3] tmc + + """ + for j in range(3): + cms[j] = center[j] - source[j] + tmc[j] = target[j] - center[j] + grad[j] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + alpha = ( + (target[0] - center[0]) * (source[0] - center[0]) + + (target[1] - center[1]) * (source[1] - center[1]) + + (target[2] - center[2]) * (source[2] - center[2])) + + cos_angle = alpha / (tc_d * sc_d) + + legvals(cos_angle, order, tmp, derivs) + + R = 1 / sc_d + + for i in range(0, order + 1): + # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) + for j in range(3): + grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * 
tmp[i] + for j in range(3): + # Siegel and Tornberg has a sign flip here :( + grad[j] += ( + tmc[j] / (tc_d * sc_d) + + alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] + R *= (tc_d / sc_d) + """ + + return + + cdef double tsqbx_laplace_slp( double[3] source, double[3] center, @@ -338,14 +388,17 @@ def eval_target_specific_qbx_locals( int i, tid double complex result double[:,:] source, center, target, grad - int laplace_slp, helmholtz_slp, laplace_dlp + int laplace_slp, helmholtz_slp, laplace_dlp, helmholtz_dlp - laplace_slp = (helmholtz_k == 0) and (dipstr is not None) and (dipvec is None) - laplace_dlp = (helmholtz_k == 0) and (dipstr is not None) and (dipvec is not None) - helmholtz_slp = (helmholtz_k != 0) and (dipstr is not None) and (dipvec is None) + if dipstr is None: + raise ValueError("must specify dipvec") - if not (laplace_slp or laplace_dlp or helmholtz_slp): - raise ValueError("unknown kernel") + laplace_slp = (helmholtz_k == 0) and (dipvec is None) + laplace_dlp = (helmholtz_k == 0) and (dipvec is not None) + helmholtz_slp = (helmholtz_k != 0) and (dipvec is None) + helmholtz_dlp = (helmholtz_k != 0) and (dipvec is not None) + + assert laplace_slp or laplace_dlp or helmholtz_slp or helmholtz_dlp if qbx_centers.shape[0] == 0: return @@ -393,8 +446,8 @@ def eval_target_specific_qbx_locals( for i in range(3): source[tid, i] = sources[i, isrc] - # NOTE: Don't replace with +=, since that makes Cython think - # we are doing an OpenMP reduction. + # NOTE: Don't use +=, since that makes Cython think we are + # doing an OpenMP reduction. 
if laplace_slp: result = result + dipstr[isrc] * ( @@ -416,4 +469,14 @@ def eval_target_specific_qbx_locals( grad[tid, 1] * dipvec[1, isrc] + grad[tid, 2] * dipvec[2, isrc]) + elif helmholtz_dlp: + tsqbx_helmholtz_dlp(&source[tid, 0], &center[tid, 0], + &target[tid, 0], &grad[tid, 0], order, + helmholtz_k) + + result = result + dipstr[isrc] * ( + grad[tid, 0] * dipvec[0, isrc] + + grad[tid, 1] * dipvec[1, isrc] + + grad[tid, 2] * dipvec[2, isrc]) + pot[tgt] = pot[tgt] + result -- GitLab From 94aa9289156726cc2086e741d12f9072d186a181 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 16:55:49 -0500 Subject: [PATCH 079/139] Make Helmholtz DLP work --- pytential/qbx/target_specific/_internal.h | 3 +- pytential/qbx/target_specific/_internal.pyx | 146 +++++++++++++------- test/test_target_specific_qbx.py | 3 - 3 files changed, 96 insertions(+), 56 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h index cef63d8d..914b2d05 100644 --- a/pytential/qbx/target_specific/_internal.h +++ b/pytential/qbx/target_specific/_internal.h @@ -1,6 +1,7 @@ #ifndef INTERNAL_H #define INTERNAL_H -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +// Temporary buffer size for holding e.g. 
Legendre polynomial values +#define BUFSIZE 64 #endif diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index ac816614..e05fd24b 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -1,5 +1,5 @@ #!python -#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True +#cython: warn.unused=True, warn.unused_arg=True, warn.unreachable=True, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True import numpy as np import cython @@ -12,6 +12,10 @@ from libc.stdlib cimport abort cimport openmp +cdef extern from "complex.h" nogil: + double cabs(double complex) + + cdef extern from "_helmholtz_utils.h" nogil: int jfuns3d_(int *ier, int *nterms, double complex * z, double *scale, double complex *fjs, int *ifder, double complex *fjder, @@ -20,12 +24,8 @@ cdef extern from "_helmholtz_utils.h" nogil: double complex *hvec, int *ifder, double complex *hder); -cdef extern from "complex.h" nogil: - double cabs(double complex) - - cdef extern from "_internal.h" nogil: - pass + const int BUFSIZE def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): @@ -39,9 +39,9 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): fjder: *None*, or output array of complex double derivatives """ cdef: - double complex[128] fjstemp - double complex[128] fjdertmp - int[128] iscale + double complex[BUFSIZE] fjstemp + double complex[BUFSIZE] fjdertmp + int[BUFSIZE] iscale int ier, ifder, lwfjs, ntop, i, nterms_ double scale_ double complex z_ @@ -50,7 +50,7 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): z_ = z scale_ = scale ifder = fjder is not None - lwfjs = 1024 + lwfjs = BUFSIZE jfuns3d_(&ier, &nterms_, &z_, &scale_, fjstemp, &ifder, fjdertmp, &lwfjs, iscale, &ntop) @@ -156,9 +156,9 @@ cdef void tsqbx_laplace_dlp( int order) nogil: cdef: int i, j - double result, sc_d, tc_d, cos_angle, alpha, R - 
double[128] tmp - double[128] derivs + double sc_d, tc_d, cos_angle, alpha, R + double[BUFSIZE] tmp + double[BUFSIZE] derivs double[3] cms double[3] tmc @@ -177,6 +177,7 @@ cdef void tsqbx_laplace_dlp( cos_angle = alpha / (tc_d * sc_d) + # Evaluate the Legendre terms. legvals(cos_angle, order, tmp, derivs) R = 1 / sc_d @@ -199,22 +200,25 @@ cdef void tsqbx_helmholtz_dlp( double[3] source, double[3] center, double[3] target, - double[3] grad, + double complex[3] grad, int order, double complex k) nogil: cdef: - int i, j - double result, sc_d, tc_d, cos_angle, alpha, R - double[128] tmp - double[128] derivs - double[3] cms - double[3] tmc + int m, n + int ier, ntop, ifder, lwfjs + double sc_d, tc_d, cos_angle, alpha + double[3] cms, tmc + double complex[3] grad_tmp + double[BUFSIZE] lvals, lderivs + double complex z + double complex[BUFSIZE] jvals, hvals, hderivs + int[BUFSIZE] iscale + double jscale, hscale, unscale - """ - for j in range(3): - cms[j] = center[j] - source[j] - tmc[j] = target[j] - center[j] - grad[j] = 0 + for m in range(3): + cms[m] = center[m] - source[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 tc_d = dist(target, center) sc_d = dist(source, center) @@ -226,21 +230,46 @@ cdef void tsqbx_helmholtz_dlp( cos_angle = alpha / (tc_d * sc_d) - legvals(cos_angle, order, tmp, derivs) + # Evaluate the Legendre terms. + legvals(cos_angle, order, lvals, lderivs) - R = 1 / sc_d + # Scaling magic for Bessel and Hankel terms. + # These values are taken from the fmmlib documentation. + jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. 
+ unscale = 1 - for i in range(0, order + 1): - # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) - for j in range(3): - grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i] - for j in range(3): - # Siegel and Tornberg has a sign flip here :( - grad[j] += ( - tmc[j] / (tc_d * sc_d) + - alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] - R *= (tc_d / sc_d) - """ + # Evaluate the spherical Bessel terms. + z = k * tc_d + ifder = 0 + lwfjs = BUFSIZE + jfuns3d_(&ier, &order, &z, &jscale, jvals, &ifder, NULL, &lwfjs, iscale, + &ntop) + if ier: + # This could in theory fail. + fprintf(stderr, "array passed to jfuns3d was too small\n") + abort() + + # Evaluate the spherical Hankel terms. + z = k * sc_d + ifder = 1 + h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) + + for n in range(0, order + 1): + for m in range(3): + grad_tmp[m] = -hderivs[n] * k * cms[m] * lvals[n] / sc_d + for m in range(3): + grad_tmp[m] += hvals[n] * ( + tmc[m] / (tc_d * sc_d) + + alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[n] + for m in range(3): + grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n]) + unscale *= jscale / hscale + + for m in range(3): + grad[m] *= 1j * k return @@ -294,10 +323,10 @@ cdef double complex tsqbx_helmholtz_slp( double complex k) nogil: cdef: int j, ntop, ier, ifder, lwfjs - double r, sc_d, tc_d, cos_angle - double[128] lvals - double complex[128] jvals, hvals - int[128] iscale + double sc_d, tc_d, cos_angle + double[BUFSIZE] lvals + double complex[BUFSIZE] jvals, hvals + int[BUFSIZE] iscale double jscale, hscale, unscale double complex z, result @@ -317,13 +346,14 @@ cdef double complex tsqbx_helmholtz_slp( # These values are taken from the fmmlib documentation. jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ^ n # Multiply against unscale to remove the scaling. - unscale = jscale / hscale + unscale = 1 # Evaluate the spherical Bessel terms. 
z = k * tc_d ifder = 0 - lwfjs = 128 + lwfjs = BUFSIZE jfuns3d_(&ier, &order, &z, &jscale, jvals, &ifder, NULL, &lwfjs, iscale, &ntop) if ier: @@ -335,9 +365,9 @@ cdef double complex tsqbx_helmholtz_slp( z = k * sc_d h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) - result = jvals[0] * hvals[0] * lvals[0] - - for j in range(1, 1 + order): + result = 0 + + for j in range(1 + order): result += (2 * j + 1) * unscale * (jvals[j] * hvals[j] * lvals[j]) unscale *= jscale / hscale @@ -388,6 +418,7 @@ def eval_target_specific_qbx_locals( int i, tid double complex result double[:,:] source, center, target, grad + double complex[:,:] grad_complex int laplace_slp, helmholtz_slp, laplace_dlp, helmholtz_dlp if dipstr is None: @@ -411,6 +442,7 @@ def eval_target_specific_qbx_locals( target = np.zeros((maxthreads, 65)) center = np.zeros((maxthreads, 65)) grad = np.zeros((maxthreads, 65)) + grad_complex = np.zeros((maxthreads, 65), dtype=np.complex) # TODO: Check that the order is not too high, since some temporary arrays # used above might overflow if that is the case. @@ -471,12 +503,22 @@ def eval_target_specific_qbx_locals( elif helmholtz_dlp: tsqbx_helmholtz_dlp(&source[tid, 0], &center[tid, 0], - &target[tid, 0], &grad[tid, 0], order, - helmholtz_k) + &target[tid, 0], &grad_complex[tid, 0], + order, helmholtz_k) result = result + dipstr[isrc] * ( - grad[tid, 0] * dipvec[0, isrc] + - grad[tid, 1] * dipvec[1, isrc] + - grad[tid, 2] * dipvec[2, isrc]) + grad_complex[tid, 0] * dipvec[0, isrc] + + grad_complex[tid, 1] * dipvec[1, isrc] + + grad_complex[tid, 2] * dipvec[2, isrc]) pot[tgt] = pot[tgt] + result + + # The Cython-generated OpenMP loop marks these variables as lastprivate. + # Due to this GCC warns that these could be used without being initialized. + # Initialize them here to suppress the warning. 
+ result = 0 + tid = 0 + ctr = 0 + src_ibox = tgt_box = 0 + tgt = itgt = itgt_start = itgt_end = 0 + isrc = isrc_box = isrc_start = isrc_end = isrc_box_start = isrc_box_end = 0 diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 0cc531dd..eea5825d 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -136,9 +136,6 @@ def test_spherical_hankel_functions(): def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): logging.basicConfig(level=logging.INFO) - if helmholtz_k != 0 and op == "D": - pytest.xfail("not implemented yet") - cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) -- GitLab From f39e064101de132fb86517cbed7ec698be0a11f0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 17:03:32 -0500 Subject: [PATCH 080/139] Rename some variables. --- pytential/qbx/target_specific/_internal.pyx | 52 ++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index e05fd24b..fced6776 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -155,17 +155,17 @@ cdef void tsqbx_laplace_dlp( double[3] grad, int order) nogil: cdef: - int i, j + int j, m double sc_d, tc_d, cos_angle, alpha, R double[BUFSIZE] tmp double[BUFSIZE] derivs double[3] cms double[3] tmc - for j in range(3): - cms[j] = center[j] - source[j] - tmc[j] = target[j] - center[j] - grad[j] = 0 + for m in range(3): + cms[m] = center[m] - source[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 tc_d = dist(target, center) sc_d = dist(source, center) @@ -182,15 +182,15 @@ cdef void tsqbx_laplace_dlp( R = 1 / sc_d - for i in range(0, order + 1): - # Invariant: R = (t_cd ** i / sc_d ** (i + 1)) - for j in range(3): - grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i] - for j in range(3): + for j in range(0, order + 1): + # Invariant: R = (t_cd ** j 
/ sc_d ** (j + 1)) + for m in range(3): + grad[m] += (j + 1) * cms[m] / (sc_d * sc_d) * R * tmp[j] + for m in range(3): # Siegel and Tornberg has a sign flip here :( - grad[j] += ( - tmc[j] / (tc_d * sc_d) + - alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i] + grad[m] += ( + tmc[m] / (tc_d * sc_d) + + alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[j] R *= (tc_d / sc_d) return @@ -204,7 +204,7 @@ cdef void tsqbx_helmholtz_dlp( int order, double complex k) nogil: cdef: - int m, n + int j, m int ier, ntop, ifder, lwfjs double sc_d, tc_d, cos_angle, alpha double[3] cms, tmc @@ -257,15 +257,15 @@ cdef void tsqbx_helmholtz_dlp( ifder = 1 h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) - for n in range(0, order + 1): + for j in range(0, order + 1): for m in range(3): - grad_tmp[m] = -hderivs[n] * k * cms[m] * lvals[n] / sc_d + grad_tmp[m] = -hderivs[j] * k * cms[m] * lvals[j] / sc_d for m in range(3): - grad_tmp[m] += hvals[n] * ( + grad_tmp[m] += hvals[j] * ( tmc[m] / (tc_d * sc_d) + - alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[n] + alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[j] for m in range(3): - grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n]) + grad[m] += (2 * j + 1) * unscale * (grad_tmp[m] * jvals[j]) unscale *= jscale / hscale for m in range(3): @@ -415,7 +415,7 @@ def eval_target_specific_qbx_locals( int tgt_box, src_ibox int isrc_box, isrc_box_start, isrc_box_end int isrc, isrc_start, isrc_end - int i, tid + int m, tid double complex result double[:,:] source, center, target, grad double complex[:,:] grad_complex @@ -456,15 +456,15 @@ def eval_target_specific_qbx_locals( tgt_box = qbx_center_to_target_box[ctr] tid = cython.parallel.threadid() - for i in range(3): - center[tid, i] = centers[i, ctr] + for m in range(3): + center[tid, m] = centers[m, ctr] for itgt in range(itgt_start, itgt_end): result = 0 tgt = center_to_target_lists[itgt] - for i in range(3): - target[tid, i] = targets[i, tgt] + 
for m in range(3): + target[tid, m] = targets[m, tgt] isrc_box_start = source_box_starts[tgt_box] isrc_box_end = source_box_starts[tgt_box + 1] @@ -475,8 +475,8 @@ def eval_target_specific_qbx_locals( isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] for isrc in range(isrc_start, isrc_end): - for i in range(3): - source[tid, i] = sources[i, isrc] + for m in range(3): + source[tid, m] = sources[m, isrc] # NOTE: Don't use +=, since that makes Cython think we are # doing an OpenMP reduction. -- GitLab From 4349f0dc8f5b61c42658a4ce27df22a5d6f9df80 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 17:31:04 -0500 Subject: [PATCH 081/139] Add comments. --- pytential/qbx/target_specific/_internal.pyx | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 8f5033de..97eb93bf 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -204,7 +204,7 @@ cdef void tsqbx_helmholtz_dlp( int order, double complex k) nogil: cdef: - int j, m + int n, m int ier, ntop, ifder, lwfjs double sc_d, tc_d, cos_angle, alpha double[3] cms, tmc @@ -257,16 +257,24 @@ cdef void tsqbx_helmholtz_dlp( ifder = 1 h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) - for j in range(0, order + 1): + # + # This is a mess, but amounts to the s-gradient of: + # + # __ order + # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) + # /__ n = 0 n n n + # + # + for n in range(0, order + 1): for m in range(3): - grad_tmp[m] = -hderivs[j] * k * cms[m] * lvals[j] / sc_d + grad_tmp[m] = -hderivs[n] * k * cms[m] * lvals[n] / sc_d for m in range(3): - grad_tmp[m] += hvals[j] * ( + grad_tmp[m] += hvals[n] * ( tmc[m] / (tc_d * sc_d) + - alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[j] + alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[n] for m in range(3): - grad[m] += (2 * j + 1) * unscale * 
(grad_tmp[m] * jvals[j]) - unscale *= jscale / hscale + grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n]) + unscale *= nscale / hscale for m in range(3): grad[m] *= 1j * k @@ -327,7 +335,7 @@ cdef double complex tsqbx_helmholtz_slp( int order, double complex k) nogil: cdef: - int j, ntop, ier, ifder, lwfjs + int n, ntop, ier, ifder, lwfjs double sc_d, tc_d, cos_angle double[BUFSIZE] lvals double complex[BUFSIZE] jvals, hvals @@ -351,7 +359,7 @@ cdef double complex tsqbx_helmholtz_slp( # These values are taken from the fmmlib documentation. jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 - # unscale = (jscale / hscale) ^ n + # unscale = (jscale / hscale) ** n # Multiply against unscale to remove the scaling. unscale = 1 @@ -371,9 +379,9 @@ cdef double complex tsqbx_helmholtz_slp( h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) result = 0 - - for j in range(1 + order): - result += (2 * j + 1) * unscale * (jvals[j] * hvals[j] * lvals[j]) + + for n in range(1 + order): + result += (2 * n + 1) * unscale * (jvals[n] * hvals[n] * lvals[n]) unscale *= jscale / hscale return result * 1j * k -- GitLab From e3f38d9dc43f584e1a36916c9cdd8bdd946006ff Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Aug 2018 17:32:03 -0500 Subject: [PATCH 082/139] Fix typo --- pytential/qbx/target_specific/_internal.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 97eb93bf..d44cbaff 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -274,7 +274,7 @@ cdef void tsqbx_helmholtz_dlp( alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[n] for m in range(3): grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n]) - unscale *= nscale / hscale + unscale *= jscale / hscale for m in range(3): grad[m] *= 1j * k -- GitLab From 
1bd11526a80b7e2dcd8d06e8911c5a0abdf090eb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 16 Aug 2018 15:25:23 -0500 Subject: [PATCH 083/139] Add a test case for Helmholtz with complex wavenumber --- test/test_target_specific_qbx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index eea5825d..4fda27a9 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -131,7 +131,7 @@ def test_spherical_hankel_functions(): @pytest.mark.parametrize("op", ["S", "D"]) -@pytest.mark.parametrize("helmholtz_k", [0, 1.2]) +@pytest.mark.parametrize("helmholtz_k", [0, 1.2, 1.2j]) @pytest.mark.parametrize("qbx_order", [0, 1, 5]) def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): logging.basicConfig(level=logging.INFO) @@ -158,7 +158,7 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): refiner_extra_kwargs = {} if helmholtz_k != 0: - refiner_extra_kwargs["kernel_length_scale"] = 5 / helmholtz_k + refiner_extra_kwargs["kernel_length_scale"] = 5 / abs(helmholtz_k) qbx, _ = QBXLayerPotentialSource( pre_density_discr, 4*target_order, @@ -178,7 +178,7 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): kernel = LaplaceKernel(3) kernel_kwargs = {} else: - kernel = HelmholtzKernel(3) + kernel = HelmholtzKernel(3, allow_evanescent=True) kernel_kwargs = {"k": sym.var("k")} u_sym = sym.var("u") -- GitLab From 371ce251e29edebdc0eeafe86339f7b764fdd515 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 16 Aug 2018 20:25:36 -0500 Subject: [PATCH 084/139] Update for TimingResult changes --- examples/performance.py | 2 +- pytential/qbx/performance.py | 2 +- test/test_performance_model.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/performance.py b/examples/performance.py index cc0c01db..3e2dc61b 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ -148,7 +148,7 
@@ def test_performance_model(ctx, perf_model): timing_result = {} for param in model_result: timing_result[param] = ( - sum(temp_timing_result[param].process_elapsed + sum(temp_timing_result[param]["process_elapsed"] for temp_timing_result in temp_timing_results)) / RUNS print("=" * 20) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 66af69db..f06679d0 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -784,7 +784,7 @@ def estimate_calibration_params(model_results, timing_results): for param, time in timing_result.items(): calibration_param = ( _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) - actual_times[calibration_param][i] = time.process_elapsed + actual_times[calibration_param][i] = time["process_elapsed"] result = {} diff --git a/test/test_performance_model.py b/test/test_performance_model.py index bf399e4b..f7517a72 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -328,9 +328,9 @@ def test_performance_model_correctness(ctx_getter, dim): # constant one wrangler. mismatches = [] for stage in timing_data: - if timing_data[stage].process_elapsed != modeled_time[stage]: + if timing_data[stage]["ops_elapsed"] != modeled_time[stage]: mismatches.append( - (stage, timing_data[stage].process_elapsed, modeled_time[stage])) + (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage])) assert not mismatches, "\n".join(str(s) for s in mismatches) -- GitLab From 408698d3cb3a7fd1a6055c829d51276c158a2ebb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Aug 2018 22:39:23 -0500 Subject: [PATCH 085/139] Use a k value as suggested by the fmmlib documentation to avoid unspecified scaling issues. 
--- test/test_target_specific_qbx.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 4fda27a9..3e9c7302 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -131,7 +131,7 @@ def test_spherical_hankel_functions(): @pytest.mark.parametrize("op", ["S", "D"]) -@pytest.mark.parametrize("helmholtz_k", [0, 1.2, 1.2j]) +@pytest.mark.parametrize("helmholtz_k", [0, 1.2, 12 + 1.2j]) @pytest.mark.parametrize("qbx_order", [0, 1, 5]) def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): logging.basicConfig(level=logging.INFO) @@ -191,13 +191,13 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs) bound_op = bind(qbx, expr) - pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k) + pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k).get() qbx = qbx.copy(_use_tsqbx=True) bound_op = bind(qbx, expr) - pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k) + pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k).get() - assert (np.max(np.abs(pot_ref.get() - pot_tsqbx.get()))) < 1e-13 + assert np.allclose(pot_tsqbx, pot_ref, atol=1e-13, rtol=1e-13) # You can test individual routines by typing -- GitLab From 1441cef2c946edb220e83eee80140ab317d8e203 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Aug 2018 22:40:25 -0500 Subject: [PATCH 086/139] Fix a naming issue: Don't use *dipstr* to refer to the monopole strengths. Use *charge* instead. 
--- pytential/qbx/fmmlib.py | 3 ++- pytential/qbx/target_specific/_internal.pyx | 17 +++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 83bc4fe9..42f8644c 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -617,7 +617,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): box_source_starts=self.tree.box_source_starts, box_source_counts_nonchild=self.tree.box_source_counts_nonchild, helmholtz_k=self.kernel_kwargs.get("zk", 0), - dipstr=src_weights, + charge=src_weights if self.dipole_vec is None else None, + dipstr=src_weights if self.dipole_vec is not None else None, dipvec=self.dipole_vec, pot=output) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index d44cbaff..d90d0412 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -398,6 +398,7 @@ def eval_target_specific_qbx_locals( int[:] source_box_starts, int[:] source_box_lists, int[:] box_source_starts, int[:] box_source_counts_nonchild, double complex helmholtz_k, + double[:] charge, double[:] dipstr, double[:,:] dipvec, double complex[:] pot): @@ -417,8 +418,9 @@ def eval_target_specific_qbx_locals( box_source_starts: "Start" indices for sources for each box box_source_counts_nonchild: Number of sources per box helmholtz_k: Helmholtz parameter (Pass 0 for Laplace) - dipstr: Source weights, shape (*nsrcs*,) - dipvec: Source gradient weights, shape (3, *nsrcs*), or *None* + charge: Source strengths, shape (*nsrcs*,) or *None* + dipstr: Dipole source strengths, shape (*nsrcs*,) or *None* + dipvec: Dipole source orientations, shape (3, *nsrcs*), or *None* pot: Output potential, shape (*ngts*,) """ @@ -434,8 +436,11 @@ def eval_target_specific_qbx_locals( double complex[:,:] grad_complex int laplace_slp, helmholtz_slp, laplace_dlp, helmholtz_dlp - if dipstr is None: - raise ValueError("must 
specify dipvec") + if charge is None and (dipstr is None or dipvec is None): + raise ValueError("must specify either charge, or both dipstr and dipvec") + + if charge is not None and (dipstr is not None or dipvec is not None): + raise ValueError("does not support simultaneous monopoles and dipoles") laplace_slp = (helmholtz_k == 0) and (dipvec is None) laplace_dlp = (helmholtz_k == 0) and (dipvec is not None) @@ -495,12 +500,12 @@ def eval_target_specific_qbx_locals( # doing an OpenMP reduction. if laplace_slp: - result = result + dipstr[isrc] * ( + result = result + charge[isrc] * ( tsqbx_laplace_slp(&source[tid, 0], ¢er[tid, 0], &target[tid, 0], order)) elif helmholtz_slp: - result = result + dipstr[isrc] * ( + result = result + charge[isrc] * ( tsqbx_helmholtz_slp(&source[tid, 0], ¢er[tid, 0], &target[tid, 0], order, helmholtz_k)) -- GitLab From 7dbf92117ebac73304cb6fcf0674c62ba0d8895f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Aug 2018 22:53:15 -0500 Subject: [PATCH 087/139] Fixes for TimingResult changes. 
--- examples/performance-3d.py | 2 +- examples/performance.py | 2 +- pytential/qbx/performance.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/performance-3d.py b/examples/performance-3d.py index 72ad59af..a045c658 100644 --- a/examples/performance-3d.py +++ b/examples/performance-3d.py @@ -167,7 +167,7 @@ def test_performance_model(ctx, perf_model): timing_result = {} for param in model_result: timing_result[param] = ( - sum(temp_timing_result[param].process_elapsed + sum(temp_timing_result[param]["process_elapsed"] for temp_timing_result in temp_timing_results)) / RUNS print("=" * 20) diff --git a/examples/performance.py b/examples/performance.py index cc0c01db..3e2dc61b 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ -148,7 +148,7 @@ def test_performance_model(ctx, perf_model): timing_result = {} for param in model_result: timing_result[param] = ( - sum(temp_timing_result[param].process_elapsed + sum(temp_timing_result[param]["process_elapsed"] for temp_timing_result in temp_timing_results)) / RUNS print("=" * 20) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index ee7416c3..65894a17 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -780,7 +780,7 @@ def estimate_calibration_params(model_results, timing_results): for param, time in timing_result.items(): calibration_param = ( _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) - actual_times[calibration_param][i] = time.process_elapsed + actual_times[calibration_param][i] = time["process_elapsed"] result = {} -- GitLab From 03cc44b0f5c2cf413ddf3f202dd9da3d67d1a341 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 19 Aug 2018 17:08:32 -0500 Subject: [PATCH 088/139] Fixes for TimingResult changes. 
--- examples/performance.py | 2 +- pytential/qbx/performance.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/performance.py b/examples/performance.py index cc0c01db..3e2dc61b 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ -148,7 +148,7 @@ def test_performance_model(ctx, perf_model): timing_result = {} for param in model_result: timing_result[param] = ( - sum(temp_timing_result[param].process_elapsed + sum(temp_timing_result[param]["process_elapsed"] for temp_timing_result in temp_timing_results)) / RUNS print("=" * 20) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 2e42203d..8c886f40 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -757,7 +757,7 @@ def estimate_calibration_params(model_results, timing_results): for param, time in timing_result.items(): calibration_param = ( _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) - actual_times[calibration_param][i] = time.process_elapsed + actual_times[calibration_param][i] = time["process_elapsed"] result = {} -- GitLab From 0515a39d8dd6a4b989a807ab947282b0612274ee Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:17:02 -0500 Subject: [PATCH 089/139] Move ToHostTransferredGeoDataWrapper to utils. 
--- pytential/qbx/fmmlib.py | 59 +-------------------------------------- pytential/qbx/utils.py | 62 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 58 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index dfd03b25..96268277 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -65,64 +65,6 @@ class QBXFMMLibExpansionWranglerCodeContainer(object): # }}} -# {{{ host geo data wrapper - -class ToHostTransferredGeoDataWrapper(object): - def __init__(self, queue, geo_data): - self.queue = queue - self.geo_data = geo_data - - @memoize_method - def tree(self): - return self.traversal().tree - - @memoize_method - def traversal(self): - return self.geo_data.traversal().get(queue=self.queue) - - @property - def ncenters(self): - return self.geo_data.ncenters - - @memoize_method - def centers(self): - return np.array([ - ci.get(queue=self.queue) - for ci in self.geo_data.centers()]) - - @memoize_method - def expansion_radii(self): - return self.geo_data.expansion_radii().get(queue=self.queue) - - @memoize_method - def global_qbx_centers(self): - return self.geo_data.global_qbx_centers().get(queue=self.queue) - - @memoize_method - def qbx_center_to_target_box(self): - return self.geo_data.qbx_center_to_target_box().get(queue=self.queue) - - @memoize_method - def qbx_center_to_target_box_source_level(self, source_level): - return self.geo_data.qbx_center_to_target_box_source_level( - source_level).get(queue=self.queue) - - @memoize_method - def non_qbx_box_target_lists(self): - return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue) - - @memoize_method - def center_to_tree_targets(self): - return self.geo_data.center_to_tree_targets().get(queue=self.queue) - - @memoize_method - def all_targets(self): - """All (not just non-QBX) targets packaged into a single array.""" - return np.array(list(self.tree().targets)) - -# }}} - - # {{{ fmmlib expansion wrangler class 
QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @@ -136,6 +78,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # FMMLib is CPU-only. This wrapper gets the geometry out of # OpenCL-land. + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper self.geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) self.qbx_order = qbx_order diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index b0ffc066..ed9e67bc 100644 --- a/pytential/qbx/utils.py +++ b/pytential/qbx/utils.py @@ -601,4 +601,66 @@ def build_tree_with_qbx_metadata( # }}} + +# {{{ host geo data wrapper + +class ToHostTransferredGeoDataWrapper(object): + """Wraps an instance of :class:`pytential.qbx.geometry.QBXFMMGeometryData`, + automatically converting returned OpenCL arrays to host data. + """ + + def __init__(self, queue, geo_data): + self.queue = queue + self.geo_data = geo_data + + @memoize_method + def tree(self): + return self.geo_data.tree().get(queue=self.queue) + + @memoize_method + def traversal(self): + return self.geo_data.traversal().get(queue=self.queue) + + @property + def ncenters(self): + return self.geo_data.ncenters + + @memoize_method + def centers(self): + return np.array([ + ci.get(queue=self.queue) + for ci in self.geo_data.centers()]) + + @memoize_method + def expansion_radii(self): + return self.geo_data.expansion_radii().get(queue=self.queue) + + @memoize_method + def global_qbx_centers(self): + return self.geo_data.global_qbx_centers().get(queue=self.queue) + + @memoize_method + def qbx_center_to_target_box(self): + return self.geo_data.qbx_center_to_target_box().get(queue=self.queue) + + @memoize_method + def qbx_center_to_target_box_source_level(self, source_level): + return self.geo_data.qbx_center_to_target_box_source_level( + source_level).get(queue=self.queue) + + @memoize_method + def non_qbx_box_target_lists(self): + return self.geo_data.non_qbx_box_target_lists().get(queue=self.queue) + + @memoize_method + def 
center_to_tree_targets(self): + return self.geo_data.center_to_tree_targets().get(queue=self.queue) + + @memoize_method + def all_targets(self): + """All (not just non-QBX) targets packaged into a single array.""" + return np.array(list(self.tree().targets)) + +# }}} + # vim: foldmethod=marker:filetype=pyopencl -- GitLab From 2b2b631f31442f3cfd50c3462e056753aa177c6e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:17:24 -0500 Subject: [PATCH 090/139] Test off-surface eval. --- test/test_performance_model.py | 106 ++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index f7517a72..8897fecc 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -63,7 +63,7 @@ def get_lpot_source(queue, dim): from meshmode.mesh.generation import generate_torus mesh = generate_torus(2, 1, order=target_order) else: - raise ValueError("unknown dimension: %d" % dim) + raise ValueError("unsupported dimension: %d" % dim) pre_density_discr = Discretization( queue.context, mesh, @@ -90,15 +90,6 @@ def get_density(queue, lpot_source): nodes = density_discr.nodes().with_queue(queue) return cl.clmath.sin(10 * nodes[0]) - -def get_bound_slp_op(lpot_source): - from sumpy.kernel import LaplaceKernel - sigma_sym = sym.var("sigma") - k_sym = LaplaceKernel(lpot_source.ambient_dim) - - sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - return bind(lpot_source, sym_op_S) - # }}} @@ -113,7 +104,13 @@ def test_timing_data_gathering(ctx_getter): lpot_source = get_lpot_source(queue, 2) sigma = get_density(queue, lpot_source) - op_S = get_bound_slp_op(lpot_source) + + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + op_S = bind(lpot_source, sym_op_S) timing_data = {} op_S.eval(queue, dict(sigma=sigma), 
timing_data=timing_data) @@ -157,27 +154,20 @@ def test_performance_model(ctx_getter, dim): class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): def __init__(self, queue, geo_data): - host_tree = geo_data.tree().get(queue) - ConstantOneExpansionWrangler.__init__(self, host_tree) + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper + geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) self.geo_data = geo_data + self.trav = geo_data.traversal() - self.qbx_center_to_target_box = ( - geo_data.qbx_center_to_target_box().get(queue)) - self.qbx_center_to_target_box_source_level = [ - geo_data.qbx_center_to_target_box_source_level(lev).get(queue) - for lev in range(host_tree.nlevels)] - self.global_qbx_centers = geo_data.global_qbx_centers().get(queue) - self.trav = geo_data.traversal().get(queue) - self.center_to_tree_targets = geo_data.center_to_tree_targets().get(queue) - self.non_qbx_box_target_lists = ( - geo_data.non_qbx_box_target_lists().get(queue)) + ConstantOneExpansionWrangler.__init__(self, geo_data.tree()) def _get_target_slice(self, ibox): - pstart = self.non_qbx_box_target_lists.box_target_starts[ibox] + non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists() + pstart = non_qbx_box_target_lists.box_target_starts[ibox] return slice( pstart, pstart - + self.non_qbx_box_target_lists.box_target_counts_nonchild[ibox]) + + non_qbx_box_target_lists.box_target_counts_nonchild[ibox]) def output_zeros(self): non_qbx_box_target_lists = self.geo_data.non_qbx_box_target_lists() @@ -198,8 +188,11 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): local_exps = self.qbx_local_expansion_zeros() ops = 0 - for itgt_center, tgt_icenter in enumerate(self.global_qbx_centers): - itgt_box = self.qbx_center_to_target_box[tgt_icenter] + global_qbx_centers = self.geo_data.global_qbx_centers() + qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box() + + for itgt_center, tgt_icenter in 
enumerate(global_qbx_centers): + itgt_box = qbx_center_to_target_box[tgt_icenter] start, end = ( self.trav.neighbor_source_boxes_starts[itgt_box:itgt_box + 2]) @@ -217,10 +210,15 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): def translate_box_multipoles_to_qbx_local(self, multipole_exps): local_exps = self.qbx_local_expansion_zeros() ops = 0 + + global_qbx_centers = self.geo_data.global_qbx_centers() + for isrc_level, ssn in enumerate(self.trav.from_sep_smaller_by_level): - for tgt_icenter in self.global_qbx_centers: - icontaining_tgt_box = self.qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] + for tgt_icenter in global_qbx_centers: + icontaining_tgt_box = ( + self.geo_data + .qbx_center_to_target_box_source_level(isrc_level) + [tgt_icenter]) if icontaining_tgt_box == -1: continue @@ -239,8 +237,11 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): qbx_expansions = self.qbx_local_expansion_zeros() ops = 0 - for tgt_icenter in self.global_qbx_centers: - isrc_box = self.qbx_center_to_target_box[tgt_icenter] + global_qbx_centers = self.geo_data.global_qbx_centers() + qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box() + + for tgt_icenter in global_qbx_centers: + isrc_box = qbx_center_to_target_box[tgt_icenter] src_ibox = self.trav.target_boxes[isrc_box] qbx_expansions[tgt_icenter] += local_exps[src_ibox] ops += 1 @@ -250,11 +251,15 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): def eval_qbx_expansions(self, qbx_expansions): output = self.full_output_zeros() ops = 0 - for src_icenter in self.global_qbx_centers: + + global_qbx_centers = self.geo_data.global_qbx_centers() + center_to_tree_targets = self.geo_data.center_to_tree_targets() + + for src_icenter in global_qbx_centers: start, end = ( - self.center_to_tree_targets.starts[src_icenter:src_icenter+2]) + center_to_tree_targets.starts[src_icenter:src_icenter+2]) for icenter_tgt in range(start, end): - center_itgt 
= self.center_to_tree_targets.lists[icenter_tgt] + center_itgt = center_to_tree_targets.lists[icenter_tgt] output[0][center_itgt] += qbx_expansions[src_icenter] ops += 1 @@ -284,7 +289,8 @@ CONSTANT_ONE_PARAMS = dict( @pytest.mark.parametrize("dim", (2, 3)) -def test_performance_model_correctness(ctx_getter, dim): +@pytest.mark.parametrize("off_surface", (True, False)) +def test_performance_model_correctness(ctx_getter, dim, off_surface): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) @@ -297,22 +303,42 @@ def test_performance_model_correctness(ctx_getter, dim): lpot_source = get_lpot_source(queue, dim).copy( performance_model=PerformanceModel(uses_pde_expansions=False)) + # Construct targets. + if off_surface: + from pytential.target import PointsTarget + from boxtree.tools import make_uniform_particle_array + ntargets = 10 ** 3 + targets = PointsTarget( + make_uniform_particle_array(queue, ntargets, dim, np.float)) + target_discrs_and_qbx_sides = ((targets, 0),) + qbx_forced_limit = None + else: + targets = lpot_source.density_discr + target_discrs_and_qbx_sides = ((targets, 1),) + qbx_forced_limit = 1 + + # Construct bound op, run performance model. + from sumpy.kernel import LaplaceKernel + sigma_sym = sym.var("sigma") + k_sym = LaplaceKernel(lpot_source.ambient_dim) + sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=qbx_forced_limit) + + op_S = bind((lpot_source, targets), sym_op_S) sigma = get_density(queue, lpot_source) - op_S = get_bound_slp_op(lpot_source) from pytools import one perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values()) # Set all parameters equal to 1, to obtain raw op counts. perf_S = perf_S.with_params(CONSTANT_ONE_PARAMS) + # Run FMM with ConstantOneWrangler. This can't be done with pytential's + # high-level interface, so call the FMM driver directly. 
from pytential.qbx.fmm import drive_fmm geo_data = lpot_source.qbx_fmm_geometry_data( - target_discrs_and_qbx_sides=((lpot_source.density_discr, +1),)) + target_discrs_and_qbx_sides=target_discrs_and_qbx_sides) wrangler = ConstantOneQBXExpansionWrangler(queue, geo_data) - nnodes = lpot_source.quad_stage2_density_discr.nnodes - src_weights = np.ones(nnodes) timing_data = {} -- GitLab From 6e7e134bee903a59fa3670d65a7480daa4fd5c98 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:20:37 -0500 Subject: [PATCH 091/139] Point requirements.txt back to boxtree master --- .test-conda-env-py3-requirements.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.test-conda-env-py3-requirements.txt b/.test-conda-env-py3-requirements.txt index aa1f7e42..fa6c0426 100644 --- a/.test-conda-env-py3-requirements.txt +++ b/.test-conda-env-py3-requirements.txt @@ -1,4 +1,4 @@ -git+https://gitlab.tiker.net/inducer/boxtree@move-constant-one-wrangler-to-tools +git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/pymbolic git+https://github.com/inducer/loopy git+https://gitlab.tiker.net/inducer/sumpy diff --git a/requirements.txt b/requirements.txt index 9e58527f..6d1e4cce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/inducer/modepy git+https://github.com/inducer/pyopencl git+https://github.com/inducer/islpy git+https://github.com/inducer/loopy -git+https://gitlab.tiker.net/inducer/boxtree@move-constant-one-wrangler-to-tools +git+https://gitlab.tiker.net/inducer/boxtree git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy git+https://github.com/inducer/pyfmmlib -- GitLab From 196d50c27f4bd922756d0eb0cdbb59246dcb1e8f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:24:31 -0500 Subject: [PATCH 092/139] flake8 fix --- pytential/qbx/fmmlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 96268277..d8211a68 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -23,7 +23,7 @@ THE SOFTWARE. """ import numpy as np -from pytools import memoize_method, Record +from pytools import Record import pyopencl as cl # noqa import pyopencl.array # noqa: F401 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler -- GitLab From bf49b57435b309daa62bbe01932c2384a300c7f9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:36:57 -0500 Subject: [PATCH 093/139] Fix, simplify is_supported_helmknl and is_supported_helmknl_for_tsqbx --- pytential/qbx/fmmlib.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 42f8644c..243ed5de 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -231,17 +231,21 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad=ifgrad) + @staticmethod def is_supported_helmknl_for_tsqbx(self, knl): - return self.is_supported_helmknl(knl) + if isinstance(knl, DirectionalSourceDerivative): + knl = knl.inner_kernel + + return (isinstance(knl, (LaplaceKernel, HelmholtzKernel)) + and knl.dim == 3) @staticmethod def is_supported_helmknl(knl): if isinstance(knl, DirectionalSourceDerivative): knl = knl.inner_kernel - return ( - isinstance(knl, HelmholtzKernel) and knl.dim in [2, 3] - or isinstance(knl, LaplaceKernel) and knl.dim in [2, 3]) + return (isinstance(knl, (LaplaceKernel, HelmholtzKernel)) + and knl.dim in (2, 3)) # {{{ data vector helpers -- GitLab From b940b03e1943f7458bd08d8d69637b9828cf6f29 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 20 Aug 2018 15:37:44 -0500 Subject: [PATCH 094/139] Remove self argument to staticmethod --- pytential/qbx/fmmlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 243ed5de..0cf0d7dc 100644 --- a/pytential/qbx/fmmlib.py +++ 
b/pytential/qbx/fmmlib.py @@ -232,7 +232,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad=ifgrad) @staticmethod - def is_supported_helmknl_for_tsqbx(self, knl): + def is_supported_helmknl_for_tsqbx(knl): if isinstance(knl, DirectionalSourceDerivative): knl = knl.inner_kernel -- GitLab From ed2aa5e6db23cc5e2e49e3952bad4af23ebae46a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 21 Aug 2018 18:26:33 -0500 Subject: [PATCH 095/139] Import memoize_method --- pytential/qbx/fmmlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 4ebbc48b..cdbecf69 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -23,7 +23,7 @@ THE SOFTWARE. """ import numpy as np -from pytools import Record +from pytools import Record, memoize_method import pyopencl as cl # noqa import pyopencl.array # noqa: F401 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler -- GitLab From 53d31cb9f949939f2e4bb58314afff0cccd4cccf Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 21 Aug 2018 18:49:48 -0500 Subject: [PATCH 096/139] Verify TSQBX in test_performance_model_correctness() --- test/test_performance_model.py | 64 +++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 8897fecc..2c54dd88 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -153,12 +153,13 @@ def test_performance_model(ctx_getter, dim): class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): - def __init__(self, queue, geo_data): + def __init__(self, queue, geo_data, use_target_specific_qbx): from pytential.qbx.utils import ToHostTransferredGeoDataWrapper geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) self.geo_data = geo_data self.trav = geo_data.traversal() + self.use_target_specific_qbx = use_target_specific_qbx 
ConstantOneExpansionWrangler.__init__(self, geo_data.tree()) @@ -188,6 +189,9 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): local_exps = self.qbx_local_expansion_zeros() ops = 0 + if self.use_target_specific_qbx: + return local_exps, self.timing_future(ops) + global_qbx_centers = self.geo_data.global_qbx_centers() qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box() @@ -265,6 +269,40 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): return output, self.timing_future(ops) + def eval_target_specific_qbx_locals(self, src_weights): + pot = self.full_output_zeros() + ops = 0 + + if not self.use_target_specific_qbx: + return pot, self.timing_future(ops) + + global_qbx_centers = self.geo_data.global_qbx_centers() + center_to_tree_targets = self.geo_data.center_to_tree_targets() + qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box() + + for ictr in global_qbx_centers: + tgt_ibox = qbx_center_to_target_box[ictr] + + ictr_tgt_start, ictr_tgt_end = center_to_tree_targets.starts[ictr:ictr+2] + + for ictr_tgt in range(ictr_tgt_start, ictr_tgt_end): + ctr_itgt = center_to_tree_targets.lists[ictr_tgt] + + isrc_box_start, isrc_box_end = ( + self.trav.neighbor_source_boxes_starts[tgt_ibox:tgt_ibox+2]) + + for isrc_box in range(isrc_box_start, isrc_box_end): + src_ibox = self.trav.neighbor_source_boxes_lists[isrc_box] + + isrc_start = self.tree.box_source_starts[src_ibox] + isrc_end = (isrc_start + + self.tree.box_source_counts_nonchild[src_ibox]) + + pot[0][ctr_itgt] += sum(src_weights[isrc_start:isrc_end]) + ops += isrc_end - isrc_start + + return pot, self.timing_future(ops) + # }}} @@ -285,12 +323,19 @@ CONSTANT_ONE_PARAMS = dict( c_p2p=1, c_p2qbxl=1, c_qbxl2p=1, + c_p2p_tsqbx=1, ) -@pytest.mark.parametrize("dim", (2, 3)) -@pytest.mark.parametrize("off_surface", (True, False)) -def test_performance_model_correctness(ctx_getter, dim, off_surface): +@pytest.mark.parametrize("dim, off_surface, 
use_target_specific_qbx", ( + (2, False, False), + (2, True, False), + (3, False, False), + (3, False, True), + (3, True, False), + (3, True, True))) +def test_performance_model_correctness(ctx_getter, dim, off_surface, + use_target_specific_qbx): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) @@ -301,7 +346,8 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface): # parameters to equal 1 (done below), this provides a straightforward way # to obtain the raw operation count for each FMM stage. lpot_source = get_lpot_source(queue, dim).copy( - performance_model=PerformanceModel(uses_pde_expansions=False)) + performance_model=PerformanceModel(uses_pde_expansions=False), + _use_tsqbx=use_target_specific_qbx) # Construct targets. if off_surface: @@ -337,13 +383,15 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface): geo_data = lpot_source.qbx_fmm_geometry_data( target_discrs_and_qbx_sides=target_discrs_and_qbx_sides) - wrangler = ConstantOneQBXExpansionWrangler(queue, geo_data) + wrangler = ConstantOneQBXExpansionWrangler( + queue, geo_data, use_target_specific_qbx) nnodes = lpot_source.quad_stage2_density_discr.nnodes src_weights = np.ones(nnodes) timing_data = {} - potential = drive_fmm(wrangler, src_weights, timing_data, - traversal=wrangler.trav)[0][geo_data.ncenters:] + potential = drive_fmm( + wrangler, src_weights, timing_data, traversal=wrangler.trav, + use_tsqbx=use_target_specific_qbx)[0][geo_data.ncenters:] # Check constant one wrangler for correctness. assert (potential == nnodes).all() -- GitLab From 6c08a137ea566fe087f67a91d94e0c38c81ab917 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 22 Aug 2018 00:02:17 -0500 Subject: [PATCH 097/139] Algebraically simplify TSQBX expression for Laplace DLP. 
--- pytential/qbx/target_specific/_internal.pyx | 25 ++++++++++----------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index d90d0412..8733f142 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -156,11 +156,9 @@ cdef void tsqbx_laplace_dlp( int order) nogil: cdef: int j, m - double sc_d, tc_d, cos_angle, alpha, R - double[BUFSIZE] tmp - double[BUFSIZE] derivs - double[3] cms - double[3] tmc + double sc_d, tc_d, cos_angle, alpha, Rj + double[BUFSIZE] lvals, lderivs + double[3] cms, tmc, grad_tmp for m in range(3): cms[m] = center[m] - source[m] @@ -178,20 +176,21 @@ cdef void tsqbx_laplace_dlp( cos_angle = alpha / (tc_d * sc_d) # Evaluate the Legendre terms. - legvals(cos_angle, order, tmp, derivs) + legvals(cos_angle, order, lvals, lderivs) - R = 1 / sc_d + # Invariant: Rj = (t_cd ** j / sc_d ** (j + 2)) + Rj = 1 / (sc_d * sc_d) for j in range(0, order + 1): - # Invariant: R = (t_cd ** j / sc_d ** (j + 1)) for m in range(3): - grad[m] += (j + 1) * cms[m] / (sc_d * sc_d) * R * tmp[j] + grad_tmp[m] = (j + 1) * (cms[m] / sc_d) * lvals[j] for m in range(3): # Siegel and Tornberg has a sign flip here :( - grad[m] += ( - tmc[m] / (tc_d * sc_d) + - alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[j] - R *= (tc_d / sc_d) + grad_tmp[m] += (tmc[m] / tc_d + cos_angle * cms[m] / sc_d) * lderivs[j] + for m in range(3): + grad[m] += Rj * grad_tmp[m] + + Rj *= (tc_d / sc_d) return -- GitLab From ac8661e3de59fbf6bfb985154874e641f8bed80d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 22 Aug 2018 00:14:46 -0500 Subject: [PATCH 098/139] Whitespace fix --- pytential/qbx/target_specific/_internal.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 8733f142..66fcf0fe 100644 --- 
a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -178,7 +178,7 @@ cdef void tsqbx_laplace_dlp( # Evaluate the Legendre terms. legvals(cos_angle, order, lvals, lderivs) - # Invariant: Rj = (t_cd ** j / sc_d ** (j + 2)) + # Invariant: Rj = (t_cd ** j / sc_d ** (j + 2)) Rj = 1 / (sc_d * sc_d) for j in range(0, order + 1): -- GitLab From 159797471e0b6eed94c9bcd66fe78131cedbac8f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 22 Aug 2018 11:39:21 -0500 Subject: [PATCH 099/139] Algebraically simplify Helmholtz DLP TSQBX --- pytential/qbx/target_specific/_internal.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 66fcf0fe..cc003840 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -266,13 +266,13 @@ cdef void tsqbx_helmholtz_dlp( # for n in range(0, order + 1): for m in range(3): - grad_tmp[m] = -hderivs[n] * k * cms[m] * lvals[n] / sc_d + grad_tmp[m] = -k * cms[m] * hderivs[n] * lvals[n] for m in range(3): - grad_tmp[m] += hvals[n] * ( - tmc[m] / (tc_d * sc_d) + - alpha * cms[m] / (tc_d * sc_d * sc_d * sc_d)) * lderivs[n] + grad_tmp[m] += ( + (tmc[m] / tc_d) + + cos_angle * (cms[m] / sc_d)) * hvals[n] * lderivs[n] for m in range(3): - grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n]) + grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n] / sc_d) unscale *= jscale / hscale for m in range(3): -- GitLab From ef5e47cf9609ca45e2536948dc69ddedc4c6f651 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 11:56:14 -0500 Subject: [PATCH 100/139] Fix misuse of NotImplementedError --- pytential/qbx/fmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 8eb4a28a..ded7dfc7 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -125,7 +125,7 @@ 
QBXFMMGeometryData.non_qbx_box_target_lists`), source_extra_kwargs, kernel_extra_kwargs, _use_target_specific_qbx=False): if _use_target_specific_qbx: - raise NotImplementedError("Cannot use TSQBX with sumpy yet") + raise ValueError("TSQBX is not implemented in sumpy") SumpyExpansionWrangler.__init__(self, code_container, queue, geo_data.tree(), -- GitLab From cdf3b372fd7c11d12b2ba2d36a0d5fe5238264cf Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 11:59:53 -0500 Subject: [PATCH 101/139] Remove example for being too slow. --- examples/performance-3d.py | 189 ------------------------------------- 1 file changed, 189 deletions(-) delete mode 100644 examples/performance-3d.py diff --git a/examples/performance-3d.py b/examples/performance-3d.py deleted file mode 100644 index a045c658..00000000 --- a/examples/performance-3d.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Trains a performance model and reports on the accuracy.""" - -import pyopencl as cl -import numpy as np # noqa - -from pytential import sym, bind -from pytools import one - - -# {{{ global params - -TARGET_ORDER = 8 -OVSMP_FACTOR = 5 -TCF = 0.9 -QBX_ORDER = 5 -FMM_ORDER = 10 -MESH_TOL = 1e-10 -FORCE_STAGE2_UNIFORM_REFINEMENT_ROUNDS = 1 -SCALED_MAX_CURVATURE_THRESHOLD = 0.8 -MAX_LEAF_REFINE_WEIGHT = 512 -RUNS = 3 - -DEFAULT_LPOT_KWARGS = { - "_box_extent_norm": "l2", - "_from_sep_smaller_crit": "static_l2", - } - -TRAINING_ARMS = (2, 3, 6) -TESTING_ARMS = (5,) - - -def urchin_lpot_source(queue, sph_harm_tuple, - from_sep_smaller_threshold=None, use_tsqbx=True): - from meshmode.discretization import Discretization - from meshmode.discretization.poly_element import ( - InterpolatoryQuadratureSimplexGroupFactory) - - target_order = TARGET_ORDER - - sph_m, sph_n = sph_harm_tuple - - from meshmode.mesh.generation import generate_urchin as get_urchin - mesh = get_urchin( - order=target_order, m=sph_m, n=sph_n, - est_rel_interp_tolerance=MESH_TOL) - - pre_density_discr = Discretization( - queue.context, 
mesh, - InterpolatoryQuadratureSimplexGroupFactory(target_order)) - - refiner_extra_kwargs = { - #"visualize": True, - "_force_stage2_uniform_refinement_rounds": ( - FORCE_STAGE2_UNIFORM_REFINEMENT_ROUNDS), - "_scaled_max_curvature_threshold": ( - SCALED_MAX_CURVATURE_THRESHOLD), - } - - lpot_kwargs = DEFAULT_LPOT_KWARGS.copy() - lpot_kwargs.update( - fmm_backend="fmmlib", - _well_sep_is_n_away=2, - _expansions_in_tree_have_extent=True, - _expansion_stick_out_factor=TCF, - _max_leaf_refine_weight=MAX_LEAF_REFINE_WEIGHT, - target_association_tolerance=1e-3, - fmm_order=FMM_ORDER, qbx_order=QBX_ORDER, - _from_sep_smaller_min_nsources_cumul=from_sep_smaller_threshold, - _use_tsqbx=use_tsqbx, - ) - - from pytential.qbx import QBXLayerPotentialSource - lpot_source = QBXLayerPotentialSource( - pre_density_discr, OVSMP_FACTOR*target_order, - **lpot_kwargs,) - - lpot_source, _ = lpot_source.with_refinement(**refiner_extra_kwargs) - - return lpot_source - -# }}} - - -def training_geometries(queue): - for n_arms in TRAINING_ARMS: - yield urchin_lpot_source(queue, (n_arms // 2, n_arms), 100) - - -def test_geometries(queue): - for n_arms in TESTING_ARMS: - yield urchin_lpot_source(queue, (n_arms // 2, n_arms), 100) - - -def get_bound_op(lpot_source): - from sumpy.kernel import LaplaceKernel - sigma_sym = sym.var("sigma") - k_sym = LaplaceKernel(lpot_source.ambient_dim) - op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - - return bind(lpot_source, op) - - -def get_test_density(queue, lpot_source): - density_discr = lpot_source.density_discr - nodes = density_discr.nodes().with_queue(queue) - sigma = cl.clmath.sin(10 * nodes[0]) - - return sigma - - -def train_performance_model(ctx): - queue = cl.CommandQueue(ctx) - - from pytential.qbx.performance import ( - PerformanceModel, estimate_calibration_params) - - perf_model = PerformanceModel() - - model_results = [] - timing_results = [] - - for lpot_source in training_geometries(queue): - lpot_source = 
lpot_source.copy(performance_model=perf_model) - bound_op = get_bound_op(lpot_source) - sigma = get_test_density(queue, lpot_source) - - perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) - - # Warm-up run. - bound_op.eval(queue, {"sigma": sigma}) - - for _ in range(RUNS): - timing_data = {} - bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) - - model_results.append(one(perf_S.values())) - timing_results.append(one(timing_data.values())) - - calibration_params = ( - estimate_calibration_params(model_results, timing_results)) - - return perf_model.with_calibration_params(calibration_params) - - -def test_performance_model(ctx, perf_model): - queue = cl.CommandQueue(ctx) - - for lpot_source in test_geometries(queue): - lpot_source = lpot_source.copy(performance_model=perf_model) - bound_op = get_bound_op(lpot_source) - sigma = get_test_density(queue, lpot_source) - - perf_S = bound_op.get_modeled_performance(queue, sigma=sigma) - model_result = ( - one(perf_S.values()) - .get_predicted_times(merge_close_lists=True)) - - # Warm-up run. - bound_op.eval(queue, {"sigma": sigma}) - - temp_timing_results = [] - for _ in range(RUNS): - timing_data = {} - bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data) - temp_timing_results.append(one(timing_data.values())) - - timing_result = {} - for param in model_result: - timing_result[param] = ( - sum(temp_timing_result[param]["process_elapsed"] - for temp_timing_result in temp_timing_results)) / RUNS - - print("=" * 20) - for stage in model_result: - print("stage: ", stage) - print("actual: ", timing_result[stage]) - print("predicted: ", model_result[stage]) - print("=" * 20) - - -def predict_performance(ctx): - model = train_performance_model(ctx) - test_performance_model(ctx, model) - - -if __name__ == "__main__": - if 0: - # Disabled - this is slow. 
- predict_performance(cl.create_some_context(0)) -- GitLab From feaf18c055d320dd11b133387645c2d7896bc419 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 12:03:45 -0500 Subject: [PATCH 102/139] _use_tsqbx -> _use_target_specific_qbx --- pytential/qbx/__init__.py | 16 ++++++++-------- pytential/qbx/fmm.py | 4 +++- pytential/qbx/performance.py | 2 +- test/test_performance_model.py | 4 ++-- test/test_target_specific_qbx.py | 3 ++- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 1b87fc5c..86fa930d 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -83,7 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", - _use_tsqbx=False, + _use_target_specific_qbx=False, geometry_data_inspector=None, performance_model=None, fmm_backend="sumpy", @@ -204,7 +204,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._from_sep_smaller_min_nsources_cumul = \ _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind - self._use_tsqbx = _use_tsqbx + self._use_target_specific_qbx = _use_target_specific_qbx self.geometry_data_inspector = geometry_data_inspector self.performance_model = performance_model @@ -228,7 +228,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, - _use_tsqbx=_not_provided, + _use_target_specific_qbx=_not_provided, geometry_data_inspector=None, performance_model=_not_provided, fmm_backend=None, @@ -312,8 +312,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=( self._from_sep_smaller_min_nsources_cumul), _tree_kind=_tree_kind or self._tree_kind, - _use_tsqbx=(_use_tsqbx - if _use_tsqbx is not _not_provided else self._use_tsqbx), + _use_target_specific_qbx=(_use_target_specific_qbx + if 
_use_target_specific_qbx is not _not_provided + else self._use_target_specific_qbx), geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), performance_model=( @@ -821,7 +822,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.fmm_level_to_order, source_extra_kwargs=source_extra_kwargs, kernel_extra_kwargs=kernel_extra_kwargs, - _use_target_specific_qbx=self._use_tsqbx) + _use_target_specific_qbx=self._use_target_specific_qbx) from pytential.qbx.geometry import target_state if (geo_data.user_target_to_center().with_queue(queue) @@ -841,8 +842,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): from pytential.qbx.fmm import drive_fmm timing_data = {} - all_potentials_on_every_target = drive_fmm( - wrangler, strengths, timing_data, use_tsqbx=self._use_tsqbx) + all_potentials_on_every_target = drive_fmm(wrangler, strengths, timing_data) # }}} diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index ded7dfc7..366eaa23 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -386,7 +386,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level def drive_fmm(expansion_wrangler, src_weights, timing_data=None, - traversal=None, use_tsqbx=False): + traversal=None): """Top-level driver routine for the QBX fast multipole calculation. :arg geo_data: A :class:`QBXFMMGeometryData` instance. 
@@ -405,6 +405,8 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None, geo_data = wrangler.geo_data + use_tsqbx = geo_data.lpot_source._use_target_specific_qbx + if traversal is None: traversal = geo_data.traversal() diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 691af8a2..62940a86 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -583,7 +583,7 @@ class PerformanceModel(object): lpot_source = geo_data.lpot_source nqbtl = geo_data.non_qbx_box_target_lists() - use_tsqbx = lpot_source._use_tsqbx + use_tsqbx = lpot_source._use_target_specific_qbx with cl.CommandQueue(geo_data.cl_context) as queue: tree = geo_data.tree().get(queue=queue) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 2c54dd88..39a378e3 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -347,7 +347,7 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, # to obtain the raw operation count for each FMM stage. lpot_source = get_lpot_source(queue, dim).copy( performance_model=PerformanceModel(uses_pde_expansions=False), - _use_tsqbx=use_target_specific_qbx) + _use_target_specific_qbx=use_target_specific_qbx) # Construct targets. if off_surface: @@ -391,7 +391,7 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, timing_data = {} potential = drive_fmm( wrangler, src_weights, timing_data, traversal=wrangler.trav, - use_tsqbx=use_target_specific_qbx)[0][geo_data.ncenters:] + _use_target_specific_qbx=use_target_specific_qbx)[0][geo_data.ncenters:] # Check constant one wrangler for correctness. 
assert (potential == nnodes).all() diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 3e9c7302..dd71676a 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -167,6 +167,7 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): fmm_backend="fmmlib", _expansions_in_tree_have_extent=True, _expansion_stick_out_factor=0.9, + _use_target_specific_qbx=False, ).with_refinement(**refiner_extra_kwargs) density_discr = qbx.density_discr @@ -193,7 +194,7 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): bound_op = bind(qbx, expr) pot_ref = bound_op(queue, u=u_dev, k=helmholtz_k).get() - qbx = qbx.copy(_use_tsqbx=True) + qbx = qbx.copy(_use_target_specific_qbx=True) bound_op = bind(qbx, expr) pot_tsqbx = bound_op(queue, u=u_dev, k=helmholtz_k).get() -- GitLab From 037e9bde4de99203bf5981ae9eec46f453fe0a5d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 12:10:07 -0500 Subject: [PATCH 103/139] Rename wrangler function --- pytential/qbx/fmm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 366eaa23..2e6f476a 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -374,9 +374,8 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), return (pot, SumpyTimingFuture(self.queue, [evt])) @log_process(logger) - def eval_target_specific_global_qbx_locals(self, src_weights): - # Not implemented - pass + def eval_target_specific_qbx_locals(self, src_weights): + raise NotImplementedError() # }}} -- GitLab From 9d6b89abf7d18c2d2e55247a455d91528585d595 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 12:31:33 -0500 Subject: [PATCH 104/139] Add lpot_source to ToHostTransferredGeoDataWrapper --- pytential/qbx/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index ed9e67bc..ca8b08df 100644 --- a/pytential/qbx/utils.py +++ 
b/pytential/qbx/utils.py @@ -621,6 +621,10 @@ class ToHostTransferredGeoDataWrapper(object): def traversal(self): return self.geo_data.traversal().get(queue=self.queue) + @property + def lpot_source(self): + return self.geo_data.lpot_source + @property def ncenters(self): return self.geo_data.ncenters -- GitLab From 6c3aedee20cd05b715d3f09fb6e07558fbf9f7a8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 13:32:53 -0500 Subject: [PATCH 105/139] Fix drive_fmm invocation --- test/test_performance_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 39a378e3..7d027221 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -389,9 +389,8 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, src_weights = np.ones(nnodes) timing_data = {} - potential = drive_fmm( - wrangler, src_weights, timing_data, traversal=wrangler.trav, - _use_target_specific_qbx=use_target_specific_qbx)[0][geo_data.ncenters:] + potential = drive_fmm(wrangler, src_weights, timing_data, + traversal=wrangler.trav)[0][geo_data.ncenters:] # Check constant one wrangler for correctness. assert (potential == nnodes).all() -- GitLab From d8f604d00eb2cc2fabe5d40714c4ca405a10d936 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 24 Aug 2018 13:33:00 -0500 Subject: [PATCH 106/139] Fix fmmlib kernel logic again. 
--- pytential/qbx/fmmlib.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index cdbecf69..b2d7ace5 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -105,17 +105,21 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): if self.is_supported_helmknl(out_knl): outputs.append(()) + no_target_deriv_knl = out_knl + elif (isinstance(out_knl, AxisTargetDerivative) and self.is_supported_helmknl(out_knl.inner_kernel)): outputs.append((out_knl.axis,)) ifgrad = True + no_target_deriv_knl = out_knl.inner_kernel + else: raise ValueError( "only the 2/3D Laplace and Helmholtz kernel " "and their derivatives are supported") - source_deriv_names.append(out_knl.dir_vec_name - if isinstance(out_knl, DirectionalSourceDerivative) + source_deriv_names.append(no_target_deriv_knl.dir_vec_name + if isinstance(no_target_deriv_knl, DirectionalSourceDerivative) else None) base_knl = out_knl.get_base_kernel() -- GitLab From 31ded3267d2199a69100cbe4176de26ccab384ee Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 25 Aug 2018 18:41:12 -0500 Subject: [PATCH 107/139] Pass source strengths as complex array. 
--- pytential/qbx/fmmlib.py | 2 ++ pytential/qbx/target_specific/_internal.pyx | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index b2d7ace5..3318e666 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -553,6 +553,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ctt = geo_data.center_to_tree_targets() + src_weights = src_weights.astype(np.complex128) + for output in pot: ts.eval_target_specific_qbx_locals( order=self.qbx_order, diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index cc003840..ab258f8c 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -397,8 +397,8 @@ def eval_target_specific_qbx_locals( int[:] source_box_starts, int[:] source_box_lists, int[:] box_source_starts, int[:] box_source_counts_nonchild, double complex helmholtz_k, - double[:] charge, - double[:] dipstr, + double complex[:] charge, + double complex[:] dipstr, double[:,:] dipvec, double complex[:] pot): """TSQBX entry point. 
@@ -417,10 +417,10 @@ def eval_target_specific_qbx_locals( box_source_starts: "Start" indices for sources for each box box_source_counts_nonchild: Number of sources per box helmholtz_k: Helmholtz parameter (Pass 0 for Laplace) - charge: Source strengths, shape (*nsrcs*,) or *None* - dipstr: Dipole source strengths, shape (*nsrcs*,) or *None* - dipvec: Dipole source orientations, shape (3, *nsrcs*), or *None* - pot: Output potential, shape (*ngts*,) + charge: (Complex) Source strengths, shape (*nsrcs*,), or *None* + dipstr: (Complex) Dipole source strengths, shape (*nsrcs*,) or *None* + dipvec: (Real) Dipole source orientations, shape (3, *nsrcs*), or *None* + pot: (Complex) Output potential, shape (*ngts*,) """ cdef: -- GitLab From 68db3a415768d928acb0fe506d664b5fe3c550a6 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 27 Aug 2018 18:51:53 -0500 Subject: [PATCH 108/139] Allow order varying by level in perf model (closes #5) This also gets rid of summarize_parallel, which was broken / not fully supported anyway. This ends up simplifying a bit of code. 
--- pytential/qbx/__init__.py | 3 +- pytential/qbx/performance.py | 342 ++++++++++++++++++++------------- test/test_performance_model.py | 83 +++++++- 3 files changed, 281 insertions(+), 147 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 86fa930d..b6004826 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -754,7 +754,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: performance_model = self.performance_model - performance_model_result = performance_model(geo_data) + performance_model_result = performance_model( + geo_data, insn.base_kernel, insn.kernel_arguments) # {{{ construct dummy outputs diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 62940a86..3d83204c 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -53,9 +53,9 @@ __doc__ = """ class TranslationCostModel(object): """Provides modeled costs for individual translations or evaluations.""" - def __init__(self, ncoeffs_qbx, ncoeffs_fmm, uses_point_and_shoot): + def __init__(self, ncoeffs_qbx, ncoeffs_fmm_by_level, uses_point_and_shoot): self.ncoeffs_qbx = ncoeffs_qbx - self.ncoeffs_fmm = ncoeffs_fmm + self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level self.uses_point_and_shoot = uses_point_and_shoot @staticmethod @@ -71,32 +71,42 @@ class TranslationCostModel(object): def qbxl2p(self): return var("c_qbxl2p") * self.ncoeffs_qbx - def p2l(self): - return var("c_p2l") * self.ncoeffs_fmm + def p2l(self, level): + return var("c_p2l") * self.ncoeffs_fmm_by_level[level] - def l2p(self): - return var("c_l2p") * self.ncoeffs_fmm + def l2p(self, level): + return var("c_l2p") * self.ncoeffs_fmm_by_level[level] - def p2m(self): - return var("c_p2m") * self.ncoeffs_fmm + def p2m(self, level): + return var("c_p2m") * self.ncoeffs_fmm_by_level[level] - def m2p(self): - return var("c_m2p") * self.ncoeffs_fmm + def m2p(self, level): + return var("c_m2p") * self.ncoeffs_fmm_by_level[level] - 
def m2m(self): - return var("c_m2m") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) + def m2m(self, src_level, tgt_level): + return var("c_m2m") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) - def l2l(self): - return var("c_l2l") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) + def l2l(self, src_level, tgt_level): + return var("c_l2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) - def m2l(self): - return var("c_m2l") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_fmm) + def m2l(self, src_level, tgt_level): + return var("c_m2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) - def m2qbxl(self): - return var("c_m2qbxl") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_qbx) + def m2qbxl(self, level): + return var("c_m2qbxl") * self.e2e_cost( + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) - def l2qbxl(self): - return var("c_l2qbxl") * self.e2e_cost(self.ncoeffs_fmm, self.ncoeffs_qbx) + def l2qbxl(self, level): + return var("c_l2qbxl") * self.e2e_cost( + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) def e2e_cost(self, nsource_coeffs, ntarget_coeffs): if self.uses_point_and_shoot: @@ -205,27 +215,13 @@ class PerformanceModel(object): .. automethod:: __call__ """ - def __init__(self, - uses_pde_expansions=True, - summarize_parallel=None, - calibration_params=None): + def __init__(self, uses_pde_expansions=True, calibration_params=None): """ :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM uses translation operators that make use of the knowledge that the potential satisfies a PDE. - - :arg summarize_parallel: a function of two arguments - *(parallel_array, sym_multipliers)* used to model the cost after - taking into account parallelization. 
*parallel_array* represents a - partitioning of the work into elementary (typically box-based) tasks, - each with a given number of operations. *sym_multipliers* is a symbolic - value representing time per modeled operation. By default, all tasks - are summed into one number encompassing the total cost. """ self.uses_pde_expansions = uses_pde_expansions - if summarize_parallel is None: - summarize_parallel = self.summarize_parallel_default - self.summarize_parallel = summarize_parallel if calibration_params is None: calibration_params = dict() self.calibration_params = calibration_params @@ -234,17 +230,29 @@ class PerformanceModel(object): """Return a copy of *self* with a new set of calibration parameters.""" return type(self)( uses_pde_expansions=self.uses_pde_expansions, - summarize_parallel=self.summarize_parallel, calibration_params=calibration_params) - @staticmethod - def summarize_parallel_default(parallel_array, sym_multipliers): - return np.sum(parallel_array) * sym_multipliers + # {{{ form multipoles + + def process_form_multipoles(self, xlat_cost, traversal, tree): + result = 0 + + for level in range(tree.nlevels): + src_count = 0 + start, stop = traversal.level_start_source_box_nrs[level:level + 2] + for src_ibox in traversal.source_boxes[start:stop]: + nsrcs = tree.box_source_counts_nonchild[src_ibox] + src_count += nsrcs + result += src_count * xlat_cost.p2m(level) + + return dict(form_multipoles=result) + + # }}} # {{{ propagate multipoles upward - def process_coarsen_multipoles(self, xlat_cost, tree, traversal): - nmultipoles = 0 + def process_coarsen_multipoles(self, xlat_cost, traversal, tree): + result = 0 # nlevels-1 is the last valid level index # nlevels-2 is the last valid level that could have children @@ -254,6 +262,9 @@ class PerformanceModel(object): # (because no level 1 box will be well-separated from another) for source_level in range(tree.nlevels-1, 2, -1): target_level = source_level - 1 + cost = xlat_cost.m2m(source_level, 
target_level) + + nmultipoles = 0 start, stop = traversal.level_start_source_parent_box_nrs[ target_level:target_level+2] for ibox in traversal.source_parent_boxes[start:stop]: @@ -261,18 +272,19 @@ class PerformanceModel(object): if child: nmultipoles += 1 - return dict(coarsen_multipoles=( - self.summarize_parallel(nmultipoles, xlat_cost.m2m()))) + result += cost * nmultipoles + + return dict(coarsen_multipoles=result) # }}} # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) def process_direct(self, xlat_cost, traversal, tree, box_target_counts_nonchild): - # box -> nsources * ntargets - npart_direct_list1 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list3 = np.zeros(len(traversal.target_boxes), dtype=np.intp) - npart_direct_list4 = np.zeros(len(traversal.target_boxes), dtype=np.intp) + # list -> number of source-target interactions + npart_direct_list1 = 0 + npart_direct_list3 = 0 + npart_direct_list4 = 0 for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): ntargets = box_target_counts_nonchild[tgt_ibox] @@ -284,7 +296,7 @@ class PerformanceModel(object): npart_direct_list1_srcs += nsources - npart_direct_list1[itgt_box] = ntargets * npart_direct_list1_srcs + npart_direct_list1 += ntargets * npart_direct_list1_srcs npart_direct_list3_srcs = 0 @@ -297,7 +309,7 @@ class PerformanceModel(object): npart_direct_list3_srcs += nsources - npart_direct_list3[itgt_box] = ntargets * npart_direct_list3_srcs + npart_direct_list3 += ntargets * npart_direct_list3_srcs npart_direct_list4_srcs = 0 @@ -310,15 +322,12 @@ class PerformanceModel(object): npart_direct_list4_srcs += nsources - npart_direct_list4[itgt_box] = ntargets * npart_direct_list4_srcs + npart_direct_list4 += ntargets * npart_direct_list4_srcs result = {} - result["eval_direct_list1"] = ( - self.summarize_parallel(npart_direct_list1, xlat_cost.direct())) - result["eval_direct_list3"] = ( - self.summarize_parallel(npart_direct_list3, xlat_cost.direct())) - 
result["eval_direct_list4"] = ( - self.summarize_parallel(npart_direct_list4, xlat_cost.direct())) + result["eval_direct_list1"] = npart_direct_list1 * xlat_cost.direct() + result["eval_direct_list3"] = npart_direct_list3 * xlat_cost.direct() + result["eval_direct_list4"] = npart_direct_list4 * xlat_cost.direct() return result @@ -326,25 +335,27 @@ class PerformanceModel(object): # {{{ translate separated siblings' ("list 2") mpoles to local - def process_list2(self, xlat_cost, traversal): - nm2l = np.zeros(len(traversal.target_or_target_parent_boxes), dtype=np.intp) + def process_list2(self, xlat_cost, traversal, tree): + nm2l_by_level = np.zeros(tree.nlevels, dtype=np.intp) - for itgt_box in range(len(traversal.target_or_target_parent_boxes)): + for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] - nm2l[itgt_box] += end-start + level = tree.box_levels[tgt_ibox] + nm2l_by_level[level] += end-start - return dict(multipole_to_local=( - self.summarize_parallel(nm2l, xlat_cost.m2l()))) + result = sum( + cost * xlat_cost.m2l(ilevel, ilevel) + for ilevel, cost in enumerate(nm2l_by_level)) + + return dict(multipole_to_local=result) # }}} # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles def process_list3(self, xlat_cost, traversal, tree, box_target_counts_nonchild): - nmp_eval = np.zeros( - (tree.nlevels, len(traversal.target_boxes)), - dtype=np.intp) + nmp_eval_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) assert tree.nlevels == len(traversal.from_sep_smaller_by_level) @@ -354,35 +365,68 @@ class PerformanceModel(object): traversal.target_boxes_sep_smaller_by_source_level[ilevel]): ntargets = box_target_counts_nonchild[tgt_ibox] start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] - nmp_eval[ilevel, sep_smaller_list.nonempty_indices[itgt_box]] = ( - ntargets * (end-start) - ) + nmp_eval_by_source_level[ilevel] += ntargets * (end-start) + + result = sum( + cost * xlat_cost.m2p(ilevel) + for ilevel, cost in enumerate(nmp_eval_by_source_level)) - return dict(eval_multipoles=( - self.summarize_parallel(nmp_eval, xlat_cost.m2p()))) + return dict(eval_multipoles=result) # }}} # {{{ form locals for separated bigger source boxes ("list 4") def process_list4(self, xlat_cost, traversal, tree): - nform_local = np.zeros( - len(traversal.target_or_target_parent_boxes), - dtype=np.intp) + nform_local_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) for itgt_box in range(len(traversal.target_or_target_parent_boxes)): start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] - - nform_local_box = 0 for src_ibox in traversal.from_sep_bigger_lists[start:end]: nsources = tree.box_source_counts_nonchild[src_ibox] + level = tree.box_levels[src_ibox] + nform_local_by_source_level[level] += nsources + + result = sum( + cost * xlat_cost.p2l(ilevel) + for ilevel, cost in enumerate(nform_local_by_source_level)) + + return dict(form_locals=result) + + # }}} + + # {{{ propogate locals downward + + def process_refine_locals(self, xlat_cost, traversal, tree): + result = 0 - nform_local_box += nsources + for target_lev in range(1, tree.nlevels): + start, stop = 
traversal.level_start_target_or_target_parent_box_nrs[ + target_lev:target_lev+2] + source_lev = target_lev - 1 + result += (stop-start) * xlat_cost.l2l(source_lev, target_lev) - nform_local[itgt_box] = nform_local_box + return dict(refine_locals=result) - return dict(form_locals=( - self.summarize_parallel(nform_local, xlat_cost.p2l()))) + # }}} + + # {{{ evaluate local expansions at non-qbx targets + + def process_eval_locals(self, xlat_cost, traversal, tree, nqbtl): + ntargets_by_level = np.zeros(tree.nlevels, dtype=np.intp) + + for target_lev in range(tree.nlevels): + start, stop = traversal.level_start_target_box_nrs[ + target_lev:target_lev+2] + for tgt_ibox in traversal.target_boxes[start:stop]: + ntargets_by_level[target_lev] += ( + nqbtl.box_target_counts_nonchild[tgt_ibox]) + + result = sum( + cost * xlat_cost.l2p(ilevel) + for ilevel, cost in enumerate(ntargets_by_level)) + + return dict(eval_locals=result) # }}} @@ -393,16 +437,16 @@ class PerformanceModel(object): global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): # center -> nsources - np2qbxl_list1 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list3 = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4 = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) # center -> number of associated targets - nqbxl2p = np.zeros(len(global_qbx_centers), dtype=np.intp) + nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) for itgt_center, tgt_icenter in enumerate(global_qbx_centers): start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] - nqbxl2p[itgt_center] = end - start + nqbxl2p_by_center[itgt_center] = end - start itgt_box = qbx_center_to_target_box[tgt_icenter] @@ -413,7 +457,7 @@ class 
PerformanceModel(object): np2qbxl_list1_srcs += nsources - np2qbxl_list1[itgt_center] = np2qbxl_list1_srcs + np2qbxl_list1_by_center[itgt_center] = np2qbxl_list1_srcs np2qbxl_list3_srcs = 0 @@ -426,7 +470,7 @@ class PerformanceModel(object): np2qbxl_list3_srcs += nsources - np2qbxl_list3[itgt_center] = np2qbxl_list3_srcs + np2qbxl_list3_by_center[itgt_center] = np2qbxl_list3_srcs np2qbxl_list4_srcs = 0 @@ -439,13 +483,13 @@ class PerformanceModel(object): np2qbxl_list4_srcs += nsources - np2qbxl_list4[itgt_center] = np2qbxl_list4_srcs + np2qbxl_list4_by_center[itgt_center] = np2qbxl_list4_srcs result = {} - result["np2qbxl_list1"] = np2qbxl_list1 - result["np2qbxl_list3"] = np2qbxl_list3 - result["np2qbxl_list4"] = np2qbxl_list4 - result["nqbxl2p"] = nqbxl2p + result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center + result["np2qbxl_list3_by_center"] = np2qbxl_list3_by_center + result["np2qbxl_list4_by_center"] = np2qbxl_list4_by_center + result["nqbxl2p_by_center"] = nqbxl2p_by_center return result @@ -462,17 +506,14 @@ class PerformanceModel(object): result = {} result["eval_target_specific_qbx_locals_list1"] = ( - self.summarize_parallel( - counts["np2qbxl_list1"] * counts["nqbxl2p"], - xlat_cost.p2p_tsqbx())) + sum(counts["np2qbxl_list1_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) result["eval_target_specific_qbx_locals_list3"] = ( - self.summarize_parallel( - counts["np2qbxl_list3"] * counts["nqbxl2p"], - xlat_cost.p2p_tsqbx())) + sum(counts["np2qbxl_list3_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) result["eval_target_specific_qbx_locals_list4"] = ( - self.summarize_parallel( - counts["np2qbxl_list4"] * counts["nqbxl2p"], - xlat_cost.p2p_tsqbx())) + sum(counts["np2qbxl_list4_by_center"] * counts["nqbxl2p_by_center"]) + * xlat_cost.p2p_tsqbx()) return result @@ -489,11 +530,11 @@ class PerformanceModel(object): result = {} result["form_global_qbx_locals_list1"] = ( - 
self.summarize_parallel(counts["np2qbxl_list1"], xlat_cost.p2qbxl())) + sum(counts["np2qbxl_list1_by_center"]) * xlat_cost.p2qbxl()) result["form_global_qbx_locals_list3"] = ( - self.summarize_parallel(counts["np2qbxl_list3"], xlat_cost.p2qbxl())) + sum(counts["np2qbxl_list3_by_center"]) * xlat_cost.p2qbxl()) result["form_global_qbx_locals_list4"] = ( - self.summarize_parallel(counts["np2qbxl_list4"], xlat_cost.p2qbxl())) + sum(counts["np2qbxl_list4_by_center"]) * xlat_cost.p2qbxl()) return result @@ -503,9 +544,7 @@ class PerformanceModel(object): def process_m2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, qbx_center_to_target_box_source_level): - nm2qbxl = np.zeros( - (tree.nlevels, len(global_qbx_centers)), - dtype=np.intp) + nm2qbxl_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) assert tree.nlevels == len(traversal.from_sep_smaller_by_level) @@ -522,10 +561,33 @@ class PerformanceModel(object): ssn.starts[icontaining_tgt_box], ssn.starts[icontaining_tgt_box+1]) - nm2qbxl[isrc_level, itgt_center] += stop-start + nm2qbxl_by_source_level[isrc_level] += stop-start + + result = sum( + cost * xlat_cost.m2qbxl(ilevel) + for ilevel, cost in enumerate(nm2qbxl_by_source_level)) + + return dict(translate_box_multipoles_to_qbx_local=result) + + # }}} + + # {{{ translate from box locals to qbx local expansions + + def process_l2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box): + nl2qbxl_by_level = np.zeros(tree.nlevels, dtype=np.intp) + + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + itgt_box = qbx_center_to_target_box[tgt_icenter] + tgt_ibox = traversal.target_boxes[itgt_box] + level = tree.box_levels[tgt_ibox] + nl2qbxl_by_level[level] += 1 + + result = sum( + cost * xlat_cost.l2qbxl(ilevel) + for ilevel, cost in enumerate(nl2qbxl_by_level)) - return dict(translate_box_multipoles_to_qbx_local=( - self.summarize_parallel(nm2qbxl, xlat_cost.m2qbxl()))) + return 
dict(translate_box_local_to_qbx_local=result) # }}} @@ -533,45 +595,46 @@ class PerformanceModel(object): def process_eval_qbxl(self, xlat_cost, global_qbx_centers, center_to_targets_starts): - nqbx_eval = np.zeros(len(global_qbx_centers), dtype=np.intp) + result = 0 for isrc_center, src_icenter in enumerate(global_qbx_centers): start, end = center_to_targets_starts[src_icenter:src_icenter+2] - nqbx_eval[isrc_center] += end-start + result += (end - start) + + result *= xlat_cost.qbxl2p() - return dict(eval_qbx_expansions=( - self.summarize_parallel(nqbx_eval, xlat_cost.qbxl2p()))) + return dict(eval_qbx_expansions=result) # }}} # {{{ set up translation cost model - def get_translation_cost_model(self, d): + def get_translation_cost_model(self, dim, nlevels): p_qbx = var("p_qbx") - p_fmm = var("p_fmm") + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) uses_point_and_shoot = False if self.uses_pde_expansions: - ncoeffs_fmm = p_fmm ** (d-1) - ncoeffs_qbx = p_qbx ** (d-1) + ncoeffs_fmm = p_fmm ** (dim-1) + ncoeffs_qbx = p_qbx ** (dim-1) - if d == 3: + if dim == 3: uses_point_and_shoot = True else: - ncoeffs_fmm = p_fmm ** d - ncoeffs_qbx = p_qbx ** d + ncoeffs_fmm = p_fmm ** dim + ncoeffs_qbx = p_qbx ** dim return TranslationCostModel( ncoeffs_qbx=ncoeffs_qbx, - ncoeffs_fmm=ncoeffs_fmm, + ncoeffs_fmm_by_level=ncoeffs_fmm, uses_point_and_shoot=uses_point_and_shoot) # }}} @log_process(logger, "gather performance model data") - def __call__(self, geo_data): + def __call__(self, geo_data, kernel, kernel_arguments): """Analyze the given geometry and return performance data. :returns: An instance of :class:`ParametrizedCosts`. 
@@ -582,14 +645,14 @@ class PerformanceModel(object): lpot_source = geo_data.lpot_source - nqbtl = geo_data.non_qbx_box_target_lists() use_tsqbx = lpot_source._use_target_specific_qbx with cl.CommandQueue(geo_data.cl_context) as queue: - tree = geo_data.tree().get(queue=queue) - traversal = geo_data.traversal(merge_close_lists=False).get(queue=queue) - box_target_counts_nonchild = ( - nqbtl.box_target_counts_nonchild.get(queue=queue)) + tree = geo_data.tree().get(queue) + traversal = geo_data.traversal(merge_close_lists=False).get(queue) + nqbtl = geo_data.non_qbx_box_target_lists().get(queue) + + box_target_counts_nonchild = nqbtl.box_target_counts_nonchild params = dict( nlevels=tree.nlevels, @@ -598,23 +661,26 @@ class PerformanceModel(object): ntargets=tree.ntargets, ncenters=geo_data.ncenters, p_qbx=lpot_source.qbx_order, - # FIXME: Assumes this is a constant - p_fmm=lpot_source.fmm_level_to_order(None, None, None, None), ) + for ilevel in range(tree.nlevels): + params["p_fmm_lev%d" % ilevel] = ( + lpot_source.fmm_level_to_order( + kernel, kernel_arguments, tree, ilevel)) + params.update(self.calibration_params) - xlat_cost = self.get_translation_cost_model(tree.dimensions) + xlat_cost = self.get_translation_cost_model(tree.dimensions, tree.nlevels) # {{{ construct local multipoles - result["form_multipoles"] = tree.nsources * xlat_cost.p2m() + result.update(self.process_form_multipoles(xlat_cost, traversal, tree)) # }}} # {{{ propagate multipoles upward - result.update(self.process_coarsen_multipoles(xlat_cost, tree, traversal)) + result.update(self.process_coarsen_multipoles(xlat_cost, traversal, tree)) # }}} @@ -627,7 +693,7 @@ class PerformanceModel(object): # {{{ translate separated siblings' ("list 2") mpoles to local - result.update(self.process_list2(xlat_cost, traversal)) + result.update(self.process_list2(xlat_cost, traversal, tree)) # }}} @@ -646,14 +712,13 @@ class PerformanceModel(object): # {{{ propagate local_exps downward - 
result["refine_locals"] = ( - traversal.ntarget_or_target_parent_boxes * xlat_cost.l2l()) + result.update(self.process_refine_locals(xlat_cost, traversal, tree)) # }}} # {{{ evaluate locals - result["eval_locals"] = nqbtl.nfiltered_targets * xlat_cost.l2p() + result.update(self.process_eval_locals(xlat_cost, traversal, tree, nqbtl)) # }}} @@ -703,8 +768,9 @@ class PerformanceModel(object): # {{{ translate from box local expansions to qbx local expansions - result["translate_box_local_to_qbx_local"] = ( - len(global_qbx_centers) * xlat_cost.l2qbxl()) + result.update(self.process_l2qbxl( + xlat_cost, traversal, tree, global_qbx_centers, + qbx_center_to_target_box)) # }}} diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 7d027221..0abb5c26 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -32,7 +32,11 @@ import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from pytools import one +from sumpy.kernel import LaplaceKernel, HelmholtzKernel + from pytential import bind, sym, norm # noqa +from pytential.qbx.performance import PerformanceModel # {{{ global params @@ -105,7 +109,6 @@ def test_timing_data_gathering(ctx_getter): lpot_source = get_lpot_source(queue, 2) sigma = get_density(queue, lpot_source) - from sumpy.kernel import LaplaceKernel sigma_sym = sym.var("sigma") k_sym = LaplaceKernel(lpot_source.ambient_dim) sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) @@ -130,7 +133,6 @@ def test_performance_model(ctx_getter, dim): lpot_source = get_lpot_source(queue, dim) sigma = get_density(queue, lpot_source) - from sumpy.kernel import LaplaceKernel sigma_sym = sym.var("sigma") k_sym = LaplaceKernel(lpot_source.ambient_dim) @@ -310,7 +312,6 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): CONSTANT_ONE_PARAMS = dict( p_qbx=1, - p_fmm=1, c_l2l=1, c_l2p=1, c_l2qbxl=1, @@ -327,6 +328,20 @@ CONSTANT_ONE_PARAMS = dict( ) +def 
_get_params_for_raw_op_counts(perf_result): + """Return a set of parameters suitable for obtaining raw + operation counts from the model.""" + + # Sets model / calibration parameters equal to 1, to obtain raw op counts. + constant_one_params = CONSTANT_ONE_PARAMS.copy() + + # Set p_fmm_lev* equal to 1. + for level in perf_result.params["nlevels"]: + constant_one_params["p_fmm_lev%d" % level] = 1 + + return constant_one_params + + @pytest.mark.parametrize("dim, off_surface, use_target_specific_qbx", ( (2, False, False), (2, True, False), @@ -339,8 +354,6 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - from pytential.qbx.performance import PerformanceModel - # We set uses_pde_expansions=False, so that a translation is modeled as # simply costing nsrc_coeffs * ntgt_coeffs. By adjusting the symbolic # parameters to equal 1 (done below), this provides a straightforward way @@ -364,7 +377,6 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, qbx_forced_limit = 1 # Construct bound op, run performance model. - from sumpy.kernel import LaplaceKernel sigma_sym = sym.var("sigma") k_sym = LaplaceKernel(lpot_source.ambient_dim) sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=qbx_forced_limit) @@ -374,8 +386,7 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, from pytools import one perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values()) - # Set all parameters equal to 1, to obtain raw op counts. - perf_S = perf_S.with_params(CONSTANT_ONE_PARAMS) + perf_S = perf_S.with_params(_get_params_for_raw_op_counts(perf_S)) # Run FMM with ConstantOneWrangler. This can't be done with pytential's # high-level interface, so call the FMM driver directly. 
@@ -410,6 +421,62 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, # }}} +# {{{ test order varying by level + +def test_performance_model_order_varying_by_level(ctx_getter): + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx) + + # {{{ constant level to order + + def level_to_order_constant(kernel, kernel_args, tree, level): + return 1 + + lpot_source = get_lpot_source(queue, 2).copy( + performance_model=PerformanceModel(uses_pde_expansions=False), + fmm_level_to_order=level_to_order_constant) + + sigma_sym = sym.var("sigma") + + k_sym = LaplaceKernel(2) + sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + sigma = get_density(queue, lpot_source) + + perf_constant = one( + bind(lpot_source, sym_op) + .get_modeled_performance(queue, sigma=sigma).values()) + + perf_constant = perf_constant.with_params(CONSTANT_ONE_PARAMS) + + # }}} + + # {{{ varying level to order + + def level_to_order_varying(kernel, kernel_args, tree, level): + return tree.nlevels - level + + lpot_source = lpot_source.copy(fmm_level_to_order=level_to_order_varying) + + perf_varying = one( + bind(lpot_source, sym_op) + .get_modeled_performance(queue, sigma=sigma).values()) + + perf_varying = perf_varying.with_params(CONSTANT_ONE_PARAMS) + + # }}} + + # This only checks to ensure that the costs are different. The varying-level + # case should have larger cost. 
+ + assert ( + sum(perf_varying.get_predicted_times().values()) > + sum(perf_constant.get_predicted_times().values())) + + +# }}} + + # You can test individual routines by typing # $ python test_performance_model.py 'test_routine()' -- GitLab From 5bddc15826ce7731162e8ea44093dd96ca96881d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 27 Aug 2018 21:40:23 -0500 Subject: [PATCH 109/139] Fix test failure --- test/test_performance_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 0abb5c26..83af2475 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -336,7 +336,7 @@ def _get_params_for_raw_op_counts(perf_result): constant_one_params = CONSTANT_ONE_PARAMS.copy() # Set p_fmm_lev* equal to 1. - for level in perf_result.params["nlevels"]: + for level in range(perf_result.params["nlevels"]): constant_one_params["p_fmm_lev%d" % level] = 1 return constant_one_params -- GitLab From 168afe0b133a5df678b7adb6401a5b52f9222a19 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 27 Aug 2018 21:40:29 -0500 Subject: [PATCH 110/139] Fix kernel parameter gathering, test that parameters are as expected for Helmholtz kernels. 
--- pytential/qbx/__init__.py | 8 ++- test/test_performance_model.py | 94 ++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index b6004826..340bd6e0 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -754,8 +754,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: performance_model = self.performance_model - performance_model_result = performance_model( - geo_data, insn.base_kernel, insn.kernel_arguments) + kernel_args = {} + for arg_name, arg_expr in six.iteritems(insn.kernel_arguments): + kernel_args[arg_name] = evaluate(arg_expr) + + performance_model_result = ( + performance_model(geo_data, insn.base_kernel, kernel_args)) # {{{ construct dummy outputs diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 83af2475..8927a66e 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -151,6 +151,49 @@ def test_performance_model(ctx_getter, dim): # }}} +# {{{ test performance model parameter gathering + +def test_performance_model_parameter_gathering(ctx_getter): + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx) + + from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder + + fmm_level_to_order = SimpleExpansionOrderFinder(tol=1e-5) + + lpot_source = get_lpot_source(queue, 2).copy( + fmm_level_to_order=fmm_level_to_order) + + sigma = get_density(queue, lpot_source) + + sigma_sym = sym.var("sigma") + k_sym = HelmholtzKernel(2, "k") + k = 2 + + sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1, k=sym.var("k")) + op_S = bind(lpot_source, sym_op_S) + + perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma, k=k).values()) + + geo_data = lpot_source.qbx_fmm_geometry_data( + target_discrs_and_qbx_sides=((lpot_source.density_discr, 1),)) + + tree = geo_data.tree() + + assert perf_S.params["p_qbx"] == QBX_ORDER + assert perf_S.params["nlevels"] == 
tree.nlevels + assert perf_S.params["nsources"] == tree.nsources + assert perf_S.params["ntargets"] == tree.ntargets + assert perf_S.params["ncenters"] == geo_data.ncenters + + for level in range(tree.nlevels): + assert ( + perf_S.params["p_fmm_lev%d" % level] == + fmm_level_to_order(k_sym, {"k": 2}, tree, level)) + +# }}} + + # {{{ constant one wrangler class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): @@ -474,6 +517,57 @@ def test_performance_model_order_varying_by_level(ctx_getter): sum(perf_constant.get_predicted_times().values())) +def test_performance_model_order_varying_by_level(ctx_getter): + cl_ctx = ctx_getter() + queue = cl.CommandQueue(cl_ctx) + + # {{{ constant level to order + + def level_to_order_constant(kernel, kernel_args, tree, level): + return 1 + + lpot_source = get_lpot_source(queue, 2).copy( + performance_model=PerformanceModel(uses_pde_expansions=False), + fmm_level_to_order=level_to_order_constant) + + sigma_sym = sym.var("sigma") + + k_sym = LaplaceKernel(2) + sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + + sigma = get_density(queue, lpot_source) + + perf_constant = one( + bind(lpot_source, sym_op) + .get_modeled_performance(queue, sigma=sigma).values()) + + perf_constant = perf_constant.with_params(CONSTANT_ONE_PARAMS) + + # }}} + + # {{{ varying level to order + + def level_to_order_varying(kernel, kernel_args, tree, level): + return tree.nlevels - level + + lpot_source = lpot_source.copy(fmm_level_to_order=level_to_order_varying) + + perf_varying = one( + bind(lpot_source, sym_op) + .get_modeled_performance(queue, sigma=sigma).values()) + + perf_varying = perf_varying.with_params(CONSTANT_ONE_PARAMS) + + # }}} + + # This only checks to ensure that the costs are different. The varying-level + # case should have larger cost. 
+ + assert ( + sum(perf_varying.get_predicted_times().values()) > + sum(perf_constant.get_predicted_times().values())) + + # }}} -- GitLab From eb11961e1f55f1c13505cdba647711e49a931571 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 27 Aug 2018 21:42:05 -0500 Subject: [PATCH 111/139] Remove duplicated function --- test/test_performance_model.py | 52 ---------------------------------- 1 file changed, 52 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 8927a66e..08bbea4a 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -516,58 +516,6 @@ def test_performance_model_order_varying_by_level(ctx_getter): sum(perf_varying.get_predicted_times().values()) > sum(perf_constant.get_predicted_times().values())) - -def test_performance_model_order_varying_by_level(ctx_getter): - cl_ctx = ctx_getter() - queue = cl.CommandQueue(cl_ctx) - - # {{{ constant level to order - - def level_to_order_constant(kernel, kernel_args, tree, level): - return 1 - - lpot_source = get_lpot_source(queue, 2).copy( - performance_model=PerformanceModel(uses_pde_expansions=False), - fmm_level_to_order=level_to_order_constant) - - sigma_sym = sym.var("sigma") - - k_sym = LaplaceKernel(2) - sym_op = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) - - sigma = get_density(queue, lpot_source) - - perf_constant = one( - bind(lpot_source, sym_op) - .get_modeled_performance(queue, sigma=sigma).values()) - - perf_constant = perf_constant.with_params(CONSTANT_ONE_PARAMS) - - # }}} - - # {{{ varying level to order - - def level_to_order_varying(kernel, kernel_args, tree, level): - return tree.nlevels - level - - lpot_source = lpot_source.copy(fmm_level_to_order=level_to_order_varying) - - perf_varying = one( - bind(lpot_source, sym_op) - .get_modeled_performance(queue, sigma=sigma).values()) - - perf_varying = perf_varying.with_params(CONSTANT_ONE_PARAMS) - - # }}} - - # This only checks to ensure that the costs are different. 
The varying-level - # case should have larger cost. - - assert ( - sum(perf_varying.get_predicted_times().values()) > - sum(perf_constant.get_predicted_times().values())) - - # }}} -- GitLab From 15d69ee0f62953c924ec60e9c47e9bb9d0936fda Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 28 Aug 2018 18:37:52 -0500 Subject: [PATCH 112/139] Simplify test_performance_model_order_varying_by_level() --- test/test_performance_model.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 08bbea4a..e24cce96 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -496,16 +496,13 @@ def test_performance_model_order_varying_by_level(ctx_getter): # {{{ varying level to order - def level_to_order_varying(kernel, kernel_args, tree, level): - return tree.nlevels - level + varying_order_params = CONSTANT_ONE_PARAMS.copy() - lpot_source = lpot_source.copy(fmm_level_to_order=level_to_order_varying) + nlevels = perf_constant.params["nlevels"] + for level in range(nlevels): + varying_order_params["p_fmm_lev%d" % level] = nlevels - level - perf_varying = one( - bind(lpot_source, sym_op) - .get_modeled_performance(queue, sigma=sigma).values()) - - perf_varying = perf_varying.with_params(CONSTANT_ONE_PARAMS) + perf_varying = perf_constant.with_params(varying_order_params) # }}} -- GitLab From 62ffaf99c6575a29a48ff2c6d33f1280f33bc5de Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 28 Aug 2018 20:01:10 -0500 Subject: [PATCH 113/139] Improve scaling of performance model gathering code, by not traversing the "close" list for every QBX center. 
--- pytential/qbx/performance.py | 53 +++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 3d83204c..ea5a60b9 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -436,54 +436,57 @@ class PerformanceModel(object): def _collect_qbxl_direct_interaction_data(traversal, tree, global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): - # center -> nsources - np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - - # center -> number of associated targets - nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] - nqbxl2p_by_center[itgt_center] = end - start + ntarget_boxes = len(traversal.target_boxes) - itgt_box = qbx_center_to_target_box[tgt_icenter] + # target box index -> nsources + np2qbxl_list1_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + np2qbxl_list3_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + np2qbxl_list4_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + for itgt_box in range(ntarget_boxes): np2qbxl_list1_srcs = 0 start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list1_srcs += nsources + np2qbxl_list1_srcs += tree.box_source_counts_nonchild[src_ibox] - np2qbxl_list1_by_center[itgt_center] = np2qbxl_list1_srcs + np2qbxl_list1_by_itgt_box[itgt_box] = np2qbxl_list1_srcs np2qbxl_list3_srcs = 0 - # Could be None, if not using targets with extent. 
if traversal.from_sep_close_smaller_starts is not None: start, end = ( traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] + np2qbxl_list3_srcs += tree.box_source_counts_nonchild[src_ibox] - np2qbxl_list3_srcs += nsources - - np2qbxl_list3_by_center[itgt_center] = np2qbxl_list3_srcs + np2qbxl_list3_by_itgt_box[itgt_box] = np2qbxl_list3_srcs np2qbxl_list4_srcs = 0 - # Could be None, if not using targets with extent. if traversal.from_sep_close_bigger_starts is not None: start, end = ( traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] + np2qbxl_list4_srcs += tree.box_source_counts_nonchild[src_ibox] + + np2qbxl_list4_by_itgt_box[itgt_box] = np2qbxl_list4_srcs + + # center -> nsources + np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + + # center -> number of associated targets + nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4_srcs += nsources + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] + nqbxl2p_by_center[itgt_center] = end - start - np2qbxl_list4_by_center[itgt_center] = np2qbxl_list4_srcs + itgt_box = qbx_center_to_target_box[tgt_icenter] + np2qbxl_list1_by_center[itgt_center] = np2qbxl_list1_by_itgt_box[itgt_box] + np2qbxl_list3_by_center[itgt_center] = np2qbxl_list3_by_itgt_box[itgt_box] + np2qbxl_list4_by_center[itgt_center] = np2qbxl_list4_by_itgt_box[itgt_box] result = {} result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center -- GitLab From 3cb406d50dfd775623e918daaa18ffd8728cecd4 Mon Sep 17 
00:00:00 2001 From: Matt Wala Date: Tue, 28 Aug 2018 20:08:16 -0500 Subject: [PATCH 114/139] flake8 fix --- pytential/qbx/performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index ea5a60b9..d81948b0 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -484,9 +484,12 @@ class PerformanceModel(object): nqbxl2p_by_center[itgt_center] = end - start itgt_box = qbx_center_to_target_box[tgt_icenter] - np2qbxl_list1_by_center[itgt_center] = np2qbxl_list1_by_itgt_box[itgt_box] - np2qbxl_list3_by_center[itgt_center] = np2qbxl_list3_by_itgt_box[itgt_box] - np2qbxl_list4_by_center[itgt_center] = np2qbxl_list4_by_itgt_box[itgt_box] + np2qbxl_list1_by_center[itgt_center] = ( + np2qbxl_list1_by_itgt_box[itgt_box]) + np2qbxl_list3_by_center[itgt_center] = ( + np2qbxl_list3_by_itgt_box[itgt_box]) + np2qbxl_list4_by_center[itgt_center] = ( + np2qbxl_list4_by_itgt_box[itgt_box]) result = {} result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center -- GitLab From fe3e652417241d9575529e6fa4fbe8bfe413f0e7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 29 Aug 2018 14:48:44 -0500 Subject: [PATCH 115/139] [ci skip] gather performance model data -> model performance --- pytential/qbx/performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index d81948b0..e9221848 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -639,7 +639,7 @@ class PerformanceModel(object): # }}} - @log_process(logger, "gather performance model data") + @log_process(logger, "model performance") def __call__(self, geo_data, kernel, kernel_arguments): """Analyze the given geometry and return performance data. 
-- GitLab From 821dbd168dc1ece4826307906776b41b96c46191 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 30 Aug 2018 23:32:24 -0500 Subject: [PATCH 116/139] Fix c_p2p_tsqbx multiplier. --- pytential/qbx/performance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index e9221848..9a648781 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -66,7 +66,9 @@ class TranslationCostModel(object): return var("c_p2qbxl") * self.ncoeffs_qbx def p2p_tsqbx(self): - return var("c_p2p_tsqbx") * self.ncoeffs_qbx + # This term should be linear in the QBX order, which is the + # square root of the number of QBX coefficients. + return var("c_p2p_tsqbx") * self.ncoeffs_qbx ** (1/2) def qbxl2p(self): return var("c_qbxl2p") * self.ncoeffs_qbx -- GitLab From 140565ff8b978f2fb0bf143aad10dc845eac0563 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 31 Aug 2018 15:06:56 -0500 Subject: [PATCH 117/139] Clean up code duplication --- pytential/qbx/performance.py | 146 +++++++++++++++++------------------ 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 9a648781..11909479 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -280,51 +280,76 @@ class PerformanceModel(object): # }}} - # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + # {{{ collect direct interaction data - def process_direct(self, xlat_cost, traversal, tree, box_target_counts_nonchild): - # list -> number of source-target interactions - npart_direct_list1 = 0 - npart_direct_list3 = 0 - npart_direct_list4 = 0 + @staticmethod + def _collect_direction_interaction_data(traversal, tree): + ntarget_boxes = len(traversal.target_boxes) - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] + # target box index -> nsources + 
nlist1_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + nlist3close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + nlist4close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - npart_direct_list1_srcs = 0 + for itgt_box in range(ntarget_boxes): + nlist1_srcs = 0 start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] + nlist1_srcs += tree.box_source_counts_nonchild[src_ibox] - npart_direct_list1_srcs += nsources - - npart_direct_list1 += ntargets * npart_direct_list1_srcs - - npart_direct_list3_srcs = 0 + nlist1_srcs_by_itgt_box[itgt_box] = nlist1_srcs + nlist3close_srcs = 0 # Could be None, if not using targets with extent. if traversal.from_sep_close_smaller_starts is not None: start, end = ( traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - - npart_direct_list3_srcs += nsources + nlist3close_srcs += tree.box_source_counts_nonchild[src_ibox] - npart_direct_list3 += ntargets * npart_direct_list3_srcs - - npart_direct_list4_srcs = 0 + nlist3close_srcs_by_itgt_box[itgt_box] = nlist3close_srcs + nlist4close_srcs = 0 # Could be None, if not using targets with extent. 
if traversal.from_sep_close_bigger_starts is not None: start, end = ( traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] + nlist4close_srcs += tree.box_source_counts_nonchild[src_ibox] + + nlist4close_srcs_by_itgt_box[itgt_box] = nlist4close_srcs + + result = {} + result["nlist1_srcs_by_itgt_box"] = nlist1_srcs_by_itgt_box + result["nlist3close_srcs_by_itgt_box"] = nlist3close_srcs_by_itgt_box + result["nlist4close_srcs_by_itgt_box"] = nlist4close_srcs_by_itgt_box - npart_direct_list4_srcs += nsources + return result + + # }}} - npart_direct_list4 += ntargets * npart_direct_list4_srcs + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + def process_direct(self, xlat_cost, traversal, direct_interaction_data, + box_target_counts_nonchild): + nlist1_srcs_by_itgt_box = ( + direct_interaction_data["nlist1_srcs_by_itgt_box"]) + nlist3close_srcs_by_itgt_box = ( + direct_interaction_data["nlist3close_srcs_by_itgt_box"]) + nlist4close_srcs_by_itgt_box = ( + direct_interaction_data["nlist4close_srcs_by_itgt_box"]) + + # list -> number of source-target interactions + npart_direct_list1 = 0 + npart_direct_list3 = 0 + npart_direct_list4 = 0 + + for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): + ntargets = box_target_counts_nonchild[tgt_ibox] + + npart_direct_list1 += ntargets * nlist1_srcs_by_itgt_box[itgt_box] + npart_direct_list3 += ntargets * nlist3close_srcs_by_itgt_box[itgt_box] + npart_direct_list4 += ntargets * nlist4close_srcs_by_itgt_box[itgt_box] result = {} result["eval_direct_list1"] = npart_direct_list1 * xlat_cost.direct() @@ -435,43 +460,14 @@ class PerformanceModel(object): # {{{ collect data about direct interactions with qbx centers @staticmethod - def _collect_qbxl_direct_interaction_data(traversal, tree, + def _collect_qbxl_direct_interaction_data(direct_interaction_data, global_qbx_centers, 
qbx_center_to_target_box, center_to_targets_starts): - - ntarget_boxes = len(traversal.target_boxes) - - # target box index -> nsources - np2qbxl_list1_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - np2qbxl_list3_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - np2qbxl_list4_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - - for itgt_box in range(ntarget_boxes): - np2qbxl_list1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - np2qbxl_list1_srcs += tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list1_by_itgt_box[itgt_box] = np2qbxl_list1_srcs - - np2qbxl_list3_srcs = 0 - # Could be None, if not using targets with extent. - if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - np2qbxl_list3_srcs += tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list3_by_itgt_box[itgt_box] = np2qbxl_list3_srcs - - np2qbxl_list4_srcs = 0 - # Could be None, if not using targets with extent. 
- if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - np2qbxl_list4_srcs += tree.box_source_counts_nonchild[src_ibox] - - np2qbxl_list4_by_itgt_box[itgt_box] = np2qbxl_list4_srcs + nlist1_srcs_by_itgt_box = ( + direct_interaction_data["nlist1_srcs_by_itgt_box"]) + nlist3close_srcs_by_itgt_box = ( + direct_interaction_data["nlist3close_srcs_by_itgt_box"]) + nlist4close_srcs_by_itgt_box = ( + direct_interaction_data["nlist4close_srcs_by_itgt_box"]) # center -> nsources np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) @@ -487,11 +483,11 @@ class PerformanceModel(object): itgt_box = qbx_center_to_target_box[tgt_icenter] np2qbxl_list1_by_center[itgt_center] = ( - np2qbxl_list1_by_itgt_box[itgt_box]) + nlist1_srcs_by_itgt_box[itgt_box]) np2qbxl_list3_by_center[itgt_center] = ( - np2qbxl_list3_by_itgt_box[itgt_box]) + nlist3close_srcs_by_itgt_box[itgt_box]) np2qbxl_list4_by_center[itgt_center] = ( - np2qbxl_list4_by_itgt_box[itgt_box]) + nlist4close_srcs_by_itgt_box[itgt_box]) result = {} result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center @@ -505,12 +501,12 @@ class PerformanceModel(object): # {{{ eval target specific qbx expansions - def process_eval_target_specific_qbxl(self, xlat_cost, traversal, tree, + def process_eval_target_specific_qbxl(self, xlat_cost, direct_interaction_data, global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): counts = self._collect_qbxl_direct_interaction_data( - traversal, tree, global_qbx_centers, qbx_center_to_target_box, - center_to_targets_starts) + direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts) result = {} result["eval_target_specific_qbx_locals_list1"] = ( @@ -529,12 +525,12 @@ class PerformanceModel(object): # {{{ form global qbx locals - def process_form_qbxl(self, xlat_cost, 
traversal, tree, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts): + def process_form_qbxl(self, xlat_cost, direct_interaction_data, + global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): counts = self._collect_qbxl_direct_interaction_data( - traversal, tree, global_qbx_centers, qbx_center_to_target_box, - center_to_targets_starts) + direct_interaction_data, global_qbx_centers, + qbx_center_to_target_box, center_to_targets_starts) result = {} result["form_global_qbx_locals_list1"] = ( @@ -692,10 +688,14 @@ class PerformanceModel(object): # }}} + direct_interaction_data = ( + self._collect_direction_interaction_data(traversal, tree)) + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) result.update(self.process_direct( - xlat_cost, traversal, tree, box_target_counts_nonchild)) + xlat_cost, traversal, direct_interaction_data, + box_target_counts_nonchild)) # }}} @@ -757,11 +757,11 @@ class PerformanceModel(object): if use_tsqbx: result.update(self.process_eval_target_specific_qbxl( - xlat_cost, traversal, tree, global_qbx_centers, + xlat_cost, direct_interaction_data, global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts)) else: result.update(self.process_form_qbxl( - xlat_cost, traversal, tree, global_qbx_centers, + xlat_cost, direct_interaction_data, global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts)) # }}} -- GitLab From 630921161514158ed574c198d54d9d10f8d50b1a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 3 Sep 2018 15:55:58 -0500 Subject: [PATCH 118/139] Fix coefficient counts --- pytential/qbx/performance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 11909479..ccad4f7e 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -620,15 +620,15 @@ class PerformanceModel(object): uses_point_and_shoot = False if self.uses_pde_expansions: - 
ncoeffs_fmm = p_fmm ** (dim-1) - ncoeffs_qbx = p_qbx ** (dim-1) + ncoeffs_fmm = (p_fmm + 1) ** (dim-1) + ncoeffs_qbx = (p_qbx + 1) ** (dim-1) if dim == 3: uses_point_and_shoot = True else: - ncoeffs_fmm = p_fmm ** dim - ncoeffs_qbx = p_qbx ** dim + ncoeffs_fmm = (p_fmm + 1) ** dim + ncoeffs_qbx = (p_qbx + 1) ** dim return TranslationCostModel( ncoeffs_qbx=ncoeffs_qbx, -- GitLab From 89c027a1d2de4f448cae4773f65049c3fa1b39c8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 3 Sep 2018 16:29:32 -0500 Subject: [PATCH 119/139] Update test_performance_model for differing coefficient counts --- test/test_performance_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index e24cce96..5b7894e4 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -354,7 +354,8 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): # {{{ verify performance model CONSTANT_ONE_PARAMS = dict( - p_qbx=1, + # Number of QBX coefficients: 1 + p_qbx=0, c_l2l=1, c_l2p=1, c_l2qbxl=1, @@ -378,9 +379,9 @@ def _get_params_for_raw_op_counts(perf_result): # Sets model / calibration parameters equal to 1, to obtain raw op counts. constant_one_params = CONSTANT_ONE_PARAMS.copy() - # Set p_fmm_lev* equal to 1. + # Set p_fmm_lev* equal to 0 (sets number of coeffs to 1). 
for level in range(perf_result.params["nlevels"]): - constant_one_params["p_fmm_lev%d" % level] = 1 + constant_one_params["p_fmm_lev%d" % level] = 0 return constant_one_params -- GitLab From 95712b4fdc1be890de631a42bf5fda2ea4814ee1 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 5 Sep 2018 16:41:00 -0500 Subject: [PATCH 120/139] Remove unused variable --- test/test_performance_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 5b7894e4..8f43f7d8 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -240,7 +240,7 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): global_qbx_centers = self.geo_data.global_qbx_centers() qbx_center_to_target_box = self.geo_data.qbx_center_to_target_box() - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + for tgt_icenter in global_qbx_centers: itgt_box = qbx_center_to_target_box[tgt_icenter] start, end = ( -- GitLab From 585ad5f31e8d1c54e0bd81cfd93668854e6a0329 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 5 Sep 2018 16:47:05 -0500 Subject: [PATCH 121/139] Follow some reasonable pylint suggestions --- pytential/qbx/fmmlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 3318e666..1dce5762 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -486,7 +486,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): # Is the box number on the level currently under # consideration? 
- in_range = (lev_box_start <= src_ibox and src_ibox < lev_box_stop) + in_range = (lev_box_start <= src_ibox < lev_box_stop) if in_range: src_center = self.tree.box_centers[:, src_ibox] @@ -521,7 +521,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): taeval = self.get_expn_eval_routine("ta") - for isrc_center, src_icenter in enumerate(global_qbx_centers): + for src_icenter in global_qbx_centers: for icenter_tgt in range( ctt.starts[src_icenter], ctt.starts[src_icenter+1]): -- GitLab From 14dd2bca5b7a890c89301b5755790edc22b1721a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 5 Sep 2018 16:54:09 -0500 Subject: [PATCH 122/139] Fix exec function getting --- pytential/symbolic/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index 2a06ba43..5959b598 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -387,7 +387,7 @@ class Code(object): done_insns.add(insn) assignments = ( - insn.get_exec_function(exec_mapper) + self.get_exec_function(insn, exec_mapper) (exec_mapper.queue, insn, exec_mapper.bound_expr, exec_mapper)) -- GitLab From 24605be986be129427b3b82547efefcba06fe092 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 11 Sep 2018 17:21:58 -0500 Subject: [PATCH 123/139] Use the base kernel to obtain the FMM level to order --- pytential/qbx/performance.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index ccad4f7e..0ab4f952 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -670,7 +670,7 @@ class PerformanceModel(object): for ilevel in range(tree.nlevels): params["p_fmm_lev%d" % ilevel] = ( lpot_source.fmm_level_to_order( - kernel, kernel_arguments, tree, ilevel)) + kernel.get_base_kernel(), kernel_arguments, tree, ilevel)) params.update(self.calibration_params) diff --git a/setup.py b/setup.py index 
c7f5e2f0..bf1a6b7c 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ ext_modules = [ depends=[ "pytential/qbx/target_specific/_internal.h", "pytential/qbx/target_specific/_helmholtz_utils.h"], - extra_compile_args=["-Wall", "-fopenmp", "-ffast-math", "-march=native"], + extra_compile_args=["-Wall", "-fopenmp", "-Ofast"], extra_link_args=["-fopenmp"] ), ] -- GitLab From 3b574437cf1be5222a97bbd2b66ab0dcd438fe46 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 11 Sep 2018 21:39:35 -0500 Subject: [PATCH 124/139] TSQBX: Hoist the target/Bessel computation out of the inner loop to avoid expensive Bessel eval --- pytential/qbx/target_specific/_internal.pyx | 23 ++++++++++++--------- test/test_target_specific_qbx.py | 12 +++++------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index ab258f8c..e318f2b7 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -32,7 +32,7 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): """Evaluate spherical Bessel functions. Arguments: - nterms: Number of terms to evaluate + nterms: Highest order to be computed z: Argument scale: Output scaling factor (recommended: min(abs(z), 1)) fjs: Output array of complex doubles @@ -46,6 +46,9 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): double scale_ double complex z_ + if nterms <= 0: + raise ValueError("nterms should be positive") + nterms_ = nterms z_ = z scale_ = scale @@ -58,7 +61,7 @@ def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): if ier: raise ValueError("jfuns3d_ returned error code %d" % ier) - for i in range(nterms): + for i in range(1 + nterms): fjs[i] = fjstemp[i] if ifder: fjder[i] = fjdertmp[i] @@ -68,7 +71,7 @@ def h3dall_wrapper(nterms, z, scale, hs, hders): """Evaluate spherical Hankel functions. 
Arguments: - nterms: Number of terms to evaluate + nterms: Highest order to be computed z: Argument scale: Output scaling factor (recommended: min(abs(z), 1)) hs: Output array of complex doubles @@ -78,23 +81,23 @@ def h3dall_wrapper(nterms, z, scale, hs, hders): int nterms_, ifder double scale_ double complex z_ - double complex[:] hvec = np.empty(nterms, np.complex) - double complex[:] hdervec = np.empty(nterms, np.complex) + double complex[:] hvec = np.empty(1 + nterms, np.complex) + double complex[:] hdervec = np.empty(1 + nterms, np.complex) ifder = hders is not None - if nterms == 0: - return + if nterms <= 0: + raise ValueError("nterms should be positive") - nterms_ = nterms - 1 z_ = z scale_ = scale + nterms_ = nterms h3dall_(&nterms_, &z_, &scale_, &hvec[0], &ifder, &hdervec[0]) - hs[:nterms] = hvec[:] + hs[:1 + nterms] = hvec[:] if ifder: - hders[:nterms] = hdervec[:] + hders[:1 + nterms] = hdervec[:] cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index dd71676a..09982924 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -47,11 +47,11 @@ logger = logging.getLogger(__name__) def test_spherical_bessel_functions(): import pytential.qbx.target_specific as ts - nterms = 10 + nterms = 9 z = 3j scale = 1 - j = np.zeros(nterms, dtype=np.complex) - jder = np.zeros(nterms, dtype=np.complex) + j = np.zeros(1 + nterms, dtype=np.complex) + jder = np.zeros(1 + nterms, dtype=np.complex) ts.jfuns3d_wrapper(nterms, z, scale, j, jder) # Reference solution computed using scipy.special.spherical_jn @@ -90,11 +90,11 @@ def test_spherical_bessel_functions(): def test_spherical_hankel_functions(): import pytential.qbx.target_specific as ts - nterms = 10 + nterms = 9 z = 2 + 3j scale = 1 - h = np.zeros(nterms, dtype=np.complex) - hder = np.zeros(nterms, dtype=np.complex) + h = np.zeros(1 + nterms, dtype=np.complex) + hder = np.zeros(1 
+ nterms, dtype=np.complex) ts.h3dall_wrapper(nterms, z, scale, h, hder) # Reference solution computed using -- GitLab From 887a25be117c97103bf6a36242cb54d894b018af Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 11 Sep 2018 21:59:13 -0500 Subject: [PATCH 125/139] Actually include code for hoisting the source-invariant computation out of the loop --- pytential/qbx/target_specific/_internal.pyx | 106 ++++++++++++-------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index e318f2b7..0be53ad8 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -198,24 +198,58 @@ cdef void tsqbx_laplace_dlp( return +cdef void tsqbx_helmholtz_precompute( + double[3] center, + double[3] target, + int order, + double complex k, + double complex *jvals, + double *jscale) nogil: + """Evaluate the source-invariant Bessel terms for the Helmholtz TSQBX + kernel.""" + + cdef: + double complex z + double tc_d + int ier, ntop, ifder, lwfjs + int[BUFSIZE] iscale + + tc_d = dist(target, center) + jscale[0] = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 + + # Evaluate the spherical Bessel terms. + z = k * tc_d + ifder = 0 + lwfjs = BUFSIZE + # jfuns3d_ only supports order > 0 (goes out of bounds if order = 0) + order = max(1, order) + jfuns3d_(&ier, &order, &z, jscale, jvals, &ifder, NULL, &lwfjs, iscale, + &ntop) + if ier: + # This could in theory fail. 
+ fprintf(stderr, "array passed to jfuns3d was too small\n") + abort() + + cdef void tsqbx_helmholtz_dlp( double[3] source, double[3] center, double[3] target, double complex[3] grad, int order, - double complex k) nogil: + double complex k, + double complex *jvals, + double jscale) nogil: cdef: int n, m - int ier, ntop, ifder, lwfjs + int ifder double sc_d, tc_d, cos_angle, alpha double[3] cms, tmc double complex[3] grad_tmp double[BUFSIZE] lvals, lderivs double complex z - double complex[BUFSIZE] jvals, hvals, hderivs - int[BUFSIZE] iscale - double jscale, hscale, unscale + double complex [BUFSIZE] hvals, hderivs + double hscale, unscale for m in range(3): cms[m] = center[m] - source[m] @@ -235,25 +269,13 @@ cdef void tsqbx_helmholtz_dlp( # Evaluate the Legendre terms. legvals(cos_angle, order, lvals, lderivs) - # Scaling magic for Bessel and Hankel terms. + # Scaling magic for Hankel terms. # These values are taken from the fmmlib documentation. - jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 # unscale = (jscale / hscale) ** n # Multiply against unscale to remove the scaling. unscale = 1 - # Evaluate the spherical Bessel terms. - z = k * tc_d - ifder = 0 - lwfjs = BUFSIZE - jfuns3d_(&ier, &order, &z, &jscale, jvals, &ifder, NULL, &lwfjs, iscale, - &ntop) - if ier: - # This could in theory fail. - fprintf(stderr, "array passed to jfuns3d was too small\n") - abort() - # Evaluate the spherical Hankel terms. 
z = k * sc_d ifder = 1 @@ -335,14 +357,15 @@ cdef double complex tsqbx_helmholtz_slp( double[3] center, double[3] target, int order, - double complex k) nogil: + double complex k, + double complex *jvals, + double jscale) nogil: cdef: - int n, ntop, ier, ifder, lwfjs + int n, ifder double sc_d, tc_d, cos_angle double[BUFSIZE] lvals - double complex[BUFSIZE] jvals, hvals - int[BUFSIZE] iscale - double jscale, hscale, unscale + double complex[BUFSIZE] hvals + double hscale, unscale double complex z, result tc_d = dist(target, center) @@ -357,27 +380,16 @@ cdef double complex tsqbx_helmholtz_slp( # Evaluate the Legendre terms. legvals(cos_angle, order, lvals, NULL) - # Scaling magic for Bessel and Hankel terms. + # Scaling magic for Hankel terms. # These values are taken from the fmmlib documentation. - jscale = cabs(k * tc_d) if (cabs(k * tc_d) < 1) else 1 hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 # unscale = (jscale / hscale) ** n # Multiply against unscale to remove the scaling. unscale = 1 - # Evaluate the spherical Bessel terms. - z = k * tc_d - ifder = 0 - lwfjs = BUFSIZE - jfuns3d_(&ier, &order, &z, &jscale, jvals, &ifder, NULL, &lwfjs, iscale, - &ntop) - if ier: - # This could in theory fail. - fprintf(stderr, "array passed to jfuns3d was too small\n") - abort() - # Evaluate the spherical Hankel terms. 
z = k * sc_d + ifder = 0 h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) result = 0 @@ -433,9 +445,10 @@ def eval_target_specific_qbx_locals( int isrc_box, isrc_box_start, isrc_box_end int isrc, isrc_start, isrc_end int m, tid + double jscale double complex result double[:,:] source, center, target, grad - double complex[:,:] grad_complex + double complex[:,:] grad_complex, jvals int laplace_slp, helmholtz_slp, laplace_dlp, helmholtz_dlp if charge is None and (dipstr is None or dipvec is None): @@ -463,13 +476,17 @@ def eval_target_specific_qbx_locals( center = np.zeros((maxthreads, 65)) grad = np.zeros((maxthreads, 65)) grad_complex = np.zeros((maxthreads, 65), dtype=np.complex) + jvals = np.zeros((maxthreads, BUFSIZE + 65), dtype=np.complex) - # TODO: Check that the order is not too high, since some temporary arrays - # used above might overflow if that is the case. + # TODO: Check that the order is not too high, since temporary + # arrays in this module that are limited by BUFSIZE may overflow + # if that is the case for ictr in cython.parallel.prange(0, qbx_centers.shape[0], nogil=True, schedule="static", chunksize=128): + # Assign to jscale so Cython marks it as private + jscale = 0 ctr = qbx_centers[ictr] itgt_start = center_to_target_starts[ctr] itgt_end = center_to_target_starts[ctr + 1] @@ -486,6 +503,11 @@ def eval_target_specific_qbx_locals( for m in range(3): target[tid, m] = targets[m, tgt] + if helmholtz_slp or helmholtz_dlp: + tsqbx_helmholtz_precompute(&center[tid, 0], &target[tid, 0], + order, helmholtz_k, &jvals[tid, 0], + &jscale) + isrc_box_start = source_box_starts[tgt_box] isrc_box_end = source_box_starts[tgt_box + 1] @@ -510,7 +532,8 @@ def eval_target_specific_qbx_locals( result = result + charge[isrc] * ( tsqbx_helmholtz_slp(&source[tid, 0], &center[tid, 0], &target[tid, 0], order, - helmholtz_k)) + helmholtz_k, &jvals[tid, 0], + jscale)) elif laplace_dlp: tsqbx_laplace_dlp(&source[tid, 0], &center[tid, 0], @@ -524,7 +547,8 @@ def
eval_target_specific_qbx_locals( elif helmholtz_dlp: tsqbx_helmholtz_dlp(&source[tid, 0], &center[tid, 0], &target[tid, 0], &grad_complex[tid, 0], - order, helmholtz_k) + order, helmholtz_k, &jvals[tid, 0], + jscale) result = result + dipstr[isrc] * ( grad_complex[tid, 0] * dipvec[0, isrc] + -- GitLab From 417b57190299a4b4e8b5370b259cc177f84969a0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 12 Sep 2018 10:12:52 -0500 Subject: [PATCH 126/139] Get rid of warning-avoidance hackery: no longer needed? --- pytential/qbx/target_specific/_internal.pyx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 0be53ad8..353a0fc2 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -556,13 +556,3 @@ def eval_target_specific_qbx_locals( grad_complex[tid, 2] * dipvec[2, isrc]) pot[tgt] = pot[tgt] + result - - # The Cython-generated OpenMP loop marks these variables as lastprivate. - # Due to this GCC warns that these could be used without being initialized. - # Initialize them here to suppress the warning.
- result = 0 - tid = 0 - ctr = 0 - src_ibox = tgt_box = 0 - tgt = itgt = itgt_start = itgt_end = 0 - isrc = isrc_box = isrc_start = isrc_end = isrc_box_start = isrc_box_end = 0 -- GitLab From 363d6735ad036ee7d1310b0687a673e4f474370f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 12 Sep 2018 10:13:32 -0500 Subject: [PATCH 127/139] Remove unused variables --- pytential/qbx/performance.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 0ab4f952..2762c2ee 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -553,10 +553,9 @@ class PerformanceModel(object): assert tree.nlevels == len(traversal.from_sep_smaller_by_level) for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + for tgt_icenter in global_qbx_centers: icontaining_tgt_box = qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] + isrc_level][tgt_icenter] if icontaining_tgt_box == -1: continue @@ -581,7 +580,7 @@ class PerformanceModel(object): qbx_center_to_target_box): nl2qbxl_by_level = np.zeros(tree.nlevels, dtype=np.intp) - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + for tgt_icenter in global_qbx_centers: itgt_box = qbx_center_to_target_box[tgt_icenter] tgt_ibox = traversal.target_boxes[itgt_box] level = tree.box_levels[tgt_ibox] @@ -601,7 +600,7 @@ class PerformanceModel(object): center_to_targets_starts): result = 0 - for isrc_center, src_icenter in enumerate(global_qbx_centers): + for src_icenter in global_qbx_centers: start, end = center_to_targets_starts[src_icenter:src_icenter+2] result += (end - start) -- GitLab From a6e40a0a68ae57ef8e090f9e70b19703f8771270 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 15 Sep 2018 00:55:00 -0500 Subject: [PATCH 128/139] Remove futures. 
--- pytential/qbx/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index ad7ed933..91c227e6 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -774,8 +774,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): result.append((o.name, output_array)) - new_futures = [] - return result, new_futures, performance_model_result + return result, performance_model_result # }}} -- GitLab From 9cda7789d653c21303f9a3ef2e9fc7356ebde2db Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 15 Sep 2018 15:06:02 -0500 Subject: [PATCH 129/139] Fix counting for refine_locals. --- pytential/qbx/performance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index f06679d0..deedb240 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -631,7 +631,9 @@ class PerformanceModel(object): # {{{ propagate local_exps downward result["refine_locals"] = ( - traversal.ntarget_or_target_parent_boxes * xlat_cost.l2l()) + # Don't count the root box. + max(traversal.ntarget_or_target_parent_boxes - 1, 0) + * xlat_cost.l2l()) # }}} -- GitLab From 084131f929e4ef86d5ee7615f3f46225fac7c792 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 25 Sep 2018 18:15:29 -0500 Subject: [PATCH 130/139] Allow supplying translation cost models to the performance model through a factory parameter. Remove the uses_pde_expansions argument. --- pytential/qbx/performance.py | 91 ++++++++++++++++++++++------------ test/test_performance_model.py | 86 ++++++++++++++++++-------------- 2 files changed, 108 insertions(+), 69 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 2762c2ee..316de3c0 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -44,6 +44,11 @@ __doc__ = """ .. autoclass:: PerformanceModel .. autoclass:: ParametrizedCosts +.. 
autoclass:: TranslationCostModel + +.. autofunction:: pde_aware_translation_cost_model +.. autofunction:: taylor_translation_cost_model + .. autofunction:: estimate_calibration_params """ @@ -125,6 +130,47 @@ class TranslationCostModel(object): # }}} +# {{{ translation cost model factories + +def pde_aware_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation operators that make use of the + knowledge that the potential satisfies a PDE. + """ + p_qbx = var("p_qbx") + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + + uses_point_and_shoot = False + + ncoeffs_fmm = (p_fmm + 1) ** (dim - 1) + ncoeffs_qbx = (p_qbx + 1) ** (dim - 1) + + if dim == 3: + uses_point_and_shoot = True + + return TranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=uses_point_and_shoot) + + +def taylor_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation based on Taylor expansions + in Cartesian coordinates. + """ + p_qbx = var("p_qbx") + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + + ncoeffs_fmm = (p_fmm + 1) ** dim + ncoeffs_qbx = (p_qbx + 1) ** dim + + return TranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=False) + +# }}} + + # {{{ parameterized costs returned by performance model class ParametrizedCosts(object): @@ -215,15 +261,19 @@ class PerformanceModel(object): """ .. automethod:: with_calibration_params .. automethod:: __call__ + + The performance model relies on a translation cost model. See + :class:`TranslationCostModel` for the translation cost model interface. 
""" - def __init__(self, uses_pde_expansions=True, calibration_params=None): + def __init__(self, + translation_cost_model_factory=pde_aware_translation_cost_model, + calibration_params=None): """ - :arg uses_pde_expansions: A :class:`bool` indicating whether the FMM - uses translation operators that make use of the knowledge that the - potential satisfies a PDE. + :arg translation_cost_model_factory: A callable which, given arguments + (*dim*, *nlevels*), returns a translation cost model. """ - self.uses_pde_expansions = uses_pde_expansions + self.translation_cost_model_factory = translation_cost_model_factory if calibration_params is None: calibration_params = dict() self.calibration_params = calibration_params @@ -231,7 +281,7 @@ class PerformanceModel(object): def with_calibration_params(self, calibration_params): """Return a copy of *self* with a new set of calibration parameters.""" return type(self)( - uses_pde_expansions=self.uses_pde_expansions, + translation_cost_model_factory=self.translation_cost_model_factory, calibration_params=calibration_params) # {{{ form multipoles @@ -610,32 +660,6 @@ class PerformanceModel(object): # }}} - # {{{ set up translation cost model - - def get_translation_cost_model(self, dim, nlevels): - p_qbx = var("p_qbx") - p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) - - uses_point_and_shoot = False - - if self.uses_pde_expansions: - ncoeffs_fmm = (p_fmm + 1) ** (dim-1) - ncoeffs_qbx = (p_qbx + 1) ** (dim-1) - - if dim == 3: - uses_point_and_shoot = True - - else: - ncoeffs_fmm = (p_fmm + 1) ** dim - ncoeffs_qbx = (p_qbx + 1) ** dim - - return TranslationCostModel( - ncoeffs_qbx=ncoeffs_qbx, - ncoeffs_fmm_by_level=ncoeffs_fmm, - uses_point_and_shoot=uses_point_and_shoot) - - # }}} - @log_process(logger, "model performance") def __call__(self, geo_data, kernel, kernel_arguments): """Analyze the given geometry and return performance data. 
@@ -673,7 +697,8 @@ class PerformanceModel(object): params.update(self.calibration_params) - xlat_cost = self.get_translation_cost_model(tree.dimensions, tree.nlevels) + xlat_cost = ( + self.translation_cost_model_factory(tree.dimensions, tree.nlevels)) # {{{ construct local multipoles diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 8f43f7d8..7c1f31a9 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -353,37 +353,36 @@ class ConstantOneQBXExpansionWrangler(ConstantOneExpansionWrangler): # {{{ verify performance model -CONSTANT_ONE_PARAMS = dict( - # Number of QBX coefficients: 1 - p_qbx=0, - c_l2l=1, - c_l2p=1, - c_l2qbxl=1, - c_m2l=1, - c_m2m=1, - c_m2p=1, - c_m2qbxl=1, - c_p2l=1, - c_p2m=1, - c_p2p=1, - c_p2qbxl=1, - c_qbxl2p=1, - c_p2p_tsqbx=1, - ) +class OpCountingTranslationCostModel(object): + """A translation cost model which assigns at cost of 1 to each operation.""" + def __init__(self, dim, nlevels): + pass -def _get_params_for_raw_op_counts(perf_result): - """Return a set of parameters suitable for obtaining raw - operation counts from the model.""" + @staticmethod + def direct(): + return 1 + + p2qbxl = direct + p2p_tsqbx = direct + qbxl2p = direct - # Sets model / calibration parameters equal to 1, to obtain raw op counts. - constant_one_params = CONSTANT_ONE_PARAMS.copy() + @staticmethod + def p2l(level): + return 1 - # Set p_fmm_lev* equal to 0 (sets number of coeffs to 1). 
- for level in range(perf_result.params["nlevels"]): - constant_one_params["p_fmm_lev%d" % level] = 0 + l2p = p2l + p2m = p2l + m2p = p2l + m2qbxl = p2l + l2qbxl = p2l + + @staticmethod + def m2m(src_level, tgt_level): + return 1 - return constant_one_params + l2l = m2m + m2l = m2m @pytest.mark.parametrize("dim, off_surface, use_target_specific_qbx", ( @@ -398,12 +397,12 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) - # We set uses_pde_expansions=False, so that a translation is modeled as - # simply costing nsrc_coeffs * ntgt_coeffs. By adjusting the symbolic - # parameters to equal 1 (done below), this provides a straightforward way - # to obtain the raw operation count for each FMM stage. + perf_model = ( + PerformanceModel( + translation_cost_model_factory=OpCountingTranslationCostModel)) + lpot_source = get_lpot_source(queue, dim).copy( - performance_model=PerformanceModel(uses_pde_expansions=False), + performance_model=perf_model, _use_target_specific_qbx=use_target_specific_qbx) # Construct targets. @@ -430,7 +429,6 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, from pytools import one perf_S = one(op_S.get_modeled_performance(queue, sigma=sigma).values()) - perf_S = perf_S.with_params(_get_params_for_raw_op_counts(perf_S)) # Run FMM with ConstantOneWrangler. This can't be done with pytential's # high-level interface, so call the FMM driver directly. 
@@ -467,6 +465,23 @@ def test_performance_model_correctness(ctx_getter, dim, off_surface, # {{{ test order varying by level +CONSTANT_ONE_PARAMS = dict( + c_l2l=1, + c_l2p=1, + c_l2qbxl=1, + c_m2l=1, + c_m2m=1, + c_m2p=1, + c_m2qbxl=1, + c_p2l=1, + c_p2m=1, + c_p2p=1, + c_p2qbxl=1, + c_qbxl2p=1, + c_p2p_tsqbx=1, + ) + + def test_performance_model_order_varying_by_level(ctx_getter): cl_ctx = ctx_getter() queue = cl.CommandQueue(cl_ctx) @@ -477,7 +492,8 @@ def test_performance_model_order_varying_by_level(ctx_getter): return 1 lpot_source = get_lpot_source(queue, 2).copy( - performance_model=PerformanceModel(uses_pde_expansions=False), + performance_model=PerformanceModel( + calibration_params=CONSTANT_ONE_PARAMS), fmm_level_to_order=level_to_order_constant) sigma_sym = sym.var("sigma") @@ -491,13 +507,11 @@ def test_performance_model_order_varying_by_level(ctx_getter): bind(lpot_source, sym_op) .get_modeled_performance(queue, sigma=sigma).values()) - perf_constant = perf_constant.with_params(CONSTANT_ONE_PARAMS) - # }}} # {{{ varying level to order - varying_order_params = CONSTANT_ONE_PARAMS.copy() + varying_order_params = perf_constant.params.copy() nlevels = perf_constant.params["nlevels"] for level in range(nlevels): -- GitLab From 8b215e4ff47ffffb5e089e62a5fa8e65867eb563 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 30 Sep 2018 19:03:43 -0500 Subject: [PATCH 131/139] Don't collect timing data by default (closes #1) --- pytential/qbx/__init__.py | 33 +++++++++++++++++++++------------ pytential/source.py | 10 +++++++++- pytential/symbolic/execution.py | 6 ++++-- pytential/unregularized.py | 10 +++++++++- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 91c227e6..af7bc29a 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -624,16 +624,16 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ internal functionality for execution - def 
exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate, + return_timing_data): if self.fmm_level_to_order is False: func = self.exec_compute_potential_insn_direct else: func = self.exec_compute_potential_insn_fmm return self._dispatch_compute_potential_insn( - queue, insn, bound_expr, evaluate, func) + queue, insn, bound_expr, evaluate, func, return_timing_data) - def perf_model_compute_potential_insn(self, queue, insn, bound_expr, - evaluate): + def perf_model_compute_potential_insn(self, queue, insn, bound_expr, evaluate): if self.fmm_level_to_order is False: raise NotImplementedError("perf modeling direct evaluations") return self._dispatch_compute_potential_insn( @@ -641,14 +641,14 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.perf_model_compute_potential_insn_fmm) def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, - evaluate, func): + evaluate, func, return_timing_data): from pytools.obj_array import with_object_array_or_scalar if not self._refined_for_global_qbx: from warnings import warn warn( - "Executing global QBX without refinement. " - "This is unlikely to work.") + "Executing global QBX without refinement. 
" + "This is unlikely to work.") def oversample_nonscalars(vec): from numbers import Number @@ -661,7 +661,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - return func(queue, insn, bound_expr, evaluate_wrapper) + return func(queue, insn, bound_expr, evaluate_wrapper, return_timing_data) @property @memoize_method @@ -736,7 +736,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute fmm performance model def perf_model_compute_potential_insn_fmm(self, queue, insn, bound_expr, - evaluate): + evaluate): target_name_and_side_to_number, target_discrs_and_qbx_sides = ( self.get_target_discrs_and_qbx_sides(insn, bound_expr)) @@ -782,7 +782,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute fmm - def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate, + return_timing_data): target_name_and_side_to_number, target_discrs_and_qbx_sides = ( self.get_target_discrs_and_qbx_sides(insn, bound_expr)) @@ -833,7 +834,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX from pytential.qbx.fmm import drive_fmm - timing_data = {} + timing_data = {} if return_timing_data else None all_potentials_on_every_target = drive_fmm(wrangler, strengths, timing_data) # }}} @@ -907,7 +908,15 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): *count = item; """) - def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn_direct(self, queue, insn, bound_expr, evaluate, + return_timing_data): + if return_timing_data: + from pytential.source import UnableToCollectTimingData + from warnings import warn + warn( + "Timing data collection not supported.", + category=UnableToCollectTimingData) + lpot_applier = self.get_lpot_applier(insn.kernels) p2p = None 
lpot_applier_on_tgt_subset = None diff --git a/pytential/source.py b/pytential/source.py index 5bed0656..93d8050e 100644 --- a/pytential/source.py +++ b/pytential/source.py @@ -27,6 +27,7 @@ import numpy as np # noqa: F401 import pyopencl as cl # noqa: F401 import six from pytools import memoize_method +from sumpy.fmm import UnableToCollectTimingData __doc__ = """ @@ -127,7 +128,14 @@ class PointPotentialSource(PotentialSource): evaluate, costs): raise NotImplementedError - def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate, + return_timing_data): + if return_timing_data: + from warnings import warn + warn( + "Timing data collection not supported.", + category=UnableToCollectTimingData) + p2p = None kernel_args = {} diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 5a2e65ea..bbd7308b 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -244,11 +244,13 @@ class EvaluationMapper(EvaluationMapperBase): def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): source = bound_expr.places[insn.source] + return_timing_data = self.timing_data is not None + result, timing_data = ( source.exec_compute_potential_insn( - queue, insn, bound_expr, evaluate)) + queue, insn, bound_expr, evaluate, return_timing_data)) - if self.timing_data is not None: + if return_timing_data: self.timing_data[insn] = timing_data return result diff --git a/pytential/unregularized.py b/pytential/unregularized.py index 607ddc9c..cad85f99 100644 --- a/pytential/unregularized.py +++ b/pytential/unregularized.py @@ -131,7 +131,15 @@ class UnregularizedLayerPotentialSource(LayerPotentialSourceBase): density_discr=density_discr or self.density_discr, debug=debug if debug is not None else self.debug) - def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn(self, queue, insn, 
bound_expr, evaluate, + return_timing_data): + if return_timing_data: + from warnings import warn + from pytential.source import UnableToCollectTimingData + warn( + "Timing data collection not supported.", + category=UnableToCollectTimingData) + from pytools.obj_array import with_object_array_or_scalar def evaluate_wrapper(expr): -- GitLab From 15393135d4e52ac54aee93e3b15e07578f976212 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 30 Sep 2018 19:16:57 -0500 Subject: [PATCH 132/139] Fix perf model dispatching --- pytential/qbx/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index af7bc29a..6ded7e93 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -630,8 +630,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): func = self.exec_compute_potential_insn_direct else: func = self.exec_compute_potential_insn_fmm + + extra_args = {"return_timing_data": return_timing_data} + return self._dispatch_compute_potential_insn( - queue, insn, bound_expr, evaluate, func, return_timing_data) + queue, insn, bound_expr, evaluate, func, extra_args) def perf_model_compute_potential_insn(self, queue, insn, bound_expr, evaluate): if self.fmm_level_to_order is False: @@ -641,7 +644,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.perf_model_compute_potential_insn_fmm) def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, - evaluate, func, return_timing_data): + evaluate, func, extra_args=None): from pytools.obj_array import with_object_array_or_scalar if not self._refined_for_global_qbx: @@ -661,7 +664,10 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - return func(queue, insn, bound_expr, evaluate_wrapper, return_timing_data) + if extra_args is None: + extra_args = {} + + return func(queue, insn, bound_expr, evaluate_wrapper, 
**extra_args) @property @memoize_method -- GitLab From f8b16c37f0cac125a32c0b712848ab0f330fadf9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 22 Oct 2018 19:56:05 -0500 Subject: [PATCH 133/139] TS: Support target derivatives (closes #7) --- pytential/qbx/fmmlib.py | 65 ++- pytential/qbx/target_specific/_internal.h | 3 + pytential/qbx/target_specific/_internal.pyx | 600 +++++++++++++------- test/test_target_specific_qbx.py | 6 +- 4 files changed, 451 insertions(+), 223 deletions(-) diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 15567e0c..a44eb4a4 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -180,7 +180,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @staticmethod def is_supported_helmknl_for_tsqbx(knl): - if isinstance(knl, DirectionalSourceDerivative): + # Supports at most one derivative. + if isinstance(knl, (DirectionalSourceDerivative, AxisTargetDerivative)): knl = knl.inner_kernel return (isinstance(knl, (LaplaceKernel, HelmholtzKernel)) @@ -577,7 +578,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): if not self._use_target_specific_qbx: return self.full_output_zeros() - pot = self.full_output_zeros() geo_data = self.geo_data trav = geo_data.traversal() @@ -585,27 +585,46 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): src_weights = src_weights.astype(np.complex128) - for output in pot: - ts.eval_target_specific_qbx_locals( - order=self.qbx_order, - sources=self._get_single_sources_array(), - targets=geo_data.all_targets(), - centers=self._get_single_centers_array(), - qbx_centers=geo_data.global_qbx_centers(), - qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), - center_to_target_starts=ctt.starts, - center_to_target_lists=ctt.lists, - source_box_starts=trav.neighbor_source_boxes_starts, - source_box_lists=trav.neighbor_source_boxes_lists, - box_source_starts=self.tree.box_source_starts, - 
box_source_counts_nonchild=self.tree.box_source_counts_nonchild, - helmholtz_k=self.kernel_kwargs.get("zk", 0), - charge=src_weights if self.dipole_vec is None else None, - dipstr=src_weights if self.dipole_vec is not None else None, - dipvec=self.dipole_vec, - pot=output) - - return pot + ifcharge = self.dipole_vec is None + ifdipole = self.dipole_vec is not None + + ifpot = any(not output for output in self.outputs) + ifgrad = self.ifgrad + + # Create temporary output arrays for potential / gradient. + pot = np.zeros(self.tree.ntargets, np.complex) if ifpot else None + grad = ( + np.zeros((self.dim, self.tree.ntargets), np.complex) + if ifgrad else None) + + ts.eval_target_specific_qbx_locals( + ifpot=ifpot, + ifgrad=ifgrad, + ifcharge=ifcharge, + ifdipole=ifdipole, + order=self.qbx_order, + sources=self._get_single_sources_array(), + targets=geo_data.all_targets(), + centers=self._get_single_centers_array(), + qbx_centers=geo_data.global_qbx_centers(), + qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), + center_to_target_starts=ctt.starts, + center_to_target_lists=ctt.lists, + source_box_starts=trav.neighbor_source_boxes_starts, + source_box_lists=trav.neighbor_source_boxes_lists, + box_source_starts=self.tree.box_source_starts, + box_source_counts_nonchild=self.tree.box_source_counts_nonchild, + helmholtz_k=self.kernel_kwargs.get("zk", 0), + charge=src_weights, + dipstr=src_weights, + dipvec=self.dipole_vec, + pot=pot, + grad=grad) + + output = self.full_output_zeros() + self.add_potgrad_onto_output(output, slice(None), pot, grad) + + return output def finalize_potentials(self, potential): potential = super(QBXFMMLibExpansionWrangler, self).finalize_potentials( diff --git a/pytential/qbx/target_specific/_internal.h b/pytential/qbx/target_specific/_internal.h index 914b2d05..272302d8 100644 --- a/pytential/qbx/target_specific/_internal.h +++ b/pytential/qbx/target_specific/_internal.h @@ -4,4 +4,7 @@ // Temporary buffer size for holding e.g. 
Legendre polynomial values #define BUFSIZE 64 +// Padding for false sharing prevention +#define PADDING 65 + #endif diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 353a0fc2..6f52c9f7 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -26,8 +26,11 @@ cdef extern from "_helmholtz_utils.h" nogil: cdef extern from "_internal.h" nogil: const int BUFSIZE + const int PADDING +# {{{ (externally visible) wrappers for bessel / hankel functions + def jfuns3d_wrapper(nterms, z, scale, fjs, fjder): """Evaluate spherical Bessel functions. @@ -99,6 +102,10 @@ def h3dall_wrapper(nterms, z, scale, hs, hders): if ifder: hders[:1 + nterms] = hdervec[:] +# }}} + + +# {{{ helpers cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: """Compute the values of the Legendre polynomial up to order n at x. @@ -145,73 +152,29 @@ cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil: cdef double dist(double[3] a, double[3] b) nogil: + """Calculate the Euclidean distance between a and b.""" return sqrt( (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1]) + (a[2] - b[2]) * (a[2] - b[2])) -cdef void tsqbx_laplace_dlp( - double[3] source, - double[3] center, - double[3] target, - double[3] grad, - int order) nogil: - cdef: - int j, m - double sc_d, tc_d, cos_angle, alpha, Rj - double[BUFSIZE] lvals, lderivs - double[3] cms, tmc, grad_tmp - - for m in range(3): - cms[m] = center[m] - source[m] - tmc[m] = target[m] - center[m] - grad[m] = 0 - - tc_d = dist(target, center) - sc_d = dist(source, center) - - alpha = ( - (target[0] - center[0]) * (source[0] - center[0]) + - (target[1] - center[1]) * (source[1] - center[1]) + - (target[2] - center[2]) * (source[2] - center[2])) - - cos_angle = alpha / (tc_d * sc_d) - - # Evaluate the Legendre terms. 
- legvals(cos_angle, order, lvals, lderivs) - - # Invariant: Rj = (t_cd ** j / sc_d ** (j + 2)) - Rj = 1 / (sc_d * sc_d) - - for j in range(0, order + 1): - for m in range(3): - grad_tmp[m] = (j + 1) * (cms[m] / sc_d) * lvals[j] - for m in range(3): - # Siegel and Tornberg has a sign flip here :( - grad_tmp[m] += (tmc[m] / tc_d + cos_angle * cms[m] / sc_d) * lderivs[j] - for m in range(3): - grad[m] += Rj * grad_tmp[m] - - Rj *= (tc_d / sc_d) - - return - - -cdef void tsqbx_helmholtz_precompute( +cdef void ts_helmholtz_precompute( double[3] center, double[3] target, int order, + int ifder, double complex k, - double complex *jvals, + double complex[] jvals, + double complex[] jderivs, double *jscale) nogil: - """Evaluate the source-invariant Bessel terms for the Helmholtz TSQBX - kernel.""" + """Evaluate the source-invariant Bessel terms of the Helmholtz target-specific + expansion.""" cdef: double complex z double tc_d - int ier, ntop, ifder, lwfjs + int ier, ntop, lwfjs int[BUFSIZE] iscale tc_d = dist(target, center) @@ -219,98 +182,29 @@ cdef void tsqbx_helmholtz_precompute( # Evaluate the spherical Bessel terms. z = k * tc_d - ifder = 0 lwfjs = BUFSIZE # jfuns3d_ only supports order > 0 (goes out of bounds if order = 0) order = max(1, order) - jfuns3d_(&ier, &order, &z, jscale, jvals, &ifder, NULL, &lwfjs, iscale, + jfuns3d_(&ier, &order, &z, jscale, jvals, &ifder, jderivs, &lwfjs, iscale, &ntop) if ier: # This could in theory fail. 
- fprintf(stderr, "array passed to jfuns3d was too small\n") + fprintf(stderr, "array passed to jfuns3d_ was too small\n") abort() +# }}} -cdef void tsqbx_helmholtz_dlp( - double[3] source, - double[3] center, - double[3] target, - double complex[3] grad, - int order, - double complex k, - double complex *jvals, - double jscale) nogil: - cdef: - int n, m - int ifder - double sc_d, tc_d, cos_angle, alpha - double[3] cms, tmc - double complex[3] grad_tmp - double[BUFSIZE] lvals, lderivs - double complex z - double complex [BUFSIZE] hvals, hderivs - double hscale, unscale - - for m in range(3): - cms[m] = center[m] - source[m] - tmc[m] = target[m] - center[m] - grad[m] = 0 - - tc_d = dist(target, center) - sc_d = dist(source, center) - - alpha = ( - (target[0] - center[0]) * (source[0] - center[0]) + - (target[1] - center[1]) * (source[1] - center[1]) + - (target[2] - center[2]) * (source[2] - center[2])) - cos_angle = alpha / (tc_d * sc_d) +# {{{ Laplace S - # Evaluate the Legendre terms. - legvals(cos_angle, order, lvals, lderivs) - - # Scaling magic for Hankel terms. - # These values are taken from the fmmlib documentation. - hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 - # unscale = (jscale / hscale) ** n - # Multiply against unscale to remove the scaling. - unscale = 1 - - # Evaluate the spherical Hankel terms. 
- z = k * sc_d - ifder = 1 - h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) - - # - # This is a mess, but amounts to the s-gradient of: - # - # __ order - # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) - # /__ n = 0 n n n - # - # - for n in range(0, order + 1): - for m in range(3): - grad_tmp[m] = -k * cms[m] * hderivs[n] * lvals[n] - for m in range(3): - grad_tmp[m] += ( - (tmc[m] / tc_d) + - cos_angle * (cms[m] / sc_d)) * hvals[n] * lderivs[n] - for m in range(3): - grad[m] += (2 * n + 1) * unscale * (grad_tmp[m] * jvals[n] / sc_d) - unscale *= jscale / hscale - - for m in range(3): - grad[m] *= 1j * k - - return - - -cdef double tsqbx_laplace_slp( +cdef double complex ts_laplace_s( double[3] source, double[3] center, double[3] target, + double complex charge, int order) nogil: + """Evaluate the target-specific expansion of the Laplace single-layer kernel.""" + cdef: double j double result, r, sc_d, tc_d, cos_angle @@ -327,7 +221,7 @@ cdef double tsqbx_laplace_slp( / (tc_d * sc_d)) if order == 0: - return 1 / sc_d + return charge / sc_d pjm2 = 1 pjm1 = cos_angle @@ -349,17 +243,115 @@ cdef double tsqbx_laplace_slp( pjm2 = pjm1 pjm1 = pj - return result + return charge * result + +# }}} + + +# {{{ Laplace grad(S) + +cdef void ts_laplace_sp( + double complex[3] grad, + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order) nogil: + """Evaluate the target-specific expansion of the gradient of the Laplace + single-layer kernel.""" + + cdef: + double[3] grad_tmp + double sc_d, tc_d, cos_angle, Rn + double[BUFSIZE] lvals, lderivs + double[3] smc, tmc + int n + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad_tmp[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Invariant: Rn = tc_d ** (n - 1) / sc_d ** (n + 1) + Rn = 1 
/ (sc_d * sc_d) + + for n in range(1, 1 + order): + for m in range(3): + grad_tmp[m] += Rn * ( + n * (tmc[m] / tc_d) * lvals[n] + + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d) * lderivs[n]) + Rn *= tc_d / sc_d + + for m in range(3): + grad[m] += charge * grad_tmp[m] + +# }}} + + +# {{{ Laplace D + +cdef double complex ts_laplace_d( + double[3] source, + double[3] center, + double[3] target, + double[3] dipole, + double complex dipstr, + int order) nogil: + """Evaluate the target-specific expansion of the Laplace double-layer kernel.""" + + cdef: + int n, m + double sc_d, tc_d, cos_angle, Rn + double[BUFSIZE] lvals, lderivs + double[3] smc, tmc, grad + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Invariant: Rn = (tc_d ** n / sc_d ** (n + 2)) + Rn = 1 / (sc_d * sc_d) + + for n in range(0, order + 1): + for m in range(3): + grad[m] += Rn * ( + -(n + 1) * (smc[m] / sc_d) * lvals[n] + + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d) * lderivs[n]) + Rn *= (tc_d / sc_d) + + return dipstr * ( + dipole[0] * grad[0] + dipole[1] * grad[1] + dipole[2] * grad[2]) +# }}} -cdef double complex tsqbx_helmholtz_slp( + +# {{{ Helmholtz S + +cdef double complex ts_helmholtz_s( double[3] source, double[3] center, double[3] target, + double complex charge, int order, double complex k, - double complex *jvals, + double complex[] jvals, double jscale) nogil: + """Evaluate the target-specific expansion of the Helmholtz single-layer + kernel.""" + cdef: int n, ifder double sc_d, tc_d, cos_angle @@ -398,10 +390,162 @@ cdef double complex tsqbx_helmholtz_slp( result += (2 * n + 1) * unscale * (jvals[n] * hvals[n] * lvals[n]) unscale *= jscale / hscale - return result * 1j * k + return 1j * k * charge * result + +# }}} + + +# {{{ Helmholtz grad(S) 
+ +cdef void ts_helmholtz_sp( + double complex[3] grad, + double[3] source, + double[3] center, + double[3] target, + double complex charge, + int order, + double complex k, + double complex[] jvals, + double complex[] jderivs, + double jscale) nogil: + """Evaluate the target-specific expansion of the gradient of the Helmholtz + single-layer kernel.""" + + cdef: + int n, m + int ifder + double sc_d, tc_d, cos_angle + double[3] smc, tmc + double complex[3] grad_tmp + double[BUFSIZE] lvals, lderivs + double complex z + double complex [BUFSIZE] hvals + double hscale, unscale + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad_tmp[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + # Evaluate the Legendre terms. + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + legvals(cos_angle, order, lvals, lderivs) + + # Scaling magic for Hankel terms. + # These values are taken from the fmmlib documentation. + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. + unscale = 1 + + # Evaluate the spherical Hankel terms. 
+ z = k * sc_d + ifder = 0 + h3dall_(&order, &z, &hscale, hvals, &ifder, NULL) + + # + # This is a mess, but amounts to the t-gradient of: + # + # __ order + # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) + # /__ n = 0 n n n + # + # + for n in range(0, order + 1): + for m in range(3): + grad_tmp[m] += (2 * n + 1) * unscale * hvals[n] / tc_d * ( + k * jderivs[n] * lvals[n] * tmc[m] + + (smc[m] / sc_d - cos_angle * tmc[m] / tc_d) + * jvals[n] * lderivs[n]) + unscale *= jscale / hscale + + for m in range(3): + grad[m] += 1j * k * charge * grad_tmp[m] + +# }}} + + +# {{{ Helmholtz D + +cdef double complex ts_helmholtz_d( + double[3] source, + double[3] center, + double[3] target, + double[3] dipole, + double complex dipstr, + int order, + double complex k, + double complex[] jvals, + double jscale) nogil: + """Evaluate the target-specific expansion of the Helmholtz double-layer + kernel.""" + + cdef: + int n, m + int ifder + double sc_d, tc_d, cos_angle + double[3] smc, tmc + double complex[3] grad + double[BUFSIZE] lvals, lderivs + double complex z + double complex [BUFSIZE] hvals, hderivs + double hscale, unscale + + for m in range(3): + smc[m] = source[m] - center[m] + tmc[m] = target[m] - center[m] + grad[m] = 0 + + tc_d = dist(target, center) + sc_d = dist(source, center) + + cos_angle = (tmc[0] * smc[0] + tmc[1] * smc[1] + tmc[2] * smc[2]) / (tc_d * sc_d) + + # Evaluate the Legendre terms. + legvals(cos_angle, order, lvals, lderivs) + + # Scaling magic for Hankel terms. + # These values are taken from the fmmlib documentation. + hscale = cabs(k * sc_d) if (cabs(k * sc_d) < 1) else 1 + # unscale = (jscale / hscale) ** n + # Multiply against unscale to remove the scaling. + unscale = 1 + + # Evaluate the spherical Hankel terms. 
+ z = k * sc_d + ifder = 1 + h3dall_(&order, &z, &hscale, hvals, &ifder, hderivs) + + # + # This is a mess, but amounts to the s-gradient of: + # + # __ order + # ik \ (2n + 1) j (k |t - c|) h (k |s - c|) P (cos θ) + # /__ n = 0 n n n + # + # + for n in range(0, order + 1): + for m in range(3): + grad[m] += (2 * n + 1) * unscale * jvals[n] / sc_d * ( + k * smc[m] * hderivs[n] * lvals[n] + + (tmc[m] / tc_d - cos_angle * smc[m] / sc_d) + * hvals[n] * lderivs[n]) + unscale *= jscale / hscale + + return 1j * k * dipstr * ( + grad[0] * dipole[0] + grad[1] * dipole[1] + grad[2] * dipole[2]) + +# }}} def eval_target_specific_qbx_locals( + int ifpot, + int ifgrad, + int ifcharge, + int ifdipole, int order, double[:,:] sources, double[:,:] targets, @@ -415,10 +559,15 @@ def eval_target_specific_qbx_locals( double complex[:] charge, double complex[:] dipstr, double[:,:] dipvec, - double complex[:] pot): + double complex[:] pot, + double complex[:,:] grad): """TSQBX entry point. Arguments: + ifpot: Flag indicating whether to evaluate the potential + ifgrad: Flag indicating whether to evaluate the gradient of the potential + ifcharge: Flag indicating whether to include monopole sources + ifdipole: Flag indicating whether to include dipole sources order: Expansion order sources: Array of sources of shape (3, *nsrcs*) targets: Array of targets of shape (3, *ntgts*) @@ -435,7 +584,8 @@ def eval_target_specific_qbx_locals( charge: (Complex) Source strengths, shape (*nsrcs*,), or *None* dipstr: (Complex) Dipole source strengths, shape (*nsrcs*,) or *None* dipvec: (Real) Dipole source orientations, shape (3, *nsrcs*), or *None* - pot: (Complex) Output potential, shape (*ngts*,) + pot: (Complex) Output potential, shape (*ngts*,), or *None* + grad: (Complex) Output gradient, shape (3, *ntgts*), or *None* """ cdef: @@ -444,44 +594,78 @@ def eval_target_specific_qbx_locals( int tgt_box, src_ibox int isrc_box, isrc_box_start, isrc_box_end int isrc, isrc_start, isrc_end - int m, tid + int 
tid double jscale double complex result - double[:,:] source, center, target, grad - double complex[:,:] grad_complex, jvals - int laplace_slp, helmholtz_slp, laplace_dlp, helmholtz_dlp + double[:,:] source, center, target, dipole + double complex[:,:] result_grad, jvals, jderivs + int laplace_s, helmholtz_s, laplace_sp, helmholtz_sp, laplace_d, helmholtz_d + + # {{{ process arguments + + if ifcharge: + if charge is None: + raise ValueError("Missing charge") + + if ifdipole: + if dipstr is None: + raise ValueError("Missing dipstr") + if dipvec is None: + raise ValueError("Missing dipvec") + + if ifdipole and ifgrad: + raise ValueError("Does not support computing gradient of dipole sources") + + if helmholtz_k == 0: + helmholtz_s = helmholtz_sp = helmholtz_d = 0 - if charge is None and (dipstr is None or dipvec is None): - raise ValueError("must specify either charge, or both dipstr and dipvec") + if ifpot: + laplace_s = ifcharge + laplace_d = ifdipole - if charge is not None and (dipstr is not None or dipvec is not None): - raise ValueError("does not support simultaneous monopoles and dipoles") + if ifgrad: + laplace_sp = ifcharge - laplace_slp = (helmholtz_k == 0) and (dipvec is None) - laplace_dlp = (helmholtz_k == 0) and (dipvec is not None) - helmholtz_slp = (helmholtz_k != 0) and (dipvec is None) - helmholtz_dlp = (helmholtz_k != 0) and (dipvec is not None) + else: + laplace_s = laplace_sp = laplace_d = 0 - assert laplace_slp or laplace_dlp or helmholtz_slp or helmholtz_dlp + if ifpot: + helmholtz_s = ifcharge + helmholtz_d = ifdipole + + if ifgrad: + helmholtz_sp = ifcharge + + # }}} + + if not any([ + laplace_s, laplace_sp, laplace_d, helmholtz_s, helmholtz_sp, + helmholtz_d]): + return if qbx_centers.shape[0] == 0: return + # {{{ set up thread-local storage + # Hack to obtain thread-local storage maxthreads = openmp.omp_get_max_threads() - # Prevent false sharing by over-allocating the buffers - source = np.zeros((maxthreads, 65)) - target = 
np.zeros((maxthreads, 65)) - center = np.zeros((maxthreads, 65)) - grad = np.zeros((maxthreads, 65)) - grad_complex = np.zeros((maxthreads, 65), dtype=np.complex) - jvals = np.zeros((maxthreads, BUFSIZE + 65), dtype=np.complex) + # Prevent false sharing by padding the thread-local buffers + source = np.zeros((maxthreads, PADDING)) + target = np.zeros((maxthreads, PADDING)) + center = np.zeros((maxthreads, PADDING)) + dipole = np.zeros((maxthreads, PADDING)) + result_grad = np.zeros((maxthreads, PADDING), dtype=np.complex) + jvals = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex) + jderivs = np.zeros((maxthreads, BUFSIZE + PADDING), dtype=np.complex) # TODO: Check that the order is not too high, since temporary # arrays in this module that are limited by BUFSIZE may overflow # if that is the case + # }}} + for ictr in cython.parallel.prange(0, qbx_centers.shape[0], nogil=True, schedule="static", chunksize=128): @@ -493,20 +677,22 @@ def eval_target_specific_qbx_locals( tgt_box = qbx_center_to_target_box[ctr] tid = cython.parallel.threadid() - for m in range(3): - center[tid, m] = centers[m, ctr] + center[tid, :3] = centers[:, ctr] for itgt in range(itgt_start, itgt_end): result = 0 tgt = center_to_target_lists[itgt] - for m in range(3): - target[tid, m] = targets[m, tgt] + target[tid, :3] = targets[:, tgt] + if ifgrad: + result_grad[tid, :3] = 0 - if helmholtz_slp or helmholtz_dlp: - tsqbx_helmholtz_precompute(&center[tid, 0], &target[tid, 0], - order, helmholtz_k, &jvals[tid, 0], - &jscale) + if helmholtz_s or helmholtz_sp or helmholtz_d: + # Precompute source-invariant Helmholtz terms. 
+ ts_helmholtz_precompute( + &center[tid, 0], &target[tid, 0], + order, ifgrad, helmholtz_k, &jvals[tid, 0], + &jderivs[tid, 0], &jscale) isrc_box_start = source_box_starts[tgt_box] isrc_box_end = source_box_starts[tgt_box + 1] @@ -517,42 +703,58 @@ def eval_target_specific_qbx_locals( isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] for isrc in range(isrc_start, isrc_end): - for m in range(3): - source[tid, m] = sources[m, isrc] + source[tid, :3] = sources[:, isrc] + + if ifdipole: + dipole[tid, :3] = dipvec[:, isrc] # NOTE: Don't use +=, since that makes Cython think we are # doing an OpenMP reduction. - if laplace_slp: - result = result + charge[isrc] * ( - tsqbx_laplace_slp(&source[tid, 0], &center[tid, 0], - &target[tid, 0], order)) - - elif helmholtz_slp: - result = result + charge[isrc] * ( - tsqbx_helmholtz_slp(&source[tid, 0], &center[tid, 0], - &target[tid, 0], order, - helmholtz_k, &jvals[tid, 0], - jscale)) - - elif laplace_dlp: - tsqbx_laplace_dlp(&source[tid, 0], &center[tid, 0], - &target[tid, 0], &grad[tid, 0], order) - - result = result + dipstr[isrc] * ( - grad[tid, 0] * dipvec[0, isrc] + - grad[tid, 1] * dipvec[1, isrc] + - grad[tid, 2] * dipvec[2, isrc]) - - elif helmholtz_dlp: - tsqbx_helmholtz_dlp(&source[tid, 0], &center[tid, 0], - &target[tid, 0], &grad_complex[tid, 0], - order, helmholtz_k, &jvals[tid, 0], - jscale) - - result = result + dipstr[isrc] * ( - grad_complex[tid, 0] * dipvec[0, isrc] + - grad_complex[tid, 1] * dipvec[1, isrc] + - grad_complex[tid, 2] * dipvec[2, isrc]) - - pot[tgt] = pot[tgt] + result + # {{{ evaluate potentials + + if laplace_s: + result = result + ( + ts_laplace_s( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order)) + + if laplace_sp: + ts_laplace_sp( + &result_grad[tid, 0], + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order) + + if laplace_d: + result = result + ( + ts_laplace_d( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + &dipole[tid, 0], 
dipstr[isrc], order)) + + if 
helmholtz_s: + result = result + ( + ts_helmholtz_s(&source[tid, 0], &center[tid, 0], + &target[tid, 0], charge[isrc], order, helmholtz_k, + &jvals[tid, 0], jscale)) + + if helmholtz_sp: + ts_helmholtz_sp( + &result_grad[tid, 0], + &source[tid, 0], &center[tid, 0], &target[tid, 0], + charge[isrc], order, helmholtz_k, + &jvals[tid, 0], &jderivs[tid, 0], jscale) + + if helmholtz_d: + result = result + ( + ts_helmholtz_d( + &source[tid, 0], &center[tid, 0], &target[tid, 0], + &dipole[tid, 0], dipstr[isrc], order, helmholtz_k, + &jvals[tid, 0], jscale)) + + # }}} + + if ifpot: + pot[tgt] = result + + if ifgrad: + grad[:, tgt] = result_grad[tid, :3] diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py index 09982924..d711372d 100644 --- a/test/test_target_specific_qbx.py +++ b/test/test_target_specific_qbx.py @@ -130,7 +130,7 @@ def test_spherical_hankel_functions(): assert np.allclose(hder, hder_expected) -@pytest.mark.parametrize("op", ["S", "D"]) +@pytest.mark.parametrize("op", ["S", "D", "Sp"]) @pytest.mark.parametrize("helmholtz_k", [0, 1.2, 12 + 1.2j]) @pytest.mark.parametrize("qbx_order", [0, 1, 5]) def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): @@ -188,6 +188,10 @@ def test_target_specific_qbx(ctx_getter, op, helmholtz_k, qbx_order): op = sym.S elif op == "D": op = sym.D + elif op == "Sp": + op = sym.Sp + else: + raise ValueError("unknown operator: '%s'" % op) expr = op(kernel, u_sym, qbx_forced_limit=-1, **kernel_kwargs) -- GitLab From bec64ed171bae9aaf57b4890e05fd83d4878f579 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 24 Oct 2018 14:14:40 -0500 Subject: [PATCH 134/139] Cython: avoid overhead of building slices when doing copies --- pytential/qbx/target_specific/_internal.pyx | 22 ++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 6f52c9f7..49492fd2 100644 --- 
a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -594,7 +594,7 @@ def eval_target_specific_qbx_locals( int tgt_box, src_ibox int isrc_box, isrc_box_start, isrc_box_end int isrc, isrc_start, isrc_end - int tid + int tid, m double jscale double complex result double[:,:] source, center, target, dipole @@ -677,15 +677,17 @@ def eval_target_specific_qbx_locals( tgt_box = qbx_center_to_target_box[ctr] tid = cython.parallel.threadid() - center[tid, :3] = centers[:, ctr] + for m in range(3): + center[tid, m] = centers[m, ctr] for itgt in range(itgt_start, itgt_end): result = 0 tgt = center_to_target_lists[itgt] - target[tid, :3] = targets[:, tgt] - if ifgrad: - result_grad[tid, :3] = 0 + for m in range(3): + target[tid, m] = targets[m, tgt] + if ifgrad: + result_grad[tid, m] = 0 if helmholtz_s or helmholtz_sp or helmholtz_d: # Precompute source-invariant Helmholtz terms. @@ -703,10 +705,11 @@ def eval_target_specific_qbx_locals( isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] for isrc in range(isrc_start, isrc_end): - source[tid, :3] = sources[:, isrc] - if ifdipole: - dipole[tid, :3] = dipvec[:, isrc] + for m in range(3): + source[tid, m] = sources[m, isrc] + if ifdipole: + dipole[tid, m] = dipvec[m, isrc] # NOTE: Don't use +=, since that makes Cython think we are # doing an OpenMP reduction. 
@@ -757,4 +760,5 @@ def eval_target_specific_qbx_locals( pot[tgt] = result if ifgrad: - grad[:, tgt] = result_grad[tid, :3] + for m in range(3): + grad[m, tgt] = result_grad[tid, m] -- GitLab From d5ab6a60d66dd670db7e73373d7564a07f6cd735 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 26 Oct 2018 21:42:01 -0500 Subject: [PATCH 135/139] flake8 W504 fixes --- pytential/qbx/performance.py | 6 +++--- test/test_performance_model.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 316de3c0..f3c1ed64 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -119,11 +119,11 @@ class TranslationCostModel(object): if self.uses_point_and_shoot: return ( # Rotate the coordinate system to be z axis aligned. - nsource_coeffs ** (3 / 2) + + nsource_coeffs ** (3 / 2) # Translate the expansion along the z axis. - nsource_coeffs ** (1 / 2) * ntarget_coeffs + + + nsource_coeffs ** (1 / 2) * ntarget_coeffs # Rotate the coordinate system back. - ntarget_coeffs ** (3 / 2)) + + ntarget_coeffs ** (3 / 2)) return nsource_coeffs * ntarget_coeffs diff --git a/test/test_performance_model.py b/test/test_performance_model.py index 7c1f31a9..e096e746 100644 --- a/test/test_performance_model.py +++ b/test/test_performance_model.py @@ -188,8 +188,8 @@ def test_performance_model_parameter_gathering(ctx_getter): for level in range(tree.nlevels): assert ( - perf_S.params["p_fmm_lev%d" % level] == - fmm_level_to_order(k_sym, {"k": 2}, tree, level)) + perf_S.params["p_fmm_lev%d" % level] + == fmm_level_to_order(k_sym, {"k": 2}, tree, level)) # }}} @@ -525,8 +525,8 @@ def test_performance_model_order_varying_by_level(ctx_getter): # case should have larger cost. 
assert ( - sum(perf_varying.get_predicted_times().values()) > - sum(perf_constant.get_predicted_times().values())) + sum(perf_varying.get_predicted_times().values()) + > sum(perf_constant.get_predicted_times().values())) # }}} -- GitLab From e1268d45addcbc4568a9c4de80618291c4fd56e3 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 29 Oct 2018 19:28:04 -0500 Subject: [PATCH 136/139] Update install instructions for TS branch --- doc/misc.rst | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/doc/misc.rst b/doc/misc.rst index 198b87da..78549d32 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -1,21 +1,11 @@ Installation and Usage ====================== -Installing :mod:`pytential` ---------------------------- +Installing :mod:`pytential` (TSQBX support) +------------------------------------------- This set of instructions is intended for 64-bit Linux and macOS computers. -#. Make sure your system has the basics to build software. - - On Debian derivatives (Ubuntu and many more), - installing ``build-essential`` should do the trick. - - On macOS, run ``xcode-select --install`` to install build tools. - - Everywhere else, just making sure you have the ``g++`` package should be - enough. - #. Install your favorite variant of `miniconda <https://conda.io/miniconda.html>`_. (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.) @@ -33,11 +23,12 @@ This set of instructions is intended for 64-bit Linux and macOS computers. #. (*macOS only*) ``conda install osx-pocl-opencl pocl pyopencl`` -#. ``conda install git pip pocl islpy pyopencl sympy pyfmmlib pytest`` +#. ``conda install gcc cython git pip pocl islpy pyopencl sympy pyfmmlib pytest`` -#. Type the following command:: +#.
Type the following commands:: - hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode pytential; do python -m pip install git+https://github.com/inducer/$i; done + hash -r; for i in pymbolic cgen genpy gmsh_interop modepy pyvisfile loopy boxtree sumpy meshmode; do python -m pip install git+https://github.com/inducer/$i; done + CC=gcc python -m pip install git+https://gitlab.tiker.net/inducer/pytential@tsqbx Next time you want to use :mod:`pytential`, just run the following command:: -- GitLab From b124609e8c88e8a247d91665a0287326bdfc4e3e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 29 Oct 2018 19:53:11 -0500 Subject: [PATCH 137/139] Looks like we need build tools after all --- doc/misc.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/misc.rst b/doc/misc.rst index 78549d32..6d589c27 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -6,6 +6,16 @@ Installing :mod:`pytential` (TSQBX support) This set of instructions is intended for 64-bit Linux and macOS computers. +#. Make sure your system has the basics to build software. + + On Debian derivatives (Ubuntu and many more), + installing ``build-essential`` should do the trick. + + On macOS, run ``xcode-select --install`` to install build tools. + + Everywhere else, just making sure you have the ``g++`` package should be + enough. + #. Install your favorite variant of `miniconda <https://conda.io/miniconda.html>`_. (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.)
-- GitLab From 4bb845f3c1f452f0b1417b2953ccf90cb791b961 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 2 Nov 2018 16:08:14 -0500 Subject: [PATCH 138/139] Cython: Set language_level to avoid warning --- pytential/qbx/target_specific/_internal.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/target_specific/_internal.pyx b/pytential/qbx/target_specific/_internal.pyx index 49492fd2..1175e37d 100644 --- a/pytential/qbx/target_specific/_internal.pyx +++ b/pytential/qbx/target_specific/_internal.pyx @@ -1,5 +1,5 @@ #!python -#cython: warn.unused=True, warn.unused_arg=True, warn.unreachable=True, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True +#cython: warn.unused=True, warn.unused_arg=True, warn.unreachable=True, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, embedsignature=True, language_level=3 import numpy as np import cython -- GitLab From ac32f2ee202e2a3fad372176c90c71e305785564 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 2 Nov 2018 16:24:30 -0500 Subject: [PATCH 139/139] Towards MR into master --- doc/misc.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/misc.rst b/doc/misc.rst index 6d589c27..da204eb0 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -1,8 +1,8 @@ Installation and Usage ====================== -Installing :mod:`pytential` (TSQBX support) -------------------------------------------- +Installing :mod:`pytential` +--------------------------- This set of instructions is intended for 64-bit Linux and macOS computers. -- GitLab