diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1467e0c1cdc71a45465b96db3e1b44e50892216e..dab8555f40404cb7cbc91151ca5b60bd5a960955 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -80,6 +80,23 @@ Python 3 POCL: reports: junit: test/pytest.xml +Python 3 POCL Examples: + script: + - test -n "$SKIP_EXAMPLES" && exit + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh + - ". ./build-py-project-and-run-examples.sh" + tags: + - python3 + - pocl + except: + - tags + artifacts: + reports: + junit: test/pytest.xml + Pylint: script: - export PY_EXE=python3 diff --git a/boxtree/cost.py b/boxtree/cost.py new file mode 100644 index 0000000000000000000000000000000000000000..033739b3e16a27d2769c0acfbac7d32c3eb85b0c --- /dev/null +++ b/boxtree/cost.py @@ -0,0 +1,1294 @@ +""" +This module helps predict the running time of each step of FMM. There are two +implementations of the interface :class:`AbstractFMMCostModel`, namely +:class:`CLFMMCostModel` using OpenCL and :class:`PythonFMMCostModel` using pure +Python. + +An implementation of :class:`AbstractFMMCostModel` uses a +:class:`FMMTranslationCostModel` to assign translation costs to a +:class:`boxtree.traversal.FMMTraversalInfo` object. +""" + +from __future__ import division, absolute_import + +__copyright__ = """ +Copyright (C) 2013 Andreas Kloeckner +Copyright (C) 2018 Matt Wala +Copyright (C) 2018 Hao Gao +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import pyopencl as cl +import pyopencl.array # noqa: F401 +from pyopencl.elementwise import ElementwiseKernel +from pyopencl.tools import dtype_to_ctype +from mako.template import Template +from functools import partial +from pymbolic import var, evaluate +from pytools import memoize_method +import sys + +if sys.version_info >= (3, 0): + Template = partial(Template, strict_undefined=True) +else: + Template = partial(Template, strict_undefined=True, disable_unicode=True) + +if sys.version_info >= (3, 4): + from abc import ABC, abstractmethod +else: + from abc import ABCMeta, abstractmethod + ABC = ABCMeta('ABC', (), {}) + + +class FMMTranslationCostModel(object): + """Provides modeled costs for individual translations or evaluations. + + .. note:: Current implementation assumes the calibration parameters are linear + in the modeled cost. For example, + `var("c_p2l") * self.ncoeffs_fmm_by_level[level]` is valid, but + `var("c_p2l") ** 2 * self.ncoeffs_fmm_by_level[level]` is not. + """ + + def __init__(self, ncoeffs_fmm_by_level, uses_point_and_shoot): + self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level + self.uses_point_and_shoot = uses_point_and_shoot + + @staticmethod + def direct(): + return var("c_p2p") + + def p2l(self, level): + return var("c_p2l") * self.ncoeffs_fmm_by_level[level] + + def l2p(self, level): + return var("c_l2p") * self.ncoeffs_fmm_by_level[level] + + def p2m(self, level): + return var("c_p2m") * self.ncoeffs_fmm_by_level[level] + + def m2p(self, level): + return var("c_m2p") * self.ncoeffs_fmm_by_level[level] + + def m2m(self, src_level, tgt_level): + return var("c_m2m") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def l2l(self, src_level, tgt_level): + return var("c_l2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def m2l(self, src_level, tgt_level): + return var("c_m2l") * self.e2e_cost( + self.ncoeffs_fmm_by_level[src_level], + self.ncoeffs_fmm_by_level[tgt_level]) + + def e2e_cost(self, nsource_coeffs, ntarget_coeffs): + if self.uses_point_and_shoot: + return ( + # Rotate the coordinate system to be z axis aligned. + nsource_coeffs ** (3 / 2) + # Translate the expansion along the z axis. + + nsource_coeffs ** (1 / 2) * ntarget_coeffs + # Rotate the coordinate system back. + + ntarget_coeffs ** (3 / 2)) + + return nsource_coeffs * ntarget_coeffs + + +# {{{ translation cost model factories + +def pde_aware_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation operators that make use of the + knowledge that the potential satisfies a PDE. + + For example, this factory is used for complex Taylor and Fourier-Bessel + expansions in 2D, and spherical harmonics (with point-and-shoot) in 3D. + """ + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + ncoeffs_fmm = (p_fmm + 1) ** (dim - 1) + + if dim == 3: + uses_point_and_shoot = True + else: + uses_point_and_shoot = False + + return FMMTranslationCostModel( + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=uses_point_and_shoot + ) + + +def taylor_translation_cost_model(dim, nlevels): + """Create a cost model for FMM translation based on Taylor expansions + in Cartesian coordinates. + """ + p_fmm = np.array([var("p_fmm_lev%d" % i) for i in range(nlevels)]) + ncoeffs_fmm = (p_fmm + 1) ** dim + + return FMMTranslationCostModel( + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=False + ) + +# }}} + + +class AbstractFMMCostModel(ABC): + @abstractmethod + def process_form_multipoles(self, traversal, p2m_cost): + """Cost for forming multipole expansions of each box. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels,) representing the cost of forming the multipole + expansion of one source at each level. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (nsource_boxes,), with each entry represents the cost of the box. + """ + pass + + @abstractmethod + def process_coarsen_multipoles(self, traversal, m2m_cost): + """Cost for upward propagation. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels-1,), where the ith entry represents the + multipole-to-multipole cost from source level i+1 to target level i. + :return: a :class:`float`, the overall cost of upward propagation. + + .. note:: This method returns a number instead of an array, because it is not + immediate clear how per-box cost of upward propagation will be useful for + distributed load balancing. + """ + pass + + @abstractmethod + def get_ndirect_sources_per_target_box(self, traversal): + """Collect the number of direct evaluation sources (list 1, list 3 close and + list 4 close) for each target box. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (ntarget_boxes,), with each entry representing the number of direct + evaluation sources for that target box. + """ + pass + + @abstractmethod + def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + box_target_counts_nonchild=None): + """Direct evaluation cost of each target box of *traversal*. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry + representing the number of direct evaluation sources for that target box. + :arg p2p_cost: a constant representing the cost of one point-to-point + evaluation. + :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets + using direct evaluation in this box. For example, this is useful in QBX + by specifying the number of non-QBX targets. If None, all targets in + boxes are considered. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (ntarget_boxes,), with each entry represents the cost of the box. + """ + pass + + @abstractmethod + def process_list2(self, traversal, m2l_cost): + """ + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels,) representing the translation cost of each level. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (ntarget_or_target_parent_boxes,), with each entry representing the cost + of multipole-to-local translations to this box. + """ + pass + + @abstractmethod + def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + """ + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels,) where the ith entry represents the evaluation cost + from multipole expansion at level i to a point. + :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets + using multiple-to-point translations in this box. For example, this is + useful in QBX by specifying the number of non-QBX targets. If None, all + targets in boxes are considered. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (nboxes,), with each entry representing the cost of evaluating all + targets inside this box from multipole expansions of list-3 boxes. + """ + pass + + @abstractmethod + def process_list4(self, traversal, p2l_cost): + """ + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels,) where the ith entry represents the translation cost + from a point to the local expansion at level i. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (ntarget_or_target_parent_boxes,), with each entry representing the cost + of point-to-local translations to this box. + """ + pass + + @abstractmethod + def process_eval_locals(self, traversal, l2p_cost, + box_target_counts_nonchild=None): + """ + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels,) where the ith entry represents the cost of evaluating + the potential of a target in a box of level i using the box's local + expansion. + :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets + which need evaluation. For example, this is useful in QBX by specifying + the number of non-QBX targets. If None, use + traversal.tree.box_target_counts_nonchild. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (ntarget_boxes,), the cost of evaluating the potentials of all targets + inside this box from its local expansion. + """ + pass + + @abstractmethod + def process_refine_locals(self, traversal, l2l_cost): + """Cost of downward propagation. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape (nlevels-1,), where the ith entry represents the cost of + tranlating local expansion from level i to level i+1. + :return: a :class:`float`, the overall cost of downward propagation. + + .. note:: This method returns a number instead of an array, because it is not + immediate clear how per-box cost of downward propagation will be useful + for distributed load balancing. + """ + pass + + @abstractmethod + def aggregate(self, per_box_result): + """Sum all entries of *per_box_result* into a number. + + :arg per_box_result: an object of :class:`numpy.ndarray` or + :class:`pyopencl.array.Array`, the result to be sumed. + :return: a :class:`float`, the result of the sum. + """ + pass + + def fmm_cost_factors_for_kernels_from_model(self, nlevels, xlat_cost, context): + """Evaluate translation cost factors from symbolic model. The result of this + function can be used for process_* methods in this class. + + :arg nlevels: the number of tree levels. + :arg xlat_cost: a :class:`FMMTranslationCostModel`. + :arg context: a :class:`dict` of parameters passed as context when + evaluating symbolic expressions in *xlat_cost*. + :return: a :class:`dict`, the translation cost of each step in FMM. + """ + return { + "p2m_cost": np.array([ + evaluate(xlat_cost.p2m(ilevel), context=context) + for ilevel in range(nlevels) + ], dtype=np.float64), + "m2m_cost": np.array([ + evaluate(xlat_cost.m2m(ilevel+1, ilevel), context=context) + for ilevel in range(nlevels-1) + ], dtype=np.float64), + "c_p2p": evaluate(xlat_cost.direct(), context=context), + "m2l_cost": np.array([ + evaluate(xlat_cost.m2l(ilevel, ilevel), context=context) + for ilevel in range(nlevels) + ], dtype=np.float64), + "m2p_cost": np.array([ + evaluate(xlat_cost.m2p(ilevel), context=context) + for ilevel in range(nlevels) + ], dtype=np.float64), + "p2l_cost": np.array([ + evaluate(xlat_cost.p2l(ilevel), context=context) + for ilevel in range(nlevels) + ], dtype=np.float64), + "l2l_cost": np.array([ + evaluate(xlat_cost.l2l(ilevel, ilevel+1), context=context) + for ilevel in range(nlevels-1) + ], dtype=np.float64), + "l2p_cost": np.array([ + evaluate(xlat_cost.l2p(ilevel), context=context) + for ilevel in range(nlevels) + ], dtype=np.float64) + } + + def get_fmm_modeled_cost(self, traversal, level_to_order, + ndirect_sources_per_target_box, + calibration_params, + box_target_counts_nonchild=None): + """Predict cost of a new traversal object. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg level_to_order: a :class:`numpy.ndarray` of shape + (traversal.tree.nlevels,) representing the expansion orders + of different levels. + :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of + direct evaluation sources (list 1, list 3 close, list 4 close) for each + target box. You may find :func:`get_ndirect_sources_per_target_box` + helpful. + :arg calibration_params: a :class:`dict` of calibration parameters. These + parameters can be got from `estimate_calibration_params`. + :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets + which need evaluation. For example, this is useful in QBX by specifying + the number of non-QBX targets. If None, all targets are considered, + namely traversal.tree.box_target_counts_nonchild. + :return: a :class:`dict`, the cost of fmm stages. + """ + tree = traversal.tree + result = {} + + for ilevel in range(tree.nlevels): + calibration_params["p_fmm_lev%d" % ilevel] = level_to_order[ilevel] + + xlat_cost = self.translation_cost_model_factory( + tree.dimensions, tree.nlevels + ) + + translation_cost = self.fmm_cost_factors_for_kernels_from_model( + tree.nlevels, xlat_cost, calibration_params + ) + + if box_target_counts_nonchild is None: + box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild + + result["form_multipoles"] = self.process_form_multipoles( + traversal, translation_cost["p2m_cost"] + ) + + result["coarsen_multipoles"] = self.process_coarsen_multipoles( + traversal, translation_cost["m2m_cost"] + ) + + result["eval_direct"] = self.process_direct( + traversal, ndirect_sources_per_target_box, translation_cost["c_p2p"], + box_target_counts_nonchild=box_target_counts_nonchild + ) + + result["multipole_to_local"] = self.process_list2( + traversal, translation_cost["m2l_cost"] + ) + + result["eval_multipoles"] = self.process_list3( + traversal, translation_cost["m2p_cost"], + box_target_counts_nonchild=box_target_counts_nonchild + ) + + result["form_locals"] = self.process_list4( + traversal, translation_cost["p2l_cost"] + ) + + result["refine_locals"] = self.process_refine_locals( + traversal, translation_cost["l2l_cost"] + ) + + result["eval_locals"] = self.process_eval_locals( + traversal, translation_cost["l2p_cost"], + box_target_counts_nonchild=box_target_counts_nonchild + ) + + return result + + def __call__(self, traversal, level_to_order, calibration_params): + """Top-level entry point for predicting cost of a new traversal object. + + Also see :func:`get_fmm_modeled_cost` for more customization. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg level_to_order: a :class:`numpy.ndarray` of shape + (traversal.tree.nlevels,) representing the expansion orders + of different levels. + :arg calibration_params: a :class:`dict` of calibration parameters. These + parameters can be got from `estimate_calibration_params`. + + :return: a :class:`dict`, the cost of fmm stages. + """ + ndirect_sources_per_target_box = ( + self.get_ndirect_sources_per_target_box(traversal) + ) + + return self.get_fmm_modeled_cost( + traversal, level_to_order, ndirect_sources_per_target_box, + calibration_params + ) + + @abstractmethod + def aggregate_stage_costs_per_box(self, traversal, cost_result): + """Given per-stage costs, this method calculates the sum of costs from all + stages for each box. This is used for load balancing in distributed + implementation. + + :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. + :arg cost_result: modeled cost of each stage by + :func:`get_fmm_modeled_cost`. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + (nboxes,), where the ith entry represents the cost of all stages for box + i. + """ + pass + + @staticmethod + def get_constantone_calibration_params(): + return dict( + c_l2l=1.0, + c_l2p=1.0, + c_m2l=1.0, + c_m2m=1.0, + c_m2p=1.0, + c_p2l=1.0, + c_p2m=1.0, + c_p2p=1.0, + ) + + def estimate_calibration_params(self, model_results, timing_results, + time_field_name="wall_elapsed", + additional_stage_to_param_names=()): + """ + :arg model_results: a :class:`list` of the modeled cost for each step of FMM, + returned by :func:`get_fmm_modeled_cost` with constant 1 calibration + parameters. + :arg timing_results: a :class:`list` of the same length as *model_results*. + Each entry is a :class:`dict` filled with timing data returned by + *boxtree.fmm.drive_fmm* + :arg time_field_name: a :class:`str`, the field name from the timing result. + Usually this can be "wall_elapsed" or "process_elapsed". + :arg additional_stage_to_param_names: a :class:`dict` for mapping stage names + to parameter names. This is useful for supplying additional stages of + QBX. + :return: a :class:`dict` of calibration parameters. If there is no model + result for a particular stage, the estimated calibration parameter for + that stage is NaN. + """ + nresults = len(model_results) + assert len(timing_results) == nresults + + _FMM_STAGE_TO_CALIBRATION_PARAMETER = { + "form_multipoles": "c_p2m", + "coarsen_multipoles": "c_m2m", + "eval_direct": "c_p2p", + "multipole_to_local": "c_m2l", + "eval_multipoles": "c_m2p", + "form_locals": "c_p2l", + "refine_locals": "c_l2l", + "eval_locals": "c_l2p" + } + + stage_to_param_names = _FMM_STAGE_TO_CALIBRATION_PARAMETER.copy() + stage_to_param_names.update(additional_stage_to_param_names) + + params = set(stage_to_param_names.values()) + + uncalibrated_times = {} + actual_times = {} + + for param in params: + uncalibrated_times[param] = np.zeros(nresults) + actual_times[param] = np.zeros(nresults) + + for icase, model_result in enumerate(model_results): + for stage_name, param_name in stage_to_param_names.items(): + if stage_name in model_result: + uncalibrated_times[param_name][icase] = ( + self.aggregate(model_result[stage_name])) + + for icase, timing_result in enumerate(timing_results): + for stage_name, time in timing_result.items(): + param_name = stage_to_param_names[stage_name] + actual_times[param_name][icase] = time[time_field_name] + + result = {} + + for param in params: + uncalibrated = uncalibrated_times[param] + actual = actual_times[param] + + if np.allclose(uncalibrated, 0): + result[param] = 0.0 + continue + + result[param] = ( + actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated)) + + return result + + +class CLFMMCostModel(AbstractFMMCostModel): + """ + .. note:: For methods in this class, argument *traversal* should live on device + memory. + """ + def __init__(self, queue, + translation_cost_model_factory=pde_aware_translation_cost_model): + """ + :arg queue: a :class:`pyopencl.CommandQueue` object on which the execution + of this object runs. + :arg translation_cost_model_factory: a function, which takes tree dimension + and the number of tree levels as arguments, returns an object of + :class:`TranslationCostModel`. + """ + self.queue = queue + self.translation_cost_model_factory = translation_cost_model_factory + + # {{{ form multipoles + + @memoize_method + def process_form_multipoles_knl(self, box_id_dtype, particle_id_dtype, + box_level_dtype): + return ElementwiseKernel( + self.queue.context, + Template(r""" + double *np2m, + ${box_id_t} *source_boxes, + ${particle_id_t} *box_source_counts_nonchild, + ${box_level_t} *box_levels, + double *p2m_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = source_boxes[i]; + ${particle_id_t} nsources = box_source_counts_nonchild[box_idx]; + ${box_level_t} ilevel = box_levels[box_idx]; + np2m[i] = nsources * p2m_cost[ilevel]; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_form_multipoles" + ) + + def process_form_multipoles(self, traversal, p2m_cost): + tree = traversal.tree + np2m = cl.array.zeros( + self.queue, len(traversal.source_boxes), dtype=np.float64 + ) + + process_form_multipoles_knl = self.process_form_multipoles_knl( + tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype + ) + + process_form_multipoles_knl( + np2m, + traversal.source_boxes, + tree.box_source_counts_nonchild, + tree.box_levels, + p2m_cost + ) + + return np2m + + # }}} + + # {{{ propagate multipoles upward + + @memoize_method + def process_coarsen_multipoles_knl(self, ndimensions, box_id_dtype, + box_level_dtype, nlevels): + return ElementwiseKernel( + self.queue.context, + Template(r""" + ${box_id_t} *source_parent_boxes, + ${box_level_t} *box_levels, + double *m2m_cost, + double *nm2m, + % for i in range(2**ndimensions): + % if i == 2**ndimensions - 1: + ${box_id_t} *box_child_ids_${i} + % else: + ${box_id_t} *box_child_ids_${i}, + % endif + % endfor + """).render( + ndimensions=ndimensions, + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = source_parent_boxes[i]; + ${box_level_t} target_level = box_levels[box_idx]; + if(target_level <= 1) { + nm2m[i] = 0.0; + } else { + int nchild = 0; + % for i in range(2**ndimensions): + if(box_child_ids_${i}[box_idx]) + nchild += 1; + % endfor + nm2m[i] = nchild * m2m_cost[target_level]; + } + """).render( + ndimensions=ndimensions, + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype), + nlevels=nlevels + ), + name="process_coarsen_multipoles" + ) + + def process_coarsen_multipoles(self, traversal, m2m_cost): + tree = traversal.tree + nm2m = cl.array.zeros( + self.queue, len(traversal.source_parent_boxes), dtype=np.float64 + ) + + process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl( + tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels + ) + + process_coarsen_multipoles_knl( + traversal.source_parent_boxes, + tree.box_levels, + m2m_cost, + nm2m, + *tree.box_child_ids, + queue=self.queue + ) + + return self.aggregate(nm2m) + + # }}} + + # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + + @memoize_method + def _get_ndirect_sources_knl(self, particle_id_dtype, box_id_dtype): + return ElementwiseKernel( + self.queue.context, + Template(""" + ${particle_id_t} *ndirect_sources_by_itgt_box, + ${box_id_t} *source_boxes_starts, + ${box_id_t} *source_boxes_lists, + ${particle_id_t} *box_source_counts_nonchild + """).render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_id_t=dtype_to_ctype(box_id_dtype) + ), + Template(r""" + ${particle_id_t} nsources = 0; + ${box_id_t} source_boxes_start_idx = source_boxes_starts[i]; + ${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1]; + + for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx; + cur_source_boxes_idx < source_boxes_end_idx; + cur_source_boxes_idx++) + { + ${box_id_t} cur_source_box = source_boxes_lists[ + cur_source_boxes_idx + ]; + nsources += box_source_counts_nonchild[cur_source_box]; + } + + ndirect_sources_by_itgt_box[i] += nsources; + """).render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_id_t=dtype_to_ctype(box_id_dtype) + ), + name="get_ndirect_sources" + ) + + def get_ndirect_sources_per_target_box(self, traversal): + tree = traversal.tree + ntarget_boxes = len(traversal.target_boxes) + particle_id_dtype = tree.particle_id_dtype + box_id_dtype = tree.box_id_dtype + + get_ndirect_sources_knl = self._get_ndirect_sources_knl( + particle_id_dtype, box_id_dtype + ) + + ndirect_sources_by_itgt_box = cl.array.zeros( + self.queue, ntarget_boxes, dtype=particle_id_dtype + ) + + # List 1 + get_ndirect_sources_knl( + ndirect_sources_by_itgt_box, + traversal.neighbor_source_boxes_starts, + traversal.neighbor_source_boxes_lists, + tree.box_source_counts_nonchild + ) + + # List 3 close + if traversal.from_sep_close_smaller_starts is not None: + self.queue.finish() + get_ndirect_sources_knl( + ndirect_sources_by_itgt_box, + traversal.from_sep_close_smaller_starts, + traversal.from_sep_close_smaller_lists, + tree.box_source_counts_nonchild + ) + + # List 4 close + if traversal.from_sep_close_bigger_starts is not None: + self.queue.finish() + get_ndirect_sources_knl( + ndirect_sources_by_itgt_box, + traversal.from_sep_close_bigger_starts, + traversal.from_sep_close_bigger_lists, + tree.box_source_counts_nonchild + ) + + return ndirect_sources_by_itgt_box + + def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + box_target_counts_nonchild=None): + if box_target_counts_nonchild is None: + box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild + + from pyopencl.array import take + ntargets_by_itgt_box = take( + box_target_counts_nonchild, + traversal.target_boxes, + queue=self.queue + ) + + return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + @memoize_method + def process_list2_knl(self, box_id_dtype, box_level_dtype): + return ElementwiseKernel( + self.queue.context, + Template(r""" + double *nm2l, + ${box_id_t} *target_or_target_parent_boxes, + ${box_id_t} *from_sep_siblings_starts, + ${box_level_t} *box_levels, + double *m2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} start = from_sep_siblings_starts[i]; + ${box_id_t} end = from_sep_siblings_starts[i+1]; + ${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]]; + + nm2l[i] = (end - start) * m2l_cost[ilevel]; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_list2" + ) + + def process_list2(self, traversal, m2l_cost): + tree = traversal.tree + box_id_dtype = tree.box_id_dtype + box_level_dtype = tree.box_level_dtype + + ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) + nm2l = cl.array.zeros( + self.queue, (ntarget_or_target_parent_boxes,), dtype=np.float64 + ) + + process_list2_knl = self.process_list2_knl(box_id_dtype, box_level_dtype) + process_list2_knl( + nm2l, + traversal.target_or_target_parent_boxes, + traversal.from_sep_siblings_starts, + tree.box_levels, + m2l_cost + ) + + return nm2l + + # }}} + + # {{{ evaluate sep. smaller mpoles ("list 3") at particles + + @memoize_method + def process_list3_knl(self, box_id_dtype, particle_id_dtype): + return ElementwiseKernel( + self.queue.context, + Template(r""" + ${box_id_t} *target_boxes_sep_smaller, + ${box_id_t} *sep_smaller_start, + ${particle_id_t} *box_target_counts_nonchild, + double m2p_cost_current_level, + double *nm2p + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + Template(r""" + ${box_id_t} target_box = target_boxes_sep_smaller[i]; + ${box_id_t} start = sep_smaller_start[i]; + ${box_id_t} end = sep_smaller_start[i+1]; + ${particle_id_t} ntargets = box_target_counts_nonchild[target_box]; + nm2p[target_box] += ( + ntargets * (end - start) * m2p_cost_current_level + ); + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + name="process_list3" + ) + + def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + tree = traversal.tree + nm2p = cl.array.zeros(self.queue, tree.nboxes, dtype=np.float64) + + if box_target_counts_nonchild is None: + box_target_counts_nonchild = tree.box_target_counts_nonchild + + process_list3_knl = self.process_list3_knl( + tree.box_id_dtype, tree.particle_id_dtype + ) + + for ilevel, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): + process_list3_knl( + traversal.target_boxes_sep_smaller_by_source_level[ilevel], + sep_smaller_list.starts, + box_target_counts_nonchild, + m2p_cost[ilevel].get().reshape(-1)[0], + nm2p, + queue=self.queue + ) + self.queue.finish() + + return nm2p + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + @memoize_method + def process_list4_knl(self, box_id_dtype, particle_id_dtype, box_level_dtype): + return ElementwiseKernel( + self.queue.context, + Template(r""" + double *nm2p, + ${box_id_t} *from_sep_bigger_starts, + ${box_id_t} *from_sep_bigger_lists, + ${particle_id_t} *box_source_counts_nonchild, + ${box_level_t} *box_levels, + double *p2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} start = from_sep_bigger_starts[i]; + ${box_id_t} end = from_sep_bigger_starts[i+1]; + for(${box_id_t} idx=start; idx < end; idx++) { + ${box_id_t} src_ibox = from_sep_bigger_lists[idx]; + ${particle_id_t} nsources = box_source_counts_nonchild[src_ibox]; + ${box_level_t} ilevel = box_levels[src_ibox]; + nm2p[i] += nsources * p2l_cost[ilevel]; + } + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_list4" + ) + + def process_list4(self, traversal, p2l_cost): + tree = traversal.tree + target_or_target_parent_boxes = traversal.target_or_target_parent_boxes + nm2p = cl.array.zeros( + self.queue, len(target_or_target_parent_boxes), dtype=np.float64 + ) + + process_list4_knl = self.process_list4_knl( + tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype + ) + + process_list4_knl( + nm2p, + traversal.from_sep_bigger_starts, + traversal.from_sep_bigger_lists, + tree.box_source_counts_nonchild, + tree.box_levels, + p2l_cost + ) + + return nm2p + + # }}} + + # {{{ evaluate local expansions at targets + + @memoize_method + def process_eval_locals_knl(self, box_id_dtype, particle_id_dtype, + box_level_dtype): + return ElementwiseKernel( + self.queue.context, + Template(r""" + double *neval_locals, + ${box_id_t} *target_boxes, + ${particle_id_t} *box_target_counts_nonchild, + ${box_level_t} *box_levels, + double *l2p_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = target_boxes[i]; + ${particle_id_t} ntargets = box_target_counts_nonchild[box_idx]; + ${box_level_t} ilevel = box_levels[box_idx]; + neval_locals[i] = ntargets * l2p_cost[ilevel]; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_eval_locals" + ) + + def process_eval_locals(self, traversal, l2p_cost, + box_target_counts_nonchild=None): + tree = traversal.tree + ntarget_boxes = len(traversal.target_boxes) + neval_locals = cl.array.zeros(self.queue, ntarget_boxes, dtype=np.float64) + + if box_target_counts_nonchild is None: + box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild + + process_eval_locals_knl = self.process_eval_locals_knl( + tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype + ) + + process_eval_locals_knl( + neval_locals, + traversal.target_boxes, + box_target_counts_nonchild, + tree.box_levels, + l2p_cost + ) + + return neval_locals + + # }}} + + # {{{ propogate locals downward + + @memoize_method + def process_refine_locals_knl(self, box_id_dtype): + from pyopencl.reduction import ReductionKernel + return ReductionKernel( + self.queue.context, + np.float64, + neutral="0.0", + reduce_expr="a+b", + map_expr=r""" + (level_start_target_or_target_parent_box_nrs[i + 1] + - level_start_target_or_target_parent_box_nrs[i]) + * l2l_cost[i - 1] + """, + arguments=Template(r""" + ${box_id_t} *level_start_target_or_target_parent_box_nrs, + double *l2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype) + ), + name="process_refine_locals" + ) + + def process_refine_locals(self, traversal, l2l_cost): + tree = traversal.tree + process_refine_locals_knl = self.process_refine_locals_knl(tree.box_id_dtype) + + level_start_target_or_target_parent_box_nrs = cl.array.to_device( + self.queue, traversal.level_start_target_or_target_parent_box_nrs + ) + + cost = process_refine_locals_knl( + level_start_target_or_target_parent_box_nrs, + l2l_cost, + range=slice(1, tree.nlevels) + ).get() + + return cost.reshape(-1)[0] + + # }}} + + def aggregate(self, per_box_result): + if isinstance(per_box_result, float): + return per_box_result + else: + return cl.array.sum(per_box_result).get().reshape(-1)[0] + + def translation_costs_to_dev(self, translation_costs): + """This helper function transfers all :class:`numpy.ndarray` fields in + *translation_costs* to device memory as :class:`pyopencl.array.Array`. + """ + for name in translation_costs: + if not isinstance(translation_costs[name], np.ndarray): + continue + translation_costs[name] = cl.array.to_device( + self.queue, translation_costs[name] + ) + + return translation_costs + + def fmm_cost_factors_for_kernels_from_model(self, nlevels, xlat_cost, context): + translation_costs = ( + AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( + self, nlevels, xlat_cost, context + ) + ) + + return self.translation_costs_to_dev(translation_costs) + + def aggregate_stage_costs_per_box(self, traversal, cost_result): + tree = traversal.tree + nboxes = tree.nboxes + source_boxes = traversal.source_boxes + target_boxes = traversal.target_boxes + target_or_target_parent_boxes = traversal.target_or_target_parent_boxes + + cost_per_box = cl.array.zeros(self.queue, (nboxes,), dtype=np.float64) + + cost_per_box[source_boxes] += cost_result["form_multipoles"] + cost_per_box[target_boxes] += cost_result["eval_direct"] + cost_per_box[target_or_target_parent_boxes] += \ + cost_result["multipole_to_local"] + cost_per_box += cost_result["eval_multipoles"] + cost_per_box[target_or_target_parent_boxes] += cost_result["form_locals"] + cost_per_box[target_boxes] += cost_result["eval_locals"] + + return cost_per_box + + +class PythonFMMCostModel(AbstractFMMCostModel): + def __init__(self, + translation_cost_model_factory=pde_aware_translation_cost_model): + """ + :arg translation_cost_model_factory: a function, which takes tree dimension + and the number of tree levels as arguments, returns an object of + :class:`TranslationCostModel`. + """ + self.translation_cost_model_factory = translation_cost_model_factory + + def process_form_multipoles(self, traversal, p2m_cost): + tree = traversal.tree + np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64) + + for ilevel in range(tree.nlevels): + start, stop = traversal.level_start_source_box_nrs[ilevel:ilevel + 2] + for isrc_box, src_ibox in enumerate( + traversal.source_boxes[start:stop], start): + nsources = tree.box_source_counts_nonchild[src_ibox] + np2m[isrc_box] = nsources * p2m_cost[ilevel] + + return np2m + + def get_ndirect_sources_per_target_box(self, traversal): + tree = traversal.tree + ntarget_boxes = len(traversal.target_boxes) + + # target box index -> nsources + ndirect_sources_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.float64) + + for itgt_box in range(ntarget_boxes): + nsources = 0 + + start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2] + ) + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + # Could be None, if not using targets with extent. + if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2] + ) + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + ndirect_sources_by_itgt_box[itgt_box] = nsources + + return ndirect_sources_by_itgt_box + + def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + box_target_counts_nonchild=None): + if box_target_counts_nonchild is None: + box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild + + ntargets_by_itgt_box = box_target_counts_nonchild[traversal.target_boxes] + + return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost + + def process_list2(self, traversal, m2l_cost): + tree = traversal.tree + ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) + nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64) + + for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): + start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] + + ilevel = tree.box_levels[tgt_ibox] + nm2l[itgt_box] += m2l_cost[ilevel] * (end - start) + + return nm2l + + def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + tree = traversal.tree + nm2p = np.zeros(tree.nboxes, dtype=np.float64) + + if box_target_counts_nonchild is None: + box_target_counts_nonchild = tree.box_target_counts_nonchild + + for ilevel, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): + for itgt_box, tgt_ibox in enumerate( + traversal.target_boxes_sep_smaller_by_source_level[ilevel]): + ntargets = box_target_counts_nonchild[tgt_ibox] + start, end = sep_smaller_list.starts[itgt_box:itgt_box + 2] + nm2p[tgt_ibox] += ntargets * (end - start) * m2p_cost[ilevel] + + return nm2p + + def process_list4(self, traversal, p2l_cost): + tree = traversal.tree + target_or_target_parent_boxes = traversal.target_or_target_parent_boxes + nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64) + + for itgt_box in range(len(target_or_target_parent_boxes)): + start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] + for src_ibox in traversal.from_sep_bigger_lists[start:end]: + nsources = tree.box_source_counts_nonchild[src_ibox] + ilevel = tree.box_levels[src_ibox] + nm2p[itgt_box] += nsources * p2l_cost[ilevel] + + return nm2p + + def process_eval_locals(self, traversal, l2p_cost, + box_target_counts_nonchild=None): + tree = traversal.tree + ntarget_boxes = len(traversal.target_boxes) + neval_locals = np.zeros(ntarget_boxes, dtype=np.float64) + if box_target_counts_nonchild is None: + box_target_counts_nonchild = tree.box_target_counts_nonchild + + for target_lev in range(tree.nlevels): + start, stop = traversal.level_start_target_box_nrs[ + target_lev:target_lev+2] + for itgt_box, tgt_ibox in enumerate( + traversal.target_boxes[start:stop], start): + neval_locals[itgt_box] += (box_target_counts_nonchild[tgt_ibox] + * l2p_cost[target_lev]) + + return neval_locals + + def process_coarsen_multipoles(self, traversal, m2m_cost): + tree = traversal.tree + result = 0.0 + + # nlevels-1 is the last valid level index + # nlevels-2 is the last valid level that could have children + # + # 3 is the last relevant source_level. + # 2 is the last relevant target_level. + # (because no level 1 box will be well-separated from another) + for source_level in range(tree.nlevels-1, 2, -1): + target_level = source_level - 1 + cost = m2m_cost[target_level] + + nmultipoles = 0 + start, stop = traversal.level_start_source_parent_box_nrs[ + target_level:target_level+2] + for ibox in traversal.source_parent_boxes[start:stop]: + for child in tree.box_child_ids[:, ibox]: + if child: + nmultipoles += 1 + + result += cost * nmultipoles + + return result + + def process_refine_locals(self, traversal, l2l_cost): + tree = traversal.tree + result = 0.0 + + for target_lev in range(1, tree.nlevels): + start, stop = traversal.level_start_target_or_target_parent_box_nrs[ + target_lev:target_lev+2] + source_lev = target_lev - 1 + result += (stop-start) * l2l_cost[source_lev] + + return result + + def aggregate(self, per_box_result): + if isinstance(per_box_result, float): + return per_box_result + else: + return np.sum(per_box_result) + + def aggregate_stage_costs_per_box(self, traversal, cost_result): + tree = traversal.tree + nboxes = tree.nboxes + source_boxes = traversal.source_boxes + target_boxes = traversal.target_boxes + target_or_target_parent_boxes = traversal.target_or_target_parent_boxes + + cost_per_box = np.zeros(nboxes, dtype=np.float64) + + cost_per_box[source_boxes] += cost_result["form_multipoles"] + cost_per_box[target_boxes] += cost_result["eval_direct"] + cost_per_box[target_or_target_parent_boxes] += \ + cost_result["multipole_to_local"] + cost_per_box += cost_result["eval_multipoles"] + cost_per_box[target_or_target_parent_boxes] += cost_result["form_locals"] + cost_per_box[target_boxes] += cost_result["eval_locals"] + + return cost_per_box diff --git a/doc/cost.rst b/doc/cost.rst new file mode 100644 index 0000000000000000000000000000000000000000..02acdf9c97cba0774615f7d4ff15a747449475f6 --- /dev/null +++ b/doc/cost.rst @@ -0,0 +1,29 @@ +FMM cost model +============== + +.. module:: boxtree.cost + +.. automodule:: boxtree.cost + +Translation cost model +---------------------- + +.. autoclass:: FMMTranslationCostModel + +.. autofunction:: pde_aware_translation_cost_model + +.. autofunction:: taylor_translation_cost_model + +Cost model interface +-------------------- + +.. autoclass:: AbstractFMMCostModel + :members: + :member-order: bysource + +Cost model implementation +------------------------- + +.. autoclass:: CLFMMCostModel + +.. autoclass:: PythonFMMCostModel \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst index 9d86c8880f6a0c58e13b5f1fffcc3df8b362180f..edff8a3b72b295a07660e3366b93eaab800b0170 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -40,6 +40,7 @@ Overview traversal fmm lookup + cost misc Indices and tables diff --git a/examples/cost_model.py b/examples/cost_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8ebac0559994a79b4f6c1b6f478c79a618b59f47 --- /dev/null +++ b/examples/cost_model.py @@ -0,0 +1,121 @@ +import numpy as np +import pyopencl as cl +import sys + +import logging +import os + +# Configure the root logger +logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) + +logger = logging.getLogger(__name__) + +# Set the logger level of this module to INFO so that logging outputs of this module +# are shown +logger.setLevel(logging.INFO) + +# `process_elapsed` in `ProcessTimer` is only supported for Python >= 3.3 +SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3)) + + +def demo_cost_model(): + if not SUPPORTS_PROCESS_TIME: + raise NotImplementedError( + "Currently this script uses process time which only works on Python>=3.3" + ) + + from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler + + nsources_list = [1000, 2000, 3000, 4000, 5000] + ntargets_list = [1000, 2000, 3000, 4000, 5000] + dims = 3 + dtype = np.float64 + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + traversals = [] + traversals_dev = [] + level_to_orders = [] + timing_results = [] + + def fmm_level_to_nterms(tree, ilevel): + return 10 + + for nsources, ntargets in zip(nsources_list, ntargets_list): + # {{{ Generate sources, targets and target_radii + + from boxtree.tools import make_normal_particle_array as p_normal + sources = p_normal(queue, nsources, dims, dtype, seed=15) + targets = p_normal(queue, ntargets, dims, dtype, seed=18) + + from pyopencl.clrandom import PhiloxGenerator + rng = PhiloxGenerator(queue.context, seed=22) + target_radii = rng.uniform( + queue, ntargets, a=0, b=0.05, dtype=dtype + ).get() + + # }}} + + # {{{ Generate tree and traversal + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + tree, _ = tb( + queue, sources, targets=targets, target_radii=target_radii, + stick_out_factor=0.15, max_particles_in_box=30, debug=True + ) + + from boxtree.traversal import FMMTraversalBuilder + tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) + trav_dev, _ = tg(queue, tree, debug=True) + trav = trav_dev.get(queue=queue) + + traversals.append(trav) + traversals_dev.append(trav_dev) + + # }}} + + wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms) + level_to_orders.append(wrangler.level_nterms) + + timing_data = {} + from boxtree.fmm import drive_fmm + src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) + drive_fmm(trav, wrangler, src_weights, timing_data=timing_data) + + timing_results.append(timing_data) + + time_field_name = "process_elapsed" + + from boxtree.cost import CLFMMCostModel + from boxtree.cost import pde_aware_translation_cost_model + cost_model = CLFMMCostModel(queue, pde_aware_translation_cost_model) + + model_results = [] + for icase in range(len(traversals)-1): + traversal = traversals_dev[icase] + model_results.append( + cost_model(traversal, level_to_orders[icase], + CLFMMCostModel.get_constantone_calibration_params()) + ) + + params = cost_model.estimate_calibration_params( + model_results, timing_results[:-1], time_field_name=time_field_name + ) + + predicted_time = cost_model(traversals_dev[-1], level_to_orders[-1], params) + + for field in ["form_multipoles", "eval_direct", "multipole_to_local", + "eval_multipoles", "form_locals", "eval_locals", + "coarsen_multipoles", "refine_locals"]: + logger.info("predicted time for {0}: {1}".format( + field, str(cost_model.aggregate(predicted_time[field])) + )) + logger.info("actual time for {0}: {1}".format( + field, str(timing_results[-1][field]["process_elapsed"]) + )) + + +if __name__ == '__main__': + demo_cost_model() diff --git a/test/test_cost_model.py b/test/test_cost_model.py new file mode 100644 index 0000000000000000000000000000000000000000..e575d10a8af1f44050c55ad3983c940aa7becbf1 --- /dev/null +++ b/test/test_cost_model.py @@ -0,0 +1,598 @@ +import numpy as np +import pyopencl as cl +import time + +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from pymbolic import evaluate +from boxtree.cost import CLFMMCostModel, PythonFMMCostModel +from boxtree.cost import pde_aware_translation_cost_model +import sys + +import logging +import os +logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3)) + + +@pytest.mark.opencl +@pytest.mark.parametrize( + ("nsources", "ntargets", "dims", "dtype"), [ + (50000, 50000, 3, np.float64) + ] +) +def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dtype): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # {{{ Generate sources, targets and target_radii + + from boxtree.tools import make_normal_particle_array as p_normal + sources = p_normal(queue, nsources, dims, dtype, seed=15) + targets = p_normal(queue, ntargets, dims, dtype, seed=18) + + from pyopencl.clrandom import PhiloxGenerator + rng = PhiloxGenerator(queue.context, seed=22) + target_radii = rng.uniform( + queue, ntargets, a=0, b=0.05, dtype=dtype + ).get() + + # }}} + + # {{{ Generate tree and traversal + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + tree, _ = tb( + queue, sources, targets=targets, target_radii=target_radii, + stick_out_factor=0.15, max_particles_in_box=30, debug=True + ) + + from boxtree.traversal import FMMTraversalBuilder + tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) + trav_dev, _ = tg(queue, tree, debug=True) + trav = trav_dev.get(queue=queue) + + # }}} + + # {{{ Construct cost models + + cl_cost_model = CLFMMCostModel(queue, None) + python_cost_model = PythonFMMCostModel(None) + + constant_one_params = cl_cost_model.get_constantone_calibration_params().copy() + for ilevel in range(trav.tree.nlevels): + constant_one_params["p_fmm_lev%d" % ilevel] = 10 + + xlat_cost = pde_aware_translation_cost_model(dims, trav.tree.nlevels) + + # }}} + + # {{{ Test process_form_multipoles + + nlevels = trav.tree.nlevels + p2m_cost = np.zeros(nlevels, dtype=np.float64) + for ilevel in range(nlevels): + p2m_cost[ilevel] = evaluate( + xlat_cost.p2m(ilevel), + context=constant_one_params + ) + p2m_cost_dev = cl.array.to_device(queue, p2m_cost) + + queue.finish() + start_time = time.time() + + cl_form_multipoles = cl_cost_model.process_form_multipoles( + trav_dev, p2m_cost_dev + ) + + queue.finish() + logger.info("OpenCL time for process_form_multipoles: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_form_multipoles = python_cost_model.process_form_multipoles( + trav, p2m_cost + ) + + logger.info("Python time for process_form_multipoles: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_form_multipoles.get(), python_form_multipoles) + + # }}} + + # {{{ Test process_coarsen_multipoles + + m2m_cost = np.zeros(nlevels - 1, dtype=np.float64) + for target_level in range(nlevels - 1): + m2m_cost[target_level] = evaluate( + xlat_cost.m2m(target_level + 1, target_level), + context=constant_one_params + ) + m2m_cost_dev = cl.array.to_device(queue, m2m_cost) + + queue.finish() + start_time = time.time() + cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles( + trav_dev, m2m_cost_dev + ) + + queue.finish() + logger.info("OpenCL time for coarsen_multipoles: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles( + trav, m2m_cost + ) + + logger.info("Python time for coarsen_multipoles: {0}".format( + str(time.time() - start_time) + )) + + assert cl_coarsen_multipoles == python_coarsen_multipoles + + # }}} + + # {{{ Test process_direct + + queue.finish() + start_time = time.time() + + cl_ndirect_sources_per_target_box = \ + cl_cost_model.get_ndirect_sources_per_target_box(trav_dev) + + cl_direct = cl_cost_model.process_direct( + trav_dev, cl_ndirect_sources_per_target_box, 5.0 + ) + + queue.finish() + logger.info("OpenCL time for process_direct: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_ndirect_sources_per_target_box = \ + python_cost_model.get_ndirect_sources_per_target_box(trav) + + python_direct = python_cost_model.process_direct( + trav, python_ndirect_sources_per_target_box, 5.0 + ) + + logger.info("Python time for process_direct: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_direct.get(), python_direct) + + # }}} + + # {{{ Test aggregate + + start_time = time.time() + + cl_direct_aggregate = cl_cost_model.aggregate(cl_direct) + + queue.finish() + logger.info("OpenCL time for aggregate: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_direct_aggregate = python_cost_model.aggregate(python_direct) + + logger.info("Python time for aggregate: {0}".format( + str(time.time() - start_time) + )) + + assert cl_direct_aggregate == python_direct_aggregate + + # }}} + + # {{{ Test process_list2 + + nlevels = trav.tree.nlevels + m2l_cost = np.zeros(nlevels, dtype=np.float64) + for ilevel in range(nlevels): + m2l_cost[ilevel] = evaluate( + xlat_cost.m2l(ilevel, ilevel), + context=constant_one_params + ) + m2l_cost_dev = cl.array.to_device(queue, m2l_cost) + + queue.finish() + start_time = time.time() + + cl_m2l_cost = cl_cost_model.process_list2(trav_dev, m2l_cost_dev) + + queue.finish() + logger.info("OpenCL time for process_list2: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + python_m2l_cost = python_cost_model.process_list2(trav, m2l_cost) + logger.info("Python time for process_list2: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_m2l_cost.get(), python_m2l_cost) + + # }}} + + # {{{ Test process_list 3 + + m2p_cost = np.zeros(nlevels, dtype=np.float64) + for ilevel in range(nlevels): + m2p_cost[ilevel] = evaluate( + xlat_cost.m2p(ilevel), + context=constant_one_params + ) + m2p_cost_dev = cl.array.to_device(queue, m2p_cost) + + queue.finish() + start_time = time.time() + + cl_m2p_cost = cl_cost_model.process_list3(trav_dev, m2p_cost_dev) + + queue.finish() + logger.info("OpenCL time for process_list3: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + python_m2p_cost = python_cost_model.process_list3(trav, m2p_cost) + logger.info("Python time for process_list3: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_m2p_cost.get(), python_m2p_cost) + + # }}} + + # {{{ Test process_list4 + + p2l_cost = np.zeros(nlevels, dtype=np.float64) + for ilevel in range(nlevels): + p2l_cost[ilevel] = evaluate( + xlat_cost.p2l(ilevel), + context=constant_one_params + ) + p2l_cost_dev = cl.array.to_device(queue, p2l_cost) + + queue.finish() + start_time = time.time() + + cl_p2l_cost = cl_cost_model.process_list4(trav_dev, p2l_cost_dev) + + queue.finish() + logger.info("OpenCL time for process_list4: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + python_p2l_cost = python_cost_model.process_list4(trav, p2l_cost) + logger.info("Python time for process_list4: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_p2l_cost.get(), python_p2l_cost) + + # }}} + + # {{{ Test process_refine_locals + + l2l_cost = np.zeros(nlevels - 1, dtype=np.float64) + for ilevel in range(nlevels - 1): + l2l_cost[ilevel] = evaluate( + xlat_cost.l2l(ilevel, ilevel + 1), + context=constant_one_params + ) + l2l_cost_dev = cl.array.to_device(queue, l2l_cost) + + queue.finish() + start_time = time.time() + + cl_refine_locals_cost = cl_cost_model.process_refine_locals( + trav_dev, l2l_cost_dev + ) + + queue.finish() + logger.info("OpenCL time for refine_locals: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + python_refine_locals_cost = python_cost_model.process_refine_locals( + trav, l2l_cost + ) + logger.info("Python time for refine_locals: {0}".format( + str(time.time() - start_time) + )) + + assert cl_refine_locals_cost == python_refine_locals_cost + + # }}} + + # {{{ Test process_eval_locals + + l2p_cost = np.zeros(nlevels, dtype=np.float64) + for ilevel in range(nlevels): + l2p_cost[ilevel] = evaluate( + xlat_cost.l2p(ilevel), + context=constant_one_params + ) + l2p_cost_dev = cl.array.to_device(queue, l2p_cost) + + queue.finish() + start_time = time.time() + + cl_l2p_cost = cl_cost_model.process_eval_locals(trav_dev, l2p_cost_dev) + + queue.finish() + logger.info("OpenCL time for process_eval_locals: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + python_l2p_cost = python_cost_model.process_eval_locals(trav, l2p_cost) + logger.info("Python time for process_eval_locals: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_l2p_cost.get(), python_l2p_cost) + + # }}} + + +@pytest.mark.opencl +def test_estimate_calibration_params(ctx_factory): + from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler + + nsources_list = [1000, 2000, 3000, 4000] + ntargets_list = [1000, 2000, 3000, 4000] + dims = 3 + dtype = np.float64 + + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + traversals = [] + traversals_dev = [] + level_to_orders = [] + timing_results = [] + + def fmm_level_to_nterms(tree, ilevel): + return 10 + + for nsources, ntargets in zip(nsources_list, ntargets_list): + # {{{ Generate sources, targets and target_radii + + from boxtree.tools import make_normal_particle_array as p_normal + sources = p_normal(queue, nsources, dims, dtype, seed=15) + targets = p_normal(queue, ntargets, dims, dtype, seed=18) + + from pyopencl.clrandom import PhiloxGenerator + rng = PhiloxGenerator(queue.context, seed=22) + target_radii = rng.uniform( + queue, ntargets, a=0, b=0.05, dtype=dtype + ).get() + + # }}} + + # {{{ Generate tree and traversal + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + tree, _ = tb( + queue, sources, targets=targets, target_radii=target_radii, + stick_out_factor=0.15, max_particles_in_box=30, debug=True + ) + + from boxtree.traversal import FMMTraversalBuilder + tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) + trav_dev, _ = tg(queue, tree, debug=True) + trav = trav_dev.get(queue=queue) + + traversals.append(trav) + traversals_dev.append(trav_dev) + + # }}} + + wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms) + level_to_orders.append(wrangler.level_nterms) + + timing_data = {} + from boxtree.fmm import drive_fmm + src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) + drive_fmm(trav, wrangler, src_weights, timing_data=timing_data) + + timing_results.append(timing_data) + + if SUPPORTS_PROCESS_TIME: + time_field_name = "process_elapsed" + else: + time_field_name = "wall_elapsed" + + def test_params_sanity(test_params): + param_names = ["c_p2m", "c_m2m", "c_p2p", "c_m2l", "c_m2p", "c_p2l", "c_l2l", + "c_l2p"] + for name in param_names: + assert isinstance(test_params[name], np.float64) + + def test_params_equal(test_params1, test_params2): + param_names = ["c_p2m", "c_m2m", "c_p2p", "c_m2l", "c_m2p", "c_p2l", "c_l2l", + "c_l2p"] + for name in param_names: + assert test_params1[name] == test_params2[name] + + python_cost_model = PythonFMMCostModel(pde_aware_translation_cost_model) + + python_model_results = [] + + for icase in range(len(traversals)-1): + traversal = traversals[icase] + level_to_order = level_to_orders[icase] + + python_model_results.append(python_cost_model( + traversal, level_to_order, + PythonFMMCostModel.get_constantone_calibration_params() + )) + + python_params = python_cost_model.estimate_calibration_params( + python_model_results, timing_results[:-1], time_field_name=time_field_name + ) + + test_params_sanity(python_params) + + cl_cost_model = CLFMMCostModel( + queue, + pde_aware_translation_cost_model + ) + + cl_model_results = [] + + for icase in range(len(traversals_dev)-1): + traversal = traversals_dev[icase] + level_to_order = level_to_orders[icase] + + cl_model_results.append(cl_cost_model( + traversal, level_to_order, + CLFMMCostModel.get_constantone_calibration_params() + )) + + cl_params = cl_cost_model.estimate_calibration_params( + cl_model_results, timing_results[:-1], time_field_name=time_field_name + ) + test_params_sanity(cl_params) + + if SUPPORTS_PROCESS_TIME: + test_params_equal(cl_params, python_params) + + +class OpCountingTranslationCostModel(object): + """A translation cost model which assigns at cost of 1 to each operation.""" + + def __init__(self, dim, nlevels): + pass + + @staticmethod + def direct(): + return 1 + + @staticmethod + def p2l(level): + return 1 + + l2p = p2l + p2m = p2l + m2p = p2l + + @staticmethod + def m2m(src_level, tgt_level): + return 1 + + l2l = m2m + m2l = m2m + + +@pytest.mark.opencl +@pytest.mark.parametrize( + ("nsources", "ntargets", "dims", "dtype"), [ + (5000, 5000, 3, np.float64) + ] +) +def test_cost_model_gives_correct_op_counts_with_constantone_wrangler( + ctx_factory, nsources, ntargets, dims, dtype): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from boxtree.tools import make_normal_particle_array as p_normal + sources = p_normal(queue, nsources, dims, dtype, seed=16) + targets = p_normal(queue, ntargets, dims, dtype, seed=19) + + from pyopencl.clrandom import PhiloxGenerator + rng = PhiloxGenerator(queue.context, seed=20) + target_radii = rng.uniform( + queue, ntargets, a=0, b=0.04, dtype=dtype + ).get() + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + tree, _ = tb( + queue, sources, targets=targets, target_radii=target_radii, + stick_out_factor=0.15, max_particles_in_box=30, debug=True + ) + + from boxtree.traversal import FMMTraversalBuilder + tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) + trav_dev, _ = tg(queue, tree, debug=True) + trav = trav_dev.get(queue=queue) + + from boxtree.tools import ConstantOneExpansionWrangler + wrangler = ConstantOneExpansionWrangler(trav.tree) + + timing_data = {} + from boxtree.fmm import drive_fmm + src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) + drive_fmm(trav, wrangler, src_weights, timing_data=timing_data) + + cost_model = CLFMMCostModel( + queue, + translation_cost_model_factory=OpCountingTranslationCostModel + ) + + level_to_order = np.array([1 for _ in range(tree.nlevels)]) + + modeled_time = cost_model( + trav_dev, level_to_order, + CLFMMCostModel.get_constantone_calibration_params() + ) + + mismatches = [] + for stage in timing_data: + if (timing_data[stage]["ops_elapsed"] + != cost_model.aggregate(modeled_time[stage])): + mismatches.append( + (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage])) + + assert not mismatches, "\n".join(str(s) for s in mismatches) + + # {{{ Test aggregate_stage_costs_per_box + + total_cost = 0.0 + for stage in timing_data: + total_cost += timing_data[stage]["ops_elapsed"] + + per_box_cost = cost_model.aggregate_stage_costs_per_box(trav_dev, modeled_time) + total_aggregate_cost = cost_model.aggregate(per_box_cost) + assert total_cost == ( + total_aggregate_cost + + modeled_time["coarsen_multipoles"] + + modeled_time["refine_locals"] + ) + + # }}} + + +# You can test individual routines by typing +# $ python test_cost_model.py 'test_routine(cl.create_some_context)' + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__])