diff --git a/examples/cost.py b/examples/cost.py index 0070aaa6402f0233acd254cccaf2f0a8f8fa0fb3..791cd7f9d7d7c34a93edc51e7eca6eca0d861ac1 100644 --- a/examples/cost.py +++ b/examples/cost.py @@ -1,3 +1,30 @@ +from __future__ import division, print_function + +__copyright__ = """ + Copyright (C) 2018 Matt Wala + Copyright (C) 2019 Hao Gao +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + """Calibrates a cost model and reports on the accuracy.""" import pyopencl as cl @@ -6,6 +33,7 @@ from meshmode.array_context import PyOpenCLArrayContext from meshmode.dof_array import thaw from pytential import sym, bind +from pytential.qbx.cost import QBXCostModel from pytools import one @@ -90,9 +118,7 @@ def get_test_density(actx, density_discr): def calibrate_cost_model(ctx): queue = cl.CommandQueue(ctx) actx = PyOpenCLArrayContext(queue) - - from pytential.qbx.cost import CostModel, estimate_calibration_params - cost_model = CostModel() + cost_model = QBXCostModel() model_results = [] timing_results = [] @@ -107,7 +133,7 @@ def calibrate_cost_model(ctx): bound_op = get_bound_op(places) sigma = get_test_density(actx, density_discr) - cost_S = bound_op.get_modeled_cost(actx, sigma=sigma) + modeled_cost, _ = bound_op.cost_per_stage("constant_one", sigma=sigma) # Warm-up run. bound_op.eval({"sigma": sigma}, array_context=actx) @@ -117,18 +143,20 @@ def calibrate_cost_model(ctx): bound_op.eval({"sigma": sigma}, array_context=actx, timing_data=timing_data) - model_results.append(one(cost_S.values())) - timing_results.append(one(timing_data.values())) + model_results.append(modeled_cost) + timing_results.append(timing_data) - calibration_params = ( - estimate_calibration_params(model_results, timing_results)) + calibration_params = cost_model.estimate_kernel_specific_calibration_params( + model_results, timing_results, time_field_name="process_elapsed" + ) - return cost_model.with_calibration_params(calibration_params) + return calibration_params -def test_cost_model(ctx, cost_model): +def test_cost_model(ctx, calibration_params): queue = cl.CommandQueue(ctx) actx = PyOpenCLArrayContext(queue) + cost_model = QBXCostModel() for lpot_source in test_geometries(actx): lpot_source = lpot_source.copy(cost_model=cost_model) @@ -140,10 +168,8 @@ def test_cost_model(ctx, cost_model): bound_op = get_bound_op(places) sigma = get_test_density(actx, density_discr) - cost_S = bound_op.get_modeled_cost(actx, sigma=sigma) - model_result = ( - one(cost_S.values()) - .get_predicted_times(merge_close_lists=True)) + cost_S, _ = bound_op.cost_per_stage(calibration_params, sigma=sigma) + model_result = one(cost_S.values()) # Warm-up run. bound_op.eval({"sigma": sigma}, array_context=actx) @@ -168,15 +194,19 @@ def test_cost_model(ctx, cost_model): row = [ stage, "%.2f" % timing_result[stage], - "%.2f" % model_result[stage]] + "%.2f" % model_result[stage] + ] table.add_row(row) print(table) def predict_cost(ctx): - model = calibrate_cost_model(ctx) - test_cost_model(ctx, model) + import logging + logging.basicConfig(level=logging.WARNING) # INFO for more progress info + + params = calibrate_cost_model(ctx) + test_cost_model(ctx, params) if __name__ == "__main__": diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index db87e79acae37e6423784b6fc0a7d57dd1ce8e91..a2ebfe8a834211b41fc69f9e891c297dcd58efba 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -108,9 +108,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): :arg _use_target_specific_qbx: Whether to use target-specific acceleration by default if possible. *None* means "use if possible". - :arg cost_model: Either *None* or instance of - :class:`~pytential.qbx.cost.CostModel`, used for gathering modeled - costs (experimental) + :arg cost_model: Either *None* or an object implementing the + :class:`~pytential.qbx.cost.AbstractQBXCostModel` interface, used for + gathering modeled costs if provided (experimental) """ # {{{ argument processing @@ -213,8 +213,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self.geometry_data_inspector = geometry_data_inspector if cost_model is None: - from pytential.qbx.cost import CostModel - cost_model = CostModel() + from pytential.qbx.cost import QBXCostModel + cost_model = QBXCostModel() self.cost_model = cost_model @@ -437,7 +437,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ internal functionality for execution - def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate, + def exec_compute_potential_insn(self, actx, insn, bound_expr, evaluate, return_timing_data): extra_args = {} @@ -460,40 +460,57 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): extra_args["fmm_driver"] = drive_fmm return self._dispatch_compute_potential_insn( - queue, insn, bound_expr, evaluate, func, extra_args) + actx, insn, bound_expr, evaluate, func, extra_args) - def cost_model_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + def cost_model_compute_potential_insn(self, actx, insn, bound_expr, evaluate, + calibration_params, per_box): """Using :attr:`cost_model`, evaluate the cost of executing *insn*. Cost model results are gathered in :attr:`pytential.symbolic.execution.BoundExpression.modeled_cost` along the way. + :arg calibration_params: a :class:`dict` of calibration parameters, mapping + from parameter names to calibration values. + :arg per_box: if *true*, cost model result will be a :class:`numpy.ndarray` + or :class:`pyopencl.array.Array` with shape of the number of boxes, where + the ith entry is the sum of the cost of all stages for box i. If *false*, + cost model result will be a :class:`dict`, mapping from the stage name to + predicted cost of the stage for all boxes. + :returns: whatever :meth:`exec_compute_potential_insn_fmm` returns. """ - if self.fmm_level_to_order is False: raise NotImplementedError("perf modeling direct evaluations") def drive_cost_model( wrangler, strengths, geo_data, kernel, kernel_arguments): del strengths - cost_model_result = ( - self.cost_model(wrangler, geo_data, kernel, kernel_arguments)) - from pytools.obj_array import obj_array_vectorize - output_placeholder = obj_array_vectorize( - wrangler.finalize_potentials, - wrangler.full_output_zeros() - ) + if per_box: + cost_model_result, metadata = self.cost_model.qbx_cost_per_box( + actx.queue, geo_data, kernel, kernel_arguments, + calibration_params + ) + else: + cost_model_result, metadata = self.cost_model.qbx_cost_per_stage( + actx.queue, geo_data, kernel, kernel_arguments, + calibration_params + ) - return output_placeholder, cost_model_result + from pytools.obj_array import obj_array_vectorize + return ( + obj_array_vectorize( + wrangler.finalize_potentials, + wrangler.full_output_zeros()), + (cost_model_result, metadata)) return self._dispatch_compute_potential_insn( - queue, insn, bound_expr, evaluate, - self.exec_compute_potential_insn_fmm, - extra_args={"fmm_driver": drive_cost_model}) + actx, insn, bound_expr, evaluate, + self.exec_compute_potential_insn_fmm, + extra_args={"fmm_driver": drive_cost_model} + ) - def _dispatch_compute_potential_insn(self, queue, insn, bound_expr, + def _dispatch_compute_potential_insn(self, actx, insn, bound_expr, evaluate, func, extra_args=None): if self._disable_refinement: from warnings import warn @@ -504,7 +521,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): if extra_args is None: extra_args = {} - return func(queue, insn, bound_expr, evaluate, **extra_args) + return func(actx, insn, bound_expr, evaluate, **extra_args) # {{{ fmm-based execution diff --git a/pytential/qbx/cost.py b/pytential/qbx/cost.py index 46244b540843e63a5868d53ab098ae8a6102c3a6..6071dea5c86afd04f3678718b7fef62deda95489 100644 --- a/pytential/qbx/cost.py +++ b/pytential/qbx/cost.py @@ -3,6 +3,7 @@ from __future__ import division, absolute_import __copyright__ = """ Copyright (C) 2013 Andreas Kloeckner Copyright (C) 2018 Matt Wala +Copyright (C) 2019 Hao Gao """ __license__ = """ @@ -25,17 +26,30 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from collections import OrderedDict - -import logging - +from six.moves import range import numpy as np import pyopencl as cl -from six.moves import range - -from pytools import log_process -from pymbolic import var +import pyopencl.array # noqa: F401 +from pyopencl.array import take +from pyopencl.elementwise import ElementwiseKernel +from pyopencl.tools import dtype_to_ctype +from mako.template import Template +from pymbolic import var, evaluate +from pytools import memoize_method +from functools import partial +import sys + +from boxtree.cost import ( + FMMTranslationCostModel, AbstractFMMCostModel, FMMCostModel, _PythonFMMCostModel +) +from abc import abstractmethod + +if sys.version_info >= (3, 0): + Template = partial(Template, strict_undefined=True) +else: + Template = partial(Template, strict_undefined=True, disable_unicode=True) +import logging logger = logging.getLogger(__name__) @@ -45,31 +59,72 @@ __doc__ = """ This module is experimental. Its interface is subject to change until this notice is removed. -.. autoclass:: CostModel -.. autoclass:: ParametrizedCosts +This module helps predict the running time of each step of QBX, as an extension of +the similar module :mod:`boxtree.cost` in boxtree. + +:class:`QBXTranslationCostModel` describes the translation or evaluation cost of a +single operation. For example, *m2qbxl* describes the cost for translating a single +multipole expansion to a QBX local expansion. + +:class:`AbstractQBXCostModel` uses :class:`QBXTranslationCostModel` and +kernel-specific calibration parameter to compute the total cost of each step of QBX +in each box. :class:`QBXCostModel` is one implementation of +:class:`AbstractQBXCostModel` using OpenCL. + +:file:`examples/cost.py` in the source distribution demonstrates how the calibration +and evaluation are performed. + +Translation Cost of a Single Operation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: QBXTranslationCostModel + +.. autofunction:: make_pde_aware_translation_cost_model + +.. autofunction:: make_taylor_translation_cost_model + +Cost Model Classes +^^^^^^^^^^^^^^^^^^ + +.. autoclass:: AbstractQBXCostModel -.. autoclass:: TranslationCostModel +.. autoclass:: QBXCostModel -.. autofunction:: pde_aware_translation_cost_model -.. autofunction:: taylor_translation_cost_model +Calibration (Generate Calibration Parameters) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: estimate_calibration_params +.. automethod:: AbstractQBXCostModel.estimate_kernel_specific_calibration_params + +Evaluating +^^^^^^^^^^ + +.. automethod:: AbstractQBXCostModel.qbx_cost_per_stage + +.. automethod:: AbstractQBXCostModel.qbx_cost_per_box + +To get the cost from `BoundExpression`, refer to +:meth:`pytential.symbolic.execution.BoundExpression.cost_per_stage` and +:meth:`pytential.symbolic.execution.BoundExpression.cost_per_box`. + +Utilities +^^^^^^^^^ + +.. automethod:: boxtree.cost.AbstractFMMCostModel.aggregate_over_boxes + +.. automethod:: AbstractQBXCostModel.get_unit_calibration_params """ # {{{ translation cost model -class TranslationCostModel(object): +class QBXTranslationCostModel(FMMTranslationCostModel): """Provides modeled costs for individual translations or evaluations.""" def __init__(self, ncoeffs_qbx, ncoeffs_fmm_by_level, uses_point_and_shoot): self.ncoeffs_qbx = ncoeffs_qbx - self.ncoeffs_fmm_by_level = ncoeffs_fmm_by_level - self.uses_point_and_shoot = uses_point_and_shoot - - @staticmethod - def direct(): - return var("c_p2p") + FMMTranslationCostModel.__init__( + self, ncoeffs_fmm_by_level, uses_point_and_shoot + ) def p2qbxl(self): return var("c_p2qbxl") * self.ncoeffs_qbx @@ -77,66 +132,27 @@ class TranslationCostModel(object): def p2p_tsqbx(self): # This term should be linear in the QBX order, which is the # square root of the number of QBX coefficients. - return var("c_p2p_tsqbx") * self.ncoeffs_qbx ** (1/2) + return var("c_p2p_tsqbx") * self.ncoeffs_qbx ** (1 / 2) def qbxl2p(self): return var("c_qbxl2p") * self.ncoeffs_qbx - def p2l(self, level): - return var("c_p2l") * self.ncoeffs_fmm_by_level[level] - - def l2p(self, level): - return var("c_l2p") * self.ncoeffs_fmm_by_level[level] - - def p2m(self, level): - return var("c_p2m") * self.ncoeffs_fmm_by_level[level] - - def m2p(self, level): - return var("c_m2p") * self.ncoeffs_fmm_by_level[level] - - def m2m(self, src_level, tgt_level): - return var("c_m2m") * self.e2e_cost( - self.ncoeffs_fmm_by_level[src_level], - self.ncoeffs_fmm_by_level[tgt_level]) - - def l2l(self, src_level, tgt_level): - return var("c_l2l") * self.e2e_cost( - self.ncoeffs_fmm_by_level[src_level], - self.ncoeffs_fmm_by_level[tgt_level]) - - def m2l(self, src_level, tgt_level): - return var("c_m2l") * self.e2e_cost( - self.ncoeffs_fmm_by_level[src_level], - self.ncoeffs_fmm_by_level[tgt_level]) - def m2qbxl(self, level): return var("c_m2qbxl") * self.e2e_cost( - self.ncoeffs_fmm_by_level[level], - self.ncoeffs_qbx) + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) def l2qbxl(self, level): return var("c_l2qbxl") * self.e2e_cost( - self.ncoeffs_fmm_by_level[level], - self.ncoeffs_qbx) - - def e2e_cost(self, nsource_coeffs, ntarget_coeffs): - if self.uses_point_and_shoot: - return ( - # Rotate the coordinate system to be z axis aligned. - nsource_coeffs ** (3 / 2) - # Translate the expansion along the z axis. - + nsource_coeffs ** (1 / 2) * ntarget_coeffs - # Rotate the coordinate system back. - + ntarget_coeffs ** (3 / 2)) - - return nsource_coeffs * ntarget_coeffs + self.ncoeffs_fmm_by_level[level], + self.ncoeffs_qbx) # }}} # {{{ translation cost model factories -def pde_aware_translation_cost_model(dim, nlevels): +def make_pde_aware_translation_cost_model(dim, nlevels): """Create a cost model for FMM translation operators that make use of the knowledge that the potential satisfies a PDE. """ @@ -151,13 +167,13 @@ def pde_aware_translation_cost_model(dim, nlevels): if dim == 3: uses_point_and_shoot = True - return TranslationCostModel( - ncoeffs_qbx=ncoeffs_qbx, - ncoeffs_fmm_by_level=ncoeffs_fmm, - uses_point_and_shoot=uses_point_and_shoot) + return QBXTranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=uses_point_and_shoot) -def taylor_translation_cost_model(dim, nlevels): +def make_taylor_translation_cost_model(dim, nlevels): """Create a cost model for FMM translation based on Taylor expansions in Cartesian coordinates. """ @@ -167,747 +183,785 @@ def taylor_translation_cost_model(dim, nlevels): ncoeffs_fmm = (p_fmm + 1) ** dim ncoeffs_qbx = (p_qbx + 1) ** dim - return TranslationCostModel( - ncoeffs_qbx=ncoeffs_qbx, - ncoeffs_fmm_by_level=ncoeffs_fmm, - uses_point_and_shoot=False) + return QBXTranslationCostModel( + ncoeffs_qbx=ncoeffs_qbx, + ncoeffs_fmm_by_level=ncoeffs_fmm, + uses_point_and_shoot=False) # }}} -# {{{ parameterized costs returned by cost model - -class ParametrizedCosts(object): - """A container for data returned by the cost model. +# {{{ abstract cost model - This holds both symbolic costs as well as parameter values. To obtain a - prediction of the running time, use :meth:`get_predicted_times`. +class AbstractQBXCostModel(AbstractFMMCostModel): + """An interface to obtain both QBX operation counts and calibrated (e.g. in + seconds) cost estimates. - .. attribute:: raw_costs + * To obtain operation counts only, use :meth:`get_unit_calibration_params` + with :meth:`qbx_cost_per_stage` or :meth:`qbx_cost_per_box`. - A dictionary mapping algorithmic stage names to symbolic costs. + * To calibrate the model, pass operation counts per stage together with timing + data to :meth:`estimate_kernel_specific_calibration_params`. - .. attribute:: params - - A dictionary mapping names of symbolic parameters to values. Parameters - appear in *raw_costs* and may include values such as QBX or FMM order - as well as calibration constants. - - .. automethod:: copy - .. automethod:: with_params - .. automethod:: get_predicted_times + * To evaluate the calibrated models, pass the kernel-specific calibration + parameters from :meth:`estimate_kernel_specific_calibration_params` to + :meth:`qbx_cost_per_stage` or :meth:`qbx_cost_per_box`. """ - def __init__(self, raw_costs, params): - self.raw_costs = OrderedDict(raw_costs) - self.params = params - - def with_params(self, params): - """Return a copy of *self* with parameters updated to include *params*.""" - new_params = self.params.copy() - new_params.update(params) - return type(self)( - raw_costs=self.raw_costs.copy(), - params=new_params) - - def copy(self): - return self.with_params({}) - - def __str__(self): - return "".join([ - type(self).__name__, - "(raw_costs=", - str(self.raw_costs), - ", params=", - str(self.params), - ")"]) - - def __repr__(self): - return "".join([ - type(self).__name__, - "(raw_costs=", - repr(self.raw_costs), - ", params=", - repr(self.params), - ")"]) - - def get_predicted_times(self, merge_close_lists=False): - """Return a dictionary mapping stage names to predicted time in seconds. - - :arg merge_close_lists: If *True*, the returned estimate combines - the cost of "close" lists (Lists 1, 3 close, and 4 close). If - *False*, the time of each "close" list is reported separately. + @abstractmethod + def process_form_qbxl(self, queue, geo_data, p2qbxl_cost, + ndirect_sources_per_target_box): """ - from pymbolic import evaluate - from functools import partial - - get_time = partial(evaluate, context=self.params) - - result = OrderedDict() - - for name, val in self.raw_costs.items(): - if merge_close_lists: - for suffix in ("_list1", "_list3", "_list4"): - if name.endswith(suffix): - name = name[:-len(suffix)] - break - - result[name] = get_time(val) + result.get(name, 0) - - return result - -# }}} - - -# {{{ cost model - -class CostModel(object): - """ - .. automethod:: with_calibration_params - .. automethod:: __call__ - - The cost model relies on a translation cost model. See - :class:`TranslationCostModel` for the translation cost model interface. - """ + :arg queue: a :class:`pyopencl.CommandQueue` object. + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg p2qbxl_cost: a :class:`numpy.float64` constant representing the cost of + adding a source to a QBX local expansion. + :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape ``(ntarget_boxes,)``, with the + *i*th entry representing the number of direct evaluation sources (list 1, + list 3 close and list 4 close) for ``target_boxes[i]``. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + ``(ntarget_boxes,)``, with the *i*th entry representing the cost of + adding all direct evaluation sources to QBX local expansions of centers + in ``target_boxes[i]``. + """ + pass - def __init__(self, - translation_cost_model_factory=pde_aware_translation_cost_model, - calibration_params=None): + @abstractmethod + def process_m2qbxl(self, queue, geo_data, m2qbxl_cost): """ - :arg translation_cost_model_factory: A callable which, given arguments - (*dim*, *nlevels*), returns a :class:`TranslationCostModel`. + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg m2qbxl_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape ``(nlevels,)`` where the *i*th entry represents the translation + cost from multipole expansion at level *i* to a QBX center. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + ``(ntarget_boxes,)``, with the *i*th entry representing the cost of + translating multipole expansions of list 3 boxes at all source levels to + all QBX centers in ``target_boxes[i]``. """ - self.translation_cost_model_factory = translation_cost_model_factory - if calibration_params is None: - calibration_params = dict() - self.calibration_params = calibration_params - - def with_calibration_params(self, calibration_params): - """Return a copy of *self* with a new set of calibration parameters.""" - return type(self)( - translation_cost_model_factory=self.translation_cost_model_factory, - calibration_params=calibration_params) - - # {{{ form multipoles - - def process_form_multipoles(self, xlat_cost, traversal, tree): - result = 0 - - for level in range(tree.nlevels): - src_count = 0 - start, stop = traversal.level_start_source_box_nrs[level:level + 2] - for src_ibox in traversal.source_boxes[start:stop]: - nsrcs = tree.box_source_counts_nonchild[src_ibox] - src_count += nsrcs - result += src_count * xlat_cost.p2m(level) - - return dict(form_multipoles=result) - - # }}} - - # {{{ propagate multipoles upward - - def process_coarsen_multipoles(self, xlat_cost, traversal, tree): - result = 0 - - # nlevels-1 is the last valid level index - # nlevels-2 is the last valid level that could have children - # - # 3 is the last relevant source_level. - # 2 is the last relevant target_level. - # (because no level 1 box will be well-separated from another) - for source_level in range(tree.nlevels-1, 2, -1): - target_level = source_level - 1 - cost = xlat_cost.m2m(source_level, target_level) - - nmultipoles = 0 - start, stop = traversal.level_start_source_parent_box_nrs[ - target_level:target_level+2] - for ibox in traversal.source_parent_boxes[start:stop]: - for child in tree.box_child_ids[:, ibox]: - if child: - nmultipoles += 1 + pass - result += cost * nmultipoles - - return dict(coarsen_multipoles=result) + @abstractmethod + def process_l2qbxl(self, queue, geo_data, l2qbxl_cost): + """ + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg l2qbxl_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` + of shape ``(nlevels,)`` where each entry represents the translation + cost from a box local expansion to a QBX local expansion. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + ``(ntarget_boxes,)``, with each entry representing the cost of + translating box local expansions to all QBX local expansions. + """ + pass - # }}} + @abstractmethod + def process_eval_qbxl(self, queue, geo_data, qbxl2p_cost): + """ + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg qbxl2p_cost: a :class:`numpy.float64` constant, representing the + evaluation cost of a target from its QBX local expansion. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + ``(ntarget_boxes,)``, with the *i*th entry representing the cost of + evaluating all targets associated with QBX centers in ``target_boxes[i]`` + from QBX local expansions. + """ + pass - # {{{ collect direct interaction data + @abstractmethod + def process_eval_target_specific_qbxl(self, queue, geo_data, p2p_tsqbx_cost, + ndirect_sources_per_target_box): + """ + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg p2p_tsqbx_cost: a :class:`numpy.float64` constant representing the + evaluation cost of a target from a direct evaluation source of the target + box containing the expansion center. + :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape ``(ntarget_boxes,)``, with the + *i*th entry representing the number of direct evaluation sources + (list 1, list 3 close and list 4 close) for ``target_boxes[i]``. + :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape + ``(ntarget_boxes,)``, with the *i*th entry representing the evaluation + cost of all targets associated with centers in ``target_boxes[i]`` from + the direct evaluation sources of ``target_boxes[i]``. + """ + pass + + def qbx_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): + """Evaluate translation cost factors from symbolic model. The result of this + function can be used for process_* methods in this class. + + This method overwrite the method in parent + :class:`boxtree.cost.AbstractFMMCostModel` to support operations specific to + QBX. + + :arg queue: If not None, the cost factor arrays will be transferred to device + using this queue. + :arg nlevels: the number of tree levels. + :arg xlat_cost: a :class:`QBXTranslationCostModel`. + :arg context: a :class:`dict` mapping from the symbolic names of parameters + to their values, serving as context when evaluating symbolic expressions + in *xlat_cost*. + :return: a :class:`dict`, mapping from stage names to the translation costs + of those stages in FMM and QBX. + """ + cost_factors = self.fmm_cost_factors_for_kernels_from_model( + queue, nlevels, xlat_cost, context + ) + + cost_factors.update({ + "p2qbxl_cost": evaluate(xlat_cost.p2qbxl(), context=context), + "m2qbxl_cost": np.array([ + evaluate(xlat_cost.m2qbxl(ilevel), context=context) + for ilevel in range(nlevels) + ]), + "l2qbxl_cost": np.array([ + evaluate(xlat_cost.l2qbxl(ilevel), context=context) + for ilevel in range(nlevels) + ]), + "qbxl2p_cost": evaluate(xlat_cost.qbxl2p(), context=context), + "p2p_tsqbx_cost": evaluate(xlat_cost.p2p_tsqbx(), context=context) + }) + + if queue: + cost_factors = self.cost_factors_to_dev(cost_factors, queue) + + return cost_factors @staticmethod - def _collect_direction_interaction_data(traversal, tree): - ntarget_boxes = len(traversal.target_boxes) + def gather_metadata(geo_data, fmm_level_to_order): + lpot_source = geo_data.lpot_source + tree = geo_data.tree() + + metadata = { + "p_qbx": lpot_source.qbx_order, + "nlevels": tree.nlevels, + "nsources": tree.nsources, + "ntargets": tree.ntargets, + "ncenters": geo_data.ncenters + } - # target box index -> nsources - nlist1_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - nlist3close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) - nlist4close_srcs_by_itgt_box = np.zeros(ntarget_boxes, dtype=np.intp) + for level in range(tree.nlevels): + metadata["p_fmm_lev%d" % level] = fmm_level_to_order[level] - for itgt_box in range(ntarget_boxes): - nlist1_srcs = 0 - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nlist1_srcs += tree.box_source_counts_nonchild[src_ibox] + return metadata - nlist1_srcs_by_itgt_box[itgt_box] = nlist1_srcs + def qbx_cost_per_box(self, queue, geo_data, kernel, kernel_arguments, + calibration_params): + # FIXME: This should support target filtering. + lpot_source = geo_data.lpot_source + use_tsqbx = lpot_source._use_target_specific_qbx + tree = geo_data.tree() + traversal = geo_data.traversal() + nqbtl = geo_data.non_qbx_box_target_lists() + box_target_counts_nonchild = nqbtl.box_target_counts_nonchild + target_boxes = traversal.target_boxes - nlist3close_srcs = 0 - # Could be None, if not using targets with extent. - if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nlist3close_srcs += tree.box_source_counts_nonchild[src_ibox] + # FIXME: We can avoid using *kernel* and *kernel_arguments* if we talk + # to the wrangler to obtain the FMM order (see also + # https://gitlab.tiker.net/inducer/boxtree/issues/25) + fmm_level_to_order = [ + lpot_source.fmm_level_to_order( + kernel.get_base_kernel(), kernel_arguments, tree, ilevel + ) for ilevel in range(tree.nlevels) + ] - nlist3close_srcs_by_itgt_box[itgt_box] = nlist3close_srcs + # {{{ Construct parameters - nlist4close_srcs = 0 - # Could be None, if not using targets with extent. - if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nlist4close_srcs += tree.box_source_counts_nonchild[src_ibox] + params = calibration_params.copy() + params.update(dict(p_qbx=lpot_source.qbx_order)) - nlist4close_srcs_by_itgt_box[itgt_box] = nlist4close_srcs + for ilevel in range(tree.nlevels): + params["p_fmm_lev%d" % ilevel] = fmm_level_to_order[ilevel] - result = {} - result["nlist1_srcs_by_itgt_box"] = nlist1_srcs_by_itgt_box - result["nlist3close_srcs_by_itgt_box"] = nlist3close_srcs_by_itgt_box - result["nlist4close_srcs_by_itgt_box"] = nlist4close_srcs_by_itgt_box + # }}} - return result + xlat_cost = self.translation_cost_model_factory( + tree.dimensions, tree.nlevels + ) + + translation_cost = self.qbx_cost_factors_for_kernels_from_model( + queue, tree.nlevels, xlat_cost, params + ) + + ndirect_sources_per_target_box = \ + self.get_ndirect_sources_per_target_box(queue, traversal) + + # get FMM cost per box from parent class + result = self.cost_per_box( + queue, traversal, fmm_level_to_order, + calibration_params, + ndirect_sources_per_target_box=ndirect_sources_per_target_box, + box_target_counts_nonchild=box_target_counts_nonchild + ) + + if use_tsqbx: + result[target_boxes] += self.process_eval_target_specific_qbxl( + queue, geo_data, translation_cost["p2p_tsqbx_cost"], + ndirect_sources_per_target_box + ) + else: + result[target_boxes] += self.process_form_qbxl( + queue, geo_data, translation_cost["p2qbxl_cost"], + ndirect_sources_per_target_box + ) - # }}} + result[target_boxes] += self.process_m2qbxl( + queue, geo_data, translation_cost["m2qbxl_cost"] + ) - # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) + result[target_boxes] += self.process_l2qbxl( + queue, geo_data, translation_cost["l2qbxl_cost"] + ) - def process_direct(self, xlat_cost, traversal, direct_interaction_data, - box_target_counts_nonchild): - nlist1_srcs_by_itgt_box = ( - direct_interaction_data["nlist1_srcs_by_itgt_box"]) - nlist3close_srcs_by_itgt_box = ( - direct_interaction_data["nlist3close_srcs_by_itgt_box"]) - nlist4close_srcs_by_itgt_box = ( - direct_interaction_data["nlist4close_srcs_by_itgt_box"]) + result[target_boxes] += self.process_eval_qbxl( + queue, geo_data, translation_cost["qbxl2p_cost"] + ) - # list -> number of source-target interactions - npart_direct_list1 = 0 - npart_direct_list3 = 0 - npart_direct_list4 = 0 + metadata = self.gather_metadata(geo_data, fmm_level_to_order) - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] + return result, metadata - npart_direct_list1 += ntargets * nlist1_srcs_by_itgt_box[itgt_box] - npart_direct_list3 += ntargets * nlist3close_srcs_by_itgt_box[itgt_box] - npart_direct_list4 += ntargets * nlist4close_srcs_by_itgt_box[itgt_box] + def qbx_cost_per_stage(self, queue, geo_data, kernel, kernel_arguments, + calibration_params): + # FIXME: This should support target filtering. + lpot_source = geo_data.lpot_source + use_tsqbx = lpot_source._use_target_specific_qbx + tree = geo_data.tree() + traversal = geo_data.traversal() + nqbtl = geo_data.non_qbx_box_target_lists() + box_target_counts_nonchild = nqbtl.box_target_counts_nonchild - result = {} - result["eval_direct_list1"] = npart_direct_list1 * xlat_cost.direct() - result["eval_direct_list3"] = npart_direct_list3 * xlat_cost.direct() - result["eval_direct_list4"] = npart_direct_list4 * xlat_cost.direct() + # FIXME: We can avoid using *kernel* and *kernel_arguments* if we talk + # to the wrangler to obtain the FMM order (see also + # https://gitlab.tiker.net/inducer/boxtree/issues/25) + fmm_level_to_order = [ + lpot_source.fmm_level_to_order( + kernel.get_base_kernel(), kernel_arguments, tree, ilevel + ) for ilevel in range(tree.nlevels) + ] - return result + # {{{ Construct parameters - # }}} + params = calibration_params.copy() + params.update(dict(p_qbx=lpot_source.qbx_order)) - # {{{ translate separated siblings' ("list 2") mpoles to local + for ilevel in range(tree.nlevels): + params["p_fmm_lev%d" % ilevel] = fmm_level_to_order[ilevel] - def process_list2(self, xlat_cost, traversal, tree): - nm2l_by_level = np.zeros(tree.nlevels, dtype=np.intp) + # }}} - for itgt_box, tgt_ibox in enumerate(traversal.target_or_target_parent_boxes): - start, end = traversal.from_sep_siblings_starts[itgt_box:itgt_box+2] + xlat_cost = self.translation_cost_model_factory( + tree.dimensions, tree.nlevels + ) + + translation_cost = self.qbx_cost_factors_for_kernels_from_model( + queue, tree.nlevels, xlat_cost, params + ) + + ndirect_sources_per_target_box = \ + self.get_ndirect_sources_per_target_box(queue, traversal) + + # get FMM per-stage cost from parent class + result = self.cost_per_stage( + queue, traversal, fmm_level_to_order, + calibration_params, + ndirect_sources_per_target_box=ndirect_sources_per_target_box, + box_target_counts_nonchild=box_target_counts_nonchild + ) + + if use_tsqbx: + result["eval_target_specific_qbx_locals"] = self.aggregate_over_boxes( + self.process_eval_target_specific_qbxl( + queue, geo_data, translation_cost["p2p_tsqbx_cost"], + ndirect_sources_per_target_box=ndirect_sources_per_target_box + ) + ) + else: + result["form_global_qbx_locals"] = self.aggregate_over_boxes( + self.process_form_qbxl( + queue, geo_data, translation_cost["p2qbxl_cost"], + ndirect_sources_per_target_box + ) + ) - level = tree.box_levels[tgt_ibox] - nm2l_by_level[level] += end-start + result["translate_box_multipoles_to_qbx_local"] = self.aggregate_over_boxes( + self.process_m2qbxl(queue, geo_data, translation_cost["m2qbxl_cost"]) + ) - result = sum( - cost * xlat_cost.m2l(ilevel, ilevel) - for ilevel, cost in enumerate(nm2l_by_level)) + result["translate_box_local_to_qbx_local"] = self.aggregate_over_boxes( + self.process_l2qbxl(queue, geo_data, translation_cost["l2qbxl_cost"]) + ) - return dict(multipole_to_local=result) + result["eval_qbx_expansions"] = self.aggregate_over_boxes( + self.process_eval_qbxl(queue, geo_data, translation_cost["qbxl2p_cost"]) + ) - # }}} + metadata = self.gather_metadata(geo_data, fmm_level_to_order) - # {{{ evaluate sep. smaller mpoles ("list 3") at particles + return result, metadata - def process_list3(self, xlat_cost, traversal, tree, box_target_counts_nonchild): - nmp_eval_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + @staticmethod + def get_unit_calibration_params(): + calibration_params = AbstractFMMCostModel.get_unit_calibration_params() - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + calibration_params.update(dict( + c_p2qbxl=1.0, + c_p2p_tsqbx=1.0, + c_qbxl2p=1.0, + c_m2qbxl=1.0, + c_l2qbxl=1.0 + )) - for ilevel, sep_smaller_list in enumerate( - traversal.from_sep_smaller_by_level): - for itgt_box, tgt_ibox in enumerate( - traversal.target_boxes_sep_smaller_by_source_level[ilevel]): - ntargets = box_target_counts_nonchild[tgt_ibox] - start, end = sep_smaller_list.starts[itgt_box:itgt_box+2] - nmp_eval_by_source_level[ilevel] += ntargets * (end-start) + return calibration_params - result = sum( - cost * xlat_cost.m2p(ilevel) - for ilevel, cost in enumerate(nmp_eval_by_source_level)) + _QBX_STAGE_TO_CALIBRATION_PARAMETER = { + "form_global_qbx_locals": "c_p2qbxl", + "translate_box_multipoles_to_qbx_local": "c_m2qbxl", + "translate_box_local_to_qbx_local": "c_l2qbxl", + "eval_qbx_expansions": "c_qbxl2p", + "eval_target_specific_qbx_locals": "c_p2p_tsqbx" + } + + def estimate_calibration_params(self, model_results, timing_results, + time_field_name="wall_elapsed", + additional_stage_to_param_names=()): + stage_to_param_names = self._QBX_STAGE_TO_CALIBRATION_PARAMETER.copy() + stage_to_param_names.update(additional_stage_to_param_names) + + return AbstractFMMCostModel.estimate_calibration_params( + self, model_results, timing_results, time_field_name=time_field_name, + additional_stage_to_param_names=stage_to_param_names + ) + + def estimate_kernel_specific_calibration_params( + self, model_results, timing_results, time_field_name="wall_elapsed"): + """Get kernel-specific calibration parameters from samples of model costs and + real costs. + + :arg model_results: a :class:`list` of modeled costs. Each model cost can be + obtained from `BoundExpression.cost_per_stage` with "constant_one" for + argument `calibration_params`. + :arg timing_results: a :class:`list` of timing data. Each timing data can be + obtained from `BoundExpression.eval`. + :arg time_field_name: a :class:`str`, the field name from the timing result. + Usually this can be ``"wall_elapsed"`` or ``"process_elapsed"``. + :return: a :class:`dict` which maps kernels to calibration parameters. + """ + cost_per_kernel = {} + params_per_kernel = {} - return dict(eval_multipoles=result) + assert len(model_results) == len(timing_results) - # }}} + for icase in range(len(model_results)): + model_cost = model_results[icase] + real_cost = timing_results[icase] - # {{{ form locals for separated bigger source boxes ("list 4") + for insn in real_cost: + assert (insn in model_cost) - def process_list4(self, xlat_cost, traversal, tree): - nform_local_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + knls = frozenset(knl for knl in insn.kernels) - for itgt_box in range(len(traversal.target_or_target_parent_boxes)): - start, end = traversal.from_sep_bigger_starts[itgt_box:itgt_box+2] - for src_ibox in traversal.from_sep_bigger_lists[start:end]: - nsources = tree.box_source_counts_nonchild[src_ibox] - level = tree.box_levels[src_ibox] - nform_local_by_source_level[level] += nsources + if knls not in cost_per_kernel: + cost_per_kernel[knls] = { + "model_costs": [], + "real_costs": [] + } - result = sum( - cost * xlat_cost.p2l(ilevel) - for ilevel, cost in enumerate(nform_local_by_source_level)) + cost_per_kernel[knls]["model_costs"].append(model_cost[insn]) + cost_per_kernel[knls]["real_costs"].append(real_cost[insn]) - return dict(form_locals=result) + for knls in cost_per_kernel: + params_per_kernel[knls] = self.estimate_calibration_params( + cost_per_kernel[knls]["model_costs"], + cost_per_kernel[knls]["real_costs"], + time_field_name=time_field_name + ) - # }}} + return params_per_kernel - # {{{ propogate locals downward - def process_refine_locals(self, xlat_cost, traversal, tree): - result = 0 +class QBXCostModel(AbstractQBXCostModel, FMMCostModel): + """This class is an implementation of interface :class:`AbstractQBXCostModel` + using :mod:`pyopencl`. + """ + def __init__( + self, + translation_cost_model_factory=make_pde_aware_translation_cost_model): + """ + :arg translation_cost_model_factory: a function, which takes tree dimension + and the number of tree levels as arguments, returns an object of + :class:`QBXTranslationCostModel`. + """ + FMMCostModel.__init__(self, translation_cost_model_factory) + + @memoize_method + def _fill_array_with_index_knl(self, context, idx_dtype, array_dtype): + return ElementwiseKernel( + context, + Template(r""" + ${idx_t} *index, + ${array_t} *array, + ${array_t} val + """).render( + idx_t=dtype_to_ctype(idx_dtype), + array_t=dtype_to_ctype(array_dtype) + ), + Template(r""" + array[index[i]] = val; + """).render(), + name="fill_array_with_index" + ) + + def _fill_array_with_index(self, queue, array, index, value): + idx_dtype = index.dtype + array_dtype = array.dtype + knl = self._fill_array_with_index_knl(queue.context, idx_dtype, array_dtype) + knl(index, array, value, queue=queue) + + @memoize_method + def count_global_qbx_centers_knl(self, context, box_id_dtype, particle_id_dtype): + return ElementwiseKernel( + context, + Template(r""" + ${particle_id_t} *nqbx_centers_itgt_box, + ${particle_id_t} *global_qbx_center_weight, + ${box_id_t} *target_boxes, + ${particle_id_t} *box_target_starts, + ${particle_id_t} *box_target_counts_nonchild + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + Template(r""" + ${box_id_t} global_box_id = target_boxes[i]; + ${particle_id_t} start = box_target_starts[global_box_id]; + ${particle_id_t} end = start + box_target_counts_nonchild[ + global_box_id + ]; + + ${particle_id_t} nqbx_centers = 0; + for(${particle_id_t} iparticle = start; iparticle < end; iparticle++) + nqbx_centers += global_qbx_center_weight[iparticle]; + + nqbx_centers_itgt_box[i] = nqbx_centers; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + name="count_global_qbx_centers" + ) + + def get_nqbx_centers_per_tgt_box(self, queue, geo_data, weights=None): + """ + :arg queue: a :class:`pyopencl.CommandQueue` object. + :arg geo_data: a :class:`pytential.qbx.geometry.QBXFMMGeometryData` object. + :arg weights: a :class:`pyopencl.array.Array` of shape ``(ncenters,)`` with + particle_id_dtype, the weight of each center in user order. + :return: a :class:`pyopencl.array.Array` of shape ``(ntarget_boxes,)`` with + type *particle_id_dtype* where the *i*th entry represents the number of + `geo_data.global_qbx_centers` in ``target_boxes[i]``, optionally weighted + by *weights*. + """ + traversal = geo_data.traversal() + tree = geo_data.tree() + global_qbx_centers = geo_data.global_qbx_centers() + ncenters = geo_data.ncenters + + # Build a mask (weight) of whether a target is a global qbx center + global_qbx_centers_tree_order = take( + tree.sorted_target_ids, global_qbx_centers, queue=queue + ) + global_qbx_center_weight = cl.array.zeros( + queue, tree.ntargets, dtype=tree.particle_id_dtype + ) + + self._fill_array_with_index( + queue, global_qbx_center_weight, global_qbx_centers_tree_order, 1 + ) + + if weights is not None: + assert weights.dtype == tree.particle_id_dtype + global_qbx_center_weight[tree.sorted_target_ids[:ncenters]] *= weights + + # Each target box enumerate its target list and add the weight of global + # qbx centers + ntarget_boxes = len(traversal.target_boxes) + nqbx_centers_itgt_box = cl.array.empty( + queue, ntarget_boxes, dtype=tree.particle_id_dtype + ) + + count_global_qbx_centers_knl = self.count_global_qbx_centers_knl( + queue.context, tree.box_id_dtype, tree.particle_id_dtype + ) + count_global_qbx_centers_knl( + nqbx_centers_itgt_box, + global_qbx_center_weight, + traversal.target_boxes, + tree.box_target_starts, + tree.box_target_counts_nonchild, + queue=queue + ) + + return nqbx_centers_itgt_box + + def process_form_qbxl(self, queue, geo_data, p2qbxl_cost, + ndirect_sources_per_target_box): + nqbx_centers_itgt_box = self.get_nqbx_centers_per_tgt_box(queue, geo_data) + + return (nqbx_centers_itgt_box + * ndirect_sources_per_target_box + * p2qbxl_cost) + + @memoize_method + def process_m2qbxl_knl(self, context, box_id_dtype, particle_id_dtype): + return ElementwiseKernel( + context, + Template(r""" + ${box_id_t} *idx_to_itgt_box, + ${particle_id_t} *nqbx_centers_itgt_box, + ${box_id_t} *ssn_starts, + double *nm2qbxl, + double m2qbxl_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + Template(r""" + // get the index of current box in target_boxes + ${box_id_t} itgt_box = idx_to_itgt_box[i]; + // get the number of expansion centers in current box + ${particle_id_t} nqbx_centers = nqbx_centers_itgt_box[itgt_box]; + // get the number of list 3 boxes of the current box in a particular + // level + ${box_id_t} nlist3_boxes = ssn_starts[i + 1] - ssn_starts[i]; + // calculate the cost + nm2qbxl[itgt_box] += (nqbx_centers * nlist3_boxes * m2qbxl_cost); + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + name="process_m2qbxl" + ) + + def process_m2qbxl(self, queue, geo_data, m2qbxl_cost): + tree = geo_data.tree() + traversal = geo_data.traversal() + ntarget_boxes = len(traversal.target_boxes) + nqbx_centers_itgt_box = self.get_nqbx_centers_per_tgt_box(queue, geo_data) - for target_lev in range(1, tree.nlevels): - start, stop = traversal.level_start_target_or_target_parent_box_nrs[ - target_lev:target_lev+2] - source_lev = target_lev - 1 - result += (stop-start) * xlat_cost.l2l(source_lev, target_lev) + process_m2qbxl_knl = self.process_m2qbxl_knl( + queue.context, tree.box_id_dtype, tree.particle_id_dtype + ) - return dict(refine_locals=result) + nm2qbxl = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64) - # }}} + for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): + process_m2qbxl_knl( + ssn.nonempty_indices, + nqbx_centers_itgt_box, + ssn.starts, + nm2qbxl, + m2qbxl_cost[isrc_level].get(queue).reshape(-1)[0], + queue=queue + ) + + return nm2qbxl + + def process_l2qbxl(self, queue, geo_data, l2qbxl_cost): + tree = geo_data.tree() + traversal = geo_data.traversal() + nqbx_centers_itgt_box = self.get_nqbx_centers_per_tgt_box(queue, geo_data) + + # l2qbxl_cost_itgt_box = l2qbxl_cost[tree.box_levels[traversal.target_boxes]] + l2qbxl_cost_itgt_box = take( + l2qbxl_cost, + take(tree.box_levels, traversal.target_boxes, queue=queue), + queue=queue + ) + + return nqbx_centers_itgt_box * l2qbxl_cost_itgt_box + + def process_eval_qbxl(self, queue, geo_data, qbxl2p_cost): + center_to_targets_starts = geo_data.center_to_tree_targets().starts + center_to_targets_starts = center_to_targets_starts.with_queue(queue) + weights = center_to_targets_starts[1:] - center_to_targets_starts[:-1] - # {{{ evaluate local expansions at non-qbx targets + nqbx_targets_itgt_box = self.get_nqbx_centers_per_tgt_box( + queue, geo_data, weights=weights + ) - def process_eval_locals(self, xlat_cost, traversal, tree, nqbtl): - ntargets_by_level = np.zeros(tree.nlevels, dtype=np.intp) + return nqbx_targets_itgt_box * qbxl2p_cost - for target_lev in range(tree.nlevels): - start, stop = traversal.level_start_target_box_nrs[ - target_lev:target_lev+2] - for tgt_ibox in traversal.target_boxes[start:stop]: - ntargets_by_level[target_lev] += ( - nqbtl.box_target_counts_nonchild[tgt_ibox]) + def process_eval_target_specific_qbxl(self, queue, geo_data, p2p_tsqbx_cost, + ndirect_sources_per_target_box): + center_to_targets_starts = geo_data.center_to_tree_targets().starts + center_to_targets_starts = center_to_targets_starts.with_queue(queue) + weights = center_to_targets_starts[1:] - center_to_targets_starts[:-1] + + nqbx_targets_itgt_box = self.get_nqbx_centers_per_tgt_box( + queue, geo_data, weights=weights + ) + + return (nqbx_targets_itgt_box + * ndirect_sources_per_target_box + * p2p_tsqbx_cost) + + def qbx_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): + if not isinstance(queue, cl.CommandQueue): + raise TypeError( + "An OpenCL command queue must be supplied for cost model") + + return AbstractQBXCostModel.qbx_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context + ) + + +class _PythonQBXCostModel(AbstractQBXCostModel, _PythonFMMCostModel): + def __init__( + self, + translation_cost_model_factory=make_pde_aware_translation_cost_model): + """This cost model is a redundant implementation used for testing. It should + not be used outside of tests for :mod:`pytential`. + + :arg translation_cost_model_factory: a function, which takes tree dimension + and the number of tree levels as arguments, returns an object of + :class:`TranslationCostModel`. + """ + _PythonFMMCostModel.__init__(self, translation_cost_model_factory) - result = sum( - cost * xlat_cost.l2p(ilevel) - for ilevel, cost in enumerate(ntargets_by_level)) + def process_form_qbxl(self, queue, geo_data, p2qbxl_cost, + ndirect_sources_per_target_box): + global_qbx_centers = geo_data.global_qbx_centers() + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + traversal = geo_data.traversal() - return dict(eval_locals=result) + np2qbxl = np.zeros(len(traversal.target_boxes), dtype=np.float64) - # }}} + for tgt_icenter in global_qbx_centers: + itgt_box = qbx_center_to_target_box[tgt_icenter] + np2qbxl[itgt_box] += ndirect_sources_per_target_box[itgt_box] - # {{{ collect data about direct interactions with qbx centers + return np2qbxl * p2qbxl_cost - @staticmethod - def _collect_qbxl_direct_interaction_data(direct_interaction_data, - global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): - nlist1_srcs_by_itgt_box = ( - direct_interaction_data["nlist1_srcs_by_itgt_box"]) - nlist3close_srcs_by_itgt_box = ( - direct_interaction_data["nlist3close_srcs_by_itgt_box"]) - nlist4close_srcs_by_itgt_box = ( - direct_interaction_data["nlist4close_srcs_by_itgt_box"]) - - # center -> nsources - np2qbxl_list1_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list3_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - np2qbxl_list4_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) - - # center -> number of associated targets - nqbxl2p_by_center = np.zeros(len(global_qbx_centers), dtype=np.intp) + def process_eval_target_specific_qbxl(self, queue, geo_data, p2p_tsqbx_cost, + ndirect_sources_per_target_box): + center_to_targets_starts = geo_data.center_to_tree_targets().starts + global_qbx_centers = geo_data.global_qbx_centers() + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + traversal = geo_data.traversal() + neval_tsqbx = np.zeros(len(traversal.target_boxes), dtype=np.float64) for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - start, end = center_to_targets_starts[tgt_icenter:tgt_icenter+2] - nqbxl2p_by_center[itgt_center] = end - start - + start, end = center_to_targets_starts[tgt_icenter:tgt_icenter + 2] itgt_box = qbx_center_to_target_box[tgt_icenter] - np2qbxl_list1_by_center[itgt_center] = ( - nlist1_srcs_by_itgt_box[itgt_box]) - np2qbxl_list3_by_center[itgt_center] = ( - nlist3close_srcs_by_itgt_box[itgt_box]) - np2qbxl_list4_by_center[itgt_center] = ( - nlist4close_srcs_by_itgt_box[itgt_box]) - - result = {} - result["np2qbxl_list1_by_center"] = np2qbxl_list1_by_center - result["np2qbxl_list3_by_center"] = np2qbxl_list3_by_center - result["np2qbxl_list4_by_center"] = np2qbxl_list4_by_center - result["nqbxl2p_by_center"] = nqbxl2p_by_center - - return result - - # }}} - - # {{{ eval target specific qbx expansions + neval_tsqbx[itgt_box] += ( + ndirect_sources_per_target_box[itgt_box] * (end - start) + ) - def process_eval_target_specific_qbxl(self, xlat_cost, direct_interaction_data, - global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): + return neval_tsqbx * p2p_tsqbx_cost - counts = self._collect_qbxl_direct_interaction_data( - direct_interaction_data, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts) - - result = {} - result["eval_target_specific_qbx_locals_list1"] = ( - sum(counts["np2qbxl_list1_by_center"] * counts["nqbxl2p_by_center"]) - * xlat_cost.p2p_tsqbx()) - result["eval_target_specific_qbx_locals_list3"] = ( - sum(counts["np2qbxl_list3_by_center"] * counts["nqbxl2p_by_center"]) - * xlat_cost.p2p_tsqbx()) - result["eval_target_specific_qbx_locals_list4"] = ( - sum(counts["np2qbxl_list4_by_center"] * counts["nqbxl2p_by_center"]) - * xlat_cost.p2p_tsqbx()) - - return result - - # }}} - - # {{{ form global qbx locals - - def process_form_qbxl(self, xlat_cost, direct_interaction_data, - global_qbx_centers, qbx_center_to_target_box, center_to_targets_starts): - - counts = self._collect_qbxl_direct_interaction_data( - direct_interaction_data, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts) - - result = {} - result["form_global_qbx_locals_list1"] = ( - sum(counts["np2qbxl_list1_by_center"]) * xlat_cost.p2qbxl()) - result["form_global_qbx_locals_list3"] = ( - sum(counts["np2qbxl_list3_by_center"]) * xlat_cost.p2qbxl()) - result["form_global_qbx_locals_list4"] = ( - sum(counts["np2qbxl_list4_by_center"]) * xlat_cost.p2qbxl()) - - return result - - # }}} + def process_m2qbxl(self, queue, geo_data, m2qbxl_cost): + traversal = geo_data.traversal() + global_qbx_centers = geo_data.global_qbx_centers() + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - # {{{ translate from list 3 multipoles to qbx local expansions + ntarget_boxes = len(traversal.target_boxes) + nm2qbxl = np.zeros(ntarget_boxes, dtype=np.float64) - def process_m2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box_source_level): - nm2qbxl_by_source_level = np.zeros(tree.nlevels, dtype=np.intp) + for isrc_level, sep_smaller_list in enumerate( + traversal.from_sep_smaller_by_level): - assert tree.nlevels == len(traversal.from_sep_smaller_by_level) + qbx_center_to_target_box_source_level = \ + geo_data.qbx_center_to_target_box_source_level(isrc_level) - for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): for tgt_icenter in global_qbx_centers: icontaining_tgt_box = qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] + tgt_icenter + ] if icontaining_tgt_box == -1: continue - start, stop = ( - ssn.starts[icontaining_tgt_box], - ssn.starts[icontaining_tgt_box+1]) + start = sep_smaller_list.starts[icontaining_tgt_box] + stop = sep_smaller_list.starts[icontaining_tgt_box + 1] - nm2qbxl_by_source_level[isrc_level] += stop-start + containing_tgt_box = qbx_center_to_target_box[tgt_icenter] - result = sum( - cost * xlat_cost.m2qbxl(ilevel) - for ilevel, cost in enumerate(nm2qbxl_by_source_level)) + nm2qbxl[containing_tgt_box] += ( + (stop - start) * m2qbxl_cost[isrc_level]) - return dict(translate_box_multipoles_to_qbx_local=result) + return nm2qbxl - # }}} - - # {{{ translate from box locals to qbx local expansions + def process_l2qbxl(self, queue, geo_data, l2qbxl_cost): + tree = geo_data.tree() + traversal = geo_data.traversal() + global_qbx_centers = geo_data.global_qbx_centers() + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - def process_l2qbxl(self, xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box): - nl2qbxl_by_level = np.zeros(tree.nlevels, dtype=np.intp) + ntarget_boxes = len(traversal.target_boxes) + nl2qbxl = np.zeros(ntarget_boxes, dtype=np.float64) for tgt_icenter in global_qbx_centers: itgt_box = qbx_center_to_target_box[tgt_icenter] tgt_ibox = traversal.target_boxes[itgt_box] - level = tree.box_levels[tgt_ibox] - nl2qbxl_by_level[level] += 1 - - result = sum( - cost * xlat_cost.l2qbxl(ilevel) - for ilevel, cost in enumerate(nl2qbxl_by_level)) - - return dict(translate_box_local_to_qbx_local=result) + nl2qbxl[itgt_box] += l2qbxl_cost[tree.box_levels[tgt_ibox]] - # }}} + return nl2qbxl - # {{{ evaluate qbx local expansions + def process_eval_qbxl(self, queue, geo_data, qbxl2p_cost): + traversal = geo_data.traversal() + global_qbx_centers = geo_data.global_qbx_centers() + center_to_targets_starts = geo_data.center_to_tree_targets().starts + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - def process_eval_qbxl(self, xlat_cost, global_qbx_centers, - center_to_targets_starts): - result = 0 + ntarget_boxes = len(traversal.target_boxes) + neval_qbxl = np.zeros(ntarget_boxes, dtype=np.float64) for src_icenter in global_qbx_centers: start, end = center_to_targets_starts[src_icenter:src_icenter+2] - result += (end - start) + icontaining_tgt_box = qbx_center_to_target_box[src_icenter] + neval_qbxl[icontaining_tgt_box] += (end - start) - result *= xlat_cost.qbxl2p() + return neval_qbxl * qbxl2p_cost - return dict(eval_qbx_expansions=result) - - # }}} - - @log_process(logger, "model cost") - def __call__(self, wrangler, geo_data, kernel, kernel_arguments): - """Analyze the given geometry and return cost data. - - :returns: An instance of :class:`ParametrizedCosts`. + def qbx_cost_per_box(self, queue, geo_data, kernel, kernel_arguments, + calibration_params): + """This function transfers *geo_data* to host if necessary """ - result = OrderedDict() - - lpot_source = geo_data.lpot_source - - with cl.CommandQueue(geo_data.cl_context) as queue: - tree = geo_data.tree().get(queue) - traversal = geo_data.traversal(merge_close_lists=False).get(queue) - nqbtl = geo_data.non_qbx_box_target_lists().get(queue) - - box_target_counts_nonchild = nqbtl.box_target_counts_nonchild - - params = dict( - nlevels=tree.nlevels, - nboxes=tree.nboxes, - nsources=tree.nsources, - ntargets=tree.ntargets, - ncenters=geo_data.ncenters, - p_qbx=lpot_source.qbx_order, - ) - - # FIXME: We can avoid using *kernel* and *kernel_arguments* if we talk - # to the wrangler to obtain the FMM order (see also - # https://gitlab.tiker.net/inducer/boxtree/issues/25) - for ilevel in range(tree.nlevels): - params["p_fmm_lev%d" % ilevel] = ( - lpot_source.fmm_level_to_order( - kernel.get_base_kernel(), kernel_arguments, tree, ilevel)) - - params.update(self.calibration_params) - - xlat_cost = ( - self.translation_cost_model_factory(tree.dimensions, tree.nlevels)) - - # {{{ construct local multipoles - - result.update(self.process_form_multipoles(xlat_cost, traversal, tree)) - - # }}} - - # {{{ propagate multipoles upward - - result.update(self.process_coarsen_multipoles(xlat_cost, traversal, tree)) - - # }}} - - direct_interaction_data = ( - self._collect_direction_interaction_data(traversal, tree)) - - # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) - - result.update(self.process_direct( - xlat_cost, traversal, direct_interaction_data, - box_target_counts_nonchild)) - - # }}} - - # {{{ translate separated siblings' ("list 2") mpoles to local - - result.update(self.process_list2(xlat_cost, traversal, tree)) - - # }}} - - # {{{ evaluate sep. smaller mpoles ("list 3") at particles - - result.update(self.process_list3( - xlat_cost, traversal, tree, box_target_counts_nonchild)) - - # }}} + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper + from pytential.qbx.geometry import QBXFMMGeometryData - # {{{ form locals for separated bigger source boxes ("list 4") + if not isinstance(geo_data, ToHostTransferredGeoDataWrapper): + assert isinstance(geo_data, QBXFMMGeometryData) + geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) - result.update(self.process_list4(xlat_cost, traversal, tree)) + return AbstractQBXCostModel.qbx_cost_per_box( + self, queue, geo_data, kernel, kernel_arguments, calibration_params + ) - # }}} - - # {{{ propagate local_exps downward - - result.update(self.process_refine_locals(xlat_cost, traversal, tree)) - - # }}} - - # {{{ evaluate locals - - result.update(self.process_eval_locals(xlat_cost, traversal, tree, nqbtl)) - - # }}} - - global_qbx_centers = geo_data.global_qbx_centers() - - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - center_to_targets_starts = geo_data.center_to_tree_targets().starts - qbx_center_to_target_box_source_level = np.empty( - (tree.nlevels,), dtype=object) - - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - geo_data.qbx_center_to_target_box_source_level(src_level)) - - with cl.CommandQueue(geo_data.cl_context) as queue: - global_qbx_centers = global_qbx_centers.get( - queue=queue) - qbx_center_to_target_box = qbx_center_to_target_box.get( - queue=queue) - center_to_targets_starts = center_to_targets_starts.get( - queue=queue) - for src_level in range(tree.nlevels): - qbx_center_to_target_box_source_level[src_level] = ( - qbx_center_to_target_box_source_level[src_level] - .get(queue=queue)) - - # {{{ form global qbx locals or evaluate target specific qbx expansions - - if wrangler.using_tsqbx: - result.update(self.process_eval_target_specific_qbxl( - xlat_cost, direct_interaction_data, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts)) - result["form_global_qbx_locals_list1"] = 0 - result["form_global_qbx_locals_list3"] = 0 - result["form_global_qbx_locals_list4"] = 0 - - else: - result.update(self.process_form_qbxl( - xlat_cost, direct_interaction_data, global_qbx_centers, - qbx_center_to_target_box, center_to_targets_starts)) - result["eval_target_specific_qbx_locals_list1"] = 0 - result["eval_target_specific_qbx_locals_list3"] = 0 - result["eval_target_specific_qbx_locals_list4"] = 0 - - # }}} - - # {{{ translate from list 3 multipoles to qbx local expansions - - result.update(self.process_m2qbxl( - xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box_source_level)) - - # }}} - - # {{{ translate from box local expansions to qbx local expansions - - result.update(self.process_l2qbxl( - xlat_cost, traversal, tree, global_qbx_centers, - qbx_center_to_target_box)) - - # }}} - - # {{{ evaluate qbx local expansions - - result.update(self.process_eval_qbxl( - xlat_cost, global_qbx_centers, center_to_targets_starts)) - - # }}} - - return ParametrizedCosts(result, params) - -# }}} - - -# {{{ calibrate cost model - -_FMM_STAGE_TO_CALIBRATION_PARAMETER = { - "form_multipoles": "c_p2m", - "coarsen_multipoles": "c_m2m", - "eval_direct": "c_p2p", - "multipole_to_local": "c_m2l", - "eval_multipoles": "c_m2p", - "form_locals": "c_p2l", - "refine_locals": "c_l2l", - "eval_locals": "c_l2p", - "form_global_qbx_locals": "c_p2qbxl", - "translate_box_multipoles_to_qbx_local": "c_m2qbxl", - "translate_box_local_to_qbx_local": "c_l2qbxl", - "eval_qbx_expansions": "c_qbxl2p", - "eval_target_specific_qbx_locals": "c_p2p_tsqbx", - } - - -def estimate_calibration_params(model_results, timing_results): - """Given a set of model results and matching timing results, estimate the best - calibration parameters for the model. - """ - - params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values()) - - nresults = len(model_results) - - if nresults != len(timing_results): - raise ValueError("must have same number of model and timing results") - - uncalibrated_times = {} - actual_times = {} - - for param in params: - uncalibrated_times[param] = np.zeros(nresults) - actual_times[param] = np.zeros(nresults) - - from pymbolic import evaluate - from pymbolic.mapper.coefficient import CoefficientCollector - collect_coeffs = CoefficientCollector() - - for i, model_result in enumerate(model_results): - context = model_result.params.copy() - for param in params: - context[param] = var(param) - - # Represents the total modeled cost, but leaves the calibration - # parameters symbolic. - total_cost = evaluate( - sum(model_result.raw_costs.values()), - context=context) - - coeffs = collect_coeffs(total_cost) - # Assumes the total cost is a sum of terms linear in the parameters. - assert set(key.name for key in coeffs) <= params - - for param, time in coeffs.items(): - uncalibrated_times[param.name][i] = time - - for i, timing_result in enumerate(timing_results): - for param, time in timing_result.items(): - calibration_param = ( - _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) - actual_times[calibration_param][i] = time["process_elapsed"] - - result = {} - - for param in params: - uncalibrated = uncalibrated_times[param] - actual = actual_times[param] - - if np.allclose(uncalibrated, 0): - result[param] = float("NaN") - continue - - result[param] = ( - actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated)) - - return result + def qbx_cost_per_stage(self, queue, geo_data, kernel, kernel_arguments, + calibration_params): + """This function additionally transfers geo_data to host if necessary + """ + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper + from pytential.qbx.geometry import QBXFMMGeometryData + + if not isinstance(geo_data, ToHostTransferredGeoDataWrapper): + assert isinstance(geo_data, QBXFMMGeometryData) + geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) + queue.finish() + + return AbstractQBXCostModel.qbx_cost_per_stage( + self, queue, geo_data, kernel, kernel_arguments, calibration_params + ) + + def qbx_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): + return AbstractQBXCostModel.qbx_cost_factors_for_kernels_from_model( + self, None, nlevels, xlat_cost, context + ) # }}} diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 98df707e296fd8e0d128edb1dafcae7e7f6b7fa1..531b25616ffd4aae472c1ec26c7d69eae939f9ab 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -234,7 +234,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): def reorder_potentials(self, potentials): raise NotImplementedError("reorder_potentials should not " - "be called on a QBXFMMLibHelmholtzExpansionWrangler") + "be called on a QBXFMMLibExpansionWrangler") # Because this is a multi-stage, more complicated process that combines # potentials from non-QBX targets and QBX targets. diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 3beabc2c75457ef05eec31c21bad5bfdcee11536..058cbd539758aabac85b08df9e0b0991247014f2 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -42,6 +42,7 @@ from meshmode.array_context import PyOpenCLArrayContext from meshmode.dof_array import DOFArray, thaw from pytools import memoize_in +from pytential.qbx.cost import AbstractQBXCostModel from pytential import sym @@ -349,11 +350,21 @@ class CostModelMapper(EvaluationMapperBase): This executes everything *except* the layer potential operator. Instead of executing the operator, the cost model gets run and the cost data is collected. + + .. attribute:: kernel_to_calibration_params + + Can either be a :class:`str` "constant_one", which uses the constant 1.0 as + calibration parameters for all stages of all kernels, or be a :class:`dict`, + which maps from kernels to the calibration parameters, returned from + `estimate_kernel_specific_calibration_params`. + """ - def __init__(self, bound_expr, actx, context=None, - target_geometry=None, - target_points=None, target_normals=None, target_tangents=None): + def __init__(self, bound_expr, actx, + kernel_to_calibration_params, per_box, + context=None, + target_geometry=None, + target_points=None, target_normals=None, target_tangents=None): if context is None: context = {} EvaluationMapperBase.__init__( @@ -362,24 +373,39 @@ class CostModelMapper(EvaluationMapperBase): target_points, target_normals, target_tangents) + + self.kernel_to_calibration_params = kernel_to_calibration_params self.modeled_cost = {} + self.metadata = {} + self.per_box = per_box def exec_compute_potential_insn( self, actx: PyOpenCLArrayContext, insn, bound_expr, evaluate): source = bound_expr.places.get_geometry(insn.source.geometry) + knls = frozenset(knl for knl in insn.kernels) + + if (isinstance(self.kernel_to_calibration_params, str) + and self.kernel_to_calibration_params == "constant_one"): + calibration_params = \ + AbstractQBXCostModel.get_unit_calibration_params() + else: + calibration_params = self.kernel_to_calibration_params[knls] - result, cost_model_result = ( - source.cost_model_compute_potential_insn( - actx, insn, bound_expr, evaluate)) + result, (cost_model_result, metadata) = \ + source.cost_model_compute_potential_insn( + actx, insn, bound_expr, evaluate, calibration_params, + self.per_box) # The compiler ensures this. assert insn not in self.modeled_cost self.modeled_cost[insn] = cost_model_result + self.metadata[insn] = metadata + return result def get_modeled_cost(self): - return self.modeled_cost + return self.modeled_cost, self.metadata # }}} @@ -822,11 +848,53 @@ class GeometryCollection(object): # {{{ bound expression +def _find_array_context_from_args_in_context(context, supplied_array_context=None): + array_contexts = [] + if supplied_array_context is not None: + if not isinstance(supplied_array_context, PyOpenCLArrayContext): + raise TypeError( + "first argument (if supplied) must be a " + "PyOpenCLArrayContext") + + array_contexts.append(supplied_array_context) + del supplied_array_context + + def look_for_array_contexts(ary): + if isinstance(ary, DOFArray): + if ary.array_context is not None: + array_contexts.append(ary.array_context) + elif isinstance(ary, np.ndarray) and ary.dtype.char == "O": + for idx in np.ndindex(ary.shape): + look_for_array_contexts(ary[idx]) + else: + pass + + for key, val in context.items(): + look_for_array_contexts(val) + + if array_contexts: + from pytools import is_single_valued + if not is_single_valued(array_contexts): + raise ValueError("arguments do not agree on an array context") + + array_context = array_contexts[0] + else: + array_context = None + + if not isinstance(array_context, PyOpenCLArrayContext): + raise TypeError( + "array context (derived from arguments) is not a " + "PyOpenCLArrayContext") + + return array_context + + class BoundExpression(object): """An expression readied for evaluation by binding it to a :class:`~pytential.symbolic.execution.GeometryCollection`. - .. automethod :: get_modeled_cost + .. automethod :: cost_per_stage + .. automethod :: cost_per_box .. automethod :: scipy_op .. automethod :: eval .. automethod :: __call__ @@ -845,8 +913,43 @@ class BoundExpression(object): def _get_cache(self, name): return self.caches.setdefault(name, {}) - def get_modeled_cost(self, queue, **args): - cost_model_mapper = CostModelMapper(self, queue, args) + def cost_per_stage(self, calibration_params, **kwargs): + """ + :arg queue: a :class:`pyopencl.CommandQueue` object. + :arg calibration_params: either a :class:`dict` returned by + `estimate_kernel_specific_calibration_params`, or a :class:`str` + "constant_one". + :return: a :class:`dict` mapping from instruction to per-stage cost. Each + per-stage cost is represented by a :class:`dict` mapping from the stage + name to the predicted time. + """ + array_context = _find_array_context_from_args_in_context(kwargs) + + if array_context is None: + raise ValueError("unable to figure array context from arguments") + + cost_model_mapper = CostModelMapper( + self, array_context, calibration_params, per_box=False, context=kwargs + ) + self.code.execute(cost_model_mapper) + return cost_model_mapper.get_modeled_cost() + + def cost_per_box(self, calibration_params, **kwargs): + """ + :arg queue: a :class:`pyopencl.CommandQueue` object. + :arg calibration_params: either a :class:`dict` returned by + `estimate_kernel_specific_calibration_params`, or a :class:`str` + "constant_one". + :return: a :class:`dict` mapping from instruction to per-box cost. Each + per-box cost is represented by a :class:`numpy.ndarray` or + :class:`pyopencl.array.Array` of shape (nboxes,), where the ith entry + represents the cost of all stages for box i. + """ + array_context = _find_array_context_from_args_in_context(kwargs) + + cost_model_mapper = CostModelMapper( + self, array_context, calibration_params, per_box=True, context=kwargs + ) self.code.execute(cost_model_mapper) return cost_model_mapper.get_modeled_cost() @@ -917,41 +1020,8 @@ class BoundExpression(object): if context is None: context = {} - # {{{ figure array context - - array_contexts = [] - if array_context is not None: - if not isinstance(array_context, PyOpenCLArrayContext): - raise TypeError( - "first argument (if supplied) must be a " - "PyOpenCLArrayContext") - - array_contexts.append(array_context) - del array_context - - def look_for_array_contexts(ary): - if isinstance(ary, DOFArray): - if ary.array_context is not None: - array_contexts.append(ary.array_context) - elif isinstance(ary, np.ndarray) and ary.dtype.char == "O": - for idx in np.ndindex(ary.shape): - look_for_array_contexts(ary[idx]) - else: - pass - - for key, val in context.items(): - look_for_array_contexts(val) - - if array_contexts: - from pytools import is_single_valued - if not is_single_valued(array_contexts): - raise ValueError("arguments do not agree on an array context") - - array_context = array_contexts[0] - else: - array_context = None - - # }}} + array_context = _find_array_context_from_args_in_context( + context, array_context) exec_mapper = EvaluationMapper( self, array_context, context, timing_data=timing_data) diff --git a/test/test_cost_model.py b/test/test_cost_model.py index 42e03b2c4d1a1f4c284ee4b7f83ea094007a25da..71beb44936037a8195136c0a526dcc66ec6d3a60 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -1,6 +1,9 @@ from __future__ import division, print_function -__copyright__ = "Copyright (C) 2018 Matt Wala" +__copyright__ = """ + Copyright (C) 2018 Matt Wala + Copyright (C) 2019 Hao Gao +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,24 +25,262 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import numpy as np -import numpy.linalg as la # noqa +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) -from boxtree.tools import ConstantOneExpansionWrangler from meshmode.array_context import PyOpenCLArrayContext + +import numpy as np import pyopencl as cl -import pyopencl.clmath # noqa -import pytest -from pyopencl.tools import ( # noqa - pytest_generate_tests_for_pyopencl as pytest_generate_tests) -from pytools import one +from boxtree.tools import ConstantOneExpansionWrangler +from pytential.qbx import QBXLayerPotentialSource from sumpy.kernel import LaplaceKernel, HelmholtzKernel - from pytential import bind, sym, norm # noqa from pytential import GeometryCollection +from pytools import one + +from pytential.qbx.cost import ( + QBXCostModel, _PythonQBXCostModel, make_pde_aware_translation_cost_model +) + +import time + +import logging +logger = logging.getLogger(__name__) + + +# {{{ Compare the time and result of OpenCL implementation and Python implementation + +def test_compare_cl_and_py_cost_model(ctx_factory): + nelements = 3600 + target_order = 16 + fmm_order = 5 + qbx_order = fmm_order + + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + actx = PyOpenCLArrayContext(queue) + + # {{{ Construct geometry + + from meshmode.mesh.generation import make_curve_mesh, starfish + mesh = make_curve_mesh(starfish, np.linspace(0, 1, nelements), target_order) + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + pre_density_discr = Discretization( + actx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order) + ) + + qbx = QBXLayerPotentialSource( + pre_density_discr, 4 * target_order, + qbx_order, + fmm_order=fmm_order + ) + places = GeometryCollection(qbx) + + from pytential.qbx.refinement import refine_geometry_collection + places = refine_geometry_collection(places) + + target_discrs_and_qbx_sides = tuple([(qbx.density_discr, 0)]) + geo_data_dev = qbx.qbx_fmm_geometry_data( + places, places.auto_source.geometry, target_discrs_and_qbx_sides + ) + + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper + geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data_dev) + + # }}} -from pytential.qbx.cost import CostModel + # {{{ Construct cost models + + cl_cost_model = QBXCostModel() + python_cost_model = _PythonQBXCostModel() + + tree = geo_data.tree() + xlat_cost = make_pde_aware_translation_cost_model( + tree.targets.shape[0], tree.nlevels + ) + + constant_one_params = QBXCostModel.get_unit_calibration_params() + constant_one_params["p_qbx"] = 5 + for ilevel in range(tree.nlevels): + constant_one_params["p_fmm_lev%d" % ilevel] = 10 + + cl_cost_factors = cl_cost_model.qbx_cost_factors_for_kernels_from_model( + queue, tree.nlevels, xlat_cost, constant_one_params + ) + + python_cost_factors = python_cost_model.qbx_cost_factors_for_kernels_from_model( + None, tree.nlevels, xlat_cost, constant_one_params + ) + + # }}} + + # {{{ Test process_form_qbxl + + cl_ndirect_sources_per_target_box = ( + cl_cost_model.get_ndirect_sources_per_target_box( + queue, geo_data_dev.traversal() + ) + ) + + queue.finish() + start_time = time.time() + + cl_p2qbxl = cl_cost_model.process_form_qbxl( + queue, geo_data_dev, cl_cost_factors["p2qbxl_cost"], + cl_ndirect_sources_per_target_box + ) + + queue.finish() + logger.info("OpenCL time for process_form_qbxl: {0}".format( + str(time.time() - start_time) + )) + + python_ndirect_sources_per_target_box = ( + python_cost_model.get_ndirect_sources_per_target_box( + queue, geo_data.traversal() + ) + ) + + start_time = time.time() + + python_p2qbxl = python_cost_model.process_form_qbxl( + queue, geo_data, python_cost_factors["p2qbxl_cost"], + python_ndirect_sources_per_target_box + ) + + logger.info("Python time for process_form_qbxl: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_p2qbxl.get(), python_p2qbxl) + + # }}} + + # {{{ Test process_m2qbxl + + queue.finish() + start_time = time.time() + + cl_m2qbxl = cl_cost_model.process_m2qbxl( + queue, geo_data_dev, cl_cost_factors["m2qbxl_cost"] + ) + + queue.finish() + logger.info("OpenCL time for process_m2qbxl: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_m2qbxl = python_cost_model.process_m2qbxl( + queue, geo_data, python_cost_factors["m2qbxl_cost"] + ) + + logger.info("Python time for process_m2qbxl: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_m2qbxl.get(), python_m2qbxl) + + # }}} + + # {{{ Test process_l2qbxl + + queue.finish() + start_time = time.time() + + cl_l2qbxl = cl_cost_model.process_l2qbxl( + queue, geo_data_dev, cl_cost_factors["l2qbxl_cost"] + ) + + queue.finish() + logger.info("OpenCL time for process_l2qbxl: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_l2qbxl = python_cost_model.process_l2qbxl( + queue, geo_data, python_cost_factors["l2qbxl_cost"] + ) + + logger.info("Python time for process_l2qbxl: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_l2qbxl.get(), python_l2qbxl) + + # }}} + + # {{{ Test process_eval_qbxl + + queue.finish() + start_time = time.time() + + cl_eval_qbxl = cl_cost_model.process_eval_qbxl( + queue, geo_data_dev, cl_cost_factors["qbxl2p_cost"] + ) + + queue.finish() + logger.info("OpenCL time for process_eval_qbxl: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_eval_qbxl = python_cost_model.process_eval_qbxl( + queue, geo_data, python_cost_factors["qbxl2p_cost"] + ) + + logger.info("Python time for process_eval_qbxl: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal(cl_eval_qbxl.get(), python_eval_qbxl) + + # }}} + + # {{{ Test eval_target_specific_qbxl + + queue.finish() + start_time = time.time() + + cl_eval_target_specific_qbxl = cl_cost_model.process_eval_target_specific_qbxl( + queue, geo_data_dev, cl_cost_factors["p2p_tsqbx_cost"], + cl_ndirect_sources_per_target_box + ) + + queue.finish() + logger.info("OpenCL time for eval_target_specific_qbxl: {0}".format( + str(time.time() - start_time) + )) + + start_time = time.time() + + python_eval_target_specific_qbxl = \ + python_cost_model.process_eval_target_specific_qbxl( + queue, geo_data, python_cost_factors["p2p_tsqbx_cost"], + python_ndirect_sources_per_target_box + ) + + logger.info("Python time for eval_target_specific_qbxl: {0}".format( + str(time.time() - start_time) + )) + + assert np.array_equal( + cl_eval_target_specific_qbxl.get(), python_eval_target_specific_qbxl + ) + + # }}} + +# }}} # {{{ global params @@ -135,11 +376,14 @@ def test_timing_data_gathering(ctx_factory): # {{{ test cost model -@pytest.mark.parametrize("dim, use_target_specific_qbx", ( - (2, False), - (3, False), - (3, True))) -def test_cost_model(ctx_factory, dim, use_target_specific_qbx): +@pytest.mark.parametrize("dim, use_target_specific_qbx, per_box", ( + (2, False, False), + (3, False, False), + (3, True, False), + (2, False, True), + (3, False, True), + (3, True, True))) +def test_cost_model(ctx_factory, dim, use_target_specific_qbx, per_box): """Test that cost model gathering can execute successfully.""" cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) @@ -147,7 +391,7 @@ def test_cost_model(ctx_factory, dim, use_target_specific_qbx): lpot_source = get_lpot_source(actx, dim).copy( _use_target_specific_qbx=use_target_specific_qbx, - cost_model=CostModel()) + cost_model=QBXCostModel()) places = GeometryCollection(lpot_source) density_discr = places.get_discretization(places.auto_source.geometry) @@ -158,14 +402,28 @@ def test_cost_model(ctx_factory, dim, use_target_specific_qbx): sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) op_S = bind(places, sym_op_S) - cost_S = op_S.get_modeled_cost(actx, sigma=sigma) + + if per_box: + cost_S, _ = op_S.cost_per_box("constant_one", sigma=sigma) + else: + cost_S, _ = op_S.cost_per_stage("constant_one", sigma=sigma) + assert len(cost_S) == 1 sym_op_S_plus_D = ( sym.S(k_sym, sigma_sym, qbx_forced_limit=+1) + sym.D(k_sym, sigma_sym, qbx_forced_limit="avg")) op_S_plus_D = bind(places, sym_op_S_plus_D) - cost_S_plus_D = op_S_plus_D.get_modeled_cost(actx, sigma=sigma) + + if per_box: + cost_S_plus_D, _ = op_S_plus_D.cost_per_box( + "constant_one", sigma=sigma + ) + else: + cost_S_plus_D, _ = op_S_plus_D.cost_per_stage( + "constant_one", sigma=sigma + ) + assert len(cost_S_plus_D) == 2 # }}} @@ -197,7 +455,10 @@ def test_cost_model_metadata_gathering(ctx_factory): sym_op_S = sym.S(k_sym, sigma_sym, qbx_forced_limit=+1, k=sym.var("k")) op_S = bind(places, sym_op_S) - cost_S = one(op_S.get_modeled_cost(actx, sigma=sigma, k=k).values()) + _, metadata = op_S.cost_per_stage( + "constant_one", sigma=sigma, k=k, return_metadata=True + ) + metadata = one(metadata.values()) geo_data = lpot_source.qbx_fmm_geometry_data( places, @@ -206,15 +467,15 @@ def test_cost_model_metadata_gathering(ctx_factory): tree = geo_data.tree() - assert cost_S.params["p_qbx"] == QBX_ORDER - assert cost_S.params["nlevels"] == tree.nlevels - assert cost_S.params["nsources"] == tree.nsources - assert cost_S.params["ntargets"] == tree.ntargets - assert cost_S.params["ncenters"] == geo_data.ncenters + assert metadata["p_qbx"] == QBX_ORDER + assert metadata["nlevels"] == tree.nlevels + assert metadata["nsources"] == tree.nsources + assert metadata["ntargets"] == tree.ntargets + assert metadata["ncenters"] == geo_data.ncenters for level in range(tree.nlevels): assert ( - cost_S.params["p_fmm_lev%d" % level] + metadata["p_fmm_lev%d" % level] == fmm_level_to_order(k_sym, {"k": 2}, tree, level)) # }}} @@ -444,9 +705,8 @@ def test_cost_model_correctness(ctx_factory, dim, off_surface, queue = cl.CommandQueue(cl_ctx) actx = PyOpenCLArrayContext(queue) - cost_model = ( - CostModel( - translation_cost_model_factory=OpCountingTranslationCostModel)) + cost_model = QBXCostModel( + translation_cost_model_factory=OpCountingTranslationCostModel) lpot_source = get_lpot_source(actx, dim).copy( cost_model=cost_model, @@ -479,7 +739,8 @@ def test_cost_model_correctness(ctx_factory, dim, off_surface, sigma = get_density(actx, density_discr) from pytools import one - cost_S = one(op_S.get_modeled_cost(actx, sigma=sigma).values()) + modeled_time, _ = op_S.cost_per_stage("constant_one", sigma=sigma) + modeled_time = one(modeled_time.values()) # Run FMM with ConstantOneWrangler. This can't be done with pytential's # high-level interface, so call the FMM driver directly. @@ -503,40 +764,43 @@ def test_cost_model_correctness(ctx_factory, dim, off_surface, # Check constant one wrangler for correctness. assert (potential == ndofs).all() - modeled_time = cost_S.get_predicted_times(merge_close_lists=True) - # Check that the cost model matches the timing data returned by the # constant one wrangler. mismatches = [] for stage in timing_data: - if timing_data[stage]["ops_elapsed"] != modeled_time[stage]: - mismatches.append( + if stage not in modeled_time: + assert timing_data[stage]["ops_elapsed"] == 0 + else: + if timing_data[stage]["ops_elapsed"] != modeled_time[stage]: + mismatches.append( (stage, timing_data[stage]["ops_elapsed"], modeled_time[stage])) assert not mismatches, "\n".join(str(s) for s in mismatches) -# }}} + # {{{ Test per-box cost + total_cost = 0.0 + for stage in timing_data: + total_cost += timing_data[stage]["ops_elapsed"] -# {{{ test order varying by level + per_box_cost, _ = op_S.cost_per_box("constant_one", sigma=sigma) + print(per_box_cost) + per_box_cost = one(per_box_cost.values()) -CONSTANT_ONE_PARAMS = dict( - c_l2l=1, - c_l2p=1, - c_l2qbxl=1, - c_m2l=1, - c_m2m=1, - c_m2p=1, - c_m2qbxl=1, - c_p2l=1, - c_p2m=1, - c_p2p=1, - c_p2qbxl=1, - c_qbxl2p=1, - c_p2p_tsqbx=1, - ) + total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost) + assert total_cost == ( + total_aggregate_cost + + modeled_time["coarsen_multipoles"] + + modeled_time["refine_locals"] + ) + + # }}} + +# }}} +# {{{ test order varying by level + def test_cost_model_order_varying_by_level(ctx_factory): """For FMM order varying by level, this checks to ensure that the costs are different. The varying-level case should have larger cost. @@ -552,8 +816,7 @@ def test_cost_model_order_varying_by_level(ctx_factory): return 1 lpot_source = get_lpot_source(actx, 2).copy( - cost_model=CostModel( - calibration_params=CONSTANT_ONE_PARAMS), + cost_model=QBXCostModel(), fmm_level_to_order=level_to_order_constant) places = GeometryCollection(lpot_source) @@ -565,35 +828,51 @@ def test_cost_model_order_varying_by_level(ctx_factory): sigma = get_density(actx, density_discr) - cost_constant = one( - bind(places, sym_op) - .get_modeled_cost(actx, sigma=sigma).values()) + cost_constant, metadata = bind(places, sym_op).cost_per_stage( + "constant_one", sigma=sigma) + + cost_constant = one(cost_constant.values()) + metadata = one(metadata.values()) # }}} # {{{ varying level to order - varying_order_params = cost_constant.params.copy() + def level_to_order_varying(kernel, kernel_args, tree, level): + return metadata["nlevels"] - level - nlevels = cost_constant.params["nlevels"] - for level in range(nlevels): - varying_order_params["p_fmm_lev%d" % level] = nlevels - level + lpot_source = get_lpot_source(actx, 2).copy( + cost_model=QBXCostModel(), + fmm_level_to_order=level_to_order_varying) + places = GeometryCollection(lpot_source) + + density_discr = places.get_discretization(places.auto_source.geometry) - cost_varying = cost_constant.with_params(varying_order_params) + sigma = get_density(actx, density_discr) + + cost_varying, _ = bind(lpot_source, sym_op).cost_per_stage( + "constant_one", sigma=sigma) + + cost_varying = one(cost_varying.values()) # }}} - assert ( - sum(cost_varying.get_predicted_times().values()) - > sum(cost_constant.get_predicted_times().values())) + assert sum(cost_varying.values()) > sum(cost_constant.values()) # }}} # You can test individual routines by typing # $ python test_cost_model.py 'test_routine()' +# You can specify the log level by setting 'LOGLEVEL' enviroment variable, for +# example +# $ LOGLEVEL=INFO python test_cost_model.py 'test_compare_cl_and_py_cost_model( +# $ cl.create_some_context)' if __name__ == "__main__": + import os + logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) + import sys if len(sys.argv) > 1: exec(sys.argv[1])