diff --git a/boxtree/cost.py b/boxtree/cost.py index 503090e17343c8b5674a0005cb28185e97f6c84e..79726d85914ce9253dd0ce055f492d8c4a496b71 100644 --- a/boxtree/cost.py +++ b/boxtree/cost.py @@ -216,7 +216,6 @@ class AbstractFMMCostModel(ABC): .. automethod:: get_unit_calibration_params """ - @abstractmethod def __init__( self, translation_cost_model_factory=make_pde_aware_translation_cost_model): @@ -228,9 +227,10 @@ class AbstractFMMCostModel(ABC): self.translation_cost_model_factory = translation_cost_model_factory @abstractmethod - def process_form_multipoles(self, traversal, p2m_cost): + def process_form_multipoles(self, queue, traversal, p2m_cost): """Cost for forming multipole expansions of each box. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels,) representing the cost of forming the multipole @@ -241,9 +241,10 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_coarsen_multipoles(self, traversal, m2m_cost): + def process_coarsen_multipoles(self, queue, traversal, m2m_cost): """Cost for upward propagation. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels-1,), where the ith entry represents the @@ -257,10 +258,11 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def get_ndirect_sources_per_target_box(self, traversal): + def get_ndirect_sources_per_target_box(self, queue, traversal): """Collect the number of direct evaluation sources (list 1, list 3 close and list 4 close) for each target box. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry representing the number of direct @@ -269,10 +271,11 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): """Direct evaluation cost of each target box of *traversal*. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry @@ -290,8 +293,9 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_list2(self, traversal, m2l_cost): + def process_list2(self, queue, traversal, m2l_cost): """ + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels,) representing the translation cost of each level. @@ -302,8 +306,10 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + def process_list3(self, queue, traversal, m2p_cost, + box_target_counts_nonchild=None): """ + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels,) where the ith entry represents the evaluation cost @@ -320,8 +326,9 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_list4(self, traversal, p2l_cost): + def process_list4(self, queue, traversal, p2l_cost): """ + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels,) where the ith entry represents the translation cost @@ -333,9 +340,10 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_eval_locals(self, traversal, l2p_cost, + def process_eval_locals(self, queue, traversal, l2p_cost, box_target_counts_nonchild=None): """ + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels,) where the ith entry represents the cost of evaluating @@ -353,9 +361,10 @@ class AbstractFMMCostModel(ABC): pass @abstractmethod - def process_refine_locals(self, traversal, l2l_cost): + def process_refine_locals(self, queue, traversal, l2l_cost): """Cost of downward propagation. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (nlevels-1,), where the ith entry represents the cost of @@ -378,17 +387,34 @@ class AbstractFMMCostModel(ABC): """ pass - def fmm_cost_factors_for_kernels_from_model(self, nlevels, xlat_cost, context): + @staticmethod + def cost_factors_to_dev(cost_factors, queue): + cost_factors_dev = {} + + for name in cost_factors: + if not isinstance(cost_factors[name], np.ndarray): + cost_factors_dev[name] = cost_factors[name] + continue + cost_factors_dev[name] = cl.array.to_device( + queue, cost_factors[name] + ).with_queue(None) + + return cost_factors_dev + + def fmm_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): """Evaluate translation cost factors from symbolic model. The result of this function can be used for process_* methods in this class. + :arg queue: If not None, the cost factor arrays will be transferred to device + using this queue. :arg nlevels: the number of tree levels. :arg xlat_cost: a :class:`FMMTranslationCostModel`. :arg context: a :class:`dict` of parameters passed as context when evaluating symbolic expressions in *xlat_cost*. :return: a :class:`dict`, the translation cost of each step in FMM. """ - return { + cost_factors = { "p2m_cost": np.array([ evaluate(xlat_cost.p2m(ilevel), context=context) for ilevel in range(nlevels) @@ -420,22 +446,29 @@ class AbstractFMMCostModel(ABC): ], dtype=np.float64) } + if queue: + cost_factors = self.cost_factors_to_dev(cost_factors, queue) + + return cost_factors + @abstractmethod - def zero_cost_per_box(self, nboxes): + def zero_cost_per_box(self, queue, nboxes): """Helper function for returning the per-box cost filled with 0. + :arg queue: a :class:`pyopencl.CommandQueue` object. :param nboxes: the number of boxes :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape (*nboxes*,), representing the zero per-box cost. """ pass - def cost_per_box(self, traversal, level_to_order, + def cost_per_box(self, queue, traversal, level_to_order, calibration_params, ndirect_sources_per_target_box=None, box_target_counts_nonchild=None): """Predict the per-box costs of a new traversal object. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg level_to_order: a :class:`numpy.ndarray` of shape (traversal.tree.nlevels,) representing the expansion orders @@ -460,7 +493,8 @@ class AbstractFMMCostModel(ABC): """ if ndirect_sources_per_target_box is None: ndirect_sources_per_target_box = ( - self.get_ndirect_sources_per_target_box(traversal)) + self.get_ndirect_sources_per_target_box(queue, traversal) + ) tree = traversal.tree nboxes = tree.nboxes @@ -468,7 +502,7 @@ class AbstractFMMCostModel(ABC): target_boxes = traversal.target_boxes target_or_target_parent_boxes = traversal.target_or_target_parent_boxes - result = self.zero_cost_per_box(nboxes) + result = self.zero_cost_per_box(queue, nboxes) for ilevel in range(tree.nlevels): calibration_params["p_fmm_lev%d" % ilevel] = level_to_order[ilevel] @@ -478,47 +512,49 @@ class AbstractFMMCostModel(ABC): ) translation_cost = self.fmm_cost_factors_for_kernels_from_model( - tree.nlevels, xlat_cost, calibration_params + queue, tree.nlevels, xlat_cost, calibration_params ) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild result[source_boxes] += self.process_form_multipoles( - traversal, translation_cost["p2m_cost"] + queue, traversal, translation_cost["p2m_cost"] ) result[target_boxes] += self.process_direct( - traversal, ndirect_sources_per_target_box, translation_cost["c_p2p"], + queue, traversal, ndirect_sources_per_target_box, + translation_cost["c_p2p"], box_target_counts_nonchild=box_target_counts_nonchild ) result[target_or_target_parent_boxes] += self.process_list2( - traversal, translation_cost["m2l_cost"] + queue, traversal, translation_cost["m2l_cost"] ) result += self.process_list3( - traversal, translation_cost["m2p_cost"], + queue, traversal, translation_cost["m2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) result[target_or_target_parent_boxes] += self.process_list4( - traversal, translation_cost["p2l_cost"] + queue, traversal, translation_cost["p2l_cost"] ) result[target_boxes] += self.process_eval_locals( - traversal, translation_cost["l2p_cost"], + queue, traversal, translation_cost["l2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) return result - def cost_per_stage(self, traversal, level_to_order, + def cost_per_stage(self, queue, traversal, level_to_order, calibration_params, ndirect_sources_per_target_box=None, box_target_counts_nonchild=None): """Predict the per-stage costs of a new traversal object. + :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg level_to_order: a :class:`numpy.ndarray` of shape (traversal.tree.nlevels,) representing the expansion orders @@ -541,7 +577,8 @@ class AbstractFMMCostModel(ABC): """ if ndirect_sources_per_target_box is None: ndirect_sources_per_target_box = ( - self.get_ndirect_sources_per_target_box(traversal)) + self.get_ndirect_sources_per_target_box(queue, traversal) + ) tree = traversal.tree result = {} @@ -554,49 +591,52 @@ class AbstractFMMCostModel(ABC): ) translation_cost = self.fmm_cost_factors_for_kernels_from_model( - tree.nlevels, xlat_cost, calibration_params + queue, tree.nlevels, xlat_cost, calibration_params ) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild result["form_multipoles"] = self.aggregate_over_boxes( - self.process_form_multipoles(traversal, translation_cost["p2m_cost"]) + self.process_form_multipoles( + queue, traversal, translation_cost["p2m_cost"] + ) ) result["coarsen_multipoles"] = self.process_coarsen_multipoles( - traversal, translation_cost["m2m_cost"] + queue, traversal, translation_cost["m2m_cost"] ) result["eval_direct"] = self.aggregate_over_boxes( self.process_direct( - traversal, ndirect_sources_per_target_box, translation_cost["c_p2p"], + queue, traversal, ndirect_sources_per_target_box, + translation_cost["c_p2p"], box_target_counts_nonchild=box_target_counts_nonchild ) ) result["multipole_to_local"] = self.aggregate_over_boxes( - self.process_list2(traversal, translation_cost["m2l_cost"]) + self.process_list2(queue, traversal, translation_cost["m2l_cost"]) ) result["eval_multipoles"] = self.aggregate_over_boxes( self.process_list3( - traversal, translation_cost["m2p_cost"], + queue, traversal, translation_cost["m2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) ) result["form_locals"] = self.aggregate_over_boxes( - self.process_list4(traversal, translation_cost["p2l_cost"]) + self.process_list4(queue, traversal, translation_cost["p2l_cost"]) ) result["refine_locals"] = self.process_refine_locals( - traversal, translation_cost["l2l_cost"] + queue, traversal, translation_cost["l2l_cost"] ) result["eval_locals"] = self.aggregate_over_boxes( self.process_eval_locals( - traversal, translation_cost["l2p_cost"], + queue, traversal, translation_cost["l2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) ) @@ -698,26 +738,14 @@ class FMMCostModel(AbstractFMMCostModel): .. note:: For methods in this class, argument *traversal* should live in device memory. """ - def __init__( - self, queue, - translation_cost_model_factory=make_pde_aware_translation_cost_model): - """ - :arg queue: a :class:`pyopencl.CommandQueue` object on which the execution - of this object runs. - :arg translation_cost_model_factory: a function, which takes tree dimension - and the number of tree levels as arguments, returns an object of - :class:`TranslationCostModel`. - """ - self.queue = queue - AbstractFMMCostModel.__init__(self, translation_cost_model_factory) # {{{ form multipoles @memoize_method - def process_form_multipoles_knl(self, box_id_dtype, particle_id_dtype, + def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(r""" double *np2m, ${box_id_t} *source_boxes, @@ -742,13 +770,12 @@ class FMMCostModel(AbstractFMMCostModel): name="process_form_multipoles" ) - def process_form_multipoles(self, traversal, p2m_cost): + def process_form_multipoles(self, queue, traversal, p2m_cost): tree = traversal.tree - np2m = cl.array.zeros( - self.queue, len(traversal.source_boxes), dtype=np.float64 - ) + np2m = cl.array.zeros(queue, len(traversal.source_boxes), dtype=np.float64) process_form_multipoles_knl = self.process_form_multipoles_knl( + queue.context, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -767,10 +794,10 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ propagate multipoles upward @memoize_method - def process_coarsen_multipoles_knl(self, ndimensions, box_id_dtype, + def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype, box_level_dtype, nlevels): return ElementwiseKernel( - self.queue.context, + context, Template(r""" ${box_id_t} *source_parent_boxes, ${box_level_t} *box_levels, @@ -810,13 +837,14 @@ class FMMCostModel(AbstractFMMCostModel): name="process_coarsen_multipoles" ) - def process_coarsen_multipoles(self, traversal, m2m_cost): + def process_coarsen_multipoles(self, queue, traversal, m2m_cost): tree = traversal.tree nm2m = cl.array.zeros( - self.queue, len(traversal.source_parent_boxes), dtype=np.float64 + queue, len(traversal.source_parent_boxes), dtype=np.float64 ) process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl( + queue.context, tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels ) @@ -826,7 +854,7 @@ class FMMCostModel(AbstractFMMCostModel): m2m_cost, nm2m, *tree.box_child_ids, - queue=self.queue + queue=queue ) return self.aggregate_over_boxes(nm2m) @@ -836,9 +864,9 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) @memoize_method - def _get_ndirect_sources_knl(self, particle_id_dtype, box_id_dtype): + def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(""" ${particle_id_t} *ndirect_sources_by_itgt_box, ${box_id_t} *source_boxes_starts, @@ -871,18 +899,18 @@ class FMMCostModel(AbstractFMMCostModel): name="get_ndirect_sources" ) - def get_ndirect_sources_per_target_box(self, traversal): + def get_ndirect_sources_per_target_box(self, queue, traversal): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) particle_id_dtype = tree.particle_id_dtype box_id_dtype = tree.box_id_dtype get_ndirect_sources_knl = self._get_ndirect_sources_knl( - particle_id_dtype, box_id_dtype + queue.context, particle_id_dtype, box_id_dtype ) ndirect_sources_by_itgt_box = cl.array.zeros( - self.queue, ntarget_boxes, dtype=particle_id_dtype + queue, ntarget_boxes, dtype=particle_id_dtype ) # List 1 @@ -895,7 +923,7 @@ class FMMCostModel(AbstractFMMCostModel): # List 3 close if traversal.from_sep_close_smaller_starts is not None: - self.queue.finish() + queue.finish() get_ndirect_sources_knl( ndirect_sources_by_itgt_box, traversal.from_sep_close_smaller_starts, @@ -905,7 +933,7 @@ class FMMCostModel(AbstractFMMCostModel): # List 4 close if traversal.from_sep_close_bigger_starts is not None: - self.queue.finish() + queue.finish() get_ndirect_sources_knl( ndirect_sources_by_itgt_box, traversal.from_sep_close_bigger_starts, @@ -915,7 +943,7 @@ class FMMCostModel(AbstractFMMCostModel): return ndirect_sources_by_itgt_box - def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild @@ -924,7 +952,7 @@ class FMMCostModel(AbstractFMMCostModel): ntargets_by_itgt_box = take( box_target_counts_nonchild, traversal.target_boxes, - queue=self.queue + queue=queue ) return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost @@ -934,9 +962,9 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ translate separated siblings' ("list 2") mpoles to local @memoize_method - def process_list2_knl(self, box_id_dtype, box_level_dtype): + def process_list2_knl(self, context, box_id_dtype, box_level_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(r""" double *nm2l, ${box_id_t} *target_or_target_parent_boxes, @@ -960,17 +988,19 @@ class FMMCostModel(AbstractFMMCostModel): name="process_list2" ) - def process_list2(self, traversal, m2l_cost): + def process_list2(self, queue, traversal, m2l_cost): tree = traversal.tree box_id_dtype = tree.box_id_dtype box_level_dtype = tree.box_level_dtype ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) nm2l = cl.array.zeros( - self.queue, (ntarget_or_target_parent_boxes,), dtype=np.float64 + queue, (ntarget_or_target_parent_boxes,), dtype=np.float64 ) - process_list2_knl = self.process_list2_knl(box_id_dtype, box_level_dtype) + process_list2_knl = self.process_list2_knl( + queue.context, box_id_dtype, box_level_dtype + ) process_list2_knl( nm2l, traversal.target_or_target_parent_boxes, @@ -986,9 +1016,9 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ evaluate sep. smaller mpoles ("list 3") at particles @memoize_method - def process_list3_knl(self, box_id_dtype, particle_id_dtype): + def process_list3_knl(self, context, box_id_dtype, particle_id_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(r""" ${box_id_t} *target_boxes_sep_smaller, ${box_id_t} *sep_smaller_start, @@ -1014,15 +1044,16 @@ class FMMCostModel(AbstractFMMCostModel): name="process_list3" ) - def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + def process_list3(self, queue, traversal, m2p_cost, + box_target_counts_nonchild=None): tree = traversal.tree - nm2p = cl.array.zeros(self.queue, tree.nboxes, dtype=np.float64) + nm2p = cl.array.zeros(queue, tree.nboxes, dtype=np.float64) if box_target_counts_nonchild is None: box_target_counts_nonchild = tree.box_target_counts_nonchild process_list3_knl = self.process_list3_knl( - tree.box_id_dtype, tree.particle_id_dtype + queue.context, tree.box_id_dtype, tree.particle_id_dtype ) for ilevel, sep_smaller_list in enumerate( @@ -1031,11 +1062,10 @@ class FMMCostModel(AbstractFMMCostModel): traversal.target_boxes_sep_smaller_by_source_level[ilevel], sep_smaller_list.starts, box_target_counts_nonchild, - m2p_cost[ilevel].get().reshape(-1)[0], + m2p_cost[ilevel].get(queue=queue).reshape(-1)[0], nm2p, - queue=self.queue + queue=queue ) - self.queue.finish() return nm2p @@ -1044,9 +1074,10 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ form locals for separated bigger source boxes ("list 4") @memoize_method - def process_list4_knl(self, box_id_dtype, particle_id_dtype, box_level_dtype): + def process_list4_knl(self, context, + box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(r""" double *nm2p, ${box_id_t} *from_sep_bigger_starts, @@ -1076,14 +1107,15 @@ class FMMCostModel(AbstractFMMCostModel): name="process_list4" ) - def process_list4(self, traversal, p2l_cost): + def process_list4(self, queue, traversal, p2l_cost): tree = traversal.tree target_or_target_parent_boxes = traversal.target_or_target_parent_boxes nm2p = cl.array.zeros( - self.queue, len(target_or_target_parent_boxes), dtype=np.float64 + queue, len(target_or_target_parent_boxes), dtype=np.float64 ) process_list4_knl = self.process_list4_knl( + queue.context, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -1103,10 +1135,10 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ evaluate local expansions at targets @memoize_method - def process_eval_locals_knl(self, box_id_dtype, particle_id_dtype, + def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - self.queue.context, + context, Template(r""" double *neval_locals, ${box_id_t} *target_boxes, @@ -1131,16 +1163,17 @@ class FMMCostModel(AbstractFMMCostModel): name="process_eval_locals" ) - def process_eval_locals(self, traversal, l2p_cost, + def process_eval_locals(self, queue, traversal, l2p_cost, box_target_counts_nonchild=None): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) - neval_locals = cl.array.zeros(self.queue, ntarget_boxes, dtype=np.float64) + neval_locals = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild process_eval_locals_knl = self.process_eval_locals_knl( + queue.context, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -1159,10 +1192,10 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ propogate locals downward @memoize_method - def process_refine_locals_knl(self, box_id_dtype): + def process_refine_locals_knl(self, context, box_id_dtype): from pyopencl.reduction import ReductionKernel return ReductionKernel( - self.queue.context, + context, np.float64, neutral="0.0", reduce_expr="a+b", @@ -1180,12 +1213,14 @@ class FMMCostModel(AbstractFMMCostModel): name="process_refine_locals" ) - def process_refine_locals(self, traversal, l2l_cost): + def process_refine_locals(self, queue, traversal, l2l_cost): tree = traversal.tree - process_refine_locals_knl = self.process_refine_locals_knl(tree.box_id_dtype) + process_refine_locals_knl = self.process_refine_locals_knl( + queue.context, tree.box_id_dtype + ) level_start_target_or_target_parent_box_nrs = cl.array.to_device( - self.queue, traversal.level_start_target_or_target_parent_box_nrs + queue, traversal.level_start_target_or_target_parent_box_nrs ) cost = process_refine_locals_knl( @@ -1198,8 +1233,8 @@ class FMMCostModel(AbstractFMMCostModel): # }}} - def zero_cost_per_box(self, nboxes): - return cl.array.zeros(self.queue, (nboxes,), dtype=np.float64) + def zero_cost_per_box(self, queue, nboxes): + return cl.array.zeros(queue, (nboxes,), dtype=np.float64) def aggregate_over_boxes(self, per_box_result): if isinstance(per_box_result, float): @@ -1207,40 +1242,23 @@ class FMMCostModel(AbstractFMMCostModel): else: return cl.array.sum(per_box_result).get().reshape(-1)[0] - def translation_costs_to_dev(self, translation_costs): - """This helper function transfers all :class:`numpy.ndarray` fields in - *translation_costs* to device memory as :class:`pyopencl.array.Array`. - """ - for name in translation_costs: - if not isinstance(translation_costs[name], np.ndarray): - continue - translation_costs[name] = cl.array.to_device( - self.queue, translation_costs[name] - ) - - return translation_costs + def fmm_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): + if not isinstance(queue, cl.CommandQueue): + raise TypeError( + "An OpenCL command queue must be supplied for cost model") - def fmm_cost_factors_for_kernels_from_model(self, nlevels, xlat_cost, context): - translation_costs = ( - AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( - self, nlevels, xlat_cost, context - ) + return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context ) - return self.translation_costs_to_dev(translation_costs) - # }}} # {{{ _PythonFMMCostModel (undocumented, only used for testing) class _PythonFMMCostModel(AbstractFMMCostModel): - def __init__( - self, - translation_cost_model_factory=make_pde_aware_translation_cost_model): - AbstractFMMCostModel.__init__(self, translation_cost_model_factory) - - def process_form_multipoles(self, traversal, p2m_cost): + def process_form_multipoles(self, queue, traversal, p2m_cost): tree = traversal.tree np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64) @@ -1253,7 +1271,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return np2m - def get_ndirect_sources_per_target_box(self, traversal): + def get_ndirect_sources_per_target_box(self, queue, traversal): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) @@ -1287,7 +1305,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return ndirect_sources_by_itgt_box - def process_direct(self, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild @@ -1296,7 +1314,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost - def process_list2(self, traversal, m2l_cost): + def process_list2(self, queue, traversal, m2l_cost): tree = traversal.tree ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64) @@ -1309,7 +1327,8 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return nm2l - def process_list3(self, traversal, m2p_cost, box_target_counts_nonchild=None): + def process_list3(self, queue, traversal, m2p_cost, + box_target_counts_nonchild=None): tree = traversal.tree nm2p = np.zeros(tree.nboxes, dtype=np.float64) @@ -1326,7 +1345,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return nm2p - def process_list4(self, traversal, p2l_cost): + def process_list4(self, queue, traversal, p2l_cost): tree = traversal.tree target_or_target_parent_boxes = traversal.target_or_target_parent_boxes nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64) @@ -1340,7 +1359,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return nm2p - def process_eval_locals(self, traversal, l2p_cost, + def process_eval_locals(self, queue, traversal, l2p_cost, box_target_counts_nonchild=None): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) @@ -1358,7 +1377,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return neval_locals - def process_coarsen_multipoles(self, traversal, m2m_cost): + def process_coarsen_multipoles(self, queue, traversal, m2m_cost): tree = traversal.tree result = 0.0 @@ -1384,7 +1403,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return result - def process_refine_locals(self, traversal, l2l_cost): + def process_refine_locals(self, queue, traversal, l2l_cost): tree = traversal.tree result = 0.0 @@ -1396,7 +1415,7 @@ class _PythonFMMCostModel(AbstractFMMCostModel): return result - def zero_cost_per_box(self, nboxes): + def zero_cost_per_box(self, queue, nboxes): return np.zeros(nboxes, dtype=np.float64) def aggregate_over_boxes(self, per_box_result): @@ -1405,6 +1424,12 @@ class _PythonFMMCostModel(AbstractFMMCostModel): else: return np.sum(per_box_result) + def fmm_cost_factors_for_kernels_from_model( + self, queue, nlevels, xlat_cost, context): + return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( + self, None, nlevels, xlat_cost, context + ) + # }}} diff --git a/examples/cost_model.py b/examples/cost_model.py index cd30e0f773c2c29352b428b887601c5c1e4da115..362ca5950236153e4bdb87ba35a6a4b2d5987744 100644 --- a/examples/cost_model.py +++ b/examples/cost_model.py @@ -90,25 +90,27 @@ def demo_cost_model(): from boxtree.cost import FMMCostModel from boxtree.cost import make_pde_aware_translation_cost_model - cost_model = FMMCostModel(queue, make_pde_aware_translation_cost_model) + cost_model = FMMCostModel(make_pde_aware_translation_cost_model) model_results = [] for icase in range(len(traversals)-1): traversal = traversals_dev[icase] model_results.append( cost_model.cost_per_stage( - traversal, level_to_orders[icase], + queue, traversal, level_to_orders[icase], FMMCostModel.get_unit_calibration_params(), ) ) + queue.finish() params = cost_model.estimate_calibration_params( model_results, timing_results[:-1], time_field_name=time_field_name ) predicted_time = cost_model.cost_per_stage( - traversals_dev[-1], level_to_orders[-1], params, + queue, traversals_dev[-1], level_to_orders[-1], params, ) + queue.finish() for field in ["form_multipoles", "eval_direct", "multipole_to_local", "eval_multipoles", "form_locals", "eval_locals", diff --git a/test/test_cost_model.py b/test/test_cost_model.py index 236ea4bd3b13357c55869007f289f34693726793..c5cde123e874968862f69dc293fcbbb08e812c46 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -91,7 +91,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty # {{{ Construct cost models - cl_cost_model = FMMCostModel(queue, None) + cl_cost_model = FMMCostModel(None) python_cost_model = _PythonFMMCostModel(None) constant_one_params = cl_cost_model.get_unit_calibration_params().copy() @@ -117,7 +117,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() cl_form_multipoles = cl_cost_model.process_form_multipoles( - trav_dev, p2m_cost_dev + queue, trav_dev, p2m_cost_dev ) queue.finish() @@ -128,7 +128,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() python_form_multipoles = python_cost_model.process_form_multipoles( - trav, p2m_cost + queue, trav, p2m_cost ) logger.info("Python time for process_form_multipoles: {0}".format( @@ -152,7 +152,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty queue.finish() start_time = time.time() cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles( - trav_dev, m2m_cost_dev + queue, trav_dev, m2m_cost_dev ) queue.finish() @@ -163,7 +163,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles( - trav, m2m_cost + queue, trav, m2m_cost ) logger.info("Python time for coarsen_multipoles: {0}".format( @@ -180,10 +180,10 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() cl_ndirect_sources_per_target_box = \ - cl_cost_model.get_ndirect_sources_per_target_box(trav_dev) + cl_cost_model.get_ndirect_sources_per_target_box(queue, trav_dev) cl_direct = cl_cost_model.process_direct( - trav_dev, cl_ndirect_sources_per_target_box, 5.0 + queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0 ) queue.finish() @@ -194,10 +194,10 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() python_ndirect_sources_per_target_box = \ - python_cost_model.get_ndirect_sources_per_target_box(trav) + python_cost_model.get_ndirect_sources_per_target_box(queue, trav) python_direct = python_cost_model.process_direct( - trav, python_ndirect_sources_per_target_box, 5.0 + queue, trav, python_ndirect_sources_per_target_box, 5.0 ) logger.info("Python time for process_direct: {0}".format( @@ -245,7 +245,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty queue.finish() start_time = time.time() - cl_m2l_cost = cl_cost_model.process_list2(trav_dev, m2l_cost_dev) + cl_m2l_cost = cl_cost_model.process_list2(queue, trav_dev, m2l_cost_dev) queue.finish() logger.info("OpenCL time for process_list2: {0}".format( @@ -253,7 +253,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty )) start_time = time.time() - python_m2l_cost = python_cost_model.process_list2(trav, m2l_cost) + python_m2l_cost = python_cost_model.process_list2(queue, trav, m2l_cost) logger.info("Python time for process_list2: {0}".format( str(time.time() - start_time) )) @@ -275,7 +275,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty queue.finish() start_time = time.time() - cl_m2p_cost = cl_cost_model.process_list3(trav_dev, m2p_cost_dev) + cl_m2p_cost = cl_cost_model.process_list3(queue, trav_dev, m2p_cost_dev) queue.finish() logger.info("OpenCL time for process_list3: {0}".format( @@ -283,7 +283,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty )) start_time = time.time() - python_m2p_cost = python_cost_model.process_list3(trav, m2p_cost) + python_m2p_cost = python_cost_model.process_list3(queue, trav, m2p_cost) logger.info("Python time for process_list3: {0}".format( str(time.time() - start_time) )) @@ -305,7 +305,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty queue.finish() start_time = time.time() - cl_p2l_cost = cl_cost_model.process_list4(trav_dev, p2l_cost_dev) + cl_p2l_cost = cl_cost_model.process_list4(queue, trav_dev, p2l_cost_dev) queue.finish() logger.info("OpenCL time for process_list4: {0}".format( @@ -313,7 +313,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty )) start_time = time.time() - python_p2l_cost = python_cost_model.process_list4(trav, p2l_cost) + python_p2l_cost = python_cost_model.process_list4(queue, trav, p2l_cost) logger.info("Python time for process_list4: {0}".format( str(time.time() - start_time) )) @@ -336,7 +336,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() cl_refine_locals_cost = cl_cost_model.process_refine_locals( - trav_dev, l2l_cost_dev + queue, trav_dev, l2l_cost_dev ) queue.finish() @@ -346,7 +346,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty start_time = time.time() python_refine_locals_cost = python_cost_model.process_refine_locals( - trav, l2l_cost + queue, trav, l2l_cost ) logger.info("Python time for refine_locals: {0}".format( str(time.time() - start_time) @@ -369,7 +369,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty queue.finish() start_time = time.time() - cl_l2p_cost = cl_cost_model.process_eval_locals(trav_dev, l2p_cost_dev) + cl_l2p_cost = cl_cost_model.process_eval_locals(queue, trav_dev, l2p_cost_dev) queue.finish() logger.info("OpenCL time for process_eval_locals: {0}".format( @@ -377,7 +377,7 @@ def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dty )) start_time = time.time() - python_l2p_cost = python_cost_model.process_eval_locals(trav, l2p_cost) + python_l2p_cost = python_cost_model.process_eval_locals(queue, trav, l2p_cost) logger.info("Python time for process_eval_locals: {0}".format( str(time.time() - start_time) )) @@ -481,7 +481,7 @@ def test_estimate_calibration_params(ctx_factory): level_to_order = level_to_orders[icase] python_model_results.append(python_cost_model.cost_per_stage( - traversal, level_to_order, + queue, traversal, level_to_order, _PythonFMMCostModel.get_unit_calibration_params(), )) @@ -491,7 +491,7 @@ def test_estimate_calibration_params(ctx_factory): test_params_sanity(python_params) - cl_cost_model = FMMCostModel(queue, make_pde_aware_translation_cost_model) + cl_cost_model = FMMCostModel(make_pde_aware_translation_cost_model) cl_model_results = [] @@ -500,13 +500,14 @@ def test_estimate_calibration_params(ctx_factory): level_to_order = level_to_orders[icase] cl_model_results.append(cl_cost_model.cost_per_stage( - traversal, level_to_order, + queue, traversal, level_to_order, FMMCostModel.get_unit_calibration_params(), )) cl_params = cl_cost_model.estimate_calibration_params( cl_model_results, timing_results[:-1], time_field_name=time_field_name ) + test_params_sanity(cl_params) if SUPPORTS_PROCESS_TIME: @@ -583,14 +584,13 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( drive_fmm(trav, wrangler, src_weights, timing_data=timing_data) cost_model = FMMCostModel( - queue, translation_cost_model_factory=OpCountingTranslationCostModel ) level_to_order = np.array([1 for _ in range(tree.nlevels)]) modeled_time = cost_model.cost_per_stage( - trav_dev, level_to_order, + queue, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) @@ -609,10 +609,11 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( total_cost += timing_data[stage]["ops_elapsed"] per_box_cost = cost_model.cost_per_box( - trav_dev, level_to_order, + queue, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost) + assert total_cost == ( total_aggregate_cost + modeled_time["coarsen_multipoles"]