From f8f88319e2c49833a4b2294f4cdee21abdd57ef4 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 2 Apr 2018 12:05:47 -0500 Subject: [PATCH 01/86] Update gitlab CI --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 750bf6f4..75d900ef 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,6 +11,7 @@ Python 3.5 POCL: - large-node except: - tags + - distributed-fmm Python 3.6 POCL: script: @@ -25,6 +26,7 @@ Python 3.6 POCL: - large-node except: - tags + - distributed-fmm Python 3.6 POCL Examples: script: @@ -39,6 +41,7 @@ Python 3.6 POCL Examples: - large-node except: - tags + - distributed-fmm Python 3.5 Conda: script: @@ -52,6 +55,7 @@ Python 3.5 Conda: - large-node except: - tags + - distributed-fmm Python 2.7 POCL: script: @@ -66,6 +70,7 @@ Python 2.7 POCL: - large-node except: - tags + - distributed-fmm Python 3.5 Conda Apple: script: @@ -80,6 +85,7 @@ Python 3.5 Conda Apple: - apple except: - tags + - distributed-fmm retry: 2 Documentation: -- GitLab From 6d49de9ba68421d708fc3016892758cef0eb7d60 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 3 May 2018 16:18:53 -0500 Subject: [PATCH 02/86] Distributed interface --- .gitignore | 3 + pytential/qbx/__init__.py | 17 +++- pytential/qbx/distributed.py | 160 +++++++++++++++++++++++++++++++++++ test/test_distributed.py | 82 ++++++++++++++++++ 4 files changed, 259 insertions(+), 3 deletions(-) create mode 100644 pytential/qbx/distributed.py create mode 100644 test/test_distributed.py diff --git a/.gitignore b/.gitignore index c5d29d7c..08045820 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,9 @@ a.out hk test/*.pdf examples/*.pdf +.idea +.pytest_cache +.vscode *.dot diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index f1f53cd6..8800dfc4 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -646,6 +646,14 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): fmm_mpole_factory, fmm_local_factory, 
qbx_local_factory, out_kernels) + elif self.fmm_backend == 'distributed': + from pytential.qbx.distributed import \ + QBXDistributedFMMLibExpansionWranglerCodeContainer + return QBXDistributedFMMLibExpansionWranglerCodeContainer( + self.cl_context, + fmm_mpole_factory, fmm_local_factory, qbx_local_factory, + out_kernels) + else: raise ValueError("invalid FMM backend: %s" % self.fmm_backend) @@ -726,9 +734,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # }}} # {{{ execute global QBX - - from pytential.qbx.fmm import drive_fmm - all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) + if self.fmm_backend == 'distributed': + from pytential.qbx.distributed import drive_dfmm + all_potentials_on_every_tgt = drive_dfmm(wrangler, strengths) + else: + from pytential.qbx.fmm import drive_fmm + all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) # }}} diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py new file mode 100644 index 00000000..c46f0009 --- /dev/null +++ b/pytential/qbx/distributed.py @@ -0,0 +1,160 @@ +from pytential.qbx.fmmlib import ( + QBXFMMLibExpansionWranglerCodeContainer) +from mpi4py import MPI + + +# {{{ Expansion Wrangler + +class QBXDistributedFMMLibExpansionWranglerCodeContainer( + QBXFMMLibExpansionWranglerCodeContainer): + pass + +# }}} + + +# {{{ FMM Driver + +def drive_dfmm(expansion_wrangler, src_weights, comm=MPI.COMM_WORLD): + current_rank = comm.Get_rank() + # total_rank = comm.Get_size() + + if current_rank == 0: + wrangler = expansion_wrangler + + geo_data = wrangler.geo_data + traversal = geo_data.traversal() + tree = traversal.tree + + # Interface guidelines: Attributes of the tree are assumed to be known + # to the expansion wrangler and should not be passed. 
+ + src_weights = wrangler.reorder_sources(src_weights) + + # {{{ construct local multipoles + + mpole_exps = wrangler.form_multipoles( + traversal.level_start_source_box_nrs, + traversal.source_boxes, + src_weights) + + # }}} + + # {{{ propagate multipoles upward + + wrangler.coarsen_multipoles( + traversal.level_start_source_parent_box_nrs, + traversal.source_parent_boxes, + mpole_exps) + + # }}} + + # {{{ direct evaluation from neighbor source boxes ("list 1") + + non_qbx_potentials = wrangler.eval_direct( + traversal.target_boxes, + traversal.neighbor_source_boxes_starts, + traversal.neighbor_source_boxes_lists, + src_weights) + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + local_exps = wrangler.multipole_to_local( + traversal.level_start_target_or_target_parent_box_nrs, + traversal.target_or_target_parent_boxes, + traversal.from_sep_siblings_starts, + traversal.from_sep_siblings_lists, + mpole_exps) + + # }}} + + # {{{ evaluate sep. smaller mpoles ("list 3") at particles + + # (the point of aiming this stage at particles is specifically to keep its + # contribution *out* of the downward-propagating local expansions) + + non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( + traversal.target_boxes_sep_smaller_by_source_level, + traversal.from_sep_smaller_by_level, + mpole_exps) + + # assert that list 3 close has been merged into list 1 + assert traversal.from_sep_close_smaller_starts is None + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + local_exps = local_exps + wrangler.form_locals( + traversal.level_start_target_or_target_parent_box_nrs, + traversal.target_or_target_parent_boxes, + traversal.from_sep_bigger_starts, + traversal.from_sep_bigger_lists, + src_weights) + + # assert that list 4 close has been merged into list 1 + assert traversal.from_sep_close_bigger_starts is None + + # }}} + + # {{{ propagate local_exps downward + + wrangler.refine_locals( + 
traversal.level_start_target_or_target_parent_box_nrs, + traversal.target_or_target_parent_boxes, + local_exps) + + # }}} + + # {{{ evaluate locals + + non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( + traversal.level_start_target_box_nrs, + traversal.target_boxes, + local_exps) + + # }}} + + # {{{ wrangle qbx expansions + + qbx_expansions = wrangler.form_global_qbx_locals(src_weights) + + qbx_expansions = qbx_expansions + \ + wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) + + qbx_expansions = qbx_expansions + \ + wrangler.translate_box_local_to_qbx_local(local_exps) + + qbx_potentials = wrangler.eval_qbx_expansions( + qbx_expansions) + + # }}} + + # {{{ reorder potentials + + nqbtl = geo_data.non_qbx_box_target_lists() + + all_potentials_in_tree_order = wrangler.full_output_zeros() + + for ap_i, nqp_i in zip(all_potentials_in_tree_order, non_qbx_potentials): + ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i + + all_potentials_in_tree_order += qbx_potentials + + def reorder_and_finalize_potentials(x): + # "finalize" gives host FMMs (like FMMlib) a chance to turn the + # potential back into a CL array. 
+ return wrangler.finalize_potentials(x[tree.sorted_target_ids]) + + from pytools.obj_array import with_object_array_or_scalar + result = with_object_array_or_scalar( + reorder_and_finalize_potentials, all_potentials_in_tree_order) + + # }}} + + return result + else: + pass + +# }}} diff --git a/test/test_distributed.py b/test/test_distributed.py new file mode 100644 index 00000000..77bdb423 --- /dev/null +++ b/test/test_distributed.py @@ -0,0 +1,82 @@ +import pyopencl as cl +from meshmode.mesh.generation import ( + make_curve_mesh, ellipse) +import functools +from sympy.core.cache import clear_cache +import numpy as np +from pytential.qbx import QBXLayerPotentialSource +from meshmode.discretization import Discretization +from meshmode.discretization.poly_element import ( + InterpolatoryQuadratureSimplexGroupFactory) +from sumpy.kernel import LaplaceKernel +import pytential +from sumpy.visualization import FieldPlotter +from pytential.target import PointsTarget +import matplotlib.pyplot as pt +from mpi4py import MPI + +# Get MPI information +comm = MPI.COMM_WORLD +current_rank = comm.Get_rank() +total_rank = comm.Get_size() + +# Disable sympy cache +clear_cache() + +# Setup PyOpenCL +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +# Parameters +nelements = 30 +target_order = 8 +qbx_order = 3 +fmm_order = qbx_order + +if current_rank == 0: # master rank + mesh = make_curve_mesh(functools.partial(ellipse, 3), + np.linspace(0, 1, nelements + 1), + target_order) + + pre_density_discr = Discretization( + ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + qbx, _ = QBXLayerPotentialSource( + pre_density_discr, + fine_order=4 * target_order, + qbx_order=qbx_order, + fmm_order=fmm_order, + fmm_backend="distributed" + ).with_refinement() + + density_discr = qbx.density_discr + + op = pytential.sym.D( + LaplaceKernel(2), pytential.sym.var("sigma"), qbx_forced_limit=-2) + + sigma = density_discr.zeros(queue) + 1 + + fplot = 
FieldPlotter(np.zeros(2), extent=0.54, npoints=30) + + fld_in_vol = pytential.bind( + (qbx, PointsTarget(fplot.points)), + op)(queue, sigma=sigma) + + err = cl.clmath.fabs(fld_in_vol - (-1)) + + linf_err = cl.array.max(err).get() + print("l_inf error:", linf_err) + + fplot.show_scalar_in_matplotlib(fld_in_vol.get()) + + pt.colorbar() + pt.show() + + # FIXME: Why does the FMM only meet this sloppy tolerance? + assert linf_err < 1e-2 + +else: # helper rank + from pytential.qbx.distributed import drive_dfmm + wrangler = None + weights = None + drive_dfmm(wrangler, weights) -- GitLab From 35899415d35795cb69884b17be2d082d6dd922de Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 9 May 2018 11:51:08 -0500 Subject: [PATCH 03/86] Distribute non_qbx_box_target_lists --- pytential/qbx/__init__.py | 10 +- pytential/qbx/distributed.py | 307 +++++++++++++++++++++-------------- test/test_distributed.py | 1 + 3 files changed, 186 insertions(+), 132 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 8800dfc4..72ea58de 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -638,7 +638,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): fmm_mpole_factory, fmm_local_factory, qbx_local_factory, out_kernels) - elif self.fmm_backend == "fmmlib": + elif self.fmm_backend == "fmmlib" or self.fmm_backend == 'distributed': from pytential.qbx.fmmlib import \ QBXFMMLibExpansionWranglerCodeContainer return QBXFMMLibExpansionWranglerCodeContainer( @@ -646,14 +646,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): fmm_mpole_factory, fmm_local_factory, qbx_local_factory, out_kernels) - elif self.fmm_backend == 'distributed': - from pytential.qbx.distributed import \ - QBXDistributedFMMLibExpansionWranglerCodeContainer - return QBXDistributedFMMLibExpansionWranglerCodeContainer( - self.cl_context, - fmm_mpole_factory, fmm_local_factory, qbx_local_factory, - out_kernels) - else: raise ValueError("invalid FMM backend: %s" % 
self.fmm_backend) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index c46f0009..55949501 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,160 +1,221 @@ -from pytential.qbx.fmmlib import ( - QBXFMMLibExpansionWranglerCodeContainer) +from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler +from boxtree.distributed import DistributedFMMLibExpansionWrangler, queue +from boxtree.tree import FilteredTargetListsInTreeOrder from mpi4py import MPI +import numpy as np +import pyopencl as cl -# {{{ Expansion Wrangler - -class QBXDistributedFMMLibExpansionWranglerCodeContainer( - QBXFMMLibExpansionWranglerCodeContainer): - pass - -# }}} - - -# {{{ FMM Driver - -def drive_dfmm(expansion_wrangler, src_weights, comm=MPI.COMM_WORLD): - current_rank = comm.Get_rank() - # total_rank = comm.Get_size() - - if current_rank == 0: - wrangler = expansion_wrangler - - geo_data = wrangler.geo_data - traversal = geo_data.traversal() - tree = traversal.tree - - # Interface guidelines: Attributes of the tree are assumed to be known - # to the expansion wrangler and should not be passed. 
- - src_weights = wrangler.reorder_sources(src_weights) - - # {{{ construct local multipoles - - mpole_exps = wrangler.form_multipoles( - traversal.level_start_source_box_nrs, - traversal.source_boxes, - src_weights) - - # }}} - - # {{{ propagate multipoles upward - - wrangler.coarsen_multipoles( - traversal.level_start_source_parent_box_nrs, - traversal.source_parent_boxes, - mpole_exps) - - # }}} - - # {{{ direct evaluation from neighbor source boxes ("list 1") - non_qbx_potentials = wrangler.eval_direct( - traversal.target_boxes, - traversal.neighbor_source_boxes_starts, - traversal.neighbor_source_boxes_lists, - src_weights) - - # }}} +# {{{ Expansion Wrangler - # {{{ translate separated siblings' ("list 2") mpoles to local +class QBXDistributedFMMLibExpansionWrangler( + QBXFMMLibExpansionWrangler, DistributedFMMLibExpansionWrangler): - local_exps = wrangler.multipole_to_local( - traversal.level_start_target_or_target_parent_box_nrs, - traversal.target_or_target_parent_boxes, - traversal.from_sep_siblings_starts, - traversal.from_sep_siblings_lists, - mpole_exps) + @classmethod + def distribute(cls, wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): + if wrangler is not None: # master process + import copy + distributed_wrangler = copy.copy(wrangler) + distributed_wrangler.queue = None + distributed_wrangler.geo_data = None + distributed_wrangler.code = None + distributed_wrangler.tree = None + distributed_wrangler.__class__ = cls + else: # worker process + distributed_wrangler = None - # }}} + distributed_wrangler = comm.bcast(distributed_wrangler, root=0) + distributed_wrangler.tree = distributed_geo_data.local_tree + distributed_wrangler.geo_data = distributed_geo_data - # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles + return distributed_wrangler - # (the point of aiming this stage at particles is specifically to keep its - # contribution *out* of the downward-propagating local expansions) +# }}} - non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( - traversal.target_boxes_sep_smaller_by_source_level, - traversal.from_sep_smaller_by_level, - mpole_exps) - # assert that list 3 close has been merged into list 1 - assert traversal.from_sep_close_smaller_starts is None +# {{{ + +class DistributedGeoData(object): + def __init__(self, geo_data, comm=MPI.COMM_WORLD): + self.comm = comm + current_rank = comm.Get_rank() + total_rank = comm.Get_size() + + if geo_data is not None: # master process + traversal = geo_data.traversal() + tree = traversal.tree + # ncenters = geo_data.ncenters + # centers = geo_data.centers() + # expansion_radii = geo_data.expansion_radii() + # global_qbx_centers = geo_data.global_qbx_centers() + # qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() + # center_to_tree_targets = geo_data.center_to_tree_targets() + + nlevels = traversal.tree.nlevels + self.qbx_center_to_target_box_source_level = np.empty( + (nlevels,), dtype=object) + for level in range(nlevels): + self.qbx_center_to_target_box_source_level[level] = ( + geo_data.qbx_center_to_target_box_source_level(level)) + else: # worker process + traversal = None + + from boxtree.distributed import generate_local_tree + self.local_tree, self.local_data, self.box_bounding_box, knls = \ + generate_local_tree(traversal) + + from boxtree.distributed import generate_local_travs + self.trav_local, self.trav_global = generate_local_travs( + self.local_tree, self.box_bounding_box, comm=comm) + + # {{{ Distribute non_qbx_box_target_lists + + if current_rank == 0: # master process + box_target_starts = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_starts) + 
box_target_counts_nonchild = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_counts_nonchild) + nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets + targets = non_qbx_box_target_lists.targets + + reqs = np.empty((total_rank,), dtype=object) + local_non_qbx_box_target_lists = np.empty((total_rank,), dtype=object) + + for irank in range(total_rank): + particle_mask = cl.array.zeros(queue, (nfiltered_targets,), + dtype=tree.particle_id_dtype) + knls["particle_mask_knl"]( + self.local_data[irank]["tgt_box_mask"], + box_target_starts, + box_target_counts_nonchild, + particle_mask + ) + + particle_scan = cl.array.empty(queue, (nfiltered_targets + 1,), + dtype=tree.particle_id_dtype) + particle_scan[0] = 0 + knls["mask_scan_knl"](particle_mask, particle_scan) + + local_box_target_starts = cl.array.empty( + queue, (tree.nboxes,), dtype=tree.particle_id_dtype) + knls["generate_box_particle_starts"]( + box_target_starts, particle_scan, + local_box_target_starts + ) + + local_box_target_counts_nonchild = cl.array.zeros( + queue, (tree.nboxes,), dtype=tree.particle_id_dtype) + knls["generate_box_particle_counts_nonchild"]( + self.local_data[irank]["tgt_box_mask"], + box_target_counts_nonchild, + local_box_target_counts_nonchild + ) + + local_nfiltered_targets = particle_scan[-1].get(queue) + + particle_mask = particle_mask.get().astype(bool) + local_targets = np.empty((tree.dimensions,), dtype=object) + for idimension in range(tree.dimensions): + local_targets[idimension] = targets[idimension][particle_mask] + + local_non_qbx_box_target_lists[irank] = { + "nfiltered_targets": local_nfiltered_targets, + "box_target_starts": local_box_target_starts.get(), + "box_target_counts_nonchild": + local_box_target_counts_nonchild.get(), + "targets": local_targets + } + + reqs[irank] = comm.isend(local_non_qbx_box_target_lists[irank], + dest=irank, tag=0) + + for irank in range(1, total_rank): + reqs[irank].wait() + if current_rank == 0: + 
local_non_qbx_box_target_lists = local_non_qbx_box_target_lists[0] + else: + local_non_qbx_box_target_lists = comm.recv(source=0, tag=0) + + self._non_qbx_box_target_lists = FilteredTargetListsInTreeOrder( + nfiltered_targets=local_non_qbx_box_target_lists["nfiltered_targets"], + box_target_starts=local_non_qbx_box_target_lists["box_target_starts"], + box_target_counts_nonchild=local_non_qbx_box_target_lists[ + "box_target_counts_nonchild"], + targets=local_non_qbx_box_target_lists["targets"], + unfiltered_from_filtered_target_indices=None + ) # }}} - # {{{ form locals for separated bigger source boxes ("list 4") - - local_exps = local_exps + wrangler.form_locals( - traversal.level_start_target_or_target_parent_box_nrs, - traversal.target_or_target_parent_boxes, - traversal.from_sep_bigger_starts, - traversal.from_sep_bigger_lists, - src_weights) - - # assert that list 4 close has been merged into list 1 - assert traversal.from_sep_close_bigger_starts is None + def non_qbx_box_target_lists(self): + return self._non_qbx_box_target_lists - # }}} - - # {{{ propagate local_exps downward +# }}} - wrangler.refine_locals( - traversal.level_start_target_or_target_parent_box_nrs, - traversal.target_or_target_parent_boxes, - local_exps) - # }}} +# {{{ FMM Driver - # {{{ evaluate locals +def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, + _communicate_mpoles_via_allreduce=False): + current_rank = comm.Get_rank() + total_rank = comm.Get_size() - non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( - traversal.level_start_target_box_nrs, - traversal.target_boxes, - local_exps) + if current_rank == 0: + distributed_geo_data = DistributedGeoData(root_wrangler.geo_data) + else: + distributed_geo_data = DistributedGeoData(None) - # }}} + distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( + root_wrangler, distributed_geo_data) + wrangler = distributed_wrangler - # {{{ wrangle qbx expansions + local_traversal = 
distributed_geo_data.trav_local + global_traversal = distributed_geo_data.trav_global - qbx_expansions = wrangler.form_global_qbx_locals(src_weights) + # {{{ Distribute source weights - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) + if current_rank == 0: + global_tree = root_wrangler.geo_data.tree() + src_weights = root_wrangler.reorder_sources(src_weights) + else: + global_tree = None - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_local_to_qbx_local(local_exps) + from boxtree.distributed import distribute_source_weights + local_source_weights = distribute_source_weights( + src_weights, global_tree, distributed_geo_data.local_data, comm=comm) - qbx_potentials = wrangler.eval_qbx_expansions( - qbx_expansions) + # }}} - # }}} + # {{{ construct local multipoles - # {{{ reorder potentials + mpole_exps = wrangler.form_multipoles( + local_traversal.level_start_source_box_nrs, + local_traversal.source_boxes, + local_source_weights) - nqbtl = geo_data.non_qbx_box_target_lists() + # }}} - all_potentials_in_tree_order = wrangler.full_output_zeros() + # {{{ propagate multipoles upward - for ap_i, nqp_i in zip(all_potentials_in_tree_order, non_qbx_potentials): - ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i + wrangler.coarsen_multipoles( + local_traversal.level_start_source_parent_box_nrs, + local_traversal.source_parent_boxes, + mpole_exps) - all_potentials_in_tree_order += qbx_potentials + # }}} - def reorder_and_finalize_potentials(x): - # "finalize" gives host FMMs (like FMMlib) a chance to turn the - # potential back into a CL array. 
- return wrangler.finalize_potentials(x[tree.sorted_target_ids]) + # {{{ direct evaluation from neighbor source boxes ("list 1") - from pytools.obj_array import with_object_array_or_scalar - result = with_object_array_or_scalar( - reorder_and_finalize_potentials, all_potentials_in_tree_order) + non_qbx_potentials = wrangler.eval_direct( + global_traversal.target_boxes, + global_traversal.neighbor_source_boxes_starts, + global_traversal.neighbor_source_boxes_lists, + local_source_weights) - # }}} + # }}} - return result - else: - pass + return None # }}} diff --git a/test/test_distributed.py b/test/test_distributed.py index 77bdb423..69b052e0 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -47,6 +47,7 @@ if current_rank == 0: # master rank qbx_order=qbx_order, fmm_order=fmm_order, fmm_backend="distributed" + # fmm_backend="fmmlib" ).with_refinement() density_discr = qbx.density_discr -- GitLab From b0a1108e6d30f39bef95c55ca0298abbd83477be Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 9 May 2018 23:58:22 -0500 Subject: [PATCH 04/86] Distribute global_qbx_centers --- pytential/qbx/distributed.py | 149 ++++++++++++++++++++++++++++++++--- 1 file changed, 136 insertions(+), 13 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 55949501..5adab924 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -6,6 +6,15 @@ import numpy as np import pyopencl as cl +# {{{ MPITags used in this module + +MPITags = { + "non_qbx_box_target_lists": 0, + "global_qbx_centers": 1 +} + +# }}} + # {{{ Expansion Wrangler @@ -45,10 +54,10 @@ class DistributedGeoData(object): if geo_data is not None: # master process traversal = geo_data.traversal() tree = traversal.tree - # ncenters = geo_data.ncenters + ncenters = geo_data.ncenters # centers = geo_data.centers() # expansion_radii = geo_data.expansion_radii() - # global_qbx_centers = geo_data.global_qbx_centers() + global_qbx_centers = 
geo_data.global_qbx_centers() # qbx_center_to_target_box = geo_data.qbx_center_to_target_box() non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() # center_to_tree_targets = geo_data.center_to_tree_targets() @@ -128,15 +137,20 @@ class DistributedGeoData(object): "targets": local_targets } - reqs[irank] = comm.isend(local_non_qbx_box_target_lists[irank], - dest=irank, tag=0) + if irank != 0: + reqs[irank] = comm.isend( + local_non_qbx_box_target_lists[irank], + dest=irank, + tag=MPITags["non_qbx_box_target_lists"] + ) for irank in range(1, total_rank): reqs[irank].wait() if current_rank == 0: local_non_qbx_box_target_lists = local_non_qbx_box_target_lists[0] else: - local_non_qbx_box_target_lists = comm.recv(source=0, tag=0) + local_non_qbx_box_target_lists = comm.recv( + source=0, tag=MPITags["non_qbx_box_target_lists"]) self._non_qbx_box_target_lists = FilteredTargetListsInTreeOrder( nfiltered_targets=local_non_qbx_box_target_lists["nfiltered_targets"], @@ -149,9 +163,44 @@ class DistributedGeoData(object): # }}} + # {{{ Distribute global_qbx_centers + + if current_rank == 0: + local_global_qbx_centers = np.empty((total_rank,), dtype=object) + for irank in range(total_rank): + tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) + tgt_mask_user_order = tgt_mask[tree.sorted_target_ids] + centers_mask = tgt_mask_user_order[:ncenters] + local_global_qbx_centers[irank] = global_qbx_centers[ + centers_mask[global_qbx_centers]] + + if irank != 0: + reqs[irank] = comm.isend( + local_global_qbx_centers[irank], + dest=irank, + tag=MPITags["global_qbx_centers"] + ) + + for irank in range(1, total_rank): + reqs[irank].wait() + local_global_qbx_centers = local_global_qbx_centers[0] + else: + local_global_qbx_centers = comm.recv( + source=0, tag=MPITags["global_qbx_centers"]) + + self._global_qbx_centers = local_global_qbx_centers + + # }}} + def non_qbx_box_target_lists(self): return self._non_qbx_box_target_lists + def traversal(self): + return 
self.trav_global + + def global_qbx_centers(self): + return self._global_qbx_centers + # }}} @@ -191,31 +240,105 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, # {{{ construct local multipoles mpole_exps = wrangler.form_multipoles( - local_traversal.level_start_source_box_nrs, - local_traversal.source_boxes, - local_source_weights) + local_traversal.level_start_source_box_nrs, + local_traversal.source_boxes, + local_source_weights) # }}} # {{{ propagate multipoles upward wrangler.coarsen_multipoles( - local_traversal.level_start_source_parent_box_nrs, - local_traversal.source_parent_boxes, - mpole_exps) + local_traversal.level_start_source_parent_box_nrs, + local_traversal.source_parent_boxes, + mpole_exps) # }}} # {{{ direct evaluation from neighbor source boxes ("list 1") non_qbx_potentials = wrangler.eval_direct( + global_traversal.target_boxes, + global_traversal.neighbor_source_boxes_starts, + global_traversal.neighbor_source_boxes_lists, + local_source_weights) + + # }}} + + # {{{ translate separated siblings' ("list 2") mpoles to local + + local_exps = wrangler.multipole_to_local( + global_traversal.level_start_target_or_target_parent_box_nrs, + global_traversal.target_or_target_parent_boxes, + global_traversal.from_sep_siblings_starts, + global_traversal.from_sep_siblings_lists, + mpole_exps) + + # }}} + + # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles + + # (the point of aiming this stage at particles is specifically to keep its + # contribution *out* of the downward-propagating local expansions) + + non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( + global_traversal.target_boxes_sep_smaller_by_source_level, + global_traversal.from_sep_smaller_by_level, + mpole_exps) + + # assert that list 3 close has been merged into list 1 + # assert global_traversal.from_sep_close_smaller_starts is None + if global_traversal.from_sep_close_smaller_starts is not None: + non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( global_traversal.target_boxes, - global_traversal.neighbor_source_boxes_starts, - global_traversal.neighbor_source_boxes_lists, + global_traversal.from_sep_close_smaller_starts, + global_traversal.from_sep_close_smaller_lists, + local_source_weights) + + # }}} + + # {{{ form locals for separated bigger source boxes ("list 4") + + local_exps = local_exps + wrangler.form_locals( + global_traversal.level_start_target_or_target_parent_box_nrs, + global_traversal.target_or_target_parent_boxes, + global_traversal.from_sep_bigger_starts, + global_traversal.from_sep_bigger_lists, + local_source_weights) + + if global_traversal.from_sep_close_bigger_starts is not None: + non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( + global_traversal.target_or_target_parent_boxes, + global_traversal.from_sep_close_bigger_starts, + global_traversal.from_sep_close_bigger_lists, local_source_weights) # }}} + # {{{ propagate local_exps downward + + wrangler.refine_locals( + global_traversal.level_start_target_or_target_parent_box_nrs, + global_traversal.target_or_target_parent_boxes, + local_exps) + + # }}} + + # {{{ evaluate locals + + non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( + global_traversal.level_start_target_box_nrs, + global_traversal.target_boxes, + local_exps) + + # }}} + + # {{{ wrangle qbx expansions + + qbx_expansions 
= wrangler.form_global_qbx_locals(src_weights) + + # }}} + return None # }}} -- GitLab From a260b33d19e2248f44d251d87812a74798299c60 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 10 May 2018 16:35:19 -0500 Subject: [PATCH 05/86] Distribute centers and dipole_vec --- pytential/qbx/distributed.py | 87 ++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 9 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 5adab924..73f2062e 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -10,7 +10,9 @@ import pyopencl as cl MPITags = { "non_qbx_box_target_lists": 0, - "global_qbx_centers": 1 + "global_qbx_centers": 1, + "centers": 2, + "dipole_vec": 3 } # }}} @@ -23,6 +25,9 @@ class QBXDistributedFMMLibExpansionWrangler( @classmethod def distribute(cls, wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): + current_rank = comm.Get_rank() + total_rank = comm.Get_size() + if wrangler is not None: # master process import copy distributed_wrangler = copy.copy(wrangler) @@ -30,6 +35,7 @@ class QBXDistributedFMMLibExpansionWrangler( distributed_wrangler.geo_data = None distributed_wrangler.code = None distributed_wrangler.tree = None + distributed_wrangler.dipole_vec = None distributed_wrangler.__class__ = cls else: # worker process distributed_wrangler = None @@ -38,6 +44,30 @@ class QBXDistributedFMMLibExpansionWrangler( distributed_wrangler.tree = distributed_geo_data.local_tree distributed_wrangler.geo_data = distributed_geo_data + # {{{ Distribute dipole_vec + + if current_rank == 0: + reqs_dipole_vec = np.empty((total_rank,), dtype=object) + local_dipole_vec = np.empty((total_rank,), dtype=object) + for irank in range(total_rank): + src_mask = distributed_geo_data.local_data[irank]["src_mask"].get() + local_dipole_vec[irank] = \ + wrangler.dipole_vec[:, src_mask.astype(bool)] + reqs_dipole_vec[irank] = comm.isend( + local_dipole_vec[irank], + dest=irank, + tag=MPITags["dipole_vec"] + ) + + 
for irank in range(1, total_rank): + reqs_dipole_vec[irank].wait() + distributed_wrangler.dipole_vec = local_dipole_vec[0] + else: + distributed_wrangler.dipole_vec = comm.recv( + source=0, tag=MPITags["dipole_vec"]) + + # }}} + return distributed_wrangler # }}} @@ -54,20 +84,22 @@ class DistributedGeoData(object): if geo_data is not None: # master process traversal = geo_data.traversal() tree = traversal.tree + nlevels = tree.nlevels + ncenters = geo_data.ncenters - # centers = geo_data.centers() + centers = geo_data.centers() # expansion_radii = geo_data.expansion_radii() global_qbx_centers = geo_data.global_qbx_centers() # qbx_center_to_target_box = geo_data.qbx_center_to_target_box() non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() # center_to_tree_targets = geo_data.center_to_tree_targets() - nlevels = traversal.tree.nlevels - self.qbx_center_to_target_box_source_level = np.empty( + qbx_center_to_target_box_source_level = np.empty( (nlevels,), dtype=object) for level in range(nlevels): - self.qbx_center_to_target_box_source_level[level] = ( + qbx_center_to_target_box_source_level[level] = ( geo_data.qbx_center_to_target_box_source_level(level)) + else: # worker process traversal = None @@ -163,31 +195,61 @@ class DistributedGeoData(object): # }}} - # {{{ Distribute global_qbx_centers + # {{{ Distribute global_qbx_centers and centers if current_rank == 0: local_global_qbx_centers = np.empty((total_rank,), dtype=object) + local_centers = np.empty((total_rank,), dtype=object) + reqs_centers = np.empty((total_rank,), dtype=object) + reqs_global_qbx_centers = np.empty((total_rank,), dtype=object) + for irank in range(total_rank): tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) tgt_mask_user_order = tgt_mask[tree.sorted_target_ids] centers_mask = tgt_mask_user_order[:ncenters] + + # {{{ Distribute centers + + nlocal_centers = np.sum(centers_mask.astype(np.int32)) + centers_dims = centers.shape[0] + local_centers[irank] = 
np.empty((centers_dims, nlocal_centers), + dtype=centers[0].dtype) + for idims in range(centers_dims): + local_centers[irank][idims][:] = centers[idims][centers_mask] + + if irank != 0: + reqs_centers[irank] = comm.isend( + local_centers[irank], + dest=irank, + tag=MPITags["centers"] + ) + + # }}} + + local_global_qbx_centers[irank] = global_qbx_centers[ + centers_mask[global_qbx_centers]] if irank != 0: - reqs[irank] = comm.isend( + reqs_global_qbx_centers[irank] = comm.isend( local_global_qbx_centers[irank], dest=irank, tag=MPITags["global_qbx_centers"] ) for irank in range(1, total_rank): - reqs[irank].wait() + reqs_centers[irank].wait() + local_centers = local_centers[0] + + for irank in range(1, total_rank): + reqs_global_qbx_centers[irank].wait() local_global_qbx_centers = local_global_qbx_centers[0] else: + local_centers = comm.recv( + source=0, tag=MPITags["centers"]) local_global_qbx_centers = comm.recv( source=0, tag=MPITags["global_qbx_centers"]) + self._local_centers = local_centers self._global_qbx_centers = local_global_qbx_centers # }}} @@ -198,6 +260,13 @@ class DistributedGeoData(object): def traversal(self): return self.trav_global + def centers(self): + return self._local_centers + + @property + def ncenters(self): + return self._local_centers.shape[1] + def global_qbx_centers(self): return self._global_qbx_centers @@ -335,7 +404,7 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, # {{{ wrangle qbx expansions - qbx_expansions = wrangler.form_global_qbx_locals(src_weights) + qbx_expansions = wrangler.form_global_qbx_locals(local_source_weights) # }}} -- GitLab From 48e3e97914a74772d84be11ecf3751d3d9928662 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 13 May 2018 15:30:00 -0500 Subject: [PATCH 06/86] Distribute expansion_radii, qbx_center_to_target_box --- pytential/qbx/distributed.py | 89 +++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/pytential/qbx/distributed.py
b/pytential/qbx/distributed.py index 73f2062e..5cba3697 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -12,7 +12,9 @@ MPITags = { "non_qbx_box_target_lists": 0, "global_qbx_centers": 1, "centers": 2, - "dipole_vec": 3 + "dipole_vec": 3, + "expansion_radii": 4, + "qbx_center_to_target_box": 5 } # }}} @@ -88,9 +90,9 @@ class DistributedGeoData(object): ncenters = geo_data.ncenters centers = geo_data.centers() - # expansion_radii = geo_data.expansion_radii() + expansion_radii = geo_data.expansion_radii() global_qbx_centers = geo_data.global_qbx_centers() - # qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() # center_to_tree_targets = geo_data.center_to_tree_targets() @@ -195,18 +197,28 @@ class DistributedGeoData(object): # }}} - # {{{ Distribute global_qbx_centers and centers + # {{{ Distribute global_qbx_centers, centers and expansion_radii if current_rank == 0: local_global_qbx_centers = np.empty((total_rank,), dtype=object) local_centers = np.empty((total_rank,), dtype=object) + local_expansion_radii = np.empty((total_rank,), dtype=object) + local_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) + reqs_centers = np.empty((total_rank,), dtype=object) reqs_global_qbx_centers = np.empty((total_rank,), dtype=object) + reqs_expansion_radii = np.empty((total_rank,), dtype=object) + reqs_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) for irank in range(total_rank): tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) tgt_mask_user_order = tgt_mask[tree.sorted_target_ids] centers_mask = tgt_mask_user_order[:ncenters] + centers_scan = np.empty( + (ncenters + 1,), dtype=tree.particle_id_dtype) + centers_scan[1:] = np.cumsum( + centers_mask.astype(tree.particle_id_dtype)) + centers_scan[0] = 0 # {{{ Distribute centers @@ -226,8 +238,10 @@ class 
DistributedGeoData(object): # }}} - local_global_qbx_centers[irank] = global_qbx_centers[ - centers_mask[global_qbx_centers]] + # {{{ Distribute global_qbx_centers + + local_global_qbx_centers[irank] = centers_scan[ + global_qbx_centers[centers_mask[global_qbx_centers]]] if irank != 0: reqs_global_qbx_centers[irank] = comm.isend( @@ -236,6 +250,38 @@ class DistributedGeoData(object): tag=MPITags["global_qbx_centers"] ) + # }}} + + # {{{ Distribute expansion_radii + + local_expansion_radii[irank] = expansion_radii[centers_mask] + if irank != 0: + reqs_expansion_radii[irank] = comm.isend( + local_expansion_radii[irank], + dest=irank, + tag=MPITags["expansion_radii"] + ) + + # }}} + + # {{{ Distribute qbx_center_to_target_box + + # Note: The code transforms qbx_center_to_target_box to global box + # indexing from target_boxes before transmission. Each process is + # expected to transform back to target_boxes indexing based its own + # traversal object. + + local_qbx_center_to_target_box[irank] = \ + traversal.target_boxes[qbx_center_to_target_box[centers_mask]] + if irank != 0: + reqs_qbx_center_to_target_box[irank] = comm.isend( + local_qbx_center_to_target_box[irank], + dest=irank, + tag=MPITags["qbx_center_to_target_box"] + ) + + # }}} + for irank in range(1, total_rank): reqs_centers[irank].wait() local_centers = local_centers[0] @@ -243,14 +289,39 @@ class DistributedGeoData(object): for irank in range(1, total_rank): reqs_global_qbx_centers[irank].wait() local_global_qbx_centers = local_global_qbx_centers[0] + + for irank in range(1, total_rank): + reqs_expansion_radii[irank].wait() + local_expansion_radii = local_expansion_radii[0] + + for irank in range(1, total_rank): + reqs_qbx_center_to_target_box[irank].wait() + local_qbx_center_to_target_box = local_qbx_center_to_target_box[0] + else: local_centers = comm.recv( source=0, tag=MPITags["centers"]) local_global_qbx_centers = comm.recv( source=0, tag=MPITags["global_qbx_centers"]) + local_expansion_radii = 
comm.recv( + source=0, tag=MPITags["expansion_radii"]) + local_qbx_center_to_target_box = comm.recv( + source=0, tag=MPITags["qbx_center_to_target_box"] + ) self._local_centers = local_centers self._global_qbx_centers = local_global_qbx_centers + self._expansion_radii = local_expansion_radii + + # Transform local_qbx_center_to_target_box to target_boxes indexing + global_boxes_to_target_boxes = np.ones( + (self.local_tree.nboxes,), dtype=self.local_tree.particle_id_dtype) + # make sure accessing invalid position raises an error + global_boxes_to_target_boxes *= -1 + global_boxes_to_target_boxes[self.trav_global.target_boxes] = \ + np.arange(self.trav_global.target_boxes.shape[0]) + self._local_qbx_center_to_target_box = \ + global_boxes_to_target_boxes[local_qbx_center_to_target_box] # }}} @@ -270,6 +341,12 @@ class DistributedGeoData(object): def global_qbx_centers(self): return self._global_qbx_centers + def expansion_radii(self): + return self._expansion_radii + + def qbx_center_to_target_box(self): + return self._local_qbx_center_to_target_box + # }}} -- GitLab From fa57b2d8bf5fd5e42f54146efefca1feedbb08cc Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 15 May 2018 11:47:23 -0500 Subject: [PATCH 07/86] Distribute center_to_tree_targets and the corresponding qbx targets --- pytential/qbx/distributed.py | 99 ++++++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 5cba3697..63c65e02 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -14,7 +14,9 @@ MPITags = { "centers": 2, "dipole_vec": 3, "expansion_radii": 4, - "qbx_center_to_target_box": 5 + "qbx_center_to_target_box": 5, + "center_to_tree_targets": 6, + "qbx_targets": 7 } # }}} @@ -94,7 +96,7 @@ class DistributedGeoData(object): global_qbx_centers = geo_data.global_qbx_centers() qbx_center_to_target_box = geo_data.qbx_center_to_target_box() non_qbx_box_target_lists = 
geo_data.non_qbx_box_target_lists() - # center_to_tree_targets = geo_data.center_to_tree_targets() + center_to_tree_targets = geo_data.center_to_tree_targets() qbx_center_to_target_box_source_level = np.empty( (nlevels,), dtype=object) @@ -197,18 +199,22 @@ class DistributedGeoData(object): # }}} - # {{{ Distribute global_qbx_centers, centers and expansion_radii + # {{{ Distribute other useful fields of geo_data if current_rank == 0: local_global_qbx_centers = np.empty((total_rank,), dtype=object) local_centers = np.empty((total_rank,), dtype=object) local_expansion_radii = np.empty((total_rank,), dtype=object) local_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) + local_center_to_tree_targets = np.empty((total_rank,), dtype=object) + local_qbx_targets = np.empty((total_rank,), dtype=object) reqs_centers = np.empty((total_rank,), dtype=object) reqs_global_qbx_centers = np.empty((total_rank,), dtype=object) reqs_expansion_radii = np.empty((total_rank,), dtype=object) reqs_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) + reqs_center_to_tree_targets = np.empty((total_rank,), dtype=object) + reqs_qbx_targets = np.empty((total_rank,), dtype=object) for irank in range(total_rank): tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) @@ -282,6 +288,66 @@ class DistributedGeoData(object): # }}} + # {{{ Distribute local_qbx_targets and center_to_tree_targets + + starts = center_to_tree_targets.starts + lists = center_to_tree_targets.lists + local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) + local_lists = np.empty(lists.shape, dtype=lists.dtype) + + qbx_target_mask = np.zeros((tree.ntargets,), dtype=bool) + current_start = 0 # index into local_lists + ilocal_center = 0 + local_starts[0] = 0 + + for icenter in range(ncenters): + if not centers_mask[icenter]: + continue + + current_center_targets = lists[ + starts[icenter]:starts[icenter + 1]] + qbx_target_mask[current_center_targets] = True + current_stop = \ + 
current_start + starts[icenter + 1] - starts[icenter] + local_starts[ilocal_center + 1] = current_stop + local_lists[current_start:current_stop] = \ + lists[starts[icenter]:starts[icenter + 1]] + + current_start = current_stop + ilocal_center += 1 + + local_lists = local_lists[:current_start] + + qbx_target_scan = np.empty((tree.ntargets + 1,), dtype=lists.dtype) + qbx_target_scan[0] = 0 + qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) + nlocal_qbx_target = qbx_target_scan[-1] + + local_qbx_targets[irank] = np.empty( + (tree.dimensions, nlocal_qbx_target), + dtype=tree.targets[0].dtype + ) + for idim in range(tree.dimensions): + local_qbx_targets[irank][idim, :] = \ + tree.targets[idim][qbx_target_mask] + reqs_qbx_targets[irank] = comm.isend( + local_qbx_targets[irank], + dest=irank, + tag=MPITags["qbx_targets"] + ) + + local_lists = qbx_target_scan[local_lists] + local_center_to_tree_targets[irank] = { + "starts": local_starts, + "lists": local_lists + } + reqs_center_to_tree_targets[irank] = comm.isend( + local_center_to_tree_targets[irank], + dest=irank, + tag=MPITags["center_to_tree_targets"]) + + # }}} + for irank in range(1, total_rank): reqs_centers[irank].wait() local_centers = local_centers[0] @@ -298,6 +364,14 @@ class DistributedGeoData(object): reqs_qbx_center_to_target_box[irank].wait() local_qbx_center_to_target_box = local_qbx_center_to_target_box[0] + for irank in range(1, total_rank): + reqs_center_to_tree_targets[irank].wait() + local_center_to_tree_targets = local_center_to_tree_targets[0] + + for irank in range(1, total_rank): + reqs_qbx_targets[irank].wait() + local_qbx_targets = local_qbx_targets[0] + else: local_centers = comm.recv( source=0, tag=MPITags["centers"]) @@ -308,10 +382,17 @@ class DistributedGeoData(object): local_qbx_center_to_target_box = comm.recv( source=0, tag=MPITags["qbx_center_to_target_box"] ) + local_center_to_tree_targets = comm.recv( + source=0, tag=MPITags["center_to_tree_targets"] + ) + 
local_qbx_targets = comm.recv( + source=0, tag=MPITags["qbx_targets"] + ) self._local_centers = local_centers self._global_qbx_centers = local_global_qbx_centers self._expansion_radii = local_expansion_radii + self._qbx_targets = local_qbx_targets # Transform local_qbx_center_to_target_box to target_boxes indexing global_boxes_to_target_boxes = np.ones( @@ -323,6 +404,12 @@ class DistributedGeoData(object): self._local_qbx_center_to_target_box = \ global_boxes_to_target_boxes[local_qbx_center_to_target_box] + from pytential.qbx.geometry import CenterToTargetList + self._local_center_to_tree_targets = CenterToTargetList( + starts=local_center_to_tree_targets["starts"], + lists=local_center_to_tree_targets["lists"] + ) + # }}} def non_qbx_box_target_lists(self): @@ -347,6 +434,12 @@ class DistributedGeoData(object): def qbx_center_to_target_box(self): return self._local_qbx_center_to_target_box + def local_center_to_tree_targets(self): + return self._local_center_to_tree_targets + + def qbx_targets(self): + return self._qbx_targets + # }}} -- GitLab From 34511e6a490bdd13071d8633b7ecfa54b7e2968a Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 15 May 2018 16:08:19 -0500 Subject: [PATCH 08/86] Construct qbx_center_to_target_box_source_level --- pytential/qbx/distributed.py | 50 +++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 63c65e02..5301ba75 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -412,12 +412,49 @@ class DistributedGeoData(object): # }}} + # {{{ Construct qbx_center_to_target_box_source_level + + # This is modified from pytential.geometry.QBXFMMGeometryData. + # qbx_center_to_target_box_source_level but on host using Numpy instead of + # PyOpenCL. 
+ + traversal = self.traversal() + qbx_center_to_target_box = self.qbx_center_to_target_box() + tree = traversal.tree + + self._qbx_center_to_target_box_source_level = np.empty( + (tree.nlevels,), dtype=object) + + for source_level in range(tree.nlevels): + sep_smaller = traversal.from_sep_smaller_by_level[source_level] + + target_box_to_target_box_source_level = np.empty( + len(traversal.target_boxes), + dtype=tree.box_id_dtype + ) + target_box_to_target_box_source_level.fill(-1) + target_box_to_target_box_source_level[sep_smaller.nonempty_indices] = ( + np.arange(sep_smaller.num_nonempty_lists, + dtype=tree.box_id_dtype) + ) + + self._qbx_center_to_target_box_source_level[source_level] = ( + target_box_to_target_box_source_level[ + qbx_center_to_target_box + ] + ) + + # }}} + def non_qbx_box_target_lists(self): return self._non_qbx_box_target_lists def traversal(self): return self.trav_global + def tree(self): + return self.traversal().tree + def centers(self): return self._local_centers @@ -434,12 +471,15 @@ class DistributedGeoData(object): def qbx_center_to_target_box(self): return self._local_qbx_center_to_target_box - def local_center_to_tree_targets(self): + def center_to_tree_targets(self): return self._local_center_to_tree_targets def qbx_targets(self): return self._qbx_targets + def qbx_center_to_target_box_source_level(self, source_level): + return self._qbx_center_to_target_box_source_level[source_level] + # }}} @@ -576,6 +616,14 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, qbx_expansions = wrangler.form_global_qbx_locals(local_source_weights) + qbx_expansions = qbx_expansions + \ + wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) + + qbx_expansions = qbx_expansions + \ + wrangler.translate_box_local_to_qbx_local(local_exps) + + # qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions) + # }}} return None -- GitLab From 15e3e982ad6d068f6cb4cac24e2ff5d7f5ba3615 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 15 
May 2018 23:11:36 -0500 Subject: [PATCH 09/86] Evaluate qbx potentials on each process, assemble non-qbx potential on root --- pytential/qbx/distributed.py | 75 +++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 5301ba75..c4ac3d13 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -16,7 +16,8 @@ MPITags = { "expansion_radii": 4, "qbx_center_to_target_box": 5, "center_to_tree_targets": 6, - "qbx_targets": 7 + "qbx_targets": 7, + "non_qbx_potentials": 8 } # }}} @@ -74,6 +75,41 @@ class QBXDistributedFMMLibExpansionWrangler( return distributed_wrangler + def eval_qbx_expansions(self, qbx_expansions): + geo_data = self.geo_data + ctt = geo_data.center_to_tree_targets() + global_qbx_centers = geo_data.global_qbx_centers() + qbx_centers = geo_data.centers() + qbx_radii = geo_data.expansion_radii() + + from pytools.obj_array import make_obj_array + output = make_obj_array([np.zeros(len(ctt.lists), self.dtype) + for k in self.outputs]) + + all_targets = geo_data.qbx_targets() + + taeval = self.get_expn_eval_routine("ta") + + for isrc_center, src_icenter in enumerate(global_qbx_centers): + for icenter_tgt in range( + ctt.starts[src_icenter], + ctt.starts[src_icenter+1]): + + center_itgt = ctt.lists[icenter_tgt] + + center = qbx_centers[:, src_icenter] + + pot, grad = taeval( + rscale=qbx_radii[src_icenter], + center=center, + expn=qbx_expansions[src_icenter].T, + ztarg=all_targets[:, center_itgt], + **self.kernel_kwargs) + + self.add_potgrad_onto_output(output, center_itgt, pot, grad) + + return output + # }}} @@ -127,6 +163,7 @@ class DistributedGeoData(object): reqs = np.empty((total_rank,), dtype=object) local_non_qbx_box_target_lists = np.empty((total_rank,), dtype=object) + self.particle_mask = np.empty((total_rank,), dtype=object) for irank in range(total_rank): particle_mask = cl.array.zeros(queue, (nfiltered_targets,), @@ 
-161,6 +198,7 @@ class DistributedGeoData(object): local_nfiltered_targets = particle_scan[-1].get(queue) particle_mask = particle_mask.get().astype(bool) + self.particle_mask[irank] = particle_mask local_targets = np.empty((tree.dimensions,), dtype=object) for idimension in range(tree.dimensions): local_targets[idimension] = targets[idimension][particle_mask] @@ -622,10 +660,43 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, qbx_expansions = qbx_expansions + \ wrangler.translate_box_local_to_qbx_local(local_exps) - # qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions) + qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions) # }}} + if current_rank != 0: # worker process + comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) + return None + + else: # master process + + all_potentials_in_tree_order = root_wrangler.full_output_zeros() + + nqbtl = root_wrangler.geo_data.non_qbx_box_target_lists() + + from pytools.obj_array import make_obj_array + non_qbx_potentials_all_rank = make_obj_array([ + np.zeros(nqbtl.nfiltered_targets, root_wrangler.dtype) + for k in root_wrangler.outputs] + ) + + for irank in range(total_rank): + + if irank == 0: + non_qbx_potentials_cur_rank = non_qbx_potentials + else: + non_qbx_potentials_cur_rank = comm.recv( + source=irank, tag=MPITags["non_qbx_potentials"]) + + for idim in range(len(root_wrangler.outputs)): + non_qbx_potentials_all_rank[idim][ + distributed_geo_data.particle_mask[irank] + ] = non_qbx_potentials_cur_rank[idim] + + for ap_i, nqp_i in zip( + all_potentials_in_tree_order, non_qbx_potentials_all_rank): + ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i + return None # }}} -- GitLab From c4118a5f423cbd72a221df76625660c14c5a990b Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 15 May 2018 23:53:15 -0500 Subject: [PATCH 10/86] Send qbx_potentials back to master process --- pytential/qbx/distributed.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 
insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index c4ac3d13..5111ceab 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -17,7 +17,8 @@ MPITags = { "qbx_center_to_target_box": 5, "center_to_tree_targets": 6, "qbx_targets": 7, - "non_qbx_potentials": 8 + "non_qbx_potentials": 8, + "qbx_potentials": 9 } # }}} @@ -253,6 +254,7 @@ class DistributedGeoData(object): reqs_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) reqs_center_to_tree_targets = np.empty((total_rank,), dtype=object) reqs_qbx_targets = np.empty((total_rank,), dtype=object) + self.qbx_target_mask = np.empty((total_rank,), dtype=object) for irank in range(total_rank): tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) @@ -354,6 +356,8 @@ class DistributedGeoData(object): current_start = current_stop ilocal_center += 1 + self.qbx_target_mask[irank] = qbx_target_mask + local_lists = local_lists[:current_start] qbx_target_scan = np.empty((tree.ntargets + 1,), dtype=lists.dtype) @@ -666,6 +670,7 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) + comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) return None else: # master process @@ -697,6 +702,32 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, all_potentials_in_tree_order, non_qbx_potentials_all_rank): ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i - return None + for irank in range(total_rank): + + if irank == 0: + qbx_potentials_cur_rank = qbx_potentials + else: + qbx_potentials_cur_rank = comm.recv( + source=irank, tag=MPITags["qbx_potentials"] + ) + + for idim in range(len(root_wrangler.outputs)): + all_potentials_in_tree_order[idim][ + distributed_geo_data.qbx_target_mask[irank] + ] = qbx_potentials_cur_rank[idim] + + def reorder_and_finalize_potentials(x): + # 
"finalize" gives host FMMs (like FMMlib) a chance to turn the + # potential back into a CL array. + return root_wrangler.finalize_potentials( + x[root_wrangler.tree.sorted_target_ids]) + + from pytools.obj_array import with_object_array_or_scalar + result = with_object_array_or_scalar( + reorder_and_finalize_potentials, all_potentials_in_tree_order) + + # }}} + + return result # }}} -- GitLab From 66dc17e14d7cea854a48fed5a35f1876a0e39292 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 16 May 2018 11:48:57 -0500 Subject: [PATCH 11/86] Add multipole communication --- pytential/qbx/distributed.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 5111ceab..0f6b473d 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -576,6 +576,19 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, # }}} + # {{{ Communicate mpoles + + from boxtree.distributed import communicate_mpoles + + if _communicate_mpoles_via_allreduce: + mpole_exps_all = np.zeros_like(mpole_exps) + comm.Allreduce(mpole_exps, mpole_exps_all) + mpole_exps = mpole_exps_all + else: + communicate_mpoles(wrangler, comm, local_traversal, mpole_exps) + + # }}} + # {{{ direct evaluation from neighbor source boxes ("list 1") non_qbx_potentials = wrangler.eval_direct( @@ -726,8 +739,6 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) - # }}} - return result # }}} -- GitLab From 06efc5372d29aeb915cf308e8056583c2fdcccf5 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 17 May 2018 23:43:02 -0500 Subject: [PATCH 12/86] Distribute traversal options --- pytential/qbx/distributed.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 0f6b473d..0111795b 100644 --- 
a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -144,13 +144,39 @@ class DistributedGeoData(object): else: # worker process traversal = None + # {{{ Distribute _from_sep_smaller_min_nsources_cumul + + if current_rank == 0: + trav_param = { + "_from_sep_smaller_min_nsources_cumul": + geo_data.geo_data.lpot_source. + _from_sep_smaller_min_nsources_cumul, + "well_sep_is_n_away": + geo_data.geo_data.code_getter.build_traversal.well_sep_is_n_away, + "from_sep_smaller_crit": + geo_data.geo_data.code_getter.build_traversal. + from_sep_smaller_crit + } + else: + trav_param = None + + trav_param = comm.bcast(trav_param, root=0) + + # }}} + from boxtree.distributed import generate_local_tree self.local_tree, self.local_data, self.box_bounding_box, knls = \ generate_local_tree(traversal) from boxtree.distributed import generate_local_travs self.trav_local, self.trav_global = generate_local_travs( - self.local_tree, self.box_bounding_box, comm=comm) + self.local_tree, self.box_bounding_box, comm=comm, + well_sep_is_n_away=trav_param["well_sep_is_n_away"], + from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], + _from_sep_smaller_min_nsources_cumul=trav_param[ + "_from_sep_smaller_min_nsources_cumul"], + merge_close_lists=True + ) # {{{ Distribute non_qbx_box_target_lists -- GitLab From 2cc0f5a98359bad7977d7d5fafc23ca739e45f0c Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 19 May 2018 15:09:31 -0500 Subject: [PATCH 13/86] Handle dipole_vec is None --- pytential/qbx/distributed.py | 4 +++- .../test_off_surface_eval.py} | 0 2 files changed, 3 insertions(+), 1 deletion(-) rename test/{test_distributed.py => distributed/test_off_surface_eval.py} (100%) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 0111795b..f00e8665 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -52,7 +52,9 @@ class QBXDistributedFMMLibExpansionWrangler( # {{{ Distribute dipole_vec - if current_rank == 0: + 
distributed_wrangler.dipole_vec = None + + if current_rank == 0 and wrangler.dipole_vec is not None: reqs_dipole_vec = np.empty((total_rank,), dtype=object) local_dipole_vec = np.empty((total_rank,), dtype=object) for irank in range(total_rank): diff --git a/test/test_distributed.py b/test/distributed/test_off_surface_eval.py similarity index 100% rename from test/test_distributed.py rename to test/distributed/test_off_surface_eval.py -- GitLab From 97a30b732e4d61559ad025a4c77ae69dd25fc6e9 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 19 May 2018 15:27:00 -0500 Subject: [PATCH 14/86] Fix bug for handling dipole_vec --- pytential/qbx/distributed.py | 48 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index f00e8665..be6e10b1 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -41,8 +41,14 @@ class QBXDistributedFMMLibExpansionWrangler( distributed_wrangler.geo_data = None distributed_wrangler.code = None distributed_wrangler.tree = None - distributed_wrangler.dipole_vec = None distributed_wrangler.__class__ = cls + + # Use bool to represent whether dipole_vec needs to be distributed + if wrangler.dipole_vec is not None: + distributed_wrangler.dipole_vec = True + else: + distributed_wrangler.dipole_vec = False + else: # worker process distributed_wrangler = None @@ -52,27 +58,31 @@ class QBXDistributedFMMLibExpansionWrangler( # {{{ Distribute dipole_vec - distributed_wrangler.dipole_vec = None + if distributed_wrangler.dipole_vec: + + if current_rank == 0: + reqs_dipole_vec = np.empty((total_rank,), dtype=object) + local_dipole_vec = np.empty((total_rank,), dtype=object) + for irank in range(total_rank): + src_mask = \ + distributed_geo_data.local_data[irank]["src_mask"].get() + local_dipole_vec[irank] = \ + wrangler.dipole_vec[:, src_mask.astype(bool)] + reqs_dipole_vec[irank] = comm.isend( + local_dipole_vec[irank], + 
dest=irank, + tag=MPITags["dipole_vec"] + ) - if current_rank == 0 and wrangler.dipole_vec is not None: - reqs_dipole_vec = np.empty((total_rank,), dtype=object) - local_dipole_vec = np.empty((total_rank,), dtype=object) - for irank in range(total_rank): - src_mask = distributed_geo_data.local_data[irank]["src_mask"].get() - local_dipole_vec[irank] = \ - wrangler.dipole_vec[:, src_mask.astype(bool)] - reqs_dipole_vec[irank] = comm.isend( - local_dipole_vec[irank], - dest=irank, - tag=MPITags["dipole_vec"] - ) + for irank in range(1, total_rank): + reqs_dipole_vec[irank].wait() + distributed_wrangler.dipole_vec = local_dipole_vec[0] + else: + distributed_wrangler.dipole_vec = comm.recv( + source=0, tag=MPITags["dipole_vec"]) - for irank in range(1, total_rank): - reqs_dipole_vec[irank].wait() - distributed_wrangler.dipole_vec = local_dipole_vec[0] else: - distributed_wrangler.dipole_vec = comm.recv( - source=0, tag=MPITags["dipole_vec"]) + distributed_wrangler.dipole_vec = None # }}} -- GitLab From a81dae70490479d5f0fc7bd0d59487de63da09a4 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 19 May 2018 21:18:34 -0500 Subject: [PATCH 15/86] Allow the master process to terminate all worker processes --- pytential/qbx/distributed.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index be6e10b1..e80b1b87 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -567,9 +567,19 @@ class DistributedGeoData(object): def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, _communicate_mpoles_via_allreduce=False): + current_rank = comm.Get_rank() total_rank = comm.Get_size() + if current_rank == 0: + flag = True + else: + flag = None + flag = comm.bcast(flag, root=0) + + if not flag: + return False + if current_rank == 0: distributed_geo_data = DistributedGeoData(root_wrangler.geo_data) else: @@ -722,7 +732,7 @@ def drive_dfmm(root_wrangler, src_weights, 
comm=MPI.COMM_WORLD, if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) - return None + return True else: # master process -- GitLab From adc30018fe696998daaf4fad9d29cbcf7f7fc7ca Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 19 May 2018 21:22:33 -0500 Subject: [PATCH 16/86] Add helmholtz example using distributed FMM --- examples/distributed_helmholtz.py | 153 ++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 examples/distributed_helmholtz.py diff --git a/examples/distributed_helmholtz.py b/examples/distributed_helmholtz.py new file mode 100644 index 00000000..09f50c28 --- /dev/null +++ b/examples/distributed_helmholtz.py @@ -0,0 +1,153 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.clmath # noqa + +from meshmode.discretization import Discretization +from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + +from pytential import bind, sym # noqa + +from mpi4py import MPI + +# {{{ set some constants for use below + +nelements = 20 +bdry_quad_order = 4 +mesh_order = bdry_quad_order +qbx_order = bdry_quad_order +bdry_ovsmp_quad_order = 4*bdry_quad_order +fmm_order = 10 +k = 3 + +# }}} + +# {{{ setup MPI + +comm = MPI.COMM_WORLD +current_rank = comm.Get_rank() +total_rank = comm.Get_size() + +# }}} + + +if current_rank == 0: + import logging + logging.basicConfig(level=logging.WARNING) # INFO for more progress info + + cl_ctx = cl.create_some_context() + queue = cl.CommandQueue(cl_ctx) + + from meshmode.mesh.generation import ellipse, make_curve_mesh + from functools import partial + + if 0: + mesh = make_curve_mesh( + partial(ellipse, 1), + np.linspace(0, 1, nelements+1), + mesh_order) + else: + base_mesh = make_curve_mesh( + partial(ellipse, 1), + np.linspace(0, 1, nelements+1), + mesh_order) + + from 
meshmode.mesh.processing import affine_map, merge_disjoint_meshes + nx = 2 + ny = 2 + dx = 2 / nx + meshes = [ + affine_map( + base_mesh, + A=np.diag([dx*0.25, dx*0.25]), + b=np.array([dx*(ix-nx/2), dx*(iy-ny/2)])) + for ix in range(nx) + for iy in range(ny)] + + mesh = merge_disjoint_meshes(meshes, single_group=True) + + if 0: + from meshmode.mesh.visualization import draw_curve + draw_curve(mesh) + import matplotlib.pyplot as plt + plt.show() + + pre_density_discr = Discretization( + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order)) + + from pytential.qbx import QBXLayerPotentialSource + qbx, _ = QBXLayerPotentialSource( + pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order, + fmm_order=fmm_order, + fmm_backend="distributed" + ).with_refinement() + density_discr = qbx.density_discr + + # {{{ describe bvp + + from sumpy.kernel import HelmholtzKernel + kernel = HelmholtzKernel(2) + + cse = sym.cse + + sigma_sym = sym.var("sigma") + sqrt_w = sym.sqrt_jac_q_weight(2) + inv_sqrt_w_sigma = cse(sigma_sym/sqrt_w) + + # Brakhage-Werner parameter + alpha = 1j + + # -1 for interior Dirichlet + # +1 for exterior Dirichlet + loc_sign = +1 + + bdry_op_sym = (-loc_sign*0.5*sigma_sym + + sqrt_w*( + alpha*sym.S(kernel, inv_sqrt_w_sigma, k=sym.var("k")) + - sym.D(kernel, inv_sqrt_w_sigma, k=sym.var("k")) + )) + + # }}} + + bound_op = bind(qbx, bdry_op_sym) + + # {{{ fix rhs and solve + + nodes = density_discr.nodes().with_queue(queue) + k_vec = np.array([2, 1]) + k_vec = k * k_vec / la.norm(k_vec, 2) + + def u_incoming_func(x): + return cl.clmath.exp( + 1j * (x[0] * k_vec[0] + x[1] * k_vec[1])) + + bc = -u_incoming_func(nodes) + + bvp_rhs = bind(qbx, sqrt_w*sym.var("bc"))(queue, bc=bc) + + from pytential.solve import gmres + gmres_result = gmres( + bound_op.scipy_op(queue, "sigma", dtype=np.complex128, k=k), + bvp_rhs, tol=1e-8, progress=True, + stall_iterations=0, + hard_failure=True) + + # }}} + + # {{{ Terminate helper drivers + + 
flag = False + flag = comm.bcast(flag, root=0) + + # }}} + +else: + from pytential.qbx.distributed import drive_dfmm + while True: + wrangler = None + weights = None + flag = drive_dfmm(wrangler, weights) + if not flag: + break -- GitLab From cc450f1c37999097ad82679952f941fb517a38a7 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 22 May 2018 16:25:39 -0500 Subject: [PATCH 17/86] Cache distributed_geo_data --- examples/distributed_helmholtz.py | 19 +-- pytential/qbx/__init__.py | 6 +- pytential/qbx/distributed.py | 182 ++++++++++++++++++++-- test/distributed/test_off_surface_eval.py | 14 +- 4 files changed, 189 insertions(+), 32 deletions(-) diff --git a/examples/distributed_helmholtz.py b/examples/distributed_helmholtz.py index 09f50c28..1b220aad 100644 --- a/examples/distributed_helmholtz.py +++ b/examples/distributed_helmholtz.py @@ -77,12 +77,11 @@ if current_rank == 0: cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order)) - from pytential.qbx import QBXLayerPotentialSource - qbx, _ = QBXLayerPotentialSource( - pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order, - fmm_order=fmm_order, - fmm_backend="distributed" - ).with_refinement() + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + qbx, _ = DistributedQBXLayerPotentialSource( + comm, pre_density_discr, fine_order=bdry_ovsmp_quad_order, + qbx_order=qbx_order, fmm_order=fmm_order + ).with_refinement() density_discr = qbx.density_discr # {{{ describe bvp @@ -144,10 +143,12 @@ if current_rank == 0: # }}} else: + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + lp_source = DistributedQBXLayerPotentialSource(comm, None, None) + from pytential.qbx.distributed import drive_dfmm while True: wrangler = None weights = None - flag = drive_dfmm(wrangler, weights) - if not flag: - break + distribute_geo_data = lp_source.distibuted_geo_data(None) + drive_dfmm(wrangler, weights, distribute_geo_data) diff --git 
a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 72ea58de..346cfe23 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -726,9 +726,13 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # }}} # {{{ execute global QBX + if self.fmm_backend == 'distributed': + distributed_geo_data = self.distibuted_geo_data(geo_data) + from pytential.qbx.distributed import drive_dfmm - all_potentials_on_every_tgt = drive_dfmm(wrangler, strengths) + all_potentials_on_every_tgt = drive_dfmm( + wrangler, strengths, distributed_geo_data, comm=self.comm) else: from pytential.qbx.fmm import drive_fmm all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index e80b1b87..a52d4a22 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,4 +1,5 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler +from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed import DistributedFMMLibExpansionWrangler, queue from boxtree.tree import FilteredTargetListsInTreeOrder from mpi4py import MPI @@ -563,28 +564,178 @@ class DistributedGeoData(object): # }}} +class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): + + def __init__( + self, + comm, + density_discr, + fine_order, + qbx_order=None, + fmm_order=None, + fmm_level_to_order=None, + to_refined_connection=None, + expansion_factory=None, + target_association_tolerance=_not_provided, + + # begin undocumented arguments + # FIXME default debug=False once everything has matured + debug=True, + _refined_for_global_qbx=False, + _expansions_in_tree_have_extent=True, + _expansion_stick_out_factor=0.5, + _well_sep_is_n_away=2, + _max_leaf_refine_weight=None, + _box_extent_norm=None, + _from_sep_smaller_crit=None, + _from_sep_smaller_min_nsources_cumul=None, + _tree_kind="adaptive", + geometry_data_inspector=None, + target_stick_out_factor=_not_provided): + + 
self.comm = comm + current_rank = self.comm.Get_rank() + + self.distributed_geo_data_cache = {} + + if current_rank == 0: + self.next_geo_data_id = 0 + self.arg_to_id = {} + + if current_rank == 0: + + super(DistributedQBXLayerPotentialSource, self).__init__( + density_discr, + fine_order, + qbx_order=qbx_order, + fmm_order=fmm_order, + fmm_level_to_order=fmm_level_to_order, + to_refined_connection=to_refined_connection, + expansion_factory=expansion_factory, + target_association_tolerance=target_association_tolerance, + debug=debug, + _refined_for_global_qbx=_refined_for_global_qbx, + _expansions_in_tree_have_extent=_expansions_in_tree_have_extent, + _expansion_stick_out_factor=_expansion_stick_out_factor, + _well_sep_is_n_away=_well_sep_is_n_away, + _max_leaf_refine_weight=_max_leaf_refine_weight, + _box_extent_norm=_box_extent_norm, + _from_sep_smaller_crit=_from_sep_smaller_crit, + _from_sep_smaller_min_nsources_cumul=( + _from_sep_smaller_min_nsources_cumul), + _tree_kind=_tree_kind, + geometry_data_inspector=geometry_data_inspector, + fmm_backend='distributed', + target_stick_out_factor=target_stick_out_factor + ) + + def copy( + self, + density_discr=None, + fine_order=None, + qbx_order=None, + fmm_order=_not_provided, + fmm_level_to_order=_not_provided, + to_refined_connection=None, + target_association_tolerance=_not_provided, + _expansions_in_tree_have_extent=_not_provided, + _expansion_stick_out_factor=_not_provided, + _max_leaf_refine_weight=None, + _box_extent_norm=None, + _from_sep_smaller_crit=None, + _tree_kind=None, + geometry_data_inspector=None, + fmm_backend=None, + + debug=_not_provided, + _refined_for_global_qbx=_not_provided, + target_stick_out_factor=_not_provided, + ): + + obj = super(DistributedQBXLayerPotentialSource, self).copy( + density_discr=density_discr, + fine_order=fine_order, + qbx_order=qbx_order, + fmm_order=fmm_order, + fmm_level_to_order=fmm_level_to_order, + to_refined_connection=to_refined_connection, + 
target_association_tolerance=target_association_tolerance, + _expansions_in_tree_have_extent=_expansions_in_tree_have_extent, + _expansion_stick_out_factor=_expansion_stick_out_factor, + _max_leaf_refine_weight=_max_leaf_refine_weight, + _box_extent_norm=_box_extent_norm, + _from_sep_smaller_crit=_from_sep_smaller_crit, + _tree_kind=_tree_kind, + geometry_data_inspector=geometry_data_inspector, + fmm_backend=fmm_backend, + + debug=debug, + _refined_for_global_qbx=_refined_for_global_qbx, + target_stick_out_factor=target_stick_out_factor, + ) + + obj.__class__ = DistributedQBXLayerPotentialSource + obj.comm = self.comm + obj.distributed_geo_data_cache = self.distributed_geo_data_cache + + current_rank = self.comm.Get_rank() + + if current_rank == 0: + obj.next_geo_data_id = self.next_geo_data_id + obj.arg_to_id = self.arg_to_id + + return obj + + def distibuted_geo_data(self, geo_data): + """ Note: This method needs to be called collectively by all processes of + self.comm + """ + current_rank = self.comm.Get_rank() + + if current_rank == 0: + + target_discrs_and_qbx_sides = geo_data.target_discrs_and_qbx_sides + + if target_discrs_and_qbx_sides in self.arg_to_id: + geo_data_id = self.arg_to_id[target_discrs_and_qbx_sides] + else: + geo_data_id = self.next_geo_data_id + self.arg_to_id[target_discrs_and_qbx_sides] = geo_data_id + self.next_geo_data_id += 1 + else: + geo_data_id = None + + geo_data_id = self.comm.bcast(geo_data_id, root=0) + + if geo_data_id in self.distributed_geo_data_cache: + return self.distributed_geo_data_cache[geo_data_id] + + # no cached result found, construct a new distributed_geo_data + if current_rank == 0: + + with cl.CommandQueue(geo_data.cl_context) as queue: + from pytential.qbx.fmmlib import ToHostTransferredGeoDataWrapper + host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) + + distributed_geo_data = DistributedGeoData(host_geo_data, self.comm) + + else: + distributed_geo_data = DistributedGeoData(None, self.comm) + + 
self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data + + return distributed_geo_data + + # {{{ FMM Driver -def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, +def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, + comm=MPI.COMM_WORLD, _communicate_mpoles_via_allreduce=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() - if current_rank == 0: - flag = True - else: - flag = None - flag = comm.bcast(flag, root=0) - - if not flag: - return False - - if current_rank == 0: - distributed_geo_data = DistributedGeoData(root_wrangler.geo_data) - else: - distributed_geo_data = DistributedGeoData(None) - distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( root_wrangler, distributed_geo_data) wrangler = distributed_wrangler @@ -732,7 +883,6 @@ def drive_dfmm(root_wrangler, src_weights, comm=MPI.COMM_WORLD, if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) - return True else: # master process diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index 69b052e0..d1c7e50d 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -4,7 +4,7 @@ from meshmode.mesh.generation import ( import functools from sympy.core.cache import clear_cache import numpy as np -from pytential.qbx import QBXLayerPotentialSource +from pytential.qbx.distributed import DistributedQBXLayerPotentialSource from meshmode.discretization import Discretization from meshmode.discretization.poly_element import ( InterpolatoryQuadratureSimplexGroupFactory) @@ -41,13 +41,12 @@ if current_rank == 0: # master rank pre_density_discr = Discretization( ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) - qbx, _ = QBXLayerPotentialSource( + qbx, _ = DistributedQBXLayerPotentialSource( + comm, pre_density_discr, 
fine_order=4 * target_order, qbx_order=qbx_order, - fmm_order=fmm_order, - fmm_backend="distributed" - # fmm_backend="fmmlib" + fmm_order=fmm_order ).with_refinement() density_discr = qbx.density_discr @@ -77,7 +76,10 @@ if current_rank == 0: # master rank assert linf_err < 1e-2 else: # helper rank + lp_source = DistributedQBXLayerPotentialSource(comm, None, None) + distribute_geo_data = lp_source.distibuted_geo_data(None) + from pytential.qbx.distributed import drive_dfmm wrangler = None weights = None - drive_dfmm(wrangler, weights) + drive_dfmm(wrangler, weights, distribute_geo_data, comm=comm) -- GitLab From 7ef7033aa0eac578f184ddd7ad62c32ace957e95 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 23 May 2018 14:31:23 -0500 Subject: [PATCH 18/86] Force _from_sep_smaller_min_nsources_cumul to None --- pytential/qbx/distributed.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index a52d4a22..374b9ac3 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -157,13 +157,10 @@ class DistributedGeoData(object): else: # worker process traversal = None - # {{{ Distribute _from_sep_smaller_min_nsources_cumul + # {{{ Distribute traversal parameters if current_rank == 0: trav_param = { - "_from_sep_smaller_min_nsources_cumul": - geo_data.geo_data.lpot_source. 
- _from_sep_smaller_min_nsources_cumul, "well_sep_is_n_away": geo_data.geo_data.code_getter.build_traversal.well_sep_is_n_away, "from_sep_smaller_crit": @@ -186,8 +183,6 @@ class DistributedGeoData(object): self.local_tree, self.box_bounding_box, comm=comm, well_sep_is_n_away=trav_param["well_sep_is_n_away"], from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], - _from_sep_smaller_min_nsources_cumul=trav_param[ - "_from_sep_smaller_min_nsources_cumul"], merge_close_lists=True ) @@ -588,7 +583,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _max_leaf_refine_weight=None, _box_extent_norm=None, _from_sep_smaller_crit=None, - _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", geometry_data_inspector=None, target_stick_out_factor=_not_provided): @@ -621,8 +615,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _max_leaf_refine_weight=_max_leaf_refine_weight, _box_extent_norm=_box_extent_norm, _from_sep_smaller_crit=_from_sep_smaller_crit, - _from_sep_smaller_min_nsources_cumul=( - _from_sep_smaller_min_nsources_cumul), + _from_sep_smaller_min_nsources_cumul=None, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, fmm_backend='distributed', -- GitLab From b66e015f9a3b8535bedae48204a44db2b6ce50c5 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 26 May 2018 13:56:11 -0500 Subject: [PATCH 19/86] Log timing --- pytential/qbx/distributed.py | 18 ++++++++++++++++-- test/distributed/test_off_surface_eval.py | 7 +++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 374b9ac3..e30d6ccf 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -4,8 +4,11 @@ from boxtree.distributed import DistributedFMMLibExpansionWrangler, queue from boxtree.tree import FilteredTargetListsInTreeOrder from mpi4py import MPI import numpy as np - import pyopencl as cl +import logging +import time + +logger = 
logging.getLogger(__name__) # {{{ MPITags used in this module @@ -127,7 +130,7 @@ class QBXDistributedFMMLibExpansionWrangler( # }}} -# {{{ +# {{{ Distributed GeoData class DistributedGeoData(object): def __init__(self, geo_data, comm=MPI.COMM_WORLD): @@ -154,6 +157,8 @@ class DistributedGeoData(object): qbx_center_to_target_box_source_level[level] = ( geo_data.qbx_center_to_target_box_source_level(level)) + start_time = time.time() + else: # worker process traversal = None @@ -448,6 +453,9 @@ class DistributedGeoData(object): reqs_qbx_targets[irank].wait() local_qbx_targets = local_qbx_targets[0] + logger.info("Distribute geometry data in {} secs.".format( + time.time() - start_time)) + else: local_centers = comm.recv( source=0, tag=MPITags["centers"]) @@ -729,6 +737,9 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, current_rank = comm.Get_rank() total_rank = comm.Get_size() + if current_rank == 0: + start_time = time.time() + distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( root_wrangler, distributed_geo_data) wrangler = distributed_wrangler @@ -930,6 +941,9 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) + logger.info("Distributed FMM evaluation finished in {} secs.".format( + time.time() - start_time)) + return result # }}} diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index d1c7e50d..78565a58 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -14,6 +14,13 @@ from sumpy.visualization import FieldPlotter from pytential.target import PointsTarget import matplotlib.pyplot as pt from mpi4py import MPI +import logging +import os + +# Set up logging infrastructure +logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) +logging.getLogger("boxtree.distributed").setLevel(logging.INFO) 
+logging.getLogger("pytential.qbx.distributed").setLevel(logging.INFO) # Get MPI information comm = MPI.COMM_WORLD -- GitLab From 7a7469c1ac867c13bd8f4302426a92fa9eaf7975 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 27 May 2018 16:58:31 -0500 Subject: [PATCH 20/86] Boxtree generate_local_travs function generate one trav object --- pytential/qbx/distributed.py | 61 ++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index e30d6ccf..665b379e 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -184,7 +184,7 @@ class DistributedGeoData(object): generate_local_tree(traversal) from boxtree.distributed import generate_local_travs - self.trav_local, self.trav_global = generate_local_travs( + self.local_trav = generate_local_travs( self.local_tree, self.box_bounding_box, comm=comm, well_sep_is_n_away=trav_param["well_sep_is_n_away"], from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], @@ -483,8 +483,8 @@ class DistributedGeoData(object): (self.local_tree.nboxes,), dtype=self.local_tree.particle_id_dtype) # make sure accessing invalid position raises an error global_boxes_to_target_boxes *= -1 - global_boxes_to_target_boxes[self.trav_global.target_boxes] = \ - np.arange(self.trav_global.target_boxes.shape[0]) + global_boxes_to_target_boxes[self.local_trav.target_boxes] = \ + np.arange(self.local_trav.target_boxes.shape[0]) self._local_qbx_center_to_target_box = \ global_boxes_to_target_boxes[local_qbx_center_to_target_box] @@ -534,7 +534,7 @@ class DistributedGeoData(object): return self._non_qbx_box_target_lists def traversal(self): - return self.trav_global + return self.local_trav def tree(self): return self.traversal().tree @@ -744,8 +744,7 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, root_wrangler, distributed_geo_data) wrangler = distributed_wrangler - local_traversal = distributed_geo_data.trav_local 
- global_traversal = distributed_geo_data.trav_global + local_traversal = distributed_geo_data.local_trav # {{{ Distribute source weights @@ -795,9 +794,9 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ direct evaluation from neighbor source boxes ("list 1") non_qbx_potentials = wrangler.eval_direct( - global_traversal.target_boxes, - global_traversal.neighbor_source_boxes_starts, - global_traversal.neighbor_source_boxes_lists, + local_traversal.target_boxes, + local_traversal.neighbor_source_boxes_starts, + local_traversal.neighbor_source_boxes_lists, local_source_weights) # }}} @@ -805,10 +804,10 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ translate separated siblings' ("list 2") mpoles to local local_exps = wrangler.multipole_to_local( - global_traversal.level_start_target_or_target_parent_box_nrs, - global_traversal.target_or_target_parent_boxes, - global_traversal.from_sep_siblings_starts, - global_traversal.from_sep_siblings_lists, + local_traversal.level_start_target_or_target_parent_box_nrs, + local_traversal.target_or_target_parent_boxes, + local_traversal.from_sep_siblings_starts, + local_traversal.from_sep_siblings_lists, mpole_exps) # }}} @@ -819,17 +818,17 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # contribution *out* of the downward-propagating local expansions) non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( - global_traversal.target_boxes_sep_smaller_by_source_level, - global_traversal.from_sep_smaller_by_level, + local_traversal.target_boxes_sep_smaller_by_source_level, + local_traversal.from_sep_smaller_by_level, mpole_exps) # assert that list 3 close has been merged into list 1 # assert global_traversal.from_sep_close_smaller_starts is None - if global_traversal.from_sep_close_smaller_starts is not None: + if local_traversal.from_sep_close_smaller_starts is not None: non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( - 
global_traversal.target_boxes, - global_traversal.from_sep_close_smaller_starts, - global_traversal.from_sep_close_smaller_lists, + local_traversal.target_boxes, + local_traversal.from_sep_close_smaller_starts, + local_traversal.from_sep_close_smaller_lists, local_source_weights) # }}} @@ -837,17 +836,17 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ form locals for separated bigger source boxes ("list 4") local_exps = local_exps + wrangler.form_locals( - global_traversal.level_start_target_or_target_parent_box_nrs, - global_traversal.target_or_target_parent_boxes, - global_traversal.from_sep_bigger_starts, - global_traversal.from_sep_bigger_lists, + local_traversal.level_start_target_or_target_parent_box_nrs, + local_traversal.target_or_target_parent_boxes, + local_traversal.from_sep_bigger_starts, + local_traversal.from_sep_bigger_lists, local_source_weights) - if global_traversal.from_sep_close_bigger_starts is not None: + if local_traversal.from_sep_close_bigger_starts is not None: non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( - global_traversal.target_or_target_parent_boxes, - global_traversal.from_sep_close_bigger_starts, - global_traversal.from_sep_close_bigger_lists, + local_traversal.target_or_target_parent_boxes, + local_traversal.from_sep_close_bigger_starts, + local_traversal.from_sep_close_bigger_lists, local_source_weights) # }}} @@ -855,8 +854,8 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ propagate local_exps downward wrangler.refine_locals( - global_traversal.level_start_target_or_target_parent_box_nrs, - global_traversal.target_or_target_parent_boxes, + local_traversal.level_start_target_or_target_parent_box_nrs, + local_traversal.target_or_target_parent_boxes, local_exps) # }}} @@ -864,8 +863,8 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ evaluate locals non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( - 
global_traversal.level_start_target_box_nrs, - global_traversal.target_boxes, + local_traversal.level_start_target_box_nrs, + local_traversal.target_boxes, local_exps) # }}} -- GitLab From 3e4700f638baa7d37a61160de870de7e2443fcbf Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 30 May 2018 21:09:01 -0500 Subject: [PATCH 21/86] Add 3d test case --- test/distributed/test_layer_pot_identity.py | 175 ++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 test/distributed/test_layer_pot_identity.py diff --git a/test/distributed/test_layer_pot_identity.py b/test/distributed/test_layer_pot_identity.py new file mode 100644 index 00000000..6cbc59ed --- /dev/null +++ b/test/distributed/test_layer_pot_identity.py @@ -0,0 +1,175 @@ +import pyopencl as cl +from pytential import bind, sym, norm +import numpy as np +from sympy.core.cache import clear_cache +from pytools.convergence import EOCRecorder +from mpi4py import MPI +from pytential.qbx.distributed import DistributedQBXLayerPotentialSource +from sumpy.kernel import LaplaceKernel +import matplotlib.pyplot as pt + +comm = MPI.COMM_WORLD +current_rank = comm.Get_rank() +total_rank = comm.Get_size() + +# prevent cache 'splosion +clear_cache() + +if current_rank == 0: + + class GreenExpr(object): + zero_op_name = "green" + + def get_zero_op(self, kernel, **knl_kwargs): + + u_sym = sym.var("u") + dn_u_sym = sym.var("dn_u") + + return ( + sym.S(kernel, dn_u_sym, qbx_forced_limit=-1, **knl_kwargs) + - sym.D(kernel, u_sym, qbx_forced_limit="avg", **knl_kwargs) + - 0.5*u_sym) + + order_drop = 0 + + def get_sphere_mesh(refinement_increment, target_order): + from meshmode.mesh.generation import generate_icosphere + mesh = generate_icosphere(1, target_order) + from meshmode.mesh.refinement import Refiner + + refiner = Refiner(mesh) + for i in range(refinement_increment): + flags = np.ones(mesh.nelements, dtype=bool) + refiner.refine(flags) + mesh = refiner.get_current_mesh() + + return mesh + + class 
SphereGeometry(object): + mesh_name = "sphere" + dim = 3 + + resolutions = [0, 1] + + def get_mesh(self, resolution, tgt_order): + return get_sphere_mesh(resolution, tgt_order) + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + expr = GreenExpr() + geometry = SphereGeometry() + + target_order = 8 + k = 0 + qbx_order = 3 + fmm_order = 10 + resolutions = [0, 1] + _expansion_stick_out_factor = 0.5 + visualize = False + + eoc_rec = EOCRecorder() + + for resolution in resolutions: + mesh = geometry.get_mesh(resolution, target_order) + if mesh is None: + break + + d = mesh.ambient_dim + + lap_k_sym = LaplaceKernel(d) + k_sym = lap_k_sym + knl_kwargs = {} + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + + pre_density_discr = Discretization( + ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order)) + + refiner_extra_kwargs = {} + + qbx, _ = DistributedQBXLayerPotentialSource( + comm, + pre_density_discr, 4 * target_order, + qbx_order, + fmm_order=fmm_order, + _expansions_in_tree_have_extent=True, + _expansion_stick_out_factor=_expansion_stick_out_factor + ).with_refinement(**refiner_extra_kwargs) + + density_discr = qbx.density_discr + + # {{{ compute values of a solution to the PDE + + nodes_host = density_discr.nodes().get(queue) + normal = bind(density_discr, sym.normal(d))(queue).as_vector(np.object) + normal_host = [normal[j].get() for j in range(d)] + + center = np.array([3, 1, 2])[:d] + diff = nodes_host - center[:, np.newaxis] + dist_squared = np.sum(diff ** 2, axis=0) + dist = np.sqrt(dist_squared) + if d == 2: + u = np.log(dist) + grad_u = diff / dist_squared + elif d == 3: + u = 1 / dist + grad_u = -diff / dist ** 3 + else: + assert False + + dn_u = 0 + for i in range(d): + dn_u = dn_u + normal_host[i] * grad_u[i] + + # }}} + + u_dev = cl.array.to_device(queue, u) + dn_u_dev = cl.array.to_device(queue, dn_u) + grad_u_dev = 
cl.array.to_device(queue, grad_u) + + key = (qbx_order, geometry.mesh_name, resolution, + expr.zero_op_name) + + bound_op = bind(qbx, expr.get_zero_op(k_sym, **knl_kwargs)) + error = bound_op( + queue, u=u_dev, dn_u=dn_u_dev, grad_u=grad_u_dev, k=k) + if 0: + pt.plot(error) + pt.show() + + linf_error_norm = norm(density_discr, queue, error, p=np.inf) + print("--->", key, linf_error_norm) + + eoc_rec.add_data_point(qbx.h_max, linf_error_norm) + + if visualize: + from meshmode.discretization.visualization import make_visualizer + + bdry_vis = make_visualizer(queue, density_discr, target_order) + + bdry_normals = bind(density_discr, sym.normal(mesh.ambient_dim))(queue) \ + .as_vector(dtype=object) + + bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [ + ("u", u_dev), + ("bdry_normals", bdry_normals), + ("error", error), + ]) + + print(eoc_rec) + tgt_order = qbx_order - expr.order_drop + assert eoc_rec.order_estimate() > tgt_order - 1.6 + +else: + while True: + lp_source = DistributedQBXLayerPotentialSource(comm, None, None) + distribute_geo_data = lp_source.distibuted_geo_data(None) + + from pytential.qbx.distributed import drive_dfmm + wrangler = None + weights = None + drive_dfmm(wrangler, weights, distribute_geo_data, comm=comm) -- GitLab From 2fd612fc2a6c1fe6d099f6ec38a03c2f41d5e88f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 22 Jun 2018 17:04:31 +0800 Subject: [PATCH 22/86] Add QBXResponsibleBoxQuery, bug fix --- pytential/qbx/distributed.py | 143 ++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 665b379e..7bf32599 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -2,9 +2,11 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed import DistributedFMMLibExpansionWrangler, queue from boxtree.tree import 
FilteredTargetListsInTreeOrder +from boxtree.partition import ResponsibleBoxesQuery from mpi4py import MPI import numpy as np import pyopencl as cl +from pyopencl.algorithm import ListOfListsBuilder import logging import time @@ -130,6 +132,124 @@ class QBXDistributedFMMLibExpansionWrangler( # }}} +class QBXResponsibleBoxQuery(ResponsibleBoxesQuery): + + def __init__(self, queue, traversal, global_geo_data): + super(QBXResponsibleBoxQuery, self).__init__(queue, traversal) + + with cl.CommandQueue(global_geo_data.cl_context) as geo_data_queue: + self.user_target_to_center = \ + global_geo_data.user_target_to_center().get(queue=geo_data_queue) + self.qbx_center_to_target_box = \ + global_geo_data.qbx_center_to_target_box().get(queue=geo_data_queue) + + self.box_target_starts_dev = cl.array.to_device( + queue, traversal.tree.box_target_starts) + + self.box_target_counts_nonchild_dev = cl.array.to_device( + queue, traversal.tree.box_target_counts_nonchild) + + # helper kernel for constructing a list of particles in given boxes + from mako.template import Template + from pyopencl.tools import dtype_to_ctype + + self.box_to_target_knl = ListOfListsBuilder( + queue.context, + [("particle_list", self.tree.particle_id_dtype)], + Template(""" + void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) + { + typedef ${particle_id_t} particle_id_t; + typedef ${box_id_t} box_id_t; + + box_id_t ibox = box_list[i]; + particle_id_t start = box_particle_start[ibox]; + particle_id_t end = start + box_particle_counts_nonchild[ibox]; + + for(particle_id_t iparticle = start; iparticle < end; iparticle++) + { + APPEND_particle_list(iparticle); + } + } + """).render( + particle_id_t=dtype_to_ctype(self.tree.particle_id_dtype), + box_id_t=dtype_to_ctype(self.tree.box_id_dtype) + ), + arg_decls=Template(""" + ${particle_id_t} *box_particle_start, + ${particle_id_t} *box_particle_counts_nonchild, + ${box_id_t} *box_list + """).render( + 
particle_id_t=dtype_to_ctype(self.tree.particle_id_dtype), + box_id_t=dtype_to_ctype(self.tree.box_id_dtype) + ) + ) + + def center_boxes_mask(self, responsible_boxes_list): + lists, _ = self.box_to_target_knl( + self.queue, + responsible_boxes_list.shape[0], + self.box_target_starts_dev.data, + self.box_target_counts_nonchild_dev.data, + cl.array.to_device(self.queue, responsible_boxes_list).data + ) + + targets_list = lists["particle_list"] + targets_list = targets_list.lists.get() + + tree_order_to_user_order = np.ones( + (self.tree.ntargets,), dtype=self.tree.particle_id_dtype) * -1 + tree_order_to_user_order[self.tree.sorted_target_ids] = np.arange( + self.tree.ntargets) + + targets_list = tree_order_to_user_order[targets_list] + + centers = self.user_target_to_center[targets_list] + + from pytential.qbx.geometry import target_state + centers = centers[centers != target_state.NO_QBX_NEEDED] + + center_target_boxes = self.qbx_center_to_target_box[centers] + + global_center_boxes = self.traversal.target_boxes[center_target_boxes] + + center_boxes_mask = np.zeros((self.tree.nboxes,), dtype=np.int8) + center_boxes_mask[global_center_boxes] = 1 + + center_boxes_mask = cl.array.to_device(queue, center_boxes_mask) + + return center_boxes_mask + + def get_boxes_mask(self, responsible_boxes_list): + responsible_boxes_mask = np.zeros((self.tree.nboxes,), dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list] = 1 + responsible_boxes_mask = cl.array.to_device( + self.queue, responsible_boxes_mask) + + ancestor_boxes_mask = self.ancestor_boxes_mask(responsible_boxes_mask) + + centers_boxes_mask = self.center_boxes_mask(responsible_boxes_list) + + responsible_and_centers_boxes_mask = ( + responsible_boxes_mask | centers_boxes_mask) + + ancestor_responsible_and_centers_boxes_mask = self.ancestor_boxes_mask( + responsible_and_centers_boxes_mask) + + src_boxes_mask = self.src_boxes_mask( + responsible_and_centers_boxes_mask, + 
ancestor_responsible_and_centers_boxes_mask + ) + + multipole_boxes_mask = self.multipole_boxes_mask( + responsible_and_centers_boxes_mask, + ancestor_responsible_and_centers_boxes_mask, + ) + + return (responsible_boxes_mask, ancestor_boxes_mask, src_boxes_mask, + multipole_boxes_mask) + + # {{{ Distributed GeoData class DistributedGeoData(object): @@ -179,9 +299,28 @@ class DistributedGeoData(object): # }}} + if current_rank == 0: + from boxtree.partition import partition_work + from boxtree.distributed import WorkloadWeight + workload_weight = WorkloadWeight( + direct=1, m2l=1, m2p=1, p2l=1, multipole=5 + ) + responsible_boxes_list = partition_work( + traversal, comm.Get_size(), workload_weight + ) + else: + responsible_boxes_list = None + + if current_rank == 0: + responsible_box_query = QBXResponsibleBoxQuery( + queue, traversal, geo_data.geo_data) + else: + responsible_box_query = None + from boxtree.distributed import generate_local_tree self.local_tree, self.local_data, self.box_bounding_box, knls = \ - generate_local_tree(traversal) + generate_local_tree(traversal, responsible_boxes_list, + responsible_box_query) from boxtree.distributed import generate_local_travs self.local_trav = generate_local_travs( @@ -623,7 +762,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _max_leaf_refine_weight=_max_leaf_refine_weight, _box_extent_norm=_box_extent_norm, _from_sep_smaller_crit=_from_sep_smaller_crit, - _from_sep_smaller_min_nsources_cumul=None, + _from_sep_smaller_min_nsources_cumul=0, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, fmm_backend='distributed', -- GitLab From aca0eca06312af865598a45370da6f5ebfb1d7b7 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 3 Jul 2018 13:51:11 +0800 Subject: [PATCH 23/86] Use new boxtree API --- pytential/qbx/__init__.py | 4 +- pytential/qbx/distributed.py | 64 ++++++++++++--------- test/distributed/test_layer_pot_identity.py | 11 ++-- 
test/distributed/test_off_surface_eval.py | 4 +- 4 files changed, 47 insertions(+), 36 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 346cfe23..0213c806 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -728,11 +728,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX if self.fmm_backend == 'distributed': - distributed_geo_data = self.distibuted_geo_data(geo_data) + distributed_geo_data = self.distibuted_geo_data(geo_data, queue) from pytential.qbx.distributed import drive_dfmm all_potentials_on_every_tgt = drive_dfmm( - wrangler, strengths, distributed_geo_data, comm=self.comm) + queue, wrangler, strengths, distributed_geo_data, comm=self.comm) else: from pytential.qbx.fmm import drive_fmm all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 7bf32599..563c58ce 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,8 +1,8 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource, _not_provided -from boxtree.distributed import DistributedFMMLibExpansionWrangler, queue +from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder -from boxtree.partition import ResponsibleBoxesQuery +from boxtree.distributed.partition import ResponsibleBoxesQuery from mpi4py import MPI import numpy as np import pyopencl as cl @@ -36,7 +36,7 @@ class QBXDistributedFMMLibExpansionWrangler( QBXFMMLibExpansionWrangler, DistributedFMMLibExpansionWrangler): @classmethod - def distribute(cls, wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): + def distribute(cls, queue, wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -92,6 +92,8 @@ class QBXDistributedFMMLibExpansionWrangler( # }}} + 
distributed_wrangler.queue = queue + return distributed_wrangler def eval_qbx_expansions(self, qbx_expansions): @@ -216,7 +218,7 @@ class QBXResponsibleBoxQuery(ResponsibleBoxesQuery): center_boxes_mask = np.zeros((self.tree.nboxes,), dtype=np.int8) center_boxes_mask[global_center_boxes] = 1 - center_boxes_mask = cl.array.to_device(queue, center_boxes_mask) + center_boxes_mask = cl.array.to_device(self.queue, center_boxes_mask) return center_boxes_mask @@ -253,7 +255,7 @@ class QBXResponsibleBoxQuery(ResponsibleBoxesQuery): # {{{ Distributed GeoData class DistributedGeoData(object): - def __init__(self, geo_data, comm=MPI.COMM_WORLD): + def __init__(self, geo_data, queue, comm=MPI.COMM_WORLD): self.comm = comm current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -300,7 +302,7 @@ class DistributedGeoData(object): # }}} if current_rank == 0: - from boxtree.partition import partition_work + from boxtree.distributed.partition import partition_work from boxtree.distributed import WorkloadWeight workload_weight = WorkloadWeight( direct=1, m2l=1, m2p=1, p2l=1, multipole=5 @@ -317,14 +319,14 @@ class DistributedGeoData(object): else: responsible_box_query = None - from boxtree.distributed import generate_local_tree - self.local_tree, self.local_data, self.box_bounding_box, knls = \ - generate_local_tree(traversal, responsible_boxes_list, + from boxtree.distributed.local_tree import generate_local_tree + self.local_tree, self.local_data, self.box_bounding_box = \ + generate_local_tree(queue, traversal, responsible_boxes_list, responsible_box_query) - from boxtree.distributed import generate_local_travs + from boxtree.distributed.local_traversal import generate_local_travs self.local_trav = generate_local_travs( - self.local_tree, self.box_bounding_box, comm=comm, + queue, self.local_tree, self.box_bounding_box, well_sep_is_n_away=trav_param["well_sep_is_n_away"], from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], merge_close_lists=True @@ -333,6 +335,9 @@ 
class DistributedGeoData(object): # {{{ Distribute non_qbx_box_target_lists if current_rank == 0: # master process + from boxtree.distributed.local_tree import get_fetch_local_particles_knls + knls = get_fetch_local_particles_knls(queue.context, tree) + box_target_starts = cl.array.to_device( queue, non_qbx_box_target_lists.box_target_starts) box_target_counts_nonchild = cl.array.to_device( @@ -347,7 +352,7 @@ class DistributedGeoData(object): for irank in range(total_rank): particle_mask = cl.array.zeros(queue, (nfiltered_targets,), dtype=tree.particle_id_dtype) - knls["particle_mask_knl"]( + knls.particle_mask_knl( self.local_data[irank]["tgt_box_mask"], box_target_starts, box_target_counts_nonchild, @@ -357,18 +362,18 @@ class DistributedGeoData(object): particle_scan = cl.array.empty(queue, (nfiltered_targets + 1,), dtype=tree.particle_id_dtype) particle_scan[0] = 0 - knls["mask_scan_knl"](particle_mask, particle_scan) + knls.mask_scan_knl(particle_mask, particle_scan) local_box_target_starts = cl.array.empty( queue, (tree.nboxes,), dtype=tree.particle_id_dtype) - knls["generate_box_particle_starts"]( + knls.generate_box_particle_starts( box_target_starts, particle_scan, local_box_target_starts ) local_box_target_counts_nonchild = cl.array.zeros( queue, (tree.nboxes,), dtype=tree.particle_id_dtype) - knls["generate_box_particle_counts_nonchild"]( + knls.generate_box_particle_counts_nonchild( self.local_data[irank]["tgt_box_mask"], box_target_counts_nonchild, local_box_target_counts_nonchild @@ -826,7 +831,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): return obj - def distibuted_geo_data(self, geo_data): + def distibuted_geo_data(self, geo_data, queue): """ Note: This method needs to be called collectively by all processes of self.comm """ @@ -852,15 +857,14 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # no cached result found, construct a new distributed_geo_data if current_rank == 0: + from 
pytential.qbx.fmmlib import ToHostTransferredGeoDataWrapper + host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) - with cl.CommandQueue(geo_data.cl_context) as queue: - from pytential.qbx.fmmlib import ToHostTransferredGeoDataWrapper - host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) - - distributed_geo_data = DistributedGeoData(host_geo_data, self.comm) + distributed_geo_data = DistributedGeoData(host_geo_data, queue, + self.comm) else: - distributed_geo_data = DistributedGeoData(None, self.comm) + distributed_geo_data = DistributedGeoData(None, queue, self.comm) self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data @@ -869,7 +873,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # {{{ FMM Driver -def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, +def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, _communicate_mpoles_via_allreduce=False): @@ -880,7 +884,7 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, start_time = time.time() distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( - root_wrangler, distributed_geo_data) + queue, root_wrangler, distributed_geo_data) wrangler = distributed_wrangler local_traversal = distributed_geo_data.local_trav @@ -893,9 +897,15 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, else: global_tree = None - from boxtree.distributed import distribute_source_weights + from boxtree.distributed.calculation import distribute_source_weights + + if current_rank == 0: + queue = cl.CommandQueue(root_wrangler.geo_data.geo_data.cl_context) + else: + queue = None + local_source_weights = distribute_source_weights( - src_weights, global_tree, distributed_geo_data.local_data, comm=comm) + queue, src_weights, global_tree, distributed_geo_data.local_data, comm=comm) # }}} @@ -919,7 +929,7 @@ def drive_dfmm(root_wrangler, src_weights, distributed_geo_data, # {{{ Communicate 
mpoles - from boxtree.distributed import communicate_mpoles + from boxtree.distributed.calculation import communicate_mpoles if _communicate_mpoles_via_allreduce: mpole_exps_all = np.zeros_like(mpole_exps) diff --git a/test/distributed/test_layer_pot_identity.py b/test/distributed/test_layer_pot_identity.py index 6cbc59ed..248fdcce 100644 --- a/test/distributed/test_layer_pot_identity.py +++ b/test/distributed/test_layer_pot_identity.py @@ -15,6 +15,10 @@ total_rank = comm.Get_size() # prevent cache 'splosion clear_cache() +# Setup PyOpenCL +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + if current_rank == 0: class GreenExpr(object): @@ -54,9 +58,6 @@ if current_rank == 0: def get_mesh(self, resolution, tgt_order): return get_sphere_mesh(resolution, tgt_order) - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - expr = GreenExpr() geometry = SphereGeometry() @@ -167,9 +168,9 @@ if current_rank == 0: else: while True: lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue) from pytential.qbx.distributed import drive_dfmm wrangler = None weights = None - drive_dfmm(wrangler, weights, distribute_geo_data, comm=comm) + drive_dfmm(queue, wrangler, weights, distribute_geo_data, comm=comm) diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index 78565a58..a4149c9f 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -84,9 +84,9 @@ if current_rank == 0: # master rank else: # helper rank lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue) from pytential.qbx.distributed import drive_dfmm wrangler = None weights = None - drive_dfmm(wrangler, weights, distribute_geo_data, 
comm=comm) + drive_dfmm(queue, wrangler, weights, distribute_geo_data, comm=comm) -- GitLab From 5c41c1d9a17ee046f35ed9cb1a550012c6b64ef1 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 6 Jul 2018 06:02:43 -0500 Subject: [PATCH 24/86] Use new local_data API --- pytential/qbx/distributed.py | 55 ++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 563c58ce..a01e9b09 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -67,21 +67,26 @@ class QBXDistributedFMMLibExpansionWrangler( if distributed_wrangler.dipole_vec: if current_rank == 0: - reqs_dipole_vec = np.empty((total_rank,), dtype=object) + reqs_dipole_vec = [] local_dipole_vec = np.empty((total_rank,), dtype=object) + for irank in range(total_rank): - src_mask = \ - distributed_geo_data.local_data[irank]["src_mask"].get() - local_dipole_vec[irank] = \ - wrangler.dipole_vec[:, src_mask.astype(bool)] - reqs_dipole_vec[irank] = comm.isend( - local_dipole_vec[irank], - dest=irank, - tag=MPITags["dipole_vec"] - ) - for irank in range(1, total_rank): - reqs_dipole_vec[irank].wait() + src_idx = distributed_geo_data.local_data[irank].src_idx + + local_dipole_vec[irank] = wrangler.dipole_vec[:, src_idx] + + if irank != 0: + reqs_dipole_vec.append( + comm.isend( + local_dipole_vec[irank], + dest=irank, + tag=MPITags["dipole_vec"] + ) + ) + + MPI.Request.Waitall(reqs_dipole_vec) + distributed_wrangler.dipole_vec = local_dipole_vec[0] else: distributed_wrangler.dipole_vec = comm.recv( @@ -352,8 +357,15 @@ class DistributedGeoData(object): for irank in range(total_rank): particle_mask = cl.array.zeros(queue, (nfiltered_targets,), dtype=tree.particle_id_dtype) + + responsible_boxes_mask = np.zeros((tree.nboxes,), dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list[irank]] = 1 + responsible_boxes_mask = cl.array.to_device( + queue, responsible_boxes_mask + ) + 
knls.particle_mask_knl( - self.local_data[irank]["tgt_box_mask"], + responsible_boxes_mask, box_target_starts, box_target_counts_nonchild, particle_mask @@ -374,7 +386,7 @@ class DistributedGeoData(object): local_box_target_counts_nonchild = cl.array.zeros( queue, (tree.nboxes,), dtype=tree.particle_id_dtype) knls.generate_box_particle_counts_nonchild( - self.local_data[irank]["tgt_box_mask"], + responsible_boxes_mask, box_target_counts_nonchild, local_box_target_counts_nonchild ) @@ -440,7 +452,10 @@ class DistributedGeoData(object): self.qbx_target_mask = np.empty((total_rank,), dtype=object) for irank in range(total_rank): - tgt_mask = self.local_data[irank]["tgt_mask"].get().astype(bool) + + tgt_mask = np.zeros((tree.ntargets,), dtype=bool) + tgt_mask[self.local_data[irank].tgt_idx] = True + tgt_mask_user_order = tgt_mask[tree.sorted_target_ids] centers_mask = tgt_mask_user_order[:ncenters] centers_scan = np.empty( @@ -892,20 +907,12 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, # {{{ Distribute source weights if current_rank == 0: - global_tree = root_wrangler.geo_data.tree() src_weights = root_wrangler.reorder_sources(src_weights) - else: - global_tree = None from boxtree.distributed.calculation import distribute_source_weights - if current_rank == 0: - queue = cl.CommandQueue(root_wrangler.geo_data.geo_data.cl_context) - else: - queue = None - local_source_weights = distribute_source_weights( - queue, src_weights, global_tree, distributed_geo_data.local_data, comm=comm) + src_weights, distributed_geo_data.local_data, comm=comm) # }}} -- GitLab From 2a23bb2c6b7e1b5fe95ed8561c929482ec8e6fb7 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 13 Jul 2018 08:38:21 -0500 Subject: [PATCH 25/86] Correction for new timing API --- pytential/qbx/distributed.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index a01e9b09..14404ea0 
100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -9,6 +9,7 @@ import pyopencl as cl from pyopencl.algorithm import ListOfListsBuilder import logging import time +from boxtree.tools import return_timing_data logger = logging.getLogger(__name__) @@ -101,6 +102,7 @@ class QBXDistributedFMMLibExpansionWrangler( return distributed_wrangler + @return_timing_data def eval_qbx_expansions(self, qbx_expansions): geo_data = self.geo_data ctt = geo_data.center_to_tree_targets() @@ -921,7 +923,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, mpole_exps = wrangler.form_multipoles( local_traversal.level_start_source_box_nrs, local_traversal.source_boxes, - local_source_weights) + local_source_weights)[0] # }}} @@ -953,7 +955,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, local_traversal.target_boxes, local_traversal.neighbor_source_boxes_starts, local_traversal.neighbor_source_boxes_lists, - local_source_weights) + local_source_weights)[0] # }}} @@ -964,7 +966,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, local_traversal.target_or_target_parent_boxes, local_traversal.from_sep_siblings_starts, local_traversal.from_sep_siblings_lists, - mpole_exps) + mpole_exps)[0] # }}} @@ -976,7 +978,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( local_traversal.target_boxes_sep_smaller_by_source_level, local_traversal.from_sep_smaller_by_level, - mpole_exps) + mpole_exps)[0] # assert that list 3 close has been merged into list 1 # assert global_traversal.from_sep_close_smaller_starts is None @@ -985,7 +987,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, local_traversal.target_boxes, local_traversal.from_sep_close_smaller_starts, local_traversal.from_sep_close_smaller_lists, - local_source_weights) + local_source_weights)[0] # }}} @@ -996,14 +998,14 @@ 
def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, local_traversal.target_or_target_parent_boxes, local_traversal.from_sep_bigger_starts, local_traversal.from_sep_bigger_lists, - local_source_weights) + local_source_weights)[0] if local_traversal.from_sep_close_bigger_starts is not None: non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( local_traversal.target_or_target_parent_boxes, local_traversal.from_sep_close_bigger_starts, local_traversal.from_sep_close_bigger_lists, - local_source_weights) + local_source_weights)[0] # }}} @@ -1021,21 +1023,21 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( local_traversal.level_start_target_box_nrs, local_traversal.target_boxes, - local_exps) + local_exps)[0] # }}} # {{{ wrangle qbx expansions - qbx_expansions = wrangler.form_global_qbx_locals(local_source_weights) + qbx_expansions = wrangler.form_global_qbx_locals(local_source_weights)[0] qbx_expansions = qbx_expansions + \ - wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) + wrangler.translate_box_multipoles_to_qbx_local(mpole_exps)[0] qbx_expansions = qbx_expansions + \ - wrangler.translate_box_local_to_qbx_local(local_exps) + wrangler.translate_box_local_to_qbx_local(local_exps)[0] - qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions) + qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions)[0] # }}} -- GitLab From 4069b2caf0702b2936eba166e0e85f42b53001ac Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 14 Jul 2018 09:09:18 -0500 Subject: [PATCH 26/86] Use Waitall instead of loop in geo_data distribution --- pytential/qbx/distributed.py | 63 +++++++++++++----------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 14404ea0..13c9af38 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -445,12 +445,7 @@ 
class DistributedGeoData(object): local_center_to_tree_targets = np.empty((total_rank,), dtype=object) local_qbx_targets = np.empty((total_rank,), dtype=object) - reqs_centers = np.empty((total_rank,), dtype=object) - reqs_global_qbx_centers = np.empty((total_rank,), dtype=object) - reqs_expansion_radii = np.empty((total_rank,), dtype=object) - reqs_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) - reqs_center_to_tree_targets = np.empty((total_rank,), dtype=object) - reqs_qbx_targets = np.empty((total_rank,), dtype=object) + reqs = [] self.qbx_target_mask = np.empty((total_rank,), dtype=object) for irank in range(total_rank): @@ -476,11 +471,11 @@ class DistributedGeoData(object): local_centers[irank][idims][:] = centers[idims][centers_mask] if irank != 0: - reqs_centers[irank] = comm.isend( + reqs.append(comm.isend( local_centers[irank], dest=irank, tag=MPITags["centers"] - ) + )) # }}} @@ -490,11 +485,11 @@ class DistributedGeoData(object): global_qbx_centers[centers_mask[global_qbx_centers]]] if irank != 0: - reqs_global_qbx_centers[irank] = comm.isend( + reqs.append(comm.isend( local_global_qbx_centers[irank], dest=irank, tag=MPITags["global_qbx_centers"] - ) + )) # }}} @@ -502,11 +497,11 @@ class DistributedGeoData(object): local_expansion_radii[irank] = expansion_radii[centers_mask] if irank != 0: - reqs_expansion_radii[irank] = comm.isend( + reqs.append(comm.isend( local_expansion_radii[irank], dest=irank, tag=MPITags["expansion_radii"] - ) + )) # }}} @@ -520,11 +515,11 @@ class DistributedGeoData(object): local_qbx_center_to_target_box[irank] = \ traversal.target_boxes[qbx_center_to_target_box[centers_mask]] if irank != 0: - reqs_qbx_center_to_target_box[irank] = comm.isend( + reqs.append(comm.isend( local_qbx_center_to_target_box[irank], dest=irank, tag=MPITags["qbx_center_to_target_box"] - ) + )) # }}} @@ -572,46 +567,34 @@ class DistributedGeoData(object): for idim in range(tree.dimensions): local_qbx_targets[irank][idim, :] = \ 
tree.targets[idim][qbx_target_mask] - reqs_qbx_targets[irank] = comm.isend( - local_qbx_targets[irank], - dest=irank, - tag=MPITags["qbx_targets"] - ) + if irank != 0: + reqs.append(comm.isend( + local_qbx_targets[irank], + dest=irank, + tag=MPITags["qbx_targets"] + )) local_lists = qbx_target_scan[local_lists] local_center_to_tree_targets[irank] = { "starts": local_starts, "lists": local_lists } - reqs_center_to_tree_targets[irank] = comm.isend( - local_center_to_tree_targets[irank], - dest=irank, - tag=MPITags["center_to_tree_targets"]) + if irank != 0: + reqs.append(comm.isend( + local_center_to_tree_targets[irank], + dest=irank, + tag=MPITags["center_to_tree_targets"] + )) # }}} - for irank in range(1, total_rank): - reqs_centers[irank].wait() - local_centers = local_centers[0] + MPI.Request.Waitall(reqs) - for irank in range(1, total_rank): - reqs_global_qbx_centers[irank].wait() + local_centers = local_centers[0] local_global_qbx_centers = local_global_qbx_centers[0] - - for irank in range(1, total_rank): - reqs_expansion_radii[irank].wait() local_expansion_radii = local_expansion_radii[0] - - for irank in range(1, total_rank): - reqs_qbx_center_to_target_box[irank].wait() local_qbx_center_to_target_box = local_qbx_center_to_target_box[0] - - for irank in range(1, total_rank): - reqs_center_to_tree_targets[irank].wait() local_center_to_tree_targets = local_center_to_tree_targets[0] - - for irank in range(1, total_rank): - reqs_qbx_targets[irank].wait() local_qbx_targets = local_qbx_targets[0] logger.info("Distribute geometry data in {} secs.".format( -- GitLab From 2e956dd0615c41a7f7bbf185fa83c88bb2df4ba2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 17 Jul 2018 06:32:41 -0500 Subject: [PATCH 27/86] Remove extra dependencies, set no_targets flag --- pytential/qbx/distributed.py | 124 +---------------------------------- 1 file changed, 2 insertions(+), 122 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 
13c9af38..195b6a07 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -6,7 +6,6 @@ from boxtree.distributed.partition import ResponsibleBoxesQuery from mpi4py import MPI import numpy as np import pyopencl as cl -from pyopencl.algorithm import ListOfListsBuilder import logging import time from boxtree.tools import return_timing_data @@ -141,124 +140,6 @@ class QBXDistributedFMMLibExpansionWrangler( # }}} -class QBXResponsibleBoxQuery(ResponsibleBoxesQuery): - - def __init__(self, queue, traversal, global_geo_data): - super(QBXResponsibleBoxQuery, self).__init__(queue, traversal) - - with cl.CommandQueue(global_geo_data.cl_context) as geo_data_queue: - self.user_target_to_center = \ - global_geo_data.user_target_to_center().get(queue=geo_data_queue) - self.qbx_center_to_target_box = \ - global_geo_data.qbx_center_to_target_box().get(queue=geo_data_queue) - - self.box_target_starts_dev = cl.array.to_device( - queue, traversal.tree.box_target_starts) - - self.box_target_counts_nonchild_dev = cl.array.to_device( - queue, traversal.tree.box_target_counts_nonchild) - - # helper kernel for constructing a list of particles in given boxes - from mako.template import Template - from pyopencl.tools import dtype_to_ctype - - self.box_to_target_knl = ListOfListsBuilder( - queue.context, - [("particle_list", self.tree.particle_id_dtype)], - Template(""" - void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) - { - typedef ${particle_id_t} particle_id_t; - typedef ${box_id_t} box_id_t; - - box_id_t ibox = box_list[i]; - particle_id_t start = box_particle_start[ibox]; - particle_id_t end = start + box_particle_counts_nonchild[ibox]; - - for(particle_id_t iparticle = start; iparticle < end; iparticle++) - { - APPEND_particle_list(iparticle); - } - } - """).render( - particle_id_t=dtype_to_ctype(self.tree.particle_id_dtype), - box_id_t=dtype_to_ctype(self.tree.box_id_dtype) - ), - arg_decls=Template(""" - ${particle_id_t} *box_particle_start, - 
${particle_id_t} *box_particle_counts_nonchild, - ${box_id_t} *box_list - """).render( - particle_id_t=dtype_to_ctype(self.tree.particle_id_dtype), - box_id_t=dtype_to_ctype(self.tree.box_id_dtype) - ) - ) - - def center_boxes_mask(self, responsible_boxes_list): - lists, _ = self.box_to_target_knl( - self.queue, - responsible_boxes_list.shape[0], - self.box_target_starts_dev.data, - self.box_target_counts_nonchild_dev.data, - cl.array.to_device(self.queue, responsible_boxes_list).data - ) - - targets_list = lists["particle_list"] - targets_list = targets_list.lists.get() - - tree_order_to_user_order = np.ones( - (self.tree.ntargets,), dtype=self.tree.particle_id_dtype) * -1 - tree_order_to_user_order[self.tree.sorted_target_ids] = np.arange( - self.tree.ntargets) - - targets_list = tree_order_to_user_order[targets_list] - - centers = self.user_target_to_center[targets_list] - - from pytential.qbx.geometry import target_state - centers = centers[centers != target_state.NO_QBX_NEEDED] - - center_target_boxes = self.qbx_center_to_target_box[centers] - - global_center_boxes = self.traversal.target_boxes[center_target_boxes] - - center_boxes_mask = np.zeros((self.tree.nboxes,), dtype=np.int8) - center_boxes_mask[global_center_boxes] = 1 - - center_boxes_mask = cl.array.to_device(self.queue, center_boxes_mask) - - return center_boxes_mask - - def get_boxes_mask(self, responsible_boxes_list): - responsible_boxes_mask = np.zeros((self.tree.nboxes,), dtype=np.int8) - responsible_boxes_mask[responsible_boxes_list] = 1 - responsible_boxes_mask = cl.array.to_device( - self.queue, responsible_boxes_mask) - - ancestor_boxes_mask = self.ancestor_boxes_mask(responsible_boxes_mask) - - centers_boxes_mask = self.center_boxes_mask(responsible_boxes_list) - - responsible_and_centers_boxes_mask = ( - responsible_boxes_mask | centers_boxes_mask) - - ancestor_responsible_and_centers_boxes_mask = self.ancestor_boxes_mask( - responsible_and_centers_boxes_mask) - - src_boxes_mask = 
self.src_boxes_mask( - responsible_and_centers_boxes_mask, - ancestor_responsible_and_centers_boxes_mask - ) - - multipole_boxes_mask = self.multipole_boxes_mask( - responsible_and_centers_boxes_mask, - ancestor_responsible_and_centers_boxes_mask, - ) - - return (responsible_boxes_mask, ancestor_boxes_mask, src_boxes_mask, - multipole_boxes_mask) - - # {{{ Distributed GeoData class DistributedGeoData(object): @@ -321,15 +202,14 @@ class DistributedGeoData(object): responsible_boxes_list = None if current_rank == 0: - responsible_box_query = QBXResponsibleBoxQuery( - queue, traversal, geo_data.geo_data) + responsible_box_query = ResponsibleBoxesQuery(queue, traversal) else: responsible_box_query = None from boxtree.distributed.local_tree import generate_local_tree self.local_tree, self.local_data, self.box_bounding_box = \ generate_local_tree(queue, traversal, responsible_boxes_list, - responsible_box_query) + responsible_box_query, no_targets=True) from boxtree.distributed.local_traversal import generate_local_travs self.local_trav = generate_local_travs( -- GitLab From 87b82211eda2fedfef104e504440ca0a11b603bc Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 1 Aug 2018 08:53:28 -0500 Subject: [PATCH 28/86] Use boxtree perf model --- pytential/qbx/__init__.py | 6 ++- pytential/qbx/distributed.py | 53 ++++++++++++--------- test/distributed/test_layer_pot_identity.py | 4 +- test/distributed/test_off_surface_eval.py | 4 +- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index fded57b3..a8af1085 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -764,11 +764,13 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX if self.fmm_backend == 'distributed': - distributed_geo_data = self.distibuted_geo_data(geo_data, queue) + distributed_geo_data = self.distibuted_geo_data( + geo_data, queue, wrangler + ) from pytential.qbx.distributed import drive_dfmm 
all_potentials_on_every_tgt = drive_dfmm( - queue, wrangler, strengths, distributed_geo_data, comm=self.comm) + queue, strengths, distributed_geo_data, comm=self.comm) else: from pytential.qbx.fmm import drive_fmm all_potentials_on_every_tgt = drive_fmm(wrangler, strengths) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 195b6a07..5da94b76 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -3,6 +3,7 @@ from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.distributed.partition import ResponsibleBoxesQuery +from boxtree.distributed.perf_model import PerformanceCounter, PerformanceModel from mpi4py import MPI import numpy as np import pyopencl as cl @@ -143,11 +144,13 @@ class QBXDistributedFMMLibExpansionWrangler( # {{{ Distributed GeoData class DistributedGeoData(object): - def __init__(self, geo_data, queue, comm=MPI.COMM_WORLD): + def __init__(self, geo_data, queue, global_wrangler, comm=MPI.COMM_WORLD): self.comm = comm current_rank = comm.Get_rank() total_rank = comm.Get_size() + self.global_wrangler = global_wrangler + if geo_data is not None: # master process traversal = geo_data.traversal() tree = traversal.tree @@ -191,12 +194,17 @@ class DistributedGeoData(object): if current_rank == 0: from boxtree.distributed.partition import partition_work - from boxtree.distributed import WorkloadWeight - workload_weight = WorkloadWeight( - direct=1, m2l=1, m2p=1, p2l=1, multipole=5 - ) + + counter = PerformanceCounter(traversal, global_wrangler, True) + # FIXME: If the expansion wrangler is not FMMLib, the argument + # 'uses_pde_expansions' might be different + + model = PerformanceModel(queue.context, None, True, None) + + model.load_default_model() + responsible_boxes_list = partition_work( - traversal, comm.Get_size(), workload_weight + model, 
counter, traversal, comm.Get_size() ) else: responsible_boxes_list = None @@ -711,7 +719,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): return obj - def distibuted_geo_data(self, geo_data, queue): + def distibuted_geo_data(self, geo_data, queue, wrangler): """ Note: This method needs to be called collectively by all processes of self.comm """ @@ -740,11 +748,12 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): from pytential.qbx.fmmlib import ToHostTransferredGeoDataWrapper host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) - distributed_geo_data = DistributedGeoData(host_geo_data, queue, - self.comm) + distributed_geo_data = DistributedGeoData( + host_geo_data, queue, wrangler, self.comm + ) else: - distributed_geo_data = DistributedGeoData(None, queue, self.comm) + distributed_geo_data = DistributedGeoData(None, queue, None, self.comm) self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data @@ -753,18 +762,18 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # {{{ FMM Driver -def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, - comm=MPI.COMM_WORLD, +def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, _communicate_mpoles_via_allreduce=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() + global_wrangler = distributed_geo_data.global_wrangler if current_rank == 0: start_time = time.time() distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( - queue, root_wrangler, distributed_geo_data) + queue, global_wrangler, distributed_geo_data) wrangler = distributed_wrangler local_traversal = distributed_geo_data.local_trav @@ -772,7 +781,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, # {{{ Distribute source weights if current_rank == 0: - src_weights = root_wrangler.reorder_sources(src_weights) + src_weights = global_wrangler.reorder_sources(src_weights) from 
boxtree.distributed.calculation import distribute_source_weights @@ -910,14 +919,14 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, else: # master process - all_potentials_in_tree_order = root_wrangler.full_output_zeros() + all_potentials_in_tree_order = global_wrangler.full_output_zeros() - nqbtl = root_wrangler.geo_data.non_qbx_box_target_lists() + nqbtl = global_wrangler.geo_data.non_qbx_box_target_lists() from pytools.obj_array import make_obj_array non_qbx_potentials_all_rank = make_obj_array([ - np.zeros(nqbtl.nfiltered_targets, root_wrangler.dtype) - for k in root_wrangler.outputs] + np.zeros(nqbtl.nfiltered_targets, global_wrangler.dtype) + for k in global_wrangler.outputs] ) for irank in range(total_rank): @@ -928,7 +937,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, non_qbx_potentials_cur_rank = comm.recv( source=irank, tag=MPITags["non_qbx_potentials"]) - for idim in range(len(root_wrangler.outputs)): + for idim in range(len(global_wrangler.outputs)): non_qbx_potentials_all_rank[idim][ distributed_geo_data.particle_mask[irank] ] = non_qbx_potentials_cur_rank[idim] @@ -946,7 +955,7 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, source=irank, tag=MPITags["qbx_potentials"] ) - for idim in range(len(root_wrangler.outputs)): + for idim in range(len(global_wrangler.outputs)): all_potentials_in_tree_order[idim][ distributed_geo_data.qbx_target_mask[irank] ] = qbx_potentials_cur_rank[idim] @@ -954,8 +963,8 @@ def drive_dfmm(queue, root_wrangler, src_weights, distributed_geo_data, def reorder_and_finalize_potentials(x): # "finalize" gives host FMMs (like FMMlib) a chance to turn the # potential back into a CL array. 
- return root_wrangler.finalize_potentials( - x[root_wrangler.tree.sorted_target_ids]) + return global_wrangler.finalize_potentials( + x[global_wrangler.tree.sorted_target_ids]) from pytools.obj_array import with_object_array_or_scalar result = with_object_array_or_scalar( diff --git a/test/distributed/test_layer_pot_identity.py b/test/distributed/test_layer_pot_identity.py index 248fdcce..f0ca6e8a 100644 --- a/test/distributed/test_layer_pot_identity.py +++ b/test/distributed/test_layer_pot_identity.py @@ -168,9 +168,9 @@ if current_rank == 0: else: while True: lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None) from pytential.qbx.distributed import drive_dfmm wrangler = None weights = None - drive_dfmm(queue, wrangler, weights, distribute_geo_data, comm=comm) + drive_dfmm(queue, weights, distribute_geo_data, comm=comm) diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index a4149c9f..85029b2c 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -84,9 +84,9 @@ if current_rank == 0: # master rank else: # helper rank lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None) from pytential.qbx.distributed import drive_dfmm wrangler = None weights = None - drive_dfmm(queue, wrangler, weights, distribute_geo_data, comm=comm) + drive_dfmm(queue, weights, distribute_geo_data, comm=comm) -- GitLab From 2714fdf2dff32189dfdd2e7d213367bbc5a1a602 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 1 Aug 2018 11:45:30 -0500 Subject: [PATCH 29/86] QBX perf model parameters --- pytential/qbx/distributed.py | 5 ++-- pytential/qbx/perf_model.py | 45 ++++++++++++++++++++++++++++++++++++ 2 
files changed, 48 insertions(+), 2 deletions(-) create mode 100644 pytential/qbx/perf_model.py diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 5da94b76..a968b5b1 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -3,7 +3,6 @@ from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.distributed.partition import ResponsibleBoxesQuery -from boxtree.distributed.perf_model import PerformanceCounter, PerformanceModel from mpi4py import MPI import numpy as np import pyopencl as cl @@ -195,10 +194,12 @@ class DistributedGeoData(object): if current_rank == 0: from boxtree.distributed.partition import partition_work - counter = PerformanceCounter(traversal, global_wrangler, True) + from pytential.qbx.perf_model import QBXPerformanceCounter + counter = QBXPerformanceCounter(traversal, global_wrangler, True) # FIXME: If the expansion wrangler is not FMMLib, the argument # 'uses_pde_expansions' might be different + from boxtree.distributed.perf_model import PerformanceModel model = PerformanceModel(queue.context, None, True, None) model.load_default_model() diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py new file mode 100644 index 00000000..14612378 --- /dev/null +++ b/pytential/qbx/perf_model.py @@ -0,0 +1,45 @@ +from boxtree.distributed.perf_model import PerformanceCounter +from collections import namedtuple + +QBXParameters = namedtuple( + "QBXParameters", + ['ncoeffs_fmm_by_level', + 'ncoeffs_qbx', + 'translation_source_power', + 'translation_target_power', + 'translation_max_power'] +) + + +class QBXPerformanceCounter(PerformanceCounter): + + def __init__(self, traversal, wrangler, uses_pde_expansions): + self.traversal = traversal + self.wrangler = wrangler + self.uses_pde_expansions = uses_pde_expansions + + self.parameters = 
self.get_qbx_parameters( + traversal.tree.dimensions, + uses_pde_expansions, + wrangler.level_nterms, + wrangler.qbx_order + ) + + @staticmethod + def get_qbx_parameters(dimensions, use_pde_expansions, level_nterms, qbx_order): + fmm_parameters = PerformanceCounter.get_fmm_parameters( + dimensions, use_pde_expansions, level_nterms + ) + + if use_pde_expansions: + ncoeffs_qbx = qbx_order ** (dimensions - 1) + else: + ncoeffs_qbx = qbx_order ** dimensions + + return QBXParameters( + ncoeffs_fmm_by_level=fmm_parameters.ncoeffs_fmm_by_level, + ncoeffs_qbx=ncoeffs_qbx, + translation_source_power=fmm_parameters.translation_source_power, + translation_target_power=fmm_parameters.translation_target_power, + translation_max_power=fmm_parameters.translation_max_power, + ) -- GitLab From 5db05921fb8389f5c5f9f7f1d317b73e1a6bdd83 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 1 Aug 2018 13:35:33 -0500 Subject: [PATCH 30/86] Overwrite list 1 counting --- pytential/qbx/distributed.py | 2 +- pytential/qbx/perf_model.py | 62 ++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index a968b5b1..3c0e908e 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -195,7 +195,7 @@ class DistributedGeoData(object): from boxtree.distributed.partition import partition_work from pytential.qbx.perf_model import QBXPerformanceCounter - counter = QBXPerformanceCounter(traversal, global_wrangler, True) + counter = QBXPerformanceCounter(geo_data, global_wrangler, True) # FIXME: If the expansion wrangler is not FMMLib, the argument # 'uses_pde_expansions' might be different diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 14612378..a8ca250a 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -1,3 +1,4 @@ +import numpy as np from boxtree.distributed.perf_model import PerformanceCounter from collections import 
namedtuple @@ -13,13 +14,14 @@ QBXParameters = namedtuple( class QBXPerformanceCounter(PerformanceCounter): - def __init__(self, traversal, wrangler, uses_pde_expansions): - self.traversal = traversal + def __init__(self, geo_data, wrangler, uses_pde_expansions): + self.geo_data = geo_data + self.traversal = geo_data.traversal() self.wrangler = wrangler self.uses_pde_expansions = uses_pde_expansions self.parameters = self.get_qbx_parameters( - traversal.tree.dimensions, + self.traversal.tree.dimensions, uses_pde_expansions, wrangler.level_nterms, wrangler.qbx_order @@ -43,3 +45,57 @@ class QBXPerformanceCounter(PerformanceCounter): translation_target_power=fmm_parameters.translation_target_power, translation_max_power=fmm_parameters.translation_max_power, ) + + def count_direct(self, use_global_idx=False): + """ + This method overwrites the one in parent class because the only non-qbx + targets should be counted. + + :return: If *use_global_idx* is True, return a numpy array of shape + (tree.nboxes,) such that the ith entry represents the workload from + direct evaluation on box i. If *use_global_idx* is False, return a numpy + array of shape (ntarget_boxes,) such that the ith entry represents the + workload on *target_boxes* i. 
+ """ + box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ + .box_target_counts_nonchild + traversal = self.traversal + tree = traversal.tree + + if use_global_idx: + direct_workload = np.zeros((tree.nboxes,), dtype=np.intp) + else: + ntarget_boxes = len(traversal.target_boxes) + direct_workload = np.zeros((ntarget_boxes,), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): + ntargets = box_target_counts_nonchild[tgt_ibox] + nsources = 0 + + start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) + + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) + + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + count = nsources * ntargets + + if use_global_idx: + direct_workload[tgt_ibox] = count + else: + direct_workload[itgt_box] = count + + return direct_workload -- GitLab From a796934e34d5565faeae3a99c7b3ac1040746374 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 2 Aug 2018 16:22:20 -0500 Subject: [PATCH 31/86] Add p2qbxl to counter --- pytential/qbx/perf_model.py | 89 ++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index a8ca250a..ffa3a8af 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -1,6 +1,7 @@ import numpy as np -from boxtree.distributed.perf_model import PerformanceCounter +from boxtree.distributed.perf_model 
import PerformanceCounter, PerformanceModel from collections import namedtuple +import pyopencl as cl QBXParameters = namedtuple( "QBXParameters", @@ -99,3 +100,89 @@ class QBXPerformanceCounter(PerformanceCounter): direct_workload[itgt_box] = count return direct_workload + + def count_p2qbxl(self, use_global_idx=False): + geo_data = self.geo_data + traversal = self.traversal + tree = traversal.tree + + if use_global_idx: + np2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) + else: + ntarget_boxes = len(traversal.target_boxes) + np2qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) + + for tgt_icenter in geo_data.global_qbx_centers: + itgt_box = geo_data.qbx_center_to_target_box[tgt_icenter] + + np2qbxl_srcs = 0 + + # list 1 + start, end = traversal.neighbor_source_boxes_starts[ + itgt_box:itgt_box + 2] + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] + + # list 3 close + if traversal.from_sep_close_smaller_starts is not None: + start, end = traversal.from_sep_close_smaller_starts[ + itgt_box:itgt_box + 2] + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] + + # list 4 close + if traversal.from_sep_close_bigger_starts is not None: + start, end = traversal.from_sep_close_bigger_starts[ + itgt_box:itgt_box + 2] + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] + + workload = np2qbxl_srcs * self.parameters.ncoeffs_qbx + + if use_global_idx: + np2qbxl[traversal.target_boxes[itgt_box]] += workload + else: + np2qbxl[itgt_box] += workload + + return np2qbxl + + +class QBXPerformanceModel(PerformanceModel): + + def time_performance(self, geo_data): + traversal = geo_data.traversal() + + wrangler = self.wrangler_factory(tree=traversal.tree) + + counter = QBXPerformanceCounter( + geo_data, wrangler, self.uses_pde_expansions + ) + + # Record useful 
metadata for assembling performance data + nm2p, nm2p_boxes = counter.count_m2p() + + timing_data = { + "nterms_fmm_total": counter.count_nters_fmm_total(), + "direct_workload": np.sum(counter.count_direct()), + "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], + "m2l_workload": np.sum(counter.count_m2l()), + "m2p_workload": np.sum(nm2p), + "m2p_nboxes": np.sum(nm2p_boxes), + "p2l_workload": np.sum(counter.count_p2l()), + "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), + "eval_part_workload": np.sum(counter.count_eval_part()), + "p2qbxl_workload": np.sum(counter.count_p2qbxl()) + } + + # Generate random source weights + with cl.CommandQueue(self.cl_context) as queue: + source_weights = self.rng.uniform( + queue, + traversal.tree.nsources, + traversal.tree.coord_dtype + ).get() + + # Time a FMM run + self.drive_fmm(traversal, wrangler, source_weights, timing_data=timing_data) + + self.time_result.append(timing_data) -- GitLab From cf21f90cd89f91da33b1ee9b5be3a2aec9498e4a Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 4 Aug 2018 16:53:37 -0500 Subject: [PATCH 32/86] Use new perf_model API from boxtree --- pytential/qbx/distributed.py | 22 ++++++++++++++-------- pytential/qbx/perf_model.py | 6 ++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 3c0e908e..fafa822c 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -192,20 +192,26 @@ class DistributedGeoData(object): # }}} if current_rank == 0: - from boxtree.distributed.partition import partition_work - - from pytential.qbx.perf_model import QBXPerformanceCounter - counter = QBXPerformanceCounter(geo_data, global_wrangler, True) + def wrangler_factory(wrangler_tree): + import copy + new_wrangler = copy.copy(global_wrangler) + new_wrangler.tree = wrangler_tree + return new_wrangler + + from pytential.qbx.perf_model import QBXPerformanceModel + model = QBXPerformanceModel( + 
queue.context, wrangler_factory, True, None + ) # FIXME: If the expansion wrangler is not FMMLib, the argument # 'uses_pde_expansions' might be different - from boxtree.distributed.perf_model import PerformanceModel - model = PerformanceModel(queue.context, None, True, None) - model.load_default_model() + boxes_time = model.predict_boxes_time(geo_data) + + from boxtree.distributed.partition import partition_work responsible_boxes_list = partition_work( - model, counter, traversal, comm.Get_size() + boxes_time, traversal, comm.Get_size() ) else: responsible_boxes_list = None diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index ffa3a8af..9fef5ffe 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -186,3 +186,9 @@ class QBXPerformanceModel(PerformanceModel): self.drive_fmm(traversal, wrangler, source_weights, timing_data=timing_data) self.time_result.append(timing_data) + + def predict_boxes_time(self, geo_data): + # TODO: Overwrite boxes time to incoporate QBX time. 
+ return super(QBXPerformanceModel, self).predict_boxes_time( + geo_data.traversal() + ) -- GitLab From b7a144829cdc5ea4d18fa58a0dca404d8d26be1e Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 6 Aug 2018 20:05:44 -0500 Subject: [PATCH 33/86] Add p2qbxl model --- pytential/qbx/perf_model.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 9fef5ffe..e010ba4b 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -132,6 +132,7 @@ class QBXPerformanceCounter(PerformanceCounter): # list 4 close if traversal.from_sep_close_bigger_starts is not None: + # POSSIBLY USE INTERFACE WRONGLY start, end = traversal.from_sep_close_bigger_starts[ itgt_box:itgt_box + 2] for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: @@ -187,8 +188,29 @@ class QBXPerformanceModel(PerformanceModel): self.time_result.append(timing_data) + def form_global_qbx_locals_model(self, wall_time=True): + return self.linear_regression( + "form_global_qbx_locals", ["p2qbxl_workload"], + wall_time=wall_time + ) + def predict_boxes_time(self, geo_data): # TODO: Overwrite boxes time to incoporate QBX time. 
- return super(QBXPerformanceModel, self).predict_boxes_time( + boxes_time = super(QBXPerformanceModel, self).predict_boxes_time( geo_data.traversal() ) + + wrangler = self.wrangler_factory(geo_data.tree()) + counter = QBXPerformanceCounter(geo_data, wrangler, self.uses_pde_expansions) + + # {{{ form_global_qbx_locals time + + param = self.form_global_qbx_locals_model() + + p2qbxl_workload = counter.count_p2qbxl() + + boxes_time += (p2qbxl_workload * param[0] + param[1]) + + # }}} + + return boxes_time -- GitLab From b1ebe71a127fa8692574efe22a4b0478a06d43c3 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 8 Aug 2018 10:53:21 -0500 Subject: [PATCH 34/86] Performance counter inside drive_fmm --- pytential/qbx/__init__.py | 7 +++-- pytential/qbx/distributed.py | 12 ++------ pytential/qbx/fmm.py | 39 +++++++++++++++++++++++- pytential/qbx/perf_model.py | 59 +++++++++++++----------------------- 4 files changed, 65 insertions(+), 52 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index a1828678..0b072880 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -661,7 +661,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): value = evaluate(expr) return with_object_array_or_scalar(oversample_nonscalars, value) - return func(queue, insn, bound_expr, evaluate_wrapper) + return func(queue, insn, bound_expr, evaluate_wrapper, + timing_data=evaluate.timing_data) @property @memoize_method @@ -783,7 +784,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute fmm - def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate): + def exec_compute_potential_insn_fmm(self, queue, insn, bound_expr, evaluate, + timing_data={}): target_name_and_side_to_number, target_discrs_and_qbx_sides = ( self.get_target_discrs_and_qbx_sides(insn, bound_expr)) @@ -843,7 +845,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): queue, strengths, distributed_geo_data, comm=self.comm) else: from 
pytential.qbx.fmm import drive_fmm - timing_data = {} all_potentials_on_every_tgt = drive_fmm(wrangler, strengths, timing_data) # }}} diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index fafa822c..66afcc0e 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -192,22 +192,14 @@ class DistributedGeoData(object): # }}} if current_rank == 0: - def wrangler_factory(wrangler_tree): - import copy - new_wrangler = copy.copy(global_wrangler) - new_wrangler.tree = wrangler_tree - return new_wrangler - from pytential.qbx.perf_model import QBXPerformanceModel - model = QBXPerformanceModel( - queue.context, wrangler_factory, True, None - ) + model = QBXPerformanceModel(queue.context, True) # FIXME: If the expansion wrangler is not FMMLib, the argument # 'uses_pde_expansions' might be different model.load_default_model() - boxes_time = model.predict_boxes_time(geo_data) + boxes_time = model.predict_boxes_time(geo_data, global_wrangler) from boxtree.distributed.partition import partition_work responsible_boxes_list = partition_work( diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index b1ce86cc..994b1a62 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -374,6 +374,20 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), # {{{ FMM top-level +def add_dicts(dict1, dict2): + rtv = {} + + for key in set(dict1) | set(dict2): + if key not in dict1: + rtv[key] = dict2[key] + elif key not in dict2: + rtv[key] = dict1[key] + else: + rtv[key] = dict1[key] + dict2[key] + + return rtv + + def drive_fmm(expansion_wrangler, src_weights, timing_data=None): """Top-level driver routine for the QBX fast multipole calculation. 
@@ -396,6 +410,29 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): tree = traversal.tree recorder = TimingRecorder() + if timing_data is not None and 'WITH_COUNTER' in timing_data: + from pytential.qbx.perf_model import QBXPerformanceCounter + counter = QBXPerformanceCounter( + geo_data, wrangler, timing_data['USES_PDE_EXPRESSIONS'] + ) + + nm2p, nm2p_boxes = counter.count_m2p() + + timing_data.update(add_dicts(timing_data, { + "nterms_fmm_total": counter.count_nters_fmm_total(), + "direct_workload": np.sum(counter.count_direct()), + "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], + "m2l_workload": np.sum(counter.count_m2l()), + "m2p_workload": np.sum(nm2p), + "m2p_nboxes": np.sum(nm2p_boxes), + "p2l_workload": np.sum(counter.count_p2l()), + "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), + "eval_part_workload": np.sum(counter.count_eval_part()), + "p2qbxl_workload": np.sum(counter.count_p2qbxl()) + })) + + # CAUTION: Using add_dicts limits the + # Interface guidelines: Attributes of the tree are assumed to be known # to the expansion wrangler and should not be passed. 
@@ -562,7 +599,7 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): fmm_proc.done() if timing_data is not None: - timing_data.update(recorder.summarize()) + timing_data.update(add_dicts(timing_data, recorder.summarize())) return result diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index e010ba4b..eb7f476d 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -1,7 +1,6 @@ import numpy as np from boxtree.distributed.perf_model import PerformanceCounter, PerformanceModel from collections import namedtuple -import pyopencl as cl QBXParameters = namedtuple( "QBXParameters", @@ -105,6 +104,7 @@ class QBXPerformanceCounter(PerformanceCounter): geo_data = self.geo_data traversal = self.traversal tree = traversal.tree + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() if use_global_idx: np2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) @@ -112,8 +112,8 @@ class QBXPerformanceCounter(PerformanceCounter): ntarget_boxes = len(traversal.target_boxes) np2qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) - for tgt_icenter in geo_data.global_qbx_centers: - itgt_box = geo_data.qbx_center_to_target_box[tgt_icenter] + for tgt_icenter in geo_data.global_qbx_centers(): + itgt_box = qbx_center_to_target_box[tgt_icenter] np2qbxl_srcs = 0 @@ -150,41 +150,21 @@ class QBXPerformanceCounter(PerformanceCounter): class QBXPerformanceModel(PerformanceModel): - def time_performance(self, geo_data): - traversal = geo_data.traversal() - - wrangler = self.wrangler_factory(tree=traversal.tree) - - counter = QBXPerformanceCounter( - geo_data, wrangler, self.uses_pde_expansions + def __init__(self, cl_context, uses_pde_expansions): + super(QBXPerformanceModel, self).__init__( + cl_context, uses_pde_expansions ) - # Record useful metadata for assembling performance data - nm2p, nm2p_boxes = counter.count_m2p() - + def time_qbx_performance(self, queue, bound_op, context): timing_data = { - "nterms_fmm_total": 
counter.count_nters_fmm_total(), - "direct_workload": np.sum(counter.count_direct()), - "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], - "m2l_workload": np.sum(counter.count_m2l()), - "m2p_workload": np.sum(nm2p), - "m2p_nboxes": np.sum(nm2p_boxes), - "p2l_workload": np.sum(counter.count_p2l()), - "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), - "eval_part_workload": np.sum(counter.count_eval_part()), - "p2qbxl_workload": np.sum(counter.count_p2qbxl()) + 'WITH_COUNTER': True, + 'USES_PDE_EXPRESSIONS': self.uses_pde_expansions } - # Generate random source weights - with cl.CommandQueue(self.cl_context) as queue: - source_weights = self.rng.uniform( - queue, - traversal.tree.nsources, - traversal.tree.coord_dtype - ).get() + bound_op.eval(queue, context=context, timing_data=timing_data) - # Time a FMM run - self.drive_fmm(traversal, wrangler, source_weights, timing_data=timing_data) + timing_data.pop('WITH_COUNTER') + timing_data.pop('USES_PDE_EXPRESSIONS') self.time_result.append(timing_data) @@ -194,23 +174,26 @@ class QBXPerformanceModel(PerformanceModel): wall_time=wall_time ) - def predict_boxes_time(self, geo_data): + def predict_boxes_time(self, geo_data, wrangler): # TODO: Overwrite boxes time to incoporate QBX time. 
boxes_time = super(QBXPerformanceModel, self).predict_boxes_time( - geo_data.traversal() + geo_data.traversal(), wrangler ) - wrangler = self.wrangler_factory(geo_data.tree()) counter = QBXPerformanceCounter(geo_data, wrangler, self.uses_pde_expansions) # {{{ form_global_qbx_locals time param = self.form_global_qbx_locals_model() - p2qbxl_workload = counter.count_p2qbxl() + p2qbxl_workload = counter.count_p2qbxl(use_global_idx=True) boxes_time += (p2qbxl_workload * param[0] + param[1]) - # }}} - return boxes_time + + def load_default_model(self): + import os + current_dir = os.path.dirname(os.path.abspath(__file__)) + default_perf_file_path = os.path.join(current_dir, 'default_perf_model.json') + self.loadjson(default_perf_file_path) -- GitLab From e008d4d263b399cc56d7384e522fd46579e77494 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 8 Aug 2018 12:21:37 -0500 Subject: [PATCH 35/86] Add m2qbxl to counter --- pytential/qbx/perf_model.py | 49 +++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index eb7f476d..e4e4b4d7 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -147,6 +147,53 @@ class QBXPerformanceCounter(PerformanceCounter): return np2qbxl + def count_m2qbxl(self, use_global_idx=False): + geo_data = self.geo_data + traversal = self.traversal + tree = traversal.tree + global_qbx_centers = geo_data.global_qbx_centers() + qbx_center_to_target_box_source_level = \ + geo_data.qbx_center_to_target_box_source_level() + + if use_global_idx: + nm2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) + else: + ntarget_boxes = len(traversal.target_boxes) + nm2qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) + + for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): + + target_boxes_sep_smaller_current_level = \ + traversal.target_boxes_sep_smaller_by_source_level[isrc_level] + + cost_coefficient = self.xlat_cost( + 
self.wrangler.level_nterms[isrc_level], + self.wrangler.qbx_order, + self.parameters + ) + + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): + icontaining_tgt_box = qbx_center_to_target_box_source_level[ + isrc_level][tgt_icenter] + + if icontaining_tgt_box == -1: + continue + + start, stop = ( + ssn.starts[icontaining_tgt_box], + ssn.starts[icontaining_tgt_box+1]) + + cost = (stop - start) * cost_coefficient + + if use_global_idx: + global_boxes_idx = \ + target_boxes_sep_smaller_current_level[icontaining_tgt_box] + nm2qbxl[global_boxes_idx] += cost + else: + target_boxes_idx = ssn.nonempty_indices[icontaining_tgt_box] + nm2qbxl[target_boxes_idx] += cost + + return nm2qbxl class QBXPerformanceModel(PerformanceModel): @@ -190,6 +237,8 @@ class QBXPerformanceModel(PerformanceModel): boxes_time += (p2qbxl_workload * param[0] + param[1]) + # }}} + return boxes_time def load_default_model(self): -- GitLab From faf5498258e23d68143f7c44f6beb946872f32ca Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 8 Aug 2018 17:14:59 -0500 Subject: [PATCH 36/86] Add counter for l2qbxl and eval_qbxl --- pytential/qbx/perf_model.py | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index e4e4b4d7..21e7eea8 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -195,6 +195,67 @@ class QBXPerformanceCounter(PerformanceCounter): return nm2qbxl + def count_l2qbxl(self, use_global_idx=False): + geo_data = self.geo_data + traversal = self.traversal + tree = traversal.tree + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + global_qbx_centers = geo_data.global_qbx_centers() + + if use_global_idx: + nl2qbxl = np.zeros(tree.nboxes, dtype=np.intp) + else: + ntarget_boxes = len(traversal.target_boxes) + nl2qbxl = np.zeros(ntarget_boxes, dtype=np.intp) + + for src_icenter in global_qbx_centers: + target_box_idx = qbx_center_to_target_box[src_icenter] 
+ global_box_idx = traversal.target_boxes[target_box_idx] + + box_level = tree.box_levels[global_box_idx] + + cost = self.xlat_cost( + self.wrangler.level_nterms[box_level], + self.wrangler.qbx_order, + self.parameters + ) + + if use_global_idx: + nl2qbxl[global_box_idx] += cost + else: + nl2qbxl[target_box_idx] += cost + + return nl2qbxl + + def count_eval_qbxl(self, use_global_idx=False): + geo_data = self.geo_data + traversal = self.traversal + tree = traversal.tree + qbx_center_to_target_box = geo_data.qbx_center_to_target_box() + global_qbx_centers = geo_data.global_qbx_centers() + center_to_targets_starts = geo_data.center_to_tree_targets().starts + + if use_global_idx: + neval_qbxl = np.zeros((tree.nboxes,), dtype=np.intp) + else: + ntarget_boxes = len(traversal.target_boxes) + neval_qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) + + for src_icenter in global_qbx_centers: + start, end = center_to_targets_starts[src_icenter:src_icenter+2] + cost = (end - start) * self.parameters.ncoeffs_qbx + + target_box_idx = qbx_center_to_target_box[src_icenter] + + if use_global_idx: + global_box_idx = traversal.target_boxes[target_box_idx] + neval_qbxl[global_box_idx] += cost + else: + neval_qbxl[target_box_idx] += cost + + return neval_qbxl + + class QBXPerformanceModel(PerformanceModel): def __init__(self, cl_context, uses_pde_expansions): -- GitLab From 114ad2f2c956fe64db5c6de0ddf875541aad7e1f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 9 Aug 2018 14:54:26 -0500 Subject: [PATCH 37/86] Add expansion wrangler inspector --- pytential/qbx/__init__.py | 10 ++++++++++ pytential/qbx/distributed.py | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 0b072880..fe50acd5 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -84,6 +84,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul=None, _tree_kind="adaptive", 
geometry_data_inspector=None, + expansion_wrangler_inspector=None, performance_model=None, fmm_backend="sumpy", target_stick_out_factor=_not_provided): @@ -204,6 +205,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_min_nsources_cumul self._tree_kind = _tree_kind self.geometry_data_inspector = geometry_data_inspector + self.expansion_wrangler_inspector = expansion_wrangler_inspector self.performance_model = performance_model # /!\ *All* parameters set here must also be set by copy() below, @@ -227,6 +229,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _from_sep_smaller_crit=None, _tree_kind=None, geometry_data_inspector=None, + expansion_wrangler_inspector=None, performance_model=_not_provided, fmm_backend=None, @@ -311,6 +314,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _tree_kind=_tree_kind or self._tree_kind, geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), + expansion_wrangler_inspector=( + expansion_wrangler_inspector or self.expansion_wrangler_inspector + ), performance_model=( # None is a valid value here performance_model @@ -831,6 +837,10 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): if not perform_fmm: return [(o.name, 0) for o in insn.outputs], [] + if self.expansion_wrangler_inspector is not None: + rtv = self.expansion_wrangler_inspector(wrangler) + return rtv + # }}} # {{{ execute global QBX diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 66afcc0e..8990bdee 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -624,6 +624,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=None, _tree_kind="adaptive", geometry_data_inspector=None, + expansion_wrangler_inspector=None, target_stick_out_factor=_not_provided): self.comm = comm @@ -657,6 +658,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): 
_from_sep_smaller_min_nsources_cumul=0, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, + expansion_wrangler_inspector=expansion_wrangler_inspector, fmm_backend='distributed', target_stick_out_factor=target_stick_out_factor ) @@ -677,6 +679,8 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=None, _tree_kind=None, geometry_data_inspector=None, + expansion_wrangler_inspector=None, + performance_model=_not_provided, fmm_backend=None, debug=_not_provided, @@ -699,6 +703,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=_from_sep_smaller_crit, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, + expansion_wrangler_inspector=expansion_wrangler_inspector, fmm_backend=fmm_backend, debug=debug, -- GitLab From 781cc6971f84566c93769b543a03279ddec87b8f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 9 Aug 2018 16:24:45 -0500 Subject: [PATCH 38/86] Use expansion wrangler inspector to implement counter --- pytential/qbx/__init__.py | 9 +++++++-- pytential/qbx/fmm.py | 23 ----------------------- pytential/qbx/perf_model.py | 35 ++++++++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index fe50acd5..6399da93 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -838,8 +838,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): return [(o.name, 0) for o in insn.outputs], [] if self.expansion_wrangler_inspector is not None: - rtv = self.expansion_wrangler_inspector(wrangler) - return rtv + self.expansion_wrangler_inspector(wrangler) # }}} @@ -1041,6 +1040,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # }}} + def bind_expansion_wrangler_inspector(self, inspector): + if self.expansion_wrangler_inspector is not None: + raise NotImplementedError("Cannot bind multiple inspectors.") + + 
self.expansion_wrangler_inspector = inspector + # }}} # }}} diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 994b1a62..862ff74b 100644 --- a/pytential/qbx/fmm.py +++ b/pytential/qbx/fmm.py @@ -410,29 +410,6 @@ def drive_fmm(expansion_wrangler, src_weights, timing_data=None): tree = traversal.tree recorder = TimingRecorder() - if timing_data is not None and 'WITH_COUNTER' in timing_data: - from pytential.qbx.perf_model import QBXPerformanceCounter - counter = QBXPerformanceCounter( - geo_data, wrangler, timing_data['USES_PDE_EXPRESSIONS'] - ) - - nm2p, nm2p_boxes = counter.count_m2p() - - timing_data.update(add_dicts(timing_data, { - "nterms_fmm_total": counter.count_nters_fmm_total(), - "direct_workload": np.sum(counter.count_direct()), - "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], - "m2l_workload": np.sum(counter.count_m2l()), - "m2p_workload": np.sum(nm2p), - "m2p_nboxes": np.sum(nm2p_boxes), - "p2l_workload": np.sum(counter.count_p2l()), - "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), - "eval_part_workload": np.sum(counter.count_eval_part()), - "p2qbxl_workload": np.sum(counter.count_p2qbxl()) - })) - - # CAUTION: Using add_dicts limits the - # Interface guidelines: Attributes of the tree are assumed to be known # to the expansion wrangler and should not be passed. 
diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 21e7eea8..696452b6 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -264,15 +264,36 @@ class QBXPerformanceModel(PerformanceModel): ) def time_qbx_performance(self, queue, bound_op, context): - timing_data = { - 'WITH_COUNTER': True, - 'USES_PDE_EXPRESSIONS': self.uses_pde_expansions - } + timing_data = {} - bound_op.eval(queue, context=context, timing_data=timing_data) + def expansion_wrangler_inspector(wrangler): + counter = QBXPerformanceCounter( + wrangler.geo_data, wrangler, self.uses_pde_expansions + ) + traversal = wrangler.geo_data.traversal() + + nm2p, nm2p_boxes = counter.count_m2p() + + from pytential.qbx.fmm import add_dicts + timing_data.update(add_dicts(timing_data, { + "nterms_fmm_total": counter.count_nters_fmm_total(), + "direct_workload": np.sum(counter.count_direct()), + "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], + "m2l_workload": np.sum(counter.count_m2l()), + "m2p_workload": np.sum(nm2p), + "m2p_nboxes": np.sum(nm2p_boxes), + "p2l_workload": np.sum(counter.count_p2l()), + "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), + "eval_part_workload": np.sum(counter.count_eval_part()), + "p2qbxl_workload": np.sum(counter.count_p2qbxl()) + })) + + from pytential.symbolic.primitives import DEFAULT_SOURCE + bound_op.places[DEFAULT_SOURCE].bind_expansion_wrangler_inspector( + expansion_wrangler_inspector + ) - timing_data.pop('WITH_COUNTER') - timing_data.pop('USES_PDE_EXPRESSIONS') + bound_op.eval(queue, context=context, timing_data=timing_data) self.time_result.append(timing_data) -- GitLab From 8b7ba9e26851859348446370825c6e1eacd1f12d Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 10 Aug 2018 00:03:52 -0500 Subject: [PATCH 39/86] Add evaluation to perf model --- pytential/qbx/perf_model.py | 63 ++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git 
a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 696452b6..87d89edb 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -263,6 +263,9 @@ class QBXPerformanceModel(PerformanceModel): cl_context, uses_pde_expansions ) + def time_performance(self, traversal, wrangler): + raise NotImplementedError("Please use time_qbx_performance instead.") + def time_qbx_performance(self, queue, bound_op, context): timing_data = {} @@ -304,7 +307,6 @@ class QBXPerformanceModel(PerformanceModel): ) def predict_boxes_time(self, geo_data, wrangler): - # TODO: Overwrite boxes time to incoporate QBX time. boxes_time = super(QBXPerformanceModel, self).predict_boxes_time( geo_data.traversal(), wrangler ) @@ -321,8 +323,67 @@ class QBXPerformanceModel(PerformanceModel): # }}} + # TODO: Overwrite boxes time to incoporate QBX time. + return boxes_time + def predict_step_time(self, eval_counter, wall_time=True): + predict_timing = super(QBXPerformanceModel, self).predict_step_time( + eval_counter, wall_time=wall_time + ) + + # {{{ Predict form_global_qbx_locals time + + param = self.form_global_qbx_locals_model(wall_time=wall_time) + + p2qbxl_workload = np.sum(eval_counter.count_p2qbxl()) + + predict_timing["form_global_qbx_locals"] = ( + p2qbxl_workload * param[0] + param[1] + ) + + # }}} + + # TODO: implement pytential specific fields + + return predict_timing + + def evaluate_model(self, queue, bound_op, context, wall_time=True): + predict_timing = {} + + def expansion_wrangler_inspector(wrangler): + eval_counter = QBXPerformanceCounter( + wrangler.geo_data, wrangler, self.uses_pde_expansions + ) + + from pytential.qbx.fmm import add_dicts + predict_timing.update(add_dicts( + predict_timing, + self.predict_step_time(eval_counter, wall_time=wall_time) + )) + + from pytential.symbolic.primitives import DEFAULT_SOURCE + bound_op.places[DEFAULT_SOURCE].bind_expansion_wrangler_inspector( + expansion_wrangler_inspector + ) + + actual_timing = {} + 
bound_op.eval(queue, context=context, timing_data=actual_timing) + + for field in ["eval_direct", "multipole_to_local", "eval_multipoles", + "form_locals", "eval_locals", "form_global_qbx_locals"]: + predict_time_field = predict_timing[field] + + if wall_time: + true_time_field = actual_timing[field].wall_elapsed + else: + true_time_field = actual_timing[field].process_elapsed + + diff = abs(predict_time_field - true_time_field) + + print(field + ": predict " + str(predict_time_field) + " actual " + + str(true_time_field) + " error " + str(diff / true_time_field)) + def load_default_model(self): import os current_dir = os.path.dirname(os.path.abspath(__file__)) -- GitLab From 97da5e51222724c05cbcd185786ca0b548d2119e Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 10 Aug 2018 11:12:15 -0500 Subject: [PATCH 40/86] Add m2qbxl, l2qbxl and eval_qbx_expansions to perf model --- pytential/qbx/default_perf_model.json | 1 + pytential/qbx/perf_model.py | 103 ++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 pytential/qbx/default_perf_model.json diff --git a/pytential/qbx/default_perf_model.json b/pytential/qbx/default_perf_model.json new file mode 100644 index 00000000..ee6878bb --- /dev/null +++ b/pytential/qbx/default_perf_model.json @@ -0,0 +1 @@ +[{"p2l_workload": 0, "m2p_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "eval_part_workload": 392000, "m2l_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "eval_qbxl_workload": 15120, "p2l_nboxes": 0, "nterms_fmm_total": 448000, "eval_direct": {"wall_elapsed": 0.00014457499992204248, "process_elapsed": 0.00014556099999829542}, "form_global_qbx_locals": {"wall_elapsed": 0.4051341179999781, "process_elapsed": 3.0964739649999986}, "multipole_to_local": {"wall_elapsed": 0.001291328999968755, "process_elapsed": 0.0009987380000016088}, "eval_qbx_expansions": {"wall_elapsed": 0.026797234000241588, 
"process_elapsed": 0.02683310200000033}, "eval_locals": {"wall_elapsed": 0.00017090500023186905, "process_elapsed": 0.0001721510000001203}, "coarsen_multipoles": {"wall_elapsed": 2.864499992938363e-05, "process_elapsed": 2.9139000001343618e-05}, "refine_locals": {"wall_elapsed": 0.0011670649998904992, "process_elapsed": 0.0008489720000000034}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06030093000003944, "process_elapsed": 0.08477208899999855}, "form_locals": {"wall_elapsed": 0.00027019299977837363, "process_elapsed": 0.00027169400000115473}, "eval_multipoles": {"wall_elapsed": 0.0001437860000805813, "process_elapsed": 0.00014519499999909868}, "form_multipoles": {"wall_elapsed": 0.021561385000268274, "process_elapsed": 0.015583602000000418}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.017583687000296777, "process_elapsed": 0.08821777799999975}}, {"p2l_workload": 0, "m2p_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "eval_part_workload": 392000, "m2l_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "eval_qbxl_workload": 15120, "p2l_nboxes": 0, "nterms_fmm_total": 448000, "eval_direct": {"wall_elapsed": 0.0001385880000270845, "process_elapsed": 0.00013979000000219344}, "form_global_qbx_locals": {"wall_elapsed": 0.41444351899963294, "process_elapsed": 3.116072316999997}, "multipole_to_local": {"wall_elapsed": 0.0007494009998936235, "process_elapsed": 0.0007517839999948706}, "eval_qbx_expansions": {"wall_elapsed": 0.02921361199992134, "process_elapsed": 0.032211528999997796}, "eval_locals": {"wall_elapsed": 0.00016483900026287301, "process_elapsed": 0.00016606899999871416}, "coarsen_multipoles": {"wall_elapsed": 2.9819999781466322e-05, "process_elapsed": 3.041800000147532e-05}, "refine_locals": {"wall_elapsed": 0.0007442299997819646, "process_elapsed": 0.0007460429999994744}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06153733599990119, 
"process_elapsed": 0.07276670300000276}, "form_locals": {"wall_elapsed": 0.00023785300004419696, "process_elapsed": 0.00023920199999949432}, "eval_multipoles": {"wall_elapsed": 0.00012937799988321785, "process_elapsed": 0.0001306139999996958}, "form_multipoles": {"wall_elapsed": 0.012569276000022, "process_elapsed": 0.012571251999997202}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.014739867000116647, "process_elapsed": 0.06327850900000342}}, {"p2l_workload": 0, "m2p_workload": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "l2qbxl_workload": 2700000, "direct_workload": 0, "eval_part_workload": 630000, "m2l_workload": 1632000, "m2qbxl_workload": 0, "p2qbxl_workload": 90281520, "eval_qbxl_workload": 24300, "p2l_nboxes": 0, "nterms_fmm_total": 772000, "eval_direct": {"wall_elapsed": 0.00044451099984144093, "process_elapsed": 0.00044582699999651254}, "form_global_qbx_locals": {"wall_elapsed": 1.1347037200000614, "process_elapsed": 8.175864781000001}, "multipole_to_local": {"wall_elapsed": 0.013594094999916706, "process_elapsed": 0.031522225000003345}, "eval_qbx_expansions": {"wall_elapsed": 0.04319645300006414, "process_elapsed": 0.04320027200000354}, "eval_locals": {"wall_elapsed": 0.0004639830001451628, "process_elapsed": 0.0004652279999994846}, "coarsen_multipoles": {"wall_elapsed": 3.193899988218618e-05, "process_elapsed": 3.224599999995803e-05}, "refine_locals": {"wall_elapsed": 0.00491668499989828, "process_elapsed": 0.006445430000002972}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.11420032300020466, "process_elapsed": 0.13396674499999506}, "form_locals": {"wall_elapsed": 0.0013731700000789715, "process_elapsed": 0.010520605000003513}, "eval_multipoles": {"wall_elapsed": 0.0002223300002697215, "process_elapsed": 0.0018475250000022925}, "form_multipoles": {"wall_elapsed": 0.02159770300022501, "process_elapsed": 0.021787137000000456}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.036156407000135005, "process_elapsed": 
0.10665832599999447}}, {"p2l_workload": 17414400, "m2p_workload": 22632000, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, "l2qbxl_workload": 10800000, "direct_workload": 0, "eval_part_workload": 2520000, "m2l_workload": 47616000, "m2qbxl_workload": 148080000, "p2qbxl_workload": 493642242, "eval_qbxl_workload": 97200, "p2l_nboxes": 5760, "nterms_fmm_total": 3088000, "eval_direct": {"wall_elapsed": 0.0014398020000498946, "process_elapsed": 0.001441014999997492}, "form_global_qbx_locals": {"wall_elapsed": 5.621687159999965, "process_elapsed": 43.778723058000004}, "multipole_to_local": {"wall_elapsed": 0.232288194000148, "process_elapsed": 1.521805771000004}, "eval_qbx_expansions": {"wall_elapsed": 0.17695011699993302, "process_elapsed": 0.1766893120000077}, "eval_locals": {"wall_elapsed": 0.0015130900001167902, "process_elapsed": 0.0015144890000016176}, "coarsen_multipoles": {"wall_elapsed": 0.01364043499984291, "process_elapsed": 0.013621080999996593}, "refine_locals": {"wall_elapsed": 0.01894895100008398, "process_elapsed": 0.018914254000002018}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.4205340979999619, "process_elapsed": 0.42609530299999676}, "form_locals": {"wall_elapsed": 0.5216265349999958, "process_elapsed": 0.527060671000001}, "eval_multipoles": {"wall_elapsed": 0.0004696480000347947, "process_elapsed": 0.013424528999998131}, "form_multipoles": {"wall_elapsed": 0.08589172200004214, "process_elapsed": 0.08777697000000018}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.36152846300024066, "process_elapsed": 1.7963144780000064}}, {"p2l_workload": 27091200, "m2p_workload": 33528000, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_workload": 0, "eval_part_workload": 3696000, "m2l_workload": 47616000, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "eval_qbxl_workload": 142560, "p2l_nboxes": 5760, "nterms_fmm_total": 4720000, "eval_direct": {"wall_elapsed": 0.0016364240000257269, 
"process_elapsed": 0.0016372789999934412}, "form_global_qbx_locals": {"wall_elapsed": 12.650557410000147, "process_elapsed": 98.856955851}, "multipole_to_local": {"wall_elapsed": 0.216005924999763, "process_elapsed": 1.4686899130000057}, "eval_qbx_expansions": {"wall_elapsed": 0.2587835169999835, "process_elapsed": 0.25878203500001007}, "eval_locals": {"wall_elapsed": 0.0016412659999787138, "process_elapsed": 0.0016427320000076406}, "coarsen_multipoles": {"wall_elapsed": 0.013596718999906443, "process_elapsed": 0.01359863899999425}, "refine_locals": {"wall_elapsed": 0.018542439000157174, "process_elapsed": 0.018544194000000402}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5977090630001385, "process_elapsed": 0.5977074240000206}, "form_locals": {"wall_elapsed": 0.7721112069998526, "process_elapsed": 0.7886337529999992}, "eval_multipoles": {"wall_elapsed": 0.0007427669997923658, "process_elapsed": 0.017427264000005493}, "form_multipoles": {"wall_elapsed": 0.12716900300006273, "process_elapsed": 0.12717103100000315}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5377625770001941, "process_elapsed": 2.7641417660000087}}, {"p2l_workload": 27091200, "m2p_workload": 33528000, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_workload": 0, "eval_part_workload": 3696000, "m2l_workload": 47616000, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "eval_qbxl_workload": 142560, "p2l_nboxes": 5760, "nterms_fmm_total": 4720000, "eval_direct": {"wall_elapsed": 0.0017083899999761343, "process_elapsed": 0.0017096619999676932}, "form_global_qbx_locals": {"wall_elapsed": 12.650411099000166, "process_elapsed": 98.909811163}, "multipole_to_local": {"wall_elapsed": 0.24223890199982634, "process_elapsed": 1.509391834000013}, "eval_qbx_expansions": {"wall_elapsed": 0.25370455300003414, "process_elapsed": 0.2537074840000173}, "eval_locals": {"wall_elapsed": 0.001636830000052214, "process_elapsed": 
0.0016382329999942158}, "coarsen_multipoles": {"wall_elapsed": 0.015331328000002031, "process_elapsed": 0.015333470000001626}, "refine_locals": {"wall_elapsed": 0.018799158999854626, "process_elapsed": 0.01880120900003135}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5966280350000943, "process_elapsed": 0.5966279570000097}, "form_locals": {"wall_elapsed": 0.7709777210002358, "process_elapsed": 0.7852971849999903}, "eval_multipoles": {"wall_elapsed": 0.0009081359999072447, "process_elapsed": 0.003684275000011894}, "form_multipoles": {"wall_elapsed": 0.12745193399996424, "process_elapsed": 0.12750258299999473}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5393657320000784, "process_elapsed": 2.756274291999972}}] \ No newline at end of file diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 87d89edb..4e7deeb6 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -152,8 +152,6 @@ class QBXPerformanceCounter(PerformanceCounter): traversal = self.traversal tree = traversal.tree global_qbx_centers = geo_data.global_qbx_centers() - qbx_center_to_target_box_source_level = \ - geo_data.qbx_center_to_target_box_source_level() if use_global_idx: nm2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) @@ -172,9 +170,13 @@ class QBXPerformanceCounter(PerformanceCounter): self.parameters ) + qbx_center_to_target_box_current_level = \ + geo_data.qbx_center_to_target_box_source_level(isrc_level) + for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - icontaining_tgt_box = qbx_center_to_target_box_source_level[ - isrc_level][tgt_icenter] + icontaining_tgt_box = qbx_center_to_target_box_current_level[ + tgt_icenter + ] if icontaining_tgt_box == -1: continue @@ -288,7 +290,10 @@ class QBXPerformanceModel(PerformanceModel): "p2l_workload": np.sum(counter.count_p2l()), "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), "eval_part_workload": np.sum(counter.count_eval_part()), - "p2qbxl_workload": 
np.sum(counter.count_p2qbxl()) + "p2qbxl_workload": np.sum(counter.count_p2qbxl()), + "m2qbxl_workload": np.sum(counter.count_m2qbxl()), + "l2qbxl_workload": np.sum(counter.count_l2qbxl()), + "eval_qbxl_workload": np.sum(counter.count_eval_qbxl()) })) from pytential.symbolic.primitives import DEFAULT_SOURCE @@ -306,6 +311,24 @@ class QBXPerformanceModel(PerformanceModel): wall_time=wall_time ) + def translate_box_multipoles_to_qbx_local_model(self, wall_time=True): + return self.linear_regression( + "translate_box_multipoles_to_qbx_local", ["m2qbxl_workload"], + wall_time=wall_time + ) + + def translate_box_local_to_qbx_local_model(self, wall_time=True): + return self.linear_regression( + "translate_box_local_to_qbx_local", ["l2qbxl_workload"], + wall_time=wall_time + ) + + def eval_qbx_expansions_model(self, wall_time=True): + return self.linear_regression( + "eval_qbx_expansions", ["eval_qbxl_workload"], + wall_time=wall_time + ) + def predict_boxes_time(self, geo_data, wrangler): boxes_time = super(QBXPerformanceModel, self).predict_boxes_time( geo_data.traversal(), wrangler @@ -323,7 +346,35 @@ class QBXPerformanceModel(PerformanceModel): # }}} - # TODO: Overwrite boxes time to incoporate QBX time. 
+ # {{{ translate_box_multipoles_to_qbx_local time + + param = self.translate_box_multipoles_to_qbx_local_model() + + m2qbxl_workload = counter.count_m2qbxl(use_global_idx=True) + + boxes_time += (m2qbxl_workload * param[0] + param[1]) + + # }}} + + # {{{ translate_box_local_to_qbx_local time + + param = self.translate_box_local_to_qbx_local_model() + + l2qbxl_workload = counter.count_l2qbxl(use_global_idx=True) + + boxes_time += (l2qbxl_workload * param[0] + param[1]) + + # }}} + + # {{{ eval_qbx_expansions time + + param = self.eval_qbx_expansions_model() + + eval_qbxl_workload = counter.count_eval_qbxl(use_global_idx=True) + + boxes_time += (eval_qbxl_workload * param[0] + param[1]) + + # }}} return boxes_time @@ -344,7 +395,41 @@ class QBXPerformanceModel(PerformanceModel): # }}} - # TODO: implement pytential specific fields + # {{{ Predict translate_box_multipoles_to_qbx_local time + + param = self.translate_box_multipoles_to_qbx_local_model(wall_time=wall_time) + + m2qbxl_workload = np.sum(eval_counter.count_m2qbxl()) + + predict_timing["translate_box_multipoles_to_qbx_local"] = ( + m2qbxl_workload * param[0] + param[1] + ) + + # }}} + + # {{{ Predict translate_box_local_to_qbx_local time + + param = self.translate_box_local_to_qbx_local_model(wall_time=wall_time) + + l2qbxl_workload = np.sum(eval_counter.count_l2qbxl()) + + predict_timing["translate_box_local_to_qbx_local"] = ( + l2qbxl_workload * param[0] + param[1] + ) + + # }}} + + # {{{ Predict eval_qbx_expansions time + + param = self.eval_qbx_expansions_model(wall_time=wall_time) + + eval_qbxl_workload = np.sum(eval_counter.count_eval_qbxl()) + + predict_timing["eval_qbx_expansions"] = ( + eval_qbxl_workload * param[0] + param[1] + ) + + # }}} return predict_timing @@ -371,7 +456,9 @@ class QBXPerformanceModel(PerformanceModel): bound_op.eval(queue, context=context, timing_data=actual_timing) for field in ["eval_direct", "multipole_to_local", "eval_multipoles", - "form_locals", "eval_locals", 
"form_global_qbx_locals"]: + "form_locals", "eval_locals", "form_global_qbx_locals", + "translate_box_multipoles_to_qbx_local", + "translate_box_local_to_qbx_local", "eval_qbx_expansions"]: predict_time_field = predict_timing[field] if wall_time: -- GitLab From d2cdc9bd40afaccb81b5afce07b88ce7699f1ff1 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sat, 11 Aug 2018 15:10:22 -0500 Subject: [PATCH 41/86] Fix overcounting in p2p, m2p and eval_part --- pytential/qbx/default_perf_model.json | 2 +- pytential/qbx/perf_model.py | 74 ++++++++++++--------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/pytential/qbx/default_perf_model.json b/pytential/qbx/default_perf_model.json index ee6878bb..01df2bff 100644 --- a/pytential/qbx/default_perf_model.json +++ b/pytential/qbx/default_perf_model.json @@ -1 +1 @@ -[{"p2l_workload": 0, "m2p_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "eval_part_workload": 392000, "m2l_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "eval_qbxl_workload": 15120, "p2l_nboxes": 0, "nterms_fmm_total": 448000, "eval_direct": {"wall_elapsed": 0.00014457499992204248, "process_elapsed": 0.00014556099999829542}, "form_global_qbx_locals": {"wall_elapsed": 0.4051341179999781, "process_elapsed": 3.0964739649999986}, "multipole_to_local": {"wall_elapsed": 0.001291328999968755, "process_elapsed": 0.0009987380000016088}, "eval_qbx_expansions": {"wall_elapsed": 0.026797234000241588, "process_elapsed": 0.02683310200000033}, "eval_locals": {"wall_elapsed": 0.00017090500023186905, "process_elapsed": 0.0001721510000001203}, "coarsen_multipoles": {"wall_elapsed": 2.864499992938363e-05, "process_elapsed": 2.9139000001343618e-05}, "refine_locals": {"wall_elapsed": 0.0011670649998904992, "process_elapsed": 0.0008489720000000034}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06030093000003944, "process_elapsed": 0.08477208899999855}, "form_locals": 
{"wall_elapsed": 0.00027019299977837363, "process_elapsed": 0.00027169400000115473}, "eval_multipoles": {"wall_elapsed": 0.0001437860000805813, "process_elapsed": 0.00014519499999909868}, "form_multipoles": {"wall_elapsed": 0.021561385000268274, "process_elapsed": 0.015583602000000418}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.017583687000296777, "process_elapsed": 0.08821777799999975}}, {"p2l_workload": 0, "m2p_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "eval_part_workload": 392000, "m2l_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "eval_qbxl_workload": 15120, "p2l_nboxes": 0, "nterms_fmm_total": 448000, "eval_direct": {"wall_elapsed": 0.0001385880000270845, "process_elapsed": 0.00013979000000219344}, "form_global_qbx_locals": {"wall_elapsed": 0.41444351899963294, "process_elapsed": 3.116072316999997}, "multipole_to_local": {"wall_elapsed": 0.0007494009998936235, "process_elapsed": 0.0007517839999948706}, "eval_qbx_expansions": {"wall_elapsed": 0.02921361199992134, "process_elapsed": 0.032211528999997796}, "eval_locals": {"wall_elapsed": 0.00016483900026287301, "process_elapsed": 0.00016606899999871416}, "coarsen_multipoles": {"wall_elapsed": 2.9819999781466322e-05, "process_elapsed": 3.041800000147532e-05}, "refine_locals": {"wall_elapsed": 0.0007442299997819646, "process_elapsed": 0.0007460429999994744}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06153733599990119, "process_elapsed": 0.07276670300000276}, "form_locals": {"wall_elapsed": 0.00023785300004419696, "process_elapsed": 0.00023920199999949432}, "eval_multipoles": {"wall_elapsed": 0.00012937799988321785, "process_elapsed": 0.0001306139999996958}, "form_multipoles": {"wall_elapsed": 0.012569276000022, "process_elapsed": 0.012571251999997202}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.014739867000116647, "process_elapsed": 0.06327850900000342}}, {"p2l_workload": 0, 
"m2p_workload": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "l2qbxl_workload": 2700000, "direct_workload": 0, "eval_part_workload": 630000, "m2l_workload": 1632000, "m2qbxl_workload": 0, "p2qbxl_workload": 90281520, "eval_qbxl_workload": 24300, "p2l_nboxes": 0, "nterms_fmm_total": 772000, "eval_direct": {"wall_elapsed": 0.00044451099984144093, "process_elapsed": 0.00044582699999651254}, "form_global_qbx_locals": {"wall_elapsed": 1.1347037200000614, "process_elapsed": 8.175864781000001}, "multipole_to_local": {"wall_elapsed": 0.013594094999916706, "process_elapsed": 0.031522225000003345}, "eval_qbx_expansions": {"wall_elapsed": 0.04319645300006414, "process_elapsed": 0.04320027200000354}, "eval_locals": {"wall_elapsed": 0.0004639830001451628, "process_elapsed": 0.0004652279999994846}, "coarsen_multipoles": {"wall_elapsed": 3.193899988218618e-05, "process_elapsed": 3.224599999995803e-05}, "refine_locals": {"wall_elapsed": 0.00491668499989828, "process_elapsed": 0.006445430000002972}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.11420032300020466, "process_elapsed": 0.13396674499999506}, "form_locals": {"wall_elapsed": 0.0013731700000789715, "process_elapsed": 0.010520605000003513}, "eval_multipoles": {"wall_elapsed": 0.0002223300002697215, "process_elapsed": 0.0018475250000022925}, "form_multipoles": {"wall_elapsed": 0.02159770300022501, "process_elapsed": 0.021787137000000456}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.036156407000135005, "process_elapsed": 0.10665832599999447}}, {"p2l_workload": 17414400, "m2p_workload": 22632000, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, "l2qbxl_workload": 10800000, "direct_workload": 0, "eval_part_workload": 2520000, "m2l_workload": 47616000, "m2qbxl_workload": 148080000, "p2qbxl_workload": 493642242, "eval_qbxl_workload": 97200, "p2l_nboxes": 5760, "nterms_fmm_total": 3088000, "eval_direct": {"wall_elapsed": 0.0014398020000498946, "process_elapsed": 0.001441014999997492}, 
"form_global_qbx_locals": {"wall_elapsed": 5.621687159999965, "process_elapsed": 43.778723058000004}, "multipole_to_local": {"wall_elapsed": 0.232288194000148, "process_elapsed": 1.521805771000004}, "eval_qbx_expansions": {"wall_elapsed": 0.17695011699993302, "process_elapsed": 0.1766893120000077}, "eval_locals": {"wall_elapsed": 0.0015130900001167902, "process_elapsed": 0.0015144890000016176}, "coarsen_multipoles": {"wall_elapsed": 0.01364043499984291, "process_elapsed": 0.013621080999996593}, "refine_locals": {"wall_elapsed": 0.01894895100008398, "process_elapsed": 0.018914254000002018}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.4205340979999619, "process_elapsed": 0.42609530299999676}, "form_locals": {"wall_elapsed": 0.5216265349999958, "process_elapsed": 0.527060671000001}, "eval_multipoles": {"wall_elapsed": 0.0004696480000347947, "process_elapsed": 0.013424528999998131}, "form_multipoles": {"wall_elapsed": 0.08589172200004214, "process_elapsed": 0.08777697000000018}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.36152846300024066, "process_elapsed": 1.7963144780000064}}, {"p2l_workload": 27091200, "m2p_workload": 33528000, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_workload": 0, "eval_part_workload": 3696000, "m2l_workload": 47616000, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "eval_qbxl_workload": 142560, "p2l_nboxes": 5760, "nterms_fmm_total": 4720000, "eval_direct": {"wall_elapsed": 0.0016364240000257269, "process_elapsed": 0.0016372789999934412}, "form_global_qbx_locals": {"wall_elapsed": 12.650557410000147, "process_elapsed": 98.856955851}, "multipole_to_local": {"wall_elapsed": 0.216005924999763, "process_elapsed": 1.4686899130000057}, "eval_qbx_expansions": {"wall_elapsed": 0.2587835169999835, "process_elapsed": 0.25878203500001007}, "eval_locals": {"wall_elapsed": 0.0016412659999787138, "process_elapsed": 0.0016427320000076406}, "coarsen_multipoles": 
{"wall_elapsed": 0.013596718999906443, "process_elapsed": 0.01359863899999425}, "refine_locals": {"wall_elapsed": 0.018542439000157174, "process_elapsed": 0.018544194000000402}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5977090630001385, "process_elapsed": 0.5977074240000206}, "form_locals": {"wall_elapsed": 0.7721112069998526, "process_elapsed": 0.7886337529999992}, "eval_multipoles": {"wall_elapsed": 0.0007427669997923658, "process_elapsed": 0.017427264000005493}, "form_multipoles": {"wall_elapsed": 0.12716900300006273, "process_elapsed": 0.12717103100000315}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5377625770001941, "process_elapsed": 2.7641417660000087}}, {"p2l_workload": 27091200, "m2p_workload": 33528000, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_workload": 0, "eval_part_workload": 3696000, "m2l_workload": 47616000, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "eval_qbxl_workload": 142560, "p2l_nboxes": 5760, "nterms_fmm_total": 4720000, "eval_direct": {"wall_elapsed": 0.0017083899999761343, "process_elapsed": 0.0017096619999676932}, "form_global_qbx_locals": {"wall_elapsed": 12.650411099000166, "process_elapsed": 98.909811163}, "multipole_to_local": {"wall_elapsed": 0.24223890199982634, "process_elapsed": 1.509391834000013}, "eval_qbx_expansions": {"wall_elapsed": 0.25370455300003414, "process_elapsed": 0.2537074840000173}, "eval_locals": {"wall_elapsed": 0.001636830000052214, "process_elapsed": 0.0016382329999942158}, "coarsen_multipoles": {"wall_elapsed": 0.015331328000002031, "process_elapsed": 0.015333470000001626}, "refine_locals": {"wall_elapsed": 0.018799158999854626, "process_elapsed": 0.01880120900003135}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5966280350000943, "process_elapsed": 0.5966279570000097}, "form_locals": {"wall_elapsed": 0.7709777210002358, "process_elapsed": 0.7852971849999903}, "eval_multipoles": {"wall_elapsed": 
0.0009081359999072447, "process_elapsed": 0.003684275000011894}, "form_multipoles": {"wall_elapsed": 0.12745193399996424, "process_elapsed": 0.12750258299999473}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5393657320000784, "process_elapsed": 2.756274291999972}}] \ No newline at end of file +[{"m2l_workload": 0, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 33868800, "m2qbxl_workload": 0, "m2p_workload": 0, "l2qbxl_workload": 1680000, "p2l_nboxes": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 15120, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.021093479999763076, "process_elapsed": 0.015244096000000873}, "eval_qbx_expansions": {"wall_elapsed": 0.029716666000240366, "process_elapsed": 0.02971870699999979}, "eval_locals": {"wall_elapsed": 0.0001865880003606435, "process_elapsed": 0.00018793100000102925}, "multipole_to_local": {"wall_elapsed": 0.001130373000705731, "process_elapsed": 0.0008656770000001757}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06602678799936257, "process_elapsed": 0.08573500999999961}, "refine_locals": {"wall_elapsed": 0.0012004210002487525, "process_elapsed": 0.0008828260000006694}, "form_global_qbx_locals": {"wall_elapsed": 0.4057365960015886, "process_elapsed": 3.1082306060000002}, "coarsen_multipoles": {"wall_elapsed": 2.9456999982357956e-05, "process_elapsed": 2.999100000078414e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.014524768999763182, "process_elapsed": 0.06752062500000111}, "eval_multipoles": {"wall_elapsed": 0.0001489429996581748, "process_elapsed": 0.0001503160000000392}, "eval_direct": {"wall_elapsed": 0.00015629100016667508, "process_elapsed": 0.00015768100000013163}, "form_locals": {"wall_elapsed": 0.00027929699899686966, "process_elapsed": 0.0002805499999993799}}, {"m2l_workload": 0, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 33868800, "m2qbxl_workload": 0, 
"m2p_workload": 0, "l2qbxl_workload": 1680000, "p2l_nboxes": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 15120, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.012340336999841384, "process_elapsed": 0.012342126000000064}, "eval_qbx_expansions": {"wall_elapsed": 0.03010847199766431, "process_elapsed": 0.030110333999996186}, "eval_locals": {"wall_elapsed": 0.00017738399947120342, "process_elapsed": 0.00017871500000055107}, "multipole_to_local": {"wall_elapsed": 0.0007668389989703428, "process_elapsed": 0.0007692639999987705}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.070026515999416, "process_elapsed": 0.0920731059999973}, "refine_locals": {"wall_elapsed": 0.0008041430010052864, "process_elapsed": 0.0008061699999988292}, "form_global_qbx_locals": {"wall_elapsed": 0.4070693410012609, "process_elapsed": 3.1083596039999986}, "coarsen_multipoles": {"wall_elapsed": 3.477300015219953e-05, "process_elapsed": 3.480300000191505e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01423785099905217, "process_elapsed": 0.06792730700000149}, "eval_multipoles": {"wall_elapsed": 0.00013409600069280714, "process_elapsed": 0.00013513799999920195}, "eval_direct": {"wall_elapsed": 0.00014394999925571028, "process_elapsed": 0.0001449600000000828}, "form_locals": {"wall_elapsed": 0.0002557949992478825, "process_elapsed": 0.0002571680000009735}}, {"m2l_workload": 1632000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 90281520, "m2qbxl_workload": 0, "m2p_workload": 0, "l2qbxl_workload": 2700000, "p2l_nboxes": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 24300, "nterms_fmm_total": 772000, "form_multipoles": {"wall_elapsed": 0.021441335000417894, "process_elapsed": 0.02144320500000063}, "eval_qbx_expansions": {"wall_elapsed": 0.05041948199868784, "process_elapsed": 0.05305360899999911}, "eval_locals": {"wall_elapsed": 
0.0005096630011394154, "process_elapsed": 0.0005112489999952174}, "multipole_to_local": {"wall_elapsed": 0.013463256000250112, "process_elapsed": 0.03790370300000134}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.11192823199962731, "process_elapsed": 0.12738405600000036}, "refine_locals": {"wall_elapsed": 0.005293365999023081, "process_elapsed": 0.005453985999999134}, "form_global_qbx_locals": {"wall_elapsed": 1.0642655539995758, "process_elapsed": 8.188607980000004}, "coarsen_multipoles": {"wall_elapsed": 3.5937999200541526e-05, "process_elapsed": 3.627799999961212e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.03252427000006719, "process_elapsed": 0.09634986499999343}, "eval_multipoles": {"wall_elapsed": 0.00020739300089189783, "process_elapsed": 0.017759949999998526}, "eval_direct": {"wall_elapsed": 0.0004732769994006958, "process_elapsed": 0.00047447200000050316}, "form_locals": {"wall_elapsed": 0.0011059109983762028, "process_elapsed": 0.0011847469999999305}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 493642242, "m2qbxl_workload": 148080000, "m2p_workload": 0, "l2qbxl_workload": 10800000, "p2l_nboxes": 5760, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, "p2l_workload": 17414400, "eval_qbxl_workload": 97200, "nterms_fmm_total": 3088000, "form_multipoles": {"wall_elapsed": 0.08514756100157683, "process_elapsed": 0.08530784799999935}, "eval_qbx_expansions": {"wall_elapsed": 0.1988989629990101, "process_elapsed": 0.20408010799999943}, "eval_locals": {"wall_elapsed": 0.0016958759988483507, "process_elapsed": 0.001697354000000928}, "multipole_to_local": {"wall_elapsed": 0.24327286500192713, "process_elapsed": 1.5432809840000061}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.45564075499896717, "process_elapsed": 0.4602933469999968}, "refine_locals": {"wall_elapsed": 0.02061064599911333, "process_elapsed": 0.021224416999999107}, "form_global_qbx_locals": {"wall_elapsed": 
5.597717438999098, "process_elapsed": 43.771067205}, "coarsen_multipoles": {"wall_elapsed": 0.014241842998671927, "process_elapsed": 0.014243738000004669}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.3647387950004486, "process_elapsed": 1.8143376410000016}, "eval_multipoles": {"wall_elapsed": 0.0005216209992795484, "process_elapsed": 0.014195053999998208}, "eval_direct": {"wall_elapsed": 0.0015977099992596777, "process_elapsed": 0.0015989620000027571}, "form_locals": {"wall_elapsed": 0.5284309560011025, "process_elapsed": 0.5403717400000048}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 1118188692, "m2qbxl_workload": 226896000, "m2p_workload": 0, "l2qbxl_workload": 15840000, "p2l_nboxes": 5760, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "p2l_workload": 27091200, "eval_qbxl_workload": 142560, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12709579200054577, "process_elapsed": 0.12702671399999588}, "eval_qbx_expansions": {"wall_elapsed": 0.28158704199995555, "process_elapsed": 0.28159011700000747}, "eval_locals": {"wall_elapsed": 0.001804113999241963, "process_elapsed": 0.0018056689999923492}, "multipole_to_local": {"wall_elapsed": 0.21822331300063524, "process_elapsed": 1.470087723000006}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6500158240014571, "process_elapsed": 0.6500182090000095}, "refine_locals": {"wall_elapsed": 0.01991879000161134, "process_elapsed": 0.019876508999999487}, "form_global_qbx_locals": {"wall_elapsed": 12.633814991997497, "process_elapsed": 98.88120058599999}, "coarsen_multipoles": {"wall_elapsed": 0.014304040001661633, "process_elapsed": 0.014269510000005425}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5418222919997788, "process_elapsed": 2.748641480999993}, "eval_multipoles": {"wall_elapsed": 0.0009419779999007005, "process_elapsed": 0.0009434199999986959}, "eval_direct": {"wall_elapsed": 0.0017403539986844407, 
"process_elapsed": 0.0017415970000058678}, "form_locals": {"wall_elapsed": 0.7705097859998205, "process_elapsed": 0.7850629389999995}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 1118188692, "m2qbxl_workload": 226896000, "m2p_workload": 0, "l2qbxl_workload": 15840000, "p2l_nboxes": 5760, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "p2l_workload": 27091200, "eval_qbxl_workload": 142560, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12758302099973662, "process_elapsed": 0.1273970490000238}, "eval_qbx_expansions": {"wall_elapsed": 0.2835366819999763, "process_elapsed": 0.28353914399997393}, "eval_locals": {"wall_elapsed": 0.001810083000236773, "process_elapsed": 0.0018113820000280612}, "multipole_to_local": {"wall_elapsed": 0.21933700199952, "process_elapsed": 1.4927009270000156}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6411107570020249, "process_elapsed": 0.6427057839999577}, "refine_locals": {"wall_elapsed": 0.019865717000357108, "process_elapsed": 0.019831113999998706}, "form_global_qbx_locals": {"wall_elapsed": 12.666067901998758, "process_elapsed": 98.85360947599997}, "coarsen_multipoles": {"wall_elapsed": 0.014492083999357419, "process_elapsed": 0.014402262000004384}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5464885890014557, "process_elapsed": 2.7702280599999938}, "eval_multipoles": {"wall_elapsed": 0.0007965080003486946, "process_elapsed": 0.010344065999987606}, "eval_direct": {"wall_elapsed": 0.0017223149989149533, "process_elapsed": 0.001723447999978589}, "form_locals": {"wall_elapsed": 0.7680267520008783, "process_elapsed": 0.7662016509999887}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 448000, "p2l_workload": 0, "m2l_workload": 0, "direct_workload": 0, "eval_qbxl_workload": 15120, "p2qbxl_workload": 33868800, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_nsource_boxes": 128, 
"eval_qbx_expansions": {"wall_elapsed": 0.027458508000563597, "process_elapsed": 0.027460470000000292}, "coarsen_multipoles": {"wall_elapsed": 2.592999953776598e-05, "process_elapsed": 2.6278000000878876e-05}, "form_multipoles": {"wall_elapsed": 0.014194509998560534, "process_elapsed": 0.014196669000001272}, "form_locals": {"wall_elapsed": 0.0002368280001974199, "process_elapsed": 0.00023786200000142088}, "refine_locals": {"wall_elapsed": 0.0007696419997955672, "process_elapsed": 0.0007711829999994535}, "form_global_qbx_locals": {"wall_elapsed": 0.4148323249992245, "process_elapsed": 3.136391765000001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06255740400047216, "process_elapsed": 0.0824586750000007}, "multipole_to_local": {"wall_elapsed": 0.0007675939978071256, "process_elapsed": 0.000769723000000333}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01681618300062837, "process_elapsed": 0.0716890939999999}, "eval_multipoles": {"wall_elapsed": 0.00012192499889351893, "process_elapsed": 0.00012304999999823707}, "eval_direct": {"wall_elapsed": 0.00014055599967832677, "process_elapsed": 0.00014181200000074057}, "eval_locals": {"wall_elapsed": 0.00016502899961778894, "process_elapsed": 0.00016628200000035065}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 448000, "p2l_workload": 0, "m2l_workload": 0, "direct_workload": 0, "eval_qbxl_workload": 15120, "p2qbxl_workload": 33868800, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_nsource_boxes": 128, "eval_qbx_expansions": {"wall_elapsed": 0.027217917999223573, "process_elapsed": 0.027210197999998798}, "coarsen_multipoles": {"wall_elapsed": 2.7272999432170764e-05, "process_elapsed": 2.7537000001132128e-05}, "form_multipoles": {"wall_elapsed": 0.012168246998044197, "process_elapsed": 0.012170330000000007}, "form_locals": {"wall_elapsed": 0.00025652499789430294, "process_elapsed": 0.0002578460000020044}, "refine_locals": 
{"wall_elapsed": 0.0007966619996295776, "process_elapsed": 0.0007986469999980983}, "form_global_qbx_locals": {"wall_elapsed": 0.41449507100151095, "process_elapsed": 3.124277467999999}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06530420099988987, "process_elapsed": 0.08900481899999946}, "multipole_to_local": {"wall_elapsed": 0.0007791059997543925, "process_elapsed": 0.0007817149999986839}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.016263018998870393, "process_elapsed": 0.08408505000000233}, "eval_multipoles": {"wall_elapsed": 0.0001345959990430856, "process_elapsed": 0.00013584000000221863}, "eval_direct": {"wall_elapsed": 0.00014376900071511045, "process_elapsed": 0.0001450710000003852}, "eval_locals": {"wall_elapsed": 0.00017906200082506984, "process_elapsed": 0.00018038700000033714}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 772000, "p2l_workload": 0, "m2l_workload": 1632000, "direct_workload": 0, "eval_qbxl_workload": 24300, "p2qbxl_workload": 90281520, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 2700000, "direct_nsource_boxes": 3008, "eval_qbx_expansions": {"wall_elapsed": 0.04887341300127446, "process_elapsed": 0.05584138399999716}, "coarsen_multipoles": {"wall_elapsed": 2.4689998099347576e-05, "process_elapsed": 2.4934000002474477e-05}, "form_multipoles": {"wall_elapsed": 0.021673631999874488, "process_elapsed": 0.02167526300000233}, "form_locals": {"wall_elapsed": 0.0013548639999498846, "process_elapsed": 0.0030114619999999093}, "refine_locals": {"wall_elapsed": 0.004909453999061952, "process_elapsed": 0.006460705999998595}, "form_global_qbx_locals": {"wall_elapsed": 1.0763670570013346, "process_elapsed": 8.281176414999997}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.10042979500030924, "process_elapsed": 0.12154810500000224}, "multipole_to_local": {"wall_elapsed": 0.01164319700183114, "process_elapsed": 0.043857604999995914}, 
"translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.033270312000240665, "process_elapsed": 0.11139830600000167}, "eval_multipoles": {"wall_elapsed": 0.00020722800036310218, "process_elapsed": 0.012455046999999553}, "eval_direct": {"wall_elapsed": 0.00044317800166027155, "process_elapsed": 0.00044460899999876347}, "eval_locals": {"wall_elapsed": 0.0004815839984075865, "process_elapsed": 0.00048297599999713725}}, {"m2qbxl_workload": 148080000, "nterms_fmm_total": 3088000, "p2l_workload": 17414400, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 97200, "p2qbxl_workload": 493642242, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 2880, "l2qbxl_workload": 10800000, "direct_nsource_boxes": 18368, "eval_qbx_expansions": {"wall_elapsed": 0.17471831900002144, "process_elapsed": 0.1748417530000026}, "coarsen_multipoles": {"wall_elapsed": 0.01359383099952538, "process_elapsed": 0.013595676000001333}, "form_multipoles": {"wall_elapsed": 0.08885329399890907, "process_elapsed": 0.08904873100000543}, "form_locals": {"wall_elapsed": 0.5181811650018062, "process_elapsed": 0.5401015320000013}, "refine_locals": {"wall_elapsed": 0.0193349160017533, "process_elapsed": 0.019337051000000827}, "form_global_qbx_locals": {"wall_elapsed": 5.6473584810009925, "process_elapsed": 44.13173326100001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.43283446399982495, "process_elapsed": 0.45431766599999435}, "multipole_to_local": {"wall_elapsed": 0.21959404200060817, "process_elapsed": 1.4773259700000096}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.36450363600124547, "process_elapsed": 1.839162819000002}, "eval_multipoles": {"wall_elapsed": 0.0005835300016769907, "process_elapsed": 0.015605516999990243}, "eval_direct": {"wall_elapsed": 0.001544830000057118, "process_elapsed": 0.001546145999995474}, "eval_locals": {"wall_elapsed": 0.0018016889989667106, "process_elapsed": 0.0018037119999974038}}, {"m2qbxl_workload": 
226896000, "nterms_fmm_total": 4720000, "p2l_workload": 27091200, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 142560, "p2qbxl_workload": 1118188692, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_nsource_boxes": 21560, "eval_qbx_expansions": {"wall_elapsed": 0.2554673249996995, "process_elapsed": 0.2548831920000083}, "coarsen_multipoles": {"wall_elapsed": 0.013426133000393747, "process_elapsed": 0.013428055999995081}, "form_multipoles": {"wall_elapsed": 0.12737836500127742, "process_elapsed": 0.12738107700000967}, "form_locals": {"wall_elapsed": 0.7677063330011151, "process_elapsed": 0.7686912520000106}, "refine_locals": {"wall_elapsed": 0.018829676000677864, "process_elapsed": 0.0190762870000043}, "form_global_qbx_locals": {"wall_elapsed": 12.738795529001436, "process_elapsed": 99.644129348}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5864635059988359, "process_elapsed": 0.5854574929999927}, "multipole_to_local": {"wall_elapsed": 0.23098103499978606, "process_elapsed": 1.500902835000005}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.540710029001275, "process_elapsed": 2.745416965999979}, "eval_multipoles": {"wall_elapsed": 0.0006263670002226718, "process_elapsed": 0.014304139000003602}, "eval_direct": {"wall_elapsed": 0.0016434970002592308, "process_elapsed": 0.0016445880000048874}, "eval_locals": {"wall_elapsed": 0.0017078709988709306, "process_elapsed": 0.0017091690000086146}}, {"m2qbxl_workload": 226896000, "nterms_fmm_total": 4720000, "p2l_workload": 27091200, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 142560, "p2qbxl_workload": 1118188692, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_nsource_boxes": 21560, "eval_qbx_expansions": {"wall_elapsed": 0.2562931430002209, "process_elapsed": 0.2562958170000229}, "coarsen_multipoles": 
{"wall_elapsed": 0.013575992001278792, "process_elapsed": 0.013578048999988823}, "form_multipoles": {"wall_elapsed": 0.127262083999085, "process_elapsed": 0.12726020400000948}, "form_locals": {"wall_elapsed": 0.7632982899995113, "process_elapsed": 0.7632958620000068}, "refine_locals": {"wall_elapsed": 0.018785588999890024, "process_elapsed": 0.01878766699999801}, "form_global_qbx_locals": {"wall_elapsed": 12.749122495999472, "process_elapsed": 99.54689662300001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5856983570010925, "process_elapsed": 0.5856943359999889}, "multipole_to_local": {"wall_elapsed": 0.2411785369986319, "process_elapsed": 1.523934061999995}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5380380780006817, "process_elapsed": 2.7199274079999896}, "eval_multipoles": {"wall_elapsed": 0.0006180829986988101, "process_elapsed": 0.017411200999987386}, "eval_direct": {"wall_elapsed": 0.0016484799998579547, "process_elapsed": 0.0016498760000160928}, "eval_locals": {"wall_elapsed": 0.0017064889998437138, "process_elapsed": 0.0017078759999833437}}, {"p2l_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "p2l_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 15120, "m2l_workload": 0, "m2p_workload": 0, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.014648365000539343, "process_elapsed": 0.014650504000000453}, "eval_qbx_expansions": {"wall_elapsed": 0.028573530000358005, "process_elapsed": 0.028575531999999626}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01594121800189896, "process_elapsed": 0.06301019599999957}, "eval_multipoles": {"wall_elapsed": 0.00014124999870546162, "process_elapsed": 0.00014257299999975714}, "form_global_qbx_locals": {"wall_elapsed": 0.415889525998864, "process_elapsed": 3.152819396}, "multipole_to_local": {"wall_elapsed": 0.0008315519989992026, 
"process_elapsed": 0.0008338300000003684}, "form_locals": {"wall_elapsed": 0.0002776020010060165, "process_elapsed": 0.0002789340000006746}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06354885899963847, "process_elapsed": 0.08562401699999977}, "coarsen_multipoles": {"wall_elapsed": 2.6239000362693332e-05, "process_elapsed": 2.668399999894433e-05}, "eval_direct": {"wall_elapsed": 0.0001438249983038986, "process_elapsed": 0.00014487600000023804}, "refine_locals": {"wall_elapsed": 0.0007603810008731671, "process_elapsed": 0.0007619090000003936}, "eval_locals": {"wall_elapsed": 0.00016837099974509329, "process_elapsed": 0.00016960200000060155}}, {"p2l_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "p2l_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 15120, "m2l_workload": 0, "m2p_workload": 0, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.012285744000109844, "process_elapsed": 0.012279533000000953}, "eval_qbx_expansions": {"wall_elapsed": 0.027927285998885054, "process_elapsed": 0.027929128000000247}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01500640099766315, "process_elapsed": 0.061567719999999326}, "eval_multipoles": {"wall_elapsed": 0.00013151700113667175, "process_elapsed": 0.00013257900000063216}, "form_global_qbx_locals": {"wall_elapsed": 0.4132579479992273, "process_elapsed": 3.1420789520000003}, "multipole_to_local": {"wall_elapsed": 0.0007470569998986321, "process_elapsed": 0.0007492009999996441}, "form_locals": {"wall_elapsed": 0.00024485500034643337, "process_elapsed": 0.0002460000000006346}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06745604599927901, "process_elapsed": 0.09076768399999935}, "coarsen_multipoles": {"wall_elapsed": 2.470700019330252e-05, "process_elapsed": 2.5114999999686916e-05}, "eval_direct": {"wall_elapsed": 0.00014215699775377288, "process_elapsed": 
0.00014338100000088616}, "refine_locals": {"wall_elapsed": 0.000750778999645263, "process_elapsed": 0.0007524110000023398}, "eval_locals": {"wall_elapsed": 0.00016190499991353136, "process_elapsed": 0.00016301199999801952}}, {"p2l_nboxes": 0, "l2qbxl_workload": 2700000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 90281520, "p2l_workload": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 24300, "m2l_workload": 1632000, "m2p_workload": 0, "nterms_fmm_total": 772000, "form_multipoles": {"wall_elapsed": 0.021994480001012562, "process_elapsed": 0.02252935800000344}, "eval_qbx_expansions": {"wall_elapsed": 0.0468985520019487, "process_elapsed": 0.0469004459999951}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.037229944000500836, "process_elapsed": 0.12695072000000707}, "eval_multipoles": {"wall_elapsed": 0.0001834110007621348, "process_elapsed": 0.00018498699999724977}, "form_global_qbx_locals": {"wall_elapsed": 1.0715511270009301, "process_elapsed": 8.245669204999999}, "multipole_to_local": {"wall_elapsed": 0.012459129999115248, "process_elapsed": 0.044035273999998736}, "form_locals": {"wall_elapsed": 0.0012082059984095395, "process_elapsed": 0.00120971399999803}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.10760574299820291, "process_elapsed": 0.1259072450000076}, "coarsen_multipoles": {"wall_elapsed": 2.7975002012681216e-05, "process_elapsed": 2.83579999980077e-05}, "eval_direct": {"wall_elapsed": 0.0004856249997828854, "process_elapsed": 0.00048697500000116634}, "refine_locals": {"wall_elapsed": 0.004943396999806282, "process_elapsed": 0.004945113000001555}, "eval_locals": {"wall_elapsed": 0.0004906579997623339, "process_elapsed": 0.0004921109999962425}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 10800000, "direct_workload": 0, "m2qbxl_workload": 148080000, "p2qbxl_workload": 493642242, "p2l_workload": 17414400, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, 
"eval_part_workload": 0, "eval_qbxl_workload": 97200, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 3088000, "form_multipoles": {"wall_elapsed": 0.08571038500122086, "process_elapsed": 0.0860022119999968}, "eval_qbx_expansions": {"wall_elapsed": 0.17934093500116433, "process_elapsed": 0.18099043000000847}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.37942328399913094, "process_elapsed": 1.9070648920000082}, "eval_multipoles": {"wall_elapsed": 0.000492429999212618, "process_elapsed": 0.014625257999995256}, "form_global_qbx_locals": {"wall_elapsed": 5.654556654000771, "process_elapsed": 44.21751857500001}, "multipole_to_local": {"wall_elapsed": 0.23715932500090275, "process_elapsed": 1.5620473709999985}, "form_locals": {"wall_elapsed": 0.5225906369996665, "process_elapsed": 0.5309461430000013}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.4364859279994562, "process_elapsed": 0.45337846200001053}, "coarsen_multipoles": {"wall_elapsed": 0.013538190001781913, "process_elapsed": 0.013539729999997974}, "eval_direct": {"wall_elapsed": 0.001511279999249382, "process_elapsed": 0.0015123169999995412}, "refine_locals": {"wall_elapsed": 0.020114442000704003, "process_elapsed": 0.020405819000004044}, "eval_locals": {"wall_elapsed": 0.002200300999902538, "process_elapsed": 0.0022019109999931175}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 15840000, "direct_workload": 0, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "p2l_workload": 27091200, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "eval_part_workload": 0, "eval_qbxl_workload": 142560, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12949872399804008, "process_elapsed": 0.1295011399999879}, "eval_qbx_expansions": {"wall_elapsed": 0.26377770700128167, "process_elapsed": 0.26379084600002045}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5667059379993589, "process_elapsed": 
2.8107373960000217}, "eval_multipoles": {"wall_elapsed": 0.0006380379982147133, "process_elapsed": 0.02108191400000692}, "form_global_qbx_locals": {"wall_elapsed": 12.72665303499889, "process_elapsed": 99.514778799}, "multipole_to_local": {"wall_elapsed": 0.23357510099958745, "process_elapsed": 1.5240983700000044}, "form_locals": {"wall_elapsed": 0.7732686960007413, "process_elapsed": 0.7745623440000173}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6227469330005988, "process_elapsed": 0.6227212100000088}, "coarsen_multipoles": {"wall_elapsed": 0.013453300000037416, "process_elapsed": 0.013455266999997662}, "eval_direct": {"wall_elapsed": 0.0016683400008332683, "process_elapsed": 0.0016693409999959385}, "refine_locals": {"wall_elapsed": 0.01893364100214967, "process_elapsed": 0.01893565800000374}, "eval_locals": {"wall_elapsed": 0.0017234650022146525, "process_elapsed": 0.0017244739999995318}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 15840000, "direct_workload": 0, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "p2l_workload": 27091200, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "eval_part_workload": 0, "eval_qbxl_workload": 142560, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12816052500056685, "process_elapsed": 0.12831986100002268}, "eval_qbx_expansions": {"wall_elapsed": 0.2602627809992555, "process_elapsed": 0.259980588000019}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5683883500005322, "process_elapsed": 2.859925634999996}, "eval_multipoles": {"wall_elapsed": 0.0006368619997374481, "process_elapsed": 0.02064926899998909}, "form_global_qbx_locals": {"wall_elapsed": 12.768018111000856, "process_elapsed": 99.855472591}, "multipole_to_local": {"wall_elapsed": 0.2329632920009317, "process_elapsed": 1.5199842549999971}, "form_locals": {"wall_elapsed": 0.7612577990003047, "process_elapsed": 0.760523242000005}, "translate_box_local_to_qbx_local": 
{"wall_elapsed": 0.6147937720015761, "process_elapsed": 0.6136677860000077}, "coarsen_multipoles": {"wall_elapsed": 0.013572886000474682, "process_elapsed": 0.013538662000001978}, "eval_direct": {"wall_elapsed": 0.0017914090003614547, "process_elapsed": 0.0017926269999861688}, "refine_locals": {"wall_elapsed": 0.01885352499994042, "process_elapsed": 0.018819304999993847}, "eval_locals": {"wall_elapsed": 0.0016937519994826289, "process_elapsed": 0.0016949300000135281}}] \ No newline at end of file diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 4e7deeb6..761a931a 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -46,10 +46,10 @@ class QBXPerformanceCounter(PerformanceCounter): translation_max_power=fmm_parameters.translation_max_power, ) - def count_direct(self, use_global_idx=False): + def count_direct(self, use_global_idx=False, box_target_counts_nonchild=None): """ - This method overwrites the one in parent class because the only non-qbx - targets should be counted. + This method overwrites the one in parent class because only non-qbx targets + should be counted. :return: If *use_global_idx* is True, return a numpy array of shape (tree.nboxes,) such that the ith entry represents the workload from @@ -57,48 +57,42 @@ class QBXPerformanceCounter(PerformanceCounter): array of shape (ntarget_boxes,) such that the ith entry represents the workload on *target_boxes* i. 
""" - box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ - .box_target_counts_nonchild - traversal = self.traversal - tree = traversal.tree - - if use_global_idx: - direct_workload = np.zeros((tree.nboxes,), dtype=np.intp) - else: - ntarget_boxes = len(traversal.target_boxes) - direct_workload = np.zeros((ntarget_boxes,), dtype=np.intp) - - for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = box_target_counts_nonchild[tgt_ibox] - nsources = 0 + if box_target_counts_nonchild is None: + box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ + .box_target_counts_nonchild - start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] - - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - nsources += tree.box_source_counts_nonchild[src_ibox] - - if traversal.from_sep_close_smaller_starts is not None: - start, end = ( - traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) - - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - nsources += tree.box_source_counts_nonchild[src_ibox] - - if traversal.from_sep_close_bigger_starts is not None: - start, end = ( - traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) + return super(QBXPerformanceCounter, self).count_direct( + use_global_idx=use_global_idx, + box_target_counts_nonchild=box_target_counts_nonchild + ) - for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - nsources += tree.box_source_counts_nonchild[src_ibox] + def count_m2p(self, use_global_idx=False, box_target_counts_nonchild=None): + """ + This method overwrites the one in parent class because only non-qbx targets + should be counted. 
+ """ + if box_target_counts_nonchild is None: + box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ + .box_target_counts_nonchild - count = nsources * ntargets + return super(QBXPerformanceCounter, self).count_m2p( + use_global_idx=use_global_idx, + box_target_counts_nonchild=box_target_counts_nonchild + ) - if use_global_idx: - direct_workload[tgt_ibox] = count - else: - direct_workload[itgt_box] = count + def count_eval_part(self, use_global_idx=False, box_target_counts_nonchild=None): + """ + This method overwrites the one in parent class because only non-qbx targets + should be counted. + """ + if box_target_counts_nonchild is None: + box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ + .box_target_counts_nonchild - return direct_workload + return super(QBXPerformanceCounter, self).count_eval_part( + use_global_idx=use_global_idx, + box_target_counts_nonchild=box_target_counts_nonchild + ) def count_p2qbxl(self, use_global_idx=False): geo_data = self.geo_data -- GitLab From b42626b716805f5adb379551950a4f92a985b1e6 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 13 Aug 2018 12:02:02 -0500 Subject: [PATCH 42/86] Add an argument for supplying perf model --- pytential/qbx/distributed.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 8990bdee..0716bb02 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -143,7 +143,8 @@ class QBXDistributedFMMLibExpansionWrangler( # {{{ Distributed GeoData class DistributedGeoData(object): - def __init__(self, geo_data, queue, global_wrangler, comm=MPI.COMM_WORLD): + def __init__(self, geo_data, queue, global_wrangler, perf_model_file_path=None, + comm=MPI.COMM_WORLD): self.comm = comm current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -197,7 +198,10 @@ class DistributedGeoData(object): # FIXME: If the expansion wrangler is not FMMLib, the argument # 
'uses_pde_expansions' might be different - model.load_default_model() + if perf_model_file_path is None: + model.load_default_model() + else: + model.loadjson(perf_model_file_path) boxes_time = model.predict_boxes_time(geo_data, global_wrangler) @@ -611,6 +615,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): to_refined_connection=None, expansion_factory=None, target_association_tolerance=_not_provided, + perf_model_file_path=None, # begin undocumented arguments # FIXME default debug=False once everything has matured @@ -631,6 +636,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): current_rank = self.comm.Get_rank() self.distributed_geo_data_cache = {} + self.perf_model_file_path = perf_model_file_path if current_rank == 0: self.next_geo_data_id = 0 @@ -714,6 +720,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): obj.__class__ = DistributedQBXLayerPotentialSource obj.comm = self.comm obj.distributed_geo_data_cache = self.distributed_geo_data_cache + obj.perf_model_file_path = self.perf_model_file_path current_rank = self.comm.Get_rank() @@ -753,7 +760,9 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) distributed_geo_data = DistributedGeoData( - host_geo_data, queue, wrangler, self.comm + host_geo_data, queue, wrangler, + perf_model_file_path=self.perf_model_file_path, + comm=self.comm ) else: -- GitLab From 29d34a173487bd7041738eb11208c0bc02560aea Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 16 Aug 2018 20:29:08 -0500 Subject: [PATCH 43/86] [ci skip] Update for changes to TimingResult interface --- examples/performance.py | 2 +- pytential/qbx/perf_model.py | 4 ++-- pytential/qbx/performance.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/performance.py b/examples/performance.py index cc0c01db..3e2dc61b 100644 --- a/examples/performance.py +++ b/examples/performance.py @@ 
-148,7 +148,7 @@ def test_performance_model(ctx, perf_model): timing_result = {} for param in model_result: timing_result[param] = ( - sum(temp_timing_result[param].process_elapsed + sum(temp_timing_result[param]["process_elapsed"] for temp_timing_result in temp_timing_results)) / RUNS print("=" * 20) diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py index 761a931a..49c05388 100644 --- a/pytential/qbx/perf_model.py +++ b/pytential/qbx/perf_model.py @@ -456,9 +456,9 @@ class QBXPerformanceModel(PerformanceModel): predict_time_field = predict_timing[field] if wall_time: - true_time_field = actual_timing[field].wall_elapsed + true_time_field = actual_timing[field]["wall_elapsed"] else: - true_time_field = actual_timing[field].process_elapsed + true_time_field = actual_timing[field]["process_elapsed"] diff = abs(predict_time_field - true_time_field) diff --git a/pytential/qbx/performance.py b/pytential/qbx/performance.py index 2e42203d..8c886f40 100644 --- a/pytential/qbx/performance.py +++ b/pytential/qbx/performance.py @@ -757,7 +757,7 @@ def estimate_calibration_params(model_results, timing_results): for param, time in timing_result.items(): calibration_param = ( _FMM_STAGE_TO_CALIBRATION_PARAMETER[param]) - actual_times[calibration_param][i] = time.process_elapsed + actual_times[calibration_param][i] = time["process_elapsed"] result = {} -- GitLab From ff17d5a79aa9336abe54ba75a1ce151d9829327a Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 17 Aug 2018 00:04:13 -0500 Subject: [PATCH 44/86] List 4 near interface change --- pytential/qbx/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 0716bb02..37135293 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -887,7 +887,7 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, if local_traversal.from_sep_close_bigger_starts is not None: 
non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( - local_traversal.target_or_target_parent_boxes, + local_traversal.target_boxes, local_traversal.from_sep_close_bigger_starts, local_traversal.from_sep_close_bigger_lists, local_source_weights)[0] -- GitLab From 1fce4e698aafaa1135cf3b8ea3a5834aff1c6a3c Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 28 Oct 2018 17:16:38 -0500 Subject: [PATCH 45/86] Bug fix --- pytential/qbx/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 37135293..bd72b87d 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -756,7 +756,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # no cached result found, construct a new distributed_geo_data if current_rank == 0: - from pytential.qbx.fmmlib import ToHostTransferredGeoDataWrapper + from pytential.qbx.utils import ToHostTransferredGeoDataWrapper host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) distributed_geo_data = DistributedGeoData( -- GitLab From 0ddc574dbef5d2ec6a08bda303651fa6df21c41b Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 29 Oct 2018 09:49:44 -0500 Subject: [PATCH 46/86] Log timing in each step --- pytential/qbx/__init__.py | 4 ++- pytential/qbx/distributed.py | 49 ++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index d6569c12..f0ac2b62 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -856,7 +856,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): from pytential.qbx.distributed import drive_dfmm all_potentials_on_every_tgt = drive_dfmm( - queue, strengths, distributed_geo_data, comm=self.comm) + queue, strengths, distributed_geo_data, comm=self.comm, + record_timing=self.record_timing + ) else: from pytential.qbx.fmm import drive_fmm all_potentials_on_every_tgt = 
drive_fmm(wrangler, strengths, timing_data) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index bd72b87d..80785a56 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -9,6 +9,7 @@ import pyopencl as cl import logging import time from boxtree.tools import return_timing_data +from boxtree.distributed.util import TimeRecorder logger = logging.getLogger(__name__) @@ -616,6 +617,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): expansion_factory=None, target_association_tolerance=_not_provided, perf_model_file_path=None, + record_timing=False, # begin undocumented arguments # FIXME default debug=False once everything has matured @@ -637,6 +639,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): self.distributed_geo_data_cache = {} self.perf_model_file_path = perf_model_file_path + self.record_timing = record_timing if current_rank == 0: self.next_geo_data_id = 0 @@ -721,6 +724,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): obj.comm = self.comm obj.distributed_geo_data_cache = self.distributed_geo_data_cache obj.perf_model_file_path = self.perf_model_file_path + obj.record_timing = self.record_timing current_rank = self.comm.Get_rank() @@ -775,8 +779,10 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # {{{ FMM Driver + def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, - _communicate_mpoles_via_allreduce=False): + _communicate_mpoles_via_allreduce=False, + record_timing=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -785,10 +791,18 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, if current_rank == 0: start_time = time.time() + if record_timing: + distribute_wrangler_recorder = TimeRecorder( + "Distribute wrangler", comm, logger + ) + distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( queue, global_wrangler, 
distributed_geo_data) wrangler = distributed_wrangler + if record_timing: + distribute_wrangler_recorder.record() + local_traversal = distributed_geo_data.local_trav # {{{ Distribute source weights @@ -799,7 +813,9 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, from boxtree.distributed.calculation import distribute_source_weights local_source_weights = distribute_source_weights( - src_weights, distributed_geo_data.local_data, comm=comm) + src_weights, distributed_geo_data.local_data, comm=comm, + record_timing=record_timing + ) # }}} @@ -830,10 +846,16 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, comm.Allreduce(mpole_exps, mpole_exps_all) mpole_exps = mpole_exps_all else: - communicate_mpoles(wrangler, comm, local_traversal, mpole_exps) + communicate_mpoles( + wrangler, comm, local_traversal, mpole_exps, + record_timing=record_timing + ) # }}} + if record_timing: + interaction_timer = time.time() + # {{{ direct evaluation from neighbor source boxes ("list 1") non_qbx_potentials = wrangler.eval_direct( @@ -926,9 +948,22 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, # }}} + if record_timing: + logger.info( + "Interaction calculation finished on process {0} in {1} secs".format( + comm.Get_rank(), time.time() - interaction_timer + ) + ) + + if record_timing: + send_potential_to_root_recorder = TimeRecorder( + "Send potential to root", comm, logger + ) + if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) + result = None else: # master process @@ -983,9 +1018,13 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) + if record_timing: + send_potential_to_root_recorder.record() + + if current_rank == 0: 
logger.info("Distributed FMM evaluation finished in {} secs.".format( - time.time() - start_time)) + time.time() - start_time)) - return result + return result # }}} -- GitLab From da05808623291005102d874427a74bf46568e930 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 21 Feb 2019 21:42:08 -0600 Subject: [PATCH 47/86] Use new cost model --- pytential/qbx/__init__.py | 22 +- pytential/qbx/default_perf_model.json | 1 - pytential/qbx/distributed.py | 30 +- pytential/qbx/perf_model.py | 472 -------------------- test/distributed/test_layer_pot_identity.py | 2 +- test/distributed/test_off_surface_eval.py | 2 +- 6 files changed, 32 insertions(+), 497 deletions(-) delete mode 100644 pytential/qbx/default_perf_model.json delete mode 100644 pytential/qbx/perf_model.py diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 45951401..2810248a 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -870,8 +870,28 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): timing_data = {} if return_timing_data else None if self.fmm_backend == 'distributed': + # FIXME: If the expansion wrangler is not FMMLib, the argument + # 'uses_pde_expansions' might be different + if self.cost_model is None: + from pytential.qbx.cost import CLQBXCostModel + cost_model = CLQBXCostModel( + queue, CLQBXCostModel.get_constantone_calibration_params() + ) + else: + cost_model = self.cost_model + + kernel_args = {} + for arg_name, arg_expr in six.iteritems(insn.kernel_arguments): + kernel_args[arg_name] = evaluate(arg_expr) + + boxes_time = cost_model.aggregate_stage_costs_per_box( + geo_data.traversal(), cost_model.get_qbx_modeled_cost( + geo_data, insn.base_kernel, kernel_args + ) + ).get() + distributed_geo_data = self.distibuted_geo_data( - geo_data, queue, wrangler + geo_data, queue, wrangler, boxes_time ) from pytential.qbx.distributed import drive_dfmm diff --git a/pytential/qbx/default_perf_model.json b/pytential/qbx/default_perf_model.json deleted file 
mode 100644 index 01df2bff..00000000 --- a/pytential/qbx/default_perf_model.json +++ /dev/null @@ -1 +0,0 @@ -[{"m2l_workload": 0, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 33868800, "m2qbxl_workload": 0, "m2p_workload": 0, "l2qbxl_workload": 1680000, "p2l_nboxes": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 15120, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.021093479999763076, "process_elapsed": 0.015244096000000873}, "eval_qbx_expansions": {"wall_elapsed": 0.029716666000240366, "process_elapsed": 0.02971870699999979}, "eval_locals": {"wall_elapsed": 0.0001865880003606435, "process_elapsed": 0.00018793100000102925}, "multipole_to_local": {"wall_elapsed": 0.001130373000705731, "process_elapsed": 0.0008656770000001757}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06602678799936257, "process_elapsed": 0.08573500999999961}, "refine_locals": {"wall_elapsed": 0.0012004210002487525, "process_elapsed": 0.0008828260000006694}, "form_global_qbx_locals": {"wall_elapsed": 0.4057365960015886, "process_elapsed": 3.1082306060000002}, "coarsen_multipoles": {"wall_elapsed": 2.9456999982357956e-05, "process_elapsed": 2.999100000078414e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.014524768999763182, "process_elapsed": 0.06752062500000111}, "eval_multipoles": {"wall_elapsed": 0.0001489429996581748, "process_elapsed": 0.0001503160000000392}, "eval_direct": {"wall_elapsed": 0.00015629100016667508, "process_elapsed": 0.00015768100000013163}, "form_locals": {"wall_elapsed": 0.00027929699899686966, "process_elapsed": 0.0002805499999993799}}, {"m2l_workload": 0, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 33868800, "m2qbxl_workload": 0, "m2p_workload": 0, "l2qbxl_workload": 1680000, "p2l_nboxes": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 15120, "nterms_fmm_total": 448000, "form_multipoles": 
{"wall_elapsed": 0.012340336999841384, "process_elapsed": 0.012342126000000064}, "eval_qbx_expansions": {"wall_elapsed": 0.03010847199766431, "process_elapsed": 0.030110333999996186}, "eval_locals": {"wall_elapsed": 0.00017738399947120342, "process_elapsed": 0.00017871500000055107}, "multipole_to_local": {"wall_elapsed": 0.0007668389989703428, "process_elapsed": 0.0007692639999987705}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.070026515999416, "process_elapsed": 0.0920731059999973}, "refine_locals": {"wall_elapsed": 0.0008041430010052864, "process_elapsed": 0.0008061699999988292}, "form_global_qbx_locals": {"wall_elapsed": 0.4070693410012609, "process_elapsed": 3.1083596039999986}, "coarsen_multipoles": {"wall_elapsed": 3.477300015219953e-05, "process_elapsed": 3.480300000191505e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01423785099905217, "process_elapsed": 0.06792730700000149}, "eval_multipoles": {"wall_elapsed": 0.00013409600069280714, "process_elapsed": 0.00013513799999920195}, "eval_direct": {"wall_elapsed": 0.00014394999925571028, "process_elapsed": 0.0001449600000000828}, "form_locals": {"wall_elapsed": 0.0002557949992478825, "process_elapsed": 0.0002571680000009735}}, {"m2l_workload": 1632000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 90281520, "m2qbxl_workload": 0, "m2p_workload": 0, "l2qbxl_workload": 2700000, "p2l_nboxes": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "p2l_workload": 0, "eval_qbxl_workload": 24300, "nterms_fmm_total": 772000, "form_multipoles": {"wall_elapsed": 0.021441335000417894, "process_elapsed": 0.02144320500000063}, "eval_qbx_expansions": {"wall_elapsed": 0.05041948199868784, "process_elapsed": 0.05305360899999911}, "eval_locals": {"wall_elapsed": 0.0005096630011394154, "process_elapsed": 0.0005112489999952174}, "multipole_to_local": {"wall_elapsed": 0.013463256000250112, "process_elapsed": 0.03790370300000134}, "translate_box_local_to_qbx_local": {"wall_elapsed": 
0.11192823199962731, "process_elapsed": 0.12738405600000036}, "refine_locals": {"wall_elapsed": 0.005293365999023081, "process_elapsed": 0.005453985999999134}, "form_global_qbx_locals": {"wall_elapsed": 1.0642655539995758, "process_elapsed": 8.188607980000004}, "coarsen_multipoles": {"wall_elapsed": 3.5937999200541526e-05, "process_elapsed": 3.627799999961212e-05}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.03252427000006719, "process_elapsed": 0.09634986499999343}, "eval_multipoles": {"wall_elapsed": 0.00020739300089189783, "process_elapsed": 0.017759949999998526}, "eval_direct": {"wall_elapsed": 0.0004732769994006958, "process_elapsed": 0.00047447200000050316}, "form_locals": {"wall_elapsed": 0.0011059109983762028, "process_elapsed": 0.0011847469999999305}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 493642242, "m2qbxl_workload": 148080000, "m2p_workload": 0, "l2qbxl_workload": 10800000, "p2l_nboxes": 5760, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, "p2l_workload": 17414400, "eval_qbxl_workload": 97200, "nterms_fmm_total": 3088000, "form_multipoles": {"wall_elapsed": 0.08514756100157683, "process_elapsed": 0.08530784799999935}, "eval_qbx_expansions": {"wall_elapsed": 0.1988989629990101, "process_elapsed": 0.20408010799999943}, "eval_locals": {"wall_elapsed": 0.0016958759988483507, "process_elapsed": 0.001697354000000928}, "multipole_to_local": {"wall_elapsed": 0.24327286500192713, "process_elapsed": 1.5432809840000061}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.45564075499896717, "process_elapsed": 0.4602933469999968}, "refine_locals": {"wall_elapsed": 0.02061064599911333, "process_elapsed": 0.021224416999999107}, "form_global_qbx_locals": {"wall_elapsed": 5.597717438999098, "process_elapsed": 43.771067205}, "coarsen_multipoles": {"wall_elapsed": 0.014241842998671927, "process_elapsed": 0.014243738000004669}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 
0.3647387950004486, "process_elapsed": 1.8143376410000016}, "eval_multipoles": {"wall_elapsed": 0.0005216209992795484, "process_elapsed": 0.014195053999998208}, "eval_direct": {"wall_elapsed": 0.0015977099992596777, "process_elapsed": 0.0015989620000027571}, "form_locals": {"wall_elapsed": 0.5284309560011025, "process_elapsed": 0.5403717400000048}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, "p2qbxl_workload": 1118188692, "m2qbxl_workload": 226896000, "m2p_workload": 0, "l2qbxl_workload": 15840000, "p2l_nboxes": 5760, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "p2l_workload": 27091200, "eval_qbxl_workload": 142560, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12709579200054577, "process_elapsed": 0.12702671399999588}, "eval_qbx_expansions": {"wall_elapsed": 0.28158704199995555, "process_elapsed": 0.28159011700000747}, "eval_locals": {"wall_elapsed": 0.001804113999241963, "process_elapsed": 0.0018056689999923492}, "multipole_to_local": {"wall_elapsed": 0.21822331300063524, "process_elapsed": 1.470087723000006}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6500158240014571, "process_elapsed": 0.6500182090000095}, "refine_locals": {"wall_elapsed": 0.01991879000161134, "process_elapsed": 0.019876508999999487}, "form_global_qbx_locals": {"wall_elapsed": 12.633814991997497, "process_elapsed": 98.88120058599999}, "coarsen_multipoles": {"wall_elapsed": 0.014304040001661633, "process_elapsed": 0.014269510000005425}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5418222919997788, "process_elapsed": 2.748641480999993}, "eval_multipoles": {"wall_elapsed": 0.0009419779999007005, "process_elapsed": 0.0009434199999986959}, "eval_direct": {"wall_elapsed": 0.0017403539986844407, "process_elapsed": 0.0017415970000058678}, "form_locals": {"wall_elapsed": 0.7705097859998205, "process_elapsed": 0.7850629389999995}}, {"m2l_workload": 47616000, "direct_workload": 0, "eval_part_workload": 0, 
"p2qbxl_workload": 1118188692, "m2qbxl_workload": 226896000, "m2p_workload": 0, "l2qbxl_workload": 15840000, "p2l_nboxes": 5760, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "p2l_workload": 27091200, "eval_qbxl_workload": 142560, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12758302099973662, "process_elapsed": 0.1273970490000238}, "eval_qbx_expansions": {"wall_elapsed": 0.2835366819999763, "process_elapsed": 0.28353914399997393}, "eval_locals": {"wall_elapsed": 0.001810083000236773, "process_elapsed": 0.0018113820000280612}, "multipole_to_local": {"wall_elapsed": 0.21933700199952, "process_elapsed": 1.4927009270000156}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6411107570020249, "process_elapsed": 0.6427057839999577}, "refine_locals": {"wall_elapsed": 0.019865717000357108, "process_elapsed": 0.019831113999998706}, "form_global_qbx_locals": {"wall_elapsed": 12.666067901998758, "process_elapsed": 98.85360947599997}, "coarsen_multipoles": {"wall_elapsed": 0.014492083999357419, "process_elapsed": 0.014402262000004384}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5464885890014557, "process_elapsed": 2.7702280599999938}, "eval_multipoles": {"wall_elapsed": 0.0007965080003486946, "process_elapsed": 0.010344065999987606}, "eval_direct": {"wall_elapsed": 0.0017223149989149533, "process_elapsed": 0.001723447999978589}, "form_locals": {"wall_elapsed": 0.7680267520008783, "process_elapsed": 0.7662016509999887}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 448000, "p2l_workload": 0, "m2l_workload": 0, "direct_workload": 0, "eval_qbxl_workload": 15120, "p2qbxl_workload": 33868800, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_nsource_boxes": 128, "eval_qbx_expansions": {"wall_elapsed": 0.027458508000563597, "process_elapsed": 0.027460470000000292}, "coarsen_multipoles": {"wall_elapsed": 2.592999953776598e-05, "process_elapsed": 2.6278000000878876e-05}, 
"form_multipoles": {"wall_elapsed": 0.014194509998560534, "process_elapsed": 0.014196669000001272}, "form_locals": {"wall_elapsed": 0.0002368280001974199, "process_elapsed": 0.00023786200000142088}, "refine_locals": {"wall_elapsed": 0.0007696419997955672, "process_elapsed": 0.0007711829999994535}, "form_global_qbx_locals": {"wall_elapsed": 0.4148323249992245, "process_elapsed": 3.136391765000001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06255740400047216, "process_elapsed": 0.0824586750000007}, "multipole_to_local": {"wall_elapsed": 0.0007675939978071256, "process_elapsed": 0.000769723000000333}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01681618300062837, "process_elapsed": 0.0716890939999999}, "eval_multipoles": {"wall_elapsed": 0.00012192499889351893, "process_elapsed": 0.00012304999999823707}, "eval_direct": {"wall_elapsed": 0.00014055599967832677, "process_elapsed": 0.00014181200000074057}, "eval_locals": {"wall_elapsed": 0.00016502899961778894, "process_elapsed": 0.00016628200000035065}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 448000, "p2l_workload": 0, "m2l_workload": 0, "direct_workload": 0, "eval_qbxl_workload": 15120, "p2qbxl_workload": 33868800, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 1680000, "direct_nsource_boxes": 128, "eval_qbx_expansions": {"wall_elapsed": 0.027217917999223573, "process_elapsed": 0.027210197999998798}, "coarsen_multipoles": {"wall_elapsed": 2.7272999432170764e-05, "process_elapsed": 2.7537000001132128e-05}, "form_multipoles": {"wall_elapsed": 0.012168246998044197, "process_elapsed": 0.012170330000000007}, "form_locals": {"wall_elapsed": 0.00025652499789430294, "process_elapsed": 0.0002578460000020044}, "refine_locals": {"wall_elapsed": 0.0007966619996295776, "process_elapsed": 0.0007986469999980983}, "form_global_qbx_locals": {"wall_elapsed": 0.41449507100151095, "process_elapsed": 3.124277467999999}, "translate_box_local_to_qbx_local": 
{"wall_elapsed": 0.06530420099988987, "process_elapsed": 0.08900481899999946}, "multipole_to_local": {"wall_elapsed": 0.0007791059997543925, "process_elapsed": 0.0007817149999986839}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.016263018998870393, "process_elapsed": 0.08408505000000233}, "eval_multipoles": {"wall_elapsed": 0.0001345959990430856, "process_elapsed": 0.00013584000000221863}, "eval_direct": {"wall_elapsed": 0.00014376900071511045, "process_elapsed": 0.0001450710000003852}, "eval_locals": {"wall_elapsed": 0.00017906200082506984, "process_elapsed": 0.00018038700000033714}}, {"m2qbxl_workload": 0, "nterms_fmm_total": 772000, "p2l_workload": 0, "m2l_workload": 1632000, "direct_workload": 0, "eval_qbxl_workload": 24300, "p2qbxl_workload": 90281520, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 0, "m2p_nboxes": 0, "l2qbxl_workload": 2700000, "direct_nsource_boxes": 3008, "eval_qbx_expansions": {"wall_elapsed": 0.04887341300127446, "process_elapsed": 0.05584138399999716}, "coarsen_multipoles": {"wall_elapsed": 2.4689998099347576e-05, "process_elapsed": 2.4934000002474477e-05}, "form_multipoles": {"wall_elapsed": 0.021673631999874488, "process_elapsed": 0.02167526300000233}, "form_locals": {"wall_elapsed": 0.0013548639999498846, "process_elapsed": 0.0030114619999999093}, "refine_locals": {"wall_elapsed": 0.004909453999061952, "process_elapsed": 0.006460705999998595}, "form_global_qbx_locals": {"wall_elapsed": 1.0763670570013346, "process_elapsed": 8.281176414999997}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.10042979500030924, "process_elapsed": 0.12154810500000224}, "multipole_to_local": {"wall_elapsed": 0.01164319700183114, "process_elapsed": 0.043857604999995914}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.033270312000240665, "process_elapsed": 0.11139830600000167}, "eval_multipoles": {"wall_elapsed": 0.00020722800036310218, "process_elapsed": 0.012455046999999553}, "eval_direct": {"wall_elapsed": 
0.00044317800166027155, "process_elapsed": 0.00044460899999876347}, "eval_locals": {"wall_elapsed": 0.0004815839984075865, "process_elapsed": 0.00048297599999713725}}, {"m2qbxl_workload": 148080000, "nterms_fmm_total": 3088000, "p2l_workload": 17414400, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 97200, "p2qbxl_workload": 493642242, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 2880, "l2qbxl_workload": 10800000, "direct_nsource_boxes": 18368, "eval_qbx_expansions": {"wall_elapsed": 0.17471831900002144, "process_elapsed": 0.1748417530000026}, "coarsen_multipoles": {"wall_elapsed": 0.01359383099952538, "process_elapsed": 0.013595676000001333}, "form_multipoles": {"wall_elapsed": 0.08885329399890907, "process_elapsed": 0.08904873100000543}, "form_locals": {"wall_elapsed": 0.5181811650018062, "process_elapsed": 0.5401015320000013}, "refine_locals": {"wall_elapsed": 0.0193349160017533, "process_elapsed": 0.019337051000000827}, "form_global_qbx_locals": {"wall_elapsed": 5.6473584810009925, "process_elapsed": 44.13173326100001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.43283446399982495, "process_elapsed": 0.45431766599999435}, "multipole_to_local": {"wall_elapsed": 0.21959404200060817, "process_elapsed": 1.4773259700000096}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.36450363600124547, "process_elapsed": 1.839162819000002}, "eval_multipoles": {"wall_elapsed": 0.0005835300016769907, "process_elapsed": 0.015605516999990243}, "eval_direct": {"wall_elapsed": 0.001544830000057118, "process_elapsed": 0.001546145999995474}, "eval_locals": {"wall_elapsed": 0.0018016889989667106, "process_elapsed": 0.0018037119999974038}}, {"m2qbxl_workload": 226896000, "nterms_fmm_total": 4720000, "p2l_workload": 27091200, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 142560, "p2qbxl_workload": 1118188692, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 
4896, "l2qbxl_workload": 15840000, "direct_nsource_boxes": 21560, "eval_qbx_expansions": {"wall_elapsed": 0.2554673249996995, "process_elapsed": 0.2548831920000083}, "coarsen_multipoles": {"wall_elapsed": 0.013426133000393747, "process_elapsed": 0.013428055999995081}, "form_multipoles": {"wall_elapsed": 0.12737836500127742, "process_elapsed": 0.12738107700000967}, "form_locals": {"wall_elapsed": 0.7677063330011151, "process_elapsed": 0.7686912520000106}, "refine_locals": {"wall_elapsed": 0.018829676000677864, "process_elapsed": 0.0190762870000043}, "form_global_qbx_locals": {"wall_elapsed": 12.738795529001436, "process_elapsed": 99.644129348}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5864635059988359, "process_elapsed": 0.5854574929999927}, "multipole_to_local": {"wall_elapsed": 0.23098103499978606, "process_elapsed": 1.500902835000005}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.540710029001275, "process_elapsed": 2.745416965999979}, "eval_multipoles": {"wall_elapsed": 0.0006263670002226718, "process_elapsed": 0.014304139000003602}, "eval_direct": {"wall_elapsed": 0.0016434970002592308, "process_elapsed": 0.0016445880000048874}, "eval_locals": {"wall_elapsed": 0.0017078709988709306, "process_elapsed": 0.0017091690000086146}}, {"m2qbxl_workload": 226896000, "nterms_fmm_total": 4720000, "p2l_workload": 27091200, "m2l_workload": 47616000, "direct_workload": 0, "eval_qbxl_workload": 142560, "p2qbxl_workload": 1118188692, "m2p_workload": 0, "eval_part_workload": 0, "p2l_nboxes": 5760, "m2p_nboxes": 4896, "l2qbxl_workload": 15840000, "direct_nsource_boxes": 21560, "eval_qbx_expansions": {"wall_elapsed": 0.2562931430002209, "process_elapsed": 0.2562958170000229}, "coarsen_multipoles": {"wall_elapsed": 0.013575992001278792, "process_elapsed": 0.013578048999988823}, "form_multipoles": {"wall_elapsed": 0.127262083999085, "process_elapsed": 0.12726020400000948}, "form_locals": {"wall_elapsed": 0.7632982899995113, "process_elapsed": 
0.7632958620000068}, "refine_locals": {"wall_elapsed": 0.018785588999890024, "process_elapsed": 0.01878766699999801}, "form_global_qbx_locals": {"wall_elapsed": 12.749122495999472, "process_elapsed": 99.54689662300001}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.5856983570010925, "process_elapsed": 0.5856943359999889}, "multipole_to_local": {"wall_elapsed": 0.2411785369986319, "process_elapsed": 1.523934061999995}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5380380780006817, "process_elapsed": 2.7199274079999896}, "eval_multipoles": {"wall_elapsed": 0.0006180829986988101, "process_elapsed": 0.017411200999987386}, "eval_direct": {"wall_elapsed": 0.0016484799998579547, "process_elapsed": 0.0016498760000160928}, "eval_locals": {"wall_elapsed": 0.0017064889998437138, "process_elapsed": 0.0017078759999833437}}, {"p2l_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "p2l_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 15120, "m2l_workload": 0, "m2p_workload": 0, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.014648365000539343, "process_elapsed": 0.014650504000000453}, "eval_qbx_expansions": {"wall_elapsed": 0.028573530000358005, "process_elapsed": 0.028575531999999626}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01594121800189896, "process_elapsed": 0.06301019599999957}, "eval_multipoles": {"wall_elapsed": 0.00014124999870546162, "process_elapsed": 0.00014257299999975714}, "form_global_qbx_locals": {"wall_elapsed": 0.415889525998864, "process_elapsed": 3.152819396}, "multipole_to_local": {"wall_elapsed": 0.0008315519989992026, "process_elapsed": 0.0008338300000003684}, "form_locals": {"wall_elapsed": 0.0002776020010060165, "process_elapsed": 0.0002789340000006746}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06354885899963847, "process_elapsed": 0.08562401699999977}, 
"coarsen_multipoles": {"wall_elapsed": 2.6239000362693332e-05, "process_elapsed": 2.668399999894433e-05}, "eval_direct": {"wall_elapsed": 0.0001438249983038986, "process_elapsed": 0.00014487600000023804}, "refine_locals": {"wall_elapsed": 0.0007603810008731671, "process_elapsed": 0.0007619090000003936}, "eval_locals": {"wall_elapsed": 0.00016837099974509329, "process_elapsed": 0.00016960200000060155}}, {"p2l_nboxes": 0, "l2qbxl_workload": 1680000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 33868800, "p2l_workload": 0, "direct_nsource_boxes": 128, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 15120, "m2l_workload": 0, "m2p_workload": 0, "nterms_fmm_total": 448000, "form_multipoles": {"wall_elapsed": 0.012285744000109844, "process_elapsed": 0.012279533000000953}, "eval_qbx_expansions": {"wall_elapsed": 0.027927285998885054, "process_elapsed": 0.027929128000000247}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.01500640099766315, "process_elapsed": 0.061567719999999326}, "eval_multipoles": {"wall_elapsed": 0.00013151700113667175, "process_elapsed": 0.00013257900000063216}, "form_global_qbx_locals": {"wall_elapsed": 0.4132579479992273, "process_elapsed": 3.1420789520000003}, "multipole_to_local": {"wall_elapsed": 0.0007470569998986321, "process_elapsed": 0.0007492009999996441}, "form_locals": {"wall_elapsed": 0.00024485500034643337, "process_elapsed": 0.0002460000000006346}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.06745604599927901, "process_elapsed": 0.09076768399999935}, "coarsen_multipoles": {"wall_elapsed": 2.470700019330252e-05, "process_elapsed": 2.5114999999686916e-05}, "eval_direct": {"wall_elapsed": 0.00014215699775377288, "process_elapsed": 0.00014338100000088616}, "refine_locals": {"wall_elapsed": 0.000750778999645263, "process_elapsed": 0.0007524110000023398}, "eval_locals": {"wall_elapsed": 0.00016190499991353136, "process_elapsed": 0.00016301199999801952}}, {"p2l_nboxes": 0, 
"l2qbxl_workload": 2700000, "direct_workload": 0, "m2qbxl_workload": 0, "p2qbxl_workload": 90281520, "p2l_workload": 0, "direct_nsource_boxes": 3008, "m2p_nboxes": 0, "eval_part_workload": 0, "eval_qbxl_workload": 24300, "m2l_workload": 1632000, "m2p_workload": 0, "nterms_fmm_total": 772000, "form_multipoles": {"wall_elapsed": 0.021994480001012562, "process_elapsed": 0.02252935800000344}, "eval_qbx_expansions": {"wall_elapsed": 0.0468985520019487, "process_elapsed": 0.0469004459999951}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.037229944000500836, "process_elapsed": 0.12695072000000707}, "eval_multipoles": {"wall_elapsed": 0.0001834110007621348, "process_elapsed": 0.00018498699999724977}, "form_global_qbx_locals": {"wall_elapsed": 1.0715511270009301, "process_elapsed": 8.245669204999999}, "multipole_to_local": {"wall_elapsed": 0.012459129999115248, "process_elapsed": 0.044035273999998736}, "form_locals": {"wall_elapsed": 0.0012082059984095395, "process_elapsed": 0.00120971399999803}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.10760574299820291, "process_elapsed": 0.1259072450000076}, "coarsen_multipoles": {"wall_elapsed": 2.7975002012681216e-05, "process_elapsed": 2.83579999980077e-05}, "eval_direct": {"wall_elapsed": 0.0004856249997828854, "process_elapsed": 0.00048697500000116634}, "refine_locals": {"wall_elapsed": 0.004943396999806282, "process_elapsed": 0.004945113000001555}, "eval_locals": {"wall_elapsed": 0.0004906579997623339, "process_elapsed": 0.0004921109999962425}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 10800000, "direct_workload": 0, "m2qbxl_workload": 148080000, "p2qbxl_workload": 493642242, "p2l_workload": 17414400, "direct_nsource_boxes": 18368, "m2p_nboxes": 2880, "eval_part_workload": 0, "eval_qbxl_workload": 97200, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 3088000, "form_multipoles": {"wall_elapsed": 0.08571038500122086, "process_elapsed": 0.0860022119999968}, "eval_qbx_expansions": 
{"wall_elapsed": 0.17934093500116433, "process_elapsed": 0.18099043000000847}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.37942328399913094, "process_elapsed": 1.9070648920000082}, "eval_multipoles": {"wall_elapsed": 0.000492429999212618, "process_elapsed": 0.014625257999995256}, "form_global_qbx_locals": {"wall_elapsed": 5.654556654000771, "process_elapsed": 44.21751857500001}, "multipole_to_local": {"wall_elapsed": 0.23715932500090275, "process_elapsed": 1.5620473709999985}, "form_locals": {"wall_elapsed": 0.5225906369996665, "process_elapsed": 0.5309461430000013}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.4364859279994562, "process_elapsed": 0.45337846200001053}, "coarsen_multipoles": {"wall_elapsed": 0.013538190001781913, "process_elapsed": 0.013539729999997974}, "eval_direct": {"wall_elapsed": 0.001511279999249382, "process_elapsed": 0.0015123169999995412}, "refine_locals": {"wall_elapsed": 0.020114442000704003, "process_elapsed": 0.020405819000004044}, "eval_locals": {"wall_elapsed": 0.002200300999902538, "process_elapsed": 0.0022019109999931175}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 15840000, "direct_workload": 0, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "p2l_workload": 27091200, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "eval_part_workload": 0, "eval_qbxl_workload": 142560, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12949872399804008, "process_elapsed": 0.1295011399999879}, "eval_qbx_expansions": {"wall_elapsed": 0.26377770700128167, "process_elapsed": 0.26379084600002045}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5667059379993589, "process_elapsed": 2.8107373960000217}, "eval_multipoles": {"wall_elapsed": 0.0006380379982147133, "process_elapsed": 0.02108191400000692}, "form_global_qbx_locals": {"wall_elapsed": 12.72665303499889, "process_elapsed": 99.514778799}, "multipole_to_local": {"wall_elapsed": 
0.23357510099958745, "process_elapsed": 1.5240983700000044}, "form_locals": {"wall_elapsed": 0.7732686960007413, "process_elapsed": 0.7745623440000173}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6227469330005988, "process_elapsed": 0.6227212100000088}, "coarsen_multipoles": {"wall_elapsed": 0.013453300000037416, "process_elapsed": 0.013455266999997662}, "eval_direct": {"wall_elapsed": 0.0016683400008332683, "process_elapsed": 0.0016693409999959385}, "refine_locals": {"wall_elapsed": 0.01893364100214967, "process_elapsed": 0.01893565800000374}, "eval_locals": {"wall_elapsed": 0.0017234650022146525, "process_elapsed": 0.0017244739999995318}}, {"p2l_nboxes": 5760, "l2qbxl_workload": 15840000, "direct_workload": 0, "m2qbxl_workload": 226896000, "p2qbxl_workload": 1118188692, "p2l_workload": 27091200, "direct_nsource_boxes": 21560, "m2p_nboxes": 4896, "eval_part_workload": 0, "eval_qbxl_workload": 142560, "m2l_workload": 47616000, "m2p_workload": 0, "nterms_fmm_total": 4720000, "form_multipoles": {"wall_elapsed": 0.12816052500056685, "process_elapsed": 0.12831986100002268}, "eval_qbx_expansions": {"wall_elapsed": 0.2602627809992555, "process_elapsed": 0.259980588000019}, "translate_box_multipoles_to_qbx_local": {"wall_elapsed": 0.5683883500005322, "process_elapsed": 2.859925634999996}, "eval_multipoles": {"wall_elapsed": 0.0006368619997374481, "process_elapsed": 0.02064926899998909}, "form_global_qbx_locals": {"wall_elapsed": 12.768018111000856, "process_elapsed": 99.855472591}, "multipole_to_local": {"wall_elapsed": 0.2329632920009317, "process_elapsed": 1.5199842549999971}, "form_locals": {"wall_elapsed": 0.7612577990003047, "process_elapsed": 0.760523242000005}, "translate_box_local_to_qbx_local": {"wall_elapsed": 0.6147937720015761, "process_elapsed": 0.6136677860000077}, "coarsen_multipoles": {"wall_elapsed": 0.013572886000474682, "process_elapsed": 0.013538662000001978}, "eval_direct": {"wall_elapsed": 0.0017914090003614547, "process_elapsed": 
0.0017926269999861688}, "refine_locals": {"wall_elapsed": 0.01885352499994042, "process_elapsed": 0.018819304999993847}, "eval_locals": {"wall_elapsed": 0.0016937519994826289, "process_elapsed": 0.0016949300000135281}}] \ No newline at end of file diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 80785a56..86e3546a 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -144,7 +144,7 @@ class QBXDistributedFMMLibExpansionWrangler( # {{{ Distributed GeoData class DistributedGeoData(object): - def __init__(self, geo_data, queue, global_wrangler, perf_model_file_path=None, + def __init__(self, geo_data, queue, global_wrangler, boxes_time, comm=MPI.COMM_WORLD): self.comm = comm current_rank = comm.Get_rank() @@ -194,18 +194,6 @@ class DistributedGeoData(object): # }}} if current_rank == 0: - from pytential.qbx.perf_model import QBXPerformanceModel - model = QBXPerformanceModel(queue.context, True) - # FIXME: If the expansion wrangler is not FMMLib, the argument - # 'uses_pde_expansions' might be different - - if perf_model_file_path is None: - model.load_default_model() - else: - model.loadjson(perf_model_file_path) - - boxes_time = model.predict_boxes_time(geo_data, global_wrangler) - from boxtree.distributed.partition import partition_work responsible_boxes_list = partition_work( boxes_time, traversal, comm.Get_size() @@ -616,7 +604,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): to_refined_connection=None, expansion_factory=None, target_association_tolerance=_not_provided, - perf_model_file_path=None, + cost_model=None, record_timing=False, # begin undocumented arguments @@ -638,7 +626,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): current_rank = self.comm.Get_rank() self.distributed_geo_data_cache = {} - self.perf_model_file_path = perf_model_file_path + self.cost_model = cost_model self.record_timing = record_timing if current_rank == 0: @@ -723,7 +711,7 @@ class 
DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): obj.__class__ = DistributedQBXLayerPotentialSource obj.comm = self.comm obj.distributed_geo_data_cache = self.distributed_geo_data_cache - obj.perf_model_file_path = self.perf_model_file_path + obj.cost_model = self.cost_model obj.record_timing = self.record_timing current_rank = self.comm.Get_rank() @@ -734,7 +722,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): return obj - def distibuted_geo_data(self, geo_data, queue, wrangler): + def distibuted_geo_data(self, geo_data, queue, wrangler, boxes_time): """ Note: This method needs to be called collectively by all processes of self.comm """ @@ -764,13 +752,13 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) distributed_geo_data = DistributedGeoData( - host_geo_data, queue, wrangler, - perf_model_file_path=self.perf_model_file_path, - comm=self.comm + host_geo_data, queue, wrangler, boxes_time, comm=self.comm ) else: - distributed_geo_data = DistributedGeoData(None, queue, None, self.comm) + distributed_geo_data = DistributedGeoData( + None, queue, None, None, self.comm + ) self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data diff --git a/pytential/qbx/perf_model.py b/pytential/qbx/perf_model.py deleted file mode 100644 index 49c05388..00000000 --- a/pytential/qbx/perf_model.py +++ /dev/null @@ -1,472 +0,0 @@ -import numpy as np -from boxtree.distributed.perf_model import PerformanceCounter, PerformanceModel -from collections import namedtuple - -QBXParameters = namedtuple( - "QBXParameters", - ['ncoeffs_fmm_by_level', - 'ncoeffs_qbx', - 'translation_source_power', - 'translation_target_power', - 'translation_max_power'] -) - - -class QBXPerformanceCounter(PerformanceCounter): - - def __init__(self, geo_data, wrangler, uses_pde_expansions): - self.geo_data = geo_data - self.traversal = geo_data.traversal() - self.wrangler = wrangler - 
self.uses_pde_expansions = uses_pde_expansions - - self.parameters = self.get_qbx_parameters( - self.traversal.tree.dimensions, - uses_pde_expansions, - wrangler.level_nterms, - wrangler.qbx_order - ) - - @staticmethod - def get_qbx_parameters(dimensions, use_pde_expansions, level_nterms, qbx_order): - fmm_parameters = PerformanceCounter.get_fmm_parameters( - dimensions, use_pde_expansions, level_nterms - ) - - if use_pde_expansions: - ncoeffs_qbx = qbx_order ** (dimensions - 1) - else: - ncoeffs_qbx = qbx_order ** dimensions - - return QBXParameters( - ncoeffs_fmm_by_level=fmm_parameters.ncoeffs_fmm_by_level, - ncoeffs_qbx=ncoeffs_qbx, - translation_source_power=fmm_parameters.translation_source_power, - translation_target_power=fmm_parameters.translation_target_power, - translation_max_power=fmm_parameters.translation_max_power, - ) - - def count_direct(self, use_global_idx=False, box_target_counts_nonchild=None): - """ - This method overwrites the one in parent class because only non-qbx targets - should be counted. - - :return: If *use_global_idx* is True, return a numpy array of shape - (tree.nboxes,) such that the ith entry represents the workload from - direct evaluation on box i. If *use_global_idx* is False, return a numpy - array of shape (ntarget_boxes,) such that the ith entry represents the - workload on *target_boxes* i. - """ - if box_target_counts_nonchild is None: - box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ - .box_target_counts_nonchild - - return super(QBXPerformanceCounter, self).count_direct( - use_global_idx=use_global_idx, - box_target_counts_nonchild=box_target_counts_nonchild - ) - - def count_m2p(self, use_global_idx=False, box_target_counts_nonchild=None): - """ - This method overwrites the one in parent class because only non-qbx targets - should be counted. 
- """ - if box_target_counts_nonchild is None: - box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ - .box_target_counts_nonchild - - return super(QBXPerformanceCounter, self).count_m2p( - use_global_idx=use_global_idx, - box_target_counts_nonchild=box_target_counts_nonchild - ) - - def count_eval_part(self, use_global_idx=False, box_target_counts_nonchild=None): - """ - This method overwrites the one in parent class because only non-qbx targets - should be counted. - """ - if box_target_counts_nonchild is None: - box_target_counts_nonchild = self.geo_data.non_qbx_box_target_lists()\ - .box_target_counts_nonchild - - return super(QBXPerformanceCounter, self).count_eval_part( - use_global_idx=use_global_idx, - box_target_counts_nonchild=box_target_counts_nonchild - ) - - def count_p2qbxl(self, use_global_idx=False): - geo_data = self.geo_data - traversal = self.traversal - tree = traversal.tree - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - - if use_global_idx: - np2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) - else: - ntarget_boxes = len(traversal.target_boxes) - np2qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) - - for tgt_icenter in geo_data.global_qbx_centers(): - itgt_box = qbx_center_to_target_box[tgt_icenter] - - np2qbxl_srcs = 0 - - # list 1 - start, end = traversal.neighbor_source_boxes_starts[ - itgt_box:itgt_box + 2] - for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: - np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] - - # list 3 close - if traversal.from_sep_close_smaller_starts is not None: - start, end = traversal.from_sep_close_smaller_starts[ - itgt_box:itgt_box + 2] - for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: - np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] - - # list 4 close - if traversal.from_sep_close_bigger_starts is not None: - # POSSIBLY USE INTERFACE WRONGLY - start, end = traversal.from_sep_close_bigger_starts[ - itgt_box:itgt_box + 2] 
- for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: - np2qbxl_srcs += tree.box_source_counts_nonchild[src_ibox] - - workload = np2qbxl_srcs * self.parameters.ncoeffs_qbx - - if use_global_idx: - np2qbxl[traversal.target_boxes[itgt_box]] += workload - else: - np2qbxl[itgt_box] += workload - - return np2qbxl - - def count_m2qbxl(self, use_global_idx=False): - geo_data = self.geo_data - traversal = self.traversal - tree = traversal.tree - global_qbx_centers = geo_data.global_qbx_centers() - - if use_global_idx: - nm2qbxl = np.zeros((tree.nboxes,), dtype=np.intp) - else: - ntarget_boxes = len(traversal.target_boxes) - nm2qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) - - for isrc_level, ssn in enumerate(traversal.from_sep_smaller_by_level): - - target_boxes_sep_smaller_current_level = \ - traversal.target_boxes_sep_smaller_by_source_level[isrc_level] - - cost_coefficient = self.xlat_cost( - self.wrangler.level_nterms[isrc_level], - self.wrangler.qbx_order, - self.parameters - ) - - qbx_center_to_target_box_current_level = \ - geo_data.qbx_center_to_target_box_source_level(isrc_level) - - for itgt_center, tgt_icenter in enumerate(global_qbx_centers): - icontaining_tgt_box = qbx_center_to_target_box_current_level[ - tgt_icenter - ] - - if icontaining_tgt_box == -1: - continue - - start, stop = ( - ssn.starts[icontaining_tgt_box], - ssn.starts[icontaining_tgt_box+1]) - - cost = (stop - start) * cost_coefficient - - if use_global_idx: - global_boxes_idx = \ - target_boxes_sep_smaller_current_level[icontaining_tgt_box] - nm2qbxl[global_boxes_idx] += cost - else: - target_boxes_idx = ssn.nonempty_indices[icontaining_tgt_box] - nm2qbxl[target_boxes_idx] += cost - - return nm2qbxl - - def count_l2qbxl(self, use_global_idx=False): - geo_data = self.geo_data - traversal = self.traversal - tree = traversal.tree - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - global_qbx_centers = geo_data.global_qbx_centers() - - if use_global_idx: - nl2qbxl = 
np.zeros(tree.nboxes, dtype=np.intp) - else: - ntarget_boxes = len(traversal.target_boxes) - nl2qbxl = np.zeros(ntarget_boxes, dtype=np.intp) - - for src_icenter in global_qbx_centers: - target_box_idx = qbx_center_to_target_box[src_icenter] - global_box_idx = traversal.target_boxes[target_box_idx] - - box_level = tree.box_levels[global_box_idx] - - cost = self.xlat_cost( - self.wrangler.level_nterms[box_level], - self.wrangler.qbx_order, - self.parameters - ) - - if use_global_idx: - nl2qbxl[global_box_idx] += cost - else: - nl2qbxl[target_box_idx] += cost - - return nl2qbxl - - def count_eval_qbxl(self, use_global_idx=False): - geo_data = self.geo_data - traversal = self.traversal - tree = traversal.tree - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - global_qbx_centers = geo_data.global_qbx_centers() - center_to_targets_starts = geo_data.center_to_tree_targets().starts - - if use_global_idx: - neval_qbxl = np.zeros((tree.nboxes,), dtype=np.intp) - else: - ntarget_boxes = len(traversal.target_boxes) - neval_qbxl = np.zeros((ntarget_boxes,), dtype=np.intp) - - for src_icenter in global_qbx_centers: - start, end = center_to_targets_starts[src_icenter:src_icenter+2] - cost = (end - start) * self.parameters.ncoeffs_qbx - - target_box_idx = qbx_center_to_target_box[src_icenter] - - if use_global_idx: - global_box_idx = traversal.target_boxes[target_box_idx] - neval_qbxl[global_box_idx] += cost - else: - neval_qbxl[target_box_idx] += cost - - return neval_qbxl - - -class QBXPerformanceModel(PerformanceModel): - - def __init__(self, cl_context, uses_pde_expansions): - super(QBXPerformanceModel, self).__init__( - cl_context, uses_pde_expansions - ) - - def time_performance(self, traversal, wrangler): - raise NotImplementedError("Please use time_qbx_performance instead.") - - def time_qbx_performance(self, queue, bound_op, context): - timing_data = {} - - def expansion_wrangler_inspector(wrangler): - counter = QBXPerformanceCounter( - wrangler.geo_data, 
wrangler, self.uses_pde_expansions - ) - traversal = wrangler.geo_data.traversal() - - nm2p, nm2p_boxes = counter.count_m2p() - - from pytential.qbx.fmm import add_dicts - timing_data.update(add_dicts(timing_data, { - "nterms_fmm_total": counter.count_nters_fmm_total(), - "direct_workload": np.sum(counter.count_direct()), - "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], - "m2l_workload": np.sum(counter.count_m2l()), - "m2p_workload": np.sum(nm2p), - "m2p_nboxes": np.sum(nm2p_boxes), - "p2l_workload": np.sum(counter.count_p2l()), - "p2l_nboxes": np.sum(counter.count_p2l_source_boxes()), - "eval_part_workload": np.sum(counter.count_eval_part()), - "p2qbxl_workload": np.sum(counter.count_p2qbxl()), - "m2qbxl_workload": np.sum(counter.count_m2qbxl()), - "l2qbxl_workload": np.sum(counter.count_l2qbxl()), - "eval_qbxl_workload": np.sum(counter.count_eval_qbxl()) - })) - - from pytential.symbolic.primitives import DEFAULT_SOURCE - bound_op.places[DEFAULT_SOURCE].bind_expansion_wrangler_inspector( - expansion_wrangler_inspector - ) - - bound_op.eval(queue, context=context, timing_data=timing_data) - - self.time_result.append(timing_data) - - def form_global_qbx_locals_model(self, wall_time=True): - return self.linear_regression( - "form_global_qbx_locals", ["p2qbxl_workload"], - wall_time=wall_time - ) - - def translate_box_multipoles_to_qbx_local_model(self, wall_time=True): - return self.linear_regression( - "translate_box_multipoles_to_qbx_local", ["m2qbxl_workload"], - wall_time=wall_time - ) - - def translate_box_local_to_qbx_local_model(self, wall_time=True): - return self.linear_regression( - "translate_box_local_to_qbx_local", ["l2qbxl_workload"], - wall_time=wall_time - ) - - def eval_qbx_expansions_model(self, wall_time=True): - return self.linear_regression( - "eval_qbx_expansions", ["eval_qbxl_workload"], - wall_time=wall_time - ) - - def predict_boxes_time(self, geo_data, wrangler): - boxes_time = super(QBXPerformanceModel, 
self).predict_boxes_time( - geo_data.traversal(), wrangler - ) - - counter = QBXPerformanceCounter(geo_data, wrangler, self.uses_pde_expansions) - - # {{{ form_global_qbx_locals time - - param = self.form_global_qbx_locals_model() - - p2qbxl_workload = counter.count_p2qbxl(use_global_idx=True) - - boxes_time += (p2qbxl_workload * param[0] + param[1]) - - # }}} - - # {{{ translate_box_multipoles_to_qbx_local time - - param = self.translate_box_multipoles_to_qbx_local_model() - - m2qbxl_workload = counter.count_m2qbxl(use_global_idx=True) - - boxes_time += (m2qbxl_workload * param[0] + param[1]) - - # }}} - - # {{{ translate_box_local_to_qbx_local time - - param = self.translate_box_local_to_qbx_local_model() - - l2qbxl_workload = counter.count_l2qbxl(use_global_idx=True) - - boxes_time += (l2qbxl_workload * param[0] + param[1]) - - # }}} - - # {{{ eval_qbx_expansions time - - param = self.eval_qbx_expansions_model() - - eval_qbxl_workload = counter.count_eval_qbxl(use_global_idx=True) - - boxes_time += (eval_qbxl_workload * param[0] + param[1]) - - # }}} - - return boxes_time - - def predict_step_time(self, eval_counter, wall_time=True): - predict_timing = super(QBXPerformanceModel, self).predict_step_time( - eval_counter, wall_time=wall_time - ) - - # {{{ Predict form_global_qbx_locals time - - param = self.form_global_qbx_locals_model(wall_time=wall_time) - - p2qbxl_workload = np.sum(eval_counter.count_p2qbxl()) - - predict_timing["form_global_qbx_locals"] = ( - p2qbxl_workload * param[0] + param[1] - ) - - # }}} - - # {{{ Predict translate_box_multipoles_to_qbx_local time - - param = self.translate_box_multipoles_to_qbx_local_model(wall_time=wall_time) - - m2qbxl_workload = np.sum(eval_counter.count_m2qbxl()) - - predict_timing["translate_box_multipoles_to_qbx_local"] = ( - m2qbxl_workload * param[0] + param[1] - ) - - # }}} - - # {{{ Predict translate_box_local_to_qbx_local time - - param = self.translate_box_local_to_qbx_local_model(wall_time=wall_time) - - 
l2qbxl_workload = np.sum(eval_counter.count_l2qbxl()) - - predict_timing["translate_box_local_to_qbx_local"] = ( - l2qbxl_workload * param[0] + param[1] - ) - - # }}} - - # {{{ Predict eval_qbx_expansions time - - param = self.eval_qbx_expansions_model(wall_time=wall_time) - - eval_qbxl_workload = np.sum(eval_counter.count_eval_qbxl()) - - predict_timing["eval_qbx_expansions"] = ( - eval_qbxl_workload * param[0] + param[1] - ) - - # }}} - - return predict_timing - - def evaluate_model(self, queue, bound_op, context, wall_time=True): - predict_timing = {} - - def expansion_wrangler_inspector(wrangler): - eval_counter = QBXPerformanceCounter( - wrangler.geo_data, wrangler, self.uses_pde_expansions - ) - - from pytential.qbx.fmm import add_dicts - predict_timing.update(add_dicts( - predict_timing, - self.predict_step_time(eval_counter, wall_time=wall_time) - )) - - from pytential.symbolic.primitives import DEFAULT_SOURCE - bound_op.places[DEFAULT_SOURCE].bind_expansion_wrangler_inspector( - expansion_wrangler_inspector - ) - - actual_timing = {} - bound_op.eval(queue, context=context, timing_data=actual_timing) - - for field in ["eval_direct", "multipole_to_local", "eval_multipoles", - "form_locals", "eval_locals", "form_global_qbx_locals", - "translate_box_multipoles_to_qbx_local", - "translate_box_local_to_qbx_local", "eval_qbx_expansions"]: - predict_time_field = predict_timing[field] - - if wall_time: - true_time_field = actual_timing[field]["wall_elapsed"] - else: - true_time_field = actual_timing[field]["process_elapsed"] - - diff = abs(predict_time_field - true_time_field) - - print(field + ": predict " + str(predict_time_field) + " actual " - + str(true_time_field) + " error " + str(diff / true_time_field)) - - def load_default_model(self): - import os - current_dir = os.path.dirname(os.path.abspath(__file__)) - default_perf_file_path = os.path.join(current_dir, 'default_perf_model.json') - self.loadjson(default_perf_file_path) diff --git 
a/test/distributed/test_layer_pot_identity.py b/test/distributed/test_layer_pot_identity.py index f0ca6e8a..38af2989 100644 --- a/test/distributed/test_layer_pot_identity.py +++ b/test/distributed/test_layer_pot_identity.py @@ -168,7 +168,7 @@ if current_rank == 0: else: while True: lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None, None) from pytential.qbx.distributed import drive_dfmm wrangler = None diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index 85029b2c..11a6473f 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -84,7 +84,7 @@ if current_rank == 0: # master rank else: # helper rank lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None) + distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None, None) from pytential.qbx.distributed import drive_dfmm wrangler = None -- GitLab From 33ad39c1846fab048a739331969f56abf39a1572 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 21 Feb 2019 22:18:54 -0600 Subject: [PATCH 48/86] Remove expansion_wrangler_inspector --- pytential/qbx/__init__.py | 22 ---------------------- pytential/qbx/distributed.py | 4 ---- 2 files changed, 26 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 2810248a..89ddd555 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -85,7 +85,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _tree_kind="adaptive", _use_target_specific_qbx=False, geometry_data_inspector=None, - expansion_wrangler_inspector=None, performance_model=None, cost_model=None, fmm_backend="sumpy", @@ -208,7 +207,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): self._tree_kind = _tree_kind 
self._use_target_specific_qbx = _use_target_specific_qbx self.geometry_data_inspector = geometry_data_inspector - self.expansion_wrangler_inspector = expansion_wrangler_inspector self.performance_model = performance_model self.cost_model = cost_model @@ -234,7 +232,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): _tree_kind=None, _use_target_specific_qbx=_not_provided, geometry_data_inspector=None, - expansion_wrangler_inspector=None, performance_model=_not_provided, cost_model=_not_provided, fmm_backend=None, @@ -323,14 +320,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else self._use_target_specific_qbx), geometry_data_inspector=( geometry_data_inspector or self.geometry_data_inspector), - expansion_wrangler_inspector=( - expansion_wrangler_inspector or self.expansion_wrangler_inspector - ), - performance_model=( - # None is a valid value here - performance_model - if performance_model is not _not_provided - else self.performance_model), cost_model=( # None is a valid value here cost_model @@ -861,9 +850,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): if not perform_fmm: return [(o.name, 0) for o in insn.outputs] - if self.expansion_wrangler_inspector is not None: - self.expansion_wrangler_inspector(wrangler) - # }}} # {{{ execute global QBX @@ -1091,14 +1077,6 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # }}} - def bind_expansion_wrangler_inspector(self, inspector): - if self.expansion_wrangler_inspector is not None: - raise NotImplementedError("Cannot bind multiple inspectors.") - - self.expansion_wrangler_inspector = inspector - - # }}} - # }}} diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 86e3546a..d658a58a 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -619,7 +619,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=None, _tree_kind="adaptive", geometry_data_inspector=None, - 
expansion_wrangler_inspector=None, target_stick_out_factor=_not_provided): self.comm = comm @@ -655,7 +654,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_min_nsources_cumul=0, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, - expansion_wrangler_inspector=expansion_wrangler_inspector, fmm_backend='distributed', target_stick_out_factor=target_stick_out_factor ) @@ -676,7 +674,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=None, _tree_kind=None, geometry_data_inspector=None, - expansion_wrangler_inspector=None, performance_model=_not_provided, fmm_backend=None, @@ -700,7 +697,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=_from_sep_smaller_crit, _tree_kind=_tree_kind, geometry_data_inspector=geometry_data_inspector, - expansion_wrangler_inspector=expansion_wrangler_inspector, fmm_backend=fmm_backend, debug=debug, -- GitLab From 027292352fc2fdc37db9210108d753f32c23d95f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 22 Feb 2019 22:39:52 -0600 Subject: [PATCH 49/86] Try disable recv mprobe --- pytential/qbx/distributed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index d658a58a..6ac5df52 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,3 +1,6 @@ +import mpi4py +mpi4py.rc.recv_mprobe = False + from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler -- GitLab From b830e192302c61d1367bcd0ebcdd60ee13dceb00 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 22 Feb 2019 23:54:18 -0600 Subject: [PATCH 50/86] Try fix pickle error --- pytential/qbx/distributed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytential/qbx/distributed.py 
b/pytential/qbx/distributed.py index 6ac5df52..98440f26 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -471,7 +471,8 @@ class DistributedGeoData(object): # }}} - MPI.Request.Waitall(reqs) + for req in reqs: + req.wait() local_centers = local_centers[0] local_global_qbx_centers = local_global_qbx_centers[0] -- GitLab From da629063208e5f05779101dec97a040932e99249 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 25 Feb 2019 15:41:41 -0800 Subject: [PATCH 51/86] Revert "Try fix pickle error" This reverts commit b830e192302c61d1367bcd0ebcdd60ee13dceb00. --- pytential/qbx/distributed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 98440f26..6ac5df52 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -471,8 +471,7 @@ class DistributedGeoData(object): # }}} - for req in reqs: - req.wait() + MPI.Request.Waitall(reqs) local_centers = local_centers[0] local_global_qbx_centers = local_global_qbx_centers[0] -- GitLab From 888a7b496723631f1da9b661803b973d0d4eb53a Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 25 Feb 2019 15:41:50 -0800 Subject: [PATCH 52/86] Revert "Try disable recv mprobe" This reverts commit 027292352fc2fdc37db9210108d753f32c23d95f. 
--- pytential/qbx/distributed.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 6ac5df52..d658a58a 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,6 +1,3 @@ -import mpi4py -mpi4py.rc.recv_mprobe = False - from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource, _not_provided from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler -- GitLab From 7d86a80915b3da1b664d8effa145d62658568314 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 28 Apr 2019 21:59:40 -0500 Subject: [PATCH 53/86] Remove record_timing option --- pytential/qbx/__init__.py | 3 +-- pytential/qbx/distributed.py | 41 +++--------------------------------- 2 files changed, 4 insertions(+), 40 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 16522af5..3f2cdead 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -882,8 +882,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): from pytential.qbx.distributed import drive_dfmm all_potentials_on_every_tgt = drive_dfmm( - queue, strengths, distributed_geo_data, comm=self.comm, - record_timing=self.record_timing + queue, strengths, distributed_geo_data, comm=self.comm ) else: from pytential.qbx.fmm import drive_fmm diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index d658a58a..976988c5 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -9,7 +9,6 @@ import pyopencl as cl import logging import time from boxtree.tools import return_timing_data -from boxtree.distributed.util import TimeRecorder logger = logging.getLogger(__name__) @@ -605,7 +604,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): expansion_factory=None, target_association_tolerance=_not_provided, cost_model=None, - record_timing=False, # begin undocumented arguments # FIXME 
default debug=False once everything has matured @@ -626,7 +624,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): self.distributed_geo_data_cache = {} self.cost_model = cost_model - self.record_timing = record_timing if current_rank == 0: self.next_geo_data_id = 0 @@ -708,7 +705,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): obj.comm = self.comm obj.distributed_geo_data_cache = self.distributed_geo_data_cache obj.cost_model = self.cost_model - obj.record_timing = self.record_timing current_rank = self.comm.Get_rank() @@ -765,8 +761,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, - _communicate_mpoles_via_allreduce=False, - record_timing=False): + _communicate_mpoles_via_allreduce=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -775,18 +770,10 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, if current_rank == 0: start_time = time.time() - if record_timing: - distribute_wrangler_recorder = TimeRecorder( - "Distribute wrangler", comm, logger - ) - distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( queue, global_wrangler, distributed_geo_data) wrangler = distributed_wrangler - if record_timing: - distribute_wrangler_recorder.record() - local_traversal = distributed_geo_data.local_trav # {{{ Distribute source weights @@ -797,8 +784,7 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, from boxtree.distributed.calculation import distribute_source_weights local_source_weights = distribute_source_weights( - src_weights, distributed_geo_data.local_data, comm=comm, - record_timing=record_timing + src_weights, distributed_geo_data.local_data, comm=comm ) # }}} @@ -830,16 +816,10 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, comm.Allreduce(mpole_exps, mpole_exps_all) mpole_exps = 
mpole_exps_all else: - communicate_mpoles( - wrangler, comm, local_traversal, mpole_exps, - record_timing=record_timing - ) + communicate_mpoles(wrangler, comm, local_traversal, mpole_exps) # }}} - if record_timing: - interaction_timer = time.time() - # {{{ direct evaluation from neighbor source boxes ("list 1") non_qbx_potentials = wrangler.eval_direct( @@ -932,18 +912,6 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, # }}} - if record_timing: - logger.info( - "Interaction calculation finished on process {0} in {1} secs".format( - comm.Get_rank(), time.time() - interaction_timer - ) - ) - - if record_timing: - send_potential_to_root_recorder = TimeRecorder( - "Send potential to root", comm, logger - ) - if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) @@ -1002,9 +970,6 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) - if record_timing: - send_potential_to_root_recorder.record() - if current_rank == 0: logger.info("Distributed FMM evaluation finished in {} secs.".format( time.time() - start_time)) -- GitLab From 432a286364f519957e8c10e3359e069c836a4f0f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 7 May 2019 00:57:47 -0500 Subject: [PATCH 54/86] Temporary solution for terminating slave nodes --- pytential/qbx/distributed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 976988c5..fc772d44 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -735,6 +735,9 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): geo_data_id = self.comm.bcast(geo_data_id, root=0) + if geo_data_id == -1: + return None + if geo_data_id in self.distributed_geo_data_cache: return 
self.distributed_geo_data_cache[geo_data_id] -- GitLab From b821c970b20579e998dcb5dbb51eb0a89300fca7 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 29 Jul 2019 00:12:38 -0500 Subject: [PATCH 55/86] Estimate kernel-specific parameters --- pytential/qbx/cost.py | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/pytential/qbx/cost.py b/pytential/qbx/cost.py index 22e957db..9758a818 100644 --- a/pytential/qbx/cost.py +++ b/pytential/qbx/cost.py @@ -724,4 +724,51 @@ class PythonQBXCostModel(AbstractQBXCostModel, PythonFMMCostModel): # }}} + +def generate_parameters_output(queue, model_costs, real_costs): + """Get kernel-specific calibration parameters from samples of model costs and + real costs. + + :arg queue: a :class:`pyopencl.CommandQueue` object on which the cost model is + created. + :arg model_costs: a :class:`list` of modeled costs. Each model cost can be + obtained from `BoundExpression.get_modeled_cost`. + :arg real_costs: a :class:`list` of timing data. Each timing data can be obtained + from `BoundExpression.eval`. + :return: a :class:`dict` which maps kernels to calibration parameters. 
+ """ + cost_per_kernel = {} + params_per_kernel = {} + + assert len(model_costs) == len(real_costs) + + for icase in range(len(model_costs)): + model_cost = model_costs[icase] + real_cost = real_costs[icase] + + for insn in real_cost: + assert (insn in model_cost) + + knls = tuple(knl for knl in insn.kernels) + + if knls not in cost_per_kernel: + cost_per_kernel[knls] = { + "model_costs": [], + "real_costs": [] + } + + cost_per_kernel[knls]["model_costs"].append(model_cost[insn]) + cost_per_kernel[knls]["real_costs"].append(real_cost[insn]) + + cost_model = CLQBXCostModel( + queue, CLQBXCostModel.get_constantone_calibration_params() + ) + + for knls in cost_per_kernel: + params_per_kernel[knls] = cost_model.estimate_calibration_params( + cost_per_kernel[knls]["model_costs"], cost_per_kernel[knls]["real_costs"] + ) + + return params_per_kernel + # vim: foldmethod=marker -- GitLab From ba88743114c4d8a9fc970cf750738b7f4ae899e1 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 3 Sep 2019 00:23:38 -0500 Subject: [PATCH 56/86] Fix bug --- pytential/qbx/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 12e15807..4292b18b 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -578,9 +578,9 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): def drive_cost_model( wrangler, strengths, geo_data, kernel, kernel_arguments): del strengths - - cost_model_result = ( - self.cost_model(wrangler, geo_data, kernel, kernel_arguments)) + cost_model_result = self.cost_model( + geo_data, kernel, kernel_arguments, calibration_params + ) from pytools.obj_array import with_object_array_or_scalar output_placeholder = with_object_array_or_scalar( -- GitLab From 5bb0a50be48b8a7bce766c8ee5c3d99be7c9e733 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 9 Sep 2019 17:48:54 -0500 Subject: [PATCH 57/86] Temporarily mark rotation as not implemented --- 
pytential/qbx/distributed.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 7eb0e7e4..26b3e66c 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -589,6 +589,12 @@ class DistributedGeoData(object): def qbx_center_to_target_box_source_level(self, source_level): return self._qbx_center_to_target_box_source_level[source_level] + def m2l_rotation_lists(self): + raise NotImplementedError + + def m2l_rotation_angles(self): + raise NotImplementedError + # }}} @@ -619,6 +625,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind="adaptive", + _use_target_specific_qbx=None, geometry_data_inspector=None, target_stick_out_factor=_not_provided): @@ -652,6 +659,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _from_sep_smaller_crit=_from_sep_smaller_crit, _from_sep_smaller_min_nsources_cumul=0, _tree_kind=_tree_kind, + _use_target_specific_qbx=_use_target_specific_qbx, geometry_data_inspector=geometry_data_inspector, fmm_backend='distributed', target_stick_out_factor=target_stick_out_factor, @@ -674,6 +682,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _box_extent_norm=None, _from_sep_smaller_crit=None, _tree_kind=None, + _use_target_specific_qbx=_not_provided, geometry_data_inspector=None, fmm_backend=None, cost_model=_not_provided, @@ -698,6 +707,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): _box_extent_norm=_box_extent_norm, _from_sep_smaller_crit=_from_sep_smaller_crit, _tree_kind=_tree_kind, + _use_target_specific_qbx=_use_target_specific_qbx, geometry_data_inspector=geometry_data_inspector, fmm_backend=fmm_backend, cost_model=cost_model, -- GitLab From bdfdba2d7e5cc1bd1b598664d7bee45c26a1f797 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 9 Sep 2019 18:30:58 -0500 Subject: [PATCH 58/86] Handle RotationClass 
--- pytential/qbx/distributed.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 26b3e66c..19ed2ba4 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -9,6 +9,7 @@ import pyopencl as cl import logging import time from boxtree.tools import return_timing_data +from pytools import memoize_method logger = logging.getLogger(__name__) @@ -152,6 +153,7 @@ class DistributedGeoData(object): total_rank = comm.Get_size() self.global_wrangler = global_wrangler + self.queue = queue if geo_data is not None: # master process traversal = geo_data.traversal() @@ -589,11 +591,24 @@ class DistributedGeoData(object): def qbx_center_to_target_box_source_level(self, source_level): return self._qbx_center_to_target_box_source_level[source_level] + @memoize_method + def build_rotation_classes_lists(self): + trav = self.traversal().to_device(self.queue) + tree = self.tree().to_device(self.queue) + + from boxtree.rotation_classes import RotationClassesBuilder + return RotationClassesBuilder(self.queue.context)( + self.queue, trav, tree)[0].get(self.queue) + + @memoize_method def m2l_rotation_lists(self): - raise NotImplementedError + return self.build_rotation_classes_lists().from_sep_siblings_rotation_classes + @memoize_method def m2l_rotation_angles(self): - raise NotImplementedError + return (self + .build_rotation_classes_lists() + .from_sep_siblings_rotation_class_to_angle) # }}} -- GitLab From e23b3766a47222c72c8040b288f723f93101ad96 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 11 Sep 2019 17:53:46 -0500 Subject: [PATCH 59/86] Supply default calibration parameters when not supplied by user --- pytential/qbx/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 4292b18b..9311c88d 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py 
@@ -749,18 +749,20 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): cost_model = self.cost_model if self.knl_specific_calibration_params is None: - raise RuntimeError( - "Must specify kernel-specific calibration parameters when using " - "distributed FMM." + import warnings + warnings.warn( + "Kernel-specific calibration parameters are not supplied when" + "using distributed FMM." ) - - knls = tuple(knl for knl in insn.kernels) - - if (isinstance(self.knl_specific_calibration_params, str) + # TODO: supply better default calibration parameters + calibration_params = \ + AbstractQBXCostModel.get_constantone_calibration_params() + elif (isinstance(self.knl_specific_calibration_params, str) and self.knl_specific_calibration_params == "constant_one"): calibration_params = \ AbstractQBXCostModel.get_constantone_calibration_params() else: + knls = tuple(knl for knl in insn.kernels) calibration_params = self.knl_specific_calibration_params[knls] kernel_args = {} -- GitLab From f16607c363faf0805be7798906269c898dc1cb3d Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 11 Sep 2019 18:07:26 -0500 Subject: [PATCH 60/86] Retry pylint --- .gitlab-ci.yml | 2 +- .test-conda-env-py3.yml | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 58e8a73d..617af34e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -135,7 +135,7 @@ Pylint: - export PY_EXE=python3 # Pin to numpy 1.15 # See https://github.com/PyCQA/pylint/issues/2721 - - EXTRA_INSTALL="Cython pybind11 numpy==1.15 mako matplotlib" + - EXTRA_INSTALL="Cython pybind11 numpy==1.15 mako matplotlib mpi4py" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - ". 
./prepare-and-run-pylint.sh pytential test/test_*.py" tags: diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 19d8881c..a4205fb2 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -18,7 +18,7 @@ dependencies: - pip - pip: - - git+https://gitlab.tiker.net/inducer/boxtree + - git+https://gitlab.tiker.net/inducer/boxtree@distributed-fmm-global - git+https://github.com/inducer/pymbolic - git+https://github.com/inducer/loopy - git+https://gitlab.tiker.net/inducer/sumpy diff --git a/requirements.txt b/requirements.txt index bbbf6dc1..d7e11128 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/inducer/modepy git+https://github.com/inducer/pyopencl git+https://github.com/inducer/islpy git+https://github.com/inducer/loopy -git+https://gitlab.tiker.net/inducer/boxtree@opencl-counter +git+https://gitlab.tiker.net/inducer/boxtree@distributed-fmm-global git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy git+https://gitlab.tiker.net/inducer/pyfmmlib -- GitLab From b9c6dec939d0e5ea06045f7d1a403ebe0fc10ac1 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 20 Sep 2019 00:58:20 -0500 Subject: [PATCH 61/86] Automatically detect the number of FMMs on worker ranks --- pytential/symbolic/compiler.py | 82 +++++++++++++++++++++++ pytential/symbolic/execution.py | 75 +++++++++++++++++++++ test/distributed/test_off_surface_eval.py | 30 ++++----- 3 files changed, 171 insertions(+), 16 deletions(-) diff --git a/pytential/symbolic/compiler.py b/pytential/symbolic/compiler.py index 86b10d24..13f6d2c5 100644 --- a/pytential/symbolic/compiler.py +++ b/pytential/symbolic/compiler.py @@ -417,6 +417,88 @@ class Code(object): # }}} + +class DistributedCode(Code): + def __init__(self, comm, instructions, result): + Code.__init__(self, instructions, result) + self.comm = comm + + def execute(self, exec_mapper, pre_assign_check=None): + insn_type = { + "ASSIGN": 0, + "COMPUTE_POTENTIAL": 1, + 
"TERMINAL": 2 + } + + if self.comm.Get_rank() == 0: + context = exec_mapper.context + + done_insns = set() + + while True: + try: + insn, discardable_vars = self.get_next_step( + frozenset(list(context.keys())), + frozenset(done_insns)) + + except self.NoInstructionAvailable: + # no available instructions: we're done + self.comm.bcast(insn_type["TERMINAL"], root=0) + break + else: + for name in discardable_vars: + del context[name] + + if isinstance(insn, ComputePotentialInstruction): + self.comm.bcast(insn_type["COMPUTE_POTENTIAL"], root=0) + else: + self.comm.bcast(insn_type["ASSIGN"], root=0) + + done_insns.add(insn) + assignments = ( + self.get_exec_function(insn, exec_mapper) + (exec_mapper.queue, insn, exec_mapper.bound_expr, + exec_mapper)) + + assignees = insn.get_assignees() + for target, value in assignments: + if pre_assign_check is not None: + pre_assign_check(target, value) + + assert target in assignees + context[target] = value + + if len(done_insns) < len(self.instructions): + print("Unreachable instructions:") + for insn in set(self.instructions) - done_insns: + print(" ", str(insn).replace("\n", "\n ")) + from pymbolic import var + print(" missing: ", ", ".join( + str(s) for s in + set(insn.get_dependencies()) + - set(var(v) for v in six.iterkeys(context)))) + + raise RuntimeError("not all instructions are reachable" + "--did you forget to pass a value for a placeholder?") + + from pytools.obj_array import with_object_array_or_scalar + return with_object_array_or_scalar(exec_mapper, self.result) + + else: + while True: + current_insn_type = self.comm.bcast(None, root=0) + + if current_insn_type == insn_type["ASSIGN"]: + continue + elif current_insn_type == insn_type["TERMINAL"]: + break + elif current_insn_type == insn_type["COMPUTE_POTENTIAL"]: + exec_mapper.exec_compute_potential_insn( + exec_mapper.queue, None, None, None + ) + else: + raise RuntimeError("Unknown instruction type") + # }}} diff --git a/pytential/symbolic/execution.py 
b/pytential/symbolic/execution.py index db5d1e7e..c0ccf639 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -322,6 +322,41 @@ class EvaluationMapper(EvaluationMapperBase): return result + +class DistributedEvaluationMapper(EvaluationMapper): + def __init__(self, comm, bound_expr, queue, context=None, timing_data=None): + self.comm = comm + + if timing_data is not None: + raise NotImplementedError + + if comm.Get_rank() == 0: + EvaluationMapper.__init__(self, bound_expr, queue, context, timing_data) + else: + self.queue = queue + + def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + if self.comm.Get_rank() == 0: + result = EvaluationMapper.exec_compute_potential_insn( + self, queue, insn, bound_expr, evaluate + ) + else: + # TODO: use evaluation mapper for launching FMM in the following code + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + lp_source = DistributedQBXLayerPotentialSource(self.comm, None, None) + + distribute_geo_data = lp_source.distibuted_geo_data( + None, queue, None, None + ) + + from pytential.qbx.distributed import drive_dfmm + weights = None + drive_dfmm(queue, weights, distribute_geo_data, comm=self.comm) + + result = None + + return result + # }}} @@ -745,6 +780,30 @@ class BoundExpression(object): return self.eval(queue, args) +class DistributedBoundExpression(BoundExpression): + def __init__(self, comm, places, sym_op_expr): + self.comm = comm + rank = comm.Get_rank() + + from pytential.symbolic.compiler import DistributedCode + + if rank == 0: + BoundExpression.__init__(self, places, sym_op_expr) + self.code = DistributedCode( + comm, self.code.instructions, self.code.result + ) + else: + self.code = DistributedCode(comm, None, None) + + def eval(self, queue, context=None, timing_data=None): + if context is None: + context = {} + exec_mapper = DistributedEvaluationMapper( + self.comm, self, queue, context, timing_data=timing_data + ) + return 
self.code.execute(exec_mapper) + + def bind(places, expr, auto_where=None): """ :arg places: a :class:`pytential.symbolic.execution.GeometryCollection`. @@ -765,6 +824,22 @@ def bind(places, expr, auto_where=None): return BoundExpression(places, expr) + +def bind_distributed(comm, places, expr, auto_where=None): + """ + Same as bind, except `places` and `expr` can be None on worker ranks. + """ + + if comm.Get_rank() == 0: + if not isinstance(places, GeometryCollection): + places = GeometryCollection(places, auto_where=auto_where) + expr = _prepare_expr(places, expr) + else: + places = None + expr = None + + return DistributedBoundExpression(comm, places, expr) + # }}} diff --git a/test/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py index 2c30b19f..d0fd6822 100644 --- a/test/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -5,6 +5,7 @@ import functools from sympy.core.cache import clear_cache import numpy as np from pytential.qbx.distributed import DistributedQBXLayerPotentialSource +from pytential.symbolic.execution import bind_distributed from meshmode.discretization import Discretization from meshmode.discretization.poly_element import ( InterpolatoryQuadratureSimplexGroupFactory) @@ -48,28 +49,34 @@ if current_rank == 0: # master rank pre_density_discr = Discretization( ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) - qbx, _ = DistributedQBXLayerPotentialSource( + qbx = DistributedQBXLayerPotentialSource( comm, pre_density_discr, fine_order=4 * target_order, qbx_order=qbx_order, fmm_order=fmm_order, knl_specific_calibration_params="constant_one" - ).with_refinement() - - density_discr = qbx.density_discr + ) op = pytential.sym.D( LaplaceKernel(2), pytential.sym.var("sigma"), qbx_forced_limit=-2) + qbx, _ = qbx.with_refinement() + density_discr = qbx.density_discr sigma = density_discr.zeros(queue) + 1 + qbx_ctx = {"sigma": sigma} fplot = FieldPlotter(np.zeros(2), 
extent=0.54, npoints=30) + targets = PointsTarget(fplot.points) +else: + qbx = None + targets = None + op = None + qbx_ctx = {} - fld_in_vol = pytential.bind( - (qbx, PointsTarget(fplot.points)), - op)(queue, sigma=sigma) +fld_in_vol = bind_distributed(comm, (qbx, targets), op)(queue, **qbx_ctx) +if current_rank == 0: err = cl.clmath.fabs(fld_in_vol - (-1)) linf_err = cl.array.max(err).get() @@ -82,12 +89,3 @@ if current_rank == 0: # master rank # FIXME: Why does the FMM only meet this sloppy tolerance? assert linf_err < 1e-2 - -else: # helper rank - lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None, None) - - from pytential.qbx.distributed import drive_dfmm - wrangler = None - weights = None - drive_dfmm(queue, weights, distribute_geo_data, comm=comm) -- GitLab From d27e035d53dbb9672a54aad8354a25fcc342aee8 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 24 Oct 2019 20:26:02 -0500 Subject: [PATCH 62/86] Move distributed test cases to example --- {test => examples}/distributed/test_layer_pot_identity.py | 0 {test => examples}/distributed/test_off_surface_eval.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {test => examples}/distributed/test_layer_pot_identity.py (100%) rename {test => examples}/distributed/test_off_surface_eval.py (100%) diff --git a/test/distributed/test_layer_pot_identity.py b/examples/distributed/test_layer_pot_identity.py similarity index 100% rename from test/distributed/test_layer_pot_identity.py rename to examples/distributed/test_layer_pot_identity.py diff --git a/test/distributed/test_off_surface_eval.py b/examples/distributed/test_off_surface_eval.py similarity index 100% rename from test/distributed/test_off_surface_eval.py rename to examples/distributed/test_off_surface_eval.py -- GitLab From db860e5e8ecd642011bfb0cd31e3fb78a0a76150 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 Nov 2019 17:49:44 -0600 Subject: [PATCH 63/86] Make 
distributed implementation compatible with tsqbx --- .../distributed/test_layer_pot_identity.py | 111 +++++++++--------- pytential/qbx/distributed.py | 48 ++------ pytential/qbx/fmmlib.py | 19 +-- pytential/qbx/utils.py | 3 + 4 files changed, 83 insertions(+), 98 deletions(-) diff --git a/examples/distributed/test_layer_pot_identity.py b/examples/distributed/test_layer_pot_identity.py index 38af2989..2b127e81 100644 --- a/examples/distributed/test_layer_pot_identity.py +++ b/examples/distributed/test_layer_pot_identity.py @@ -1,12 +1,12 @@ import pyopencl as cl from pytential import bind, sym, norm +from pytential.symbolic.execution import bind_distributed import numpy as np from sympy.core.cache import clear_cache from pytools.convergence import EOCRecorder from mpi4py import MPI from pytential.qbx.distributed import DistributedQBXLayerPotentialSource from sumpy.kernel import LaplaceKernel -import matplotlib.pyplot as pt comm = MPI.COMM_WORLD current_rank = comm.Get_rank() @@ -19,59 +19,61 @@ clear_cache() ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) -if current_rank == 0: - - class GreenExpr(object): - zero_op_name = "green" - def get_zero_op(self, kernel, **knl_kwargs): +class GreenExpr(object): + zero_op_name = "green" - u_sym = sym.var("u") - dn_u_sym = sym.var("dn_u") + def get_zero_op(self, kernel, **knl_kwargs): - return ( - sym.S(kernel, dn_u_sym, qbx_forced_limit=-1, **knl_kwargs) - - sym.D(kernel, u_sym, qbx_forced_limit="avg", **knl_kwargs) - - 0.5*u_sym) + u_sym = sym.var("u") + dn_u_sym = sym.var("dn_u") - order_drop = 0 + return ( + sym.S(kernel, dn_u_sym, qbx_forced_limit=-1, **knl_kwargs) + - sym.D(kernel, u_sym, qbx_forced_limit="avg", **knl_kwargs) + - 0.5*u_sym) - def get_sphere_mesh(refinement_increment, target_order): - from meshmode.mesh.generation import generate_icosphere - mesh = generate_icosphere(1, target_order) - from meshmode.mesh.refinement import Refiner + order_drop = 0 - refiner = Refiner(mesh) - for i in 
range(refinement_increment): - flags = np.ones(mesh.nelements, dtype=bool) - refiner.refine(flags) - mesh = refiner.get_current_mesh() - return mesh +def get_sphere_mesh(refinement_increment, target_order): + from meshmode.mesh.generation import generate_icosphere + mesh = generate_icosphere(1, target_order) + from meshmode.mesh.refinement import Refiner - class SphereGeometry(object): - mesh_name = "sphere" - dim = 3 + refiner = Refiner(mesh) + for i in range(refinement_increment): + flags = np.ones(mesh.nelements, dtype=bool) + refiner.refine(flags) + mesh = refiner.get_current_mesh() - resolutions = [0, 1] + return mesh - def get_mesh(self, resolution, tgt_order): - return get_sphere_mesh(resolution, tgt_order) - expr = GreenExpr() - geometry = SphereGeometry() +class SphereGeometry(object): + mesh_name = "sphere" + dim = 3 - target_order = 8 - k = 0 - qbx_order = 3 - fmm_order = 10 resolutions = [0, 1] - _expansion_stick_out_factor = 0.5 - visualize = False - eoc_rec = EOCRecorder() + def get_mesh(self, resolution, tgt_order): + return get_sphere_mesh(resolution, tgt_order) + + +target_order = 8 +k = 0 +qbx_order = 3 +fmm_order = 10 +resolutions = [0, 1] +_expansion_stick_out_factor = 0.5 +visualize = False + +eoc_rec = EOCRecorder() - for resolution in resolutions: +for resolution in resolutions: + if current_rank == 0: + expr = GreenExpr() + geometry = SphereGeometry() mesh = geometry.get_mesh(resolution, target_order) if mesh is None: break @@ -97,6 +99,7 @@ if current_rank == 0: pre_density_discr, 4 * target_order, qbx_order, fmm_order=fmm_order, + knl_specific_calibration_params="constant_one", _expansions_in_tree_have_extent=True, _expansion_stick_out_factor=_expansion_stick_out_factor ).with_refinement(**refiner_extra_kwargs) @@ -135,17 +138,22 @@ if current_rank == 0: key = (qbx_order, geometry.mesh_name, resolution, expr.zero_op_name) - bound_op = bind(qbx, expr.get_zero_op(k_sym, **knl_kwargs)) - error = bound_op( - queue, u=u_dev, dn_u=dn_u_dev, 
grad_u=grad_u_dev, k=k) - if 0: - pt.plot(error) - pt.show() + op = expr.get_zero_op(k_sym, **knl_kwargs) + qbx_ctx = {"u": u_dev, "dn_u": dn_u_dev, "grad_u": grad_u_dev, "k": k} + else: + qbx = None + op = None + qbx_ctx = {} + bound_op = bind_distributed(comm, qbx, op) + error = bound_op(queue, **qbx_ctx) + + if current_rank == 0: linf_error_norm = norm(density_discr, queue, error, p=np.inf) print("--->", key, linf_error_norm) - eoc_rec.add_data_point(qbx.h_max, linf_error_norm) + h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue) + eoc_rec.add_data_point(h_max, linf_error_norm) if visualize: from meshmode.discretization.visualization import make_visualizer @@ -161,16 +169,7 @@ if current_rank == 0: ("error", error), ]) +if current_rank == 0: print(eoc_rec) tgt_order = qbx_order - expr.order_drop assert eoc_rec.order_estimate() > tgt_order - 1.6 - -else: - while True: - lp_source = DistributedQBXLayerPotentialSource(comm, None, None) - distribute_geo_data = lp_source.distibuted_geo_data(None, queue, None, None) - - from pytential.qbx.distributed import drive_dfmm - wrangler = None - weights = None - drive_dfmm(queue, weights, distribute_geo_data, comm=comm) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 19ed2ba4..bc6e18b2 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -8,7 +8,6 @@ import numpy as np import pyopencl as cl import logging import time -from boxtree.tools import return_timing_data from pytools import memoize_method logger = logging.getLogger(__name__) @@ -104,40 +103,11 @@ class QBXDistributedFMMLibExpansionWrangler( return distributed_wrangler - @return_timing_data - def eval_qbx_expansions(self, qbx_expansions): - geo_data = self.geo_data - ctt = geo_data.center_to_tree_targets() - global_qbx_centers = geo_data.global_qbx_centers() - qbx_centers = geo_data.centers() - qbx_radii = geo_data.expansion_radii() - + def eval_qbx_output_zeros(self): from pytools.obj_array import 
make_obj_array + ctt = self.geo_data.center_to_tree_targets() output = make_obj_array([np.zeros(len(ctt.lists), self.dtype) for k in self.outputs]) - - all_targets = geo_data.qbx_targets() - - taeval = self.get_expn_eval_routine("ta") - - for isrc_center, src_icenter in enumerate(global_qbx_centers): - for icenter_tgt in range( - ctt.starts[src_icenter], - ctt.starts[src_icenter+1]): - - center_itgt = ctt.lists[icenter_tgt] - - center = qbx_centers[:, src_icenter] - - pot, grad = taeval( - rscale=qbx_radii[src_icenter], - center=center, - expn=qbx_expansions[src_icenter].T, - ztarg=all_targets[:, center_itgt], - **self.kernel_kwargs) - - self.add_potgrad_onto_output(output, center_itgt, pot, grad) - return output # }}} @@ -419,6 +389,8 @@ class DistributedGeoData(object): local_starts[0] = 0 for icenter in range(ncenters): + # skip the current center if irank is not responsible for + # processing it if not centers_mask[icenter]: continue @@ -600,6 +572,9 @@ class DistributedGeoData(object): return RotationClassesBuilder(self.queue.context)( self.queue, trav, tree)[0].get(self.queue) + def eval_qbx_targets(self): + return self.qbx_targets() + @memoize_method def m2l_rotation_lists(self): return self.build_rotation_classes_lists().from_sep_siblings_rotation_classes @@ -916,10 +891,10 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, # {{{ propagate local_exps downward - wrangler.refine_locals( + local_exps = wrangler.refine_locals( local_traversal.level_start_target_or_target_parent_box_nrs, local_traversal.target_or_target_parent_boxes, - local_exps) + local_exps)[0] # }}} @@ -944,6 +919,9 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions)[0] + qbx_potentials = qbx_potentials + \ + wrangler.eval_target_specific_qbx_locals(local_source_weights)[0] + # }}} if current_rank != 0: # worker process @@ -992,7 +970,7 @@ def drive_dfmm(queue, 
src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, for idim in range(len(global_wrangler.outputs)): all_potentials_in_tree_order[idim][ distributed_geo_data.qbx_target_mask[irank] - ] = qbx_potentials_cur_rank[idim] + ] += qbx_potentials_cur_rank[idim] def reorder_and_finalize_potentials(x): # "finalize" gives host FMMs (like FMMlib) a chance to turn the diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py index 98df707e..6ebf7d10 100644 --- a/pytential/qbx/fmmlib.py +++ b/pytential/qbx/fmmlib.py @@ -225,6 +225,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): np.zeros(self.tree.ntargets, self.dtype) for k in self.outputs]) + def eval_qbx_output_zeros(self): + return self.full_output_zeros() + def reorder_sources(self, source_array): if isinstance(source_array, cl.array.Array): source_array = source_array.get(queue=self.queue) @@ -547,7 +550,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) @return_timing_data def eval_qbx_expansions(self, qbx_expansions): - output = self.full_output_zeros() + output = self.eval_qbx_output_zeros() geo_data = self.geo_data ctt = geo_data.center_to_tree_targets() @@ -555,7 +558,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): qbx_centers = geo_data.centers() qbx_radii = geo_data.expansion_radii() - all_targets = geo_data.all_targets() + all_targets = geo_data.eval_qbx_targets() taeval = self.get_expn_eval_routine("ta") @@ -582,9 +585,12 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): @log_process(logger) @return_timing_data def eval_target_specific_qbx_locals(self, src_weights): + output = self.eval_qbx_output_zeros() + if not self.using_tsqbx: - return self.full_output_zeros() + return output + noutput_targets = len(output[0]) geo_data = self.geo_data trav = geo_data.traversal() @@ -599,9 +605,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifgrad = self.ifgrad # Create temporary output arrays for potential / gradient. 
- pot = np.zeros(self.tree.ntargets, np.complex) if ifpot else None + pot = np.zeros(noutput_targets, np.complex) if ifpot else None grad = ( - np.zeros((self.dim, self.tree.ntargets), np.complex) + np.zeros((self.dim, noutput_targets), np.complex) if ifgrad else None) ts.eval_target_specific_qbx_locals( @@ -611,7 +617,7 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): ifdipole=ifdipole, order=self.qbx_order, sources=self._get_single_sources_array(), - targets=geo_data.all_targets(), + targets=geo_data.eval_qbx_targets(), centers=self._get_single_centers_array(), qbx_centers=geo_data.global_qbx_centers(), qbx_center_to_target_box=geo_data.qbx_center_to_target_box(), @@ -628,7 +634,6 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler): pot=pot, grad=grad) - output = self.full_output_zeros() self.add_potgrad_onto_output(output, slice(None), pot, grad) return output diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index f5a74dc1..7a59c029 100644 --- a/pytential/qbx/utils.py +++ b/pytential/qbx/utils.py @@ -467,6 +467,9 @@ class ToHostTransferredGeoDataWrapper(FMMLibRotationDataInterface): """All (not just non-QBX) targets packaged into a single array.""" return np.array(list(self.tree().targets)) + def eval_qbx_targets(self): + return self.all_targets() + def m2l_rotation_lists(self): # Already on host return self.geo_data.m2l_rotation_lists() -- GitLab From 7b5252f140e51729569be2cdbd99ae338efae8ad Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 10 Dec 2019 02:07:46 -0600 Subject: [PATCH 64/86] Integrate distributed layer potential evaluations into gmres --- pytential/qbx/distributed.py | 8 +- pytential/solve.py | 216 ++++++ pytential/symbolic/execution.py | 50 ++ test/distributed/test_scalar_int_eq.py | 988 +++++++++++++++++++++++++ 4 files changed, 1261 insertions(+), 1 deletion(-) create mode 100644 test/distributed/test_scalar_int_eq.py diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 
bc6e18b2..270c359b 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -726,6 +726,10 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): """ current_rank = self.comm.Get_rank() + ''' + TODO: Cache is not working at the moment because workers' layer + potential source is recomputed after each iteration + if current_rank == 0: target_discrs_and_qbx_sides = geo_data.target_discrs_and_qbx_sides @@ -747,6 +751,8 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): if geo_data_id in self.distributed_geo_data_cache: return self.distributed_geo_data_cache[geo_data_id] + ''' + # no cached result found, construct a new distributed_geo_data if current_rank == 0: from pytential.qbx.utils import ToHostTransferredGeoDataWrapper @@ -761,7 +767,7 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): None, queue, None, None, self.comm ) - self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data + # self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data return distributed_geo_data diff --git a/pytential/solve.py b/pytential/solve.py index 82d6f68f..38661aba 100644 --- a/pytential/solve.py +++ b/pytential/solve.py @@ -342,6 +342,222 @@ def gmres(op, rhs, restart=None, tol=None, x0=None, return result.copy(solution=chopper.chop(result.solution)) + +gmres_status = { + "EVALUATE": 0, + "TERMINATE": 1 +} + + +def _distributed_gmres_master(comm, A, b, + restart=None, tol=None, x0=None, dot=None, # noqa + maxiter=None, hard_failure=None, + require_monotonicity=True, no_progress_factor=None, + stall_iterations=None, callback=None): + + # {{{ input processing + + n, _ = A.shape + + if not callable(A): + a_call = A.matvec + else: + a_call = A + + if restart is None: + restart = min(n, 20) + + if tol is None: + tol = 1e-5 + + if maxiter is None: + maxiter = 2*n + + if hard_failure is None: + hard_failure = True + + if stall_iterations is None: + stall_iterations = 10 + if no_progress_factor 
is None: + no_progress_factor = 1.25 + + # }}} + + def norm(x): + return np.sqrt(abs(dot(x, x))) + + if x0 is None: + x = 0*b + r = b + recalc_r = False + else: + x = x0 + del x0 + recalc_r = True + + Ae = [None]*restart # noqa + e = [None]*restart + + k = 0 + + norm_b = norm(b) + last_resid_norm = None + residual_norms = [] + + for iteration in range(maxiter): + # restart if required + if k == restart: + k = 0 + orth_count = restart + else: + orth_count = k + + # recalculate residual every 10 steps + if recalc_r: + comm.bcast(gmres_status["EVALUATE"], root=0) + r = b - a_call(x) + + norm_r = norm(r) + residual_norms.append(norm_r) + + if callback is not None: + callback(r) + + if abs(norm_r) < tol*norm_b: + comm.bcast(gmres_status["TERMINATE"], root=0) + return GMRESResult(solution=x, + residual_norms=residual_norms, + iteration_count=iteration, success=True, + state="success") + if last_resid_norm is not None: + if norm_r > 1.25*last_resid_norm: + state = "non-monotonic residuals" + if require_monotonicity: + if hard_failure: + raise GMRESError(state) + else: + comm.bcast(gmres_status["TERMINATE"], root=0) + return GMRESResult(solution=x, + residual_norms=residual_norms, + iteration_count=iteration, success=False, + state=state) + else: + print("*** WARNING: non-monotonic residuals in GMRES") + + if (stall_iterations + and len(residual_norms) > stall_iterations + and norm_r > ( + residual_norms[-stall_iterations] # noqa pylint:disable=invalid-unary-operand-type + / no_progress_factor)): + + state = "stalled" + if hard_failure: + raise GMRESError(state) + else: + comm.bcast(gmres_status["TERMINATE"], root=0) + return GMRESResult(solution=x, + residual_norms=residual_norms, + iteration_count=iteration, success=False, + state=state) + + last_resid_norm = norm_r + + # initial new direction guess + comm.bcast(gmres_status["EVALUATE"], root=0) + w = a_call(r) + + # {{{ double-orthogonalize the new direction against preceding ones + + rp = r + + for orth_trips in 
range(2): + for j in range(0, orth_count): + d = dot(Ae[j], w) + w = w - d * Ae[j] + rp = rp - d * e[j] + + # normalize + d = 1/norm(w) + w = d*w + rp = d*rp + + # }}} + + Ae[k] = w + e[k] = rp + + # update the residual and solution + d = dot(Ae[k], r) + + recalc_r = (iteration+1) % 10 == 0 + if not recalc_r: + r = r - d*Ae[k] + + x = x + d*e[k] + + k += 1 + + state = "max iterations" + if hard_failure: + raise GMRESError(state) + else: + comm.bcast(gmres_status["TERMINATE"], root=0) + return GMRESResult(solution=x, + residual_norms=residual_norms, + iteration_count=iteration, success=False, + state=state) + + +def distributed_gmres(comm, op, rhs, restart=None, tol=None, x0=None, + inner_product=None, + maxiter=None, hard_failure=None, + no_progress_factor=None, stall_iterations=None, + callback=None, progress=False): + + # TODO: Not sure if this is sufficient to ensure op is evaluating distributed + # layer potentials + from pytential.symbolic.execution import DistributedMatVecOp + if not isinstance(op, DistributedMatVecOp): + raise NotImplementedError("Only support op of DistributedMatVecOp class") + + if comm.Get_rank() == 0: + amod = get_array_module(rhs) + + chopper = VectorChopper(rhs) + stacked_rhs = chopper.stack(rhs) + + if inner_product is None: + inner_product = amod.vdot + + if callback is None: + if progress: + callback = ResidualPrinter(inner_product) + else: + callback = None + + result = _distributed_gmres_master( + comm, op, stacked_rhs, restart=restart, tol=tol, x0=x0, + dot=inner_product, + maxiter=maxiter, hard_failure=hard_failure, + no_progress_factor=no_progress_factor, + stall_iterations=stall_iterations, callback=callback + ) + + return result.copy(solution=chopper.chop(result.solution)) + else: + while True: + gmres_current_status = comm.bcast(None, root=0) + + if gmres_current_status == gmres_status["TERMINATE"]: + break + else: + if not callable(op): + a_call = op.matvec + else: + a_call = op + + a_call(None) + # }}} # }}} diff --git 
a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index ae0e5925..58305168 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -27,6 +27,7 @@ THE SOFTWARE. import six from six.moves import zip +import copy from pymbolic.mapper.evaluator import ( EvaluationMapper as PymbolicEvaluationMapper) @@ -473,6 +474,31 @@ class MatVecOp: return result + +class DistributedMatVecOp(MatVecOp): + def __init__(self, + comm, bound_expr, queue, arg_name, dtype, total_dofs, + starts_and_ends, extra_args): + self.comm = comm + MatVecOp.__init__( + self, bound_expr, queue, arg_name, dtype, total_dofs, + starts_and_ends, extra_args + ) + + @classmethod + def from_mat_vec_op(cls, comm, mat_vec_op): + distributed_mat_vec_op = copy.copy(mat_vec_op) + distributed_mat_vec_op.__class__ = cls + distributed_mat_vec_op.comm = comm + + return distributed_mat_vec_op + + def matvec(self, x): + if self.comm.Get_rank() == 0: + return MatVecOp.matvec(self, x) + else: + return self.bound_expr(self.queue) + # }}} @@ -803,6 +829,30 @@ class DistributedBoundExpression(BoundExpression): else: self.code = DistributedCode(comm, None, None) + def get_discretization(self, where): + if self.comm.Get_rank() == 0: + return BoundExpression.get_discretization(self, where) + else: + raise RuntimeError("Discretization is not available on worker nodes") + + def get_modeled_cost(self, queue, calibration_params, **args): + if self.comm.Get_rank() == 0: + return BoundExpression.get_modeled_cost( + self, queue, calibration_params, **args + ) + else: + raise RuntimeError("Cost model is not available on worker nodes") + + def scipy_op(self, queue, arg_name, dtype, domains=None, **extra_args): + if self.comm.Get_rank() == 0: + mat_vec_op = BoundExpression.scipy_op( + self, queue, arg_name, dtype, domains=None, **extra_args + ) + else: + mat_vec_op = MatVecOp(self, queue, None, None, None, None, None) + + return DistributedMatVecOp.from_mat_vec_op(self.comm, mat_vec_op) + 
def eval(self, queue, context=None, timing_data=None): if context is None: context = {} diff --git a/test/distributed/test_scalar_int_eq.py b/test/distributed/test_scalar_int_eq.py new file mode 100644 index 00000000..96770ef8 --- /dev/null +++ b/test/distributed/test_scalar_int_eq.py @@ -0,0 +1,988 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2014 Andreas Kloeckner" \ + "Copyright (C) 2019 Hao Gao" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.clmath # noqa +import pytest +from pytools import Record +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from functools import partial +from meshmode.mesh.generation import ( # noqa + ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle, + make_curve_mesh) +from meshmode.discretization.visualization import make_visualizer +from sumpy.symbolic import USE_SYMENGINE +from pytential import bind, sym +from pytential.qbx import QBXTargetAssociationFailedException + +from mpi4py import MPI +from pytential.qbx.distributed import DistributedQBXLayerPotentialSource +from pytential.symbolic.execution import bind_distributed + +import logging +logger = logging.getLogger(__name__) + +circle = partial(ellipse, 1) + +try: + import matplotlib.pyplot as pt +except ImportError: + pass + + +def make_circular_point_group(ambient_dim, npoints, radius, + center=np.array([0., 0.]), func=lambda x: x): + t = func(np.linspace(0, 1, npoints, endpoint=False)) * (2 * np.pi) + center = np.asarray(center) + result = np.zeros((ambient_dim, npoints)) + result[:2, :] = center[:, np.newaxis] + radius*np.vstack((np.cos(t), np.sin(t))) + return result + + +# {{{ test cases + +class IntEqTestCase: + + @property + def default_helmholtz_k(self): + raise NotImplementedError + + @property + def name(self): + raise NotImplementedError + + @property + def qbx_order(self): + raise NotImplementedError + + @property + def target_order(self): + raise NotImplementedError + + def __init__(self, helmholtz_k, bc_type, prob_side): + """ + :arg prob_side: may be -1, +1, or ``'scat'`` for a scattering problem + """ + + if helmholtz_k is None: + helmholtz_k = self.default_helmholtz_k + + self.helmholtz_k = helmholtz_k + self.bc_type = bc_type + self.prob_side = prob_side + + @property + def k(self): + return self.helmholtz_k + + def __str__(self): + return 
("name: %s, bc_type: %s, prob_side: %s, " + "helmholtz_k: %s, qbx_order: %d, target_order: %d" + % (self.name, self.bc_type, self.prob_side, self.helmholtz_k, + self.qbx_order, self.target_order)) + + fmm_backend = "sumpy" + gmres_tol = 1e-14 + + +class CurveIntEqTestCase(IntEqTestCase): + resolutions = [40, 50, 60] + + def curve_func(self, *args, **kwargs): + raise NotImplementedError + + def get_mesh(self, resolution, target_order): + return make_curve_mesh( + self.curve_func, + np.linspace(0, 1, resolution+1), + target_order) + + use_refinement = True + + inner_radius = 0.1 + outer_radius = 2 + + qbx_order = 5 + target_order = qbx_order + fmm_backend = None + + check_tangential_deriv = True + check_gradient = False + + +class EllipseIntEqTestCase(CurveIntEqTestCase): + name = "3-to-1 ellipse" + + def curve_func(self, x): + return ellipse(3, x) + + +class Helmholtz3DIntEqTestCase(IntEqTestCase): + fmm_backend = "fmmlib" + use_refinement = False + + @property + def target_order(self): + return self.qbx_order + + check_tangential_deriv = False + + gmres_tol = 1e-7 + + +class EllipsoidIntEqTestCase(Helmholtz3DIntEqTestCase): + resolutions = [2, 0.8] + name = "ellipsoid" + + def get_mesh(self, resolution, target_order): + from meshmode.mesh.io import generate_gmsh, FileSource + mesh = generate_gmsh( + FileSource("../ellipsoid.step"), 2, order=2, + other_options=[ + "-string", + "Mesh.CharacteristicLengthMax = %g;" % resolution]) + + from meshmode.mesh.processing import perform_flips + # Flip elements--gmsh generates inside-out geometry. 
+ return perform_flips(mesh, np.ones(mesh.nelements)) + + qbx_order = 5 + fmm_order = 13 + + inner_radius = 0.4 + outer_radius = 5 + + check_gradient = True + + +class SphereIntEqTestCase(IntEqTestCase): + resolutions = [1, 2] + name = "sphere" + + def get_mesh(self, resolution, target_order): + from meshmode.mesh.generation import generate_icosphere + from meshmode.mesh.refinement import refine_uniformly + mesh = refine_uniformly( + generate_icosphere(1, target_order), + resolution) + + return mesh + + fmm_backend = "fmmlib" + use_refinement = False + + fmm_tol = 1e-4 + + inner_radius = 0.4 + outer_radius = 5 + + qbx_order = 5 + target_order = 8 + check_gradient = False + check_tangential_deriv = False + + gmres_tol = 1e-7 + + +class MergedCubesIntEqTestCase(Helmholtz3DIntEqTestCase): + resolutions = [1.4] + name = "merged-cubes" + + def get_mesh(self, resolution, target_order): + from meshmode.mesh.io import generate_gmsh, FileSource + mesh = generate_gmsh( + FileSource("../merged-cubes.step"), 2, order=2, + other_options=[ + "-string", + "Mesh.CharacteristicLengthMax = %g;" % resolution]) + + from meshmode.mesh.processing import perform_flips + # Flip elements--gmsh generates inside-out geometry. + mesh = perform_flips(mesh, np.ones(mesh.nelements)) + + return mesh + + use_refinement = True + + inner_radius = 0.4 + outer_radius = 12 + + +class ManyEllipsoidIntEqTestCase(Helmholtz3DIntEqTestCase): + resolutions = [2, 1] + name = "ellipsoid" + + nx = 2 + ny = 2 + nz = 2 + + def get_mesh(self, resolution, target_order): + from meshmode.mesh.io import generate_gmsh, FileSource + base_mesh = generate_gmsh( + FileSource("../ellipsoid.step"), 2, order=2, + other_options=[ + "-string", + "Mesh.CharacteristicLengthMax = %g;" % resolution]) + + from meshmode.mesh.processing import perform_flips + # Flip elements--gmsh generates inside-out geometry. 
+ base_mesh = perform_flips(base_mesh, np.ones(base_mesh.nelements)) + + from meshmode.mesh.processing import affine_map, merge_disjoint_meshes + from meshmode.mesh.tools import rand_rotation_matrix + pitch = 10 + meshes = [ + affine_map( + base_mesh, + A=rand_rotation_matrix(3), + b=pitch*np.array([ + (ix-self.nx//2), + (iy-self.ny//2), + (iz-self.ny//2)])) + for ix in range(self.nx) + for iy in range(self.ny) + for iz in range(self.nz) + ] + + mesh = merge_disjoint_meshes(meshes, single_group=True) + return mesh + + inner_radius = 0.4 + # This should sit in the area just outside the middle ellipsoid + outer_radius = 5 + + +class ElliptiplaneIntEqTestCase(IntEqTestCase): + name = "elliptiplane" + + resolutions = [0.1] + + fmm_backend = "fmmlib" + use_refinement = True + + qbx_order = 3 + fmm_tol = 1e-4 + target_order = qbx_order + check_gradient = False + check_tangential_deriv = False + + # We're only expecting three digits based on FMM settings. Who are we + # kidding? + gmres_tol = 1e-5 + + # to match the scheme given in the GIGAQBX3D paper + box_extent_norm = "l2" + from_sep_smaller_crit = "static_l2" + + def get_mesh(self, resolution, target_order): + from pytools import download_from_web_if_not_present + + download_from_web_if_not_present( + "https://raw.githubusercontent.com/inducer/geometries/master/" + "surface-3d/elliptiplane.brep") + + from meshmode.mesh.io import generate_gmsh, FileSource + mesh = generate_gmsh( + FileSource("elliptiplane.brep"), 2, order=2, + other_options=[ + "-string", + "Mesh.CharacteristicLengthMax = %g;" % resolution]) + + # now centered at origin and extends to -1,1 + + # Flip elements--gmsh generates inside-out geometry. + from meshmode.mesh.processing import perform_flips + return perform_flips(mesh, np.ones(mesh.nelements)) + + inner_radius = 0.2 + outer_radius = 12 # was '-13' in some large-scale run (?) 
+ + +class BetterplaneIntEqTestCase(IntEqTestCase): + name = "betterplane" + + default_helmholtz_k = 20 + resolutions = [0.2] + # refine_on_helmholtz_k = False + + fmm_backend = "fmmlib" + use_refinement = True + + qbx_order = 3 + fmm_tol = 1e-4 + target_order = 6 + check_gradient = False + check_tangential_deriv = False + + visualize_geometry = True + + #scaled_max_curvature_threshold = 1 + expansion_disturbance_tolerance = 0.3 + + # We're only expecting three digits based on FMM settings. Who are we + # kidding? + gmres_tol = 1e-5 + + vis_grid_spacing = (0.025, 0.2, 0.025) + vis_extend_factor = 0.2 + + def get_mesh(self, resolution, target_order): + from pytools import download_from_web_if_not_present + + download_from_web_if_not_present( + "https://raw.githubusercontent.com/inducer/geometries/a869fc3/" + "surface-3d/betterplane.brep") + + from meshmode.mesh.io import generate_gmsh, ScriptWithFilesSource + mesh = generate_gmsh( + ScriptWithFilesSource(""" + Merge "betterplane.brep"; + + Mesh.CharacteristicLengthMax = %(lcmax)f; + Mesh.ElementOrder = 2; + Mesh.CharacteristicLengthExtendFromBoundary = 0; + + // 2D mesh optimization + // Mesh.Lloyd = 1; + + l_superfine() = Unique(Abs(Boundary{ Surface{ + 27, 25, 17, 13, 18 }; })); + l_fine() = Unique(Abs(Boundary{ Surface{ 2, 6, 7}; })); + l_coarse() = Unique(Abs(Boundary{ Surface{ 14, 16 }; })); + + // p() = Unique(Abs(Boundary{ Line{l_fine()}; })); + // Characteristic Length{p()} = 0.05; + + Field[1] = Attractor; + Field[1].NNodesByEdge = 100; + Field[1].EdgesList = {l_superfine()}; + + Field[2] = Threshold; + Field[2].IField = 1; + Field[2].LcMin = 0.075; + Field[2].LcMax = %(lcmax)f; + Field[2].DistMin = 0.1; + Field[2].DistMax = 0.4; + + Field[3] = Attractor; + Field[3].NNodesByEdge = 100; + Field[3].EdgesList = {l_fine()}; + + Field[4] = Threshold; + Field[4].IField = 3; + Field[4].LcMin = 0.1; + Field[4].LcMax = %(lcmax)f; + Field[4].DistMin = 0.15; + Field[4].DistMax = 0.4; + + Field[5] = Attractor; + 
Field[5].NNodesByEdge = 100; + Field[5].EdgesList = {l_coarse()}; + + Field[6] = Threshold; + Field[6].IField = 5; + Field[6].LcMin = 0.15; + Field[6].LcMax = %(lcmax)f; + Field[6].DistMin = 0.2; + Field[6].DistMax = 0.4; + + Field[7] = Min; + Field[7].FieldsList = {2, 4, 6}; + + Background Field = 7; + """ % { + "lcmax": resolution, + }, ["betterplane.brep"]), 2) + + # Flip elements--gmsh generates inside-out geometry. + from meshmode.mesh.processing import perform_flips + return perform_flips(mesh, np.ones(mesh.nelements)) + + inner_radius = 0.2 + outer_radius = 15 + +# }}} + + +# {{{ test backend + +def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + + if rank == 0: + mesh = case.get_mesh(resolution, case.target_order) + print("%d elements" % mesh.nelements) + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + pre_density_discr = Discretization( + cl_ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(case.target_order)) + + source_order = 4*case.target_order + + refiner_extra_kwargs = {} + + qbx_lpot_kwargs = {} + if case.fmm_backend is None: + qbx_lpot_kwargs["fmm_order"] = False + else: + if hasattr(case, "fmm_tol"): + from sumpy.expansion.level_to_order import SimpleExpansionOrderFinder + qbx_lpot_kwargs["fmm_level_to_order"] = SimpleExpansionOrderFinder( + case.fmm_tol) + + elif hasattr(case, "fmm_order"): + qbx_lpot_kwargs["fmm_order"] = case.fmm_order + else: + qbx_lpot_kwargs["fmm_order"] = case.qbx_order + 5 + + qbx = DistributedQBXLayerPotentialSource( + comm, + pre_density_discr, + fine_order=source_order, + qbx_order=case.qbx_order, + _box_extent_norm=getattr(case, "box_extent_norm", None), + _from_sep_smaller_crit=getattr(case, "from_sep_smaller_crit", None), + # _from_sep_smaller_min_nsources_cumul=30, + knl_specific_calibration_params="constant_one", + **qbx_lpot_kwargs + ) + + if 
case.use_refinement: + if case.k != 0 and getattr(case, "refine_on_helmholtz_k", True): + refiner_extra_kwargs["kernel_length_scale"] = 5/case.k + + if hasattr(case, "scaled_max_curvature_threshold"): + refiner_extra_kwargs["_scaled_max_curvature_threshold"] = \ + case.scaled_max_curvature_threshold + + if hasattr(case, "expansion_disturbance_tolerance"): + refiner_extra_kwargs["_expansion_disturbance_tolerance"] = \ + case.expansion_disturbance_tolerance + + if hasattr(case, "refinement_maxiter"): + refiner_extra_kwargs["maxiter"] = case.refinement_maxiter + + #refiner_extra_kwargs["visualize"] = True + + print("%d elements before refinement" % pre_density_discr.mesh.nelements) + qbx, _ = qbx.with_refinement(**refiner_extra_kwargs) + print("%d stage-1 elements after refinement" + % qbx.density_discr.mesh.nelements) + print("%d stage-2 elements after refinement" + % qbx.stage2_density_discr.mesh.nelements) + print("quad stage-2 elements have %d nodes" + % qbx.quad_stage2_density_discr.groups[0].nunit_nodes) + + density_discr = qbx.density_discr + + if hasattr(case, "visualize_geometry") and case.visualize_geometry: + bdry_normals = bind( + density_discr, sym.normal(mesh.ambient_dim) + )(queue).as_vector(dtype=object) + + bdry_vis = make_visualizer(queue, density_discr, case.target_order) + bdry_vis.write_vtk_file("geometry.vtu", [ + ("normals", bdry_normals) + ]) + + # {{{ plot geometry + + if 0: + if mesh.ambient_dim == 2: + # show geometry, centers, normals + nodes_h = density_discr.nodes().get(queue=queue) + pt.plot(nodes_h[0], nodes_h[1], "x-") + normal = bind( + density_discr, sym.normal(2))(queue).as_vector(np.object) + pt.quiver(nodes_h[0], nodes_h[1], + normal[0].get(queue), normal[1].get(queue)) + pt.gca().set_aspect("equal") + pt.show() + + elif mesh.ambient_dim == 3: + bdry_vis = make_visualizer(queue, density_discr, case.target_order+3) + + bdry_normals = bind(density_discr, sym.normal(3))(queue)\ + .as_vector(dtype=object) + + 
bdry_vis.write_vtk_file("pre-solve-source-%s.vtu" % resolution, [ + ("bdry_normals", bdry_normals), + ]) + + else: + raise ValueError("invalid mesh dim") + + # }}} + + # {{{ set up operator + + from pytential.symbolic.pde.scalar import ( + DirichletOperator, + NeumannOperator) + + from sumpy.kernel import LaplaceKernel, HelmholtzKernel + if case.k: + knl = HelmholtzKernel(mesh.ambient_dim) + knl_kwargs = {"k": sym.var("k")} + concrete_knl_kwargs = {"k": case.k} + else: + knl = LaplaceKernel(mesh.ambient_dim) + knl_kwargs = {} + concrete_knl_kwargs = {} + + if knl.is_complex_valued: + dtype = np.complex128 + else: + dtype = np.float64 + + loc_sign = +1 if case.prob_side in [+1, "scat"] else -1 + + if case.bc_type == "dirichlet": + op = DirichletOperator(knl, loc_sign, use_l2_weighting=True, + kernel_arguments=knl_kwargs) + elif case.bc_type == "neumann": + op = NeumannOperator(knl, loc_sign, use_l2_weighting=True, + use_improved_operator=False, kernel_arguments=knl_kwargs) + else: + assert False + + op_u = op.operator(sym.var("u")) + + # }}} + + # {{{ set up test data + + if case.prob_side == -1: + test_src_geo_radius = case.outer_radius + test_tgt_geo_radius = case.inner_radius + elif case.prob_side == +1: + test_src_geo_radius = case.inner_radius + test_tgt_geo_radius = case.outer_radius + elif case.prob_side == "scat": + test_src_geo_radius = case.outer_radius + test_tgt_geo_radius = case.outer_radius + else: + raise ValueError("unknown problem_side") + + point_sources = make_circular_point_group( + mesh.ambient_dim, 10, test_src_geo_radius, + func=lambda x: x**1.5) + test_targets = make_circular_point_group( + mesh.ambient_dim, 20, test_tgt_geo_radius) + + np.random.seed(22) + source_charges = np.random.randn(point_sources.shape[1]) + source_charges[-1] = -np.sum(source_charges[:-1]) + source_charges = source_charges.astype(dtype) + assert np.sum(source_charges) < 1e-15 + + source_charges_dev = cl.array.to_device(queue, source_charges) + + # }}} + + # {{{ 
establish BCs + + from pytential.source import PointPotentialSource + from pytential.target import PointsTarget + + point_source = PointPotentialSource(cl_ctx, point_sources) + + pot_src = sym.IntG( + # FIXME: qbx_forced_limit--really? + knl, sym.var("charges"), qbx_forced_limit=None, **knl_kwargs) + + test_direct = bind((point_source, PointsTarget(test_targets)), pot_src)( + queue, charges=source_charges_dev, **concrete_knl_kwargs) + + if case.bc_type == "dirichlet": + bc = bind((point_source, density_discr), pot_src)( + queue, charges=source_charges_dev, **concrete_knl_kwargs) + + elif case.bc_type == "neumann": + bc = bind( + (point_source, density_discr), + sym.normal_derivative( + qbx.ambient_dim, pot_src, dofdesc=sym.DEFAULT_TARGET) + )(queue, charges=source_charges_dev, **concrete_knl_kwargs) + + rhs = bind(density_discr, op.prepare_rhs(sym.var("bc")))(queue, bc=bc) + + # }}} + else: + qbx = None + op_u = None + dtype = None + rhs = None + concrete_knl_kwargs = {} + + # {{{ solve + + bound_op = bind_distributed(comm, qbx, op_u) + + try: + from pytential.solve import distributed_gmres + gmres_result = distributed_gmres( + comm, + bound_op.scipy_op(queue, "u", dtype, **concrete_knl_kwargs), + rhs, + tol=case.gmres_tol, + progress=True, + hard_failure=True, + stall_iterations=50, no_progress_factor=1.05) + except QBXTargetAssociationFailedException as e: + bdry_vis = make_visualizer(queue, density_discr, case.target_order+3) + + bdry_vis.write_vtk_file("failed-targets-%s.vtu" % resolution, [ + ("failed_targets", e.failed_target_flags), + ]) + raise + + if rank == 0: + print("gmres state:", gmres_result.state) + weighted_u = gmres_result.solution + + # }}} + + # {{{ error check + + if case.prob_side != "scat": + + if rank == 0: + points_target = PointsTarget(test_targets) + tgt_op = op.representation(sym.var("u")) + qbx_ctx = { + "u": weighted_u, + "k": case.k + } + else: + points_target = None + tgt_op = None + qbx_ctx = {} + + bound_tgt_op = 
bind_distributed(comm, (qbx, points_target), tgt_op) + + test_via_bdry = bound_tgt_op(queue, **qbx_ctx) + + if rank == 0: + err = test_via_bdry - test_direct + + err = err.get() + test_direct = test_direct.get() + test_via_bdry = test_via_bdry.get() + + # {{{ remove effect of net source charge + + if case.k == 0 and case.bc_type == "neumann" and loc_sign == -1: + + # remove constant offset in interior Laplace Neumann error + tgt_ones = np.ones_like(test_direct) + tgt_ones = tgt_ones/la.norm(tgt_ones) + err = err - np.vdot(tgt_ones, err)*tgt_ones + + # }}} + + rel_err_2 = la.norm(err)/la.norm(test_direct) + rel_err_inf = la.norm(err, np.inf)/la.norm(test_direct, np.inf) + + print("rel_err_2: %g rel_err_inf: %g" % (rel_err_2, rel_err_inf)) + else: + rel_err_2 = None + rel_err_inf = None + + # }}} + + # {{{ test gradient + + if case.check_gradient and case.prob_side != "scat": + + if rank == 0: + tgt_op = op.representation( + sym.var("u"), + map_potentials=lambda pot: sym.grad(mesh.ambient_dim, pot), + qbx_forced_limit=None + ) + qbx_ctx = concrete_knl_kwargs.copy() + qbx_ctx["u"] = weighted_u + else: + tgt_op = None + qbx_ctx = {} + + bound_grad_op = bind_distributed(comm, (qbx, points_target), tgt_op) + + grad_from_src = bound_grad_op(queue, **qbx_ctx) + + if rank == 0: + grad_ref = (bind( + (point_source, points_target), + sym.grad(mesh.ambient_dim, pot_src) + )(queue, charges=source_charges_dev, **concrete_knl_kwargs) + ) + + grad_err = (grad_from_src - grad_ref) + + rel_grad_err_inf = ( + la.norm(grad_err[0].get(), np.inf) + / la.norm(grad_ref[0].get(), np.inf)) + + print("rel_grad_err_inf: %g" % rel_grad_err_inf) + + # }}} + + # {{{ test tangential derivative + + if case.check_tangential_deriv and case.prob_side != "scat": + + if rank == 0: + deriv_op = op.representation( + sym.var("u"), + map_potentials=lambda pot: sym.tangential_derivative(2, pot), + qbx_forced_limit=loc_sign + ) + qbx_ctx = concrete_knl_kwargs.copy() + qbx_ctx["u"] = weighted_u + else: + 
deriv_op = None + qbx_ctx = None + + bound_t_deriv_op = bind_distributed(comm, qbx, deriv_op) + + tang_deriv_from_src = bound_t_deriv_op(queue, **qbx_ctx) + + if rank == 0: + tang_deriv_from_src = tang_deriv_from_src.as_scalar().get() + + tang_deriv_ref = (bind( + (point_source, density_discr), + sym.tangential_derivative(2, pot_src) + )(queue, charges=source_charges_dev, **concrete_knl_kwargs) + .as_scalar().get()) + + if 0: + pt.plot(tang_deriv_ref.real) + pt.plot(tang_deriv_from_src.real) + pt.show() + + td_err = (tang_deriv_from_src - tang_deriv_ref) + + rel_td_err_inf = la.norm(td_err, np.inf)/la.norm(tang_deriv_ref, np.inf) + + print("rel_td_err_inf: %g" % rel_td_err_inf) + else: + rel_td_err_inf = None + + # }}} + + if rank == 0: + + # {{{ any-D file plotting + + if visualize: + bdry_vis = make_visualizer(queue, density_discr, case.target_order+3) + + bdry_normals = bind(density_discr, sym.normal(qbx.ambient_dim))(queue)\ + .as_vector(dtype=object) + + sym_sqrt_j = sym.sqrt_jac_q_weight(density_discr.ambient_dim) + u = bind(density_discr, sym.var("u")/sym_sqrt_j)(queue, u=weighted_u) + + bdry_vis.write_vtk_file("source-%s.vtu" % resolution, [ + ("u", u), + ("bc", bc), + #("bdry_normals", bdry_normals), + ]) + + from sumpy.visualization import make_field_plotter_from_bbox # noqa + from meshmode.mesh.processing import find_bounding_box + + vis_grid_spacing = (0.1, 0.1, 0.1)[:qbx.ambient_dim] + if hasattr(case, "vis_grid_spacing"): + vis_grid_spacing = case.vis_grid_spacing + vis_extend_factor = 0.2 + if hasattr(case, "vis_extend_factor"): + vis_grid_spacing = case.vis_grid_spacing + + fplot = make_field_plotter_from_bbox( + find_bounding_box(mesh), + h=vis_grid_spacing, + extend_factor=vis_extend_factor) + + qbx_tgt_tol = qbx.copy(target_association_tolerance=0.15) + from pytential.target import PointsTarget + + try: + solved_pot = bind( + (qbx_tgt_tol, PointsTarget(fplot.points)), + op.representation(sym.var("u")) + )(queue, u=weighted_u, k=case.k) + except 
QBXTargetAssociationFailedException as e: + fplot.write_vtk_file( + "failed-targets.vts", + [ + ("failed_targets", e.failed_target_flags.get(queue)) + ]) + raise + + from sumpy.kernel import LaplaceKernel + ones_density = density_discr.zeros(queue) + ones_density.fill(1) + indicator = bind( + (qbx_tgt_tol, PointsTarget(fplot.points)), + -sym.D(LaplaceKernel(density_discr.ambient_dim), + sym.var("sigma"), + qbx_forced_limit=None))( + queue, sigma=ones_density).get() + + solved_pot = solved_pot.get() + + true_pot = bind((point_source, PointsTarget(fplot.points)), pot_src)( + queue, charges=source_charges_dev, **concrete_knl_kwargs).get() + + #fplot.show_scalar_in_mayavi(solved_pot.real, max_val=5) + if case.prob_side == "scat": + fplot.write_vtk_file( + "potential-%s.vts" % resolution, + [ + ("pot_scattered", solved_pot), + ("pot_incoming", -true_pot), + ("indicator", indicator), + ] + ) + else: + fplot.write_vtk_file( + "potential-%s.vts" % resolution, + [ + ("solved_pot", solved_pot), + ("true_pot", true_pot), + ("indicator", indicator), + ] + ) + + # }}} + + class Result(Record): + pass + + h_max = bind(qbx, sym.h_max(qbx.ambient_dim))(queue) + return Result( + h_max=h_max, + rel_err_2=rel_err_2, + rel_err_inf=rel_err_inf, + rel_td_err_inf=rel_td_err_inf, + gmres_result=gmres_result) + +# }}} + + +# {{{ test frontend + +@pytest.mark.parametrize("case", [ + EllipseIntEqTestCase(helmholtz_k=helmholtz_k, bc_type=bc_type, + prob_side=prob_side) + for helmholtz_k in [0, 1.2] + for bc_type in ["dirichlet", "neumann"] + for prob_side in [-1, +1] + ]) +# Sample test run: +# 'test_integral_equation(cl._csc, EllipseIntEqTestCase(0, "dirichlet", +1), visualize=True)' # noqa: E501 +def test_integral_equation(ctx_factory, case, visualize=False): + logging.basicConfig(level=logging.INFO) + + cl_ctx = ctx_factory() + queue = cl.CommandQueue(cl_ctx) + + if USE_SYMENGINE and case.fmm_backend is None: + pytest.skip("https://gitlab.tiker.net/inducer/sumpy/issues/25") + + # prevent 
cache 'splosion + from sympy.core.cache import clear_cache + clear_cache() + + # get MPI information + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + + if rank == 0: + from pytools.convergence import EOCRecorder + print("qbx_order: %d, %s" % (case.qbx_order, case)) + + eoc_rec_target = EOCRecorder() + eoc_rec_td = EOCRecorder() + + have_error_data = False + + for resolution in case.resolutions: + result = run_int_eq_test(cl_ctx, queue, case, resolution, + visualize=visualize) + + if rank == 0: + if result.rel_err_2 is not None: + have_error_data = True + eoc_rec_target.add_data_point(result.h_max, result.rel_err_2) + + if result.rel_td_err_inf is not None: + eoc_rec_td.add_data_point(result.h_max, result.rel_td_err_inf) + + if rank == 0: + if case.bc_type == "dirichlet": + tgt_order = case.qbx_order + elif case.bc_type == "neumann": + tgt_order = case.qbx_order-1 + else: + assert False + + if have_error_data: + print("TARGET ERROR:") + print(eoc_rec_target) + assert eoc_rec_target.order_estimate() > tgt_order - 1.3 + + if case.check_tangential_deriv: + print("TANGENTIAL DERIVATIVE ERROR:") + print(eoc_rec_td) + assert eoc_rec_td.order_estimate() > tgt_order - 2.3 + +# }}} + + +# You can test individual routines by typing +# $ python test_scalar_int_eq.py 'test_routine()' + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker -- GitLab From e2297d874fbf2c1f87bf058a30306e2045b9be63 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 5 Mar 2020 11:29:56 -0600 Subject: [PATCH 65/86] Update DistributedFMMLibExpansionWrangler interface --- pytential/qbx/__init__.py | 2 +- pytential/qbx/distributed.py | 124 +++---------------------- test/distributed/test_scalar_int_eq.py | 5 +- 3 files changed, 16 insertions(+), 115 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 36f9eb89..631ae6ca 100644 --- a/pytential/qbx/__init__.py +++ 
b/pytential/qbx/__init__.py @@ -297,7 +297,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # FIXME Could/should share wrangler and geometry kernels # if no relevant changes have been made. - return QBXLayerPotentialSource( + return type(self)( density_discr=density_discr or self.density_discr, fine_order=( fine_order if fine_order is not None else self.fine_order), diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 270c359b..4770dcd8 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,5 +1,5 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler -from pytential.qbx import QBXLayerPotentialSource, _not_provided +from pytential.qbx import QBXLayerPotentialSource from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.distributed.partition import ResponsibleBoxesQuery @@ -590,37 +590,10 @@ class DistributedGeoData(object): class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): - def __init__( - self, - comm, - density_discr, - fine_order, - qbx_order=None, - fmm_order=None, - fmm_level_to_order=None, - to_refined_connection=None, - expansion_factory=None, - target_association_tolerance=_not_provided, - cost_model=None, - knl_specific_calibration_params=None, - - # begin undocumented arguments - # FIXME default debug=False once everything has matured - debug=True, - _refined_for_global_qbx=False, - _expansions_in_tree_have_extent=True, - _expansion_stick_out_factor=0.5, - _well_sep_is_n_away=2, - _max_leaf_refine_weight=None, - _box_extent_norm=None, - _from_sep_smaller_crit=None, - _tree_kind="adaptive", - _use_target_specific_qbx=None, - geometry_data_inspector=None, - target_stick_out_factor=_not_provided): - + def __init__(self, *args, **kwargs): + comm = kwargs.pop("comm", MPI.COMM_WORLD) self.comm = comm - current_rank = self.comm.Get_rank() + current_rank = comm.Get_rank() 
self.distributed_geo_data_cache = {} @@ -629,91 +602,18 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): self.arg_to_id = {} if current_rank == 0: + super(DistributedQBXLayerPotentialSource, self).__init__(*args, **kwargs) - super(DistributedQBXLayerPotentialSource, self).__init__( - density_discr, - fine_order, - qbx_order=qbx_order, - fmm_order=fmm_order, - fmm_level_to_order=fmm_level_to_order, - to_refined_connection=to_refined_connection, - expansion_factory=expansion_factory, - target_association_tolerance=target_association_tolerance, - debug=debug, - _refined_for_global_qbx=_refined_for_global_qbx, - _expansions_in_tree_have_extent=_expansions_in_tree_have_extent, - _expansion_stick_out_factor=_expansion_stick_out_factor, - _well_sep_is_n_away=_well_sep_is_n_away, - _max_leaf_refine_weight=_max_leaf_refine_weight, - _box_extent_norm=_box_extent_norm, - _from_sep_smaller_crit=_from_sep_smaller_crit, - _from_sep_smaller_min_nsources_cumul=0, - _tree_kind=_tree_kind, - _use_target_specific_qbx=_use_target_specific_qbx, - geometry_data_inspector=geometry_data_inspector, - fmm_backend='distributed', - target_stick_out_factor=target_stick_out_factor, - cost_model=cost_model, - knl_specific_calibration_params=knl_specific_calibration_params - ) - - def copy( - self, - density_discr=None, - fine_order=None, - qbx_order=None, - fmm_order=_not_provided, - fmm_level_to_order=_not_provided, - to_refined_connection=None, - target_association_tolerance=_not_provided, - _expansions_in_tree_have_extent=_not_provided, - _expansion_stick_out_factor=_not_provided, - _max_leaf_refine_weight=None, - _box_extent_norm=None, - _from_sep_smaller_crit=None, - _tree_kind=None, - _use_target_specific_qbx=_not_provided, - geometry_data_inspector=None, - fmm_backend=None, - cost_model=_not_provided, - knl_specific_calibration_params=_not_provided, - - debug=_not_provided, - _refined_for_global_qbx=_not_provided, - target_stick_out_factor=_not_provided, - ): - - obj 
= super(DistributedQBXLayerPotentialSource, self).copy( - density_discr=density_discr, - fine_order=fine_order, - qbx_order=qbx_order, - fmm_order=fmm_order, - fmm_level_to_order=fmm_level_to_order, - to_refined_connection=to_refined_connection, - target_association_tolerance=target_association_tolerance, - _expansions_in_tree_have_extent=_expansions_in_tree_have_extent, - _expansion_stick_out_factor=_expansion_stick_out_factor, - _max_leaf_refine_weight=_max_leaf_refine_weight, - _box_extent_norm=_box_extent_norm, - _from_sep_smaller_crit=_from_sep_smaller_crit, - _tree_kind=_tree_kind, - _use_target_specific_qbx=_use_target_specific_qbx, - geometry_data_inspector=geometry_data_inspector, - fmm_backend=fmm_backend, - cost_model=cost_model, - knl_specific_calibration_params=knl_specific_calibration_params, - - debug=debug, - _refined_for_global_qbx=_refined_for_global_qbx, - target_stick_out_factor=target_stick_out_factor, - ) + def copy(self, *args, **kwargs): + comm = kwargs.pop("comm", self.comm) + current_rank = comm.Get_rank() - obj.__class__ = DistributedQBXLayerPotentialSource - obj.comm = self.comm - obj.distributed_geo_data_cache = self.distributed_geo_data_cache + obj = super(DistributedQBXLayerPotentialSource, self).copy(*args, **kwargs) - current_rank = self.comm.Get_rank() + # obj.__class__ = DistributedQBXLayerPotentialSource + obj.comm = comm + obj.distributed_geo_data_cache = self.distributed_geo_data_cache if current_rank == 0: obj.next_geo_data_id = self.next_geo_data_id obj.arg_to_id = self.arg_to_id diff --git a/test/distributed/test_scalar_int_eq.py b/test/distributed/test_scalar_int_eq.py index 96770ef8..a4561f45 100644 --- a/test/distributed/test_scalar_int_eq.py +++ b/test/distributed/test_scalar_int_eq.py @@ -467,14 +467,15 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): qbx_lpot_kwargs["fmm_order"] = case.qbx_order + 5 qbx = DistributedQBXLayerPotentialSource( - comm, pre_density_discr, + comm=comm, 
fine_order=source_order, qbx_order=case.qbx_order, _box_extent_norm=getattr(case, "box_extent_norm", None), _from_sep_smaller_crit=getattr(case, "from_sep_smaller_crit", None), - # _from_sep_smaller_min_nsources_cumul=30, + _from_sep_smaller_min_nsources_cumul=0, knl_specific_calibration_params="constant_one", + fmm_backend="distributed", **qbx_lpot_kwargs ) -- GitLab From d111b060964cf2feabd2b80be18cea1e8f0948b1 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 5 Mar 2020 17:36:01 -0600 Subject: [PATCH 66/86] Change argument of DistributedQBXLayerPotentialSource --- pytential/qbx/__init__.py | 5 +++-- pytential/qbx/distributed.py | 23 ++++++++++++++++++++++- test/distributed/test_scalar_int_eq.py | 2 -- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 631ae6ca..2ea5564b 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -635,7 +635,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): fmm_mpole_factory, fmm_local_factory, qbx_local_factory, out_kernels) - elif self.fmm_backend == "fmmlib" or self.fmm_backend == 'distributed': + elif self.fmm_backend == "fmmlib": from pytential.qbx.fmmlib import \ QBXFMMLibExpansionWranglerCodeContainer return QBXFMMLibExpansionWranglerCodeContainer( @@ -735,7 +735,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): # {{{ execute global QBX - if self.fmm_backend == 'distributed': + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + if isinstance(self, DistributedQBXLayerPotentialSource): # FIXME: If the expansion wrangler is not FMMLib, the argument # 'uses_pde_expansions' might be different if self.cost_model is None: diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 4770dcd8..d88eb4b0 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -591,17 +591,38 @@ class DistributedGeoData(object): class 
DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): def __init__(self, *args, **kwargs): + # process communicator argument comm = kwargs.pop("comm", MPI.COMM_WORLD) self.comm = comm current_rank = comm.Get_rank() + # process fmm backend argument + if "fmm_backend" not in kwargs: + kwargs["fmm_backend"] = "fmmlib" + elif kwargs["fmm_backend"] != "fmmlib": + raise NotImplementedError( + "Currently the distributed implementation only works with fmmlib" + ) + + # "_from_sep_smaller_min_nsources_cumul" will be forced to 0 for distributed + # implementation. If not, the potential contribution of a list 3 box might be + # computed particle-to-particle instead of using its multipole expansion. + # However, the particle information may not be distributed to the target + # rank. + if "_from_sep_smaller_min_nsources_cumul" not in kwargs: + kwargs["_from_sep_smaller_min_nsources_cumul"] = 0 + elif kwargs["_from_sep_smaller_min_nsources_cumul"] != 0: + raise ValueError( + "_from_sep_smaller_min_nsources_cumul has to be 0 for distributed " + "implementation" + ) + self.distributed_geo_data_cache = {} if current_rank == 0: self.next_geo_data_id = 0 self.arg_to_id = {} - if current_rank == 0: super(DistributedQBXLayerPotentialSource, self).__init__(*args, **kwargs) def copy(self, *args, **kwargs): diff --git a/test/distributed/test_scalar_int_eq.py b/test/distributed/test_scalar_int_eq.py index a4561f45..e3e7deb1 100644 --- a/test/distributed/test_scalar_int_eq.py +++ b/test/distributed/test_scalar_int_eq.py @@ -473,9 +473,7 @@ def run_int_eq_test(cl_ctx, queue, case, resolution, visualize): qbx_order=case.qbx_order, _box_extent_norm=getattr(case, "box_extent_norm", None), _from_sep_smaller_crit=getattr(case, "from_sep_smaller_crit", None), - _from_sep_smaller_min_nsources_cumul=0, knl_specific_calibration_params="constant_one", - fmm_backend="distributed", **qbx_lpot_kwargs ) -- GitLab From 568a73c360c730599817cca1e1cf1e09d2a3a3e7 Mon Sep 17 00:00:00 2001 From: Hao Gao 
Date: Tue, 14 Apr 2020 11:27:38 -0500 Subject: [PATCH 67/86] Use new API --- pytential/qbx/__init__.py | 6 +++--- {examples => test}/distributed/test_layer_pot_identity.py | 7 ++++--- {examples => test}/distributed/test_off_surface_eval.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) rename {examples => test}/distributed/test_layer_pot_identity.py (97%) rename {examples => test}/distributed/test_off_surface_eval.py (99%) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 77b0ed67..a3866255 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -765,11 +765,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): ) # TODO: supply better default calibration parameters calibration_params = \ - AbstractQBXCostModel.get_constantone_calibration_params() + AbstractQBXCostModel.get_unit_calibration_params() elif (isinstance(self.knl_specific_calibration_params, str) and self.knl_specific_calibration_params == "constant_one"): calibration_params = \ - AbstractQBXCostModel.get_constantone_calibration_params() + AbstractQBXCostModel.get_unit_calibration_params() else: knls = tuple(knl for knl in insn.kernels) calibration_params = self.knl_specific_calibration_params[knls] @@ -778,7 +778,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): for arg_name, arg_expr in six.iteritems(insn.kernel_arguments): kernel_args[arg_name] = evaluate(arg_expr) - boxes_time, _ = cost_model.qbx_modeled_cost_per_box( + boxes_time, _ = cost_model.qbx_cost_per_box( geo_data, insn.base_kernel, kernel_args, calibration_params ) boxes_time = boxes_time.get() diff --git a/examples/distributed/test_layer_pot_identity.py b/test/distributed/test_layer_pot_identity.py similarity index 97% rename from examples/distributed/test_layer_pot_identity.py rename to test/distributed/test_layer_pot_identity.py index 2b127e81..e67464d6 100644 --- a/examples/distributed/test_layer_pot_identity.py +++ b/test/distributed/test_layer_pot_identity.py @@ 
-95,9 +95,10 @@ for resolution in resolutions: refiner_extra_kwargs = {} qbx, _ = DistributedQBXLayerPotentialSource( - comm, - pre_density_discr, 4 * target_order, - qbx_order, + pre_density_discr, + fine_order=4 * target_order, + qbx_order=qbx_order, + comm=comm, fmm_order=fmm_order, knl_specific_calibration_params="constant_one", _expansions_in_tree_have_extent=True, diff --git a/examples/distributed/test_off_surface_eval.py b/test/distributed/test_off_surface_eval.py similarity index 99% rename from examples/distributed/test_off_surface_eval.py rename to test/distributed/test_off_surface_eval.py index d0fd6822..3e0679d6 100644 --- a/examples/distributed/test_off_surface_eval.py +++ b/test/distributed/test_off_surface_eval.py @@ -50,9 +50,9 @@ if current_rank == 0: # master rank ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) qbx = DistributedQBXLayerPotentialSource( - comm, pre_density_discr, fine_order=4 * target_order, + comm=comm, qbx_order=qbx_order, fmm_order=fmm_order, knl_specific_calibration_params="constant_one" -- GitLab From fa49c8cfa2f28967e894c25fcba2895e33bcf736 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 14 Apr 2020 23:50:57 -0500 Subject: [PATCH 68/86] Bug fix --- pytential/qbx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index a3866255..d05b3e72 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -771,7 +771,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): calibration_params = \ AbstractQBXCostModel.get_unit_calibration_params() else: - knls = tuple(knl for knl in insn.kernels) + knls = frozenset(knl for knl in insn.kernels) calibration_params = self.knl_specific_calibration_params[knls] kernel_args = {} -- GitLab From fddf47a9d12272393ce5358c6069fe17f3485a70 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 19 Apr 2020 18:40:34 -0500 Subject: [PATCH 69/86] Report timing for distributed 
implementation --- pytential/qbx/__init__.py | 20 ++++- pytential/qbx/distributed.py | 127 ++++++++++++++++++++++---------- pytential/symbolic/execution.py | 28 +++++-- 3 files changed, 129 insertions(+), 46 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index d05b3e72..f24ee158 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -557,6 +557,16 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): else: func = self.exec_compute_potential_insn_fmm + def drive_dfmm(*args, **kwargs): + if return_timing_data: + timing_data = {} + else: + timing_data = None + kwargs.update({"timing_data": timing_data}) + + from pytential.qbx.distributed import drive_dfmm + return drive_dfmm(*args, **kwargs), timing_data + def drive_fmm(wrangler, strengths, geo_data, kernel, kernel_arguments): del geo_data, kernel, kernel_arguments from pytential.qbx.fmm import drive_fmm @@ -566,7 +576,11 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): timing_data = None return drive_fmm(wrangler, strengths, timing_data), timing_data - extra_args["fmm_driver"] = drive_fmm + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + if isinstance(self, DistributedQBXLayerPotentialSource): + extra_args["fmm_driver"] = drive_dfmm + else: + extra_args["fmm_driver"] = drive_fmm return self._dispatch_compute_potential_insn( queue, insn, bound_expr, evaluate, func, extra_args) @@ -787,12 +801,10 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): geo_data, queue, wrangler, boxes_time ) - from pytential.qbx.distributed import drive_dfmm - all_potentials_on_every_target = drive_dfmm( + all_potentials_on_every_target, extra_outputs = fmm_driver( queue, strengths, distributed_geo_data, comm=self.comm ) - extra_outputs = None else: # Execute global QBX. 
all_potentials_on_every_target, extra_outputs = ( diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index d88eb4b0..3c03eda8 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -3,6 +3,7 @@ from pytential.qbx import QBXLayerPotentialSource from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.distributed.partition import ResponsibleBoxesQuery +from boxtree.fmm import TimingRecorder from mpi4py import MPI import numpy as np import pyopencl as cl @@ -695,9 +696,22 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # {{{ FMM Driver +def add_dicts(dict1, dict2): + rtv = {} + + for key in set(dict1) | set(dict2): + if key not in dict1: + rtv[key] = dict2[key] + elif key not in dict2: + rtv[key] = dict1[key] + else: + rtv[key] = dict1[key] + dict2[key] + + return rtv + def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, - _communicate_mpoles_via_allreduce=False): + timing_data=None, _communicate_mpoles_via_allreduce=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() @@ -725,22 +739,28 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, # }}} + recorder = TimingRecorder() + # {{{ construct local multipoles - mpole_exps = wrangler.form_multipoles( + mpole_exps, timing_future = wrangler.form_multipoles( local_traversal.level_start_source_box_nrs, local_traversal.source_boxes, - local_source_weights)[0] + local_source_weights) + + recorder.add("form_multipoles", timing_future) # }}} # {{{ propagate multipoles upward - wrangler.coarsen_multipoles( + mpole_exps, timing_future = wrangler.coarsen_multipoles( local_traversal.level_start_source_parent_box_nrs, local_traversal.source_parent_boxes, mpole_exps) + recorder.add("coarsen_multipoles", timing_future) + # }}} # {{{ Communicate mpoles @@ -758,22 +778,26 @@ def drive_dfmm(queue, src_weights, 
distributed_geo_data, comm=MPI.COMM_WORLD, # {{{ direct evaluation from neighbor source boxes ("list 1") - non_qbx_potentials = wrangler.eval_direct( + non_qbx_potentials, timing_future = wrangler.eval_direct( local_traversal.target_boxes, local_traversal.neighbor_source_boxes_starts, local_traversal.neighbor_source_boxes_lists, - local_source_weights)[0] + local_source_weights) + + recorder.add("eval_direct", timing_future) # }}} # {{{ translate separated siblings' ("list 2") mpoles to local - local_exps = wrangler.multipole_to_local( + local_exps, timing_future = wrangler.multipole_to_local( local_traversal.level_start_target_or_target_parent_box_nrs, local_traversal.target_or_target_parent_boxes, local_traversal.from_sep_siblings_starts, local_traversal.from_sep_siblings_lists, - mpole_exps)[0] + mpole_exps) + + recorder.add("multipole_to_local", timing_future) # }}} @@ -782,72 +806,98 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, # (the point of aiming this stage at particles is specifically to keep its # contribution *out* of the downward-propagating local expansions) - non_qbx_potentials = non_qbx_potentials + wrangler.eval_multipoles( + mpole_result, timing_future = wrangler.eval_multipoles( local_traversal.target_boxes_sep_smaller_by_source_level, local_traversal.from_sep_smaller_by_level, - mpole_exps)[0] + mpole_exps) + + recorder.add("eval_multipoles", timing_future) + + non_qbx_potentials = non_qbx_potentials + mpole_result # assert that list 3 close has been merged into list 1 - # assert global_traversal.from_sep_close_smaller_starts is None - if local_traversal.from_sep_close_smaller_starts is not None: - non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( - local_traversal.target_boxes, - local_traversal.from_sep_close_smaller_starts, - local_traversal.from_sep_close_smaller_lists, - local_source_weights)[0] + assert local_traversal.from_sep_close_smaller_starts is None # }}} # {{{ form locals for 
separated bigger source boxes ("list 4") - local_exps = local_exps + wrangler.form_locals( + local_result, timing_future = wrangler.form_locals( local_traversal.level_start_target_or_target_parent_box_nrs, local_traversal.target_or_target_parent_boxes, local_traversal.from_sep_bigger_starts, local_traversal.from_sep_bigger_lists, - local_source_weights)[0] + local_source_weights) + + recorder.add("form_locals", timing_future) - if local_traversal.from_sep_close_bigger_starts is not None: - non_qbx_potentials = non_qbx_potentials + wrangler.eval_direct( - local_traversal.target_boxes, - local_traversal.from_sep_close_bigger_starts, - local_traversal.from_sep_close_bigger_lists, - local_source_weights)[0] + local_exps = local_exps + local_result + + # assert that list 4 close has been merged into list 1 + assert local_traversal.from_sep_close_bigger_starts is None # }}} # {{{ propagate local_exps downward - local_exps = wrangler.refine_locals( + local_exps, timing_future = wrangler.refine_locals( local_traversal.level_start_target_or_target_parent_box_nrs, local_traversal.target_or_target_parent_boxes, - local_exps)[0] + local_exps) + + recorder.add("refine_locals", timing_future) # }}} # {{{ evaluate locals - non_qbx_potentials = non_qbx_potentials + wrangler.eval_locals( + local_result, timing_future = wrangler.eval_locals( local_traversal.level_start_target_box_nrs, local_traversal.target_boxes, - local_exps)[0] + local_exps) + + recorder.add("eval_locals", timing_future) + + non_qbx_potentials = non_qbx_potentials + local_result # }}} # {{{ wrangle qbx expansions - qbx_expansions = wrangler.form_global_qbx_locals(local_source_weights)[0] + # form_global_qbx_locals and eval_target_specific_qbx_locals are responsible + # for the same interactions (directly evaluated portion of the potentials + # via unified List 1). Which one is used depends on the wrangler. If one of + # them is unused the corresponding output entries will be zero. 
+ + qbx_expansions, timing_future = \ + wrangler.form_global_qbx_locals(local_source_weights) + + recorder.add("form_global_qbx_locals", timing_future) + + local_result, timing_future = \ + wrangler.translate_box_multipoles_to_qbx_local(mpole_exps) - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_multipoles_to_qbx_local(mpole_exps)[0] + recorder.add("translate_box_multipoles_to_qbx_local", timing_future) - qbx_expansions = qbx_expansions + \ - wrangler.translate_box_local_to_qbx_local(local_exps)[0] + qbx_expansions = qbx_expansions + local_result - qbx_potentials = wrangler.eval_qbx_expansions(qbx_expansions)[0] + local_result, timing_future = \ + wrangler.translate_box_local_to_qbx_local(local_exps) - qbx_potentials = qbx_potentials + \ - wrangler.eval_target_specific_qbx_locals(local_source_weights)[0] + recorder.add("translate_box_local_to_qbx_local", timing_future) + + qbx_expansions = qbx_expansions + local_result + + qbx_potentials, timing_future = wrangler.eval_qbx_expansions(qbx_expansions) + + recorder.add("eval_qbx_expansions", timing_future) + + ts_result, timing_future = \ + wrangler.eval_target_specific_qbx_locals(local_source_weights) + + recorder.add("eval_target_specific_qbx_locals", timing_future) + + qbx_potentials = qbx_potentials + ts_result # }}} @@ -913,6 +963,9 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, logger.info("Distributed FMM evaluation finished in {} secs.".format( time.time() - start_time)) + if timing_data is not None: + timing_data.update(add_dicts(timing_data, recorder.summarize())) + return result # }}} diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 62acf88c..7fac40b7 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -326,17 +326,23 @@ class EvaluationMapper(EvaluationMapperBase): class DistributedEvaluationMapper(EvaluationMapper): def __init__(self, comm, bound_expr, queue, context=None, 
timing_data=None): + """ + Note :arg timing_data: has to be None or a valid dict across ranks. It could + lead to deadlock if some ranks have a valid dict but others are None. + """ self.comm = comm - if timing_data is not None: - raise NotImplementedError - if comm.Get_rank() == 0: EvaluationMapper.__init__(self, bound_expr, queue, context, timing_data) else: + self.timing_data = timing_data self.queue = queue def exec_compute_potential_insn(self, queue, insn, bound_expr, evaluate): + return_timing_data = self.timing_data is not None + if return_timing_data: + insn_str = self.comm.bcast(str(insn), root=0) + if self.comm.Get_rank() == 0: result = EvaluationMapper.exec_compute_potential_insn( self, queue, insn, bound_expr, evaluate @@ -350,11 +356,23 @@ class DistributedEvaluationMapper(EvaluationMapper): None, queue, None, None ) + if return_timing_data: + timing_data = {} + else: + timing_data = None + from pytential.qbx.distributed import drive_dfmm weights = None - drive_dfmm(queue, weights, distribute_geo_data, comm=self.comm) + result = drive_dfmm( + queue, weights, distribute_geo_data, comm=self.comm, + timing_data=timing_data + ) + + if return_timing_data: + # The compiler ensures this. 
+ assert insn not in self.timing_data - result = None + self.timing_data[insn_str] = timing_data return result -- GitLab From 9589cbeaf8e423f895c22715fe1bc1ad1dce3738 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 20 Apr 2020 00:18:53 -0500 Subject: [PATCH 70/86] Report more timing/parameters --- pytential/qbx/distributed.py | 45 ++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 3c03eda8..9dd4b1b8 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -145,8 +145,6 @@ class DistributedGeoData(object): qbx_center_to_target_box_source_level[level] = ( geo_data.qbx_center_to_target_box_source_level(level)) - start_time = time.time() - else: # worker process traversal = None @@ -196,6 +194,8 @@ class DistributedGeoData(object): # {{{ Distribute non_qbx_box_target_lists if current_rank == 0: # master process + start_time = time.time() + from boxtree.distributed.local_tree import get_fetch_local_particles_knls knls = get_fetch_local_particles_knls(queue.context, tree) @@ -272,6 +272,10 @@ class DistributedGeoData(object): for irank in range(1, total_rank): reqs[irank].wait() + + logger.info("Distribute non_qbx_box_target_lists in {} secs.".format( + time.time() - start_time)) + if current_rank == 0: local_non_qbx_box_target_lists = local_non_qbx_box_target_lists[0] else: @@ -292,6 +296,8 @@ class DistributedGeoData(object): # {{{ Distribute other useful fields of geo_data if current_rank == 0: + start_time = time.time() + local_global_qbx_centers = np.empty((total_rank,), dtype=object) local_centers = np.empty((total_rank,), dtype=object) local_expansion_radii = np.empty((total_rank,), dtype=object) @@ -618,6 +624,10 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): "implementation" ) + # report geometric parameters + report_parameters = kwargs.pop("report_parameters", False) + self.report_parameters = 
report_parameters + self.distributed_geo_data_cache = {} if current_rank == 0: @@ -628,15 +638,16 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): def copy(self, *args, **kwargs): comm = kwargs.pop("comm", self.comm) - current_rank = comm.Get_rank() + report_parameters = kwargs.pop("report_parameters", self.report_parameters) obj = super(DistributedQBXLayerPotentialSource, self).copy(*args, **kwargs) # obj.__class__ = DistributedQBXLayerPotentialSource obj.comm = comm + obj.report_parameters = report_parameters obj.distributed_geo_data_cache = self.distributed_geo_data_cache - if current_rank == 0: + if comm.Get_rank() == 0: obj.next_geo_data_id = self.next_geo_data_id obj.arg_to_id = self.arg_to_id @@ -680,6 +691,25 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): from pytential.qbx.utils import ToHostTransferredGeoDataWrapper host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) + if self.report_parameters: + from pytools import Table + table = Table() + table.add_row(["name", "value"]) + + table.add_row(["nsources", host_geo_data.tree().nsources]) + table.add_row(["ntargets", host_geo_data.tree().ntargets]) + table.add_row(["ncenters", host_geo_data.ncenters]) + table.add_row([ + "non-qbx targets", + host_geo_data.non_qbx_box_target_lists().nfiltered_targets + ]) + table.add_row([ + "qbx targets", + host_geo_data.tree().ntargets - host_geo_data.ncenters + ]) + + print(table) + distributed_geo_data = DistributedGeoData( host_geo_data, queue, wrangler, boxes_time, comm=self.comm ) @@ -767,6 +797,8 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, from boxtree.distributed.calculation import communicate_mpoles + comm_start_time = time.time() + if _communicate_mpoles_via_allreduce: mpole_exps_all = np.zeros_like(mpole_exps) comm.Allreduce(mpole_exps, mpole_exps_all) @@ -774,6 +806,11 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, else: 
communicate_mpoles(wrangler, comm, local_traversal, mpole_exps) + from boxtree.tools import DummyTimingFuture + timing_future = DummyTimingFuture(wall_elapsed=(time.time() - comm_start_time)) + + recorder.add("multipole communication", timing_future) + # }}} # {{{ direct evaluation from neighbor source boxes ("list 1") -- GitLab From bcd9fc234873be7920f9c17ea1401b46629d7f29 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 20 Apr 2020 15:08:18 -0500 Subject: [PATCH 71/86] Report merge time --- pytential/qbx/distributed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 9dd4b1b8..1cb4e2a3 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -945,6 +945,8 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, else: # master process + merge_start_time = time.time() + all_potentials_in_tree_order = global_wrangler.full_output_zeros() nqbtl = global_wrangler.geo_data.non_qbx_box_target_lists() @@ -996,6 +998,11 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) + timing_future = DummyTimingFuture( + wall_elapsed=(time.time() - merge_start_time) + ) + recorder.add("merge potentials", timing_future) + if current_rank == 0: logger.info("Distributed FMM evaluation finished in {} secs.".format( time.time() - start_time)) -- GitLab From 4068106878b8bee0ccd89e3fb0c74819bd56d4dd Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 May 2020 15:07:26 -0500 Subject: [PATCH 72/86] Add distributed test case --- .gitlab-ci.yml | 20 ++++-- test/test_distributed.py | 140 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 test/test_distributed.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6d95506e..b07f3b2a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -42,6 
+42,22 @@ Python 3 POCL: reports: junit: test/pytest.xml +Python 3 POCL MPI: + script: + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable:pthread + - export PYTEST_ADDOPTS="-m mpi --capture=no" + - export EXTRA_INSTALL="Cython pybind11 numpy scipy mako mpi4py" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3 + - pocl + - large-node + except: + - tags + + Python 3 Intel: script: - export PY_EXE=python3 @@ -93,11 +109,7 @@ Python 3 Conda: - large-node except: - tags -<<<<<<< HEAD - distributed-fmm -======= - ->>>>>>> cl-cost-model artifacts: reports: junit: test/pytest.xml diff --git a/test/test_distributed.py b/test/test_distributed.py new file mode 100644 index 00000000..d2d4615e --- /dev/null +++ b/test/test_distributed.py @@ -0,0 +1,140 @@ +import numpy as np +import numpy.linalg as la # noqa +import pyopencl as cl +import pyopencl.clmath # noqa +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from meshmode.mesh.generation import make_curve_mesh, ellipse +from sumpy.visualization import FieldPlotter +from pytential import sym +from boxtree.tools import run_mpi + +import pytest +from functools import partial +import sys +import os + +from mpi4py import MPI +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +import logging +logger = logging.getLogger(__name__) + + +# {{{ test off-surface eval + +def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): + logging.basicConfig(level=logging.INFO) + + cl_ctx = ctx_factory() + queue = cl.CommandQueue(cl_ctx) + + # prevent cache 'splosion + from sympy.core.cache import clear_cache + clear_cache() + + nelements = 30 + target_order = 8 + qbx_order = 3 + if use_fmm: + fmm_order = qbx_order + else: + fmm_order = False + + if rank == 0: + + mesh = make_curve_mesh(partial(ellipse, 3), + np.linspace(0, 1, nelements+1), + 
target_order) + + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + + pre_density_discr = Discretization( + cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) + qbx, _ = DistributedQBXLayerPotentialSource( + pre_density_discr, + fine_order=4*target_order, + qbx_order=qbx_order, + fmm_order=fmm_order, + comm=comm, + knl_specific_calibration_params="constant_one" + ).with_refinement() + + density_discr = qbx.density_discr + + from sumpy.kernel import LaplaceKernel + op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=-2) + + sigma = density_discr.zeros(queue) + 1 + qbx_ctx = {"sigma": sigma} + + fplot = FieldPlotter(np.zeros(2), extent=0.54, npoints=30) + + from pytential.target import PointsTarget + targets = PointsTarget(fplot.points) + + else: + qbx = None + targets = None + op = None + qbx_ctx = {} + + from pytential.symbolic.execution import bind_distributed + fld_in_vol = bind_distributed(comm, (qbx, targets), op)(queue, **qbx_ctx) + + if rank == 0: + err = cl.clmath.fabs(fld_in_vol - (-1)) + + linf_err = cl.array.max(err).get() + print("l_inf error:", linf_err) + + if do_plot: + fplot.show_scalar_in_matplotlib(fld_in_vol.get()) + import matplotlib.pyplot as pt + pt.colorbar() + pt.show() + + assert linf_err < 1e-3 + + +@pytest.mark.mpi +@pytest.mark.parametrize("num_processes, use_fmm", [ + (4, False), + (4, True) +]) +@pytest.mark.skipif(sys.version_info < (3, 5), + reason="distributed implementation requires 3.5 or higher") +def test_off_surface_eval(num_processes, use_fmm, do_plot=False): + pytest.importorskip("mpi4py") + + newenv = os.environ.copy() + newenv["PYTEST"] = "1" + newenv["OMP_NUM_THREADS"] = "1" + newenv["use_fmm"] = str(use_fmm) + newenv["do_plot"] = str(do_plot) + + run_mpi(__file__, num_processes, newenv) + +# }}} + + +if __name__ == 
"__main__": + if "PYTEST" in os.environ: + if os.environ["PYTEST"] == "1": + # Run "test_off_surface_eval" test case + use_fmm = (os.environ["use_fmm"] == 'True') + do_plot = (os.environ["do_plot"] == 'True') + + _test_off_surface_eval(cl.create_some_context, use_fmm, do_plot=do_plot) + else: + if len(sys.argv) > 1: + + # You can test individual routines by typing + # $ python test_distributed.py 'test_off_surface_eval(4, True, True)' + exec(sys.argv[1]) -- GitLab From 607fba5e358bff82220daa8051e34a134c35fd50 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 May 2020 15:31:53 -0500 Subject: [PATCH 73/86] Address test failure --- .test-conda-env-py3-macos.yml | 2 +- .test-conda-env-py3.yml | 2 +- pytential/qbx/__init__.py | 4 ++-- pytential/symbolic/execution.py | 12 ++++++++++-- requirements.txt | 2 +- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.test-conda-env-py3-macos.yml b/.test-conda-env-py3-macos.yml index f97fd74e..901e4a25 100644 --- a/.test-conda-env-py3-macos.yml +++ b/.test-conda-env-py3-macos.yml @@ -23,7 +23,7 @@ dependencies: - pip - pip: - git+https://github.com/inducer/pytools - - git+https://github.com/gaohao95/boxtree@distributed-fmm-global + - git+https://gitlab.tiker.net/inducer/boxtree@distributed-fmm-global - git+https://github.com/inducer/pymbolic - git+https://github.com/inducer/loopy - git+https://gitlab.tiker.net/inducer/sumpy diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 0a53c70c..ae923492 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -19,7 +19,7 @@ dependencies: - pip - pip: - git+https://github.com/inducer/pytools - - git+https://github.com/gaohao95/boxtree@distributed-fmm-global + - git+https://gitlab.tiker.net/inducer/boxtree@distributed-fmm-global - git+https://github.com/inducer/pymbolic - git+https://github.com/inducer/loopy - git+https://gitlab.tiker.net/inducer/sumpy diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index f24ee158..833305fc 
100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -797,12 +797,12 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): ) boxes_time = boxes_time.get() - distributed_geo_data = self.distibuted_geo_data( + distributed_geo_data = self.distibuted_geo_data( # noqa pylint:disable=no-member geo_data, queue, wrangler, boxes_time ) all_potentials_on_every_target, extra_outputs = fmm_driver( - queue, strengths, distributed_geo_data, comm=self.comm + queue, strengths, distributed_geo_data, comm=self.comm # noqa pylint:disable=no-member ) else: diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 7fac40b7..2c5abe5c 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -878,9 +878,17 @@ class DistributedBoundExpression(BoundExpression): else: raise RuntimeError("Discretization is not available on worker nodes") - def get_modeled_cost(self, queue, calibration_params, **args): + def cost_per_stage(self, queue, calibration_params, **args): + if self.comm.Get_rank() == 0: + return BoundExpression.cost_per_stage( + self, queue, calibration_params, **args + ) + else: + raise RuntimeError("Cost model is not available on worker nodes") + + def cost_per_box(self, queue, calibration_params, **args): if self.comm.Get_rank() == 0: - return BoundExpression.get_modeled_cost( + return BoundExpression.cost_per_box( self, queue, calibration_params, **args ) else: diff --git a/requirements.txt b/requirements.txt index 05e2007d..f8d0764d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ git+https://github.com/inducer/modepy git+https://github.com/inducer/pyopencl git+https://github.com/inducer/islpy git+https://github.com/inducer/loopy -git+https://github.com/gaohao95/boxtree@distributed-fmm-global +git+https://gitlab.tiker.net/inducer/boxtree@distributed-fmm-global git+https://github.com/inducer/meshmode git+https://gitlab.tiker.net/inducer/sumpy 
git+https://gitlab.tiker.net/inducer/pyfmmlib -- GitLab From a28c23d09aecdff544e348b9ecdb2752591e07ec Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 May 2020 15:47:29 -0500 Subject: [PATCH 74/86] Move old tests to examples --- {test => examples}/distributed/test_layer_pot_identity.py | 0 {test => examples}/distributed/test_off_surface_eval.py | 0 {test => examples}/distributed/test_scalar_int_eq.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {test => examples}/distributed/test_layer_pot_identity.py (100%) rename {test => examples}/distributed/test_off_surface_eval.py (100%) rename {test => examples}/distributed/test_scalar_int_eq.py (100%) diff --git a/test/distributed/test_layer_pot_identity.py b/examples/distributed/test_layer_pot_identity.py similarity index 100% rename from test/distributed/test_layer_pot_identity.py rename to examples/distributed/test_layer_pot_identity.py diff --git a/test/distributed/test_off_surface_eval.py b/examples/distributed/test_off_surface_eval.py similarity index 100% rename from test/distributed/test_off_surface_eval.py rename to examples/distributed/test_off_surface_eval.py diff --git a/test/distributed/test_scalar_int_eq.py b/examples/distributed/test_scalar_int_eq.py similarity index 100% rename from test/distributed/test_scalar_int_eq.py rename to examples/distributed/test_scalar_int_eq.py -- GitLab From efbac712f151456bf9e4929edca1f713672ac246 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 May 2020 18:00:33 -0500 Subject: [PATCH 75/86] Remove the layer potential test case without FMM --- test/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index d2d4615e..a77b0ec6 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -105,7 +105,7 @@ def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): @pytest.mark.mpi @pytest.mark.parametrize("num_processes, use_fmm", [ - (4, False), + # (4, False), 
(4, True) ]) @pytest.mark.skipif(sys.version_info < (3, 5), -- GitLab From 7c960dc4fcd18fe863460e140fe43156022e638f Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 4 May 2020 23:18:36 -0500 Subject: [PATCH 76/86] Test distributed on-surface urchin against single-node --- test/test_distributed.py | 177 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 1 deletion(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index a77b0ec6..21f836e0 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -7,7 +7,7 @@ from pyopencl.tools import ( # noqa from meshmode.mesh.generation import make_curve_mesh, ellipse from sumpy.visualization import FieldPlotter -from pytential import sym +from pytential import bind, sym from boxtree.tools import run_mpi import pytest @@ -89,6 +89,27 @@ def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): fld_in_vol = bind_distributed(comm, (qbx, targets), op)(queue, **qbx_ctx) if rank == 0: + # test against shared memory result + + from pytential.qbx import QBXLayerPotentialSource + qbx, _ = QBXLayerPotentialSource( + pre_density_discr, + 4 * target_order, + qbx_order, + fmm_order=fmm_order, + _from_sep_smaller_min_nsources_cumul=0 + ).with_refinement() + + fld_in_vol_single_node = bind((qbx, targets), op)(queue, **qbx_ctx) + + linf_err = cl.array.max( + cl.clmath.fabs(fld_in_vol - fld_in_vol_single_node) + ) + + assert linf_err < 1e-13 + + # test against analytical solution + err = cl.clmath.fabs(fld_in_vol - (-1)) linf_err = cl.array.max(err).get() @@ -124,6 +145,144 @@ def test_off_surface_eval(num_processes, use_fmm, do_plot=False): # }}} +# {{{ compare on-surface urchin geometry against single-rank result + +def single_layer_wrapper(kernel): + u_sym = sym.var("u") + return sym.S(kernel, u_sym, qbx_forced_limit=-1) + + +def double_layer_wrapper(kernel): + u_sym = sym.var("u") + return sym.D(kernel, u_sym, qbx_forced_limit="avg") + + +def 
_test_urchin_against_single_rank(ctx_factory, m, n, op_wrapper): + logging.basicConfig(level=logging.INFO) + + qbx_order = 3 + fmm_order = 10 + target_order = 8 + est_rel_interp_tolerance = 1e-10 + _expansion_stick_out_factor = 0.5 + + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + if rank == 0: + from meshmode.mesh.generation import generate_urchin + mesh = generate_urchin(target_order, m, n, est_rel_interp_tolerance) + d = mesh.ambient_dim + + from sumpy.kernel import LaplaceKernel + k_sym = LaplaceKernel(d) + op = op_wrapper(k_sym) + + from meshmode.discretization import Discretization + from meshmode.discretization.poly_element import \ + InterpolatoryQuadratureSimplexGroupFactory + + pre_density_discr = Discretization( + ctx, mesh, + InterpolatoryQuadratureSimplexGroupFactory(target_order) + ) + + refiner_extra_kwargs = {} + + params = { + "qbx_order": qbx_order, + "fmm_order": fmm_order, + "_from_sep_smaller_min_nsources_cumul": 0, + "_expansions_in_tree_have_extent": True, + "_expansion_stick_out_factor": _expansion_stick_out_factor, + "_use_target_specific_qbx": True, + "fmm_backend": 'fmmlib' + } + + from pytential.qbx.distributed import DistributedQBXLayerPotentialSource + qbx, _ = DistributedQBXLayerPotentialSource( + density_discr=pre_density_discr, + fine_order=4 * target_order, + comm=comm, + knl_specific_calibration_params="constant_one", + **params + ).with_refinement(**refiner_extra_kwargs) + + density_discr = qbx.density_discr + + # {{{ compute values of a solution to the PDE + + nodes_host = density_discr.nodes().get(queue) + + center = np.array([3, 1, 2])[:d] + diff = nodes_host - center[:, np.newaxis] + dist_squared = np.sum(diff ** 2, axis=0) + dist = np.sqrt(dist_squared) + if d == 2: + u = np.log(dist) + grad_u = diff / dist_squared + elif d == 3: + u = 1 / dist + grad_u = -diff / dist ** 3 + else: + assert False + + # }}} + + u_dev = cl.array.to_device(queue, u) + grad_u_dev = cl.array.to_device(queue, grad_u) + context = {'u': 
u_dev, 'grad_u': grad_u_dev} + else: + qbx = None + op = None + context = {} + + from pytential.symbolic.execution import bind_distributed + bound_op = bind_distributed(comm, qbx, op) + distributed_result = bound_op(queue, **context) + + if rank == 0: + from pytential.qbx import QBXLayerPotentialSource + qbx, _ = QBXLayerPotentialSource( + density_discr=pre_density_discr, + fine_order=4 * target_order, + **params + ).with_refinement() + + single_node_result = bind(qbx, op)(queue, **context) + + distributed_result = distributed_result.get() + single_node_result = single_node_result.get() + + linf_err = la.norm(distributed_result - single_node_result, ord=np.inf) + + print("l_inf error:", linf_err) + + assert linf_err < 1e-13 + + +@pytest.mark.mpi +@pytest.mark.parametrize("num_processes, m, n, op_wrapper", [ + (4, 1, 3, "single_layer_wrapper"), + (4, 1, 3, "double_layer_wrapper") +]) +@pytest.mark.skipif(sys.version_info < (3, 5), + reason="distributed implementation requires 3.5 or higher") +def test_urchin_against_single_rank(num_processes, m, n, op_wrapper): + pytest.importorskip("mpi4py") + + newenv = os.environ.copy() + newenv["PYTEST"] = "2" + newenv["OMP_NUM_THREADS"] = "1" + newenv["m"] = str(m) + newenv["n"] = str(n) + newenv["op_wrapper"] = op_wrapper + + run_mpi(__file__, num_processes, newenv) + +# }}} + + if __name__ == "__main__": if "PYTEST" in os.environ: if os.environ["PYTEST"] == "1": @@ -132,6 +291,22 @@ if __name__ == "__main__": do_plot = (os.environ["do_plot"] == 'True') _test_off_surface_eval(cl.create_some_context, use_fmm, do_plot=do_plot) + elif os.environ["PYTEST"] == "2": + # Run "test_urchin_against_single_rank" test case + m = int(os.environ["m"]) + n = int(os.environ["n"]) + op_wrapper_str = os.environ["op_wrapper"] + + if op_wrapper_str == "single_layer_wrapper": + op_wrapper = single_layer_wrapper + elif op_wrapper_str == "double_layer_wrapper": + op_wrapper = double_layer_wrapper + else: + raise ValueError("unknown op wrapper") + + 
_test_urchin_against_single_rank( + cl.create_some_context, m, n, op_wrapper + ) else: if len(sys.argv) > 1: -- GitLab From 45589c8f8eb0937998a563d19013fcc41ade2b54 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 5 May 2020 00:21:22 -0500 Subject: [PATCH 77/86] Try clear_cache --- test/test_distributed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_distributed.py b/test/test_distributed.py index 21f836e0..e1937825 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -169,6 +169,10 @@ def _test_urchin_against_single_rank(ctx_factory, m, n, op_wrapper): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + # prevent cache 'splosion + from sympy.core.cache import clear_cache + clear_cache() + if rank == 0: from meshmode.mesh.generation import generate_urchin mesh = generate_urchin(target_order, m, n, est_rel_interp_tolerance) -- GitLab From 5a89cb358d4c9e75e96a0eeeac798853447997d2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 14 May 2020 16:56:17 -0500 Subject: [PATCH 78/86] Broadcast global tree and redundantly build global traversals independently --- pytential/qbx/distributed.py | 216 +++++++++++++++++++++++------------ 1 file changed, 145 insertions(+), 71 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 1cb4e2a3..6a0c002d 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,8 +1,9 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource -from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder +from boxtree.traversal import FMMTraversalBuilder from boxtree.distributed.partition import ResponsibleBoxesQuery +from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.fmm import TimingRecorder from mpi4py import MPI import numpy as np @@ -74,8 +75,7 @@ class 
QBXDistributedFMMLibExpansionWrangler( local_dipole_vec = np.empty((total_rank,), dtype=object) for irank in range(total_rank): - - src_idx = distributed_geo_data.local_data[irank].src_idx + src_idx = distributed_geo_data.src_idx_all_ranks[irank] local_dipole_vec[irank] = wrangler.dipole_vec[:, src_idx] @@ -114,22 +114,53 @@ class QBXDistributedFMMLibExpansionWrangler( # }}} +# {{{ Traversal builder + +class QBXFMMGeometryDataTraversalBuilder: + def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None, + _from_sep_smaller_min_nsources_cumul=0, + _expansions_in_tree_have_extent=True): + self.traversal_builder = FMMTraversalBuilder( + context, + well_sep_is_n_away=well_sep_is_n_away, + from_sep_smaller_crit=from_sep_smaller_crit + ) + self._from_sep_smaller_min_nsources_cumul = \ + _from_sep_smaller_min_nsources_cumul + + def __call__(self, queue, tree, **kwargs): + trav, evt = self.traversal_builder( + queue, tree, + _from_sep_smaller_min_nsources_cumul=( + self._from_sep_smaller_min_nsources_cumul + ), + **kwargs + ) + + return trav, evt + +# }}} + + # {{{ Distributed GeoData class DistributedGeoData(object): def __init__(self, geo_data, queue, global_wrangler, boxes_time, comm=MPI.COMM_WORLD): self.comm = comm - current_rank = comm.Get_rank() - total_rank = comm.Get_size() + mpi_rank = comm.Get_rank() + mpi_size = comm.Get_size() self.global_wrangler = global_wrangler self.queue = queue + global_traversal = None + global_tree = None + if geo_data is not None: # master process - traversal = geo_data.traversal() - tree = traversal.tree - nlevels = tree.nlevels + global_traversal = geo_data.traversal() + global_tree = global_traversal.tree + nlevels = global_tree.nlevels ncenters = geo_data.ncenters centers = geo_data.centers() @@ -145,59 +176,94 @@ class DistributedGeoData(object): qbx_center_to_target_box_source_level[level] = ( geo_data.qbx_center_to_target_box_source_level(level)) - else: # worker process - traversal = None + # {{{ Broadcast 
necessary parts of geometry data to worker ranks - # {{{ Distribute traversal parameters + global_tree = comm.bcast(global_tree, root=0) + global_tree_dev = global_tree.to_device(queue).with_queue(queue) - if current_rank == 0: + if mpi_rank != 0: + boxes_time = np.empty(global_tree.nboxes, dtype=np.float64) + + comm.Bcast(boxes_time, root=0) + + # }}} + + # {{{ Construct traversal object on all ranks + + trav_param = None + if mpi_rank == 0: trav_param = { "well_sep_is_n_away": geo_data.geo_data.code_getter.build_traversal.well_sep_is_n_away, "from_sep_smaller_crit": geo_data.geo_data.code_getter.build_traversal. - from_sep_smaller_crit + from_sep_smaller_crit, + "_from_sep_smaller_min_nsources_cumul": + geo_data.geo_data.lpot_source. + _from_sep_smaller_min_nsources_cumul } - else: - trav_param = None - trav_param = comm.bcast(trav_param, root=0) + # NOTE: The distributed implementation relies on building the same traversal + # objects as the global traversal object on root rank. This means here we + # should construct traversal objects using the same parameters as + # `QBXFMMGeometryData.traversal`. 
+ + traversal_builder = QBXFMMGeometryDataTraversalBuilder( + queue.context, + well_sep_is_n_away=trav_param["well_sep_is_n_away"], + from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], + _from_sep_smaller_min_nsources_cumul=trav_param[ + "_from_sep_smaller_min_nsources_cumul" + ] + ) + + if mpi_rank != 0: + global_traversal, _ = traversal_builder(queue, global_tree_dev) + + if global_tree_dev.targets_have_extent: + global_traversal = global_traversal.merge_close_lists(queue) + + global_traversal = global_traversal.get(queue) + # }}} - if current_rank == 0: - from boxtree.distributed.partition import partition_work - responsible_boxes_list = partition_work( - boxes_time, traversal, comm.Get_size() - ) - else: - responsible_boxes_list = None + from boxtree.distributed.partition import partition_work + responsible_boxes_list = partition_work( + boxes_time, global_traversal, comm.Get_size() + ) - if current_rank == 0: - responsible_box_query = ResponsibleBoxesQuery(queue, traversal) - else: - responsible_box_query = None + responsible_box_query = ResponsibleBoxesQuery(queue, global_traversal) from boxtree.distributed.local_tree import generate_local_tree - self.local_tree, self.local_data, self.box_bounding_box = \ - generate_local_tree(queue, traversal, responsible_boxes_list, - responsible_box_query, no_targets=True) + self.local_tree, self.src_idx, self.tgt_idx = \ + generate_local_tree( + queue, global_tree, responsible_boxes_list, + responsible_box_query, comm=comm + ) + + self.src_idx_all_ranks = comm.gather(self.src_idx, root=0) + self.tgt_idx_all_ranks = comm.gather(self.tgt_idx, root=0) from boxtree.distributed.local_traversal import generate_local_travs self.local_trav = generate_local_travs( - queue, self.local_tree, self.box_bounding_box, - well_sep_is_n_away=trav_param["well_sep_is_n_away"], - from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], + queue, self.local_tree, traversal_builder, + box_bounding_box={ + "min": 
global_traversal.box_target_bounding_box_min, + "max": global_traversal.box_target_bounding_box_max + }, + # TODO: get whether to merge close lists from root instead of + # hard-coding? merge_close_lists=True ) # {{{ Distribute non_qbx_box_target_lists - if current_rank == 0: # master process + if mpi_rank == 0: # master process start_time = time.time() from boxtree.distributed.local_tree import get_fetch_local_particles_knls - knls = get_fetch_local_particles_knls(queue.context, tree) + knls = get_fetch_local_particles_knls(queue.context, global_tree) box_target_starts = cl.array.to_device( queue, non_qbx_box_target_lists.box_target_starts) @@ -206,15 +272,15 @@ class DistributedGeoData(object): nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets targets = non_qbx_box_target_lists.targets - reqs = np.empty((total_rank,), dtype=object) - local_non_qbx_box_target_lists = np.empty((total_rank,), dtype=object) - self.particle_mask = np.empty((total_rank,), dtype=object) + reqs = np.empty((mpi_size,), dtype=object) + local_non_qbx_box_target_lists = np.empty((mpi_size,), dtype=object) + self.particle_mask = np.empty((mpi_size,), dtype=object) - for irank in range(total_rank): + for irank in range(mpi_size): particle_mask = cl.array.zeros(queue, (nfiltered_targets,), - dtype=tree.particle_id_dtype) + dtype=global_tree.particle_id_dtype) - responsible_boxes_mask = np.zeros((tree.nboxes,), dtype=np.int8) + responsible_boxes_mask = np.zeros(global_tree.nboxes, dtype=np.int8) responsible_boxes_mask[responsible_boxes_list[irank]] = 1 responsible_boxes_mask = cl.array.to_device( queue, responsible_boxes_mask @@ -227,20 +293,24 @@ class DistributedGeoData(object): particle_mask ) - particle_scan = cl.array.empty(queue, (nfiltered_targets + 1,), - dtype=tree.particle_id_dtype) + particle_scan = cl.array.empty( + queue, (nfiltered_targets + 1,), + dtype=global_tree.particle_id_dtype + ) particle_scan[0] = 0 knls.mask_scan_knl(particle_mask, particle_scan) 
local_box_target_starts = cl.array.empty( - queue, (tree.nboxes,), dtype=tree.particle_id_dtype) + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) knls.generate_box_particle_starts( box_target_starts, particle_scan, local_box_target_starts ) local_box_target_counts_nonchild = cl.array.zeros( - queue, (tree.nboxes,), dtype=tree.particle_id_dtype) + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) knls.generate_box_particle_counts_nonchild( responsible_boxes_mask, box_target_counts_nonchild, @@ -251,8 +321,8 @@ class DistributedGeoData(object): particle_mask = particle_mask.get().astype(bool) self.particle_mask[irank] = particle_mask - local_targets = np.empty((tree.dimensions,), dtype=object) - for idimension in range(tree.dimensions): + local_targets = np.empty((global_tree.dimensions,), dtype=object) + for idimension in range(global_tree.dimensions): local_targets[idimension] = targets[idimension][particle_mask] local_non_qbx_box_target_lists[irank] = { @@ -270,13 +340,13 @@ class DistributedGeoData(object): tag=MPITags["non_qbx_box_target_lists"] ) - for irank in range(1, total_rank): + for irank in range(1, mpi_size): reqs[irank].wait() logger.info("Distribute non_qbx_box_target_lists in {} secs.".format( time.time() - start_time)) - if current_rank == 0: + if mpi_rank == 0: local_non_qbx_box_target_lists = local_non_qbx_box_target_lists[0] else: local_non_qbx_box_target_lists = comm.recv( @@ -295,30 +365,30 @@ class DistributedGeoData(object): # {{{ Distribute other useful fields of geo_data - if current_rank == 0: + if mpi_rank == 0: start_time = time.time() - local_global_qbx_centers = np.empty((total_rank,), dtype=object) - local_centers = np.empty((total_rank,), dtype=object) - local_expansion_radii = np.empty((total_rank,), dtype=object) - local_qbx_center_to_target_box = np.empty((total_rank,), dtype=object) - local_center_to_tree_targets = np.empty((total_rank,), dtype=object) - local_qbx_targets = 
np.empty((total_rank,), dtype=object) + local_global_qbx_centers = np.empty((mpi_size,), dtype=object) + local_centers = np.empty((mpi_size,), dtype=object) + local_expansion_radii = np.empty((mpi_size,), dtype=object) + local_qbx_center_to_target_box = np.empty((mpi_size,), dtype=object) + local_center_to_tree_targets = np.empty((mpi_size,), dtype=object) + local_qbx_targets = np.empty((mpi_size,), dtype=object) reqs = [] - self.qbx_target_mask = np.empty((total_rank,), dtype=object) + self.qbx_target_mask = np.empty((mpi_size,), dtype=object) - for irank in range(total_rank): + for irank in range(mpi_size): - tgt_mask = np.zeros((tree.ntargets,), dtype=bool) - tgt_mask[self.local_data[irank].tgt_idx] = True + tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) + tgt_mask[self.tgt_idx_all_ranks[irank]] = True - tgt_mask_user_order = tgt_mask[tree.sorted_target_ids] + tgt_mask_user_order = tgt_mask[global_tree.sorted_target_ids] centers_mask = tgt_mask_user_order[:ncenters] centers_scan = np.empty( - (ncenters + 1,), dtype=tree.particle_id_dtype) + (ncenters + 1,), dtype=global_tree.particle_id_dtype) centers_scan[1:] = np.cumsum( - centers_mask.astype(tree.particle_id_dtype)) + centers_mask.astype(global_tree.particle_id_dtype)) centers_scan[0] = 0 # {{{ Distribute centers @@ -373,7 +443,9 @@ class DistributedGeoData(object): # traversal object. 
local_qbx_center_to_target_box[irank] = \ - traversal.target_boxes[qbx_center_to_target_box[centers_mask]] + global_traversal.target_boxes[ + qbx_center_to_target_box[centers_mask] + ] if irank != 0: reqs.append(comm.isend( local_qbx_center_to_target_box[irank], @@ -390,7 +462,7 @@ class DistributedGeoData(object): local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) local_lists = np.empty(lists.shape, dtype=lists.dtype) - qbx_target_mask = np.zeros((tree.ntargets,), dtype=bool) + qbx_target_mask = np.zeros((global_tree.ntargets,), dtype=bool) current_start = 0 # index into local_lists ilocal_center = 0 local_starts[0] = 0 @@ -417,18 +489,20 @@ class DistributedGeoData(object): local_lists = local_lists[:current_start] - qbx_target_scan = np.empty((tree.ntargets + 1,), dtype=lists.dtype) + qbx_target_scan = np.empty( + (global_tree.ntargets + 1,), dtype=lists.dtype + ) qbx_target_scan[0] = 0 qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) nlocal_qbx_target = qbx_target_scan[-1] local_qbx_targets[irank] = np.empty( - (tree.dimensions, nlocal_qbx_target), - dtype=tree.targets[0].dtype + (global_tree.dimensions, nlocal_qbx_target), + dtype=global_tree.targets[0].dtype ) - for idim in range(tree.dimensions): + for idim in range(global_tree.dimensions): local_qbx_targets[irank][idim, :] = \ - tree.targets[idim][qbx_target_mask] + global_tree.targets[idim][qbx_target_mask] if irank != 0: reqs.append(comm.isend( local_qbx_targets[irank], @@ -764,7 +838,7 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, from boxtree.distributed.calculation import distribute_source_weights local_source_weights = distribute_source_weights( - src_weights, distributed_geo_data.local_data, comm=comm + src_weights, distributed_geo_data.src_idx_all_ranks, comm=comm ) # }}} -- GitLab From e74afeb3c55fd7444e456fe36bee8708a92f4d7e Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 15 May 2020 01:24:37 -0500 Subject: [PATCH 79/86] 
Broadcast fields in geodata and compute non_qbx_box_target_lists independently --- pytential/qbx/distributed.py | 168 ++++++++++++++++------------------- 1 file changed, 77 insertions(+), 91 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 6a0c002d..15771703 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -156,11 +156,16 @@ class DistributedGeoData(object): global_traversal = None global_tree = None + centers = None + expansion_radii = None + global_qbx_centers = None + qbx_center_to_target_box = None + non_qbx_box_target_lists = None + center_to_tree_targets = None if geo_data is not None: # master process global_traversal = geo_data.traversal() global_tree = global_traversal.tree - nlevels = global_tree.nlevels ncenters = geo_data.ncenters centers = geo_data.centers() @@ -170,12 +175,6 @@ class DistributedGeoData(object): non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() center_to_tree_targets = geo_data.center_to_tree_targets() - qbx_center_to_target_box_source_level = np.empty( - (nlevels,), dtype=object) - for level in range(nlevels): - qbx_center_to_target_box_source_level[level] = ( - geo_data.qbx_center_to_target_box_source_level(level)) - # {{{ Broadcast necessary parts of geometry data to worker ranks global_tree = comm.bcast(global_tree, root=0) @@ -186,9 +185,16 @@ class DistributedGeoData(object): comm.Bcast(boxes_time, root=0) + centers = comm.bcast(centers, root=0) + expansion_radii = comm.bcast(expansion_radii, root=0) + global_qbx_centers = comm.bcast(global_qbx_centers, root=0) + qbx_center_to_target_box = comm.bcast(qbx_center_to_target_box, root=0) + non_qbx_box_target_lists = comm.bcast(non_qbx_box_target_lists, root=0) + center_to_tree_targets = comm.bcast(center_to_tree_targets, root=0) + # }}} - # {{{ Construct traversal object on all ranks + # {{{ Construct global traversals independently on all ranks trav_param = None if mpi_rank == 0: @@ -257,100 
+263,76 @@ class DistributedGeoData(object): merge_close_lists=True ) - # {{{ Distribute non_qbx_box_target_lists - - if mpi_rank == 0: # master process - start_time = time.time() - - from boxtree.distributed.local_tree import get_fetch_local_particles_knls - knls = get_fetch_local_particles_knls(queue.context, global_tree) - - box_target_starts = cl.array.to_device( - queue, non_qbx_box_target_lists.box_target_starts) - box_target_counts_nonchild = cl.array.to_device( - queue, non_qbx_box_target_lists.box_target_counts_nonchild) - nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets - targets = non_qbx_box_target_lists.targets - - reqs = np.empty((mpi_size,), dtype=object) - local_non_qbx_box_target_lists = np.empty((mpi_size,), dtype=object) - self.particle_mask = np.empty((mpi_size,), dtype=object) - - for irank in range(mpi_size): - particle_mask = cl.array.zeros(queue, (nfiltered_targets,), - dtype=global_tree.particle_id_dtype) + # {{{ Form non_qbx_box_target_lists - responsible_boxes_mask = np.zeros(global_tree.nboxes, dtype=np.int8) - responsible_boxes_mask[responsible_boxes_list[irank]] = 1 - responsible_boxes_mask = cl.array.to_device( - queue, responsible_boxes_mask - ) + start_time = time.time() - knls.particle_mask_knl( - responsible_boxes_mask, - box_target_starts, - box_target_counts_nonchild, - particle_mask - ) + from boxtree.distributed.local_tree import get_fetch_local_particles_knls + knls = get_fetch_local_particles_knls(queue.context, global_tree) - particle_scan = cl.array.empty( - queue, (nfiltered_targets + 1,), - dtype=global_tree.particle_id_dtype - ) - particle_scan[0] = 0 - knls.mask_scan_knl(particle_mask, particle_scan) + box_target_starts = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_starts + ) + box_target_counts_nonchild = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_counts_nonchild + ) + nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets + targets = 
non_qbx_box_target_lists.targets - local_box_target_starts = cl.array.empty( - queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype - ) - knls.generate_box_particle_starts( - box_target_starts, particle_scan, - local_box_target_starts - ) + particle_mask = cl.array.zeros( + queue, (nfiltered_targets,), dtype=global_tree.particle_id_dtype + ) - local_box_target_counts_nonchild = cl.array.zeros( - queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype - ) - knls.generate_box_particle_counts_nonchild( - responsible_boxes_mask, - box_target_counts_nonchild, - local_box_target_counts_nonchild - ) + responsible_boxes_mask = np.zeros(global_tree.nboxes, dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list[mpi_rank]] = 1 + responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask) - local_nfiltered_targets = particle_scan[-1].get(queue) + knls.particle_mask_knl( + responsible_boxes_mask, + box_target_starts, + box_target_counts_nonchild, + particle_mask + ) - particle_mask = particle_mask.get().astype(bool) - self.particle_mask[irank] = particle_mask - local_targets = np.empty((global_tree.dimensions,), dtype=object) - for idimension in range(global_tree.dimensions): - local_targets[idimension] = targets[idimension][particle_mask] + particle_scan = cl.array.empty( + queue, (nfiltered_targets + 1,), + dtype=global_tree.particle_id_dtype + ) + particle_scan[0] = 0 + knls.mask_scan_knl(particle_mask, particle_scan) - local_non_qbx_box_target_lists[irank] = { - "nfiltered_targets": local_nfiltered_targets, - "box_target_starts": local_box_target_starts.get(), - "box_target_counts_nonchild": - local_box_target_counts_nonchild.get(), - "targets": local_targets - } + local_box_target_starts = cl.array.empty( + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) + knls.generate_box_particle_starts( + box_target_starts, particle_scan, + local_box_target_starts + ) - if irank != 0: - reqs[irank] = comm.isend( - 
local_non_qbx_box_target_lists[irank], - dest=irank, - tag=MPITags["non_qbx_box_target_lists"] - ) + local_box_target_counts_nonchild = cl.array.zeros( + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) + knls.generate_box_particle_counts_nonchild( + responsible_boxes_mask, + box_target_counts_nonchild, + local_box_target_counts_nonchild + ) - for irank in range(1, mpi_size): - reqs[irank].wait() + local_nfiltered_targets = particle_scan[-1].get(queue) - logger.info("Distribute non_qbx_box_target_lists in {} secs.".format( - time.time() - start_time)) + particle_mask = particle_mask.get().astype(bool) + self.particle_mask = comm.gather(particle_mask, root=0) + local_targets = np.empty((global_tree.dimensions,), dtype=object) + for idimension in range(global_tree.dimensions): + local_targets[idimension] = targets[idimension][particle_mask] - if mpi_rank == 0: - local_non_qbx_box_target_lists = local_non_qbx_box_target_lists[0] - else: - local_non_qbx_box_target_lists = comm.recv( - source=0, tag=MPITags["non_qbx_box_target_lists"]) + local_non_qbx_box_target_lists = { + "nfiltered_targets": local_nfiltered_targets, + "box_target_starts": local_box_target_starts.get(), + "box_target_counts_nonchild": + local_box_target_counts_nonchild.get(), + "targets": local_targets + } self._non_qbx_box_target_lists = FilteredTargetListsInTreeOrder( nfiltered_targets=local_non_qbx_box_target_lists["nfiltered_targets"], @@ -361,9 +343,13 @@ class DistributedGeoData(object): unfiltered_from_filtered_target_indices=None ) + logger.info("Form non_qbx_box_target_lists on rank {} in {} secs.".format( + mpi_rank, time.time() - start_time + )) + # }}} - # {{{ Distribute other useful fields of geo_data + # {{{ Form other useful fields of geo_data if mpi_rank == 0: start_time = time.time() -- GitLab From d16c5646f97dd5507030c65e259316bbd4d66d1b Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 15 May 2020 13:29:42 -0500 Subject: [PATCH 80/86] Compute local geodata 
independently on each worker rank --- pytential/qbx/distributed.py | 309 +++++++++++++---------------------- 1 file changed, 115 insertions(+), 194 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 15771703..9f4a9aa6 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) # {{{ MPITags used in this module +# TODO: remove unused tags MPITags = { "non_qbx_box_target_lists": 0, "global_qbx_centers": 1, @@ -149,7 +150,6 @@ class DistributedGeoData(object): comm=MPI.COMM_WORLD): self.comm = comm mpi_rank = comm.Get_rank() - mpi_size = comm.Get_size() self.global_wrangler = global_wrangler self.queue = queue @@ -167,7 +167,6 @@ class DistributedGeoData(object): global_traversal = geo_data.traversal() global_tree = global_traversal.tree - ncenters = geo_data.ncenters centers = geo_data.centers() expansion_radii = geo_data.expansion_radii() global_qbx_centers = geo_data.global_qbx_centers() @@ -186,6 +185,7 @@ class DistributedGeoData(object): comm.Bcast(boxes_time, root=0) centers = comm.bcast(centers, root=0) + ncenters = len(centers[0]) expansion_radii = comm.bcast(expansion_radii, root=0) global_qbx_centers = comm.bcast(global_qbx_centers, root=0) qbx_center_to_target_box = comm.bcast(qbx_center_to_target_box, root=0) @@ -349,204 +349,58 @@ class DistributedGeoData(object): # }}} - # {{{ Form other useful fields of geo_data + # {{{ Compute other useful fields of local geo_data - if mpi_rank == 0: - start_time = time.time() - - local_global_qbx_centers = np.empty((mpi_size,), dtype=object) - local_centers = np.empty((mpi_size,), dtype=object) - local_expansion_radii = np.empty((mpi_size,), dtype=object) - local_qbx_center_to_target_box = np.empty((mpi_size,), dtype=object) - local_center_to_tree_targets = np.empty((mpi_size,), dtype=object) - local_qbx_targets = np.empty((mpi_size,), dtype=object) - - reqs = [] - self.qbx_target_mask = 
np.empty((mpi_size,), dtype=object) - - for irank in range(mpi_size): - - tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) - tgt_mask[self.tgt_idx_all_ranks[irank]] = True - - tgt_mask_user_order = tgt_mask[global_tree.sorted_target_ids] - centers_mask = tgt_mask_user_order[:ncenters] - centers_scan = np.empty( - (ncenters + 1,), dtype=global_tree.particle_id_dtype) - centers_scan[1:] = np.cumsum( - centers_mask.astype(global_tree.particle_id_dtype)) - centers_scan[0] = 0 - - # {{{ Distribute centers - - nlocal_centers = np.sum(centers_mask.astype(np.int32)) - centers_dims = centers.shape[0] - local_centers[irank] = np.empty((centers_dims, nlocal_centers), - dtype=centers[0].dtype) - for idims in range(centers_dims): - local_centers[irank][idims][:] = centers[idims][centers_mask] - - if irank != 0: - reqs.append(comm.isend( - local_centers[irank], - dest=irank, - tag=MPITags["centers"] - )) - - # }}} - - # {{{ Distribute global_qbx_centers - - local_global_qbx_centers[irank] = centers_scan[ - global_qbx_centers[centers_mask[global_qbx_centers]]] - - if irank != 0: - reqs.append(comm.isend( - local_global_qbx_centers[irank], - dest=irank, - tag=MPITags["global_qbx_centers"] - )) - - # }}} - - # {{{ Distribute expansion_radii - - local_expansion_radii[irank] = expansion_radii[centers_mask] - if irank != 0: - reqs.append(comm.isend( - local_expansion_radii[irank], - dest=irank, - tag=MPITags["expansion_radii"] - )) - - # }}} - - # {{{ Distribute qbx_center_to_target_box - - # Note: The code transforms qbx_center_to_target_box to global box - # indexing from target_boxes before transmission. Each process is - # expected to transform back to target_boxes indexing based its own - # traversal object. 
- - local_qbx_center_to_target_box[irank] = \ - global_traversal.target_boxes[ - qbx_center_to_target_box[centers_mask] - ] - if irank != 0: - reqs.append(comm.isend( - local_qbx_center_to_target_box[irank], - dest=irank, - tag=MPITags["qbx_center_to_target_box"] - )) - - # }}} - - # {{{ Distribute local_qbx_targets and center_to_tree_targets - - starts = center_to_tree_targets.starts - lists = center_to_tree_targets.lists - local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) - local_lists = np.empty(lists.shape, dtype=lists.dtype) - - qbx_target_mask = np.zeros((global_tree.ntargets,), dtype=bool) - current_start = 0 # index into local_lists - ilocal_center = 0 - local_starts[0] = 0 - - for icenter in range(ncenters): - # skip the current center if irank is not responsible for - # processing it - if not centers_mask[icenter]: - continue - - current_center_targets = lists[ - starts[icenter]:starts[icenter + 1]] - qbx_target_mask[current_center_targets] = True - current_stop = \ - current_start + starts[icenter + 1] - starts[icenter] - local_starts[ilocal_center + 1] = current_stop - local_lists[current_start:current_stop] = \ - lists[starts[icenter]:starts[icenter + 1]] - - current_start = current_stop - ilocal_center += 1 - - self.qbx_target_mask[irank] = qbx_target_mask - - local_lists = local_lists[:current_start] - - qbx_target_scan = np.empty( - (global_tree.ntargets + 1,), dtype=lists.dtype - ) - qbx_target_scan[0] = 0 - qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) - nlocal_qbx_target = qbx_target_scan[-1] + start_time = time.time() - local_qbx_targets[irank] = np.empty( - (global_tree.dimensions, nlocal_qbx_target), - dtype=global_tree.targets[0].dtype - ) - for idim in range(global_tree.dimensions): - local_qbx_targets[irank][idim, :] = \ - global_tree.targets[idim][qbx_target_mask] - if irank != 0: - reqs.append(comm.isend( - local_qbx_targets[irank], - dest=irank, - tag=MPITags["qbx_targets"] - )) - - local_lists = 
qbx_target_scan[local_lists] - local_center_to_tree_targets[irank] = { - "starts": local_starts, - "lists": local_lists - } - if irank != 0: - reqs.append(comm.isend( - local_center_to_tree_targets[irank], - dest=irank, - tag=MPITags["center_to_tree_targets"] - )) - - # }}} - - MPI.Request.Waitall(reqs) - - local_centers = local_centers[0] - local_global_qbx_centers = local_global_qbx_centers[0] - local_expansion_radii = local_expansion_radii[0] - local_qbx_center_to_target_box = local_qbx_center_to_target_box[0] - local_center_to_tree_targets = local_center_to_tree_targets[0] - local_qbx_targets = local_qbx_targets[0] - - logger.info("Distribute geometry data in {} secs.".format( - time.time() - start_time)) + tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) + tgt_mask[self.tgt_idx] = True - else: - local_centers = comm.recv( - source=0, tag=MPITags["centers"]) - local_global_qbx_centers = comm.recv( - source=0, tag=MPITags["global_qbx_centers"]) - local_expansion_radii = comm.recv( - source=0, tag=MPITags["expansion_radii"]) - local_qbx_center_to_target_box = comm.recv( - source=0, tag=MPITags["qbx_center_to_target_box"] - ) - local_center_to_tree_targets = comm.recv( - source=0, tag=MPITags["center_to_tree_targets"] - ) - local_qbx_targets = comm.recv( - source=0, tag=MPITags["qbx_targets"] - ) + tgt_mask_user_order = tgt_mask[global_tree.sorted_target_ids] + centers_mask = tgt_mask_user_order[:ncenters] + centers_scan = np.empty( + (ncenters + 1,), dtype=global_tree.particle_id_dtype) + centers_scan[1:] = np.cumsum( + centers_mask.astype(global_tree.particle_id_dtype)) + centers_scan[0] = 0 - self._local_centers = local_centers - self._global_qbx_centers = local_global_qbx_centers - self._expansion_radii = local_expansion_radii - self._qbx_targets = local_qbx_targets + # {{{ local centers + + nlocal_centers = np.sum(centers_mask.astype(np.int32)) + centers_dims = centers.shape[0] + local_centers = np.empty( + (centers_dims, nlocal_centers), 
dtype=centers[0].dtype + ) + for idims in range(centers_dims): + local_centers[idims, :] = centers[idims][centers_mask] + + # }}} + + # {{{ local global_qbx_centers + + local_global_qbx_centers = centers_scan[ + global_qbx_centers[centers_mask[global_qbx_centers]] + ] + + # }}} + + # {{{ local expansion_radii + + local_expansion_radii = expansion_radii[centers_mask] + + # }}} + + # {{{ local qbx_center_to_target_box + + # Transform local qbx_center_to_target_box to global indexing + local_qbx_center_to_target_box = global_traversal.target_boxes[ + qbx_center_to_target_box[centers_mask] + ] - # Transform local_qbx_center_to_target_box to target_boxes indexing + # Transform local_qbx_center_to_target_box to local target_boxes indexing global_boxes_to_target_boxes = np.ones( - (self.local_tree.nboxes,), dtype=self.local_tree.particle_id_dtype) + (self.local_tree.nboxes,), dtype=self.local_tree.particle_id_dtype + ) # make sure accessing invalid position raises an error global_boxes_to_target_boxes *= -1 global_boxes_to_target_boxes[self.local_trav.target_boxes] = \ @@ -554,12 +408,79 @@ class DistributedGeoData(object): self._local_qbx_center_to_target_box = \ global_boxes_to_target_boxes[local_qbx_center_to_target_box] + # }}} + + # {{{ local_qbx_targets and local center_to_tree_targets + + starts = center_to_tree_targets.starts + lists = center_to_tree_targets.lists + local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) + local_lists = np.empty(lists.shape, dtype=lists.dtype) + + qbx_target_mask = np.zeros((global_tree.ntargets,), dtype=bool) + current_start = 0 # index into local_lists + ilocal_center = 0 + local_starts[0] = 0 + + for icenter in range(ncenters): + # skip the current center if the current rank is not responsible for + # processing it + if not centers_mask[icenter]: + continue + + current_center_targets = lists[ + starts[icenter]:starts[icenter + 1]] + qbx_target_mask[current_center_targets] = True + current_stop = \ + current_start + 
starts[icenter + 1] - starts[icenter] + local_starts[ilocal_center + 1] = current_stop + local_lists[current_start:current_stop] = \ + lists[starts[icenter]:starts[icenter + 1]] + + current_start = current_stop + ilocal_center += 1 + + self.qbx_target_mask = comm.gather(qbx_target_mask, root=0) + + local_lists = local_lists[:current_start] + + qbx_target_scan = np.empty( + (global_tree.ntargets + 1,), dtype=lists.dtype + ) + qbx_target_scan[0] = 0 + qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) + nlocal_qbx_target = qbx_target_scan[-1] + + local_qbx_targets = np.empty( + (global_tree.dimensions, nlocal_qbx_target), + dtype=global_tree.targets[0].dtype + ) + for idim in range(global_tree.dimensions): + local_qbx_targets[idim, :] = global_tree.targets[idim][qbx_target_mask] + + local_lists = qbx_target_scan[local_lists] + local_center_to_tree_targets = { + "starts": local_starts, + "lists": local_lists + } + + # }}} + + self._local_centers = local_centers + self._global_qbx_centers = local_global_qbx_centers + self._expansion_radii = local_expansion_radii + self._qbx_targets = local_qbx_targets + from pytential.qbx.geometry import CenterToTargetList self._local_center_to_tree_targets = CenterToTargetList( starts=local_center_to_tree_targets["starts"], lists=local_center_to_tree_targets["lists"] ) + logger.info("Form local geometry data on rank {} in {} secs.".format( + mpi_rank, time.time() - start_time) + ) + # }}} # {{{ Construct qbx_center_to_target_box_source_level -- GitLab From 23fcddec96b4f6ba3f17f4f9a850a5f5e6e7ebf2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 18 May 2020 18:56:07 -0500 Subject: [PATCH 81/86] Include broadcast time when distributing geometry data --- pytential/qbx/distributed.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 9f4a9aa6..e1258963 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py 
@@ -174,6 +174,8 @@ class DistributedGeoData(object): non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() center_to_tree_targets = geo_data.center_to_tree_targets() + start_time = time.time() + # {{{ Broadcast necessary parts of geometry data to worker ranks global_tree = comm.bcast(global_tree, root=0) @@ -265,8 +267,6 @@ class DistributedGeoData(object): # {{{ Form non_qbx_box_target_lists - start_time = time.time() - from boxtree.distributed.local_tree import get_fetch_local_particles_knls knls = get_fetch_local_particles_knls(queue.context, global_tree) @@ -343,16 +343,10 @@ class DistributedGeoData(object): unfiltered_from_filtered_target_indices=None ) - logger.info("Form non_qbx_box_target_lists on rank {} in {} secs.".format( - mpi_rank, time.time() - start_time - )) - # }}} # {{{ Compute other useful fields of local geo_data - start_time = time.time() - tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) tgt_mask[self.tgt_idx] = True -- GitLab From 4bd71e58c7226d41cf9ce2898152fb54fc3f3df2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 19 May 2020 01:07:29 -0500 Subject: [PATCH 82/86] Refactor geometry data broadcasting --- pytential/qbx/distributed.py | 152 ++++++++++++++++++++++++----------- 1 file changed, 104 insertions(+), 48 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index e1258963..216e358a 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,5 +1,6 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource +from pytential.qbx.utils import ToHostTransferredGeoDataWrapper from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.traversal import FMMTraversalBuilder from boxtree.distributed.partition import ResponsibleBoxesQuery @@ -8,6 +9,7 @@ from boxtree.fmm import TimingRecorder from mpi4py import MPI import numpy as np import pyopencl as cl +from collections import namedtuple import logging 
import time from pytools import memoize_method @@ -143,7 +145,87 @@ class QBXFMMGeometryDataTraversalBuilder: # }}} -# {{{ Distributed GeoData +# {{{ Distribute QBXFMMGeometryData to each worker rank + +DistributedGlobalQBXFMMGeometryData = namedtuple( + "DistributedGlobalQBXFMMGeometryData", + [ + "global_traversal", "centers", "expansion_radii", "global_qbx_centers", + "qbx_center_to_target_box", "non_qbx_box_target_lists", + "center_to_tree_targets" + ] +) + + +def broadcast_global_geometry_data( + comm, queue, traversal_builder, global_geometry_data): + """Broadcasts useful fields of global geometry data from root to worker ranks, + so that each rank can form local geometry data independently. + + :arg comm: an object of :class:`MPI.Intracomm`, the MPI communicator. + :arg queue: a :class:`pyopencl.CommandQueue` object. + :arg traversal_builder: a :class:`QBXFMMGeometryDataTraversalBuilder` object, + used for constructing the global traversal object. This argument is + significant on all ranks. + :arg global_geometry_data: an object of :class:`ToHostTransferredGeoDataWrapper`, + the global geometry data on host memory. This argument is only significant on + the root rank. + :returns: a :class:`DistributedGlobalQBXFMMGeometryData` object on each worker + rank, representing the broadcasted subset of the global geometry data, used + for constructing the local geometry data independently. See + :func:`compute_local_geometry_data`. 
+ """ + + mpi_rank = comm.Get_rank() + + global_traversal = None + global_tree = None + centers = None + expansion_radii = None + global_qbx_centers = None + qbx_center_to_target_box = None + non_qbx_box_target_lists = None + center_to_tree_targets = None + + if mpi_rank == 0: + global_traversal = global_geometry_data.traversal() + global_tree = global_traversal.tree + + centers = global_geometry_data.centers() + expansion_radii = global_geometry_data.expansion_radii() + global_qbx_centers = global_geometry_data.global_qbx_centers() + qbx_center_to_target_box = global_geometry_data.qbx_center_to_target_box() + non_qbx_box_target_lists = global_geometry_data.non_qbx_box_target_lists() + center_to_tree_targets = global_geometry_data.center_to_tree_targets() + + global_tree = comm.bcast(global_tree, root=0) + global_tree_dev = global_tree.to_device(queue).with_queue(queue) + + centers = comm.bcast(centers, root=0) + expansion_radii = comm.bcast(expansion_radii, root=0) + global_qbx_centers = comm.bcast(global_qbx_centers, root=0) + qbx_center_to_target_box = comm.bcast(qbx_center_to_target_box, root=0) + non_qbx_box_target_lists = comm.bcast(non_qbx_box_target_lists, root=0) + center_to_tree_targets = comm.bcast(center_to_tree_targets, root=0) + + if mpi_rank != 0: + global_traversal, _ = traversal_builder(queue, global_tree_dev) + + if global_tree_dev.targets_have_extent: + global_traversal = global_traversal.merge_close_lists(queue) + + global_traversal = global_traversal.get(queue) + + return DistributedGlobalQBXFMMGeometryData( + global_traversal=global_traversal, + centers=centers, + expansion_radii=expansion_radii, + global_qbx_centers=global_qbx_centers, + qbx_center_to_target_box=qbx_center_to_target_box, + non_qbx_box_target_lists=non_qbx_box_target_lists, + center_to_tree_targets=center_to_tree_targets + ) + class DistributedGeoData(object): def __init__(self, geo_data, queue, global_wrangler, boxes_time, @@ -154,49 +236,9 @@ class 
DistributedGeoData(object): self.global_wrangler = global_wrangler self.queue = queue - global_traversal = None - global_tree = None - centers = None - expansion_radii = None - global_qbx_centers = None - qbx_center_to_target_box = None - non_qbx_box_target_lists = None - center_to_tree_targets = None - - if geo_data is not None: # master process - global_traversal = geo_data.traversal() - global_tree = global_traversal.tree - - centers = geo_data.centers() - expansion_radii = geo_data.expansion_radii() - global_qbx_centers = geo_data.global_qbx_centers() - qbx_center_to_target_box = geo_data.qbx_center_to_target_box() - non_qbx_box_target_lists = geo_data.non_qbx_box_target_lists() - center_to_tree_targets = geo_data.center_to_tree_targets() - start_time = time.time() - # {{{ Broadcast necessary parts of geometry data to worker ranks - - global_tree = comm.bcast(global_tree, root=0) - global_tree_dev = global_tree.to_device(queue).with_queue(queue) - - if mpi_rank != 0: - boxes_time = np.empty(global_tree.nboxes, dtype=np.float64) - - comm.Bcast(boxes_time, root=0) - - centers = comm.bcast(centers, root=0) - ncenters = len(centers[0]) - expansion_radii = comm.bcast(expansion_radii, root=0) - global_qbx_centers = comm.bcast(global_qbx_centers, root=0) - qbx_center_to_target_box = comm.bcast(qbx_center_to_target_box, root=0) - non_qbx_box_target_lists = comm.bcast(non_qbx_box_target_lists, root=0) - center_to_tree_targets = comm.bcast(center_to_tree_targets, root=0) - - # }}} - - # {{{ Construct global traversals independently on all ranks + # {{{ traversal builder trav_param = None if mpi_rank == 0: @@ -226,13 +268,28 @@ class DistributedGeoData(object): ] ) - if mpi_rank != 0: - global_traversal, _ = traversal_builder(queue, global_tree_dev) + # }}} - if global_tree_dev.targets_have_extent: - global_traversal = global_traversal.merge_close_lists(queue) + # {{{ Broadcast necessary parts of geometry data to worker ranks + + global_geometry_data = 
broadcast_global_geometry_data( + comm, queue, traversal_builder, geo_data + ) - global_traversal = global_traversal.get(queue) + global_traversal = global_geometry_data.global_traversal + global_tree = global_traversal.tree + centers = global_geometry_data.centers + ncenters = len(centers[0]) + expansion_radii = global_geometry_data.expansion_radii + global_qbx_centers = global_geometry_data.global_qbx_centers + qbx_center_to_target_box = global_geometry_data.qbx_center_to_target_box + non_qbx_box_target_lists = global_geometry_data.non_qbx_box_target_lists + center_to_tree_targets = global_geometry_data.center_to_tree_targets + + if mpi_rank != 0: + boxes_time = np.empty(global_tree.nboxes, dtype=np.float64) + + comm.Bcast(boxes_time, root=0) # }}} @@ -663,7 +720,6 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): # no cached result found, construct a new distributed_geo_data if current_rank == 0: - from pytential.qbx.utils import ToHostTransferredGeoDataWrapper host_geo_data = ToHostTransferredGeoDataWrapper(queue, geo_data) if self.report_parameters: -- GitLab From 2b263e43f0b2384b9f3dfc4ecc4298e2d12b4f66 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 22 May 2020 09:58:53 -0500 Subject: [PATCH 83/86] Refactor local geometry data generation --- pytential/qbx/__init__.py | 3 +- pytential/qbx/distributed.py | 680 ++++++++++++++++---------------- pytential/symbolic/execution.py | 5 +- 3 files changed, 350 insertions(+), 338 deletions(-) diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py index 833305fc..48cb3e81 100644 --- a/pytential/qbx/__init__.py +++ b/pytential/qbx/__init__.py @@ -802,7 +802,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase): ) all_potentials_on_every_target, extra_outputs = fmm_driver( - queue, strengths, distributed_geo_data, comm=self.comm # noqa pylint:disable=no-member + queue, strengths, distributed_geo_data, wrangler, + comm=self.comm # noqa pylint:disable=no-member ) else: diff --git 
a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 216e358a..54454573 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -6,6 +6,7 @@ from boxtree.traversal import FMMTraversalBuilder from boxtree.distributed.partition import ResponsibleBoxesQuery from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.fmm import TimingRecorder +from boxtree.tools import DeviceDataRecord from mpi4py import MPI import numpy as np import pyopencl as cl @@ -20,16 +21,9 @@ logger = logging.getLogger(__name__) # TODO: remove unused tags MPITags = { - "non_qbx_box_target_lists": 0, - "global_qbx_centers": 1, - "centers": 2, - "dipole_vec": 3, - "expansion_radii": 4, - "qbx_center_to_target_box": 5, - "center_to_tree_targets": 6, - "qbx_targets": 7, - "non_qbx_potentials": 8, - "qbx_potentials": 9 + "dipole_vec": 0, + "non_qbx_potentials": 1, + "qbx_potentials": 2 } # }}} @@ -227,404 +221,419 @@ def broadcast_global_geometry_data( ) -class DistributedGeoData(object): - def __init__(self, geo_data, queue, global_wrangler, boxes_time, - comm=MPI.COMM_WORLD): - self.comm = comm - mpi_rank = comm.Get_rank() - - self.global_wrangler = global_wrangler - self.queue = queue - - start_time = time.time() +class LocalQBXFMMGeometryData(DeviceDataRecord): + def non_qbx_box_target_lists(self): + return self._non_qbx_box_target_lists - # {{{ traversal builder - - trav_param = None - if mpi_rank == 0: - trav_param = { - "well_sep_is_n_away": - geo_data.geo_data.code_getter.build_traversal.well_sep_is_n_away, - "from_sep_smaller_crit": - geo_data.geo_data.code_getter.build_traversal. - from_sep_smaller_crit, - "_from_sep_smaller_min_nsources_cumul": - geo_data.geo_data.lpot_source. - _from_sep_smaller_min_nsources_cumul - } - trav_param = comm.bcast(trav_param, root=0) - - # NOTE: The distributed implementation relies on building the same traversal - # objects as the global traversal object on root rank. 
This means here we - # should construct traversal objects using the same parameters as - # `QBXFMMGeometryData.traversal`. - - traversal_builder = QBXFMMGeometryDataTraversalBuilder( - queue.context, - well_sep_is_n_away=trav_param["well_sep_is_n_away"], - from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], - _from_sep_smaller_min_nsources_cumul=trav_param[ - "_from_sep_smaller_min_nsources_cumul" - ] - ) + def traversal(self): + return self.local_trav - # }}} + def tree(self): + return self.traversal().tree - # {{{ Broadcast necessary parts of geometry data to worker ranks + def centers(self): + return self._local_centers - global_geometry_data = broadcast_global_geometry_data( - comm, queue, traversal_builder, geo_data - ) + @property + def ncenters(self): + return self._local_centers.shape[1] - global_traversal = global_geometry_data.global_traversal - global_tree = global_traversal.tree - centers = global_geometry_data.centers - ncenters = len(centers[0]) - expansion_radii = global_geometry_data.expansion_radii - global_qbx_centers = global_geometry_data.global_qbx_centers - qbx_center_to_target_box = global_geometry_data.qbx_center_to_target_box - non_qbx_box_target_lists = global_geometry_data.non_qbx_box_target_lists - center_to_tree_targets = global_geometry_data.center_to_tree_targets + def global_qbx_centers(self): + return self._global_qbx_centers - if mpi_rank != 0: - boxes_time = np.empty(global_tree.nboxes, dtype=np.float64) + def expansion_radii(self): + return self._expansion_radii - comm.Bcast(boxes_time, root=0) + def qbx_center_to_target_box(self): + return self._local_qbx_center_to_target_box - # }}} + def center_to_tree_targets(self): + return self._local_center_to_tree_targets - from boxtree.distributed.partition import partition_work - responsible_boxes_list = partition_work( - boxes_time, global_traversal, comm.Get_size() - ) + def qbx_targets(self): + return self._qbx_targets - responsible_box_query = ResponsibleBoxesQuery(queue, 
global_traversal) + def qbx_center_to_target_box_source_level(self, source_level): + return self._qbx_center_to_target_box_source_level[source_level] - from boxtree.distributed.local_tree import generate_local_tree - self.local_tree, self.src_idx, self.tgt_idx = \ - generate_local_tree( - queue, global_tree, responsible_boxes_list, - responsible_box_query, comm=comm - ) + @memoize_method + def build_rotation_classes_lists(self): + with cl.CommandQueue(self.cl_context) as queue: + trav = self.traversal().to_device(queue) + tree = self.tree().to_device(queue) - self.src_idx_all_ranks = comm.gather(self.src_idx, root=0) - self.tgt_idx_all_ranks = comm.gather(self.tgt_idx, root=0) - - from boxtree.distributed.local_traversal import generate_local_travs - self.local_trav = generate_local_travs( - queue, self.local_tree, traversal_builder, - box_bounding_box={ - "min": global_traversal.box_target_bounding_box_min, - "max": global_traversal.box_target_bounding_box_max - }, - # TODO: get whether to merge close lists from root instead of - # hard-coding? 
- merge_close_lists=True - ) + from boxtree.rotation_classes import RotationClassesBuilder + return RotationClassesBuilder(self.cl_context)( + queue, trav, tree)[0].get(queue) - # {{{ Form non_qbx_box_target_lists + def eval_qbx_targets(self): + return self.qbx_targets() - from boxtree.distributed.local_tree import get_fetch_local_particles_knls - knls = get_fetch_local_particles_knls(queue.context, global_tree) + @memoize_method + def m2l_rotation_lists(self): + return self.build_rotation_classes_lists().from_sep_siblings_rotation_classes - box_target_starts = cl.array.to_device( - queue, non_qbx_box_target_lists.box_target_starts - ) - box_target_counts_nonchild = cl.array.to_device( - queue, non_qbx_box_target_lists.box_target_counts_nonchild - ) - nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets - targets = non_qbx_box_target_lists.targets + @memoize_method + def m2l_rotation_angles(self): + return (self + .build_rotation_classes_lists() + .from_sep_siblings_rotation_class_to_angle) - particle_mask = cl.array.zeros( - queue, (nfiltered_targets,), dtype=global_tree.particle_id_dtype - ) - responsible_boxes_mask = np.zeros(global_tree.nboxes, dtype=np.int8) - responsible_boxes_mask[responsible_boxes_list[mpi_rank]] = 1 - responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask) +def compute_local_geometry_data( + queue, comm, global_geometry_data, boxes_time, traversal_builder): + mpi_rank = comm.Get_rank() - knls.particle_mask_knl( - responsible_boxes_mask, - box_target_starts, - box_target_counts_nonchild, - particle_mask - ) + global_traversal = global_geometry_data.global_traversal + global_tree = global_traversal.tree + centers = global_geometry_data.centers + ncenters = len(centers[0]) + expansion_radii = global_geometry_data.expansion_radii + global_qbx_centers = global_geometry_data.global_qbx_centers + qbx_center_to_target_box = global_geometry_data.qbx_center_to_target_box + non_qbx_box_target_lists = 
global_geometry_data.non_qbx_box_target_lists + center_to_tree_targets = global_geometry_data.center_to_tree_targets + + # {{{ Generate local tree and local traversal + + from boxtree.distributed.partition import partition_work + responsible_boxes_list = partition_work( + boxes_time, global_traversal, comm.Get_size() + ) - particle_scan = cl.array.empty( - queue, (nfiltered_targets + 1,), - dtype=global_tree.particle_id_dtype - ) - particle_scan[0] = 0 - knls.mask_scan_knl(particle_mask, particle_scan) + responsible_box_query = ResponsibleBoxesQuery(queue, global_traversal) - local_box_target_starts = cl.array.empty( - queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype - ) - knls.generate_box_particle_starts( - box_target_starts, particle_scan, - local_box_target_starts - ) + from boxtree.distributed.local_tree import generate_local_tree + local_tree, src_idx, tgt_idx = generate_local_tree( + queue, global_tree, responsible_boxes_list, responsible_box_query, comm=comm + ) - local_box_target_counts_nonchild = cl.array.zeros( - queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype - ) - knls.generate_box_particle_counts_nonchild( - responsible_boxes_mask, - box_target_counts_nonchild, - local_box_target_counts_nonchild - ) + src_idx_all_ranks = comm.gather(src_idx, root=0) + tgt_idx_all_ranks = comm.gather(tgt_idx, root=0) + + from boxtree.distributed.local_traversal import generate_local_travs + local_trav = generate_local_travs( + queue, local_tree, traversal_builder, + box_bounding_box={ + "min": global_traversal.box_target_bounding_box_min, + "max": global_traversal.box_target_bounding_box_max + }, + # TODO: get whether to merge close lists from root instead of + # hard-coding? 
+ merge_close_lists=True + ) - local_nfiltered_targets = particle_scan[-1].get(queue) + # }}} - particle_mask = particle_mask.get().astype(bool) - self.particle_mask = comm.gather(particle_mask, root=0) - local_targets = np.empty((global_tree.dimensions,), dtype=object) - for idimension in range(global_tree.dimensions): - local_targets[idimension] = targets[idimension][particle_mask] + # {{{ Form non_qbx_box_target_lists - local_non_qbx_box_target_lists = { - "nfiltered_targets": local_nfiltered_targets, - "box_target_starts": local_box_target_starts.get(), - "box_target_counts_nonchild": - local_box_target_counts_nonchild.get(), - "targets": local_targets - } + from boxtree.distributed.local_tree import get_fetch_local_particles_knls + knls = get_fetch_local_particles_knls(queue.context, global_tree) - self._non_qbx_box_target_lists = FilteredTargetListsInTreeOrder( - nfiltered_targets=local_non_qbx_box_target_lists["nfiltered_targets"], - box_target_starts=local_non_qbx_box_target_lists["box_target_starts"], - box_target_counts_nonchild=local_non_qbx_box_target_lists[ - "box_target_counts_nonchild"], - targets=local_non_qbx_box_target_lists["targets"], - unfiltered_from_filtered_target_indices=None - ) + box_target_starts = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_starts + ) + box_target_counts_nonchild = cl.array.to_device( + queue, non_qbx_box_target_lists.box_target_counts_nonchild + ) + nfiltered_targets = non_qbx_box_target_lists.nfiltered_targets + targets = non_qbx_box_target_lists.targets - # }}} + particle_mask = cl.array.zeros( + queue, (nfiltered_targets,), dtype=global_tree.particle_id_dtype + ) - # {{{ Compute other useful fields of local geo_data + responsible_boxes_mask = np.zeros(global_tree.nboxes, dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list[mpi_rank]] = 1 + responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask) - tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) - 
tgt_mask[self.tgt_idx] = True + knls.particle_mask_knl( + responsible_boxes_mask, + box_target_starts, + box_target_counts_nonchild, + particle_mask + ) - tgt_mask_user_order = tgt_mask[global_tree.sorted_target_ids] - centers_mask = tgt_mask_user_order[:ncenters] - centers_scan = np.empty( - (ncenters + 1,), dtype=global_tree.particle_id_dtype) - centers_scan[1:] = np.cumsum( - centers_mask.astype(global_tree.particle_id_dtype)) - centers_scan[0] = 0 + particle_scan = cl.array.empty( + queue, (nfiltered_targets + 1,), + dtype=global_tree.particle_id_dtype + ) + particle_scan[0] = 0 + knls.mask_scan_knl(particle_mask, particle_scan) - # {{{ local centers + local_box_target_starts = cl.array.empty( + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) + knls.generate_box_particle_starts( + box_target_starts, particle_scan, + local_box_target_starts + ) - nlocal_centers = np.sum(centers_mask.astype(np.int32)) - centers_dims = centers.shape[0] - local_centers = np.empty( - (centers_dims, nlocal_centers), dtype=centers[0].dtype - ) - for idims in range(centers_dims): - local_centers[idims, :] = centers[idims][centers_mask] + local_box_target_counts_nonchild = cl.array.zeros( + queue, (global_tree.nboxes,), dtype=global_tree.particle_id_dtype + ) + knls.generate_box_particle_counts_nonchild( + responsible_boxes_mask, + box_target_counts_nonchild, + local_box_target_counts_nonchild + ) - # }}} + local_nfiltered_targets = particle_scan[-1].get(queue) + + particle_mask = particle_mask.get().astype(bool) + particle_mask_all_ranks = comm.gather(particle_mask, root=0) + local_targets = np.empty((global_tree.dimensions,), dtype=object) + for idimension in range(global_tree.dimensions): + local_targets[idimension] = targets[idimension][particle_mask] + + local_non_qbx_box_target_lists = { + "nfiltered_targets": local_nfiltered_targets, + "box_target_starts": local_box_target_starts.get(), + "box_target_counts_nonchild": + 
local_box_target_counts_nonchild.get(), + "targets": local_targets + } + + non_qbx_box_target_lists = FilteredTargetListsInTreeOrder( + nfiltered_targets=local_non_qbx_box_target_lists["nfiltered_targets"], + box_target_starts=local_non_qbx_box_target_lists["box_target_starts"], + box_target_counts_nonchild=local_non_qbx_box_target_lists[ + "box_target_counts_nonchild"], + targets=local_non_qbx_box_target_lists["targets"], + unfiltered_from_filtered_target_indices=None + ) - # {{{ local global_qbx_centers + # }}} - local_global_qbx_centers = centers_scan[ - global_qbx_centers[centers_mask[global_qbx_centers]] - ] + tgt_mask = np.zeros((global_tree.ntargets,), dtype=bool) + tgt_mask[tgt_idx] = True - # }}} + tgt_mask_user_order = tgt_mask[global_tree.sorted_target_ids] + centers_mask = tgt_mask_user_order[:ncenters] + centers_scan = np.empty( + (ncenters + 1,), dtype=global_tree.particle_id_dtype) + centers_scan[1:] = np.cumsum( + centers_mask.astype(global_tree.particle_id_dtype)) + centers_scan[0] = 0 - # {{{ local expansion_radii + # {{{ local centers - local_expansion_radii = expansion_radii[centers_mask] + nlocal_centers = np.sum(centers_mask.astype(np.int32)) + centers_dims = centers.shape[0] + local_centers = np.empty( + (centers_dims, nlocal_centers), dtype=centers[0].dtype + ) + for idims in range(centers_dims): + local_centers[idims, :] = centers[idims][centers_mask] - # }}} + # }}} - # {{{ local qbx_center_to_target_box + # {{{ local global_qbx_centers - # Transform local qbx_center_to_target_box to global indexing - local_qbx_center_to_target_box = global_traversal.target_boxes[ - qbx_center_to_target_box[centers_mask] - ] + local_global_qbx_centers = centers_scan[ + global_qbx_centers[centers_mask[global_qbx_centers]] + ] - # Transform local_qbx_center_to_target_box to local target_boxes indexing - global_boxes_to_target_boxes = np.ones( - (self.local_tree.nboxes,), dtype=self.local_tree.particle_id_dtype - ) - # make sure accessing invalid position 
raises an error - global_boxes_to_target_boxes *= -1 - global_boxes_to_target_boxes[self.local_trav.target_boxes] = \ - np.arange(self.local_trav.target_boxes.shape[0]) - self._local_qbx_center_to_target_box = \ - global_boxes_to_target_boxes[local_qbx_center_to_target_box] + # }}} - # }}} + # {{{ local expansion_radii - # {{{ local_qbx_targets and local center_to_tree_targets + local_expansion_radii = expansion_radii[centers_mask] - starts = center_to_tree_targets.starts - lists = center_to_tree_targets.lists - local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) - local_lists = np.empty(lists.shape, dtype=lists.dtype) + # }}} - qbx_target_mask = np.zeros((global_tree.ntargets,), dtype=bool) - current_start = 0 # index into local_lists - ilocal_center = 0 - local_starts[0] = 0 + # {{{ local qbx_center_to_target_box - for icenter in range(ncenters): - # skip the current center if the current rank is not responsible for - # processing it - if not centers_mask[icenter]: - continue + # Transform local qbx_center_to_target_box to global indexing + local_qbx_center_to_target_box = global_traversal.target_boxes[ + qbx_center_to_target_box[centers_mask] + ] - current_center_targets = lists[ - starts[icenter]:starts[icenter + 1]] - qbx_target_mask[current_center_targets] = True - current_stop = \ - current_start + starts[icenter + 1] - starts[icenter] - local_starts[ilocal_center + 1] = current_stop - local_lists[current_start:current_stop] = \ - lists[starts[icenter]:starts[icenter + 1]] + # Transform local_qbx_center_to_target_box to local target_boxes indexing + global_boxes_to_target_boxes = np.ones( + (local_tree.nboxes,), dtype=local_tree.particle_id_dtype + ) + # make sure accessing invalid position raises an error + global_boxes_to_target_boxes *= -1 + global_boxes_to_target_boxes[local_trav.target_boxes] = \ + np.arange(local_trav.target_boxes.shape[0]) + local_qbx_center_to_target_box = \ + 
global_boxes_to_target_boxes[local_qbx_center_to_target_box] - current_start = current_stop - ilocal_center += 1 + # }}} - self.qbx_target_mask = comm.gather(qbx_target_mask, root=0) + # {{{ local_qbx_targets and local center_to_tree_targets - local_lists = local_lists[:current_start] + starts = center_to_tree_targets.starts + lists = center_to_tree_targets.lists + local_starts = np.empty((nlocal_centers + 1,), dtype=starts.dtype) + local_lists = np.empty(lists.shape, dtype=lists.dtype) - qbx_target_scan = np.empty( - (global_tree.ntargets + 1,), dtype=lists.dtype - ) - qbx_target_scan[0] = 0 - qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) - nlocal_qbx_target = qbx_target_scan[-1] + qbx_target_mask = np.zeros((global_tree.ntargets,), dtype=bool) + current_start = 0 # index into local_lists + ilocal_center = 0 + local_starts[0] = 0 - local_qbx_targets = np.empty( - (global_tree.dimensions, nlocal_qbx_target), - dtype=global_tree.targets[0].dtype - ) - for idim in range(global_tree.dimensions): - local_qbx_targets[idim, :] = global_tree.targets[idim][qbx_target_mask] + for icenter in range(ncenters): + # skip the current center if the current rank is not responsible for + # processing it + if not centers_mask[icenter]: + continue - local_lists = qbx_target_scan[local_lists] - local_center_to_tree_targets = { - "starts": local_starts, - "lists": local_lists - } + current_center_targets = lists[ + starts[icenter]:starts[icenter + 1]] + qbx_target_mask[current_center_targets] = True + current_stop = \ + current_start + starts[icenter + 1] - starts[icenter] + local_starts[ilocal_center + 1] = current_stop + local_lists[current_start:current_stop] = \ + lists[starts[icenter]:starts[icenter + 1]] - # }}} + current_start = current_stop + ilocal_center += 1 - self._local_centers = local_centers - self._global_qbx_centers = local_global_qbx_centers - self._expansion_radii = local_expansion_radii - self._qbx_targets = local_qbx_targets + 
qbx_target_mask_all_ranks = comm.gather(qbx_target_mask, root=0) - from pytential.qbx.geometry import CenterToTargetList - self._local_center_to_tree_targets = CenterToTargetList( - starts=local_center_to_tree_targets["starts"], - lists=local_center_to_tree_targets["lists"] - ) + local_lists = local_lists[:current_start] - logger.info("Form local geometry data on rank {} in {} secs.".format( - mpi_rank, time.time() - start_time) - ) + qbx_target_scan = np.empty( + (global_tree.ntargets + 1,), dtype=lists.dtype + ) + qbx_target_scan[0] = 0 + qbx_target_scan[1:] = np.cumsum(qbx_target_mask.astype(lists.dtype)) + nlocal_qbx_target = qbx_target_scan[-1] - # }}} + local_qbx_targets = np.empty( + (global_tree.dimensions, nlocal_qbx_target), + dtype=global_tree.targets[0].dtype + ) + for idim in range(global_tree.dimensions): + local_qbx_targets[idim, :] = global_tree.targets[idim][qbx_target_mask] - # {{{ Construct qbx_center_to_target_box_source_level + local_lists = qbx_target_scan[local_lists] + local_center_to_tree_targets = { + "starts": local_starts, + "lists": local_lists + } - # This is modified from pytential.geometry.QBXFMMGeometryData. - # qbx_center_to_target_box_source_level but on host using Numpy instead of - # PyOpenCL. 
+ # }}} - traversal = self.traversal() - qbx_center_to_target_box = self.qbx_center_to_target_box() - tree = traversal.tree + from pytential.qbx.geometry import CenterToTargetList + local_center_to_tree_targets = CenterToTargetList( + starts=local_center_to_tree_targets["starts"], + lists=local_center_to_tree_targets["lists"] + ) - self._qbx_center_to_target_box_source_level = np.empty( - (tree.nlevels,), dtype=object) + # }}} - for source_level in range(tree.nlevels): - sep_smaller = traversal.from_sep_smaller_by_level[source_level] + # {{{ Construct qbx_center_to_target_box_source_level - target_box_to_target_box_source_level = np.empty( - len(traversal.target_boxes), - dtype=tree.box_id_dtype - ) - target_box_to_target_box_source_level.fill(-1) - target_box_to_target_box_source_level[sep_smaller.nonempty_indices] = ( - np.arange(sep_smaller.num_nonempty_lists, - dtype=tree.box_id_dtype) - ) + # This is modified from pytential.geometry.QBXFMMGeometryData. + # qbx_center_to_target_box_source_level but on host using Numpy instead of + # PyOpenCL. 
- self._qbx_center_to_target_box_source_level[source_level] = ( - target_box_to_target_box_source_level[ - qbx_center_to_target_box - ] - ) + tree = local_trav.tree - # }}} + qbx_center_to_target_box_source_level = np.empty( + (tree.nlevels,), dtype=object) - def non_qbx_box_target_lists(self): - return self._non_qbx_box_target_lists + for source_level in range(tree.nlevels): + sep_smaller = local_trav.from_sep_smaller_by_level[source_level] - def traversal(self): - return self.local_trav + target_box_to_target_box_source_level = np.empty( + len(local_trav.target_boxes), + dtype=tree.box_id_dtype + ) + target_box_to_target_box_source_level.fill(-1) + target_box_to_target_box_source_level[sep_smaller.nonempty_indices] = ( + np.arange(sep_smaller.num_nonempty_lists, + dtype=tree.box_id_dtype) + ) - def tree(self): - return self.traversal().tree + qbx_center_to_target_box_source_level[source_level] = ( + target_box_to_target_box_source_level[ + local_qbx_center_to_target_box + ] + ) - def centers(self): - return self._local_centers + # }}} - @property - def ncenters(self): - return self._local_centers.shape[1] + return LocalQBXFMMGeometryData( + cl_context=queue.context, + local_tree=local_tree, + local_trav=local_trav, + _local_centers=local_centers, + _global_qbx_centers=local_global_qbx_centers, + src_idx=src_idx, + tgt_idx=tgt_idx, + src_idx_all_ranks=src_idx_all_ranks, + tgt_idx_all_ranks=tgt_idx_all_ranks, + particle_mask=particle_mask_all_ranks, + qbx_target_mask=qbx_target_mask_all_ranks, + _non_qbx_box_target_lists=non_qbx_box_target_lists, + _local_qbx_center_to_target_box=local_qbx_center_to_target_box, + _expansion_radii=local_expansion_radii, + _qbx_targets=local_qbx_targets, + _local_center_to_tree_targets=local_center_to_tree_targets, + _qbx_center_to_target_box_source_level=( + qbx_center_to_target_box_source_level) + ) - def global_qbx_centers(self): - return self._global_qbx_centers - def expansion_radii(self): - return self._expansion_radii +def 
distribute_geo_data(geo_data, queue, boxes_time, comm=MPI.COMM_WORLD): + mpi_rank = comm.Get_rank() - def qbx_center_to_target_box(self): - return self._local_qbx_center_to_target_box + # {{{ traversal builder - def center_to_tree_targets(self): - return self._local_center_to_tree_targets + trav_param = None + if mpi_rank == 0: + trav_param = { + "well_sep_is_n_away": + geo_data.geo_data.code_getter.build_traversal.well_sep_is_n_away, + "from_sep_smaller_crit": + geo_data.geo_data.code_getter.build_traversal. + from_sep_smaller_crit, + "_from_sep_smaller_min_nsources_cumul": + geo_data.geo_data.lpot_source. + _from_sep_smaller_min_nsources_cumul + } + trav_param = comm.bcast(trav_param, root=0) + + # NOTE: The distributed implementation relies on building the same traversal + # objects as the global traversal object of the root rank. This means here + # the traversal builder should use the same parameter as + # `QBXFMMGeometryData.traversal`. + + traversal_builder = QBXFMMGeometryDataTraversalBuilder( + queue.context, + well_sep_is_n_away=trav_param["well_sep_is_n_away"], + from_sep_smaller_crit=trav_param["from_sep_smaller_crit"], + _from_sep_smaller_min_nsources_cumul=trav_param[ + "_from_sep_smaller_min_nsources_cumul" + ] + ) - def qbx_targets(self): - return self._qbx_targets + # }}} - def qbx_center_to_target_box_source_level(self, source_level): - return self._qbx_center_to_target_box_source_level[source_level] + # {{{ Broadcast necessary parts of geometry data to worker ranks - @memoize_method - def build_rotation_classes_lists(self): - trav = self.traversal().to_device(self.queue) - tree = self.tree().to_device(self.queue) + global_geometry_data = broadcast_global_geometry_data( + comm, queue, traversal_builder, geo_data + ) - from boxtree.rotation_classes import RotationClassesBuilder - return RotationClassesBuilder(self.queue.context)( - self.queue, trav, tree)[0].get(self.queue) + if mpi_rank != 0: + nboxes = 
global_geometry_data.global_traversal.tree.nboxes + boxes_time = np.empty(nboxes, dtype=np.float64) - def eval_qbx_targets(self): - return self.qbx_targets() + comm.Bcast(boxes_time, root=0) - @memoize_method - def m2l_rotation_lists(self): - return self.build_rotation_classes_lists().from_sep_siblings_rotation_classes + # }}} - @memoize_method - def m2l_rotation_angles(self): - return (self - .build_rotation_classes_lists() - .from_sep_siblings_rotation_class_to_angle) + local_geo_data = compute_local_geometry_data( + queue, comm, global_geometry_data, boxes_time, traversal_builder + ) -# }}} + return local_geo_data class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): @@ -741,13 +750,13 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): print(table) - distributed_geo_data = DistributedGeoData( - host_geo_data, queue, wrangler, boxes_time, comm=self.comm + distributed_geo_data = distribute_geo_data( + host_geo_data, queue, boxes_time, comm=self.comm ) else: - distributed_geo_data = DistributedGeoData( - None, queue, None, None, self.comm + distributed_geo_data = distribute_geo_data( + None, queue, None, self.comm ) # self.distributed_geo_data_cache[geo_data_id] = distributed_geo_data @@ -771,19 +780,20 @@ def add_dicts(dict1, dict2): return rtv -def drive_dfmm(queue, src_weights, distributed_geo_data, comm=MPI.COMM_WORLD, - timing_data=None, _communicate_mpoles_via_allreduce=False): +def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, + comm=MPI.COMM_WORLD, + timing_data=None, + _communicate_mpoles_via_allreduce=False): current_rank = comm.Get_rank() total_rank = comm.Get_size() - global_wrangler = distributed_geo_data.global_wrangler if current_rank == 0: start_time = time.time() - distributed_wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( - queue, global_wrangler, distributed_geo_data) - wrangler = distributed_wrangler + wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( + queue, 
global_wrangler, distributed_geo_data + ) local_traversal = distributed_geo_data.local_trav diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index 2c5abe5c..45d320e3 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -363,9 +363,10 @@ class DistributedEvaluationMapper(EvaluationMapper): from pytential.qbx.distributed import drive_dfmm weights = None + global_wrangler = None result = drive_dfmm( - queue, weights, distribute_geo_data, comm=self.comm, - timing_data=timing_data + queue, weights, distribute_geo_data, global_wrangler, + comm=self.comm, timing_data=timing_data ) if return_timing_data: -- GitLab From 98a2768b6b48c3c136359cca16d26b96f5ad1647 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 28 May 2020 18:34:57 -0500 Subject: [PATCH 84/86] Support distributed implementation with sumpy --- pytential/qbx/distributed.py | 260 ++++++++++++++++++++++++++--------- pytential/qbx/fmm.py | 9 +- pytential/qbx/geometry.py | 3 + pytential/qbx/utils.py | 37 +++++ test/test_distributed.py | 52 ++++--- 5 files changed, 273 insertions(+), 88 deletions(-) diff --git a/pytential/qbx/distributed.py b/pytential/qbx/distributed.py index 54454573..2b430c52 100644 --- a/pytential/qbx/distributed.py +++ b/pytential/qbx/distributed.py @@ -1,9 +1,11 @@ from pytential.qbx.fmmlib import QBXFMMLibExpansionWrangler from pytential.qbx import QBXLayerPotentialSource from pytential.qbx.utils import ToHostTransferredGeoDataWrapper +from pytential.qbx.fmm import QBXExpansionWrangler from boxtree.tree import FilteredTargetListsInTreeOrder from boxtree.traversal import FMMTraversalBuilder from boxtree.distributed.partition import ResponsibleBoxesQuery +from boxtree.distributed.calculation import DistributedExpansionWrangler from boxtree.distributed.calculation import DistributedFMMLibExpansionWrangler from boxtree.fmm import TimingRecorder from boxtree.tools import DeviceDataRecord @@ -14,6 +16,7 @@ from collections import 
namedtuple import logging import time from pytools import memoize_method +from pytools.obj_array import with_object_array_or_scalar logger = logging.getLogger(__name__) @@ -31,30 +34,23 @@ MPITags = { # {{{ Expansion Wrangler -class QBXDistributedFMMLibExpansionWrangler( +class DistribtuedQBXFMMLibExpansionWrangler( QBXFMMLibExpansionWrangler, DistributedFMMLibExpansionWrangler): @classmethod - def distribute(cls, queue, wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): - current_rank = comm.Get_rank() - total_rank = comm.Get_size() + def distribute_wrangler( + cls, queue, global_wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): + mpi_rank = comm.Get_rank() - if wrangler is not None: # master process + if mpi_rank == 0: import copy - distributed_wrangler = copy.copy(wrangler) + distributed_wrangler = copy.copy(global_wrangler) distributed_wrangler.queue = None distributed_wrangler.geo_data = None distributed_wrangler.rotation_data = None distributed_wrangler.code = None distributed_wrangler.tree = None distributed_wrangler.__class__ = cls - - # Use bool to represent whether dipole_vec needs to be distributed - if wrangler.dipole_vec is not None: - distributed_wrangler.dipole_vec = True - else: - distributed_wrangler.dipole_vec = False - else: # worker process distributed_wrangler = None @@ -62,51 +58,130 @@ class QBXDistributedFMMLibExpansionWrangler( distributed_wrangler.tree = distributed_geo_data.local_tree distributed_wrangler.geo_data = distributed_geo_data distributed_wrangler.rotation_data = distributed_geo_data + distributed_wrangler.queue = queue - # {{{ Distribute dipole_vec + # {{{ Compute local dipole_vec from the global one - if distributed_wrangler.dipole_vec: + if distributed_wrangler.dipole_vec is not None: + src_idx = distributed_geo_data.src_idx + local_dipole_vec = distributed_wrangler.dipole_vec[:, src_idx] + distributed_wrangler.dipole_vec = local_dipole_vec - if current_rank == 0: - reqs_dipole_vec = [] - local_dipole_vec = 
np.empty((total_rank,), dtype=object) + # }}} - for irank in range(total_rank): - src_idx = distributed_geo_data.src_idx_all_ranks[irank] + return distributed_wrangler - local_dipole_vec[irank] = wrangler.dipole_vec[:, src_idx] + def eval_qbx_output_zeros(self): + from pytools.obj_array import make_obj_array + ctt = self.geo_data.center_to_tree_targets() + output = make_obj_array([np.zeros(len(ctt.lists), self.dtype) + for k in self.outputs]) + return output - if irank != 0: - reqs_dipole_vec.append( - comm.isend( - local_dipole_vec[irank], - dest=irank, - tag=MPITags["dipole_vec"] - ) - ) - MPI.Request.Waitall(reqs_dipole_vec) +class DistribtuedQBXSumpyExpansionWrangler( + QBXExpansionWrangler, DistributedExpansionWrangler): + @classmethod + def distribute_wrangler( + cls, queue, global_wrangler, distributed_geo_data, comm=MPI.COMM_WORLD): + mpi_rank = comm.Get_rank() - distributed_wrangler.dipole_vec = local_dipole_vec[0] - else: - distributed_wrangler.dipole_vec = comm.recv( - source=0, tag=MPITags["dipole_vec"]) + if mpi_rank == 0: + import copy + distributed_wrangler = copy.copy(global_wrangler) + distributed_wrangler.code._cl_context = None + distributed_wrangler.code.cl_context = None - else: - distributed_wrangler.dipole_vec = None + distributed_wrangler.queue = None + distributed_wrangler.geo_data = None + distributed_wrangler.rotation_data = None + distributed_wrangler.tree = None - # }}} + from pytential.qbx.utils import sumpy_wrangler_extra_kwargs_to_host + distributed_wrangler.source_extra_kwargs = \ + sumpy_wrangler_extra_kwargs_to_host( + distributed_wrangler.source_extra_kwargs, queue + ) + + distributed_wrangler.extra_kwargs = \ + sumpy_wrangler_extra_kwargs_to_host( + distributed_wrangler.extra_kwargs, queue + ) + + distributed_wrangler.__class__ = cls + else: # worker process + distributed_wrangler = None + distributed_wrangler = comm.bcast(distributed_wrangler, root=0) + distributed_wrangler.tree = distributed_geo_data.local_tree + 
distributed_wrangler.geo_data = distributed_geo_data + # distributed_wrangler.rotation_data = distributed_geo_data distributed_wrangler.queue = queue + distributed_wrangler.code.cl_context = queue.context + + # {{{ compute local dsource_vec + + if "dsource_vec" in distributed_wrangler.source_extra_kwargs: + dsource_vec = distributed_wrangler.source_extra_kwargs["dsource_vec"] + for idim in range(len(dsource_vec)): + dsource_vec[idim] = dsource_vec[idim][distributed_geo_data.src_idx] + + if "dsource_vec" in distributed_wrangler.extra_kwargs: + dsource_vec = distributed_wrangler.extra_kwargs["dsource_vec"] + for idim in range(len(dsource_vec)): + dsource_vec[idim] = dsource_vec[idim][distributed_geo_data.src_idx] + + # }}} + + from pytential.qbx.utils import sumpy_wrangler_extra_kwargs_to_device + distributed_wrangler.source_extra_kwargs = \ + sumpy_wrangler_extra_kwargs_to_device( + distributed_wrangler.source_extra_kwargs, queue + ) + + distributed_wrangler.extra_kwargs = \ + sumpy_wrangler_extra_kwargs_to_device( + distributed_wrangler.extra_kwargs, queue + ) return distributed_wrangler + def distribute_source_weights( + self, source_weights, src_idx_all_ranks, comm=MPI.COMM_WORLD): + """ This method transfers needed source_weights from root process to each + worker process in communicator *comm*. + + This method needs to be called collectively by all processes in *comm*. + + :arg source_weights: Source weights in tree order on root, None on worker + processes. + :arg src_idx_all_ranks: Returned from *generate_local_tree*. None on worker + processes. + :arg comm: MPI communicator. + :return Source weights needed for the current process. 
+ """ + mpi_rank = comm.Get_rank() + + if mpi_rank == 0: + source_weights = source_weights.get(self.queue) + + local_source_weights = super( + DistribtuedQBXSumpyExpansionWrangler, self + ).distribute_source_weights( + source_weights, src_idx_all_ranks, comm=comm + ) + + return cl.array.to_device(self.queue, local_source_weights) + def eval_qbx_output_zeros(self): from pytools.obj_array import make_obj_array ctt = self.geo_data.center_to_tree_targets() - output = make_obj_array([np.zeros(len(ctt.lists), self.dtype) - for k in self.outputs]) - return output + return make_obj_array([ + cl.array.zeros( + self.queue, + len(ctt.lists), + dtype=self.dtype) + for k in self.code.out_kernels]) # }}} @@ -378,7 +453,7 @@ def compute_local_geometry_data( local_box_target_counts_nonchild ) - local_nfiltered_targets = particle_scan[-1].get(queue) + local_nfiltered_targets = int(particle_scan[-1].get(queue)) particle_mask = particle_mask.get().astype(bool) particle_mask_all_ranks = comm.gather(particle_mask, root=0) @@ -480,11 +555,9 @@ def compute_local_geometry_data( if not centers_mask[icenter]: continue - current_center_targets = lists[ - starts[icenter]:starts[icenter + 1]] + current_center_targets = lists[starts[icenter]:starts[icenter + 1]] qbx_target_mask[current_center_targets] = True - current_stop = \ - current_start + starts[icenter + 1] - starts[icenter] + current_stop = current_start + starts[icenter + 1] - starts[icenter] local_starts[ilocal_center + 1] = current_stop local_lists[current_start:current_stop] = \ lists[starts[icenter]:starts[icenter + 1]] @@ -644,13 +717,9 @@ class DistributedQBXLayerPotentialSource(QBXLayerPotentialSource): self.comm = comm current_rank = comm.Get_rank() - # process fmm backend argument + # set fmmlib as default fmm backend if "fmm_backend" not in kwargs: kwargs["fmm_backend"] = "fmmlib" - elif kwargs["fmm_backend"] != "fmmlib": - raise NotImplementedError( - "Currently the distributed implementation only works with fmmlib" - ) # 
"_from_sep_smaller_min_nsources_cumul" will be forced to 0 for distributed # implementation. If not, the potential contribution of a list 3 box might be @@ -791,20 +860,35 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, if current_rank == 0: start_time = time.time() - wrangler = QBXDistributedFMMLibExpansionWrangler.distribute( - queue, global_wrangler, distributed_geo_data + cls = None + if current_rank == 0: + fmm_backend = global_wrangler.geo_data.lpot_source.fmm_backend + if fmm_backend == "sumpy": + cls = DistribtuedQBXSumpyExpansionWrangler + elif fmm_backend == "fmmlib": + cls = DistribtuedQBXFMMLibExpansionWrangler + else: + raise RuntimeError("Unknown fmm backend") + cls = comm.bcast(cls, root=0) + + wrangler = cls.distribute_wrangler( + queue, global_wrangler, distributed_geo_data, comm=comm ) - local_traversal = distributed_geo_data.local_trav + local_traversal_host = distributed_geo_data.local_trav + + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + local_traversal = \ + local_traversal_host.copy().to_device(queue).with_queue(queue) + else: + local_traversal = local_traversal_host # {{{ Distribute source weights if current_rank == 0: src_weights = global_wrangler.reorder_sources(src_weights) - from boxtree.distributed.calculation import distribute_source_weights - - local_source_weights = distribute_source_weights( + local_source_weights = wrangler.distribute_source_weights( src_weights, distributed_geo_data.src_idx_all_ranks, comm=comm ) @@ -840,12 +924,22 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, comm_start_time = time.time() + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + mpole_exps_host = mpole_exps.get(queue=queue) + else: + mpole_exps_host = mpole_exps + if _communicate_mpoles_via_allreduce: - mpole_exps_all = np.zeros_like(mpole_exps) - comm.Allreduce(mpole_exps, mpole_exps_all) - mpole_exps = mpole_exps_all + mpole_exps_all = 
np.zeros_like(mpole_exps_host) + comm.Allreduce(mpole_exps_host, mpole_exps_all) + mpole_exps_host = mpole_exps_all + else: + communicate_mpoles(wrangler, comm, local_traversal_host, mpole_exps_host) + + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + mpole_exps[:] = cl.array.to_device(queue, mpole_exps_host) else: - communicate_mpoles(wrangler, comm, local_traversal, mpole_exps) + mpole_exps = mpole_exps_host from boxtree.tools import DummyTimingFuture timing_future = DummyTimingFuture(wall_elapsed=(time.time() - comm_start_time)) @@ -979,6 +1073,12 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, # }}} + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + qbx_potentials = \ + with_object_array_or_scalar(lambda x: x.get(queue), qbx_potentials) + non_qbx_potentials = \ + with_object_array_or_scalar(lambda x: x.get(queue), non_qbx_potentials) + if current_rank != 0: # worker process comm.send(non_qbx_potentials, dest=0, tag=MPITags["non_qbx_potentials"]) comm.send(qbx_potentials, dest=0, tag=MPITags["qbx_potentials"]) @@ -988,14 +1088,14 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, merge_start_time = time.time() + ndims = len(non_qbx_potentials) all_potentials_in_tree_order = global_wrangler.full_output_zeros() - nqbtl = global_wrangler.geo_data.non_qbx_box_target_lists() from pytools.obj_array import make_obj_array non_qbx_potentials_all_rank = make_obj_array([ np.zeros(nqbtl.nfiltered_targets, global_wrangler.dtype) - for k in global_wrangler.outputs] + for _ in range(ndims)] ) for irank in range(total_rank): @@ -1006,11 +1106,16 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, non_qbx_potentials_cur_rank = comm.recv( source=irank, tag=MPITags["non_qbx_potentials"]) - for idim in range(len(global_wrangler.outputs)): + for idim in range(ndims): non_qbx_potentials_all_rank[idim][ distributed_geo_data.particle_mask[irank] ] = 
non_qbx_potentials_cur_rank[idim] + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + non_qbx_potentials_all_rank = with_object_array_or_scalar( + lambda x: cl.array.to_device(queue, x), non_qbx_potentials_all_rank + ) + for ap_i, nqp_i in zip( all_potentials_in_tree_order, non_qbx_potentials_all_rank): ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i @@ -1024,10 +1129,30 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, source=irank, tag=MPITags["qbx_potentials"] ) - for idim in range(len(global_wrangler.outputs)): - all_potentials_in_tree_order[idim][ - distributed_geo_data.qbx_target_mask[irank] - ] += qbx_potentials_cur_rank[idim] + qbx_target_mask = distributed_geo_data.qbx_target_mask[irank] + + if isinstance(wrangler, DistribtuedQBXSumpyExpansionWrangler): + qbx_potentials_cur_rank = with_object_array_or_scalar( + lambda x: cl.array.to_device(queue, x), + qbx_potentials_cur_rank + ) + + qbx_target_idx = np.arange( + len(qbx_target_mask), + dtype=local_traversal.tree.particle_id_dtype + ) + qbx_target_idx = qbx_target_idx[qbx_target_mask] + qbx_target_idx = cl.array.to_device(queue, qbx_target_idx) + + for idim in range(ndims): + all_potentials_in_tree_order[idim][qbx_target_idx] += ( + qbx_potentials_cur_rank[idim] + ) + else: + for idim in range(ndims): + all_potentials_in_tree_order[idim][qbx_target_mask] += ( + qbx_potentials_cur_rank[idim] + ) def reorder_and_finalize_potentials(x): # "finalize" gives host FMMs (like FMMlib) a chance to turn the @@ -1035,7 +1160,6 @@ def drive_dfmm(queue, src_weights, distributed_geo_data, global_wrangler, return global_wrangler.finalize_potentials( x[global_wrangler.tree.sorted_target_ids]) - from pytools.obj_array import with_object_array_or_scalar result = with_object_array_or_scalar( reorder_and_finalize_potentials, all_potentials_in_tree_order) diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py index 96a99152..5700c452 100644 --- a/pytential/qbx/fmm.py +++ 
b/pytential/qbx/fmm.py @@ -341,9 +341,12 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), return (qbx_expansions, SumpyTimingFuture(self.queue, events)) + def eval_qbx_output_zeros(self): + return self.full_output_zeros() + @log_process(logger) def eval_qbx_expansions(self, qbx_expansions): - pot = self.full_output_zeros() + pot = self.eval_qbx_output_zeros() geo_data = self.geo_data events = [] @@ -364,7 +367,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), center_to_targets_starts=ctt.starts, center_to_targets_lists=ctt.lists, - targets=self.tree.targets, + targets=geo_data.eval_qbx_targets(), qbx_expansions=qbx_expansions, result=pot, @@ -378,7 +381,7 @@ QBXFMMGeometryData.non_qbx_box_target_lists`), @log_process(logger) def eval_target_specific_qbx_locals(self, src_weights): - return (self.full_output_zeros(), SumpyTimingFuture(self.queue, events=())) + return self.eval_qbx_output_zeros(), SumpyTimingFuture(self.queue, events=()) # }}} diff --git a/pytential/qbx/geometry.py b/pytential/qbx/geometry.py index 72b054f5..05a2c3dc 100644 --- a/pytential/qbx/geometry.py +++ b/pytential/qbx/geometry.py @@ -682,6 +682,9 @@ class QBXFMMGeometryData(FMMLibRotationDataInterface): return result.with_queue(None) + def eval_qbx_targets(self): + return self.tree().targets + @memoize_method @log_process(logger) def global_qbx_centers(self): diff --git a/pytential/qbx/utils.py b/pytential/qbx/utils.py index 7a59c029..58043bca 100644 --- a/pytential/qbx/utils.py +++ b/pytential/qbx/utils.py @@ -31,6 +31,7 @@ from boxtree.tree import Tree import pyopencl as cl import pyopencl.array # noqa from pytools import memoize_method +from pytools.obj_array import with_object_array_or_scalar from boxtree.pyfmmlib_integration import FMMLibRotationDataInterface import logging @@ -480,4 +481,40 @@ class ToHostTransferredGeoDataWrapper(FMMLibRotationDataInterface): # }}} + +# {{{ + +def _transform_dict(f, old_dict): + new_dict = {} + + for key in old_dict: + if isinstance(old_dict[key], 
np.ndarray) and old_dict[key].dtype == object: + new_dict[key] = with_object_array_or_scalar(f, old_dict[key]) + else: + new_dict[key] = f(key) + + return new_dict + + +def sumpy_wrangler_extra_kwargs_to_host(wrangler_kwargs, queue): + def to_host(attr): + if not isinstance(attr, cl.array.Array): + return attr + + return attr.get(queue=queue) + + return _transform_dict(to_host, wrangler_kwargs) + + +def sumpy_wrangler_extra_kwargs_to_device(wrangler_kwargs, queue): + def to_device(attr): + if not isinstance(attr, np.ndarray): + return attr + + return cl.array.to_device(queue, attr).with_queue(queue) + + return _transform_dict(to_device, wrangler_kwargs) + +# }}} + # vim: foldmethod=marker:filetype=pyopencl diff --git a/test/test_distributed.py b/test/test_distributed.py index e1937825..30dcf6b4 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) # {{{ test off-surface eval -def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): +def _test_off_surface_eval( + ctx_factory, use_fmm, do_plot=False, fmm_backend='fmmlib'): logging.basicConfig(level=logging.INFO) cl_ctx = ctx_factory() @@ -63,7 +64,8 @@ def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): qbx_order=qbx_order, fmm_order=fmm_order, comm=comm, - knl_specific_calibration_params="constant_one" + knl_specific_calibration_params="constant_one", + fmm_backend=fmm_backend ).with_refinement() density_discr = qbx.density_discr @@ -125,20 +127,23 @@ def _test_off_surface_eval(ctx_factory, use_fmm, do_plot=False): @pytest.mark.mpi -@pytest.mark.parametrize("num_processes, use_fmm", [ - # (4, False), - (4, True) +@pytest.mark.parametrize("num_processes, use_fmm, fmm_backend", [ + (4, True, 'fmmlib'), + (4, True, 'sumpy') ]) @pytest.mark.skipif(sys.version_info < (3, 5), reason="distributed implementation requires 3.5 or higher") -def test_off_surface_eval(num_processes, use_fmm, do_plot=False): +def test_off_surface_eval( + 
num_processes, use_fmm, fmm_backend, do_plot=False): pytest.importorskip("mpi4py") newenv = os.environ.copy() newenv["PYTEST"] = "1" newenv["OMP_NUM_THREADS"] = "1" + newenv["POCL_MAX_PTHREAD_COUNT"] = "1" newenv["use_fmm"] = str(use_fmm) newenv["do_plot"] = str(do_plot) + newenv["fmm_backend"] = fmm_backend run_mpi(__file__, num_processes, newenv) @@ -157,7 +162,8 @@ def double_layer_wrapper(kernel): return sym.D(kernel, u_sym, qbx_forced_limit="avg") -def _test_urchin_against_single_rank(ctx_factory, m, n, op_wrapper): +def _test_urchin_against_single_rank( + ctx_factory, m, n, op_wrapper, fmm_backend, use_tsqbx): logging.basicConfig(level=logging.INFO) qbx_order = 3 @@ -199,8 +205,8 @@ def _test_urchin_against_single_rank(ctx_factory, m, n, op_wrapper): "_from_sep_smaller_min_nsources_cumul": 0, "_expansions_in_tree_have_extent": True, "_expansion_stick_out_factor": _expansion_stick_out_factor, - "_use_target_specific_qbx": True, - "fmm_backend": 'fmmlib' + "_use_target_specific_qbx": use_tsqbx, + "fmm_backend": fmm_backend } from pytential.qbx.distributed import DistributedQBXLayerPotentialSource @@ -266,21 +272,27 @@ def _test_urchin_against_single_rank(ctx_factory, m, n, op_wrapper): @pytest.mark.mpi -@pytest.mark.parametrize("num_processes, m, n, op_wrapper", [ - (4, 1, 3, "single_layer_wrapper"), - (4, 1, 3, "double_layer_wrapper") +@pytest.mark.parametrize("num_processes, m, n, op_wrapper, fmm_backend, use_tsqbx", [ + (4, 1, 3, "single_layer_wrapper", "fmmlib", True), + (4, 1, 3, "single_layer_wrapper", "sumpy", False), + (4, 1, 3, "double_layer_wrapper", "fmmlib", True), + (4, 1, 3, "double_layer_wrapper", "sumpy", False), ]) @pytest.mark.skipif(sys.version_info < (3, 5), reason="distributed implementation requires 3.5 or higher") -def test_urchin_against_single_rank(num_processes, m, n, op_wrapper): +def test_urchin_against_single_rank( + num_processes, m, n, op_wrapper, fmm_backend, use_tsqbx): pytest.importorskip("mpi4py") newenv = os.environ.copy() 
newenv["PYTEST"] = "2" newenv["OMP_NUM_THREADS"] = "1" + newenv["POCL_MAX_PTHREAD_COUNT"] = "1" newenv["m"] = str(m) newenv["n"] = str(n) newenv["op_wrapper"] = op_wrapper + newenv["fmm_backend"] = fmm_backend + newenv["use_tsqbx"] = str(use_tsqbx) run_mpi(__file__, num_processes, newenv) @@ -293,13 +305,19 @@ if __name__ == "__main__": # Run "test_off_surface_eval" test case use_fmm = (os.environ["use_fmm"] == 'True') do_plot = (os.environ["do_plot"] == 'True') + fmm_backend = os.environ["fmm_backend"] - _test_off_surface_eval(cl.create_some_context, use_fmm, do_plot=do_plot) + _test_off_surface_eval( + cl.create_some_context, use_fmm, + do_plot=do_plot, fmm_backend=fmm_backend + ) elif os.environ["PYTEST"] == "2": # Run "test_urchin_against_single_rank" test case m = int(os.environ["m"]) n = int(os.environ["n"]) op_wrapper_str = os.environ["op_wrapper"] + fmm_backend = os.environ["fmm_backend"] + use_tsqbx = (os.environ["use_tsqbx"] == 'True') if op_wrapper_str == "single_layer_wrapper": op_wrapper = single_layer_wrapper @@ -309,11 +327,11 @@ if __name__ == "__main__": raise ValueError("unknown op wrapper") _test_urchin_against_single_rank( - cl.create_some_context, m, n, op_wrapper + cl.create_some_context, m, n, op_wrapper, fmm_backend, use_tsqbx ) else: if len(sys.argv) > 1: - # You can test individual routines by typing - # $ python test_distributed.py 'test_off_surface_eval(4, True, True)' + # $ python test_distributed.py 'test_off_surface_eval(4, True, "fmmlib", + # True)' exec(sys.argv[1]) -- GitLab From da74753790ab64fb5bbc75d1fae49b0d1c9dfdb5 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 4 Jun 2020 00:54:56 -0500 Subject: [PATCH 85/86] Adapt distributed test cases to new GeometryCollection interface --- test/test_distributed.py | 50 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index 30dcf6b4..243c6093 100644 --- a/test/test_distributed.py 
+++ b/test/test_distributed.py @@ -7,7 +7,7 @@ from pyopencl.tools import ( # noqa from meshmode.mesh.generation import make_curve_mesh, ellipse from sumpy.visualization import FieldPlotter -from pytential import bind, sym +from pytential import bind, sym, GeometryCollection from boxtree.tools import run_mpi import pytest @@ -58,7 +58,7 @@ def _test_off_surface_eval( pre_density_discr = Discretization( cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order)) - qbx, _ = DistributedQBXLayerPotentialSource( + qbx = DistributedQBXLayerPotentialSource( pre_density_discr, fine_order=4*target_order, qbx_order=qbx_order, @@ -66,9 +66,14 @@ def _test_off_surface_eval( comm=comm, knl_specific_calibration_params="constant_one", fmm_backend=fmm_backend - ).with_refinement() + ) - density_discr = qbx.density_discr + from pytential.target import PointsTarget + fplot = FieldPlotter(np.zeros(2), extent=0.54, npoints=30) + targets = PointsTarget(fplot.points) + + places = GeometryCollection((qbx, targets)) + density_discr = places.get_discretization(places.auto_source.geometry) from sumpy.kernel import LaplaceKernel op = sym.D(LaplaceKernel(2), sym.var("sigma"), qbx_forced_limit=-2) @@ -76,33 +81,28 @@ def _test_off_surface_eval( sigma = density_discr.zeros(queue) + 1 qbx_ctx = {"sigma": sigma} - fplot = FieldPlotter(np.zeros(2), extent=0.54, npoints=30) - - from pytential.target import PointsTarget - targets = PointsTarget(fplot.points) - else: - qbx = None - targets = None + places = None op = None qbx_ctx = {} from pytential.symbolic.execution import bind_distributed - fld_in_vol = bind_distributed(comm, (qbx, targets), op)(queue, **qbx_ctx) + fld_in_vol = bind_distributed(comm, places, op)(queue, **qbx_ctx) if rank == 0: # test against shared memory result from pytential.qbx import QBXLayerPotentialSource - qbx, _ = QBXLayerPotentialSource( + qbx = QBXLayerPotentialSource( pre_density_discr, 4 * target_order, qbx_order, fmm_order=fmm_order, 
_from_sep_smaller_min_nsources_cumul=0 - ).with_refinement() + ) - fld_in_vol_single_node = bind((qbx, targets), op)(queue, **qbx_ctx) + places = GeometryCollection((qbx, targets)) + fld_in_vol_single_node = bind(places, op)(queue, **qbx_ctx) linf_err = cl.array.max( cl.clmath.fabs(fld_in_vol - fld_in_vol_single_node) @@ -197,8 +197,6 @@ def _test_urchin_against_single_rank( InterpolatoryQuadratureSimplexGroupFactory(target_order) ) - refiner_extra_kwargs = {} - params = { "qbx_order": qbx_order, "fmm_order": fmm_order, @@ -210,15 +208,16 @@ def _test_urchin_against_single_rank( } from pytential.qbx.distributed import DistributedQBXLayerPotentialSource - qbx, _ = DistributedQBXLayerPotentialSource( + qbx = DistributedQBXLayerPotentialSource( density_discr=pre_density_discr, fine_order=4 * target_order, comm=comm, knl_specific_calibration_params="constant_one", **params - ).with_refinement(**refiner_extra_kwargs) + ) - density_discr = qbx.density_discr + places = GeometryCollection(qbx) + density_discr = places.get_discretization(places.auto_source.geometry) # {{{ compute values of a solution to the PDE @@ -243,23 +242,24 @@ def _test_urchin_against_single_rank( grad_u_dev = cl.array.to_device(queue, grad_u) context = {'u': u_dev, 'grad_u': grad_u_dev} else: - qbx = None + places = None op = None context = {} from pytential.symbolic.execution import bind_distributed - bound_op = bind_distributed(comm, qbx, op) + bound_op = bind_distributed(comm, places, op) distributed_result = bound_op(queue, **context) if rank == 0: from pytential.qbx import QBXLayerPotentialSource - qbx, _ = QBXLayerPotentialSource( + qbx = QBXLayerPotentialSource( density_discr=pre_density_discr, fine_order=4 * target_order, **params - ).with_refinement() + ) + places = GeometryCollection(qbx) - single_node_result = bind(qbx, op)(queue, **context) + single_node_result = bind(places, op)(queue, **context) distributed_result = distributed_result.get() single_node_result = single_node_result.get() 
-- GitLab From 2030b647de99f8f77a944cd6660ea5f573dadd42 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 4 Jun 2020 16:24:30 -0500 Subject: [PATCH 86/86] Placate pylint --- pytential/symbolic/execution.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pytential/symbolic/execution.py b/pytential/symbolic/execution.py index bf7f8a98..4b3747fd 100644 --- a/pytential/symbolic/execution.py +++ b/pytential/symbolic/execution.py @@ -1043,12 +1043,6 @@ class DistributedBoundExpression(BoundExpression): else: self.code = DistributedCode(comm, None, None) - def get_discretization(self, where): - if self.comm.Get_rank() == 0: - return BoundExpression.get_discretization(self, where) - else: - raise RuntimeError("Discretization is not available on worker nodes") - def cost_per_stage(self, queue, calibration_params, **args): if self.comm.Get_rank() == 0: return BoundExpression.cost_per_stage( -- GitLab