From 87cb89efe5eda39cf4c07d082b4a5911b248dcc9 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 12 Jul 2018 18:04:18 -0500 Subject: [PATCH 01/12] Add performance model for form_multipole --- boxtree/fmm.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 54a1649..ee03a3c 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -26,6 +26,8 @@ import logging logger = logging.getLogger(__name__) from pytools import ProcessLogger, Record +import pyopencl as cl +import numpy as np def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None): @@ -430,4 +432,91 @@ class TimingRecorder(object): # }}} +def calculate_nsources_by_level(tree): + nsources_by_level = np.empty((tree.nlevels,), dtype=np.int32) + + for ilevel in range(tree.nlevels): + start_ibox = tree.level_start_box_nrs[ilevel] + end_ibox = tree.level_start_box_nrs[ilevel + 1] + count = 0 + + for ibox in range(start_ibox, end_ibox): + count += tree.box_source_counts_nonchild[ibox] + + nsources_by_level[ilevel] = count + + return nsources_by_level + + +class PerformanceModel: + + def __init__(self, cl_context, wrangler_factory, uses_pde_expansions): + self.cl_context = cl_context + self.wrangler_factory = wrangler_factory + self.uses_pde_expansions = uses_pde_expansions + + self.time_result = [] + + from pyopencl.clrandom import PhiloxGenerator + self.rng = PhiloxGenerator(cl_context) + + def time_performance(self, traversal): + # Calculate "nterms_fmm_total" + dimensions = traversal.tree.dimensions + wrangler = self.wrangler_factory(traversal.tree) + nsources_by_level = calculate_nsources_by_level(traversal.tree) + + level_nterms = wrangler.level_nterms + + if self.uses_pde_expansions: + ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) + else: + ncoeffs_fmm_by_level = level_nterms ** dimensions + + nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) + + # Record useful metadata for assembling performance data + timing_data = { + "nterms_fmm_total": nterms_fmm_total + } + + # Generate random source weights + with cl.CommandQueue(self.cl_context) as queue: + source_weights = self.rng.uniform( + queue, + traversal.tree.nsources, + traversal.tree.coord_dtype + ).get() + + # Time a FMM run + drive_fmm(traversal, wrangler, source_weights, timing_data=timing_data) + + self.time_result.append(timing_data) + + def form_multipole_model(self): + nresult = len(self.time_result) + + if nresult < 1: + raise RuntimeError("Please run FMM at lease once using time_performance" + "before forming models.") + elif nresult == 1: + result = self.time_result[0] + wall_elapsed_time = result["form_multipoles"].wall_elapsed + nterm_fmm_total = result["nterms_fmm_total"] + return wall_elapsed_time / nterm_fmm_total, 0.0 + else: + wall_elapsed_time = np.empty((nresult,), dtype=float) + coeff_matrix = np.empty((nresult, 2), dtype=float) + + for iresult, result in enumerate(self.time_result): + wall_elapsed_time[iresult] = result["form_multipoles"].wall_elapsed + coeff_matrix[iresult, 0] = result["nterms_fmm_total"] + + coeff_matrix[:, 1] = 1 + + from numpy.linalg import lstsq + coeff = lstsq(coeff_matrix, wall_elapsed_time, rcond=-1)[0] + + return coeff[0], coeff[1] + # vim: filetype=pyopencl:fdm=marker -- GitLab From 40bbd4b81a11d48d7f74a46f4e176d2e11d3bd41 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 17 Jul 2018 09:24:46 -0500 Subject: [PATCH 02/12] Refactor total FMM terms computation --- boxtree/fmm.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index ee03a3c..3a18afb 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -433,6 +433,10 @@ class TimingRecorder(object): def calculate_nsources_by_level(tree): + """ + :return: A numpy array of share (tree.nlevels,) such that the ith index documents + the number of sources on level i. + """ nsources_by_level = np.empty((tree.nlevels,), dtype=np.int32) for ilevel in range(tree.nlevels): @@ -461,23 +465,11 @@ class PerformanceModel: self.rng = PhiloxGenerator(cl_context) def time_performance(self, traversal): - # Calculate "nterms_fmm_total" - dimensions = traversal.tree.dimensions wrangler = self.wrangler_factory(traversal.tree) - nsources_by_level = calculate_nsources_by_level(traversal.tree) - - level_nterms = wrangler.level_nterms - - if self.uses_pde_expansions: - ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) - else: - ncoeffs_fmm_by_level = level_nterms ** dimensions - - nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) # Record useful metadata for assembling performance data timing_data = { - "nterms_fmm_total": nterms_fmm_total + "nterms_fmm_total": self._calculate_nters_fmm_total(wrangler) } # Generate random source weights @@ -519,4 +511,23 @@ class PerformanceModel: return coeff[0], coeff[1] + def _calculate_nters_fmm_total(self, wrangler): + """ + :return: total number of terms formed during form_multipole + """ + dimensions = wrangler.tree.dimensions + + # Calculate "nterms_fmm_total" + nsources_by_level = calculate_nsources_by_level(wrangler.tree) + level_nterms = wrangler.level_nterms + + if self.uses_pde_expansions: + ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) + else: + ncoeffs_fmm_by_level = level_nterms ** dimensions + + nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) + + return nterms_fmm_total + # vim: filetype=pyopencl:fdm=marker -- GitLab From 689a2a3af95a86ad584cfaa73024ce17c873ad6d Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 17 Jul 2018 10:45:25 -0500 Subject: [PATCH 03/12] Count the workload of direct evaluation --- boxtree/fmm.py | 95 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 19 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 3a18afb..5963656 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -432,24 +432,80 @@ class TimingRecorder(object): # }}} -def calculate_nsources_by_level(tree): - """ - :return: A numpy array of share (tree.nlevels,) such that the ith index documents - the number of sources on level i. - """ - nsources_by_level = np.empty((tree.nlevels,), dtype=np.int32) +class PerformanceCounter: + + def __init__(self, traversal): + self.traversal = traversal + + def count_nsources_by_level(self): + """ + :return: A numpy array of share (tree.nlevels,) such that the ith index + documents the number of sources on level i. + """ + tree = self.traversal.tree + + nsources_by_level = np.empty((tree.nlevels,), dtype=np.int32) + + for ilevel in range(tree.nlevels): + start_ibox = tree.level_start_box_nrs[ilevel] + end_ibox = tree.level_start_box_nrs[ilevel + 1] + count = 0 + + for ibox in range(start_ibox, end_ibox): + count += tree.box_source_counts_nonchild[ibox] + + nsources_by_level[ilevel] = count + + return nsources_by_level + + def count_direct(self, use_global_idx=False): + """ + :return: If *use_global_idx* is True, return a numpy array of shape + (tree.nboxes,) such that the ith entry represents the workload from + direct evaluation on box i. If *use_global_idx* is False, return a numpy + array of shape (ntarget_boxes,) such that the ith entry represents the + workload on *target_boxes* i. + """ + traversal = self.traversal + tree = traversal.tree + + if use_global_idx: + direct_workload = np.zeros((tree.nboxes,), dtype=np.int64) + else: + ntarget_boxes = len(traversal.target_boxes) + direct_workload = np.zeros((ntarget_boxes,), dtype=np.int64) + + for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): + ntargets = traversal.box_target_counts_nonchild[tgt_ibox] + nsources = 0 + + start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] + + for src_ibox in traversal.neighbor_source_boxes_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + if traversal.from_sep_close_smaller_starts is not None: + start, end = ( + traversal.from_sep_close_smaller_starts[itgt_box:itgt_box+2]) + + for src_ibox in traversal.from_sep_close_smaller_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] + + if traversal.from_sep_close_bigger_starts is not None: + start, end = ( + traversal.from_sep_close_bigger_starts[itgt_box:itgt_box+2]) - for ilevel in range(tree.nlevels): - start_ibox = tree.level_start_box_nrs[ilevel] - end_ibox = tree.level_start_box_nrs[ilevel + 1] - count = 0 + for src_ibox in traversal.from_sep_close_bigger_lists[start:end]: + nsources += tree.box_source_counts_nonchild[src_ibox] - for ibox in range(start_ibox, end_ibox): - count += tree.box_source_counts_nonchild[ibox] + count = nsources * ntargets - nsources_by_level[ilevel] = count + if use_global_idx: + direct_workload[tgt_ibox] = count + else: + direct_workload[itgt_box] = count - return nsources_by_level + return direct_workload class PerformanceModel: @@ -466,10 +522,12 @@ class PerformanceModel: def time_performance(self, traversal): wrangler = self.wrangler_factory(traversal.tree) + counter = PerformanceCounter(traversal) # Record useful metadata for assembling performance data timing_data = { - "nterms_fmm_total": self._calculate_nters_fmm_total(wrangler) + "nterms_fmm_total": self._calculate_nters_fmm_total(wrangler, counter), + "direct_workload": np.sum(counter.count_direct()) } # Generate random source weights @@ -511,14 +569,13 @@ class PerformanceModel: return coeff[0], coeff[1] - def _calculate_nters_fmm_total(self, wrangler): + def _calculate_nters_fmm_total(self, wrangler, counter): """ - :return: total number of terms formed during form_multipole + :return: total number of terms formed across all levels during form_multipole """ dimensions = wrangler.tree.dimensions - # Calculate "nterms_fmm_total" - nsources_by_level = calculate_nsources_by_level(wrangler.tree) + nsources_by_level = counter.count_nsources_by_level(wrangler.tree) level_nterms = wrangler.level_nterms if self.uses_pde_expansions: -- GitLab From 2d79680b4b54275b20846b9097d3922ebc5973d4 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 17 Jul 2018 17:10:46 -0500 Subject: [PATCH 04/12] Refactor linear regression, add eval_direct model --- boxtree/fmm.py | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 5963656..cb0fa66 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -476,7 +476,7 @@ class PerformanceCounter: direct_workload = np.zeros((ntarget_boxes,), dtype=np.int64) for itgt_box, tgt_ibox in enumerate(traversal.target_boxes): - ntargets = traversal.box_target_counts_nonchild[tgt_ibox] + ntargets = tree.box_target_counts_nonchild[tgt_ibox] nsources = 0 start, end = traversal.neighbor_source_boxes_starts[itgt_box:itgt_box+2] @@ -543,31 +543,13 @@ class PerformanceModel: self.time_result.append(timing_data) - def form_multipole_model(self): - nresult = len(self.time_result) - - if nresult < 1: - raise RuntimeError("Please run FMM at lease once using time_performance" - "before forming models.") - elif nresult == 1: - result = self.time_result[0] - wall_elapsed_time = result["form_multipoles"].wall_elapsed - nterm_fmm_total = result["nterms_fmm_total"] - return wall_elapsed_time / nterm_fmm_total, 0.0 - else: - wall_elapsed_time = np.empty((nresult,), dtype=float) - coeff_matrix = np.empty((nresult, 2), dtype=float) - - for iresult, result in enumerate(self.time_result): - wall_elapsed_time[iresult] = result["form_multipoles"].wall_elapsed - coeff_matrix[iresult, 0] = result["nterms_fmm_total"] - - coeff_matrix[:, 1] = 1 - - from numpy.linalg import lstsq - coeff = lstsq(coeff_matrix, wall_elapsed_time, rcond=-1)[0] + def form_multipoles_model(self, wall_time=True): + return self._linear_regression("nterms_fmm_total", "form_multipoles", + wall_time=wall_time) - return coeff[0], coeff[1] + def eval_direct_model(self, wall_time=True): + return self._linear_regression("direct_workload", "eval_direct", + wall_time=wall_time) def _calculate_nters_fmm_total(self, wrangler, counter): """ @@ -575,7 +557,7 @@ class PerformanceModel: """ dimensions = wrangler.tree.dimensions - nsources_by_level = counter.count_nsources_by_level(wrangler.tree) + nsources_by_level = counter.count_nsources_by_level() level_nterms = wrangler.level_nterms if self.uses_pde_expansions: @@ -587,4 +569,43 @@ class PerformanceModel: return nterms_fmm_total + def _linear_regression(self, x_name, y_name, wall_time=True): + nresult = len(self.time_result) + + if nresult < 1: + raise RuntimeError("Please run FMM at lease once using time_performance" + "before forming models.") + elif nresult == 1: + result = self.time_result[0] + + if wall_time: + dependent_value = result[y_name].wall_elapsed + else: + dependent_value = result[y_name].process_elapsed + + independent_value = result[x_name] + return dependent_value / independent_value, 0.0 + else: + dependent_value = np.empty((nresult,), dtype=float) + coeff_matrix = np.empty((nresult, 2), dtype=float) + + for iresult, result in enumerate(self.time_result): + if wall_time: + dependent_value[iresult] = result[y_name].wall_elapsed + else: + dependent_value[iresult] = result[y_name].process_elapsed + + coeff_matrix[iresult, 0] = result[x_name] + + coeff_matrix[:, 1] = 1 + + from numpy.linalg import lstsq + coeff = lstsq(coeff_matrix, dependent_value, rcond=-1)[0] + + print(coeff_matrix) + print(dependent_value) + + return coeff[0], coeff[1] + + # vim: filetype=pyopencl:fdm=marker -- GitLab From b3853b2ed5975f9f5868bb705b593cdc18c07ab3 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Wed, 18 Jul 2018 10:35:07 -0500 Subject: [PATCH 05/12] Extend linear regression to multiple variables --- boxtree/fmm.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index cb0fa66..3521711 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -544,11 +544,11 @@ class PerformanceModel: self.time_result.append(timing_data) def form_multipoles_model(self, wall_time=True): - return self._linear_regression("nterms_fmm_total", "form_multipoles", + return self._linear_regression("form_multipoles", ["nterms_fmm_total"], wall_time=wall_time) def eval_direct_model(self, wall_time=True): - return self._linear_regression("direct_workload", "eval_direct", + return self._linear_regression("eval_direct", ["direct_workload"], wall_time=wall_time) def _calculate_nters_fmm_total(self, wrangler, counter): @@ -569,8 +569,13 @@ class PerformanceModel: return nterms_fmm_total - def _linear_regression(self, x_name, y_name, wall_time=True): + def _linear_regression(self, y_name, x_name, wall_time=True): + """ + :arg y_name: Name of the depedent variable + :arg x_name: A list of names of independent variables + """ nresult = len(self.time_result) + nvariables = len(x_name) if nresult < 1: raise RuntimeError("Please run FMM at lease once using time_performance" @@ -583,11 +588,13 @@ class PerformanceModel: else: dependent_value = result[y_name].process_elapsed - independent_value = result[x_name] - return dependent_value / independent_value, 0.0 + independent_value = result[x_name[0]] + coeff = dependent_value / independent_value + + return (coeff,) + tuple(0.0 for _ in range(nvariables - 1)) else: dependent_value = np.empty((nresult,), dtype=float) - coeff_matrix = np.empty((nresult, 2), dtype=float) + coeff_matrix = np.empty((nresult, nvariables + 1), dtype=float) for iresult, result in enumerate(self.time_result): if wall_time: @@ -595,9 +602,10 @@ class PerformanceModel: else: dependent_value[iresult] = result[y_name].process_elapsed - coeff_matrix[iresult, 0] = result[x_name] + for icol, variable_name in enumerate(x_name): + coeff_matrix[iresult, icol] = result[variable_name] - coeff_matrix[:, 1] = 1 + coeff_matrix[:, -1] = 1 from numpy.linalg import lstsq coeff = lstsq(coeff_matrix, dependent_value, rcond=-1)[0] @@ -605,7 +613,7 @@ class PerformanceModel: print(coeff_matrix) print(dependent_value) - return coeff[0], coeff[1] + return coeff # vim: filetype=pyopencl:fdm=marker -- GitLab From 5077f8cb6852df261b93660fcecc0b49be23f5a2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Thu, 19 Jul 2018 11:05:59 -0500 Subject: [PATCH 06/12] Refactor FMM parameters --- boxtree/fmm.py | 71 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 3521711..ed24c02 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -28,6 +28,7 @@ logger = logging.getLogger(__name__) from pytools import ProcessLogger, Record import pyopencl as cl import numpy as np +from collections import namedtuple def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None): @@ -508,6 +509,15 @@ class PerformanceCounter: return direct_workload +FMMParameters = namedtuple( + "FMMParameters", + ['ncoeffs_fmm_by_level', + 'translation_source_power', + 'translation_target_power', + 'translation_max_power'] +) + + class PerformanceModel: def __init__(self, cl_context, wrangler_factory, uses_pde_expansions): @@ -522,12 +532,20 @@ class PerformanceModel: def time_performance(self, traversal): wrangler = self.wrangler_factory(traversal.tree) + counter = PerformanceCounter(traversal) + parameters = self.get_fmm_parameters( + traversal.tree.dimensions, + self.uses_pde_expansions, + wrangler.level_nterms + ) + # Record useful metadata for assembling performance data timing_data = { - "nterms_fmm_total": self._calculate_nters_fmm_total(wrangler, counter), - "direct_workload": np.sum(counter.count_direct()) + "nterms_fmm_total": self.calculate_nters_fmm_total(counter, parameters), + "direct_workload": np.sum(counter.count_direct()), + "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1] } # Generate random source weights @@ -548,22 +566,51 @@ class PerformanceModel: wall_time=wall_time) def eval_direct_model(self, wall_time=True): - return self._linear_regression("eval_direct", ["direct_workload"], - wall_time=wall_time) + return self._linear_regression( + "eval_direct", + ["direct_workload", "direct_nsource_boxes"], + wall_time=wall_time) + + @staticmethod + def get_fmm_parameters(dimensions, use_pde_expansions, level_nterms): + if use_pde_expansions: + ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) + + if dimensions == 2: + translation_source_power = 1 + translation_target_power = 1 + translation_max_power = 0 + elif dimensions == 3: + # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. + translation_source_power = 0 + translation_target_power = 0 + translation_max_power = 3 + else: + raise ValueError("Don't know how to estimate expansion complexities " + "for dimension %d" % dimensions) - def _calculate_nters_fmm_total(self, wrangler, counter): + else: + ncoeffs_fmm_by_level = level_nterms ** dimensions + + translation_source_power = dimensions + translation_target_power = dimensions + translation_max_power = 0 + + return FMMParameters( + ncoeffs_fmm_by_level=ncoeffs_fmm_by_level, + translation_source_power=translation_source_power, + translation_target_power=translation_target_power, + translation_max_power=translation_max_power + ) + + @staticmethod + def calculate_nters_fmm_total(counter, parameters): """ :return: total number of terms formed across all levels during form_multipole """ - dimensions = wrangler.tree.dimensions - nsources_by_level = counter.count_nsources_by_level() - level_nterms = wrangler.level_nterms - if self.uses_pde_expansions: - ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) - else: - ncoeffs_fmm_by_level = level_nterms ** dimensions + ncoeffs_fmm_by_level = parameters.ncoeffs_fmm_by_level nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) -- GitLab From 81a3bb10bf83acc626d6978820415ab6039653e6 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 20 Jul 2018 17:25:49 -0500 Subject: [PATCH 07/12] Count m2l operations --- boxtree/fmm.py | 188 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 123 insertions(+), 65 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index ed24c02..ef6c82e 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -433,10 +433,71 @@ class TimingRecorder(object): # }}} +FMMParameters = namedtuple( + "FMMParameters", + ['ncoeffs_fmm_by_level', + 'translation_source_power', + 'translation_target_power', + 'translation_max_power'] +) + + class PerformanceCounter: - def __init__(self, traversal): + def __init__(self, traversal, wrangler, uses_pde_expansions): self.traversal = traversal + self.wrangler = wrangler + self.uses_pde_expansions = uses_pde_expansions + + self.parameters = self.get_fmm_parameters( + traversal.tree.dimensions, + uses_pde_expansions, + wrangler.level_nterms + ) + + @staticmethod + def xlat_cost(p_source, p_target, parameters): + """ + :param p_source: A numpy array of numbers of source terms + :return: The same shape as *p_source* + """ + return ( + p_source ** parameters.translation_source_power + * p_target ** parameters.translation_target_power + * np.maximum(p_source, p_target) ** parameters.translation_max_power + ) + + @staticmethod + def get_fmm_parameters(dimensions, use_pde_expansions, level_nterms): + if use_pde_expansions: + ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) + + if dimensions == 2: + translation_source_power = 1 + translation_target_power = 1 + translation_max_power = 0 + elif dimensions == 3: + # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. + translation_source_power = 0 + translation_target_power = 0 + translation_max_power = 3 + else: + raise ValueError("Don't know how to estimate expansion complexities " + "for dimension %d" % dimensions) + + else: + ncoeffs_fmm_by_level = level_nterms ** dimensions + + translation_source_power = dimensions + translation_target_power = dimensions + translation_max_power = 0 + + return FMMParameters( + ncoeffs_fmm_by_level=ncoeffs_fmm_by_level, + translation_source_power=translation_source_power, + translation_target_power=translation_target_power, + translation_max_power=translation_max_power + ) def count_nsources_by_level(self): """ @@ -459,6 +520,18 @@ class PerformanceCounter: return nsources_by_level + def count_nters_fmm_total(self): + """ + :return: total number of terms formed across all levels during form_multipole + """ + nsources_by_level = self.count_nsources_by_level() + + ncoeffs_fmm_by_level = self.parameters.ncoeffs_fmm_by_level + + nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) + + return nterms_fmm_total + def count_direct(self, use_global_idx=False): """ :return: If *use_global_idx* is True, return a numpy array of shape @@ -508,14 +581,49 @@ class PerformanceCounter: return direct_workload + def count_m2l(self, use_global_idx=False): + """ + :return: If *use_global_idx* is True, return a numpy array of shape + (tree.nboxes,) such that the ith entry represents the workload from + multipole to local expansion on box i. If *use_global_idx* is False, + return a numpy array of shape (ntarget_or_target_parent_boxes,) such that + the ith entry represents the workload on *target_or_target_parent_boxes* + i. + """ + trav = self.traversal + wrangler = self.wrangler + parameters = self.parameters -FMMParameters = namedtuple( - "FMMParameters", - ['ncoeffs_fmm_by_level', - 'translation_source_power', - 'translation_target_power', - 'translation_max_power'] -) + ntarget_or_target_parent_boxes = len(trav.target_or_target_parent_boxes) + + if use_global_idx: + nm2l = np.zeros((trav.tree.nboxes,), dtype=np.intp) + else: + nm2l = np.zeros((ntarget_or_target_parent_boxes,), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(trav.target_or_target_parent_boxes): + start, end = trav.from_sep_siblings_starts[itgt_box:itgt_box+2] + from_sep_siblings_level = trav.tree.box_levels[ + trav.from_sep_siblings_lists[start:end] + ] + + if start == end: + continue + + tgt_box_level = trav.tree.box_levels[tgt_ibox] + + from_sep_siblings_nterms = wrangler.level_nterms[from_sep_siblings_level] + tgt_box_nterms = wrangler.level_nterms[tgt_box_level] + + from_sep_siblings_costs = self.xlat_cost( + from_sep_siblings_nterms, tgt_box_nterms, parameters) + + if use_global_idx: + nm2l[tgt_ibox] += np.sum(from_sep_siblings_costs) + else: + nm2l[itgt_box] += np.sum(from_sep_siblings_costs) + + return nm2l class PerformanceModel: @@ -533,19 +641,14 @@ class PerformanceModel: def time_performance(self, traversal): wrangler = self.wrangler_factory(traversal.tree) - counter = PerformanceCounter(traversal) - - parameters = self.get_fmm_parameters( - traversal.tree.dimensions, - self.uses_pde_expansions, - wrangler.level_nterms - ) + counter = PerformanceCounter(traversal, wrangler, self.uses_pde_expansions) # Record useful metadata for assembling performance data timing_data = { - "nterms_fmm_total": self.calculate_nters_fmm_total(counter, parameters), + "nterms_fmm_total": counter.count_nters_fmm_total(), "direct_workload": np.sum(counter.count_direct()), - "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1] + "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], + "m2l_workload": np.sum(counter.count_m2l()) } # Generate random source weights @@ -562,61 +665,16 @@ class PerformanceModel: self.time_result.append(timing_data) def form_multipoles_model(self, wall_time=True): - return self._linear_regression("form_multipoles", ["nterms_fmm_total"], + return self.linear_regression("form_multipoles", ["nterms_fmm_total"], wall_time=wall_time) def eval_direct_model(self, wall_time=True): - return self._linear_regression( + return self.linear_regression( "eval_direct", ["direct_workload", "direct_nsource_boxes"], wall_time=wall_time) - @staticmethod - def get_fmm_parameters(dimensions, use_pde_expansions, level_nterms): - if use_pde_expansions: - ncoeffs_fmm_by_level = level_nterms ** (dimensions - 1) - - if dimensions == 2: - translation_source_power = 1 - translation_target_power = 1 - translation_max_power = 0 - elif dimensions == 3: - # Based on a reading of FMMlib, i.e. a point-and-shoot FMM. - translation_source_power = 0 - translation_target_power = 0 - translation_max_power = 3 - else: - raise ValueError("Don't know how to estimate expansion complexities " - "for dimension %d" % dimensions) - - else: - ncoeffs_fmm_by_level = level_nterms ** dimensions - - translation_source_power = dimensions - translation_target_power = dimensions - translation_max_power = 0 - - return FMMParameters( - ncoeffs_fmm_by_level=ncoeffs_fmm_by_level, - translation_source_power=translation_source_power, - translation_target_power=translation_target_power, - translation_max_power=translation_max_power - ) - - @staticmethod - def calculate_nters_fmm_total(counter, parameters): - """ - :return: total number of terms formed across all levels during form_multipole - """ - nsources_by_level = counter.count_nsources_by_level() - - ncoeffs_fmm_by_level = parameters.ncoeffs_fmm_by_level - - nterms_fmm_total = np.sum(nsources_by_level * ncoeffs_fmm_by_level) - - return nterms_fmm_total - - def _linear_regression(self, y_name, x_name, wall_time=True): + def linear_regression(self, y_name, x_name, wall_time=True): """ :arg y_name: Name of the depedent variable :arg x_name: A list of names of independent variables -- GitLab From b855e88810ef64678efe9d703c6b9e26bf089c27 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 22 Jul 2018 21:54:31 -0500 Subject: [PATCH 08/12] Add script for testing performance model --- boxtree/fmm.py | 3 - examples/performance_model.py | 103 ++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 examples/performance_model.py diff --git a/boxtree/fmm.py b/boxtree/fmm.py index ef6c82e..bae2346 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -715,9 +715,6 @@ class PerformanceModel: from numpy.linalg import lstsq coeff = lstsq(coeff_matrix, dependent_value, rcond=-1)[0] - print(coeff_matrix) - print(dependent_value) - return coeff diff --git a/examples/performance_model.py b/examples/performance_model.py new file mode 100644 index 0000000..3ea75b9 --- /dev/null +++ b/examples/performance_model.py @@ -0,0 +1,103 @@ +from __future__ import division +import pyopencl as cl +import numpy as np +from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler +import functools +from boxtree.fmm import PerformanceModel, PerformanceCounter +from boxtree.fmm import drive_fmm +from pyopencl.clrandom import PhiloxGenerator + +context = cl.create_some_context() +queue = cl.CommandQueue(context) +dtype = np.float64 +helmholtz_k = 0 + + +def fmm_level_to_nterms(tree, level): + return max(level, 3) + + +# {{{ Generate traversal objects for forming models and verification + +traversals = [] + +for nsources, ntargets, dims in [(6000, 6000, 3), + (9000, 9000, 3), + (12000, 12000, 3), + (15000, 15000, 3), + (20000, 20000, 3)]: + + from boxtree.tools import make_normal_particle_array as p_normal + sources = p_normal(queue, nsources, dims, dtype, seed=15) + targets = p_normal(queue, ntargets, dims, dtype, seed=18) + + rng = PhiloxGenerator(context, seed=22) + target_radii = rng.uniform( + queue, ntargets, a=0, b=0.05, dtype=np.float64).get() + + from boxtree import TreeBuilder + tb = TreeBuilder(context) + tree, _ = tb(queue, sources, targets=targets, target_radii=target_radii, + stick_out_factor=0.25, max_particles_in_box=30, debug=True) + + from boxtree.traversal import FMMTraversalBuilder + tg = FMMTraversalBuilder(context, well_sep_is_n_away=2) + d_trav, _ = tg(queue, tree, debug=True) + trav = d_trav.get(queue=queue) + + traversals.append(trav) + +# }}} + +wrangler_factory = functools.partial( + FMMLibExpansionWrangler, helmholtz_k=0, fmm_level_to_nterms=fmm_level_to_nterms) + +ntraversals = len(traversals) +model = PerformanceModel(context, wrangler_factory, True) +for i in range(ntraversals - 1): + model.time_performance(traversals[i]) + +eval_traversal = traversals[-1] +eval_wrangler = wrangler_factory(eval_traversal.tree) +dimensions = eval_traversal.tree.dimensions +eval_counter = PerformanceCounter(eval_traversal, eval_wrangler, True) + +predict_timing = {} +wall_time = True + +# {{{ Predict eval_direct + +param = model.eval_direct_model(wall_time=wall_time) + +direct_workload = np.sum(eval_counter.count_direct()) +direct_nsource_boxes = eval_traversal.neighbor_source_boxes_starts[-1] + +predict_timing["eval_direct"] = ( + direct_workload * param[0] + direct_nsource_boxes * param[1] + param[2]) + +# }}} + +# {{{ Actual timing + +true_timing = {} + +rng = PhiloxGenerator(context) +source_weights = rng.uniform( + queue, eval_traversal.tree.nsources, eval_traversal.tree.coord_dtype).get() + +_ = drive_fmm(eval_traversal, eval_wrangler, source_weights, timing_data=true_timing) + +# }}} + + +for field in ["eval_direct"]: + wall_time_field = predict_timing[field] + + if wall_time: + true_time_field = true_timing[field].wall_elapsed + else: + true_time_field = true_timing[field].process_elapsed + + diff = abs(wall_time_field - true_time_field) + + print(field + " error: " + str(diff / true_time_field)) -- GitLab From 535032d04f0a04072fa39b5659d688ca10358402 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Sun, 22 Jul 2018 22:12:02 -0500 Subject: [PATCH 09/12] Add m2l model --- boxtree/fmm.py | 6 ++++++ examples/performance_model.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index bae2346..d4147f9 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -674,6 +674,12 @@ class PerformanceModel: ["direct_workload", "direct_nsource_boxes"], wall_time=wall_time) + def multipole_to_local_model(self, wall_time=True): + return self.linear_regression( + "multipole_to_local", ["m2l_workload"], + wall_time=wall_time + ) + def linear_regression(self, y_name, x_name, wall_time=True): """ :arg y_name: Name of the depedent variable diff --git a/examples/performance_model.py b/examples/performance_model.py index 3ea75b9..075b250 100644 --- a/examples/performance_model.py +++ b/examples/performance_model.py @@ -77,6 +77,16 @@ predict_timing["eval_direct"] = ( # }}} +# {{{ Predict multipole_to_local + +param = model.multipole_to_local_model(wall_time=wall_time) + +m2l_workload = np.sum(eval_counter.count_m2l()) + +predict_timing["multipole_to_local"] = m2l_workload * param[0] + param[1] + +# }}} + # {{{ Actual timing true_timing = {} @@ -90,7 +100,7 @@ _ = drive_fmm(eval_traversal, eval_wrangler, source_weights, timing_data=true_ti # }}} -for field in ["eval_direct"]: +for field in ["eval_direct", "multipole_to_local"]: wall_time_field = predict_timing[field] if wall_time: -- GitLab From 00f47b3e02c7fc7a2904d74d49266a30a4af059a Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 23 Jul 2018 13:58:28 -0500 Subject: [PATCH 10/12] Add eval_multipoles model --- boxtree/fmm.py | 41 ++++++++++++++++++++++++++++++++--- examples/performance_model.py | 12 +++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index d4147f9..0b65688 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -625,6 +625,33 @@ class PerformanceCounter: return nm2l + def count_m2p(self, use_global_idx=False): + trav = self.traversal + tree = trav.tree + + if use_global_idx: + nm2p = np.zeros((tree.nboxes,), dtype=np.intp) + else: + nm2p = np.zeros((len(trav.target_boxes),), dtype=np.intp) + + for ilevel, sep_smaller_list in enumerate(trav.from_sep_smaller_by_level): + ncoeffs_fmm_cur_level = self.parameters.ncoeffs_fmm_by_level[ilevel] + tgt_box_list = trav.target_boxes_sep_smaller_by_source_level[ilevel] + + for itgt_box, tgt_ibox in enumerate(tgt_box_list): + ntargets = tree.box_target_counts_nonchild[tgt_ibox] + + start, end = sep_smaller_list.starts[itgt_box:itgt_box + 2] + + workload = (end - start) * ntargets * ncoeffs_fmm_cur_level + + if use_global_idx: + nm2p[tgt_ibox] += workload + else: + nm2p[sep_smaller_list.nonempty_indices[itgt_box]] += workload + + return nm2p + class PerformanceModel: @@ -648,7 +675,8 @@ class PerformanceModel: "nterms_fmm_total": counter.count_nters_fmm_total(), "direct_workload": np.sum(counter.count_direct()), "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], - "m2l_workload": np.sum(counter.count_m2l()) + "m2l_workload": np.sum(counter.count_m2l()), + "m2p_workload": np.sum(counter.count_m2p()) } # Generate random source weights @@ -665,8 +693,9 @@ class PerformanceModel: self.time_result.append(timing_data) def form_multipoles_model(self, wall_time=True): - return self.linear_regression("form_multipoles", ["nterms_fmm_total"], - wall_time=wall_time) + return self.linear_regression( + "form_multipoles", ["nterms_fmm_total"], + wall_time=wall_time) def eval_direct_model(self, wall_time=True): return self.linear_regression( @@ -680,6 +709,12 @@ class PerformanceModel: wall_time=wall_time ) + def eval_multipoles_model(self, wall_time=True): + return self.linear_regression( + "eval_multipoles", ["m2p_workload"], + wall_time=wall_time + ) + def linear_regression(self, y_name, x_name, wall_time=True): """ :arg y_name: Name of the depedent variable diff --git a/examples/performance_model.py b/examples/performance_model.py index 075b250..4ca8872 100644 --- a/examples/performance_model.py +++ b/examples/performance_model.py @@ -87,6 +87,16 @@ predict_timing["multipole_to_local"] = m2l_workload * param[0] + param[1] # }}} +# {{{ Predict eval_multipoles + +param = model.eval_multipoles_model(wall_time=wall_time) + +m2p_workload = np.sum(eval_counter.count_m2p()) + +predict_timing["eval_multipoles"] = m2p_workload * param[0] + param[1] + +# }}} + # {{{ Actual timing true_timing = {} @@ -100,7 +110,7 @@ _ = drive_fmm(eval_traversal, eval_wrangler, source_weights, timing_data=true_ti # }}} -for field in ["eval_direct", "multipole_to_local"]: +for field in ["eval_direct", "multipole_to_local", "eval_multipoles"]: wall_time_field = predict_timing[field] if wall_time: -- GitLab From 311f8fa9f5e44342bd0fdfcd7294b433667204f2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 23 Jul 2018 17:57:07 -0500 Subject: [PATCH 11/12] Add form_locals model --- boxtree/fmm.py | 36 ++++++++++++++++++++++++++++++++++- examples/performance_model.py | 19 ++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 0b65688..2000225 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -652,6 +652,33 @@ class PerformanceCounter: return nm2p + def count_p2l(self, use_global_idx=False): + trav = self.traversal + tree = trav.tree + parameters = self.parameters + + if use_global_idx: + np2l = np.zeros((tree.nboxes,), dtype=np.intp) + else: + np2l = np.zeros(len(trav.target_or_target_parent_boxes), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(trav.target_or_target_parent_boxes): + tgt_box_level = trav.tree.box_levels[tgt_ibox] + ncoeffs = parameters.ncoeffs_fmm_by_level[tgt_box_level] + + start, end = trav.from_sep_bigger_starts[itgt_box:itgt_box + 2] + + np2l_sources = 0 + for src_ibox in trav.from_sep_bigger_lists[start:end]: + np2l_sources += tree.box_source_counts_nonchild[src_ibox] + + if use_global_idx: + np2l[tgt_ibox] = np2l_sources * ncoeffs + else: + np2l[itgt_box] = np2l_sources * ncoeffs + + return np2l + class PerformanceModel: @@ -676,7 +703,8 @@ class PerformanceModel: "direct_workload": np.sum(counter.count_direct()), "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], "m2l_workload": np.sum(counter.count_m2l()), - "m2p_workload": np.sum(counter.count_m2p()) + "m2p_workload": np.sum(counter.count_m2p()), + "p2l_workload": np.sum(counter.count_p2l()) } # Generate random source weights @@ -715,6 +743,12 @@ class PerformanceModel: wall_time=wall_time ) + def form_locals_model(self, wall_time=True): + return self.linear_regression( + "form_locals", ["p2l_workload"], + wall_time=wall_time + ) + def linear_regression(self, y_name, x_name, wall_time=True): """ :arg y_name: Name of the depedent variable diff --git a/examples/performance_model.py b/examples/performance_model.py index 4ca8872..a139f99 100644 --- a/examples/performance_model.py +++ b/examples/performance_model.py @@ -97,6 +97,16 @@ predict_timing["eval_multipoles"] = m2p_workload * param[0] + param[1] # }}} +# {{{ Predict form_locals + +param = model.form_locals_model(wall_time=wall_time) + +p2l_workload = np.sum(eval_counter.count_p2l()) + +predict_timing["form_locals"] = p2l_workload * param[0] + param[1] + +# }}} + # {{{ Actual timing true_timing = {} @@ -110,14 +120,15 @@ _ = drive_fmm(eval_traversal, eval_wrangler, source_weights, timing_data=true_ti # }}} -for field in ["eval_direct", "multipole_to_local", "eval_multipoles"]: - wall_time_field = predict_timing[field] +for field in ["eval_direct", "multipole_to_local", "eval_multipoles", "form_locals"]: + predict_time_field = predict_timing[field] if wall_time: true_time_field = true_timing[field].wall_elapsed else: true_time_field = true_timing[field].process_elapsed - diff = abs(wall_time_field - true_time_field) + diff = abs(predict_time_field - true_time_field) - print(field + " error: " + str(diff / true_time_field)) + print(field + ": predict " + str(predict_time_field) + " actual " + + str(true_time_field) + " error " + str(diff / true_time_field)) -- GitLab From 27f73be20c6b07836cfeff90237406f4de439681 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 24 Jul 2018 11:34:53 -0500 Subject: [PATCH 12/12] Add eval_locals model --- boxtree/fmm.py | 33 +++++++++++++++++++++++++++++++-- examples/performance_model.py | 15 +++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 2000225..b79e849 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -679,6 +679,28 @@ class PerformanceCounter: return np2l + def count_eval_part(self, use_global_idx=False): + trav = self.traversal + tree = trav.tree + parameters = self.parameters + + if use_global_idx: + neval_part = np.zeros(tree.nboxes, dtype=np.intp) + else: + neval_part = np.zeros(len(trav.target_boxes), dtype=np.intp) + + for itgt_box, tgt_ibox in enumerate(trav.target_boxes): + ntargets = tree.box_target_counts_nonchild[tgt_ibox] + tgt_box_level = trav.tree.box_levels[tgt_ibox] + ncoeffs_fmm = parameters.ncoeffs_fmm_by_level[tgt_box_level] + + if use_global_idx: + neval_part[tgt_ibox] = ntargets * ncoeffs_fmm + else: + neval_part[itgt_box] = ntargets * ncoeffs_fmm + + return neval_part + class PerformanceModel: @@ -692,7 +714,7 @@ class PerformanceModel: from pyopencl.clrandom import PhiloxGenerator self.rng = PhiloxGenerator(cl_context) - def time_performance(self, traversal): + def time_performance(self, traversal, drive_fmm): wrangler = self.wrangler_factory(traversal.tree) counter = PerformanceCounter(traversal, wrangler, self.uses_pde_expansions) @@ -704,7 +726,8 @@ class PerformanceModel: "direct_nsource_boxes": traversal.neighbor_source_boxes_starts[-1], "m2l_workload": np.sum(counter.count_m2l()), "m2p_workload": np.sum(counter.count_m2p()), - "p2l_workload": np.sum(counter.count_p2l()) + "p2l_workload": np.sum(counter.count_p2l()), + "eval_part_workload": np.sum(counter.count_eval_part()) } # Generate random source weights @@ -749,6 +772,12 @@ class PerformanceModel: wall_time=wall_time ) + def eval_locals_model(self, wall_time=True): + return self.linear_regression( + "eval_locals", ["eval_part_workload"], + wall_time=wall_time + ) + def linear_regression(self, y_name, x_name, wall_time=True): """ :arg y_name: Name of the depedent variable diff --git a/examples/performance_model.py b/examples/performance_model.py index a139f99..24fc6db 100644 --- a/examples/performance_model.py +++ b/examples/performance_model.py @@ -55,7 +55,7 @@ wrangler_factory = functools.partial( ntraversals = len(traversals) model = PerformanceModel(context, wrangler_factory, True) for i in range(ntraversals - 1): - model.time_performance(traversals[i]) + model.time_performance(traversals[i], drive_fmm) eval_traversal = traversals[-1] eval_wrangler = wrangler_factory(eval_traversal.tree) @@ -107,6 +107,16 @@ predict_timing["form_locals"] = p2l_workload * param[0] + param[1] # }}} +# {{{ + +param = model.eval_locals_model(wall_time=wall_time) + +eval_part_workload = np.sum(eval_counter.count_eval_part()) + +predict_timing["eval_locals"] = eval_part_workload * param[0] + param[1] + +# }}} + # {{{ Actual timing true_timing = {} @@ -120,7 +130,8 @@ _ = drive_fmm(eval_traversal, eval_wrangler, source_weights, timing_data=true_ti # }}} -for field in ["eval_direct", "multipole_to_local", "eval_multipoles", "form_locals"]: +for field in ["eval_direct", "multipole_to_local", "eval_multipoles", "form_locals", + "eval_locals"]: predict_time_field = predict_timing[field] if wall_time: -- GitLab