From 679cbf3ac31a1bacd35fbdbd1e8d57e589711c41 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 27 Jan 2019 21:56:46 +0100 Subject: [PATCH 01/32] Initial commit --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..846211dcb --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# kernel_profiler + -- GitLab From 55cd323e243a8f32d59bd0566683c44de63e56e6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 16:29:01 -0600 Subject: [PATCH 02/32] added initial version of kernel profiler that just times kernels --- examples/example.py | 35 +++++++++ kernel_profiler/__init__.py | 139 ++++++++++++++++++++++++++++++++++++ kernel_profiler/version.py | 1 + requirements.txt | 4 ++ setup.cfg | 6 ++ setup.py | 28 ++++++++ 6 files changed, 213 insertions(+) create mode 100644 examples/example.py create mode 100644 kernel_profiler/__init__.py create mode 100644 kernel_profiler/version.py create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/examples/example.py b/examples/example.py new file mode 100644 index 000000000..ab9cb0353 --- /dev/null +++ b/examples/example.py @@ -0,0 +1,35 @@ +import loopy as lp +import numpy as np +from kernel_profiler import KernelProfiler +from kernel_profiler import KernelStatOptions as stat_opts + + +knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Sun, 27 Jan 2019 20:31:48 -0600 Subject: [PATCH 03/32] added mem ops, math ops, sync ops, and grid sizes to stats options in profiler --- examples/example.py | 26 +++- kernel_profiler/__init__.py | 247 +++++++++++++++++++++++++++++++++--- 2 files changed, 251 insertions(+), 22 deletions(-) diff --git a/examples/example.py b/examples/example.py index ab9cb0353..a7eb14ef2 100644 --- a/examples/example.py +++ b/examples/example.py @@ -9,7 +9,10 @@ knl = lp.make_kernel( [ "c[i, j] = sum(k, a[i, k]*b[k, j])" ], - name="matmul", assumptions="n,m,ell >= 1") + name="matmul", + assumptions="n,m,ell >= 1", + lang_version=(2018, 2), + ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) @@ -26,10 +29,23 @@ ell = 128 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") -stats = kp.get_stats(knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(stats[stat_opts.WALL_TIME]) +stats = kp.profile( + knl, + [ + stat_opts.WALL_TIME, + stat_opts.MEMORY_ACCESS, + stat_opts.ARITHMETIC_OPS, + stat_opts.SYNCHRONIZATION, + stat_opts.GRID_SIZES, + ], + param_dict=param_dict) +print("\nWall time:", stats[stat_opts.WALL_TIME], "\n") +print(lp.stringify_stats_mapping(stats[stat_opts.MEMORY_ACCESS])) +print(lp.stringify_stats_mapping(stats[stat_opts.ARITHMETIC_OPS])) +print(lp.stringify_stats_mapping(stats[stat_opts.SYNCHRONIZATION])) +print(stats[stat_opts.GRID_SIZES], "\n") interactive_kp = KernelProfiler(interactive=True) -interactive_stats = interactive_kp.get_stats( +interactive_stats = interactive_kp.profile( knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(interactive_stats[stat_opts.WALL_TIME]) +print(interactive_stats[stat_opts.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 919aef41f..3ec544842 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -38,25 +38,43 @@ def find_cl_device_candidates(platform_name, device_name): class KernelStatOptions: WALL_TIME = "wall_time" + MEMORY_ACCESS = "memory_access" + ARITHMETIC_OPS = "arithmetic_ops" + SYNCHRONIZATION = 
"synchronization" + GRID_SIZES = "grid_sizes" # TODO add other stat options here class KernelProfiler(object): - N_WARMUP_TIME_TRIALS = 4 - N_TIME_TRIALS = 64 - def __init__( self, platform_name=None, device_name=None, interactive=False, + n_warmup_time_trials=4, + n_time_trials=64, + evaluate_polys=True, + subgroup_size=32, + count_redundant_work=True, + count_madds=True, + count_within_subscripts=False, ): + self.ctx_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive + self.n_warmup_time_trials = n_warmup_time_trials + self.n_time_trials = n_time_trials + + self.evaluate_polys = evaluate_polys + self.subgroup_size = subgroup_size + self.count_redundant_work = count_redundant_work + self.count_madds = count_madds + self.count_within_subscripts = count_within_subscripts + def get_cl_context(self): if self.interactive: @@ -70,8 +88,8 @@ class KernelProfiler(object): try: return self.ctx_cache[cache_key] except KeyError: - ctx = cl.Context( - [find_cl_device_candidates(self.platform_name, self.device_name)[-1]] + ctx = cl.Context([find_cl_device_candidates( + self.platform_name, self.device_name)[-1]] ) self.ctx_cache[cache_key] = ctx return ctx @@ -84,9 +102,9 @@ class KernelProfiler(object): n_trials=None, ): - n_warmup_trials = self.N_WARMUP_TIME_TRIALS if n_warmup_trials is None \ + n_warmup_trials = self.n_warmup_time_trials if not n_warmup_trials \ else n_warmup_trials - n_trials = self.N_TIME_TRIALS if n_trials is None else n_trials + n_trials = self.n_time_trials if not n_trials else n_trials ctx = self.get_cl_context() queue = cl.CommandQueue(ctx) @@ -109,22 +127,154 @@ class KernelProfiler(object): import numpy as np return np.average(wtimes[n_warmup_trials:]) - def get_stats( - self, + def get_mem_access_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + count_redundant_work=None, + subgroup_size=None, + ): + + from loopy.statistics import get_mem_access_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + + mem_access_map = get_mem_access_map( + knl, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return mem_access_map.eval(param_dict) + else: + return mem_access_map + + def get_op_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + count_redundant_work=None, + subgroup_size=None, + count_madds=None, + count_within_subscripts=None, + ): + + from loopy.statistics import get_op_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + count_madds = self.count_madds if not count_madds else count_madds + count_within_subscripts = self.count_within_subscripts \ + if not count_within_subscripts else count_within_subscripts + + op_map = get_op_map( knl, - stat_options=[], - param_dict=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - ): + count_redundant_work=count_redundant_work, + 
count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size, + count_madds=count_madds, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return op_map.eval(param_dict) + else: + return op_map + + def get_synchronization_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + subgroup_size=None, + ): + + from loopy.statistics import get_synchronization_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + + sync_map = get_synchronization_map( + knl, + subgroup_size=subgroup_size, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return sync_map.eval(param_dict) + else: + return sync_map + + def get_grid_sizes( + self, + knl, + evaluate_polys=None, + param_dict=None, + ): + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + + global_size, local_size = knl.get_grid_size_upper_bounds() + + from islpy import PwQPolynomial + gsize_pwqs = [] + lsize_pwqs = [] + for gsize in global_size: + gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) + for lsize in local_size: + lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ + [l.eval_with_dict(param_dict) for l in lsize_pwqs] + else: + return gsize_pwqs, lsize_pwqs + + def profile( + self, + knl, + stat_options=[], + param_dict=None, + n_warmup_wtime_trials=None, + n_wtime_trials=None, + evaluate_polys=True, + count_redundant_work=None, + subgroup_size=None, + count_madds=True, + count_within_subscripts=False, + ): stats_found = {} if KernelStatOptions.WALL_TIME in stat_options: - n_warmup_wtime_trials = self.N_WARMUP_TIME_TRIALS \ + # if no value passed, set to defaults + #TODO these checks are redundant + n_warmup_wtime_trials = self.n_warmup_time_trials \ if n_warmup_wtime_trials is None else n_warmup_wtime_trials - n_wtime_trials = self.N_TIME_TRIALS \ + n_wtime_trials = self.n_time_trials \ if n_wtime_trials is None else n_wtime_trials if param_dict is None: @@ -134,6 +284,69 @@ class KernelProfiler(object): stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( knl, param_dict, n_warmup_wtime_trials, n_wtime_trials) - # TODO add other stat options here + if KernelStatOptions.MEMORY_ACCESS in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size \ + if not subgroup_size else subgroup_size + + stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + ) + + if KernelStatOptions.ARITHMETIC_OPS in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size \ 
+ if not subgroup_size else subgroup_size + count_madds = self.count_madds if not count_madds else count_madds + count_within_subscripts = self.count_within_subscripts \ + if not count_within_subscripts else count_within_subscripts + + stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + count_madds=count_madds, + count_within_subscripts=count_within_subscripts, + ) + + if KernelStatOptions.SYNCHRONIZATION in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + subgroup_size = self.subgroup_size \ + if not subgroup_size else subgroup_size + + stats_found[KernelStatOptions.SYNCHRONIZATION] = \ + self.get_synchronization_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + subgroup_size=subgroup_size, + ) + + if KernelStatOptions.GRID_SIZES in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + + stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + ) return stats_found -- GitLab From c1b5b03182a4cdbab3897c89ab25e34c8de8cd22 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 20:58:10 -0600 Subject: [PATCH 04/32] removed redundant parameter checks, instead using instance variables and updating instance vars when requested --- kernel_profiler/__init__.py | 149 +++++++++--------------------------- 1 file changed, 38 insertions(+), 111 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 3ec544842..0d3048119 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -98,13 +98,11 @@ class KernelProfiler(object): self, knl, param_dict, - n_warmup_trials=None, - n_trials=None, ): - n_warmup_trials = self.n_warmup_time_trials if not n_warmup_trials \ - else n_warmup_trials - n_trials = self.n_time_trials if not n_trials else n_trials + if param_dict is None: + raise ValueError( + "Wall time requires dictionary of kernel parameters.") ctx = self.get_cl_context() queue = cl.CommandQueue(ctx) @@ -116,7 +114,7 @@ class KernelProfiler(object): wtimes = [] import time - for t in range(n_trials + n_warmup_trials): + for t in range(self.n_time_trials + self.n_warmup_time_trials): queue.finish() tstart = time.time() evt, out = compiled(queue, **arg_arrays) @@ -125,33 +123,23 @@ class KernelProfiler(object): wtimes.append(tend-tstart) import numpy as np - return np.average(wtimes[n_warmup_trials:]) + return np.average(wtimes[self.n_warmup_time_trials:]) def get_mem_access_stats( self, knl, - evaluate_polys=None, param_dict=None, - count_redundant_work=None, - subgroup_size=None, ): from loopy.statistics import get_mem_access_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - mem_access_map = get_mem_access_map( knl, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, + count_redundant_work=self.count_redundant_work, + subgroup_size=self.subgroup_size, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise 
ValueError("Cannont evaluate polynomials without param_dict.") return mem_access_map.eval(param_dict) @@ -161,35 +149,20 @@ class KernelProfiler(object): def get_op_stats( self, knl, - evaluate_polys=None, param_dict=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, ): from loopy.statistics import get_op_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - count_madds = self.count_madds if not count_madds else count_madds - count_within_subscripts = self.count_within_subscripts \ - if not count_within_subscripts else count_within_subscripts - op_map = get_op_map( knl, - count_redundant_work=count_redundant_work, - count_within_subscripts=count_within_subscripts, - subgroup_size=subgroup_size, - count_madds=count_madds, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return op_map.eval(param_dict) @@ -199,24 +172,17 @@ class KernelProfiler(object): def get_synchronization_stats( self, knl, - evaluate_polys=None, param_dict=None, - subgroup_size=None, ): from loopy.statistics import get_synchronization_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - sync_map = get_synchronization_map( knl, - subgroup_size=subgroup_size, + subgroup_size=self.subgroup_size, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return sync_map.eval(param_dict) @@ -226,14 +192,9 @@ class KernelProfiler(object): def get_grid_sizes( self, knl, - evaluate_polys=None, param_dict=None, ): - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - global_size, local_size = knl.get_grid_size_upper_bounds() from islpy import PwQPolynomial @@ -244,7 +205,7 @@ class KernelProfiler(object): for lsize in local_size: lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ @@ -259,93 +220,59 @@ class KernelProfiler(object): param_dict=None, n_warmup_wtime_trials=None, n_wtime_trials=None, - evaluate_polys=True, + evaluate_polys=None, count_redundant_work=None, subgroup_size=None, - count_madds=True, - count_within_subscripts=False, + count_madds=None, + count_within_subscripts=None, ): + # update instance vars if requested + if n_warmup_wtime_trials is not None: + self.n_warmup_wtime_trials = n_warmup_wtime_trials + if n_wtime_trials is not None: + self.n_wtime_trials = n_wtime_trials + if evaluate_polys is not None: + self.evaluate_polys = evaluate_polys + if count_redundant_work is not None: + self.count_redundant_work = count_redundant_work + if subgroup_size is not None: + self.subgroup_size = subgroup_size + if count_madds is not None: + 
self.count_madds = count_madds + if count_within_subscripts is not None: + self.count_within_subscripts = count_within_subscripts + stats_found = {} if KernelStatOptions.WALL_TIME in stat_options: - - # if no value passed, set to defaults - #TODO these checks are redundant - n_warmup_wtime_trials = self.n_warmup_time_trials \ - if n_warmup_wtime_trials is None else n_warmup_wtime_trials - n_wtime_trials = self.n_time_trials \ - if n_wtime_trials is None else n_wtime_trials - - if param_dict is None: - raise ValueError( - "Wall time requires dictionary of kernel parameters.") - stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( - knl, param_dict, n_warmup_wtime_trials, n_wtime_trials) + knl, + param_dict, + ) if KernelStatOptions.MEMORY_ACCESS in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, ) if KernelStatOptions.ARITHMETIC_OPS in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - count_madds = self.count_madds if not count_madds else count_madds - count_within_subscripts = self.count_within_subscripts \ - if not count_within_subscripts else count_within_subscripts - stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, - count_madds=count_madds, - count_within_subscripts=count_within_subscripts, ) if KernelStatOptions.SYNCHRONIZATION in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - stats_found[KernelStatOptions.SYNCHRONIZATION] = \ self.get_synchronization_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - subgroup_size=subgroup_size, ) if KernelStatOptions.GRID_SIZES in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, ) -- GitLab From 8e8a980ff3c655bf5b03a862b63a1a6aa4e45a59 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 23:16:30 -0600 Subject: [PATCH 05/32] added flop rate and bandwidth to profiler stat options --- examples/example.py | 40 ++++++++++++++++------------ kernel_profiler/__init__.py | 52 +++++++++++++++++++++++++++---------- requirements.txt | 1 + 3 files changed, 62 insertions(+), 31 deletions(-) diff --git a/examples/example.py b/examples/example.py index a7eb14ef2..3a18c0aa2 100644 --- a/examples/example.py +++ b/examples/example.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np from kernel_profiler import KernelProfiler 
-from kernel_profiler import KernelStatOptions as stat_opts +from kernel_profiler import KernelStatOptions as kso knl = lp.make_kernel( @@ -23,29 +23,35 @@ knl = lp.split_iname(knl, "k", lsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") -n = 512 -m = 256 -ell = 128 +n = 2**10 +m = 2**11 +ell = 2**12 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") stats = kp.profile( knl, [ - stat_opts.WALL_TIME, - stat_opts.MEMORY_ACCESS, - stat_opts.ARITHMETIC_OPS, - stat_opts.SYNCHRONIZATION, - stat_opts.GRID_SIZES, + kso.WALL_TIME, + kso.MEM_ACCESS_MAP, + kso.OP_MAP, + kso.SYNC_MAP, + kso.GRID_SIZES, + kso.FLOP_RATE, + kso.MEM_BANDWIDTH, ], - param_dict=param_dict) -print("\nWall time:", stats[stat_opts.WALL_TIME], "\n") -print(lp.stringify_stats_mapping(stats[stat_opts.MEMORY_ACCESS])) -print(lp.stringify_stats_mapping(stats[stat_opts.ARITHMETIC_OPS])) -print(lp.stringify_stats_mapping(stats[stat_opts.SYNCHRONIZATION])) -print(stats[stat_opts.GRID_SIZES], "\n") + param_dict=param_dict, + evaluate_polys=False, + ) +print("\nWall time:", stats[kso.WALL_TIME], "\n") +print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) +print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) +print(lp.stringify_stats_mapping(stats[kso.SYNC_MAP])) +print(stats[kso.GRID_SIZES], "\n") +print(stats[kso.FLOP_RATE], "\n") +print(stats[kso.MEM_BANDWIDTH], "\n") interactive_kp = KernelProfiler(interactive=True) interactive_stats = interactive_kp.profile( - knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(interactive_stats[stat_opts.WALL_TIME], "\n") + knl, [kso.WALL_TIME], param_dict=param_dict) +print(interactive_stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 0d3048119..06ee71a3d 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -38,11 +38,12 @@ def find_cl_device_candidates(platform_name, device_name): class KernelStatOptions: WALL_TIME = "wall_time" - MEMORY_ACCESS = "memory_access" - ARITHMETIC_OPS = "arithmetic_ops" - SYNCHRONIZATION = "synchronization" + MEM_ACCESS_MAP = "mem_access_map" + OP_MAP = "op_map" + SYNC_MAP = "sync_map" GRID_SIZES = "grid_sizes" - # TODO add other stat options here + FLOP_RATE = "flop_rate" + MEM_BANDWIDTH = "mem_bandwidth" class KernelProfiler(object): @@ -244,36 +245,59 @@ class KernelProfiler(object): self.count_within_subscripts = count_within_subscripts stats_found = {} + kso = KernelStatOptions - if KernelStatOptions.WALL_TIME in stat_options: - stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( + if kso.WALL_TIME in stat_options or \ + kso.FLOP_RATE in stat_options or \ + kso.MEM_BANDWIDTH in stat_options: + stats_found[kso.WALL_TIME] = self.time_kernel( knl, param_dict, ) - if KernelStatOptions.MEMORY_ACCESS in stat_options: - stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( + if kso.MEM_ACCESS_MAP in stat_options or \ + kso.MEM_BANDWIDTH in stat_options: + stats_found[kso.MEM_ACCESS_MAP] = self.get_mem_access_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.ARITHMETIC_OPS in stat_options: - stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( + if kso.OP_MAP in stat_options or \ + kso.FLOP_RATE in stat_options: + stats_found[kso.OP_MAP] = self.get_op_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.SYNCHRONIZATION in stat_options: - 
stats_found[KernelStatOptions.SYNCHRONIZATION] = \ + if kso.SYNC_MAP in stat_options: + stats_found[kso.SYNC_MAP] = \ self.get_synchronization_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.GRID_SIZES in stat_options: - stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( + if kso.GRID_SIZES in stat_options: + stats_found[kso.GRID_SIZES] = self.get_grid_sizes( knl, param_dict=param_dict, ) + if kso.FLOP_RATE in stat_options: + import numpy as np + float_ops = stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64] + ).sum() + if not self.evaluate_polys: + float_ops = float_ops.eval_with_dict(param_dict) + stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] + + if kso.MEM_BANDWIDTH in stat_options: + data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"] + ).to_bytes().sum() + if not self.evaluate_polys: + data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) + stats_found[kso.MEM_BANDWIDTH] = \ + data_moved_bytes/stats_found[kso.WALL_TIME] + return stats_found diff --git a/requirements.txt b/requirements.txt index 5352cf661..8482d2c84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ numpy +git+https://github.com/inducer/islpy.git git+https://github.com/inducer/loopy.git git+https://github.com/inducer/pyopencl.git git+https://github.com/inducer/pytools.git -- GitLab From b55c066ff5ccd37fd13427b1d28ce6736bc6be00 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 11:38:49 -0600 Subject: [PATCH 06/32] minor change to example --- examples/example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/example.py b/examples/example.py index 3a18c0aa2..ddc44bc2a 100644 --- a/examples/example.py +++ b/examples/example.py @@ -25,7 +25,7 @@ knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") n = 2**10 m = 2**11 -ell = 2**12 +ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") @@ -41,7 +41,7 @@ stats = kp.profile( kso.MEM_BANDWIDTH, ], param_dict=param_dict, - evaluate_polys=False, + evaluate_polys=True, ) print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) -- GitLab From f69fab3d8f0bb43897a37d6495be984cab48c835 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 11:46:01 -0600 Subject: [PATCH 07/32] install instructions in readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 846211dcb..addc1c34c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,9 @@ # kernel_profiler +Install: + +`python setup.py install` + +Developer install (source changes take immediate effect): + +`python setup.py develop` -- GitLab From 16cef0d159e829be19be4c8de0bcfe50470a6d9c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 12:04:58 -0600 Subject: [PATCH 08/32] explained stats options in readme --- README.md | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index addc1c34c..5f83baabb 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,32 @@ # kernel_profiler -Install: +**Install**: `python setup.py install` -Developer install (source changes take immediate effect): +**Developer install** (source changes take immediate effect): `python setup.py develop` + +# Stat options + +* **KernelStatOptions.WALL_TIME** + Kernel execution time using random input data. 
+ +* **KernelStatOptions.MEM_ACCESS_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping memory accesses to counts. Also see [**loopy.get_mem_access_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_mem_access_map). + +* **KernelStatOptions.OP_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping operations to counts. Also see [**loopy.get_op_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_op_map). + +* **KernelStatOptions.SYNC_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping synchronization operations to counts. Also see [**loopy.get_synchronization_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_synchronization_map). + +* **KernelStatOptions.GRID_SIZES** + A tuple containing (local sizes, global sizes). + +* **KernelStatOptions.FLOP_RATE** + Number of 32-bit and 64-bit floating point operations per second. + +* **KernelStatOptions.MEM_BANDWIDTH** + Global memory bytes accessed per second. -- GitLab From 8a680829aaa2f91b2f42d20c42a9fb8eee712acd Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 30 Jan 2019 16:15:55 -0600 Subject: [PATCH 09/32] added save_ptx option --- examples/example.py | 3 ++- kernel_profiler/__init__.py | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/examples/example.py b/examples/example.py index ddc44bc2a..0231089f9 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,7 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE") +kp = KernelProfiler("NVIDIA", "GEFORCE", include_kernel_params_in_ptx_filename=True) stats = kp.profile( knl, [ @@ -39,6 +39,7 @@ stats = kp.profile( kso.GRID_SIZES, kso.FLOP_RATE, kso.MEM_BANDWIDTH, + kso.SAVE_PTX, ], param_dict=param_dict, evaluate_polys=True, diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 06ee71a3d..67a8a40e7 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -36,6 +36,17 @@ def find_cl_device_candidates(platform_name, device_name): return candidates +def write_ptx(ctx, knl, filename=None): + cl_program = cl.Program( + ctx, lp.generate_code_v2(knl).device_code() + ).build(options=knl.options.cl_build_options) + ptx_src = cl_program.binaries[0] + if not filename: + filename = "ptx_"+knl.name+".ptx" + ptx_src_file = open(filename, 'w') + ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + + class KernelStatOptions: WALL_TIME = "wall_time" MEM_ACCESS_MAP = "mem_access_map" @@ -44,6 +55,7 @@ class KernelStatOptions: GRID_SIZES = "grid_sizes" FLOP_RATE = "flop_rate" MEM_BANDWIDTH = "mem_bandwidth" + SAVE_PTX = "save_ptx" class KernelProfiler(object): @@ -60,6 +72,7 @@ class KernelProfiler(object): count_redundant_work=True, count_madds=True, count_within_subscripts=False, + include_kernel_params_in_ptx_filename=False, ): self.ctx_cache = {} @@ -76,6 +89,9 @@ class KernelProfiler(object): self.count_madds = count_madds self.count_within_subscripts = count_within_subscripts + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename + def get_cl_context(self): if self.interactive: @@ -126,6 +142,23 @@ class KernelProfiler(object): import numpy as np return np.average(wtimes[self.n_warmup_time_trials:]) + def save_ptx( + self, + knl, + param_dict=None, + ): + + if self.include_kernel_params_in_ptx_filename: + 
write_ptx( + self.get_cl_context(), + knl, + filename="ptx_"+knl.name+"_"+"_".join( + ["%s%d" % (p, v) for p, v in param_dict.items()] + )+".ptx" + ) + else: + write_ptx(self.get_cl_context(), knl) + def get_mem_access_stats( self, knl, @@ -226,9 +259,12 @@ class KernelProfiler(object): subgroup_size=None, count_madds=None, count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, ): # update instance vars if requested + # TODO don't change instance variables, don't allow options changes here, + # instead, make a change_profile_options function if n_warmup_wtime_trials is not None: self.n_warmup_wtime_trials = n_warmup_wtime_trials if n_wtime_trials is not None: @@ -243,6 +279,9 @@ class KernelProfiler(object): self.count_madds = count_madds if count_within_subscripts is not None: self.count_within_subscripts = count_within_subscripts + if include_kernel_params_in_ptx_filename is not None: + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename stats_found = {} kso = KernelStatOptions @@ -300,4 +339,7 @@ class KernelProfiler(object): stats_found[kso.MEM_BANDWIDTH] = \ data_moved_bytes/stats_found[kso.WALL_TIME] + if kso.SAVE_PTX in stat_options: + self.save_ptx(knl, param_dict) + return stats_found -- GitLab From b07cd5b03d6c00a88734642ba9047b20eb1782be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 30 Jan 2019 16:25:27 -0600 Subject: [PATCH 10/32] created update_options function to change instance variables in profiler, rather than changing them when profiling --- examples/example.py | 11 +++--- kernel_profiler/__init__.py | 70 ++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/examples/example.py b/examples/example.py index 0231089f9..c3a6cfff8 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,10 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE", include_kernel_params_in_ptx_filename=True) +kp = KernelProfiler("NVIDIA", "GEFORCE", + evaluate_polys = True, + include_kernel_params_in_ptx_filename=True, + ) stats = kp.profile( knl, [ @@ -42,7 +45,6 @@ stats = kp.profile( kso.SAVE_PTX, ], param_dict=param_dict, - evaluate_polys=True, ) print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) @@ -52,7 +54,8 @@ print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") print(stats[kso.MEM_BANDWIDTH], "\n") -interactive_kp = KernelProfiler(interactive=True) -interactive_stats = interactive_kp.profile( +kp.update_options(interactive=True) + +interactive_stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) print(interactive_stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 67a8a40e7..b0abf3bfd 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -92,6 +92,44 @@ class KernelProfiler(object): self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + def update_options( + self, + platform_name=None, + device_name=None, + interactive=None, + n_warmup_wtime_trials=None, + n_wtime_trials=None, + evaluate_polys=None, + count_redundant_work=None, + subgroup_size=None, + count_madds=None, + count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, + ): + if platform_name is not None: + self.platform_name = platform_name + if device_name is not None: + self.device_name = device_name + if interactive is not None: + 
self.interactive = interactive + if n_warmup_wtime_trials is not None: + self.n_warmup_wtime_trials = n_warmup_wtime_trials + if n_wtime_trials is not None: + self.n_wtime_trials = n_wtime_trials + if evaluate_polys is not None: + self.evaluate_polys = evaluate_polys + if count_redundant_work is not None: + self.count_redundant_work = count_redundant_work + if subgroup_size is not None: + self.subgroup_size = subgroup_size + if count_madds is not None: + self.count_madds = count_madds + if count_within_subscripts is not None: + self.count_within_subscripts = count_within_subscripts + if include_kernel_params_in_ptx_filename is not None: + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename + def get_cl_context(self): if self.interactive: @@ -149,6 +187,9 @@ class KernelProfiler(object): ): if self.include_kernel_params_in_ptx_filename: + if param_dict is None: + raise ValueError("Cannot include kernel params " + "in ptx filename, no param dict passed.") write_ptx( self.get_cl_context(), knl, @@ -252,37 +293,8 @@ class KernelProfiler(object): knl, stat_options=[], param_dict=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - evaluate_polys=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, - include_kernel_params_in_ptx_filename=None, ): - # update instance vars if requested - # TODO don't change instance variables, don't allow options changes here, - # instead, make a change_profile_options function - if n_warmup_wtime_trials is not None: - self.n_warmup_wtime_trials = n_warmup_wtime_trials - if n_wtime_trials is not None: - self.n_wtime_trials = n_wtime_trials - if evaluate_polys is not None: - self.evaluate_polys = evaluate_polys - if count_redundant_work is not None: - self.count_redundant_work = count_redundant_work - if subgroup_size is not None: - self.subgroup_size = subgroup_size - if count_madds is not None: - self.count_madds = count_madds - if count_within_subscripts is not None: - self.count_within_subscripts = count_within_subscripts - if include_kernel_params_in_ptx_filename is not None: - self.include_kernel_params_in_ptx_filename = \ - include_kernel_params_in_ptx_filename - stats_found = {} kso = KernelStatOptions -- GitLab From ad495b80ac860b6fca804ea01b771500be38e12d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:22:12 -0600 Subject: [PATCH 11/32] added generated_code as a stat option --- examples/example.py | 7 +++++-- kernel_profiler/__init__.py | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/example.py b/examples/example.py index c3a6cfff8..28ebf2c29 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,9 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE", +kp = KernelProfiler( + #"NVIDIA", "GEFORCE", + interactive=True, evaluate_polys = True, include_kernel_params_in_ptx_filename=True, ) @@ -43,6 +45,7 @@ stats = kp.profile( kso.FLOP_RATE, kso.MEM_BANDWIDTH, kso.SAVE_PTX, + kso.GENERATED_CODE, ], param_dict=param_dict, ) @@ -54,7 +57,7 @@ print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") print(stats[kso.MEM_BANDWIDTH], "\n") -kp.update_options(interactive=True) +kp.update_options(evaluate_polys=False) interactive_stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index b0abf3bfd..ec13b0914 100644 --- a/kernel_profiler/__init__.py +++ 
b/kernel_profiler/__init__.py @@ -56,6 +56,7 @@ class KernelStatOptions: FLOP_RATE = "flop_rate" MEM_BANDWIDTH = "mem_bandwidth" SAVE_PTX = "save_ptx" + GENERATED_CODE = "generated_code" class KernelProfiler(object): @@ -354,4 +355,7 @@ class KernelProfiler(object): if kso.SAVE_PTX in stat_options: self.save_ptx(knl, param_dict) + if kso.GENERATED_CODE in stat_options: + stats_found[kso.GENERATED_CODE] = lp.generate_code_v2(knl).device_code() + return stats_found -- GitLab From 10eb329a4c33c69bf77f397c3eb54109de508a9e Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:40:46 -0600 Subject: [PATCH 12/32] removed interactive setting, instead automatically interactive when no platform/device provided --- examples/example.py | 6 +++--- kernel_profiler/__init__.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/example.py b/examples/example.py index 28ebf2c29..c9db9ebe5 100644 --- a/examples/example.py +++ b/examples/example.py @@ -30,7 +30,7 @@ param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( #"NVIDIA", "GEFORCE", - interactive=True, + #"NVIDIA", "K40C", evaluate_polys = True, include_kernel_params_in_ptx_filename=True, ) @@ -59,6 +59,6 @@ print(stats[kso.MEM_BANDWIDTH], "\n") kp.update_options(evaluate_polys=False) -interactive_stats = kp.profile( +stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) -print(interactive_stats[kso.WALL_TIME], "\n") +print(stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index ec13b0914..058966dd2 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -133,13 +133,13 @@ class KernelProfiler(object): def get_cl_context(self): - if self.interactive: - return cl.create_some_context() + if self.platform_name is None or self.device_name is None: + ctx = cl.create_some_context() + self.platform_name = ctx.devices[0].platform.name + self.device_name = ctx.devices[0].name + self.ctx_cache[(self.platform_name, self.device_name, "ctx")] = ctx + return ctx else: - if self.platform_name is None or self.device_name is None: - raise ValueError( - "Wall time requires platform name, and device name.") - cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] -- GitLab From 12e4c12855b407fbb8e2994beee3ec43b0219a28 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:56:49 -0600 Subject: [PATCH 13/32] allowing ptx filename suffix --- kernel_profiler/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 058966dd2..4c0e103b3 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -74,7 +74,9 @@ class KernelProfiler(object): count_madds=True, count_within_subscripts=False, include_kernel_params_in_ptx_filename=False, + ptx_filename_suffix="", ): + # TODO figure out how to let user specify target w/device self.ctx_cache = {} self.platform_name = platform_name @@ -92,6 +94,7 @@ class KernelProfiler(object): self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + self.ptx_filename_suffix = ptx_filename_suffix def update_options( self, @@ -106,6 +109,7 @@ class KernelProfiler(object): count_madds=None, count_within_subscripts=None, include_kernel_params_in_ptx_filename=None, + ptx_filename_suffix=None, ): if platform_name is not None: self.platform_name = platform_name @@ -130,6 +134,8 @@ class KernelProfiler(object): if 
include_kernel_params_in_ptx_filename is not None: self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + if ptx_filename_suffix is not None: + self.ptx_filename_suffix = ptx_filename_suffix def get_cl_context(self): @@ -196,7 +202,7 @@ class KernelProfiler(object): knl, filename="ptx_"+knl.name+"_"+"_".join( ["%s%d" % (p, v) for p, v in param_dict.items()] - )+".ptx" + )+self.ptx_filename_suffix+".ptx" ) else: write_ptx(self.get_cl_context(), knl) -- GitLab From ac5516874cce9724319ec26bda2da9a6b4f9c77b Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 23:22:47 -0600 Subject: [PATCH 14/32] now when kernel already has a target w/ device, use that to create ctx --- kernel_profiler/__init__.py | 41 +++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 4c0e103b3..ec113b3a6 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -137,15 +137,44 @@ class KernelProfiler(object): if ptx_filename_suffix is not None: self.ptx_filename_suffix = ptx_filename_suffix - def get_cl_context(self): + def get_cl_context(self, knl): + + if knl.target is not None and knl.target.device is not None: + # kernel has a device already, see if we can use it + knl_platform_name = knl.target.device.platform.name + knl_device_name = knl.target.device.name + + # check for mismatch between platforms/devices + if (self.platform_name is not None + and not self.platform_name in knl_platform_name) or ( + self.device_name is not None + and not self.device_name in knl_device_name): + raise ValueError("kernel target platform %s and/or device %s do " + "not match profiler platform %s and/or device %s." + % (knl_platform_name, knl_device_name, + self.platform_name, self.device_name)) + + cache_key = (knl_platform_name, knl_device_name, "ctx") + try: + return self.ctx_cache[cache_key] + except KeyError: + ctx = cl.Context([find_cl_device_candidates( + knl_platform_name, knl_device_name)[-1]] + ) + self.ctx_cache[cache_key] = ctx + return ctx - if self.platform_name is None or self.device_name is None: - ctx = cl.create_some_context() + elif self.platform_name is None or self.device_name is None: + # kernel does not have a pre-specified device, + # and profiler does not know platform+device + ctx = cl.create_some_context() # interactive mode self.platform_name = ctx.devices[0].platform.name self.device_name = ctx.devices[0].name self.ctx_cache[(self.platform_name, self.device_name, "ctx")] = ctx return ctx + else: + # profiler knows both platform and device already cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] @@ -166,7 +195,7 @@ class KernelProfiler(object): raise ValueError( "Wall time requires dictionary of kernel parameters.") - ctx = self.get_cl_context() + ctx = self.get_cl_context(knl) queue = cl.CommandQueue(ctx) arg_arrays = create_rand_args(ctx, knl, param_dict) @@ -198,14 +227,14 @@ class KernelProfiler(object): raise ValueError("Cannot include kernel params " "in ptx filename, no param dict passed.") write_ptx( - self.get_cl_context(), + self.get_cl_context(knl), knl, filename="ptx_"+knl.name+"_"+"_".join( ["%s%d" % (p, v) for p, v in param_dict.items()] )+self.ptx_filename_suffix+".ptx" ) else: - write_ptx(self.get_cl_context(), knl) + write_ptx(self.get_cl_context(knl), knl) def get_mem_access_stats( self, -- GitLab From a61c07d94b91aa2585dcdd6a052d627ca9c26eac Mon Sep 17 00:00:00 2001 From: 
James Stevens Date: Thu, 31 Jan 2019 01:40:46 -0600 Subject: [PATCH 15/32] counting madds as two ops for flop/s rate --- kernel_profiler/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index ec113b3a6..453fa9a92 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -371,8 +371,13 @@ class KernelProfiler(object): if kso.FLOP_RATE in stat_options: import numpy as np + # count madds as 2 ops + # (count all flops once and then count the madds again) float_ops = stats_found[kso.OP_MAP].filter_by( dtype=[np.float32, np.float64] + ).sum() + \ + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], name=["madd"] ).sum() if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) -- GitLab From 5b330274988b54ff3d4c5e1ad6982c45606f9219 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 10:39:40 -0600 Subject: [PATCH 16/32] updated readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 5f83baabb..0c1e2149c 100644 --- a/README.md +++ b/README.md @@ -30,3 +30,9 @@ * **KernelStatOptions.MEM_BANDWIDTH** Global memory bytes accessed per second. + +* **KernelStatOptions.GENERATED_CODE** + Generated opencl code. + +* **KernelStatOptions.SAVE_PTX** + Save PTX (Portable Thread eXecution) file. -- GitLab From 339ecaaa28007d6f8190ed899d8d6bd882864bb3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 19:28:32 -0600 Subject: [PATCH 17/32] fixing flop counting, flops were only being counted once per subgroup --- kernel_profiler/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 453fa9a92..d2b2e00f8 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -373,17 +373,19 @@ class KernelProfiler(object): import numpy as np # count madds as 2 ops # (count all flops once and then count the madds again) - float_ops = stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64] - ).sum() + \ + float_ops = self.subgroup_size*( stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64], name=["madd"] - ).sum() + dtype=[np.float32, np.float64] + ).sum() + + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], name=["madd"] + ).sum()) if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: + # TODO check for stride 0 access, only counted once per subgroup data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"] ).to_bytes().sum() -- GitLab From 642845920ba7bd7560b8ccd3be218c2301d68ce0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 22:38:08 -0600 Subject: [PATCH 18/32] accounting for count granularity when computing flops and bandwidth --- kernel_profiler/__init__.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index d2b2e00f8..4cbefbed0 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -76,7 +76,6 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename=False, ptx_filename_suffix="", ): - # TODO figure out how to let user specify target w/device self.ctx_cache = {} self.platform_name = platform_name @@ -146,9 +145,9 @@ class KernelProfiler(object): 
# check for mismatch between platforms/devices if (self.platform_name is not None - and not self.platform_name in knl_platform_name) or ( + and self.platform_name not in knl_platform_name) or ( self.device_name is not None - and not self.device_name in knl_device_name): + and self.device_name not in knl_device_name): raise ValueError("kernel target platform %s and/or device %s do " "not match profiler platform %s and/or device %s." % (knl_platform_name, knl_device_name, @@ -373,21 +372,45 @@ class KernelProfiler(object): import numpy as np # count madds as 2 ops # (count all flops once and then count the madds again) + + # flops counted w/subgroup granularity float_ops = self.subgroup_size*( stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64] + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.SUBGROUP], ).sum() + stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64], name=["madd"] + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.SUBGROUP], + name=["madd"] ).sum()) + + # flops counted w/workitem granularity (should be zero) + float_ops += stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.WORKITEM], + ).sum() + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.WORKITEM], + name=["madd"] + ).sum() + # TODO after ToCountMap gets version of sum that allows + # counting w/specified count granularity, update this + if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: - # TODO check for stride 0 access, only counted once per subgroup + # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( - mtype=["global"] + mtype=["global"], + count_granularity=[lp.CountGranularity.SUBGROUP], + ).to_bytes().sum()*self.subgroup_size + # mem access counted w/workitem granularity + data_moved_bytes += stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"], + count_granularity=[lp.CountGranularity.WORKITEM], ).to_bytes().sum() if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) -- GitLab From b5272b58fb1cdee56aeb7ee2cde77f411fe5bf37 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 19:32:48 -0600 Subject: [PATCH 19/32] MEM_BANDWIDTH now calculated two ways, once counting all global accesses and once counting footprint --- examples/example.py | 6 +++--- kernel_profiler/__init__.py | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/example.py b/examples/example.py index c9db9ebe5..09c25a7e8 100644 --- a/examples/example.py +++ b/examples/example.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np from kernel_profiler import KernelProfiler -from kernel_profiler import KernelStatOptions as kso +from kernel_profiler import KernelStatOptions as kso # noqa knl = lp.make_kernel( @@ -31,7 +31,7 @@ param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( #"NVIDIA", "GEFORCE", #"NVIDIA", "K40C", - evaluate_polys = True, + evaluate_polys=True, include_kernel_params_in_ptx_filename=True, ) stats = kp.profile( @@ -55,7 +55,7 @@ print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) print(lp.stringify_stats_mapping(stats[kso.SYNC_MAP])) print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") -print(stats[kso.MEM_BANDWIDTH], "\n") 
+print(stats[kso.MEM_BANDWIDTH][0], stats[kso.MEM_BANDWIDTH][1], "\n") kp.update_options(evaluate_polys=False) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 4cbefbed0..1dd2223fa 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -77,6 +77,8 @@ class KernelProfiler(object): ptx_filename_suffix="", ): + # TODO create cache to store kernels (for executing w/different params) + # TODO create cache to store stats mappings self.ctx_cache = {} self.platform_name = platform_name self.device_name = device_name @@ -402,6 +404,14 @@ class KernelProfiler(object): stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: + # first get footprint of data moved + from loopy import gather_access_footprint_bytes + footsize_bytes = 0 + for access, count in stats_found[kso.MEM_ACCESS_MAP].items(): + if access.mtype == "global": + direction = "write" if access.direction == "store" else "read" + footsize_bytes += gather_access_footprint_bytes(knl)[(access.variable, direction)].eval_with_dict(param_dict) + # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], @@ -412,10 +422,14 @@ class KernelProfiler(object): mtype=["global"], count_granularity=[lp.CountGranularity.WORKITEM], ).to_bytes().sum() + # if these polys have not alread been evaluated, evaluate them if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) - stats_found[kso.MEM_BANDWIDTH] = \ - data_moved_bytes/stats_found[kso.WALL_TIME] + + stats_found[kso.MEM_BANDWIDTH] = ( + data_moved_bytes/stats_found[kso.WALL_TIME], + footsize_bytes/stats_found[kso.WALL_TIME] + ) if kso.SAVE_PTX in stat_options: self.save_ptx(knl, param_dict) -- GitLab From 8198c328bec28743678ebbdae78e675b9ec630d9 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:10:28 -0600 Subject: [PATCH 20/32] caching stats maps; combined separate stats getting functions into one to reduce redundant code --- kernel_profiler/__init__.py | 120 +++++++++++++++++------------------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 1dd2223fa..7f0d150a3 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -1,5 +1,6 @@ import pyopencl as cl import loopy as lp +from loopy.preprocess import prepare_for_caching def create_rand_args(ctx, knl, param_dict): @@ -78,8 +79,8 @@ class KernelProfiler(object): ): # TODO create cache to store kernels (for executing w/different params) - # TODO create cache to store stats mappings self.ctx_cache = {} + self.stats_mapping_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive @@ -237,69 +238,57 @@ class KernelProfiler(object): else: write_ptx(self.get_cl_context(knl), knl) - def get_mem_access_stats( + def get_cached_stats_mapping( self, knl, - param_dict=None, - ): - - from loopy.statistics import get_mem_access_map - - mem_access_map = get_mem_access_map( - knl, - count_redundant_work=self.count_redundant_work, - subgroup_size=self.subgroup_size, - ) - - if self.evaluate_polys: - if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return mem_access_map.eval(param_dict) - else: - return mem_access_map - - def get_op_stats( - self, - knl, - param_dict=None, + stat_option, # KernelStatOptions ): - from loopy.statistics import get_op_map - - op_map = 
get_op_map( - knl, - count_redundant_work=self.count_redundant_work, - count_within_subscripts=self.count_within_subscripts, - subgroup_size=self.subgroup_size, - count_madds=self.count_madds, - ) + cache_key = (prepare_for_caching(knl), stat_option) + # TODO avoid multiple calls to prepare_for_caching()? - if self.evaluate_polys: - if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return op_map.eval(param_dict) - else: - return op_map + try: + return self.stats_mapping_cache[cache_key] + except KeyError: + if stat_option == KernelStatOptions.MEM_ACCESS_MAP: + from loopy.statistics import get_mem_access_map + stats_map = get_mem_access_map( + knl, + count_redundant_work=self.count_redundant_work, + subgroup_size=self.subgroup_size, + ) + elif stat_option == KernelStatOptions.OP_MAP: + from loopy.statistics import get_op_map + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, + ) + elif stat_option == KernelStatOptions.SYNC_MAP: + from loopy.statistics import get_synchronization_map + stats_map = get_synchronization_map( + knl, + subgroup_size=self.subgroup_size, + ) + self.stats_mapping_cache[cache_key] = stats_map + return stats_map - def get_synchronization_stats( + def get_stats_mapping_and_evaluate_if_required( self, knl, + stat_option, # KernelStatOptions param_dict=None, ): - - from loopy.statistics import get_synchronization_map - - sync_map = get_synchronization_map( - knl, - subgroup_size=self.subgroup_size, - ) - + stats_map = self.get_cached_stats_mapping(knl, stat_option) if self.evaluate_polys: if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return sync_map.eval(param_dict) + raise ValueError( + "Cannot evaluate polynomials without param_dict.") + return stats_map.eval(param_dict) else: - return sync_map + return stats_map def get_grid_sizes( self, @@ -345,24 +334,29 @@ class KernelProfiler(object): if kso.MEM_ACCESS_MAP in stat_options or \ kso.MEM_BANDWIDTH in stat_options: - stats_found[kso.MEM_ACCESS_MAP] = self.get_mem_access_stats( - knl, - param_dict=param_dict, - ) + stats_found[kso.MEM_ACCESS_MAP] = \ + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.MEM_ACCESS_MAP, + param_dict=param_dict, + ) if kso.OP_MAP in stat_options or \ kso.FLOP_RATE in stat_options: - stats_found[kso.OP_MAP] = self.get_op_stats( - knl, - param_dict=param_dict, - ) + stats_found[kso.OP_MAP] = \ + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.OP_MAP, + param_dict=param_dict, + ) if kso.SYNC_MAP in stat_options: stats_found[kso.SYNC_MAP] = \ - self.get_synchronization_stats( - knl, - param_dict=param_dict, - ) + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.SYNC_MAP, + param_dict=param_dict, + ) if kso.GRID_SIZES in stat_options: stats_found[kso.GRID_SIZES] = self.get_grid_sizes( -- GitLab From f220705a0a6b12ec10f2dba4eaba9a250602fca9 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:31:58 -0600 Subject: [PATCH 21/32] caching grid sizes --- kernel_profiler/__init__.py | 49 ++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 7f0d150a3..5d83df7af 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -80,7 +80,7 @@ class KernelProfiler(object): # TODO create 
cache to store kernels (for executing w/different params) self.ctx_cache = {} - self.stats_mapping_cache = {} + self.stat_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive @@ -238,7 +238,7 @@ class KernelProfiler(object): else: write_ptx(self.get_cl_context(knl), knl) - def get_cached_stats_mapping( + def get_cached_stats_map( self, knl, stat_option, # KernelStatOptions @@ -248,7 +248,7 @@ class KernelProfiler(object): # TODO avoid multiple calls to prepare_for_caching()? try: - return self.stats_mapping_cache[cache_key] + return self.stat_cache[cache_key] except KeyError: if stat_option == KernelStatOptions.MEM_ACCESS_MAP: from loopy.statistics import get_mem_access_map @@ -272,16 +272,16 @@ class KernelProfiler(object): knl, subgroup_size=self.subgroup_size, ) - self.stats_mapping_cache[cache_key] = stats_map + self.stat_cache[cache_key] = stats_map return stats_map - def get_stats_mapping_and_evaluate_if_required( + def get_stats_map_and_evaluate_if_required( self, knl, stat_option, # KernelStatOptions param_dict=None, ): - stats_map = self.get_cached_stats_mapping(knl, stat_option) + stats_map = self.get_cached_stats_map(knl, stat_option) if self.evaluate_polys: if param_dict is None: raise ValueError( @@ -296,23 +296,32 @@ class KernelProfiler(object): param_dict=None, ): - global_size, local_size = knl.get_grid_size_upper_bounds() + cache_key = (prepare_for_caching(knl), KernelStatOptions.GRID_SIZES) + # TODO avoid multiple calls to prepare_for_caching()? + + try: + grid_sizes = self.stat_cache[cache_key] + except KeyError: + + global_size, local_size = knl.get_grid_size_upper_bounds() - from islpy import PwQPolynomial - gsize_pwqs = [] - lsize_pwqs = [] - for gsize in global_size: - gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) - for lsize in local_size: - lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + from islpy import PwQPolynomial + gsize_pwqs = [] + lsize_pwqs = [] + for gsize in global_size: + gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) + for lsize in local_size: + lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + grid_sizes = [gsize_pwqs, lsize_pwqs] + self.stat_cache[cache_key] = grid_sizes if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") - return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ - [l.eval_with_dict(param_dict) for l in lsize_pwqs] + return [g.eval_with_dict(param_dict) for g in grid_sizes[0]], \ + [l.eval_with_dict(param_dict) for l in grid_sizes[1]] else: - return gsize_pwqs, lsize_pwqs + return grid_sizes def profile( self, @@ -335,7 +344,7 @@ class KernelProfiler(object): if kso.MEM_ACCESS_MAP in stat_options or \ kso.MEM_BANDWIDTH in stat_options: stats_found[kso.MEM_ACCESS_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.MEM_ACCESS_MAP, param_dict=param_dict, @@ -344,7 +353,7 @@ class KernelProfiler(object): if kso.OP_MAP in stat_options or \ kso.FLOP_RATE in stat_options: stats_found[kso.OP_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.OP_MAP, param_dict=param_dict, @@ -352,7 +361,7 @@ class KernelProfiler(object): if kso.SYNC_MAP in stat_options: stats_found[kso.SYNC_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.SYNC_MAP, param_dict=param_dict, -- GitLab From 
7f054b2972b83c38b2b6d37f8b271296b1397b1e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:36:31 -0600 Subject: [PATCH 22/32] in footprint counting, filtering mem map by mtype=global before iterating rather than checking mtype==global --- kernel_profiler/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 5d83df7af..449088eb4 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -410,10 +410,11 @@ class KernelProfiler(object): # first get footprint of data moved from loopy import gather_access_footprint_bytes footsize_bytes = 0 - for access, count in stats_found[kso.MEM_ACCESS_MAP].items(): - if access.mtype == "global": - direction = "write" if access.direction == "store" else "read" - footsize_bytes += gather_access_footprint_bytes(knl)[(access.variable, direction)].eval_with_dict(param_dict) + for access, count in stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"]).items(): + direction = "write" if access.direction == "store" else "read" + footsize_bytes += gather_access_footprint_bytes(knl)[ + (access.variable, direction)].eval_with_dict(param_dict) # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( -- GitLab From 861fb7e521813dac9f7f40504f2fe42cd861dd72 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 13 Mar 2019 15:20:08 -0500 Subject: [PATCH 23/32] printing generated code in example --- examples/example.py | 3 +++ kernel_profiler/__init__.py | 1 + 2 files changed, 4 insertions(+) diff --git a/examples/example.py b/examples/example.py index 09c25a7e8..f0b2e970d 100644 --- a/examples/example.py +++ b/examples/example.py @@ -49,6 +49,9 @@ stats = kp.profile( ], param_dict=param_dict, ) + +print(stats[kso.GENERATED_CODE]) + print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 449088eb4..c6834a985 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -58,6 +58,7 @@ class KernelStatOptions: MEM_BANDWIDTH = "mem_bandwidth" SAVE_PTX = "save_ptx" GENERATED_CODE = "generated_code" + # TODO mem access to footprint ratio class KernelProfiler(object): -- GitLab From f16dd472b62eb79e9d169924a346002d9e61f873 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 13 Mar 2019 15:37:00 -0500 Subject: [PATCH 24/32] renamed kernel_profiler --- kernel_profiler/{__init__.py => kernel_profiler.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel_profiler/{__init__.py => kernel_profiler.py} (100%) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/kernel_profiler.py similarity index 100% rename from kernel_profiler/__init__.py rename to kernel_profiler/kernel_profiler.py -- GitLab From 899999ebec8f99de49d8b9365b1017a9bde9fb5c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:28:10 -0500 Subject: [PATCH 25/32] moved kernel profiler --- .../kernel_profiler}/kernel_profiler.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) rename {kernel_profiler => loopy/kernel_profiler}/kernel_profiler.py (93%) diff --git a/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py similarity index 93% rename from kernel_profiler/kernel_profiler.py rename to loopy/kernel_profiler/kernel_profiler.py index c6834a985..13de9c0ca 100644 
--- a/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -260,13 +260,26 @@ class KernelProfiler(object): ) elif stat_option == KernelStatOptions.OP_MAP: from loopy.statistics import get_op_map - stats_map = get_op_map( - knl, - count_redundant_work=self.count_redundant_work, - count_within_subscripts=self.count_within_subscripts, - subgroup_size=self.subgroup_size, - count_madds=self.count_madds, - ) + if self.count_madds: + # TODO once madd counting branch is merged, remove this conditional + try: + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, + ) + except TypeError: + raise NotImplementedError( + "count_madds requires the unmerged madd counting branch.") + else: + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + ) elif stat_option == KernelStatOptions.SYNC_MAP: from loopy.statistics import get_synchronization_map stats_map = get_synchronization_map( @@ -431,6 +444,9 @@ class KernelProfiler(object): if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) + # TODO decide on better way to handle multiple count granularities here + # (uniform access only counted once per warp) + stats_found[kso.MEM_BANDWIDTH] = ( data_moved_bytes/stats_found[kso.WALL_TIME], footsize_bytes/stats_found[kso.WALL_TIME] -- GitLab From d32f00e0325bff7caf5f808a4ea896b0f223b911 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:28:22 -0500 Subject: [PATCH 26/32] added kernel_profiler example --- examples/python/kernel_profiler.py | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/python/kernel_profiler.py diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py new file mode 100644 index 000000000..674c75c7e --- /dev/null +++ b/examples/python/kernel_profiler.py @@ -0,0 +1,87 @@ +import loopy as lp +import numpy as np +from loopy.kernel_profiler.kernel_profiler import KernelProfiler +from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa + + +knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Wed, 1 May 2019 12:45:21 -0500 Subject: [PATCH 27/32] when no filename passed to write_ptx, write to stdout --- loopy/kernel_profiler/kernel_profiler.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 13de9c0ca..8beaa539a 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -42,10 +42,16 @@ def write_ptx(ctx, knl, filename=None): ctx, lp.generate_code_v2(knl).device_code() ).build(options=knl.options.cl_build_options) ptx_src = cl_program.binaries[0] - if not filename: - filename = "ptx_"+knl.name+".ptx" - ptx_src_file = open(filename, 'w') - ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + if filename: + ptx_src_file = open(filename, 'w') + ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + else: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(knl, "write_ptx_no_filename", + "No filename passed to write_ptx for kernel %s, writing to stdout" + % (knl.name)) + import sys + sys.stdout.write(ptx_src.decode('utf-8', 'ignore')+"\n") class KernelStatOptions: -- GitLab From 
e7b4300996dbf0bc6486d38c32f157dc1f230a8a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:48:06 -0500 Subject: [PATCH 28/32] fixing flake8 issues --- loopy/kernel_profiler/kernel_profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 8beaa539a..71b6eb817 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -267,7 +267,7 @@ class KernelProfiler(object): elif stat_option == KernelStatOptions.OP_MAP: from loopy.statistics import get_op_map if self.count_madds: - # TODO once madd counting branch is merged, remove this conditional + # TODO once madd counting branch is merged, remove conditional try: stats_map = get_op_map( knl, @@ -278,7 +278,7 @@ class KernelProfiler(object): ) except TypeError: raise NotImplementedError( - "count_madds requires the unmerged madd counting branch.") + "count_madds requires unmerged madd counting branch.") else: stats_map = get_op_map( knl, -- GitLab From ea61f1b0c784842b59b5de1a0cba8fd6870cb155 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 31 May 2019 03:18:19 -0500 Subject: [PATCH 29/32] changed update_options() to copy(); new copy of everything except cache --- examples/python/kernel_profiler.py | 10 +-- loopy/kernel_profiler/kernel_profiler.py | 99 ++++++++++++++---------- 2 files changed, 64 insertions(+), 45 deletions(-) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py index 674c75c7e..71bcad903 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_profiler.py @@ -23,9 +23,9 @@ knl = lp.split_iname(knl, "k", lsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") -n = 2**10 -m = 2**11 -ell = 2**9 +n = 2**8 +m = 2**9 +ell = 2**7 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( @@ -70,9 +70,9 @@ print(stats[kso.FLOP_RATE]*inv_giga) print("\nMem throughput rate (GB/s) (total data accessed, data footprint only):") print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") -kp.update_options(evaluate_polys=True) +kp2 = kp.copy(evaluate_polys=True) -stats = kp.profile( +stats = kp2.profile( knl, stats, param_dict=param_dict) print("Now change eval_polys to True =========================================") diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 71b6eb817..0e4069560 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -105,46 +105,65 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename self.ptx_filename_suffix = ptx_filename_suffix - def update_options( - self, - platform_name=None, - device_name=None, - interactive=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - evaluate_polys=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, - include_kernel_params_in_ptx_filename=None, - ptx_filename_suffix=None, - ): - if platform_name is not None: - self.platform_name = platform_name - if device_name is not None: - self.device_name = device_name - if interactive is not None: - self.interactive = interactive - if n_warmup_wtime_trials is not None: - self.n_warmup_wtime_trials = n_warmup_wtime_trials - if n_wtime_trials is not None: - self.n_wtime_trials = n_wtime_trials - if 
evaluate_polys is not None: - self.evaluate_polys = evaluate_polys - if count_redundant_work is not None: - self.count_redundant_work = count_redundant_work - if subgroup_size is not None: - self.subgroup_size = subgroup_size - if count_madds is not None: - self.count_madds = count_madds - if count_within_subscripts is not None: - self.count_within_subscripts = count_within_subscripts - if include_kernel_params_in_ptx_filename is not None: - self.include_kernel_params_in_ptx_filename = \ - include_kernel_params_in_ptx_filename - if ptx_filename_suffix is not None: - self.ptx_filename_suffix = ptx_filename_suffix + def copy( + self, + platform_name=None, + device_name=None, + interactive=None, + n_warmup_time_trials=None, + n_time_trials=None, + evaluate_polys=None, + subgroup_size=None, + count_redundant_work=None, + count_madds=None, + count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, + ptx_filename_suffix=None, + ): + + platform_name_new = self.platform_name \ + if platform_name is None else platform_name + device_name_new = self.device_name if device_name is None else device_name + interactive_new = self.interactive if interactive is None else interactive + n_warmup_time_trials_new = self.n_warmup_time_trials \ + if n_warmup_time_trials is None else n_warmup_time_trials + n_time_trials_new = self.n_time_trials \ + if n_time_trials is None else n_time_trials + evaluate_polys_new = self.evaluate_polys \ + if evaluate_polys is None else evaluate_polys + count_redundant_work_new = self.count_redundant_work \ + if count_redundant_work is None else count_redundant_work + subgroup_size_new = self.subgroup_size \ + if subgroup_size is None else subgroup_size + count_madds_new = self.count_madds if count_madds is None else count_madds + count_within_subscripts_new = self.count_within_subscripts \ + if count_within_subscripts is None else count_within_subscripts + include_kernel_params_in_ptx_filename_new = \ + self.include_kernel_params_in_ptx_filename \ + if include_kernel_params_in_ptx_filename is None \ + else include_kernel_params_in_ptx_filename + ptx_filename_suffix_new = self.ptx_filename_suffix \ + if ptx_filename_suffix is None else ptx_filename_suffix + + profiler_new = KernelProfiler( + platform_name=platform_name_new, + device_name=device_name_new, + interactive=interactive_new, + n_warmup_time_trials=n_warmup_time_trials_new, + n_time_trials=n_time_trials_new, + evaluate_polys=evaluate_polys_new, + subgroup_size=subgroup_size_new, + count_redundant_work=count_redundant_work_new, + count_madds=count_madds_new, + count_within_subscripts=count_within_subscripts_new, + include_kernel_params_in_ptx_filename= # noqa + include_kernel_params_in_ptx_filename_new, + ptx_filename_suffix=ptx_filename_suffix_new, + ) + profiler_new.ctx_cache = self.ctx_cache + profiler_new.stat_cache = self.stat_cache + + return profiler_new def get_cl_context(self, knl): -- GitLab From 8f270c16648af46f30fc56e5c7f0ab8859adedcf Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Jun 2019 23:42:31 -0500 Subject: [PATCH 30/32] renaming profiler->stat_collector (still need to change file/dir names) --- examples/python/kernel_profiler.py | 8 ++++---- loopy/kernel_profiler/kernel_profiler.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py index 71bcad903..f5aa90bb0 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_profiler.py @@ -1,6 +1,6 @@ import loopy 
as lp import numpy as np -from loopy.kernel_profiler.kernel_profiler import KernelProfiler +from loopy.kernel_profiler.kernel_profiler import KernelStatCollector from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa @@ -28,14 +28,14 @@ m = 2**9 ell = 2**7 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler( +kp = KernelStatCollector( #"NVIDIA", "GEFORCE", #"NVIDIA", "K40C", evaluate_polys=False, count_madds=False, # TODO enables this after madd counting branch is merged include_kernel_params_in_ptx_filename=True, ) -stats = kp.profile( +stats = kp.collect_stats( knl, [ kso.WALL_TIME, @@ -72,7 +72,7 @@ print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga kp2 = kp.copy(evaluate_polys=True) -stats = kp2.profile( +stats = kp2.collect_stats( knl, stats, param_dict=param_dict) print("Now change eval_polys to True =========================================") diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 0e4069560..c22726b5c 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -67,7 +67,7 @@ class KernelStatOptions: # TODO mem access to footprint ratio -class KernelProfiler(object): +class KernelStatCollector(object): def __init__( self, @@ -145,7 +145,7 @@ class KernelProfiler(object): ptx_filename_suffix_new = self.ptx_filename_suffix \ if ptx_filename_suffix is None else ptx_filename_suffix - profiler_new = KernelProfiler( + stat_collector_new = KernelStatCollector( platform_name=platform_name_new, device_name=device_name_new, interactive=interactive_new, @@ -160,10 +160,10 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename_new, ptx_filename_suffix=ptx_filename_suffix_new, ) - profiler_new.ctx_cache = self.ctx_cache - profiler_new.stat_cache = self.stat_cache + stat_collector_new.ctx_cache = self.ctx_cache + stat_collector_new.stat_cache = self.stat_cache - return profiler_new + return stat_collector_new def get_cl_context(self, knl): @@ -178,7 +178,7 @@ class KernelProfiler(object): self.device_name is not None and self.device_name not in knl_device_name): raise ValueError("kernel target platform %s and/or device %s do " - "not match profiler platform %s and/or device %s." + "not match KernelStatCollector platform %s and/or device %s." 
% (knl_platform_name, knl_device_name, self.platform_name, self.device_name)) @@ -194,7 +194,7 @@ class KernelProfiler(object): elif self.platform_name is None or self.device_name is None: # kernel does not have a pre-specified device, - # and profiler does not know platform+device + # and KernelStatCollector does not know platform+device ctx = cl.create_some_context() # interactive mode self.platform_name = ctx.devices[0].platform.name self.device_name = ctx.devices[0].name @@ -202,7 +202,7 @@ class KernelProfiler(object): return ctx else: - # profiler knows both platform and device already + # KernelStatCollector knows both platform and device already cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] @@ -362,7 +362,7 @@ class KernelProfiler(object): else: return grid_sizes - def profile( + def collect_stats( self, knl, stat_options=[], -- GitLab From f42a619426ab4eeeb649bd9e20ed1c846cb5a249 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Jun 2019 23:44:33 -0500 Subject: [PATCH 31/32] renaming directories/files for kernel profiler -> stat collector --- .../{kernel_profiler.py => kernel_stat_collector.py} | 7 ++++--- .../kernel_stat_collector.py} | 0 2 files changed, 4 insertions(+), 3 deletions(-) rename examples/python/{kernel_profiler.py => kernel_stat_collector.py} (89%) rename loopy/{kernel_profiler/kernel_profiler.py => kernel_stat_collector/kernel_stat_collector.py} (100%) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_stat_collector.py similarity index 89% rename from examples/python/kernel_profiler.py rename to examples/python/kernel_stat_collector.py index f5aa90bb0..5924b6b30 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_stat_collector.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np -from loopy.kernel_profiler.kernel_profiler import KernelStatCollector -from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa +from loopy.kernel_stat_collector.kernel_stat_collector import KernelStatCollector +from loopy.kernel_stat_collector.kernel_stat_collector import KernelStatOptions as kso # noqa knl = lp.make_kernel( @@ -68,7 +68,8 @@ print(stats[kso.GRID_SIZES]) print("\nFlop rate (GFLOP/s):") print(stats[kso.FLOP_RATE]*inv_giga) print("\nMem throughput rate (GB/s) (total data accessed, data footprint only):") -print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") +print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, + stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") kp2 = kp.copy(evaluate_polys=True) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_stat_collector/kernel_stat_collector.py similarity index 100% rename from loopy/kernel_profiler/kernel_profiler.py rename to loopy/kernel_stat_collector/kernel_stat_collector.py -- GitLab From c5da129c3617acf0633c82daeca71e7d8c3b3bce Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 7 Jun 2019 00:07:54 -0500 Subject: [PATCH 32/32] processing subgroup size before using it to compute flops/throughput --- loopy/kernel_stat_collector/kernel_stat_collector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/kernel_stat_collector/kernel_stat_collector.py b/loopy/kernel_stat_collector/kernel_stat_collector.py index c22726b5c..026f9e74f 100644 --- a/loopy/kernel_stat_collector/kernel_stat_collector.py +++ b/loopy/kernel_stat_collector/kernel_stat_collector.py @@ -414,11 +414,13 @@ class KernelStatCollector(object): if kso.FLOP_RATE in 
stat_options: import numpy as np + from loopy.statistics import _process_subgroup_size + sgs_processed = _process_subgroup_size(knl, self.subgroup_size) # count madds as 2 ops # (count all flops once and then count the madds again) # flops counted w/subgroup granularity - float_ops = self.subgroup_size*( + float_ops = sgs_processed*( stats_found[kso.OP_MAP].filter_by( dtype=[np.float32, np.float64], count_granularity=[lp.CountGranularity.SUBGROUP], @@ -448,6 +450,8 @@ class KernelStatCollector(object): if kso.MEM_BANDWIDTH in stat_options: # first get footprint of data moved from loopy import gather_access_footprint_bytes + from loopy.statistics import _process_subgroup_size + sgs_processed = _process_subgroup_size(knl, self.subgroup_size) footsize_bytes = 0 for access, count in stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"]).items(): @@ -459,7 +463,7 @@ class KernelStatCollector(object): data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], count_granularity=[lp.CountGranularity.SUBGROUP], - ).to_bytes().sum()*self.subgroup_size + ).to_bytes().sum()*sgs_processed # mem access counted w/workitem granularity data_moved_bytes += stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], -- GitLab
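
As a rough sketch of the derived-metric arithmetic introduced in the last two patches (FLOP_RATE and MEM_BANDWIDTH): the names op_map, mem_map, wall_time and subgroup_size below are illustrative assumptions, standing in for already-evaluated loopy count maps (e.g. collected with evaluate_polys=True), a measured wall time in seconds, and the processed subgroup size; the madd re-count and the workitem-granularity/footprint terms from the actual diffs are left out for brevity.

import numpy as np
import loopy as lp

def derived_metrics(op_map, mem_map, wall_time, subgroup_size):
    # Flop rate: float ops counted at subgroup granularity, scaled by the
    # subgroup size and divided by the measured wall time. (The patch also
    # re-counts madds so each contributes two ops; omitted in this sketch.)
    float_ops = subgroup_size * op_map.filter_by(
        dtype=[np.float32, np.float64],
        count_granularity=[lp.CountGranularity.SUBGROUP],
    ).sum()
    flop_rate = float_ops / wall_time

    # Memory throughput: bytes of global-memory traffic counted at subgroup
    # granularity. (The patch additionally adds workitem-granularity accesses
    # and reports a second, footprint-based figure; omitted in this sketch.)
    data_moved_bytes = subgroup_size * mem_map.filter_by(
        mtype=["global"],
        count_granularity=[lp.CountGranularity.SUBGROUP],
    ).to_bytes().sum()
    mem_bandwidth = data_moved_bytes / wall_time

    return flop_rate, mem_bandwidth

With the example kernel's param_dict and a measured wall time, flop_rate and mem_bandwidth roughly correspond to the FLOP_RATE value and the first MEM_BANDWIDTH entry printed by examples/python/kernel_stat_collector.py.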