diff --git a/examples/plot-connectivity.py b/examples/plot-connectivity.py index 48e6be76b7033c4a3f04f55c6958c7efd856f0e0..23dfdcd1d2db6f06466007a173dcf8d843b3c571 100644 --- a/examples/plot-connectivity.py +++ b/examples/plot-connectivity.py @@ -2,7 +2,7 @@ from __future__ import division import numpy as np # noqa import pyopencl as cl -from meshmode.array_context import PyOpenCLArrayContext +from meshmode.array_context import PyOpenCLArrayContext, PyOpenCLArrayContext from meshmode.dof_array import thaw order = 4 diff --git a/examples/simple-dg.py b/examples/simple-dg.py index b945687758f172194d859560bc1723064d56971b..6adb73738989fd73329bef70b797c7d5f87c4fde 100644 --- a/examples/simple-dg.py +++ b/examples/simple-dg.py @@ -33,7 +33,7 @@ from pytools.obj_array import ( obj_array_vectorize) from meshmode.mesh import BTAG_ALL, BTAG_NONE # noqa from meshmode.dof_array import DOFArray, freeze, thaw -from meshmode.array_context import PyOpenCLArrayContext, make_loopy_program +from meshmode.array_context import PyOpenCLProfilingArrayContext, make_loopy_program # Features lost vs. https://github.com/inducer/grudge: @@ -464,9 +464,9 @@ def bump(actx, discr, t=0): def main(): cl_ctx = cl.create_some_context() - queue = cl.CommandQueue(cl_ctx) + queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - actx = PyOpenCLArrayContext(queue) + actx = PyOpenCLProfilingArrayContext(queue) nel_1d = 16 from meshmode.mesh.generation import generate_regular_rect_mesh diff --git a/meshmode/array_context.py b/meshmode/array_context.py index 26da731eaa4a6e44873bd66d5a514a18b1e2e8dc..ee3c686e3dd1c59b1bc612410d55781872b316f9 100644 --- a/meshmode/array_context.py +++ b/meshmode/array_context.py @@ -22,9 +22,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import numpy as np
import loopy as lp
import pyopencl as cl
from loopy.version import MOST_RECENT_LANGUAGE_VERSION
from pytools import memoize_method

# (unchanged parts of meshmode/array_context.py are elided by the patch hunk)

# }}}


# {{{ profiling array context

class ProfileData:
    """Measurements recorded for a single profiled kernel invocation.

    .. attribute:: time

        ``evt.profile.end - evt.profile.start`` of the kernel's event.

    .. attribute:: flops

        Sum of the float32 and float64 operation counts.

    .. attribute:: mem_access

        Sum of the float32 and float64 memory access counts.
    """

    # Class-level defaults; every instance overrides them in __init__.
    time = 0
    flops = 0
    mem_access = 0

    def __init__(self, time=0, flops=0, mem_access=0):
        self.time = time
        self.flops = flops
        self.mem_access = mem_access

    def __repr__(self):
        return "(time={0}, flops={1}, mem_access={2})".format(
                self.time, self.flops, self.mem_access)

    def __str__(self):
        return self.__repr__()


class TimingEvent:
    """Bundle of an event with the program and arguments that produced it.

    :arg event: the event returned by invoking *program*.
    :arg program: the (already transformed) program that was invoked.
    :arg kwargs: the keyword arguments the program was invoked with.
    """

    def __init__(self, event, program, kwargs):
        self.event = event
        self.program = program
        self.kwargs = kwargs


class PyOpenCLProfilingArrayContext(PyOpenCLArrayContext):
    """A :class:`PyOpenCLArrayContext` that collects per-kernel statistics.

    Each :meth:`call_loopy` invocation is recorded as a :class:`TimingEvent`.
    :meth:`finish_profile_events` turns the recorded events into
    :class:`ProfileData` entries (stored per program name in
    :attr:`profiling_data`), and :meth:`print_profiling_data` prints a
    summary table. Statistics are only gathered if *queue* was created with
    ``PROFILING_ENABLE``; otherwise a warning is issued and the context
    only forwards calls.
    """

    def __init__(self, queue, allocator=None):
        super().__init__(queue, allocator)

        self.profiling_enabled = bool(
                queue.properties
                & cl.command_queue_properties.PROFILING_ENABLE)

        if not self.profiling_enabled:
            from warnings import warn
            warn("Profiling was not enabled in the command queue. "
                 "Timing data will not be collected.")

        # TimingEvent instances not yet converted into ProfileData.
        self.events = []
        # program name -> list of ProfileData, one per processed call.
        self.profiling_data = {}
        # program name -> {tuple of argument names -> invoker source code}.
        self.invoker_codes = {}

    @staticmethod
    def _integer_argument_code(invoker_code):
        """Return the dedented code of *invoker_code* that derives integer
        arguments from array shapes and strides.

        Sections whose start or end marker cannot be found are skipped.
        """
        from textwrap import dedent

        snippets = []
        # The "offsets" section of the invoker is not extracted.
        for source in ("shapes", "strides"):
            marker = "# {{{ find integer arguments from " + source
            marker_pos = invoker_code.find(marker)
            if marker_pos < 0:
                continue

            start = marker_pos + len(marker)
            end = invoker_code.find("# }}}", start)
            if end < 0:
                continue

            snippets.append(dedent(invoker_code[start:end]))

        return "".join(snippets)

    def finish_profile_events(self):
        """Wait for all recorded events and convert them to :class:`ProfileData`.

        Does nothing if profiling is disabled. The pending event list is
        emptied before processing starts, so no event is counted twice even
        if processing is interrupted by an exception.
        """
        if not self.profiling_enabled:
            return

        pending, self.events = self.events, []
        if pending:
            cl.wait_for_events([t.event for t in pending])

        from warnings import catch_warnings, simplefilter

        for t in pending:
            kwargs = t.kwargs
            program = t.program
            invoker_code = self.invoker_codes[program.name][tuple(kwargs)]

            types = {name: value.dtype for name, value in kwargs.items()}

            # Parameter values for evaluating the symbolic counts: start
            # with what the caller passed, mark everything else unknown.
            param_dict = dict(kwargs)
            for name in program.arg_dict:
                param_dict.setdefault(name, None)

            # Fill in the integer arguments the same way the generated
            # invoker does. NOTE: the executed code is produced by loopy's
            # invoker generation, not taken from external input.
            exec(self._integer_argument_code(invoker_code), param_dict)

            # Keep warnings raised while gathering statistics quiet without
            # touching the process-wide warning filters.
            with catch_warnings():
                simplefilter("ignore")

                typed_program = lp.add_and_infer_dtypes(program, types)
                op_map = lp.get_op_map(
                        typed_program, subgroup_size='guess')
                mem_map = lp.get_mem_access_map(
                        typed_program, subgroup_size='guess')

                f32op_count = op_map.filter_by(
                        dtype=[np.float32]).eval_and_sum(param_dict)
                f64op_count = op_map.filter_by(
                        dtype=[np.float64]).eval_and_sum(param_dict)

                mem32_count = mem_map.filter_by(
                        dtype=[np.float32]).eval_and_sum(param_dict)
                mem64_count = mem_map.filter_by(
                        dtype=[np.float64]).eval_and_sum(param_dict)

            flops = f32op_count + f64op_count
            time = t.event.profile.end - t.event.profile.start
            mem_access = mem32_count + mem64_count

            self.profiling_data.setdefault(typed_program.name, []).append(
                    ProfileData(time, flops, mem_access))

    def print_profiling_data(self):
        """Print a table with min/mean/max time, operation count and memory
        access count per program, plus the ratio of mean memory accesses to
        mean time.

        Does nothing if profiling is disabled. Pending events are processed
        first. An empty table is printed if nothing was profiled.
        """
        if not self.profiling_enabled:
            return

        self.finish_profile_events()

        from statistics import mean

        name_width = max(
                max((len(name) for name in self.profiling_data), default=0),
                len('Function'))

        format_str = ("{:<" + str(name_width) + "} {:>6}"
                      + " {:>8}" * 10)
        rule = format_str.format(
                '=' * name_width, '=' * 6, *(['=' * 8] * 10))

        print(rule)
        print(format_str.format(
            'Function', 'Calls',
            'T_min', 'T_avg', 'T_max',
            'F_min', 'F_avg', 'F_max',
            'M_min', 'M_avg', 'M_max',
            'BW_avg'))
        print(rule)

        for name, entries in self.profiling_data.items():
            times = [entry.time for entry in entries]
            flops = [entry.flops for entry in entries]
            mem_access = [entry.mem_access for entry in entries]

            mean_time = mean(times)
            mean_mem_access = mean(mem_access)
            # A zero mean time would make the ratio undefined.
            if mean_time:
                bandwidth = round(mean_mem_access / mean_time, 3)
            else:
                bandwidth = float('nan')

            print(format_str.format(
                name, len(entries),
                min(times), int(mean_time), max(times),
                min(flops), int(mean(flops)), max(flops),
                min(mem_access), int(mean_mem_access), max(mem_access),
                bandwidth))

        print(rule)

    def __del__(self):
        # __init__ may have failed before the profiling state was set up.
        if getattr(self, "profiling_enabled", False):
            self.print_profiling_data()

    def call_loopy(self, program, **kwargs):
        """Run *program* with *kwargs* and record the call for profiling.

        :returns: the result dictionary of the program invocation.
        """
        from warnings import catch_warnings, simplefilter

        # Silence warnings only for the duration of this call; the
        # application's own warning filters are restored afterwards.
        with catch_warnings():
            simplefilter("ignore")

            program = self.transform_loopy_program(program)
            assert program.options.return_dict
            assert program.options.no_numpy

            if self.profiling_enabled:
                # The invoker code is needed later to regenerate integer
                # arguments.
                # N.B.: The invoker code might be different for the same
                # program with different kwargs, hence the two-level cache.
                codes = self.invoker_codes.setdefault(program.name, {})
                arg_names = tuple(kwargs)
                if arg_names not in codes:
                    executor = program.target.get_kernel_executor(
                            program, self.queue)
                    info = executor.kernel_info(
                            executor.arg_to_dtype_set(kwargs))
                    codes[arg_names] = info.invoker.get()

            evt, result = program(
                    self.queue, **kwargs, allocator=self.allocator)

        if self.profiling_enabled:
            self.events.append(TimingEvent(evt, program, kwargs))

        return result

# }}}


# vim: foldmethod=marker