diff --git a/examples/dump-performance.py b/examples/dump-performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..97fa0544aaf357f0769a5ff6d9aa623c26dda4ef
--- /dev/null
+++ b/examples/dump-performance.py
@@ -0,0 +1,38 @@
+from __future__ import division
+import pyopencl as cl
+import pyopencl.characterize.performance as perf
+
+
+
+def main():
+    ctx = cl.create_some_context()
+
+    prof_overhead, latency = perf.get_profiling_overhead(ctx)
+    print "command latency: %g s" % latency
+    print "profiling overhead: %g s -> %.1f %%" % (
+            prof_overhead, 100*prof_overhead/latency)
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    print "empty kernel: %g s" % perf.get_empty_kernel_time(queue)
+    print "float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9)
+
+    for tx_type in [
+            perf.HostToDeviceTransfer,
+            perf.DeviceToHostTransfer,
+            perf.DeviceToDeviceTransfer]:
+        print "----------------------------------------"
+        print tx_type.__name__
+        print "----------------------------------------"
+
+        print "latency: %g s" % perf.transfer_latency(queue, tx_type)
+        for i in range(6, 28, 2):
+            bs = 1<<i
+            print "bandwidth @ %d bytes: %g GB/s" % (
+                    bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
new file mode 100644
--- /dev/null
+++ b/pyopencl/characterize/__init__.py
+from __future__ import division
+import pyopencl as cl
+
+
+
+class CLCharacterizationWarning(UserWarning):
+    pass
+
+
+
+def nv_compute_capability(dev):
+    """If *dev* is an Nvidia device, return a tuple *(major, minor)*
+    indicating its compute capability, otherwise *None*.
+    """
+    try:
+        return (dev.compute_capability_major_nv,
+                dev.compute_capability_minor_nv)
+    except:
+        return None
+
+
+
+def simultaneous_work_items_on_local_access(dev):
+    """Return the number of work items that access local memory
+    simultaneously and thereby may conflict with each other.
+    """
+    nv_compute_cap = nv_compute_capability(dev)
+
+    if nv_compute_cap is not None:
+        if nv_compute_cap < (2,0):
+            return 16
+        else:
+            if nv_compute_cap >= (3,0):
+                from warnings import warn
+                warn("wildly guessing conflicting local access size on '%s'"
+                        % dev,
+                        CLCharacterizationWarning)
+
+            return 32
+
+    if dev.type == cl.device_type.GPU:
+        from warnings import warn
+        warn("wildly guessing conflicting local access size on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+    elif dev.type == cl.device_type.CPU:
+        return 1
+    else:
+        from warnings import warn
+        warn("wildly guessing conflicting local access size on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+
+
+
+def local_memory_access_granularity(dev):
+    """Return the number of bytes per bank in local memory."""
+    return 4
+
+
+
+def local_memory_bank_count(dev):
+    """Return the number of banks present in local memory."""
+    nv_compute_cap = nv_compute_capability(dev)
+
+    if nv_compute_cap is not None:
+        if nv_compute_cap < (2,0):
+            return 16
+        else:
+            if nv_compute_cap >= (3,0):
+                from warnings import warn
+                warn("wildly guessing local memory bank count on '%s'"
+                        % dev,
+                        CLCharacterizationWarning)
+
+            return 32
+
+    if dev.type == cl.device_type.GPU:
+        from warnings import warn
+        warn("wildly guessing local memory bank count on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+    elif dev.type == cl.device_type.CPU:
+        if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
+            raise RuntimeError("asking for a bank count is "
+                    "meaningless for cache-based lmem")
+
+    from warnings import warn
+    warn("wildly guessing local memory bank count on '%s'"
+            % dev,
+            CLCharacterizationWarning)
+    return 16
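+
+
+# Worked example (illustrative, not part of the API): with 4-byte banks and
+# a bank count of 32, a float32 at byte address a sits in bank (a // 4) % 32.
+# Work items reading down one column of a 32x32 float32 array in local
+# memory touch addresses 32*4 = 128 bytes apart, which is 32 banks apart,
+# i.e. always the same bank: a 32-way conflict.  The function below reports
+# exactly this kind of situation.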
+ """ + # FIXME: Treat 64-bit access on NV CC 2.x + correctly + + if array_stored_shape is None: + array_stored_shape = array_shape + + rank = len(array_shape) + + array_shape = array_shape[::-1] + array_stored_shape = array_stored_shape[::-1] + + gran = local_memory_access_granularity(dev) + if itemsize != gran: + from warnings import warn + warn("local conflict info might be inaccurate " + "for itemsize != %d" % gran, + CLCharacterizationWarning) + + sim_wi = simultaneous_work_items_on_local_access(dev) + bank_count = local_memory_bank_count(dev) + + conflicts = [] + + for work_item_axis in range(rank): + + bank_accesses = {} + for work_item_id in xrange(sim_wi): + addr = 0 + addr_mult = itemsize + + idx = [] + left_over_idx = work_item_id + for axis, (ax_size, ax_stor_size) in enumerate( + zip(array_shape, array_stored_shape)): + + if axis >= work_item_axis: + left_over_idx, ax_idx = divmod(left_over_idx, ax_size) + addr += addr_mult*ax_idx + idx.append(ax_idx) + else: + idx.append(0) + + addr_mult *= ax_stor_size + + if left_over_idx: + # out-of-bounds, assume not taking place + continue + + bank = (addr // gran) % bank_count + bank_accesses.setdefault(bank, []).append( + "w.item %s -> %s" % (work_item_id, idx[::-1])) + + conflict_multiplicity = max( + len(acc) for acc in bank_accesses.itervalues()) + + if conflict_multiplicity > 1: + for bank, acc in bank_accesses.iteritems(): + if len(acc) == conflict_multiplicity: + conflicts.append( + (conflict_multiplicity, + "%dx conflict on axis %d (from right, 0-based): " + "%s access bank %d" % ( + conflict_multiplicity, + work_item_axis, + ", ".join(acc), bank))) + + if conflicts: + return max(conflicts) + else: + return 1, None + + + + +def get_fast_inaccurate_build_options(dev): + """Return a list of flags valid on device *dev* that enable fast, but + potentially inaccurate floating point math. + """ + return ["-cl-mad-enable", "-cl-fast-relaxed-math", + "-cl-no-signed-zeros", "-cl-strict-aliasing"] + + + + +def get_simd_group_size(dev, type_size): + """Return an estimate of how many work items will be executed across SIMD + lanes. This returns the size of what Nvidia calls a warp and what AMD calls + a wavefront. + + Only refers to implicit SIMD. + + :arg type_size: number of bytes in vector entry type. + """ + try: + return dev.warp_size_nv + except: + pass + + lc_vendor = dev.platform.vendor.lower() + if "nvidia" in lc_vendor: + return 32 + + if ("advanced micro" in lc_vendor or "ati" in lc_vendor): + if dev.type == cl.device_type.GPU: + return 64 + elif dev.type == cl.device_type.CPU: + return 1 + else: + raise RuntimeError("unexpected AMD device type") + + if dev.type == cl.device_type.CPU: + # implicit assumption: Impl. 
+
+
+def get_simd_group_size(dev, type_size):
+    """Return an estimate of how many work items will be executed across SIMD
+    lanes. This returns the size of what Nvidia calls a warp and what AMD calls
+    a wavefront.
+
+    Only refers to implicit SIMD.
+
+    :arg type_size: number of bytes in vector entry type.
+    """
+    try:
+        return dev.warp_size_nv
+    except:
+        pass
+
+    lc_vendor = dev.platform.vendor.lower()
+    if "nvidia" in lc_vendor:
+        return 32
+
+    if ("advanced micro" in lc_vendor or "ati" in lc_vendor):
+        if dev.type == cl.device_type.GPU:
+            return 64
+        elif dev.type == cl.device_type.CPU:
+            return 1
+        else:
+            raise RuntimeError("unexpected AMD device type")
+
+    if dev.type == cl.device_type.CPU:
+        # implicit assumption: the implementation will vectorize
+        if type_size == 1:
+            return dev.preferred_vector_width_char
+        elif type_size == 2:
+            return dev.preferred_vector_width_short
+        elif type_size == 4:
+            return dev.preferred_vector_width_float
+        elif type_size == 8:
+            return dev.preferred_vector_width_double
+        else:
+            from warnings import warn
+            warn("unexpected dtype size in get_simd_group_size on CPU device, "
+                    "guessing group width 1")
+            return 1
+
+    return None
diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64570ce1347247ce0ec00e210286a8681fc48d0
--- /dev/null
+++ b/pyopencl/characterize/performance.py
@@ -0,0 +1,223 @@
+from __future__ import division
+import pyopencl as cl
+import numpy as np
+
+
+
+# {{{ timing helpers
+
+class Timer(object):
+    def __init__(self, queue):
+        self.queue = queue
+
+    def start(self):
+        pass
+
+    def stop(self):
+        pass
+
+    def add_event(self, evt):
+        pass
+
+    def get_elapsed(self):
+        pass
+
+
+
+class WallTimer(Timer):
+    def start(self):
+        from time import time
+        self.queue.finish()
+        # store under a name that does not shadow the start() method
+        self.start_time = time()
+
+    def stop(self):
+        from time import time
+        self.queue.finish()
+        self.end_time = time()
+
+    def get_elapsed(self):
+        return self.end_time - self.start_time
+
+
+
+def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
+        warmup_rounds=3):
+
+    if timer_factory is None:
+        timer_factory = WallTimer
+
+    count = 1
+
+    while True:
+        timer = timer_factory(queue)
+
+        for i in xrange(warmup_rounds):
+            f()
+        warmup_rounds = 0
+
+        timer.start()
+        for i in xrange(count):
+            timer.add_event(f())
+        timer.stop()
+
+        elapsed = timer.get_elapsed()
+        if elapsed < desired_duration:
+            if elapsed == 0:
+                count *= 5
+            else:
+                # estimate how many repetitions reach desired_duration,
+                # growing by no less than 2x and no more than 10x
+                new_count = int(count*desired_duration/elapsed)
+
+                new_count = max(2*count, new_count)
+                new_count = min(10*count, new_count)
+                count = new_count
+        else:
+            return elapsed/count
+
+# }}}
+
+
+
+# {{{ transfer measurements
+
+class HostDeviceTransferBase(object):
+    def __init__(self, queue, block_size):
+        self.queue = queue
+        self.host_buf = np.empty(block_size, dtype=np.uint8)
+        self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+
+class HostToDeviceTransfer(HostDeviceTransferBase):
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)
+
+class DeviceToHostTransfer(HostDeviceTransferBase):
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.host_buf, self.dev_buf)
+
+class DeviceToDeviceTransfer(object):
+    def __init__(self, queue, block_size):
+        self.queue = queue
+        self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+        self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.dev_buf_2, self.dev_buf_1)
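+
+# Note that do() returns the pyopencl.Event of the enqueued copy, so a
+# profiling-based timer could be dropped in for WallTimer.  A sketch
+# (illustrative; requires a queue created with PROFILING_ENABLE):
+#
+#     class ProfilingTimer(Timer):
+#         def __init__(self, queue):
+#             Timer.__init__(self, queue)
+#             self.events = []
+#
+#         def add_event(self, evt):
+#             if evt is not None:
+#                 self.events.append(evt)
+#
+#         def get_elapsed(self):
+#             self.queue.finish()
+#             # profile.start/.end are given in nanoseconds
+#             return sum(1e-9*(evt.profile.end - evt.profile.start)
+#                     for evt in self.events)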
+
+
+def transfer_latency(queue, transfer_type, timer_factory=None):
+    transfer = transfer_type(queue, 1)
+    return _get_time(queue, transfer.do, timer_factory=timer_factory)
+
+def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
+    """Measures one-sided bandwidth."""
+
+    transfer = transfer_type(queue, block_size)
+    return block_size/_get_time(queue, transfer.do, timer_factory=timer_factory)
+
+# }}}
+
+
+
+def get_profiling_overhead(ctx, timer_factory=None):
+    no_prof_queue = cl.CommandQueue(ctx)
+    transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
+    no_prof_time = _get_time(no_prof_queue, transfer.do, timer_factory=timer_factory)
+
+    prof_queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+    transfer = DeviceToDeviceTransfer(prof_queue, 1)
+    prof_time = _get_time(prof_queue, transfer.do, timer_factory=timer_factory)
+
+    return prof_time - no_prof_time, prof_time
+
+def get_empty_kernel_time(queue, timer_factory=None):
+    prg = cl.Program(queue.context, """
+        __kernel void empty()
+        { }
+        """).build()
+
+    knl = prg.empty
+
+    def f():
+        return knl(queue, (1,), None)
+
+    return _get_time(queue, f, timer_factory=timer_factory)
+
+def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None):
+    prg = cl.Program(queue.context, src).build()
+
+    knl = getattr(prg, name)
+
+    dev = queue.device
+    global_size = 4 * dev.max_compute_units
+
+    def f():
+        return knl(queue, (global_size,), None, *args)
+
+    rates = []
+    num_dips = 0
+
+    # keep doubling the global size until the rate no longer improves,
+    # tolerating up to three transient dips along the way
+    while True:
+        elapsed = _get_time(queue, f, timer_factory=timer_factory)
+        rate = global_size/elapsed
+
+        keep_trying = not rates
+
+        if rates and rate > 1.05*max(rates): # big improvement
+            keep_trying = True
+            num_dips = 0
+
+        if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip
+            keep_trying = True
+            num_dips += 1
+
+        rates.append(rate)
+
+        if keep_trying:
+            global_size *= 2
+        else:
+            return max(rates)
+
+def get_add_rate(queue, type="float", timer_factory=None):
+    # the loop body below performs 50 adds and runs 10 times, i.e. 500 adds
+    # per work item, hence the factor of 50*10
+    return 50*10*_get_full_machine_kernel_rate(queue, """
+        typedef %(op_t)s op_t;
+        __kernel void benchmark()
+        {
+            local op_t tgt[1024];
+            op_t val = get_global_id(0);
+
+            for (int i = 0; i < 10; ++i)
+            {
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+            }
+            tgt[get_local_id(0)] = val;
+        }
+        """ % dict(op_t=type), ())
+
+
+
+# vim: foldmethod=marker:filetype=pyopencl
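For a quick smoke test of the new module without running the full example
script, an interactive session along these lines should work (a sketch;
assumes an OpenCL implementation is installed and create_some_context can
pick a device):

    import pyopencl as cl
    import pyopencl.characterize.performance as perf

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # one-sided bandwidth of a 1 MiB host-to-device copy, in bytes/s
    print perf.transfer_bandwidth(queue, perf.HostToDeviceTransfer, 1 << 20)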