diff --git a/examples/dump-performance.py b/examples/dump-performance.py index 42946a791d7ef876885e7efcabb830b1d906e567..d8b00142b52afc49635f5f52f061701274abea4d 100644 --- a/examples/dump-performance.py +++ b/examples/dump-performance.py @@ -1,13 +1,9 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function +from __future__ import division, absolute_import, print_function import pyopencl as cl import pyopencl.characterize.performance as perf from six.moves import range - - def main(): ctx = cl.create_some_context() @@ -15,7 +11,8 @@ def main(): print("command latency: %g s" % latency) print("profiling overhead: %g s -> %.1f %%" % ( prof_overhead, 100*prof_overhead/latency)) - queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + queue = cl.CommandQueue( + ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) print("empty kernel: %g s" % perf.get_empty_kernel_time(queue)) print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9)) @@ -29,13 +26,11 @@ def main(): print("----------------------------------------") print("latency: %g s" % perf.transfer_latency(queue, tx_type)) - for i in range(6, 28, 2): - bs = 1<<i + for i in range(6, 31, 2): + bs = 1 << i print("bandwidth @ %d bytes: %g GB/s" % ( bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9)) - - if __name__ == "__main__": main() diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py index d2cbf791ecde610c5a5e6fc69bad7d0dff37d213..f3f2d8947c6ada90356e228b23961a9fb369c375 100644 --- a/pyopencl/characterize/performance.py +++ b/pyopencl/characterize/performance.py @@ -1,7 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from six.moves import range +from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -25,12 +22,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from six.moves import range import pyopencl as cl import numpy as np - - # {{{ timing helpers class Timer: @@ -50,8 +46,6 @@ class Timer: pass - - class WallTimer(Timer): def start(self): from time import time @@ -67,8 +61,6 @@ class WallTimer(Timer): return self.end-self.start - - def _get_time(queue, f, timer_factory=None, desired_duration=0.1, warmup_rounds=3): @@ -106,8 +98,6 @@ def _get_time(queue, f, timer_factory=None, desired_duration=0.1, # }}} - - # {{{ transfer measurements class HostDeviceTransferBase(object): @@ -116,32 +106,33 @@ class HostDeviceTransferBase(object): self.host_buf = np.empty(block_size, dtype=np.uint8) self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) + class HostToDeviceTransfer(HostDeviceTransferBase): def do(self): return cl.enqueue_copy(self. queue, self.dev_buf, self.host_buf) + class DeviceToHostTransfer(HostDeviceTransferBase): def do(self): return cl.enqueue_copy(self. queue, self.host_buf, self.dev_buf) + class DeviceToDeviceTransfer(object): def __init__(self, queue, block_size): self.queue = queue - self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) - self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) + mf = cl.mem_flags + self.dev_buf_1 = cl.Buffer(queue.context, mf.READ_WRITE, block_size) + self.dev_buf_2 = cl.Buffer(queue.context, mf.READ_WRITE, block_size) def do(self): return cl.enqueue_copy(self. queue, self.dev_buf_2, self.dev_buf_1) -class HostToDeviceTransfer(HostDeviceTransferBase): - def do(self): - return cl.enqueue_copy(self. queue, self.dev_buf, self.host_buf) - def transfer_latency(queue, transfer_type, timer_factory=None): transfer = transfer_type(queue, 1) return _get_time(queue, transfer.do, timer_factory=timer_factory) + def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None): """Measures one-sided bandwidth.""" @@ -151,8 +142,6 @@ def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None): # }}} - - def get_profiling_overhead(ctx, timer_factory=None): no_prof_queue = cl.CommandQueue(ctx) transfer = DeviceToDeviceTransfer(no_prof_queue, 1) @@ -165,6 +154,7 @@ def get_profiling_overhead(ctx, timer_factory=None): return prof_time - no_prof_time, prof_time + def get_empty_kernel_time(queue, timer_factory=None): prg = cl.Program(queue.context, """ __kernel void empty() @@ -178,13 +168,16 @@ def get_empty_kernel_time(queue, timer_factory=None): return _get_time(queue, f, timer_factory=timer_factory) -def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None): + +def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", + timer_factory=None): prg = cl.Program(queue.context, src).build() knl = getattr(prg, name) dev = queue.device global_size = 4 * dev.max_compute_units + def f(): knl(queue, (global_size,), None, *args) @@ -198,22 +191,22 @@ def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_fact keep_trying = not rates - if rates and rate > 1.05*max(rates): # big improvement + if rates and rate > 1.05*max(rates): # big improvement keep_trying = True num_dips = 0 - if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip + if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip keep_trying = True num_dips += 1 if keep_trying: global_size *= 2 - last_rate = rate rates.append(rate) else: rates.append(rate) return max(rates) + def get_add_rate(queue, type="float", timer_factory=None): return 50*10*_get_full_machine_kernel_rate(queue, """ typedef %(op_t)s op_t; @@ -244,6 +237,4 @@ def get_add_rate(queue, type="float", timer_factory=None): """ % dict(op_t=type), ()) - - # vim: foldmethod=marker:filetype=pyopencl