From fa8335040f16d4351ef4a04b70ddba2bddf0297d Mon Sep 17 00:00:00 2001 From: dhj <dhj.consulting@gmail.com> Date: Tue, 23 Oct 2012 19:35:24 -0500 Subject: [PATCH] Updated benchmark to use equivalent workloads for CPU and GPU. * NumPy does element-wise operations by default. Updated the CPU operation to use pure NumPy. * Eliminated the loop, which is not necessary to demonstrate parallelism on array operations. * Made the number of workers explicit rather than chosen by the GPU, through the local_size variable passed to kernel execution. * Increased to ~8 million data points to more clearly demonstrate the difference between CPU- and GPU-based computations. --- examples/{benchmark-all.py => benchmark.py} | 65 ++++++++++++++------- 1 file changed, 45 insertions(+), 20 deletions(-) rename examples/{benchmark-all.py => benchmark.py} (50%) diff --git a/examples/benchmark-all.py b/examples/benchmark.py similarity index 50% rename from examples/benchmark-all.py rename to examples/benchmark.py index 09c423c6..d04f1833 100644 --- a/examples/benchmark-all.py +++ b/examples/benchmark.py @@ -1,23 +1,28 @@ # example provided by Roger Pau Monn'e +from __future__ import print_function import pyopencl as cl import numpy import numpy.linalg as la import datetime from time import time -a = numpy.random.rand(1000).astype(numpy.float32) -b = numpy.random.rand(1000).astype(numpy.float32) +data_points = 2**23 # ~8 million data points, ~32 MB data +workers = 2**8 # 256 workers, play with this to see performance differences + # eg: 2**0 => 1 worker will be non-parallel execution on gpu + # data points must be a multiple of workers + +a = numpy.random.rand(data_points).astype(numpy.float32) +b = numpy.random.rand(data_points).astype(numpy.float32) c_result = numpy.empty_like(a) # Speed in normal CPU usage time1 = time() -for i in range(1000): - for j in range(1000): - c_result[i] = a[i] + b[i] - c_result[i] = c_result[i] * (a[i] + b[i]) - c_result[i] = c_result[i] * (a[i] / 2.0) +c_temp = (a+b) # adds 
each element in a to its corresponding element in b +c_temp = c_temp * c_temp # element-wise multiplication +c_result = c_temp * (a/2.0) # element-wise half a and multiply time2 = time() + print("Execution time of test without OpenCL: ", time2 - time1, "s") @@ -34,6 +39,8 @@ for platform in cl.get_platforms(): print("Device memory: ", device.global_mem_size//1024//1024, 'MB') print("Device max clock speed:", device.max_clock_frequency, 'MHz') print("Device compute units:", device.max_compute_units) + print("Device max work group size:", device.max_work_group_size) + print("Device max work item sizes:", device.max_work_item_sizes) # Simnple speed test ctx = cl.Context([device]) @@ -49,18 +56,38 @@ for platform in cl.get_platforms(): __kernel void sum(__global const float *a, __global const float *b, __global float *c) { - int loop; int gid = get_global_id(0); - for(loop=0; loop<1000;loop++) - { - c[gid] = a[gid] + b[gid]; - c[gid] = c[gid] * (a[gid] + b[gid]); - c[gid] = c[gid] * (a[gid] / 2.0); - } + float a_temp; + float b_temp; + float c_temp; + + a_temp = a[gid]; // my a element (by global ref) + b_temp = b[gid]; // my b element (by global ref) + + c_temp = a_temp+b_temp; // sum of my elements + c_temp = c_temp * c_temp; // product of sums + c_temp = c_temp * (a_temp/2.0); // times 1/2 my a + + c[gid] = c_temp; // store result in global memory } """).build() - exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf) + global_size=(data_points,) + local_size=(workers,) + preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \ + cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \ + device) + + print("Data points:", data_points) + print("Workers:", workers) + print("Preferred work group size multiple:", preferred_multiple) + + if (workers % preferred_multiple): + print("Number of workers not a preferred multiple (%d*N)." 
\ + % (preferred_multiple)) + print("Performance may be reduced.") + + exec_evt = prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf) exec_evt.wait() elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start) @@ -68,11 +95,9 @@ for platform in cl.get_platforms(): c = numpy.empty_like(a) cl.enqueue_read_buffer(queue, dest_buf, c).wait() - error = 0 - for i in range(1000): - if c[i] != c_result[i]: - error = 1 - if error: + equal = numpy.all( c == c_result) + + if not equal: print("Results doesn't match!!") else: print("Results OK") -- GitLab