From fa8335040f16d4351ef4a04b70ddba2bddf0297d Mon Sep 17 00:00:00 2001
From: dhj <dhj.consulting@gmail.com>
Date: Tue, 23 Oct 2012 19:35:24 -0500
Subject: [PATCH] Updated benchmark to use equivalent workloads for CPU and
 GPU.

* NumPy applies arithmetic operators element-wise by default.  Updated
  the CPU computation to use pure NumPy (see the first sketch after
  this list).

* Eliminated the loop, which is not necessary to demonstrate
  parallelism on array operations.

* Made the number of workers explicit, rather than GPU-chosen, via the
  local_size argument passed to the kernel execution (see the second
  sketch after this list).

* Increased the array size to ~8 million data points to more clearly
  demonstrate the difference between CPU- and GPU-based computations.
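
For reference, a minimal sketch (not part of the patch) of the
element-wise equivalence; the array size and variable names here are
toy values chosen for illustration:

    import numpy

    a = numpy.random.rand(8).astype(numpy.float32)
    b = numpy.random.rand(8).astype(numpy.float32)

    # old CPU path: explicit per-element loop
    looped = numpy.empty_like(a)
    for i in range(len(a)):
        looped[i] = ((a[i] + b[i]) ** 2) * (a[i] / 2.0)

    # new CPU path: NumPy operators apply element-wise across the arrays
    vectorized = ((a + b) ** 2) * (a / 2.0)

    # allclose rather than == because intermediate precision may differ
    assert numpy.allclose(looped, vectorized)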
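
And a sketch of how global_size and local_size relate; the numbers
mirror the benchmark, but the variable names are illustrative:

    data_points = 2**23                  # total work items (global size)
    workers = 2**8                       # work items per group (local size)
    assert data_points % workers == 0    # global must divide evenly by local
    num_groups = data_points // workers  # 2**15 = 32768 work-groups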
---
 examples/{benchmark-all.py => benchmark.py} | 65 ++++++++++++++-------
 1 file changed, 45 insertions(+), 20 deletions(-)
 rename examples/{benchmark-all.py => benchmark.py} (50%)

diff --git a/examples/benchmark-all.py b/examples/benchmark.py
similarity index 50%
rename from examples/benchmark-all.py
rename to examples/benchmark.py
index 09c423c6..d04f1833 100644
--- a/examples/benchmark-all.py
+++ b/examples/benchmark.py
@@ -1,23 +1,28 @@
 # example provided by Roger Pau Monné
 
+from __future__ import print_function
 import pyopencl as cl
 import numpy
 import numpy.linalg as la
 import datetime
 from time import time
 
-a = numpy.random.rand(1000).astype(numpy.float32)
-b = numpy.random.rand(1000).astype(numpy.float32)
+data_points = 2**23 # ~8 million data points, ~32 MB per array
+workers = 2**8 # 256 workers; play with this to see performance differences
+               # e.g. 2**0 => 1 worker gives non-parallel execution on the GPU
+               # data_points must be a multiple of workers
+
+a = numpy.random.rand(data_points).astype(numpy.float32)
+b = numpy.random.rand(data_points).astype(numpy.float32)
 c_result = numpy.empty_like(a)
 
 # Speed in normal CPU usage
 time1 = time()
-for i in range(1000):
-        for j in range(1000):
-                c_result[i] = a[i] + b[i]
-                c_result[i] = c_result[i] * (a[i] + b[i])
-                c_result[i] = c_result[i] * (a[i] / 2.0)
+c_temp = (a+b) # element-wise sum: adds each element in a to its counterpart in b
+c_temp = c_temp * c_temp # element-wise square of the sum
+c_result = c_temp * (a/2.0) # multiply element-wise by half of a
 time2 = time()
+
 print("Execution time of test without OpenCL: ", time2 - time1, "s")
 
 
@@ -34,6 +39,8 @@ for platform in cl.get_platforms():
         print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
         print("Device max clock speed:", device.max_clock_frequency, 'MHz')
         print("Device compute units:", device.max_compute_units)
+        print("Device max work group size:", device.max_work_group_size)
+        print("Device max work item sizes:", device.max_work_item_sizes)
 
         # Simple speed test
         ctx = cl.Context([device])
@@ -49,18 +56,38 @@ for platform in cl.get_platforms():
             __kernel void sum(__global const float *a,
             __global const float *b, __global float *c)
             {
-                        int loop;
                         int gid = get_global_id(0);
-                        for(loop=0; loop<1000;loop++)
-                        {
-                                c[gid] = a[gid] + b[gid];
-                                c[gid] = c[gid] * (a[gid] + b[gid]);
-                                c[gid] = c[gid] * (a[gid] / 2.0);
-                        }
+                        float a_temp;
+                        float b_temp;
+                        float c_temp;
+
+                        a_temp = a[gid]; // copy my a element from global memory
+                        b_temp = b[gid]; // copy my b element from global memory
+
+                        c_temp = a_temp+b_temp; // sum of my elements
+                        c_temp = c_temp * c_temp; // square of the sum
+                        c_temp = c_temp * (a_temp/2.0); // times half of my a element
+
+                        c[gid] = c_temp; // store result in global memory
                 }
                 """).build()
 
-        exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
+        global_size = (data_points,)
+        local_size = (workers,)
+        preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info(
+            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+            device)
+
+        print("Data points:", data_points)
+        print("Workers:", workers)
+        print("Preferred work group size multiple:", preferred_multiple)
+
+        if workers % preferred_multiple:
+            print("Number of workers is not a preferred multiple (%d*N)."
+                  % preferred_multiple)
+            print("Performance may be reduced.")
+
+        exec_evt = prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf)
         exec_evt.wait()
         elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
 
@@ -68,11 +95,9 @@ for platform in cl.get_platforms():
 
         c = numpy.empty_like(a)
         cl.enqueue_read_buffer(queue, dest_buf, c).wait()
-        error = 0
-        for i in range(1000):
-                if c[i] != c_result[i]:
-                        error = 1
-        if error:
+        equal = numpy.all(c == c_result)
+
+        if not equal:
                 print("Results doesn't match!!")
         else:
                 print("Results OK")
-- 
GitLab