Skip to content
Snippets Groups Projects
Commit 82936bbf authored by Andreas Klöckner's avatar Andreas Klöckner
Browse files

Merge pull request #8 from davethej/master

Made some changes to the benchmark example... let me know what you think.
parents 0724c5b6 fa833504
Branches
Tags
No related merge requests found
# example provided by Roger Pau Monné
# Imports for the PyOpenCL CPU-vs-GPU benchmark example.
# (De-garbled: the extracted diff view had fused each import line with
# its identical counterpart from the other diff column.)
from __future__ import print_function

import datetime
from time import time

import numpy
import numpy.linalg as la

import pyopencl as cl
# Benchmark parameters.
data_points = 2**23  # ~8 million data points, ~32 MB of float32 data
workers = 2**8       # 256 workers, play with this to see performance differences
                     # eg: 2**0 => 1 worker will be non-parallel execution on gpu
                     # data points must be a multiple of workers

# Random float32 inputs; c_result will hold the CPU reference answer.
a = numpy.random.rand(data_points).astype(numpy.float32)
b = numpy.random.rand(data_points).astype(numpy.float32)
c_result = numpy.empty_like(a)

# Speed in normal CPU usage: compute c = ((a+b)^2) * (a/2) with
# vectorized numpy operations and time it.
time1 = time()
c_temp = (a + b)               # adds each element in a to its corresponding element in b
c_temp = c_temp * c_temp       # element-wise multiplication
c_result = c_temp * (a / 2.0)  # element-wise half a and multiply
time2 = time()
print("Execution time of test without OpenCL: ", time2 - time1, "s")
# Report the capabilities of the selected OpenCL device.
# NOTE(review): per the elided diff context, these lines run inside
# "for platform in cl.get_platforms(): / for device in platform.get_devices():"
# — the loop header (and its indentation) is outside this extracted view;
# confirm against the full file.
print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
print("Device max clock speed:", device.max_clock_frequency, 'MHz')
print("Device compute units:", device.max_compute_units)
print("Device max work group size:", device.max_work_group_size)
print("Device max work item sizes:", device.max_work_item_sizes)

# Simple speed test
ctx = cl.Context([device])
# Build the element-wise benchmark kernel: c = ((a+b)^2) * (a/2),
# matching the CPU reference computation above.
# NOTE(review): the "prg = cl.Program(ctx, ..." opening and the queue/buffer
# setup are elided from this extracted diff view; the Program call is
# reconstructed here — confirm against the full file.
prg = cl.Program(ctx, """
    __kernel void sum(__global const float *a,
                      __global const float *b, __global float *c)
    {
        int gid = get_global_id(0);
        float a_temp;
        float b_temp;
        float c_temp;

        a_temp = a[gid]; // my a element (by global ref)
        b_temp = b[gid]; // my b element (by global ref)
        c_temp = a_temp + b_temp;         // sum of my elements
        c_temp = c_temp * c_temp;         // product of sums
        c_temp = c_temp * (a_temp / 2.0); // times 1/2 my a
        c[gid] = c_temp;                  // store result in global memory
    }
    """).build()

# One work-item per data point, grouped into "workers"-sized work groups.
global_size = (data_points,)
local_size = (workers,)

# Warn when the chosen work-group size is not a multiple the device prefers,
# since that can reduce occupancy/performance.
preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info(
    cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
    device)
print("Data points:", data_points)
print("Workers:", workers)
print("Preferred work group size multiple:", preferred_multiple)
if workers % preferred_multiple:
    print("Number of workers not a preferred multiple (%d*N)."
          % (preferred_multiple))
    print("Performance may be reduced.")

# Launch the kernel and time it via event profiling (timestamps are in ns).
exec_evt = prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf)
exec_evt.wait()
elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
# Copy the device result back to the host and compare it element-by-element
# against the CPU reference computed earlier.
c = numpy.empty_like(a)
# NOTE(review): enqueue_read_buffer is the legacy PyOpenCL API; modern
# PyOpenCL replaces it with cl.enqueue_copy — confirm the pyopencl version
# this example targets before changing it.
cl.enqueue_read_buffer(queue, dest_buf, c).wait()

# Vectorized equality check over the whole result array.
equal = numpy.all(c == c_result)

if not equal:
    print("Results doesn't match!!")
else:
    print("Results OK")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment