From 017fe9d3f8b1a31757f713f2437aec88d2d08a4f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Tue, 22 Jun 2010 21:25:00 -0400 Subject: [PATCH] Fix slowness of matrix-multiply example. --- examples/matrix-multiply.py | 108 +++++++++++++++++------------------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/examples/matrix-multiply.py b/examples/matrix-multiply.py index 791bd0e9..7e26f8c2 100644 --- a/examples/matrix-multiply.py +++ b/examples/matrix-multiply.py @@ -2,7 +2,7 @@ from __future__ import division -kernel_code = """ +KERNEL_CODE = """ // Thread block size #define BLOCK_SIZE %(block_size)d @@ -36,8 +36,8 @@ kernel_code = """ * Device code. */ -#define AS(i, j) As[i + j * BLOCK_SIZE] -#define BS(i, j) Bs[i + j * BLOCK_SIZE] +#define AS(j, i) As[i + j * BLOCK_SIZE] +#define BS(j, i) Bs[i + j * BLOCK_SIZE] //////////////////////////////////////////////////////////////////////////////// //! Matrix multiplication on the device: C = A * B @@ -45,9 +45,11 @@ kernel_code = """ //////////////////////////////////////////////////////////////////////////////// __kernel __attribute__((reqd_work_group_size(16,16,1))) void -matrixMul( __global float* C, __global float* A, __global float* B, - __local float* As, __local float* Bs) +matrixMul( __global float* C, __global float* A, __global float* B) { + __local float As[BLOCK_SIZE*BLOCK_SIZE]; + __local float Bs[BLOCK_SIZE*BLOCK_SIZE]; + // Block index int bx = get_group_id(0); int by = get_group_id(1); @@ -73,7 +75,7 @@ matrixMul( __global float* C, __global float* A, __global float* B, // Csub is used to store the element of the block sub-matrix // that is computed by the thread - float Csub = 0; + float Csub = 0.0f; // Loop over all the sub-matrices of A and B // required to compute the block sub-matrix @@ -119,14 +121,13 @@ block_size = 16 ctx = cl.create_some_context() for dev in ctx.devices: - assert dev.local_mem_size > 0 + assert dev.local_mem_size > 0 queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) #queue = cl.CommandQueue(ctx) - if False: a_height = 4096 #a_height = 1024 @@ -144,39 +145,33 @@ elif False: else: # CL SDK - a_height = 50*block_size - a_width = 100*block_size - b_height = a_width + a_width = 50*block_size + a_height = 100*block_size b_width = 50*block_size + b_height = a_width -h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32) -h_b = numpy.random.rand(a_width, a_height).astype(numpy.float32) -h_c = numpy.empty((a_height, a_height)).astype(numpy.float32) -print h_a.shape, h_b.shape +c_width = b_width +c_height = a_height -mf = cl.mem_flags +h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32) +h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32) +h_c = numpy.empty((c_height, c_width)).astype(numpy.float32) kernel_params = {"block_size": block_size, - "w_a":a_width, "h_a":a_height, "w_b":a_height} + "w_a":a_width, "h_a":a_height, "w_b":b_width} -prg = cl.Program(ctx, kernel_code % kernel_params, +prg = cl.Program(ctx, KERNEL_CODE % kernel_params, ).build(options="-cl-mad-enable -cl-fast-relaxed-math") kernel = prg.matrixMul #print prg.binaries[0] -#def __call__(self, queue, tgt, src, shape): -# w, h = shape - assert a_width % block_size == 0 assert a_height % block_size == 0 assert b_width % block_size == 0 -# kernel(queue, (w, h), tgt, src, numpy.uint32(w), numpy.uint32(h)) - -# __call__(queue, a_t_buf, a_buf, source.shape) - -# args: queue, domain, *args +# transfer host -> device ----------------------------------------------------- +mf = cl.mem_flags t1 = time() @@ -186,56 +181,57 @@ d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes) push_time = time()-t1 -# warmup -event = kernel(queue, (a_height,a_height), d_c_buf, d_a_buf, d_b_buf, - cl.LocalMemory(4* block_size**2), - cl.LocalMemory(4* block_size**2), - local_size=(block_size, block_size)) +# warmup ---------------------------------------------------------------------- +for i in range(5): + event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf, + local_size=(block_size, block_size)) event.wait() +queue.finish() + +# actual benchmark ------------------------------------------------------------ t1 = time() + count = 20 for i in range(count): - event = kernel(queue, (a_height,a_height), d_c_buf, d_a_buf, d_b_buf, - cl.LocalMemory(4* block_size**2), - cl.LocalMemory(4* block_size**2), + event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf, local_size=(block_size, block_size)) event.wait() +gpu_time = (time()-t1)/count -gpu_time = time()-t1 - +# transfer device -> host ----------------------------------------------------- t1 = time() - cl.enqueue_read_buffer(queue, d_c_buf, h_c).wait() - pull_time = time()-t1 -ans1 = h_c - +# timing output --------------------------------------------------------------- gpu_total_time = gpu_time+push_time+pull_time -print "GPU (s) total:", gpu_total_time -print "PUSH ", push_time -print "PULL ", pull_time -print "COMPUTE ", gpu_time/count -print "COMPUTE2 ", (event.profile.end-event.profile.start)*1e-9 +print "GPU push+compute+pull total [s]:", gpu_total_time +print "GPU push [s]:", push_time +print "GPU pull [s]:", pull_time +print "GPU compute (host-timed) [s]:", gpu_time +print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9 gflop = h_c.size * (a_width * 2.) / (1000**3.) -gflops = gflop / (gpu_time/count) -print "gflops:", gflops +gflops = gflop / gpu_time -do_cpu = False +print +print "GFlops/s:", gflops -if do_cpu: - t1 = time() - ans2 = numpy.dot(h_a,h_b) - - cpu_time = time()-t1 +# cpu comparison -------------------------------------------------------------- +t1 = time() +h_c_cpu = numpy.dot(h_a,h_b) +cpu_time = time()-t1 - print "CPU (s)", cpu_time +print +print "GPU==CPU:",numpy.allclose(h_c, h_c_cpu) +print +print "CPU time (s)", cpu_time +print - print "GPU speedup: ", cpu_time/gpu_total_time +print "GPU speedup (with transfer): ", cpu_time/gpu_total_time +print "GPU speedup (without transfer): ", cpu_time/gpu_time - print "GPU==CPU:",numpy.allclose(ans1,ans2) -- GitLab