diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1145abfbfca87863edd6f6bc79a44a3f6dee3d8b
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1 @@
+wiki-examples
diff --git a/examples/download-examples-from-wiki.py b/examples/download-examples-from-wiki.py
new file mode 100755
index 0000000000000000000000000000000000000000..8740391051b722a81c42900c3a74f09cb047158b
--- /dev/null
+++ b/examples/download-examples-from-wiki.py
@@ -0,0 +1,56 @@
+#! /usr/bin/env python
+
+import xmlrpclib
+destwiki = xmlrpclib.ServerProxy("http://wiki.tiker.net?action=xmlrpc2")
+
+import os
+try:
+    os.mkdir("wiki-examples")
+except OSError:
+    pass
+
+print "downloading wiki examples to wiki-examples/..."
+print "fetching page list..."
+all_pages = destwiki.getAllPages()
+
+
+from os.path import exists
+
+for page in all_pages:
+    if not page.startswith("PyOpenCL/Examples/"):
+        continue
+
+    print page
+    try:
+        content = destwiki.getPage(page)
+
+        import re
+        match = re.search(r"\{\{\{\#\!python(.*)\}\}\}", content, re.DOTALL)
+        code = match.group(1)
+
+        match = re.search("([^/]+)$", page)
+        fname = match.group(1)
+
+        outfname = os.path.join("wiki-examples", fname+".py")
+        if exists(outfname):
+            print "%s exists, refusing to overwrite." % outfname
+        else:
+            outf = open(outfname, "w")
+            outf.write(code)
+            outf.close()
+
+        for att_name in destwiki.listAttachments(page):
+            content = destwiki.getAttachment(page, att_name)
+
+            outfname = os.path.join("wiki-examples", att_name)
+            if exists(outfname):
+                print "%s exists, refusing to overwrite." % outfname
+            else:
+                outf = open(outfname, "w")
+                outf.write(str(content))
+                outf.close()
+
+    except Exception, e:
+        print "Error when processing %s: %s" % (page, e)
+        from traceback import print_exc
+        print_exc()
diff --git a/examples/matrix-multiply.py b/examples/matrix-multiply.py
deleted file mode 100644
index 7181cfc9eea385c8eafa8c33dc87e1940a452582..0000000000000000000000000000000000000000
--- a/examples/matrix-multiply.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# example provided by Eilif Muller
-
-from __future__ import division
-
-KERNEL_CODE = """
-
-// Thread block size
-#define BLOCK_SIZE %(block_size)d
-
-// Matrix dimensions
-// (chosen as multiples of the thread block size for simplicity)
-#define WA %(w_a)d // Matrix A width
-#define HA %(h_a)d // Matrix A height
-#define WB %(w_b)d // Matrix B width
-#define HB WA // Matrix B height
-#define WC WB // Matrix C width
-#define HC HA // Matrix C height
-
-
-/*
- * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and
- * proprietary rights in and to this software and related documentation.
- * Any use, reproduction, disclosure, or distribution of this software
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
- *
- */
-
-/* Matrix multiplication: C = A * B.
- * Device code.
- */
-
-#define AS(j, i) As[i + j * BLOCK_SIZE]
-#define BS(j, i) Bs[i + j * BLOCK_SIZE]
-
-////////////////////////////////////////////////////////////////////////////////
-//! Matrix multiplication on the device: C = A * B
-//! WA is A's width and WB is B's width
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
-void
-matrixMul( __global float* C, __global float* A, __global float* B)
-{
-    __local float As[BLOCK_SIZE*BLOCK_SIZE];
-    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
-
-    // Block index
-    int bx = get_group_id(0);
-    int by = get_group_id(1);
-
-    // Thread index
-    int tx = get_local_id(0);
-    int ty = get_local_id(1);
-
-    // Index of the first sub-matrix of A processed by the block
-    int aBegin = WA * BLOCK_SIZE * by;
-
-    // Index of the last sub-matrix of A processed by the block
-    int aEnd = aBegin + WA - 1;
-
-    // Step size used to iterate through the sub-matrices of A
-    int aStep = BLOCK_SIZE;
-
-    // Index of the first sub-matrix of B processed by the block
-    int bBegin = BLOCK_SIZE * bx;
-
-    // Step size used to iterate through the sub-matrices of B
-    int bStep = BLOCK_SIZE * WB;
-
-    // Csub is used to store the element of the block sub-matrix
-    // that is computed by the thread
-    float Csub = 0.0f;
-
-    // Loop over all the sub-matrices of A and B
-    // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin;
-            a <= aEnd;
-            a += aStep, b += bStep) {
-
-        // Load the matrices from device memory
-        // to shared memory; each thread loads
-        // one element of each matrix
-        AS(ty, tx) = A[a + WA * ty + tx];
-        BS(ty, tx) = B[b + WB * ty + tx];
-
-        // Synchronize to make sure the matrices are loaded
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Multiply the two matrices together;
-        // each thread computes one element
-        // of the block sub-matrix
-        for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
-
-        // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    // Write the block sub-matrix to device memory;
-    // each thread writes one element
-    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
-
-}
-
-"""
-
-import pyopencl as cl
-from time import time
-import numpy
-
-block_size = 16
-
-ctx = cl.create_some_context()
-
-for dev in ctx.devices:
-    assert dev.local_mem_size > 0
-
-queue = cl.CommandQueue(ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-#queue = cl.CommandQueue(ctx)
-
-if False:
-    a_height = 4096
-    #a_height = 1024
-    a_width = 2048
-    #a_width = 256
-    #b_height == a_width
-    b_width = a_height
-
-elif False:
-    # like PyCUDA
-    a_height = 2516
-    a_width = 1472
-    b_height = a_width
-    b_width = 2144
-
-else:
-    # CL SDK
-    a_width = 50*block_size
-    a_height = 100*block_size
-    b_width = 50*block_size
-    b_height = a_width
-
-c_width = b_width
-c_height = a_height
-
-h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
-h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
-h_c = numpy.empty((c_height, c_width)).astype(numpy.float32)
-
-
-kernel_params = {"block_size": block_size,
-        "w_a":a_width, "h_a":a_height, "w_b":b_width}
-
-if "NVIDIA" in queue.device.vendor:
-    options = "-cl-mad-enable -cl-fast-relaxed-math"
-else:
-    options = ""
-prg = cl.Program(ctx, KERNEL_CODE % kernel_params,
-        ).build(options=options)
-kernel = prg.matrixMul
-#print prg.binaries[0]
-
-assert a_width % block_size == 0
-assert a_height % block_size == 0
-assert b_width % block_size == 0
-
-# transfer host -> device -----------------------------------------------------
-mf = cl.mem_flags
-
-t1 = time()
-
-d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
-d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
-d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)
-
-push_time = time()-t1
-
-# warmup ----------------------------------------------------------------------
-for i in range(5):
-    event = kernel(queue, h_c.shape[::-1], (block_size, block_size),
-            d_c_buf, d_a_buf, d_b_buf)
-    event.wait()
-
-queue.finish()
-
-# actual benchmark ------------------------------------------------------------
-t1 = time()
-
-count = 20
-for i in range(count):
-    event = kernel(queue, h_c.shape[::-1], (block_size, block_size),
-            d_c_buf, d_a_buf, d_b_buf)
-
-event.wait()
-
-gpu_time = (time()-t1)/count
-
-# transfer device -> host -----------------------------------------------------
-t1 = time()
-cl.enqueue_copy(queue, h_c, d_c_buf)
-pull_time = time()-t1
-
-# timing output ---------------------------------------------------------------
-gpu_total_time = gpu_time+push_time+pull_time
-
-print "GPU push+compute+pull total [s]:", gpu_total_time
-print "GPU push [s]:", push_time
-print "GPU pull [s]:", pull_time
-print "GPU compute (host-timed) [s]:", gpu_time
-print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9
-
-gflop = h_c.size * (a_width * 2.) / (1000**3.)
-gflops = gflop / gpu_time
-
-print
-print "GFlops/s:", gflops
-
-# cpu comparison --------------------------------------------------------------
-t1 = time()
-h_c_cpu = numpy.dot(h_a,h_b)
-cpu_time = time()-t1
-
-print
-print "GPU==CPU:",numpy.allclose(h_c, h_c_cpu)
-print
-print "CPU time (s)", cpu_time
-print
-
-print "GPU speedup (with transfer): ", cpu_time/gpu_total_time
-print "GPU speedup (without transfer): ", cpu_time/gpu_time
-
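
Note: the downloader added above is written for Python 2 (xmlrpclib, print
statements, "except Exception, e"). For readers on Python 3, here is a minimal
untested sketch of an equivalent script; it assumes the wiki's MoinMoin
XML-RPC methods (getAllPages, getPage, listAttachments, getAttachment) behave
exactly as the Python 2 version expects, and that xmlrpclib's Python 3
replacement, xmlrpc.client, is used in its place.

#! /usr/bin/env python3
# Untested Python 3 sketch of download-examples-from-wiki.py above.
# Assumes the wiki's XML-RPC interface is unchanged; xmlrpclib is named
# xmlrpc.client in Python 3.

import os
import re
import xmlrpc.client
from traceback import print_exc

destwiki = xmlrpc.client.ServerProxy("http://wiki.tiker.net?action=xmlrpc2")

os.makedirs("wiki-examples", exist_ok=True)

print("downloading wiki examples to wiki-examples/...")
print("fetching page list...")

for page in destwiki.getAllPages():
    if not page.startswith("PyOpenCL/Examples/"):
        continue

    print(page)
    try:
        content = destwiki.getPage(page)

        # Pull the code out of the {{{#!python ... }}} wiki markup.
        code = re.search(r"\{\{\{#!python(.*)\}\}\}", content, re.DOTALL).group(1)
        fname = re.search(r"([^/]+)$", page).group(1)

        outfname = os.path.join("wiki-examples", fname + ".py")
        if os.path.exists(outfname):
            print("%s exists, refusing to overwrite." % outfname)
        else:
            with open(outfname, "w") as outf:
                outf.write(code)

        for att_name in destwiki.listAttachments(page):
            att = destwiki.getAttachment(page, att_name)

            outfname = os.path.join("wiki-examples", att_name)
            if os.path.exists(outfname):
                print("%s exists, refusing to overwrite." % outfname)
            else:
                # Attachments arrive as xmlrpc.client.Binary; write the raw
                # bytes rather than str()-converting as in the Python 2 version.
                with open(outfname, "wb") as outf:
                    outf.write(att.data)

    except Exception as e:
        print("Error when processing %s: %s" % (page, e))
        print_exc()

As with the Python 2 version, run it from examples/; downloaded files land in
wiki-examples/, which the .gitignore added above keeps out of version control,
and existing files are never overwritten.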