-
Andreas Klöckner authoredAndreas Klöckner authored
transpose.py 2.66 KiB
# Transposition of a matrix
# originally for PyCUDA by Hendrik Riedmann <riedmann@dam.brown.edu>
from __future__ import division
import pyopencl as cl
import numpy
import numpy.linalg as la
block_size = 16
def inner_transpose(ctx, queue, tgt, src, shape):
knl = cl.Program(ctx, """
#define BLOCK_SIZE %(block_size)d
#define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
#define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
__kernel void transpose(
__global float *a_t, __global float *a,
unsigned a_width, unsigned a_height,
__local float a_local[BLOCK_SIZE][BLOCK_SIZE+1])
{
// Base indices in A and A_t
int base_idx_a =
get_group_id(0) * BLOCK_SIZE +
get_group_id(1) * A_BLOCK_STRIDE;
int base_idx_a_t =
get_group_id(1) * BLOCK_SIZE +
get_group_id(0) * A_T_BLOCK_STRIDE;
// Global indices in A and A_t
int glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1);
int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);
// Store transposed submatrix to local memory
a_local[get_local_id(1)][get_local_id(0)] = a[glob_idx_a];
barrier(CLK_LOCAL_MEM_FENCE);
// Write transposed submatrix to global memory
a_t[glob_idx_a_t] = a_local[get_local_id(0)][get_local_id(1)];
}
"""% {"block_size": block_size}).build().transpose
w, h = shape
assert w % block_size == 0
assert h % block_size == 0
knl(queue, (w, h),
tgt, src, numpy.uint32(w), numpy.uint32(h),
cl.LocalMemory(4*block_size*(block_size+1)),
local_size=(block_size, block_size))
def transpose_using_cl(ctx, queue, cpu_src):
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src)
a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes)
inner_transpose(ctx, queue, a_t_buf, a_buf, cpu_src.shape)
w, h = cpu_src.shape
result = numpy.empty((h, w), dtype=cpu_src.dtype)
cl.enqueue_read_buffer(queue, a_t_buf, result).wait()
a_buf.release()
a_t_buf.release()
return result
def check_transpose():
ctx = cl.Context(dev_type=cl.device_type.ALL)
for dev in ctx.devices:
assert dev.local_mem_size > 0
queue = cl.CommandQueue(ctx)
from pycuda.curandom import rand
for i in numpy.arange(10, 13, 0.125):
size = int(((2**i) // 32) * 32)
print size
source = numpy.random.rand(size, size).astype(numpy.float32)
result = transpose_using_cl(ctx, queue, source)
err = source.T - result
err_norm = la.norm(err)
assert err_norm == 0, (size, err_norm)
check_transpose()