Newer
Older
import pyopencl as cl
import numpy
import numpy.linalg as la
macroblock_count = 33
dtype = numpy.float32
total_size = local_size*thread_strides*macroblock_count
queue = cl.CommandQueue(ctx)
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
Pointer, Module, Block, Initializer, Assign, Const
CLRequiredWorkGroupSize
mod = Module([
FunctionBody(
CLKernel(CLRequiredWorkGroupSize((local_size,),
FunctionDeclaration(
Value("void", "add"),
arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name))))
for name in ["tgt", "op1", "op2"]]))),
Block([
"get_local_id(0) + %d * get_group_id(0)"
% (local_size*thread_strides))
]+[
Assign(
"tgt[idx+%d]" % (o*local_size),
"op1[idx+%d] + op2[idx+%d]" % (
knl(queue, (local_size*macroblock_count,), (local_size,),
c_buf, a_buf, b_buf)