# Newer
# Older
import pyopencl as cl
import numpy
import numpy.linalg as la
# --- Problem geometry -------------------------------------------------------
# Each work-group ("macroblock") has `local_size` work-items, and every
# work-item handles `thread_strides` elements, so the vectors hold
# local_size * thread_strides * macroblock_count elements in total.
#
# NOTE(review): `local_size` and `thread_strides` were referenced below but
# never defined in this file. The values here are conventional choices
# (a power-of-two work-group size) -- confirm against the intended config.
local_size = 256
thread_strides = 32
macroblock_count = 33
dtype = numpy.float32
total_size = local_size*thread_strides*macroblock_count

# --- OpenCL context and queue -----------------------------------------------
# NOTE(review): `ctx` and `queue` were used without ever being created.
# `create_some_context()` picks (or interactively asks for) a platform/device.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# --- Host input data ----------------------------------------------------------
# Two random input vectors, converted to the element type used by the kernel.
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# --- Device buffers -----------------------------------------------------------
# Inputs are read-only on the device and initialised by copying the host
# arrays; the output buffer is write-only and sized like one input vector.
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
from mako.template import Template
# Mako template for an OpenCL kernel computing tgt = op1 + op2 element-wise.
#
# The `% for` loop is expanded at render time, so the generated kernel
# contains `thread_strides` explicit add statements, each with a literal
# offset of i*local_size.
#
# Indexing: the launch below uses `macroblock_count` work-groups of
# `local_size` work-items. Work-group g covers the element range starting at
# g * local_size * thread_strides, and inside that range work-item l touches
# elements l, l + local_size, l + 2*local_size, ...
#
# NOTE(review): the kernel braces, the `int idx = ...` declaration and the
# closing of the template string were missing from this file; they are
# reconstructed here from the surviving fragment
# `+ ${ local_size } * ${ thread_strides }` and the launch geometry.
tpl = Template("""
    __kernel void add(
            __global ${ type_name } *tgt,
            __global const ${ type_name } *op1,
            __global const ${ type_name } *op2)
    {
      int idx = get_local_id(0)
        + ${ local_size } * ${ thread_strides }
        * get_group_id(0);

      % for i in range(thread_strides):
          <% offset = i*local_size %>
          tgt[idx + ${ offset }] =
            op1[idx + ${ offset }]
            + op2[idx + ${ offset } ];
      % endfor
    }""")

# Instantiate the template. "float" is the OpenCL C element type matching
# the numpy.float32 host arrays.
rendered_tpl = tpl.render(type_name="float",
    local_size=local_size, thread_strides=thread_strides)

# Compile the generated source and fetch the `add` kernel from the program.
knl = cl.Program(ctx, str(rendered_tpl)).build().add

# Global size is local_size * macroblock_count work-items, in groups of
# local_size. Argument order follows the kernel signature: tgt, op1, op2.
knl(queue, (local_size*macroblock_count,), (local_size,),
        c_buf, a_buf, b_buf)