Skip to content
Snippets Groups Projects
demo_meta_codepy.py 1.66 KiB
Newer Older
  • Learn to ignore specific revisions
  • import pyopencl as cl
    import numpy
    import numpy.linalg as la
    
    
    local_size = 256
    thread_strides = 32
    
    macroblock_count = 33
    dtype = numpy.float32
    
    total_size = local_size*thread_strides*macroblock_count
    
    ctx = cl.create_some_context()
    
    queue = cl.CommandQueue(ctx)
    
    a = numpy.random.randn(total_size).astype(dtype)
    b = numpy.random.randn(total_size).astype(dtype)
    
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
    
    
    # Syntax-tree node classes used below to assemble the kernel source.
    # Parenthesised imports are used instead of backslash continuations so
    # that stray blank lines cannot break the statement.
    from cgen import (
            FunctionBody,
            FunctionDeclaration, POD, Value,
            Pointer, Module, Block, Initializer, Assign, Const)
    from cgen.opencl import (
            CLKernel, CLGlobal,
            CLRequiredWorkGroupSize)
    
    # Assemble the source of a kernel named "add" as a cgen syntax tree.
    # It takes three pointer arguments (tgt, op1, op2) with element type
    # `dtype` and is declared with a required work-group size of
    # (local_size,).
    mod = Module([
        FunctionBody(
            CLKernel(CLRequiredWorkGroupSize((local_size,),
                FunctionDeclaration(
                    Value("void", "add"),
                    arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name))))
                        for name in ["tgt", "op1", "op2"]]))),
            Block([
                # idx = local id + (local_size*thread_strides) * group id:
                # the first element handled by this work-item, with each
                # work-group starting local_size*thread_strides elements
                # after the previous one.
                Initializer(POD(numpy.int32, "idx"),
                    "get_local_id(0) + %d * get_group_id(0)"
                    % (local_size*thread_strides))
                ]+[
                # One generated assignment per stride `o`; consecutive
                # assignments are local_size elements apart.
                Assign(
                    "tgt[idx+%d]" % (o*local_size),
                    "op1[idx+%d] + op2[idx+%d]" % (
                        o*local_size,
                        o*local_size))
                for o in range(thread_strides)]))])
    
    # Render the generated module to source text, build it for `ctx`, and
    # pick out the "add" kernel.
    knl = cl.Program(ctx, str(mod)).build().add

    # Global size is local_size*macroblock_count work-items in groups of
    # local_size; the buffers are passed in the kernel's (tgt, op1, op2)
    # argument order.
    knl(queue, (local_size*macroblock_count,), (local_size,),
            c_buf, a_buf, b_buf)

    # Copy the result back into a host array and wait for the transfer.
    c = numpy.empty_like(a)
    cl.enqueue_copy(queue, c, c_buf).wait()

    # Compare against the same sum computed on the host.
    # NOTE(review): the check demands an exactly zero difference; this
    # presumably relies on the device's float addition matching NumPy's
    # bit-for-bit -- confirm for the target device.
    assert la.norm(c-(a+b)) == 0