    # Transposition of a matrix
    # originally for PyCUDA by Hendrik Riedmann <riedmann@dam.brown.edu>
    
    import pyopencl as cl
    import numpy
    import numpy.linalg as la
    
    
    block_size = 16  # edge length of the square work-group tile used by the kernels
    
    
    
    class NaiveTranspose:
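        """One work-item per element: read a[y][x] from global memory and
        write it to a_t[x][y]. Reads are coalesced, writes are strided."""
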
        def __init__(self, ctx):
    
            self.kernel = (
                cl.Program(
                    ctx,
                    """
            __kernel void transpose(
    
              __global float *a_t, __global float *a,
              unsigned a_width, unsigned a_height)
            {
              int read_idx = get_global_id(0) + get_global_id(1) * a_width;
              int write_idx = get_global_id(1) + get_global_id(0) * a_height;
    
            """,)
                .build()
                .transpose
            )
    
        def __call__(self, queue, tgt, src, shape):
            w, h = shape
            assert w % block_size == 0
            assert h % block_size == 0
    
            return self.kernel(
                queue,
                (w, h),
                (block_size, block_size),
                tgt,
                src,
                numpy.uint32(w),
                numpy.uint32(h),
            )
    
    class SillyTranspose(NaiveTranspose):
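        """Same kernel as NaiveTranspose, but the work-group size is left to
        the OpenCL implementation (local size None) instead of a fixed
        block_size x block_size group."""
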
        def __call__(self, queue, tgt, src, shape):
            w, h = shape
            assert w % block_size == 0
            assert h % block_size == 0
    
            return self.kernel(
                queue, (w, h), None, tgt, src, numpy.uint32(w), numpy.uint32(h)
            )
    
    
    class TransposeWithLocal:
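        """Stage a BLOCK_SIZE x BLOCK_SIZE tile through local memory so that
        both the global read and the global write are coalesced; the transpose
        happens in the local-tile indexing."""
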
        def __init__(self, ctx):
    
            self.kernel = (
                cl.Program(
                    ctx,
                    """
    
            #define BLOCK_SIZE %(block_size)d
            #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
            #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
    
            __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
            void transpose(
              __global float *a_t, __global float *a,
              unsigned a_width, unsigned a_height,
    
              __local float *a_local)
    
            {
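              /* base_idx_a: origin of this work-group's tile in a;
                 base_idx_a_t: origin of the mirrored tile in a_t */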
              int base_idx_a   =
                get_group_id(0) * BLOCK_SIZE +
                get_group_id(1) * A_BLOCK_STRIDE;
              int base_idx_a_t =
                get_group_id(1) * BLOCK_SIZE +
                get_group_id(0) * A_T_BLOCK_STRIDE;
    
    
              int glob_idx_a   =
                base_idx_a + get_local_id(0) + a_width * get_local_id(1);
              int glob_idx_a_t =
                base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);
    
              a_local[get_local_id(1)*BLOCK_SIZE+get_local_id(0)] = a[glob_idx_a];
    
              /* wait until the whole tile is in local memory before reading it back */
              barrier(CLK_LOCAL_MEM_FENCE);

              a_t[glob_idx_a_t] = a_local[get_local_id(0)*BLOCK_SIZE+get_local_id(1)];
            }
            """
                    % {"block_size": block_size},
                )
                .build()
                .transpose
            )
    
    
        def __call__(self, queue, tgt, src, shape):
            w, h = shape
            assert w % block_size == 0
            assert h % block_size == 0
    
    
            return self.kernel(
                queue,
                (w, h),
                (block_size, block_size),
                tgt,
                src,
                numpy.uint32(w),
                numpy.uint32(h),
                cl.LocalMemory(4 * block_size * (block_size + 1)),  # bytes of local scratch: float32 tile of block_size x (block_size + 1)
            )
    
    
    
    def transpose_using_cl(ctx, queue, cpu_src, cls):
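        """Upload cpu_src, run cls's transpose kernel on it, and return the
        transposed result as a new numpy array."""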
    
        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src)
        a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes)
    
        cls(ctx)(queue, a_t_buf, a_buf, cpu_src.shape)
    
    
        w, h = cpu_src.shape
        result = numpy.empty((h, w), dtype=cpu_src.dtype)
    
        cl.enqueue_copy(queue, result, a_t_buf).wait()
    
    
        a_buf.release()
        a_t_buf.release()
    
        return result
    
    
    def check_transpose():
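        """Check each variant against numpy's transpose; the kernels only move
        float32 values, so the result must match exactly."""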
    
        for cls in [NaiveTranspose, SillyTranspose, TransposeWithLocal]:
    
            print("checking", cls.__name__)
    
            ctx = cl.create_some_context()
    
    
            for dev in ctx.devices:
                assert dev.local_mem_size > 0
    
            queue = cl.CommandQueue(ctx)
    
            for i in numpy.arange(10, 13, 0.125):
    
                size = int(((2 ** i) // 32) * 32)
    
                print(size)
    
    
                source = numpy.random.rand(size, size).astype(numpy.float32)
                result = transpose_using_cl(ctx, queue, source, cls)
    
                err = source.T - result
                err_norm = la.norm(err)
    
                assert err_norm == 0, (size, err_norm)
    
    
    def benchmark_transpose():
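        """Measure each variant's effective memory bandwidth using event
        profiling, after a few warm-up runs per size."""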
    
        ctx = cl.create_some_context()
    
    
        for dev in ctx.devices:
            assert dev.local_mem_size > 0
    
    
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
        )
    
        sizes = [int(((2 ** i) // 32) * 32) for i in numpy.arange(10, 13, 0.125)]
        # for i in numpy.arange(10, 10.5, 0.125)]
    
    
        mem_bandwidths = {}
    
        methods = [SillyTranspose, NaiveTranspose, TransposeWithLocal]
        for cls in methods:
            name = cls.__name__.replace("Transpose", "")
    
            mem_bandwidths[cls] = meth_mem_bws = []
    
            for size in sizes:
    
                source = numpy.random.rand(size, size).astype(numpy.float32)
    
                mf = cl.mem_flags
                a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=source)
                a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=source.nbytes)
                method = cls(ctx)
    
    
                for _i in range(4):  # warm up: untimed runs before profiling
    
                    method(queue, a_t_buf, a_buf, source.shape)
    
                count = 12
                events = []
    
                for _i in range(count):
    
                    events.append(method(queue, a_t_buf, a_buf, source.shape))
    
                events[-1].wait()
                time = sum(evt.profile.end - evt.profile.start for evt in events)
    
    
                # each run reads and writes the full matrix once; event times are in ns
                mem_bw = 2 * source.nbytes * count / (time * 1e-9)
                print("benchmarking", name, size, mem_bw / 1e9, "GB/s")
    
                meth_mem_bws.append(mem_bw)
    
                a_buf.release()
                a_t_buf.release()
    
        try:
            from matplotlib.pyplot import (
                clf,
                plot,
                xlabel,
                ylabel,
                savefig,
                legend,
                grid,
            )
        except ModuleNotFoundError:
            pass
        else:
            # cumulative comparison plots: figure i shows methods[0..i]
            for i in range(len(methods)):
                clf()

                for j in range(i + 1):
                    method = methods[j]
                    name = method.__name__.replace("Transpose", "")

                    plot(sizes, numpy.array(mem_bandwidths[method]) / 1e9, "o-",
                            label=name)

                xlabel("Matrix width/height $N$")
                ylabel("Memory Bandwidth [GB/s]")
                legend(loc="best")
                grid()

                savefig("transpose-benchmark-%d.pdf" % i)