From 1cbed32f994fdda15f03dc18fcb2d72fb6d5ee1b Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner
Date: Fri, 18 Sep 2009 03:42:35 -0400
Subject: [PATCH] Add examples/transpose.py.

---
 examples/transpose.py | 105 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 examples/transpose.py

diff --git a/examples/transpose.py b/examples/transpose.py
new file mode 100644
index 00000000..0159f32a
--- /dev/null
+++ b/examples/transpose.py
@@ -0,0 +1,105 @@
+# Transposition of a matrix
+# originally for PyCUDA by Hendrik Riedmann
+
+from __future__ import division
+
+import pyopencl as cl
+import numpy
+import numpy.linalg as la
+
+
+
+block_size = 16
+
+
+
+def inner_transpose(ctx, queue, tgt, src, shape):
+    knl = cl.Program(ctx, """
+        #define BLOCK_SIZE %(block_size)d
+        #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
+        #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
+
+        __kernel void transpose(
+            __global float *a_t, __global float *a,
+            unsigned a_width, unsigned a_height,
+            __local float a_local[BLOCK_SIZE][BLOCK_SIZE+1])
+        {
+            // Base indices in A and A_t
+            int base_idx_a =
+                get_group_id(0) * BLOCK_SIZE +
+                get_group_id(1) * A_BLOCK_STRIDE;
+            int base_idx_a_t =
+                get_group_id(1) * BLOCK_SIZE +
+                get_group_id(0) * A_T_BLOCK_STRIDE;
+
+            // Global indices in A and A_t
+            int glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1);
+            int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);
+
+            // Read the tile of A into local memory
+            a_local[get_local_id(1)][get_local_id(0)] = a[glob_idx_a];
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // Write the tile to A_t, transposed by swapping the local indices
+            a_t[glob_idx_a_t] = a_local[get_local_id(0)][get_local_id(1)];
+        }
+        """ % {"block_size": block_size}).build().transpose
+
+    w, h = shape
+    assert w % block_size == 0
+    assert h % block_size == 0
+
+    # One block_size x block_size work-group per tile; the extra column of
+    # padding in a_local avoids local-memory bank conflicts.
+    knl(queue, (w, h),
+        tgt, src, numpy.uint32(w), numpy.uint32(h),
+        cl.LocalMemory(4*block_size*(block_size+1)),
+        local_size=(block_size, block_size))
+
+
+
+def transpose_using_cl(ctx, queue, cpu_src):
+    mf = cl.mem_flags
+    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src)
+    a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes)
+    inner_transpose(ctx, queue, a_t_buf, a_buf, cpu_src.shape)
+
+    w, h = cpu_src.shape
+    result = numpy.empty((h, w), dtype=cpu_src.dtype)
+    cl.enqueue_read_buffer(queue, a_t_buf, result).wait()
+
+    a_buf.release()
+    a_t_buf.release()
+
+    return result
+
+
+
+def check_transpose():
+    ctx = cl.Context(dev_type=cl.device_type.ALL)
+
+    # The kernel relies on local memory.
+    for dev in ctx.devices:
+        assert dev.local_mem_size > 0
+
+    queue = cl.CommandQueue(ctx)
+
+    for i in numpy.arange(10, 13, 0.125):
+        # Round 2**i down to a multiple of 32, keeping both matrix
+        # dimensions divisible by block_size.
+        size = int(((2**i) // 32) * 32)
+        print(size)
+
+        source = numpy.random.rand(size, size).astype(numpy.float32)
+        result = transpose_using_cl(ctx, queue, source)
+
+        err = source.T - result
+        err_norm = la.norm(err)
+
+        # A transpose only moves values, so the result must match exactly.
+        assert err_norm == 0, (size, err_norm)
+
+
+
+check_transpose()
--
GitLab
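
The BLOCK_SIZE+1 column padding in the kernel's local array is what makes the
transposed read-out fast: after the barrier, the threads of one row of the
work-group read one column of the tile, and without padding every element of
that column would live in the same local-memory bank. A minimal sketch of the
arithmetic, assuming a hypothetical device with 16 four-byte banks (bank
counts vary by hardware and are not queried here):

import numpy

BLOCK_SIZE = 16
N_BANKS = 16  # assumption for illustration only

def banks_hit(row_stride):
    # Banks touched when 16 threads read one column of the tile,
    # i.e. a_local[lx][ly] for lx = 0..15 at a fixed ly.
    return sorted(set((lx * row_stride) % N_BANKS
                      for lx in range(BLOCK_SIZE)))

print(len(banks_hit(BLOCK_SIZE)))      # 1  -> 16-way conflict, serialized
print(len(banks_hit(BLOCK_SIZE + 1)))  # 16 -> conflict-free access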
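
The two "base + local offset" formulas in the kernel can be convinced of on
the host: the sketch below is a hypothetical pure-NumPy emulation of the same
index arithmetic (emulate_transpose is not part of the example), checked
against numpy's own transpose on a small square matrix.

import numpy

def emulate_transpose(a, block_size=4):
    w, h = a.shape  # same (w, h) = shape convention as the example
    a_flat = a.ravel()
    a_t_flat = numpy.empty(h * w, a.dtype)
    for gx in range(w // block_size):
        for gy in range(h // block_size):
            base_a = gx * block_size + gy * block_size * w
            base_a_t = gy * block_size + gx * block_size * h
            # Read the tile, as the kernel does before the barrier.
            tile = numpy.empty((block_size, block_size), a.dtype)
            for ly in range(block_size):
                for lx in range(block_size):
                    tile[ly, lx] = a_flat[base_a + lx + w * ly]
            # Write it back with the local indices swapped.
            for ly in range(block_size):
                for lx in range(block_size):
                    a_t_flat[base_a_t + lx + h * ly] = tile[lx, ly]
    return a_t_flat.reshape(w, h)

a = numpy.random.rand(8, 8).astype(numpy.float32)
assert (emulate_transpose(a) == a.T).all()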
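
Readers trying this 2009 patch against a current PyOpenCL should note that the
host-side API it uses was later superseded: enqueue_read_buffer() gave way to
enqueue_copy(), and the work-group size became a positional argument of the
kernel call rather than a local_size keyword. A hedged sketch of the same
steps with the later interface, assuming knl is the transpose kernel built
exactly as in the patch:

import numpy
import pyopencl as cl

def transpose_with_later_api(ctx, queue, knl, cpu_src, block_size=16):
    # Same steps as transpose_using_cl() above, against the later API.
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src)
    a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes)

    w, h = cpu_src.shape
    # Work-group size is now the third positional argument.
    knl(queue, (w, h), (block_size, block_size),
        a_t_buf, a_buf, numpy.uint32(w), numpy.uint32(h),
        cl.LocalMemory(4*block_size*(block_size+1)))

    result = numpy.empty((h, w), dtype=cpu_src.dtype)
    # enqueue_copy() replaced enqueue_read_buffer(); it blocks by
    # default when the destination is a host array.
    cl.enqueue_copy(queue, result, a_t_buf)
    return result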