Skip to content
Snippets Groups Projects

Aliasing in CUDA

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by Kaushik Kulkarni
    aliasing_in_cuda.py 2.25 KiB
    import numpy as np
    import pycuda.autoinit  # noqa
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    
    
    # Compile two kernels that compute the same result:
    #  * the_best_kernel:         two separate 32-double __shared__ arrays a, b.
    #  * the_best_aliased_kernel: one 64-double __shared__ buffer, with a and b
    #    carved out of its two disjoint 32-double halves via pointer casts.
    # Each thread writes a[31-loc_tid] = 2*loc_tid and b[31-loc_tid] = 4*loc_tid,
    # then after __syncthreads() reads forward, so out[glob_tid] = 6*(31-loc_tid).
    # NOTE(review): the `__shared__` qualifier inside the C-style casts in
    # the_best_aliased_kernel is non-standard CUDA C++ (memory-space specifiers
    # belong on declarations, not cast types) — confirm nvcc on the target
    # toolchain accepts it rather than warning/erroring.
    prg = SourceModule(
            """
    __global__ void __launch_bounds__(32) the_best_kernel(double * out)
    {
        __shared__ double a[32];
        __shared__ double b[32];
    
        int loc_tid = threadIdx.x;
        int glob_tid = 32*blockIdx.x + threadIdx.x;
    
        a[31-loc_tid] = 2.0*loc_tid;
        b[31-loc_tid] = 4.0*loc_tid;
    
        __syncthreads();
        out[glob_tid] = a[loc_tid] + b[loc_tid];
    }
    
    __global__ void __launch_bounds__(32) the_best_aliased_kernel(double * out)
    {
        __shared__ double temp_storage[64];
        double *const __restrict__ a = (__shared__ double *const __restrict__ ) (temp_storage);
        double *const __restrict__ b = (__shared__ double *const __restrict__ ) (temp_storage+32);
    
        int loc_tid = threadIdx.x;
        int glob_tid = 32*blockIdx.x + threadIdx.x;
    
        a[31-loc_tid] = 2.0*loc_tid;
        b[31-loc_tid] = 4.0*loc_tid;
    
        __syncthreads();
        out[glob_tid] = a[loc_tid] + b[loc_tid];
    }
    """)
    
    
    knl = prg.get_function('the_best_kernel')
    aliased_knl = prg.get_function('the_best_aliased_kernel')
    
    grid_size = 409600
    block_size = 32
    
    # One double (8 bytes) of output per thread.
    out1 = cuda.mem_alloc(8*grid_size * block_size)
    out2 = cuda.mem_alloc(8*grid_size * block_size)
    
    
    def _timed_launch(kernel, out_buf):
        """Launch *kernel* once writing to *out_buf*; return GPU time in seconds.
    
        Uses a pair of CUDA events around the launch; ``time_till`` reports
        milliseconds, hence the division by 1000.
        """
        start = cuda.Event()
        end = cuda.Event()
        start.record()
        # Make sure all previously queued work has drained before the
        # timed launch, so it is not attributed to this kernel.
        start.synchronize()
        kernel(out_buf, block=(block_size, 1, 1), grid=(grid_size, 1))
        end.record()
        end.synchronize()
        return start.time_till(end)/1000
    
    
    for _ in range(10):  # warmup
        knl(out1, block=(block_size, 1, 1), grid=(grid_size, 1))
        aliased_knl(out2, block=(block_size, 1, 1), grid=(grid_size, 1))
    
    knl_time = 0
    aliased_time = 0
    rounds = 300
    for _ in range(rounds):
        knl_time += _timed_launch(knl, out1)
        aliased_time += _timed_launch(aliased_knl, out2)
    
    # Sanity check: both kernels must produce identical results.
    out1_host = np.empty(grid_size * block_size)
    out2_host = np.empty(grid_size * block_size)
    cuda.memcpy_dtoh(out1_host, out1)
    cuda.memcpy_dtoh(out2_host, out2)
    assert np.allclose(out1_host, out2_host)
    
    
    print('the_best_kernel time: {}s'.format(knl_time/rounds))
    print('the_best_aliased_kernel time: {}s'.format(aliased_time/rounds))
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or sign in to comment