Aliasing in CUDA
The snippet can be accessed without any authentication.
Authored by
Kaushik Kulkarni
aliasing_in_cuda.py 2.25 KiB
import numpy as np
import pycuda.autoinit # noqa
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
# Compile two CUDA kernels for a shared-memory aliasing comparison.
# Both kernels fill two 32-element double arrays in reversed thread order
# (a[31-tid], b[31-tid]), synchronize, and write a[tid] + b[tid] to global
# memory — so every output element is 2*(31-tid) + 4*(31-tid) per block.
#   - the_best_kernel: two separate __shared__ arrays.
#   - the_best_aliased_kernel: one 64-element __shared__ buffer split into
#     two halves via __restrict__-qualified pointers.
# NOTE(review): the casts `(__shared__ double *const __restrict__)` place a
# memory-space qualifier inside a cast type; depending on nvcc version this
# is ignored with a warning or rejected outright — confirm this string
# actually compiles on the toolchain in use.
prg = SourceModule(
"""
__global__ void __launch_bounds__(32) the_best_kernel(double * out)
{
__shared__ double a[32];
__shared__ double b[32];
int loc_tid = threadIdx.x;
int glob_tid = 32*blockIdx.x + threadIdx.x;
a[31-loc_tid] = 2.0*loc_tid;
b[31-loc_tid] = 4.0*loc_tid;
__syncthreads();
out[glob_tid] = a[loc_tid] + b[loc_tid];
}
__global__ void __launch_bounds__(32) the_best_aliased_kernel(double * out)
{
__shared__ double temp_storage[64];
double *const __restrict__ a = (__shared__ double *const __restrict__ ) (temp_storage);
double *const __restrict__ b = (__shared__ double *const __restrict__ ) (temp_storage+32);
int loc_tid = threadIdx.x;
int glob_tid = 32*blockIdx.x + threadIdx.x;
a[31-loc_tid] = 2.0*loc_tid;
b[31-loc_tid] = 4.0*loc_tid;
__syncthreads();
out[glob_tid] = a[loc_tid] + b[loc_tid];
}
""")
# Launch configuration: one warp-sized block per grid entry.
block_size = 32
grid_size = 409600
# Look up the two compiled kernels by name.
knl = prg.get_function('the_best_kernel')
aliased_knl = prg.get_function('the_best_aliased_kernel')
# One float64 output slot per thread across the whole grid.
nbytes = 8 * grid_size * block_size  # sizeof(double) * total threads
out1 = cuda.mem_alloc(nbytes)
out2 = cuda.mem_alloc(nbytes)
# Warm up both kernels so the timed runs below exclude first-launch overhead.
for _ in range(10):
    knl(out1, block=(block_size, 1, 1), grid=(grid_size, 1))
    aliased_knl(out2, block=(block_size, 1, 1), grid=(grid_size, 1))
def _time_one_launch(kernel, buf):
    """Return the GPU time in seconds for a single launch of *kernel* on *buf*.

    Uses CUDA events: record/synchronize `start`, launch, record `end`,
    then synchronize `end` so `time_till` (milliseconds) is valid.
    """
    start = cuda.Event()
    end = cuda.Event()
    start.record()
    start.synchronize()
    kernel(buf, block=(block_size, 1, 1), grid=(grid_size, 1))
    end.record()
    end.synchronize()
    return start.time_till(end) / 1000  # ms -> s


# Accumulate per-launch times, interleaving the two kernels each round
# so both see similar GPU clock/thermal conditions.
knl_time = 0.0
aliased_time = 0.0
rounds = 300
for _ in range(rounds):
    knl_time += _time_one_launch(knl, out1)
    aliased_time += _time_one_launch(aliased_knl, out2)
# Copy both device buffers back to the host and check the two kernels
# produced (numerically) identical results.
n_elems = grid_size * block_size
out1_host = np.empty(n_elems)  # np.empty defaults to float64, matching the 8-byte elements
out2_host = np.empty(n_elems)
cuda.memcpy_dtoh(out1_host, out1)
cuda.memcpy_dtoh(out2_host, out2)
assert np.allclose(out1_host, out2_host)
# Report mean per-launch time of each kernel.
mean_knl = knl_time / rounds
mean_aliased = aliased_time / rounds
print('the_best_kernel time: {}s'.format(mean_knl))
print('the_best_aliased_kernel time: {}s'.format(mean_aliased))
Please register or sign in to comment