Skip to content
Snippets Groups Projects

Aliasing in CUDA

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by Kaushik Kulkarni
    aliasing_in_cuda.py 2.25 KiB
    import numpy as np
    import pycuda.autoinit  # noqa
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    
    
    # Compile two kernels that compute the same result:
    #  * the_best_kernel:         two separate 32-double __shared__ arrays a, b.
    #  * the_best_aliased_kernel: one 64-double __shared__ buffer, with a and b
    #    carved out of its two disjoint 32-double halves via pointer casts.
    # Each thread writes a[31-loc_tid] = 2*loc_tid and b[31-loc_tid] = 4*loc_tid,
    # then after __syncthreads() reads forward, so out[glob_tid] = 6*(31-loc_tid).
    # NOTE(review): the `__shared__` qualifier inside the C-style casts in
    # the_best_aliased_kernel is non-standard CUDA C++ (memory-space specifiers
    # belong on declarations, not cast types) — confirm nvcc on the target
    # toolchain accepts it rather than warning/erroring.
    prg = SourceModule(
            """
    __global__ void __launch_bounds__(32) the_best_kernel(double * out)
    {
        __shared__ double a[32];
        __shared__ double b[32];
    
        int loc_tid = threadIdx.x;
        int glob_tid = 32*blockIdx.x + threadIdx.x;
    
        a[31-loc_tid] = 2.0*loc_tid;
        b[31-loc_tid] = 4.0*loc_tid;
    
        __syncthreads();
        out[glob_tid] = a[loc_tid] + b[loc_tid];
    }
    
    __global__ void __launch_bounds__(32) the_best_aliased_kernel(double * out)
    {
        __shared__ double temp_storage[64];
        double *const __restrict__ a = (__shared__ double *const __restrict__ ) (temp_storage);
        double *const __restrict__ b = (__shared__ double *const __restrict__ ) (temp_storage+32);
    
        int loc_tid = threadIdx.x;
        int glob_tid = 32*blockIdx.x + threadIdx.x;
    
        a[31-loc_tid] = 2.0*loc_tid;
        b[31-loc_tid] = 4.0*loc_tid;
    
        __syncthreads();
        out[glob_tid] = a[loc_tid] + b[loc_tid];
    }
    """)
    
    
    knl = prg.get_function('the_best_kernel')
    aliased_knl = prg.get_function('the_best_aliased_kernel')
    
    grid_size = 409600
    block_size = 32
    
    # One double (8 bytes) of output per thread.
    out1 = cuda.mem_alloc(8*grid_size * block_size)
    out2 = cuda.mem_alloc(8*grid_size * block_size)
    
    
    def _timed_launch(kernel, out_buf):
        """Launch *kernel* once writing to *out_buf*; return GPU time in seconds.
    
        Uses a pair of CUDA events around the launch; ``time_till`` reports
        milliseconds, hence the division by 1000.
        """
        start = cuda.Event()
        end = cuda.Event()
        start.record()
        # Make sure all previously queued work has drained before the
        # timed launch, so it is not attributed to this kernel.
        start.synchronize()
        kernel(out_buf, block=(block_size, 1, 1), grid=(grid_size, 1))
        end.record()
        end.synchronize()
        return start.time_till(end)/1000
    
    
    for _ in range(10):  # warmup
        knl(out1, block=(block_size, 1, 1), grid=(grid_size, 1))
        aliased_knl(out2, block=(block_size, 1, 1), grid=(grid_size, 1))
    
    knl_time = 0
    aliased_time = 0
    rounds = 300
    for _ in range(rounds):
        knl_time += _timed_launch(knl, out1)
        aliased_time += _timed_launch(aliased_knl, out2)
    
    # Sanity check: both kernels must produce identical results.
    out1_host = np.empty(grid_size * block_size)
    out2_host = np.empty(grid_size * block_size)
    cuda.memcpy_dtoh(out1_host, out1)
    cuda.memcpy_dtoh(out2_host, out2)
    assert np.allclose(out1_host, out2_host)
    
    
    print('the_best_kernel time: {}s'.format(knl_time/rounds))
    print('the_best_aliased_kernel time: {}s'.format(aliased_time/rounds))
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or sign in to comment