Multi-level prefetching fails with KeyError
When I attempt to prefetch an already-prefetched array (imagine device memory -> local/shared memory -> private memory on a GPU), the second prefetch (shared -> private) fails with a KeyError on the first prefetched array (the shared-memory array).
The example code below multiplies two 32x32 arrays. The idea is to prefetch the matrix into shared memory and then load a single row of the shared-memory array into private memory. This example does not actually map the temporaries to shared or private memory, but the error is the same.
MWE:
"""MWE: multi-level prefetching in loopy fails with a KeyError.

Builds a 32x32 matrix-matrix product kernel, prefetches `mat` into a
temporary `s_mat`, then attempts to prefetch a row of `s_mat` into
`p_mat`. The second add_prefetch call raises KeyError('s_mat').
"""
import numpy as np

import loopy as lp  # was missing in the original snippet; every lp.* call below needs it

n = 32
m = 32
n_vecs = 32

# result[i, k] = sum_j mat[i, j] * vec[j, k]
knl = lp.make_kernel(
    """{[k,i,j]:
    0<=k<n_vecs and
    0<=i<m and
    0<=j<n}""",
    """
    result[i,k] = sum(j, mat[i, j] * vec[j, k])
    """,
    kernel_data=[
        lp.GlobalArg("result", np.float32, shape=(m, n_vecs), order="C"),
        lp.GlobalArg("mat", np.float32, shape=(m, n), order="C"),
        lp.GlobalArg("vec", np.float32, shape=(n, n_vecs), order="C")
    ],
    assumptions="n > 0 \
    and m > 0 \
    and n_vecs > 0",
    name="mxm")

knl = lp.fix_parameters(knl, m=m, n=n, n_vecs=n_vecs)
knl = lp.prioritize_loops(knl, "i,k,j")

# Version 1 - Prefetch to "shared" memory: Works
#knl = lp.add_prefetch(knl, "mat", "i, j", temporary_name="s_mat", default_tag="for")
# Version 2 - Prefetch to "private" memory: Works
#knl = lp.add_prefetch(knl, "mat", "j", temporary_name="p_mat", default_tag="for")
# Version 3 - Multi-level prefetching: Broken
# The first call succeeds; the second raises KeyError('s_mat') because
# add_prefetch looks the variable up in kernel.arg_dict, which only
# contains kernel arguments, not temporaries created by earlier prefetches.
knl = lp.add_prefetch(knl, "mat", "i, j", temporary_name="s_mat", default_tag="for")
knl = lp.add_prefetch(knl, "s_mat", "j", temporary_name="p_mat", default_tag="for")

code = lp.generate_code_v2(knl).device_code()
print(code)
Result
Traceback (most recent call last):
File "multi-level_prefetch_test.py", line 37, in <module>
knl = lp.add_prefetch(knl, "s_mat", "j", temporary_name="p_mat", default_tag="for")
File "/home/njchris2/miniconda3/envs/dgfem/lib/python3.7/site-packages/loopy/transform/data.py", line 289, in add_prefetch
arg = kernel.arg_dict[var_name]
KeyError: 's_mat'
Expected result
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
/* Expected generated OpenCL for the two-level prefetch:
   mat (global) -> s_mat (first prefetch) -> p_mat (row of s_mat). */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) mxm(__global float *__restrict__ result, __global float const *__restrict__ mat, __global float const *__restrict__ vec)
{
float acc_j;
float s_mat[32 * 32];
float p_mat[32];
/* First prefetch: copy all of mat into s_mat. */
for (int mat_dim_1 = 0; mat_dim_1 <= 31; ++mat_dim_1)
for (int mat_dim_0 = 0; mat_dim_0 <= 31; ++mat_dim_0)
s_mat[32 * mat_dim_0 + mat_dim_1] = mat[32 * mat_dim_0 + mat_dim_1];
for (int i = 0; i <= 31; ++i)
/* Second prefetch: copy row i of s_mat into p_mat. */
for (int mat_dim_1 = 0; mat_dim_1 <= 31; ++mat_dim_1)
p_mat[mat_dim_1] = s_mat[32 * i + mat_dim_1];
for (int k = 0; k <= 31; ++k)
{
acc_j = 0.0f;
/* j-reduction reads the prefetched row p_mat. */
for (int j = 0; j <= 31; ++j)
acc_j = acc_j + p_mat[j] * vec[k + 32 * j];
result[k + 32 * i] = acc_j;
}
}
Edited by Nicholas Christensen