Missing barriers in local-mem computation loop
The code generated for this kernel should have barriers around the `k`
loop, but it currently does not.
import numpy as np
import loopy as lp
import pyopencl as cl
# Build a small loopy kernel, execute it once, and print the device code
# generated for it so the code surrounding the ``k`` loop can be inspected.

# OpenCL context and a command queue created on that context.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Iteration domain: i and j each cover [0, 256), k covers [0, 4).
domain = "{ [i,j,k]: 0<=i<256 and 0<=j<256 and 0<=k<4 }"

# Kernel body: for every i, tmp[j] is initialized, updated in place once per
# k iteration, and finally stored to a[i,j]. The "<>" prefix declares tmp as
# a temporary. The text is deliberately left unindented so the string handed
# to loopy is unchanged.
instructions = """
for i
<> tmp[j] = 15 - j+i
for k
tmp[j] = 2 * tmp[j] + k
end
a[i,j] = tmp[j]
end
"""

# seq_dependencies=True asks loopy to add dependencies that chain the
# instructions in the order they are written.
knl = lp.make_kernel(domain, instructions, seq_dependencies=True)

# Map i onto group axis 0 and j onto local axis 0.
knl = lp.tag_inames(knl, "i:g.0,j:l.0")

# Execute the kernel once; exactly one output array is expected.
evt, outputs = knl(queue)
(out,) = outputs

# Declare tmp as float32 and let loopy infer the remaining dtypes before
# generating code.
knl = lp.add_and_infer_dtypes(knl, {"tmp": np.dtype(np.float32)})

# Print the generated device code.
codegen_result = lp.generate_code_v2(knl)
print(codegen_result.device_code())