Skip to content

Missing barriers in local-mem computation loop

The code generated for the kernel below should have barriers around the `k` loop, but currently none are emitted.

import numpy as np
import loopy as lp
import pyopencl as cl
# Minimal reproducer: build a small loopy kernel, run it once, then print the
# generated device code so the (missing) barriers around the k loop can be
# inspected.

# OpenCL context and command queue used to execute the kernel below.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Kernel over the domain 0<=i<256, 0<=j<256, 0<=k<4. For each i, tmp[j] is
# initialised, updated once per k iteration, and then stored into a[i,j].
# "<>" is loopy's syntax for declaring tmp as a temporary variable.
# seq_dependencies=True asks loopy to make each instruction depend on the one
# before it, in program order.
knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i<256 and 0<=j<256 and 0<=k<4 }",
        """
        for i
            <> tmp[j] = 15 - j+i
            for k
                tmp[j] = 2 * tmp[j] + k
            end
            a[i,j] = tmp[j]
        end
        """, seq_dependencies=True)

# Map i to group axis 0 and j to local (work-item) axis 0.
# NOTE(review): tmp is indexed by the local iname j, so it presumably ends up
# in local memory (per the issue title) -- confirm in the generated code.
knl = lp.tag_inames(knl, "i:g.0,j:l.0")

# Execute the kernel; neither evt nor out is used afterwards.
evt, (out,) = knl(queue)

# Set tmp's dtype to float32 and infer the remaining dtypes. This happens
# only after the run above, so that run used the kernel without this
# explicit dtype.
knl = lp.add_and_infer_dtypes(knl, {"tmp": np.dtype(np.float32)})

# Print the generated device code for inspection.
print(lp.generate_code_v2(knl).device_code())