Kernel with multiple inames tagged LocalIndexTag might emit wrong bounds
The following code snippet:
import loopy as lp
ref_knl = lp.make_kernel(
"{[i, iglob, j]: 0<=iglob<10 and 0<=i<3 and 0<=j<8}",
"""
subst(t) := 2
y[iglob, i, j] = subst(j)*x[iglob, i, j]
""", [lp.GlobalArg('x', shape=lp.auto, dtype=float), '...'])
knl = lp.split_iname(ref_knl, 'j', 7)
knl = lp.precompute(knl, 'subst', sweep_inames=['j_inner'],
precompute_outer_inames=frozenset(['j_outer', 'iglob']),
precompute_inames='iprcmpt',
default_tag=None,
temporary_scope=lp.AddressSpace.LOCAL)
knl = lp.split_iname(knl, 'iprcmpt', 3, inner_tag='l.0')
knl = lp.tag_inames(knl, "i:l.0, iglob:g.0")
print(lp.generate_code_v2(knl).device_code())
generates the following code:
__kernel void __attribute__ ((reqd_work_group_size(3, 1, 1))) loopy_kernel(__global double const *__restrict__ x, __global double *__restrict__ y)
{
__local int subst_0[7];
for (int j_outer = 0; j_outer <= 1; ++j_outer)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for subst_0 (subst rev-depends on insn) */;
if (7 + -7 * j_outer + -1 * lid(0) >= 0)
for (int iprcmpt_outer = 0; iprcmpt_outer <= (-1 + j_outer == 0 && lid(0) == 0 ? 0 : 2 + -1 * lid(0) + (2 * lid(0)) / 3); ++iprcmpt_outer)
subst_0[3 * iprcmpt_outer + lid(0)] = 2;
barrier(CLK_LOCAL_MEM_FENCE) /* for subst_0 (insn depends on subst) */;
for (int j_inner = 0; j_inner <= (-1 + j_outer == 0 && lid(0) == 0 ? 0 : 6); ++j_inner)
if (7 + -1 * j_inner + -7 * j_outer >= 0)
y[24 * gid(0) + 8 * lid(0) + 7 * j_outer + j_inner] = subst_0[j_inner] * x[24 * gid(0) + 8 * lid(0) + 7 * j_outer + j_inner];
}
}
Notice the unnecessary `&& lid(0) == 0` condition in the upper bound of the `j_inner` loop. Although this does not affect the result, an inefficient kernel is generated. The most likely cause of this is `iprcmpt_inner` being used as a parameter while finding the bounds of `j_inner`.
Edited by Kaushik Kulkarni