split-iname + slab peeling at both ends produces bad code if the resulting loop has split-length or fewer iterations
If I have a loop with runtime-dependent bounds [k, n), split its iname (by 4, say), and peel at both ends (with slabs=(1, 1)), then whenever the total loop length at runtime satisfies n - k <= 4
and all iterations fall into the same block of the split (i.e. floor(i / 4) is the same for every i),
the loop body is executed twice, producing the wrong answer.
Consider:
# Reproducer script: writes foo.c containing the unsplit kernel, one split
# variant of it per slab configuration, and a small C driver that runs each
# kernel on a zeroed array and prints the result.
import loopy

loopy.set_caching_enabled(False)

# Both loop bounds (k and n) are runtime value arguments of the kernel.
kernel_args = [
    loopy.ValueArg("k", dtype="int32"),
    loopy.ValueArg("n", dtype="int32"),
    loopy.GlobalArg("a", shape=(None, ), dtype="int32"),
]

base_knl = loopy.make_kernel(
    "{ [i] : k <= i < n}",
    "\na[i] = a[i] + 1\n",
    kernel_args,
    target=loopy.CTarget(),
    lang_version=(2018, 2),
    name="unpeeled_loopy_kernel",
)
base_knl = loopy.assume(base_knl, "k >= 0 and n >= k")

# Slab settings passed to split_iname; each one yields a kernel named
# slabs<XY>_loopy_kernel, which the C driver below refers to by name.
slab_configs = [(0, 0), (1, 0), (0, 1), (1, 1)]

# C driver appended after the generated kernels (written verbatim).
c_driver = r"""
#define N_MAX 8
static int32_t inline clamp(int32_t a)
{
if (a < 0) return 0;
else if (a > N_MAX) return N_MAX;
else return a;
}
static void run(void (*kernel)(int32_t, int32_t, int32_t *),
int32_t k, int32_t n,
const char * restrict name)
{
int32_t a[N_MAX] = {0};
int i;
kernel(k, n, a);
printf("%s: ", name);
for (i = 0; i < N_MAX; i++)
printf("%d", a[i]);
printf("\n");
}
int main(int argc, char **argv)
{
const int32_t k = argc >= 2 ? clamp(atoi(argv[1])) : 0;
const int32_t n = argc == 3 ? clamp(atoi(argv[2])) : N_MAX;
run(&unpeeled_loopy_kernel, k, n, "unpeeled");
run(&slabs00_loopy_kernel, k, n, " slabs00");
run(&slabs10_loopy_kernel, k, n, " slabs10");
run(&slabs01_loopy_kernel, k, n, " slabs01");
run(&slabs11_loopy_kernel, k, n, " slabs11");
return 0;
}
"""

with open("foo.c", "w") as out:
    for include_line in (
        "#include <stdint.h>\n",
        "#include <stdio.h>\n",
        "#include <stdlib.h>\n",
    ):
        out.write(include_line)

    # Reference kernel without any splitting.
    (program,) = loopy.generate_code_v2(base_knl).device_programs
    out.write(f"static {program.ast}\n")

    # One split-by-4 kernel per slab configuration.
    for slabs in slab_configs:
        split_knl = loopy.split_iname(base_knl, "i", 4, slabs=slabs)
        split_knl = loopy.prioritize_loops(split_knl, "i_outer,i_inner")
        suffix = "".join(str(count) for count in slabs)
        split_knl = split_knl.copy(name=f"slabs{suffix}_loopy_kernel")
        (program,) = loopy.generate_code_v2(split_knl).device_programs
        out.write(f"static {program.ast}\n")

    out.write(c_driver)
This produces a C file, foo.c.
If I compile and run it with:
$ cc -std=c99 -o foo -O0 -Wall foo.c
$ ./foo 0 4
unpeeled: 11110000
slabs00: 11110000
slabs10: 11110000
slabs01: 11110000
slabs11: 22220000
$ ./foo 0 5
unpeeled: 11111000
slabs00: 11111000
slabs10: 11111000
slabs01: 11111000
slabs11: 11111000
$ ./foo 3 7
unpeeled: 00011110
slabs00: 00011110
slabs10: 00011110
slabs01: 00011110
slabs11: 00011110
$ ./foo 3 8
unpeeled: 00011111
slabs00: 00011111
slabs10: 00011111
slabs01: 00011111
slabs11: 00011111
$ ./foo 4 8
unpeeled: 00001111
slabs00: 00001111
slabs10: 00001111
slabs01: 00001111
slabs11: 00002222
It looks like the generated code needs a guard ensuring that the peeled first and last iterations of i_outer
are not actually the same iteration.