split-iname + slab peeling at both ends produces bad code if the resulting loop has split-length or fewer iterations

If I have a loop with runtime-dependent bounds [k, n), split its iname (by 4, say), and peel at both ends (with slabs=(1, 1)), then whenever at runtime the total loop length satisfies n - k <= 4 and all iterations satisfy floor(i / 4) = const (i.e. the whole loop lies in a single "block" of the split), the expressions are executed twice, producing the wrong answer.

Consider:

import loopy

# Disable caching in loopy before any kernel is built.
loopy.set_caching_enabled(False)

# Slab settings passed to split_iname, one generated kernel per entry.
# The C driver below refers to the resulting kernels by name
# (slabs00_loopy_kernel, slabs10_loopy_kernel, ...).
SLAB_VARIANTS = [(0, 0), (1, 0), (0, 1), (1, 1)]

# Hand-written C driver appended after the generated kernels.  It reads
# k and n from the command line (clamped to [0, N_MAX]), runs every kernel
# on its own zero-initialised array of N_MAX entries and prints the array.
C_DRIVER = r"""
#define N_MAX 8
static int32_t inline clamp(int32_t a)
{
    if (a < 0) return 0;
    else if (a > N_MAX) return N_MAX;
    else return a;
}

static void run(void (*kernel)(int32_t, int32_t, int32_t *),
                int32_t k, int32_t n,
                const char * restrict name)
{
  int32_t a[N_MAX] = {0};
  int i;
  kernel(k, n, a);
  printf("%s: ", name);
  for (i = 0; i < N_MAX; i++)
    printf("%d", a[i]);
  printf("\n");
}

int main(int argc, char **argv)
{
  const int32_t k = argc >= 2 ? clamp(atoi(argv[1])) : 0;
  const int32_t n = argc == 3 ? clamp(atoi(argv[2])) : N_MAX;
  run(&unpeeled_loopy_kernel, k, n, "unpeeled");
  run(&slabs00_loopy_kernel, k, n, " slabs00");
  run(&slabs10_loopy_kernel, k, n, " slabs10");
  run(&slabs01_loopy_kernel, k, n, " slabs01");
  run(&slabs11_loopy_kernel, k, n, " slabs11");
  return 0;
}
"""


def _write_static(out, kernel):
    """Generate code for *kernel* and write it to *out* prefixed by ``static``."""
    (ast,) = loopy.generate_code_v2(kernel).device_programs
    out.write(f"static {ast.ast}\n")


# Reference kernel: a[i] is incremented once for every i with k <= i < n.
knl = loopy.make_kernel(
    "{ [i] : k <= i < n}",
    """
    a[i] = a[i] + 1
    """,
    [
        loopy.ValueArg("k", dtype="int32"),
        loopy.ValueArg("n", dtype="int32"),
        loopy.GlobalArg("a", shape=(None, ), dtype="int32"),
    ],
    target=loopy.CTarget(),
    lang_version=(2018, 2),
    name="unpeeled_loopy_kernel",
)
knl = loopy.assume(knl, "k >= 0 and n >= k")

with open("foo.c", "w") as f:
    # Headers required by the int32_t kernel signatures and by the driver.
    f.writelines((
        "#include <stdint.h>\n",
        "#include <stdio.h>\n",
        "#include <stdlib.h>\n",
    ))

    # The untransformed kernel goes first and serves as the baseline.
    _write_static(f, knl)

    # Then one split-by-4 kernel per slab setting, each under its own name.
    for slab in SLAB_VARIANTS:
        name = "".join(str(flag) for flag in slab)
        peeled = loopy.split_iname(knl, "i", 4, slabs=slab)
        peeled = loopy.prioritize_loops(peeled, "i_outer,i_inner")
        peeled = peeled.copy(name=f"slabs{name}_loopy_kernel")
        _write_static(f, peeled)

    f.write(C_DRIVER)

Which produces a C file, foo.c.

If I compile and run with:

$ cc -std=c99 -o foo -O0 -Wall foo.c
$ ./foo 0 4
unpeeled: 11110000
 slabs00: 11110000
 slabs10: 11110000
 slabs01: 11110000
 slabs11: 22220000
$ ./foo 0 5
unpeeled: 11111000
 slabs00: 11111000
 slabs10: 11111000
 slabs01: 11111000
 slabs11: 11111000
$ ./foo 3 7
unpeeled: 00011110
 slabs00: 00011110
 slabs10: 00011110
 slabs01: 00011110
 slabs11: 00011110
$ ./foo 3 8
unpeeled: 00011111
 slabs00: 00011111
 slabs10: 00011111
 slabs01: 00011111
 slabs11: 00011111
$ ./foo 4 8
unpeeled: 00001111
 slabs00: 00001111
 slabs10: 00001111
 slabs01: 00001111
 slabs11: 00002222

It looks like the generated code needs an extra condition ensuring that the peeled first and last iterations of i_outer are not actually the same iteration; otherwise both peeled slabs execute the same loop body.