Loop bounds rely (incorrectly?) on fallthrough behavior in pw_aff_to_expr
383c9c8d illustrates the issue.
The main thing that makes this suspicious is that the loop iterator is somehow relying on the fallthrough behavior to get its bounds, which means that the pwaff for the bounds doesn't say what its values should be in that case.
Here is what happens when I run one of the failing tests in !107:
$ PYTHONHASHSEED=1 PYOPENCL_TEST=port python test_linalg.py "test_funny_shape_matrix_mul(cl._csc)"
No protocol specified
/home/matt/src/loopy/loopy/transform/iname.py:991: LoopyWarning: Kernel 'matmul' required the deprecated 'boostable_into' instruction attribute in order to be schedulable!
LoopyWarning)
---------------------------------------------------------------------------
Kernel #0:
---------------------------------------------------------------------------
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(8, 16, 1))) matmul(__global float const *__restrict__ a, __global float const *__restrict__ b, __global float *__restrict__ c, int const l, int const m, int const n)
{
__local float a_acc_0[16 * 32];
float acc_k_outer_k_inner;
__local float b_acc_0[32 * 8];
if (-1 + -16 * gid(0) + -1 * lid(1) + n >= 0 && -1 + -8 * gid(1) + -1 * lid(0) + l >= 0)
acc_k_outer_k_inner = 0.0f;
for (int k_outer = 1; k_outer <= 0; ++k_outer)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (a_acc rev-depends on insn_k_outer_k_inner_update) */;
if (-1 + -32 * k_outer + -1 * lid(0) + m >= 0 && -1 + -1 * lid(1) + -16 * gid(0) + n >= 0)
for (int i2_outer = (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + m + -1 * lid(1) + -32 * k_outer >= 0 ? 0 : 1); i2_outer <= (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -32 + m + -32 * k_outer >= 0 ? 3 : (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && 31 + -1 * m + 32 * k_outer >= 0 && -1 + m + -1 * lid(1) + -32 * k_outer >= 0 ? -1 + -1 * lid(0) + -4 * k_outer + ((7 + m + 7 * lid(0)) / 8) : 0)); ++i2_outer)
if (31 + -8 * i2_outer + -1 * lid(0) >= 0)
a_acc_0[32 * lid(1) + 8 * i2_outer + lid(0)] = a[m * (16 * gid(0) + lid(1)) + 32 * k_outer + 8 * i2_outer + lid(0)];
if (-1 + -1 * lid(0) + -8 * gid(1) + l >= 0)
for (int i1_0_outer = (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + m + -1 * lid(1) + -32 * k_outer >= 0 && -1 + m + -1 * lid(0) + -32 * k_outer >= 0 ? 0 : 1); i1_0_outer <= (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -32 + m + -32 * k_outer >= 0 ? 1 : (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && 31 + -1 * m + 32 * k_outer >= 0 && -1 + m + -1 * lid(0) + -32 * k_outer >= 0 && -1 * m + -15 * lid(1) + 16 * ((15 + m + 15 * lid(1)) / 16) >= 0 && -16 + -15 * lid(1) + -32 * k_outer + 16 * ((15 + m + 15 * lid(1)) / 16) >= 0 && 15 + m + 15 * lid(1) + -16 * ((15 + m + 15 * lid(1)) / 16) >= 0 ? -1 + -1 * lid(1) + -2 * k_outer + ((15 + m + 15 * lid(1)) / 16) : 0)); ++i1_0_outer)
if (31 + -16 * i1_0_outer + -1 * lid(1) >= 0)
b_acc_0[8 * (16 * i1_0_outer + lid(1)) + lid(0)] = b[l * (32 * k_outer + 16 * i1_0_outer + lid(1)) + 8 * gid(1) + lid(0)];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (insn_k_outer_k_inner_update depends on a_acc) */;
if (-1 + -1 * lid(0) + -8 * gid(1) + l >= 0 && -1 + -1 * lid(1) + -16 * gid(0) + n >= 0)
for (int k_inner = (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -1 + m + -1 * lid(1) + -32 * k_outer >= 0 && -1 + m + -1 * lid(0) + -32 * k_outer >= 0 ? 0 : 1); k_inner <= (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && -32 + m + -32 * k_outer >= 0 ? 31 : (-1 + n + -1 * lid(1) + -16 * gid(0) >= 0 && -1 + l + -1 * lid(0) + -8 * gid(1) >= 0 && 31 + -1 * m + 32 * k_outer >= 0 && -1 * m + -7 * lid(0) + 8 * ((7 + m + 7 * lid(0)) / 8) >= 0 && -8 + -7 * lid(0) + -32 * k_outer + 8 * ((7 + m + 7 * lid(0)) / 8) >= 0 && 7 + m + 7 * lid(0) + -8 * ((7 + m + 7 * lid(0)) / 8) >= 0 && -1 * m + -15 * lid(1) + 16 * ((15 + m + 15 * lid(1)) / 16) >= 0 && -16 + -15 * lid(1) + -32 * k_outer + 16 * ((15 + m + 15 * lid(1)) / 16) >= 0 && 15 + m + 15 * lid(1) + -16 * ((15 + m + 15 * lid(1)) / 16) >= 0 ? -1 + m + -32 * k_outer : 0)); ++k_inner)
if (31 + -1 * k_inner >= 0)
acc_k_outer_k_inner = acc_k_outer_k_inner + a_acc_0[32 * lid(1) + k_inner] * b_acc_0[8 * k_inner + lid(0)];
}
if (-1 + -16 * gid(0) + -1 * lid(1) + n >= 0 && -1 + -8 * gid(1) + -1 * lid(0) + l >= 0)
c[l * (16 * gid(0) + lid(1)) + 8 * gid(1) + lid(0)] = acc_k_outer_k_inner;
}
---------------------------------------------------------------------------
Traceback (most recent call last):
File "test_linalg.py", line 641, in <module>
exec(sys.argv[1])
File "<string>", line 1, in <module>
File "test_linalg.py", line 264, in test_funny_shape_matrix_mul
parameters={"n": n, "m": m, "l": l})
File "/home/matt/src/loopy/loopy/auto_test.py", line 591, in auto_test_vs_ref
raise AutomaticTestFailure(error)
loopy.diagnostic.AutomaticTestFailure: results do not match -- (rel) l_2 err: 1, l_inf err: 1
``