Poor code generation in nested iname splits unless slabs=(0,1) is explicitly specified.
The below code generates a kernel that multiplies a matrix times a set of element vectors. I split the k-axis (element axis) twice and specify g.0, ilp, and l.0 tags for k_outer, k_inner_outer, and k_inner_inner respectively. Despite all dimensions known at the time of code generation, the code generation is poor (many if-statements are inserted) unless I explicitly set slabs=(0,1). My expectation was that Loopy should automatically choose a smart slabs setting when the loop dimensions are known.
import numpy as np
import loopy as lp
import pyopencl as cl
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
import loopy.options
loopy.options.ALLOW_TERMINAL_COLORS = False
lp.set_caching_enabled(False)
def gen_diff_knl(n_elem, n_in, n_out, inline=4):
knl = lp.make_kernel(
"""{[k,i,j]:
0<=k<nelements and
0<=i<ndiscr_nodes_out and
0<=j<ndiscr_nodes_in}""",
"""
result1[i,k] = simul_reduce(sum, j, mat1[i, j] * vec[j, k])
""",
kernel_data = [
lp.GlobalArg("result1", np.float32, shape=(n_out, n_elem), order="C"),
lp.GlobalArg("mat1", np.float32, shape=(n_out, n_in), order="C"),
lp.GlobalArg("vec", np.float32, shape=(n_in, n_elem), order="C")
],
assumptions="nelements > 0 \
and ndiscr_nodes_out > 0 \
and ndiscr_nodes_in > 0",
name="diff"
)
knl = lp.fix_parameters(knl, nelements=n_elem, ndiscr_nodes_in=n_in, ndiscr_nodes_out=n_out)
elemBlockSize = 8
inlineBlocks = inline
private_memory_cols = inlineBlocks*elemBlockSize
## Test with only splitting iname, no problem here
#knl = lp.split_iname(knl, "k", elemBlockSize, outer_tag="g.0", inner_tag="l.0")
## Test with ilp and iname splitting
# Case 1: test with default -- worse code generation
knl = lp.split_iname(knl, "k", private_memory_cols, outer_tag="g.0")
# Case 2: Manually specified slab -- better code generation
#knl = lp.split_iname(knl, "k", private_memory_cols, outer_tag="g.0", slabs=(0,1))
knl = lp.split_iname(knl, "k_inner", elemBlockSize, outer_tag="ilp", inner_tag="l.0")
return knl
# Some arbitrary(ish) dimensions
n_elem = 12397
inline = 12
n_out = 120
n_in = 120
# Generate and print kernel
kern = gen_diff_knl(n_elem, n_in, n_out, inline=inline)
print(kern)
# Print device code
kern = kern.copy(target=lp.OpenCLTarget())
code = lp.generate_code_v2(kern).device_code()
print(code)
Poor code generation, default/slabs=(0,0)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(8, 1, 1))) diff(__global float *__restrict__ result1, __global float const *__restrict__ mat1, __global float const *__restrict__ vec)
{
float acc_j[12];
for (int i = 0; i <= 119; ++i)
{
acc_j[0] = 0.0f;
if (12388 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[1] = 0.0f;
if (12380 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[2] = 0.0f;
if (12372 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[3] = 0.0f;
if (12364 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[4] = 0.0f;
if (12356 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[5] = 0.0f;
if (12348 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[6] = 0.0f;
if (12340 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[7] = 0.0f;
if (12332 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[8] = 0.0f;
if (12324 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[9] = 0.0f;
if (12316 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[10] = 0.0f;
if (12308 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[11] = 0.0f;
for (int j = 0; j <= 119; ++j)
{
acc_j[0] = acc_j[0] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + lid(0)];
if (12388 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[1] = acc_j[1] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 + lid(0)];
if (12380 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[2] = acc_j[2] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 2 + lid(0)];
if (12372 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[3] = acc_j[3] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 3 + lid(0)];
if (12364 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[4] = acc_j[4] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 4 + lid(0)];
if (12356 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[5] = acc_j[5] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 5 + lid(0)];
if (12348 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[6] = acc_j[6] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 6 + lid(0)];
if (12340 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[7] = acc_j[7] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 7 + lid(0)];
if (12332 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[8] = acc_j[8] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 8 + lid(0)];
if (12324 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[9] = acc_j[9] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 9 + lid(0)];
if (12316 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[10] = acc_j[10] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 10 + lid(0)];
if (12308 + -1 * lid(0) + -96 * gid(0) >= 0)
acc_j[11] = acc_j[11] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 11 + lid(0)];
}
result1[12397 * i + 96 * gid(0) + lid(0)] = acc_j[0];
if (12388 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 + lid(0)] = acc_j[1];
if (12380 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 2 + lid(0)] = acc_j[2];
if (12372 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 3 + lid(0)] = acc_j[3];
if (12364 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 4 + lid(0)] = acc_j[4];
if (12356 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 5 + lid(0)] = acc_j[5];
if (12348 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 6 + lid(0)] = acc_j[6];
if (12340 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 7 + lid(0)] = acc_j[7];
if (12332 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 8 + lid(0)] = acc_j[8];
if (12324 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 9 + lid(0)] = acc_j[9];
if (12316 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 10 + lid(0)] = acc_j[10];
if (12308 + -1 * lid(0) + -96 * gid(0) >= 0)
result1[12397 * i + 96 * gid(0) + 8 * 11 + lid(0)] = acc_j[11];
}
}
Better code generation with slabs=(0,1)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(8, 1, 1))) diff(__global float *__restrict__ result1, __global float const *__restrict__ mat1, __global float const *__restrict__ vec)
{
float acc_j[12];
/* bulk slab for 'k_outer' */
if (128 + -1 * gid(0) >= 0)
for (int i = 0; i <= 119; ++i)
{
acc_j[0] = 0.0f;
acc_j[1] = 0.0f;
acc_j[2] = 0.0f;
acc_j[3] = 0.0f;
acc_j[4] = 0.0f;
acc_j[5] = 0.0f;
acc_j[6] = 0.0f;
acc_j[7] = 0.0f;
acc_j[8] = 0.0f;
acc_j[9] = 0.0f;
acc_j[10] = 0.0f;
acc_j[11] = 0.0f;
for (int j = 0; j <= 119; ++j)
{
acc_j[0] = acc_j[0] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + lid(0)];
acc_j[1] = acc_j[1] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 + lid(0)];
acc_j[2] = acc_j[2] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 2 + lid(0)];
acc_j[3] = acc_j[3] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 3 + lid(0)];
acc_j[4] = acc_j[4] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 4 + lid(0)];
acc_j[5] = acc_j[5] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 5 + lid(0)];
acc_j[6] = acc_j[6] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 6 + lid(0)];
acc_j[7] = acc_j[7] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 7 + lid(0)];
acc_j[8] = acc_j[8] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 8 + lid(0)];
acc_j[9] = acc_j[9] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 9 + lid(0)];
acc_j[10] = acc_j[10] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 10 + lid(0)];
acc_j[11] = acc_j[11] + mat1[120 * i + j] * vec[12397 * j + 96 * gid(0) + 8 * 11 + lid(0)];
}
result1[12397 * i + 96 * gid(0) + lid(0)] = acc_j[0];
result1[12397 * i + 96 * gid(0) + 8 + lid(0)] = acc_j[1];
result1[12397 * i + 96 * gid(0) + 8 * 2 + lid(0)] = acc_j[2];
result1[12397 * i + 96 * gid(0) + 8 * 3 + lid(0)] = acc_j[3];
result1[12397 * i + 96 * gid(0) + 8 * 4 + lid(0)] = acc_j[4];
result1[12397 * i + 96 * gid(0) + 8 * 5 + lid(0)] = acc_j[5];
result1[12397 * i + 96 * gid(0) + 8 * 6 + lid(0)] = acc_j[6];
result1[12397 * i + 96 * gid(0) + 8 * 7 + lid(0)] = acc_j[7];
result1[12397 * i + 96 * gid(0) + 8 * 8 + lid(0)] = acc_j[8];
result1[12397 * i + 96 * gid(0) + 8 * 9 + lid(0)] = acc_j[9];
result1[12397 * i + 96 * gid(0) + 8 * 10 + lid(0)] = acc_j[10];
result1[12397 * i + 96 * gid(0) + 8 * 11 + lid(0)] = acc_j[11];
}
/* final slab for 'k_outer' */
if (-129 + gid(0) == 0)
for (int i = 0; i <= 119; ++i)
{
acc_j[0] = 0.0f;
if (4 + -1 * lid(0) >= 0)
acc_j[1] = 0.0f;
for (int j = 0; j <= 119; ++j)
{
acc_j[0] = acc_j[0] + mat1[120 * i + j] * vec[12397 * j + 12384 + lid(0)];
if (4 + -1 * lid(0) >= 0)
acc_j[1] = acc_j[1] + mat1[120 * i + j] * vec[12397 * j + 12384 + 8 + lid(0)];
}
result1[12397 * i + 12384 + lid(0)] = acc_j[0];
if (4 + -1 * lid(0) >= 0)
result1[12397 * i + 12384 + 8 + lid(0)] = acc_j[1];
}
}