WIP: Smarter fusion
This adds two important (for me) control points to fuse_kernels.
- This allows initialized constant temporary variables to not be duplicated when fusing kernels (if they are defined identically in each kernel).
- This allows duplicate instructions (specified by ID) to not be duplicated when fusing multiple kernels that contain them.
The use case here is that I need both the temporary variables and the instructions in each subkernel for unit testing. However, when I combine all the subkernels into a single global kernel, they shouldn't be duplicated, for efficiency reasons.
As a simple example:
import loopy as lp
import pyopencl as cl
import numpy as np
from loopy.kernel.data import temp_var_scope
lp.set_caching_enabled(False)
ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
knl1 = lp.make_kernel('{[i]: 0 <=i < 10}',
"""
<> mid = data[i] < 5 {id=mid}
out1[i] = 10 * mid
""",
[lp.TemporaryVariable('data', initializer=np.arange(10), scope=temp_var_scope.GLOBAL, read_only=True,
shape=(10,)),
'...'])
knl2 = lp.make_kernel('{[i]: 0 <=i < 10}',
"""
<> mid = data[i] < 5 {id=mid}
out2[i] = 10 * mid
""",
[lp.TemporaryVariable('data', initializer=np.arange(10), scope=temp_var_scope.GLOBAL, read_only=True,
shape=(10,)),
'...'])
knl = lp.fuse_kernels([knl1, knl2], duplicate_intialized=False, collapse_insns_ids=['mid'])
print(lp.generate_code(knl)[0])
results in:
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__constant long const data[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel(__global int *restrict out1, __global int *restrict out2)
{
int mid;
for (int i = 0; i <= 9; ++i)
{
mid = data[i] < 5;
out2[i] = 10 * mid;
out1[i] = 10 * mid;
}
}
TODO:
- Note: this requires the merge request I pushed to pymbolic in order to work (I don't know how to make the loopy CI use that yet!).
- There are still a few bugs cropping up in the instruction checking that I need to fix.