Newer
Older
i1 = lp.CInstruction("i",
"doSomethingToGetPhi();",
from pymbolic.primitives import Subscript, Variable
i2 = lp.Assignment("a",
lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
within_inames=frozenset(),
within_inames_is_final=True)
k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
[i1, i2],
[
lp.GlobalArg("a", dtype=np.float32, shape=()),
lp.ValueArg("n", dtype=np.int32),
lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
],
target=lp.CTarget(),
)
k = lp.preprocess_kernel(k)
assert 'i' not in k.insn_inames("insn_0_j_update")
print(k.stringify(with_dependencies=True))
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
def test_kernel_splitting(ctx_factory):
    """Check that a split, scheduled kernel generates two device programs.

    The kernel writes ``a[i + 1]`` into ``c`` and then ``c`` into ``out``.
    After splitting ``i`` onto the ``g.0``/``l.0`` axes, the generated code
    is compared against the unsplit kernel for ``n=5``.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    cl_ctx = ctx_factory()

    arg_dtypes = {
        "a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}
    reference = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i]: 0<=i<n }",
            """
            c[i] = a[i + 1]
            out[i] = c[i]
            """),
        arg_dtypes)

    split = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess and schedule explicitly before handing off to codegen.
    scheduled = get_one_scheduled_kernel(preprocess_kernel(split))
    print(scheduled)

    codegen_result = lp.generate_code_v2(scheduled)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, cl_ctx, scheduled, parameters=dict(n=5))
def test_kernel_splitting_with_loop(ctx_factory):
    """Check kernel splitting when the instructions sit inside a ``k`` loop.

    The kernel copies ``a[k, i + 1]`` into ``c`` and ``c`` into ``out`` for
    ``0<=k<3``.  After splitting ``i`` onto ``g.0``/``l.0`` and scheduling,
    code generation must yield two device programs, and the result is
    compared against the unsplit kernel for ``n=5``.
    """
    # Needed by auto_test_vs_ref below.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)

    knl = lp.add_and_infer_dtypes(
        knl,
        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    # Keep the untransformed kernel as the reference for the comparison.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    """Kernel splitting with a private scalar and a private array temporary.

    Currently marked as an expected failure right after the context is
    created; the remainder documents the intended check: two device
    programs and agreement with the unsplit kernel for ``n=5``.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    cl_ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    arg_dtypes = {
        "a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}
    reference = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            <> t_private_scalar = a[k,i+1]
            <> t_private_array[i % 2] = a[k,i+1]
            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
            """),
        arg_dtypes)

    # Force both temporaries into private scope.
    for temp_name in ("t_private_scalar", "t_private_array"):
        reference = lp.set_temporary_scope(reference, temp_name, "private")

    split = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess and schedule explicitly before handing off to codegen.
    scheduled = get_one_scheduled_kernel(preprocess_kernel(split))
    print(scheduled)

    codegen_result = lp.generate_code_v2(scheduled)
    assert len(codegen_result.device_programs) == 2

    lp.auto_test_vs_ref(reference, cl_ctx, scheduled, parameters=dict(n=5))
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
    """Kernel splitting with a temporary forced into ``local`` scope.

    ``t_local`` is indexed by ``i % 8`` and ``k``; ``i`` is split by 8 onto
    ``g.0``/``l.0``.  Code generation must yield two device programs, and
    the result is compared against the unsplit kernel for ``n=8``.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    cl_ctx = ctx_factory()

    arg_dtypes = {
        "a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}
    reference = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            <> t_local[i % 8,k] = i % 8
            c[k,i] = a[k,i+1]
            out[k,i] = c[k,i] + t_local[i % 8,k]
            """),
        arg_dtypes)
    reference = lp.set_temporary_scope(reference, "t_local", "local")

    split = lp.split_iname(
        reference, "i", 8, outer_tag="g.0", inner_tag="l.0")

    # Preprocess and schedule explicitly before handing off to codegen.
    scheduled = get_one_scheduled_kernel(preprocess_kernel(split))
    print(scheduled)

    codegen_result = lp.generate_code_v2(scheduled)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, cl_ctx, scheduled, parameters=dict(n=8))
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
def test_global_temporary(ctx_factory):
    """Check code generation with a temporary placed in ``global`` scope.

    ``c`` is declared as a temporary and then given global scope.  After
    splitting ``i`` onto ``g.0``/``l.0``, code generation must yield two
    device programs and the result must match the unsplit kernel for
    ``n=5``.
    """
    cl_ctx = ctx_factory()

    arg_dtypes = {
        "a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}
    reference = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i]: 0<=i<n}",
            """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """),
        arg_dtypes)
    reference = lp.set_temporary_scope(reference, "c", "global")

    split = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")

    codegen_result = lp.generate_code_v2(split)
    assert len(codegen_result.device_programs) == 2

    # Debugging aid: print codegen_result.device_code() / .host_code() here.

    lp.auto_test_vs_ref(reference, cl_ctx, split, parameters=dict(n=5))
def test_assign_to_linear_subscript(ctx_factory):
    """Compare a two-index assignment with its linear-subscript form.

    One kernel writes ``a[i,i] = 1``; the other writes through the linear
    subscript ``a[[i*n + i]]`` on an ``n,n``-shaped argument.  Both are run
    on zeroed 10x10 float32 arrays and must produce equal results.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    domain = "{ [i]: 0<=i<n}"
    multi_index_knl = lp.make_kernel(domain, "a[i,i] = 1")
    linear_index_knl = lp.make_kernel(
        domain,
        "a[[i*n + i]] = 1",
        [lp.GlobalArg("a", shape="n,n"), "..."])

    multi_index_result = cl.array.zeros(queue, (10, 10), np.float32)
    multi_index_knl(queue, a=multi_index_result)

    linear_index_result = cl.array.zeros(queue, (10, 10), np.float32)
    linear_index_knl(queue, a=linear_index_result)

    assert np.array_equal(
        multi_index_result.get(), linear_index_result.get())
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a finite-difference kernel with a flux kernel and transform it.

    The fused kernel is executed three times: as fused, after ``f`` has been
    turned into a substitution rule, and after an affine iname remapping,
    an iname split and a precompute of ``f_subst``.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_args = [
        lp.GlobalArg("f", shape="n+2"),
        lp.GlobalArg("u", shape="n+2"),
        ]
    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        flux_args)

    fused_knl = lp.fuse_kernels(
        [fin_diff_knl, flux_knl],
        data_flow=[("f", 1, 0)])
    fused_knl = lp.set_options(fused_knl, write_cl=True)
    _evt, _out = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")
    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    _evt, _out = fused_knl(queue, u=u, h=np.float32(1e-1))

    shifted_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
    gpu_knl = lp.split_iname(
        shifted_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
        gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)

    # The final run uses the actual grid spacing rather than 1e-1.
    _evt, _out = precomp_knl(queue, u=u, h=h)
# {{{ call without returned values
def test_call_with_no_returned_value(ctx_factory):
    """Run a kernel whose only instruction is a call with no assignees.

    The kernel has the empty domain ``{:}`` and a single
    ``lp.CallInstruction`` invoking ``f()`` with an empty assignee tuple.
    The mangler and preamble generator for ``f`` come from the test helper
    module ``library_for_test``.
    """
    import pymbolic.primitives as p

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{:}",
        [lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
        )

    from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen
    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
    knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen])

    evt, _ = knl(queue)
Dominic Kempf
committed
def test_unschedulable_kernel_detection():
    """Check detection of kernels whose iname nesting cannot be scheduled.

    Two kernels are built whose instructions request conflicting iname
    sets.  For each, ``lp.has_schedulable_iname_nesting`` must report
    failure and ``lp.get_iname_duplication_options`` must yield the
    expected number of options.  For the first kernel, applying any single
    option via ``lp.duplicate_inames`` must make the nesting schedulable.
    """
    knl = lp.make_kernel(
        ["{[i,j]:0<=i,j<n}"],
        """
        mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
        mat2[j] = mat2[j] + 1 {inames=j, id=i2}
        mat3[i] = mat3[i] + 1 {inames=i, id=i3}
        """)

    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert not lp.has_schedulable_iname_nesting(knl)

    # Materialize once so the count and the loop see the same options.
    duplication_options = list(lp.get_iname_duplication_options(knl))
    assert len(duplication_options) == 4

    for inames, insns in duplication_options:
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert lp.has_schedulable_iname_nesting(fixed_knl)

    knl = lp.make_kernel(
        ["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
        """
        mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
        mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
        mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
        mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
        """)

    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10
Andreas Klöckner
committed
def test_regression_no_ret_call_removal(ctx_factory):
    """Regression test: count instructions after preprocessing ``f(sum(...))``.

    The kernel consists of a single call to ``f`` with a reduction over
    ``x[i]`` as its argument; after preprocessing it must contain exactly
    three instructions.
    """
    # https://github.com/inducer/loopy/issues/32
    kernel = lp.make_kernel("{[i] : 0<=i<n}", "f(sum(i, x[i]))")
    kernel = lp.add_and_infer_dtypes(kernel, {"x": np.float32})

    preprocessed = lp.preprocess_kernel(kernel)
    assert len(preprocessed.instructions) == 3
def test_regression_persistent_hash():
    """Kernels differing only in a subscript constant must hash differently.

    Two kernels are built that differ only in the index used on ``d``
    (``d[2]`` vs. ``d[0]``).  Their ``LoopyKeyBuilder`` keys must differ,
    both for the first instruction and for the kernel as a whole.
    """
    from loopy.tools import LoopyKeyBuilder

    domain = "{[i] : 0<=i<n}"
    knl_index_2 = lp.make_kernel(domain, "cse_exprvar = d[2]*d[2]")
    knl_index_0 = lp.make_kernel(domain, "cse_exprvar = d[0]*d[0]")

    key_builder = LoopyKeyBuilder()

    first_insn_keys = (
        key_builder(knl_index_2.instructions[0]),
        key_builder(knl_index_0.instructions[0]))
    assert first_insn_keys[0] != first_insn_keys[1]

    assert key_builder(knl_index_2) != key_builder(knl_index_0)
def test_sequential_dependencies(ctx_factory):
    """Build a kernel with ``seq_dependencies=True`` and run it.

    The kernel is printed with its dependencies and then passed to
    ``lp.auto_test_vs_ref`` as both reference and test kernel, with
    ``n=5``.
    """
    cl_ctx = ctx_factory()

    instructions = """
        for i
            <> aa = 5jf
            <> bb = 5j
            a[i] = imag(aa)
            b[i] = imag(bb)
            c[i] = 5f
        end
        """
    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}", instructions, seq_dependencies=True)

    print(kernel.stringify(with_dependencies=True))

    lp.auto_test_vs_ref(kernel, cl_ctx, kernel, parameters=dict(n=5))
def test_special_instructions(ctx_factory):
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for itrip,i
... nop {dep=wr_z:wr_v,id=yoink}
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
knl = lp.fix_parameters(knl, n=15)
knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})
lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))
def test_index_cse(ctx_factory):
    """Generate and print device code for a nested two-iname reduction.

    The reduction over ``k`` and ``l`` sits inside loops over ``i`` and
    ``j``.  ``l`` is tagged ``unr``, the loop priority is set to
    ``i,j,k,l`` and ``n`` is fixed to 5 before code generation.  Nothing is
    asserted about the generated code.
    """
    kernel = lp.make_kernel(
        ["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
        """
        for i
            for j
                c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
            end
        end
        """)

    kernel = lp.tag_inames(kernel, "l:unr")
    kernel = lp.set_loop_priority(kernel, "i,j,k,l")
    kernel = lp.add_and_infer_dtypes(
        kernel, {"a": np.float32, "b": np.float32})
    kernel = lp.fix_parameters(kernel, n=5)

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_ilp_and_conditionals(ctx_factory):
    """Check an ``ilp``-tagged inner loop around a conditional assignment.

    The kernel stores ``T[k] < 0.5`` in the scalar temporary ``Tcond`` and
    assigns ``cp[k]`` only when it holds.  With ``n`` fixed to 200, ``k``
    is split by 2 with the inner iname tagged ``ilp``, and the result is
    compared against the unsplit kernel.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[k]: 0<=k<n}",
        """
        for k
            <> Tcond = T[k] < 0.5
            if Tcond
                cp[k] = 2 * T[k] + Tcond
            end
        end
        """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "k", 2, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl)
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
def test_unr_and_conditionals(ctx_factory):
    """Check an ``unr``-tagged inner loop around a conditional assignment.

    The kernel stores ``T[k] < 0.5`` in the temporary ``Tcond[k]`` and
    assigns ``cp[k]`` only when it holds.  With ``n`` fixed to 200, ``k``
    is split by 2 with the inner iname tagged ``unr``, and the result is
    compared against the unsplit kernel.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[k]: 0<=k<n}",
        """
        for k
            <> Tcond[k] = T[k] < 0.5
            if Tcond[k]
                cp[k] = 2 * T[k] + Tcond[k]
            end
        end
        """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "k", 2, inner_tag="unr")

    lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_constant_array_args(ctx_factory):
    """Build the conditional kernel with ``T`` declared via ``lp.ConstantArg``.

    ``T`` is given as a 200-element float32 ``ConstantArg``; the remaining
    arguments are left to be guessed (``"..."``).  ``n`` is then fixed
    to 200.
    """
    # NOTE(review): the kernel is only constructed here; nothing executes it
    # or compares results, so ``ctx`` is unused.  Confirm whether a
    # ``lp.auto_test_vs_ref`` call is missing at the end.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[k]: 0<=k<n}",
        """
        for k
            <> Tcond[k] = T[k] < 0.5
            if Tcond[k]
                cp[k] = 2 * T[k] + Tcond[k]
            end
        end
        """,
        [lp.ConstantArg("T", shape=(200,), dtype=np.float32),
         "..."])

    knl = lp.fix_parameters(knl, n=200)
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # NOTE: this executes the first command-line argument as Python
        # code.  Presumably meant for invoking a single test by hand; never
        # pass untrusted input.
        exec(sys.argv[1])
    else:
        # pytest.main is the supported entry point; the legacy
        # py.test.cmdline alias depends on the separate "py" package.
        from pytest import main
        main([__file__])
# vim: foldmethod=marker