Newer
Older
i1 = lp.CInstruction("i",
"doSomethingToGetPhi();",
from pymbolic.primitives import Subscript, Variable
i2 = lp.Assignment("a",
lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
within_inames=frozenset(),
within_inames_is_final=True)
k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
[i1, i2],
[
lp.GlobalArg("a", dtype=np.float32, shape=()),
lp.ValueArg("n", dtype=np.int32),
lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
],
target=lp.CTarget(),
)
k = lp.preprocess_kernel(k)
assert 'i' not in k.insn_inames("insn_0_j_update")
print(k.stringify(with_dependencies=True))
def test_kernel_splitting(ctx_factory):
    """Generate code for a kernel containing one ``gbarrier`` and verify it.

    The test asserts that code generation yields exactly two device
    programs and then compares the transformed kernel against the
    untransformed one via ``lp.auto_test_vs_ref`` with ``n=5``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    ctx = ctx_factory()

    reference = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
        for i
            c[i] = a[i + 1]
            ... gbarrier
            out[i] = c[i]
        end
        """,
        seq_dependencies=True)

    dtypes = {"a": np.float32, "c": np.float32, "out": np.float32,
              "n": np.int32}
    reference = lp.add_and_infer_dtypes(reference, dtypes)

    # Split "i" into chunks of 128, tagging the outer/inner parts
    # "g.0"/"l.0", then preprocess and schedule the result.
    transformed = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")
    transformed = get_one_scheduled_kernel(preprocess_kernel(transformed))

    # Show the scheduled kernel for debugging.
    print(transformed)

    codegen_result = lp.generate_code_v2(transformed)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, ctx, transformed, parameters={"n": 5})
def test_kernel_splitting_with_loop(ctx_factory):
    """Generate code for a two-``gbarrier`` kernel nested in loops ``i, k``.

    The test asserts that code generation yields exactly two device
    programs and compares the transformed kernel against the untransformed
    reference via ``lp.auto_test_vs_ref`` with ``n=5``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    # The final comparison needs a context; without this line ``ctx`` is
    # an undefined name.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            c[k,i] = a[k, i + 1]
            ... gbarrier
            out[k,i] = c[k,i]
        end
        """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    # Keep the untransformed kernel as the reference for the comparison
    # below (same pattern as the neighbouring kernel-splitting tests).
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    """Kernel-splitting test using private-scope temporaries (expected fail).

    The kernel writes a scalar and an array temporary before a ``gbarrier``
    and reads both after it; both temporaries are set to "private" scope.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    # pytest.xfail() raises right here, so the remainder of this test is
    # currently not executed.
    pytest.xfail("spilling doesn't yet use local axes")

    reference = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            <> t_private_scalar = a[k,i+1]
            <> t_private_array[i % 2] = a[k,i+1]
            c[k,i] = a[k,i+1]
            ... gbarrier
            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
        end
        """,
        seq_dependencies=True)

    dtypes = {"a": np.float32, "c": np.float32, "out": np.float32,
              "n": np.int32}
    reference = lp.add_and_infer_dtypes(reference, dtypes)
    for temporary in ("t_private_scalar", "t_private_array"):
        reference = lp.set_temporary_scope(reference, temporary, "private")

    transformed = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess, then pick one schedule.
    from loopy.preprocess import preprocess_kernel
    transformed = preprocess_kernel(transformed)

    from loopy.schedule import get_one_scheduled_kernel
    transformed = get_one_scheduled_kernel(transformed)

    # Show the scheduled kernel for debugging.
    print(transformed)

    codegen_result = lp.generate_code_v2(transformed)
    assert len(codegen_result.device_programs) == 2

    lp.auto_test_vs_ref(reference, ctx, transformed, parameters={"n": 5})
def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
    """Kernel-splitting test using a local-scope temporary ``t_local``.

    ``t_local`` is written before a ``gbarrier`` and read after it. The
    test asserts that two device programs are generated and compares the
    transformed kernel against the untransformed one with ``n=8``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    ctx = ctx_factory()

    reference = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            <> t_local[i % 8,k] = i % 8
            c[k,i] = a[k,i+1]
            ... gbarrier
            out[k,i] = c[k,i] + t_local[i % 8,k]
        end
        """,
        seq_dependencies=True)

    dtypes = {"a": np.float32, "c": np.float32, "out": np.float32,
              "n": np.int32}
    reference = lp.add_and_infer_dtypes(reference, dtypes)
    reference = lp.set_temporary_scope(reference, "t_local", "local")

    # Split "i" into chunks of 8, then preprocess and schedule the result.
    transformed = lp.split_iname(
        reference, "i", 8, outer_tag="g.0", inner_tag="l.0")
    transformed = get_one_scheduled_kernel(preprocess_kernel(transformed))

    # Show the scheduled kernel for debugging.
    print(transformed)

    codegen_result = lp.generate_code_v2(transformed)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, ctx, transformed, parameters={"n": 8})
def test_global_temporary(ctx_factory):
    """Kernel-splitting test in which the temporary ``c`` has global scope.

    The test asserts that two device programs are generated and compares
    the transformed kernel against the untransformed one with ``n=5``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    reference = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        for i
            <> c[i] = a[i + 1]
            ... gbarrier
            out[i] = c[i]
        end
        """,
        seq_dependencies=True)

    dtypes = {"a": np.float32, "c": np.float32, "out": np.float32,
              "n": np.int32}
    reference = lp.add_and_infer_dtypes(reference, dtypes)
    reference = lp.set_temporary_scope(reference, "c", "global")

    transformed = lp.split_iname(
        reference, "i", 128, outer_tag="g.0", inner_tag="l.0")

    codegen_result = lp.generate_code_v2(transformed)
    assert len(codegen_result.device_programs) == 2

    lp.auto_test_vs_ref(reference, ctx, transformed, parameters={"n": 5})
def test_assign_to_linear_subscript(ctx_factory):
    """Compare ``a[i,i] = 1`` with the linear-subscript form ``a[[i*n + i]] = 1``.

    Both kernels are run on zero-initialized 10x10 float32 arrays and the
    results are asserted to be equal.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    """
    queue = cl.CommandQueue(ctx_factory())

    multi_index_knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        "a[i,i] = 1")

    linear_index_knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        "a[[i*n + i]] = 1",
        [lp.GlobalArg("a", shape="n,n"), "..."])

    result_multi = cl.array.zeros(queue, (10, 10), np.float32)
    multi_index_knl(queue, a=result_multi)

    result_linear = cl.array.zeros(queue, (10, 10), np.float32)
    linear_index_knl(queue, a=result_linear)

    assert np.array_equal(result_multi.get(), result_linear.get())
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel and run variants.

    The fused kernel is executed as-is, again after turning the assignment
    to ``f`` into a substitution rule, and finally after an affine iname
    remap, an iname split and a precompute of ``f_subst``. The test only
    checks that every invocation completes.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    """
    queue = cl.CommandQueue(ctx_factory())

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
        ])

    fused = lp.fuse_kernels(
        [fin_diff_knl, flux_knl],
        data_flow=[("f", 1, 0)])
    fused = lp.set_options(fused, write_cl=True)
    _evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    fused = lp.assignment_to_subst(fused, "f")
    fused = lp.set_options(fused, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    _evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    shifted = lp.affine_map_inames(fused, "i", "inew", "inew+1=i")
    tiled = lp.split_iname(
        shifted, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomputed = lp.precompute(
        tiled, "f_subst", "inew_inner", fetch_bounding_box=True)
    precomputed = lp.tag_inames(precomputed, {"j_0_outer": "unr"})
    precomputed = lp.set_options(precomputed, return_dict=True)
    _evt, _ = precomputed(queue, u=u, h=h)
# {{{ call without returned values
def test_call_with_no_returned_value(ctx_factory):
    """Run a kernel whose only instruction is a call ``f()`` with no assignees.

    The function ``f`` is made known to the kernel through the mangler and
    preamble generator imported from ``library_for_test``.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    """
    import pymbolic.primitives as p

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # The make_kernel(...) call must be closed before the next statement;
    # the closing parenthesis was missing.
    knl = lp.make_kernel(
        "{:}",
        [lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
        )

    from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen
    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
    knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen])

    evt, _ = knl(queue)
Dominic Kempf
committed
def test_unschedulable_kernel_detection():
    """Check detection of kernels without a schedulable iname nesting.

    For the first kernel the test expects four iname-duplication options
    and checks that applying any one of them yields a schedulable nesting.
    For the second, larger kernel it expects ten options.
    """
    knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                         """
                         mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
                         mat2[j] = mat2[j] + 1 {inames=j, id=i2}
                         mat3[i] = mat3[i] + 1 {inames=i, id=i3}
                         """)

    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 4

    # Every proposed duplication must make the kernel schedulable.
    for inames, insns in lp.get_iname_duplication_options(knl):
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert lp.has_schedulable_iname_nesting(fixed_knl)

    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
                         mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
                         mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
                         mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
                         """)

    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10
Andreas Klöckner
committed
def test_regression_no_ret_call_removal(ctx_factory):
    """Regression test: ``f(sum(i, x[i]))`` preprocesses to three instructions.

    :arg ctx_factory: unused in the body.
    """
    # https://github.com/inducer/loopy/issues/32
    kernel = lp.make_kernel(
        "{[i] : 0<=i<n}",
        "f(sum(i, x[i]))")
    kernel = lp.preprocess_kernel(
        lp.add_and_infer_dtypes(kernel, {"x": np.float32}))

    assert len(kernel.instructions) == 3
def test_regression_persistent_hash():
    """Two kernels differing only in a subscript index must hash differently.

    Both the first instructions and the kernels as a whole are compared
    through ``LoopyKeyBuilder``.
    """
    from loopy.tools import LoopyKeyBuilder

    kernel_d2 = lp.make_kernel(
        "{[i] : 0<=i<n}",
        "cse_exprvar = d[2]*d[2]")
    kernel_d0 = lp.make_kernel(
        "{[i] : 0<=i<n}",
        "cse_exprvar = d[0]*d[0]")

    key_builder = LoopyKeyBuilder()
    assert (key_builder(kernel_d2.instructions[0])
            != key_builder(kernel_d0.instructions[0]))
    assert key_builder(kernel_d2) != key_builder(kernel_d0)
def test_sequential_dependencies(ctx_factory):
    """Build a kernel with ``seq_dependencies=True`` and self-check it.

    The kernel is printed with its dependencies and then compared against
    itself via ``lp.auto_test_vs_ref`` with ``n=5``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        for i
            <> aa = 5jf
            <> bb = 5j
            a[i] = imag(aa)
            b[i] = imag(bb)
            c[i] = 5f
        end
        """,
        seq_dependencies=True)

    print(kernel.stringify(with_dependencies=True))

    lp.auto_test_vs_ref(kernel, ctx, kernel, parameters={"n": 5})
def test_nop(ctx_factory):
    """Build a kernel containing a ``... nop`` instruction and self-check it.

    The kernel is compared against itself via ``lp.auto_test_vs_ref`` with
    ``ntrips=5`` after fixing ``n=15``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    # The final comparison needs a context; without this line ``ctx`` is
    # an undefined name.
    ctx = ctx_factory()

    # NOTE(review): the kernel text was truncated in this file -- the
    # string literal was never closed and the instructions with ids
    # ``wr_z``/``wr_v`` that the nop depends on were missing. The writers
    # below mirror the ones in test_global_barrier, and the final update
    # is a best guess; confirm against the upstream version of this test.
    knl = lp.make_kernel(
        "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
        """
        for itrip,i
            <> z[i] = z[i+1] + z[i] {id=wr_z}
            <> v[i] = 11 {id=wr_v}
            ... nop {dep=wr_z:wr_v,id=yoink}
            z[i] = z[i] - z[i+1] + v[i] {dep=yoink}
        end
        """)

    knl = lp.fix_parameters(knl, n=15)
    knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
def test_global_barrier(ctx_factory):
    """Test a kernel with several ``gbarrier`` instructions around a loop.

    After preprocessing, the temporaries ``z`` and ``v`` are asserted to
    have global scope. The split kernel is then compared against a
    reference in which both temporaries were explicitly set to "global".

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    base = lp.make_kernel(
        "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
        """
        for i
            for itrip
                ... gbarrier {id=top}
                <> z[i] = z[i+1] + z[i] {id=wr_z,dep=top}
                <> v[i] = 11 {id=wr_v,dep=top}
                ... gbarrier {dep=wr_z:wr_v,id=yoink}
                z[i] = z[i] - z[i+1] + v[i] {id=iupd}
            end
            ... gbarrier {dep=iupd,id=postloop}
            z[i] = z[i] - z[i+1] + v[i] {dep=postloop}
        end
        """)

    base = lp.fix_parameters(base, ntrips=3)
    base = lp.add_and_infer_dtypes(base, {"z": np.float64})

    # The reference gets its temporaries' scope set explicitly.
    reference = base
    for temporary in ("z", "v"):
        reference = lp.set_temporary_scope(reference, temporary, "global")

    transformed = lp.split_iname(
        base, "i", 256, outer_tag="g.0", inner_tag="l.0")
    print(transformed)

    transformed = lp.preprocess_kernel(transformed)
    temporaries = transformed.temporary_variables
    assert temporaries["z"].scope == lp.temp_var_scope.GLOBAL
    assert temporaries["v"].scope == lp.temp_var_scope.GLOBAL

    print(transformed)

    lp.auto_test_vs_ref(
        reference, ctx, transformed, parameters={"ntrips": 5, "n": 10})
def test_missing_global_barrier():
    """Scheduling a kernel that lacks a needed barrier must raise.

    The kernel omits the barrier marked in the kernel text; the test
    expects ``lp.get_one_scheduled_kernel`` to raise
    ``MissingBarrierError``.
    """
    from loopy.diagnostic import MissingBarrierError

    kernel = lp.make_kernel(
        "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
        """
        for i
            for itrip
                ... gbarrier {id=yoink}
                <> z[i] = z[i] - z[i+1] {id=iupd,dep=yoink}
            end
            # This is where the barrier should be
            z[i] = z[i] - z[i+1] + v[i] {dep=iupd}
        end
        """)

    kernel = lp.set_temporary_scope(kernel, "z", "global")
    kernel = lp.split_iname(kernel, "i", 256, outer_tag="g.0")
    kernel = lp.preprocess_kernel(kernel)

    with pytest.raises(MissingBarrierError):
        lp.get_one_scheduled_kernel(kernel)
def test_index_cse(ctx_factory):
    """Generate and print device code for a nested-reduction kernel.

    Iname ``l`` is tagged "unr" and the loop priority is set to
    ``i,j,k,l``. The test only checks that code generation succeeds.

    :arg ctx_factory: unused in the body.
    """
    kernel = lp.make_kernel(
        ["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
        """
        for i
            for j
                c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
            end
        end
        """)

    kernel = lp.tag_inames(kernel, "l:unr")
    kernel = lp.prioritize_loops(kernel, "i,j,k,l")
    kernel = lp.add_and_infer_dtypes(
        kernel, {"a": np.float32, "b": np.float32})
    kernel = lp.fix_parameters(kernel, n=5)

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_ilp_and_conditionals(ctx_factory):
    """Compare a conditional kernel against its 'ilp'-tagged split variant.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    reference = lp.make_kernel(
        '{[k]: 0<=k<n}}',
        """
        for k
            <> Tcond = T[k] < 0.5
            if Tcond
                cp[k] = 2 * T[k] + Tcond
            end
        end
        """)

    reference = lp.fix_parameters(reference, n=200)
    reference = lp.add_and_infer_dtypes(reference, {"T": np.float32})

    # Split "k" by 2 and tag the inner part "ilp".
    transformed = lp.split_iname(reference, 'k', 2, inner_tag='ilp')

    lp.auto_test_vs_ref(reference, ctx, transformed)
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
def test_unr_and_conditionals(ctx_factory):
    """Compare a conditional kernel against its 'unr'-tagged split variant.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    reference = lp.make_kernel(
        '{[k]: 0<=k<n}}',
        """
        for k
            <> Tcond[k] = T[k] < 0.5
            if Tcond[k]
                cp[k] = 2 * T[k] + Tcond[k]
            end
        end
        """)

    reference = lp.fix_parameters(reference, n=200)
    reference = lp.add_and_infer_dtypes(reference, {"T": np.float32})

    # Split "k" by 2 and tag the inner part "unr".
    transformed = lp.split_iname(reference, 'k', 2, inner_tag='unr')

    lp.auto_test_vs_ref(reference, ctx, transformed)
def test_constant_array_args(ctx_factory):
    """Build a kernel whose input ``T`` is declared as ``lp.ConstantArg``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
                         """
                         for k
                             <> Tcond[k] = T[k] < 0.5
                             if Tcond[k]
                                 cp[k] = 2 * T[k] + Tcond[k]
                             end
                         end
                         """,
                         [lp.ConstantArg('T', shape=(200,), dtype=np.float32),
                          '...'])

    knl = lp.fix_parameters(knl, n=200)

    # NOTE(review): the body ended after fix_parameters, leaving ``ctx``
    # unused and the kernel never executed. This self-comparison follows
    # the pattern of the sibling tests; confirm against the upstream
    # version of this test.
    lp.auto_test_vs_ref(knl, ctx, knl)
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
@pytest.mark.parametrize("src_order", ["C"])
@pytest.mark.parametrize("tmp_order", ["C", "F"])
def test_temp_initializer(ctx_factory, src_order, tmp_order):
    """Copy an initialized private temporary to ``out`` and compare.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    :arg src_order: memory order passed to ``copy`` for the initializer.
    :arg tmp_order: ``order`` given to the temporary variable.
    """
    initial_values = np.random.randn(3, 3).copy(order=src_order)

    queue = cl.CommandQueue(ctx_factory())

    tmp_var = lp.TemporaryVariable(
        "tmp",
        initializer=initial_values,
        shape=lp.auto,
        scope=lp.temp_var_scope.PRIVATE,
        order=tmp_order)

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "out[i,j] = tmp[i,j]",
        [tmp_var, "..."])

    kernel = lp.set_options(kernel, write_cl=True, highlight_cl=True)
    kernel = lp.fix_parameters(kernel, n=initial_values.shape[0])

    _evt, (copied,) = kernel(queue, out_host=True)

    assert np.array_equal(initial_values, copied)
def test_header_extract():
    """Check the generated kernel header for the C, CUDA and PyOpenCL targets.

    The first entry returned by ``lp.generate_header`` is converted to a
    string and compared with the expected declaration for each target.
    """
    kernel = lp.make_kernel(
        '{[k]: 0<=k<n}}',
        """
        for k
            T[k] = k**2
        end
        """,
        [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
         '...'])

    kernel = lp.fix_parameters(kernel, n=200)

    # (target class, expected header), checked in this order.
    expectations = [
        (lp.CTarget,
         'void loopy_kernel(float *__restrict__ T);'),
        (lp.CudaTarget,
         'extern "C" __global__ void __launch_bounds__(1) '
         'loopy_kernel(float *__restrict__ T);'),
        (lp.PyOpenCLTarget,
         '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
         'loopy_kernel(__global float *__restrict__ T);'),
    ]

    for target_cls, expected_header in expectations:
        retargeted = kernel.copy(target=target_cls())
        assert str(lp.generate_header(retargeted)[0]) == (expected_header)
def test_scalars_with_base_storage(ctx_factory):
    """ Regression test for !50 """
    queue = cl.CommandQueue(ctx_factory())

    # A shape-() float64 temporary with base_storage "base".
    scalar_tmp = lp.TemporaryVariable(
        "a", dtype=np.float64, shape=(), base_storage="base")

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<1}",
        "a = 1",
        [scalar_tmp])

    kernel(queue, out_host=True)
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
def test_if_else(ctx_factory):
    """Run an ``if``/``elif``/``else`` kernel and compare with NumPy.

    The expected array holds 15, 11 and 3 at indices congruent to 0, 1
    and 2 modulo 3, respectively.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    """
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<50}",
        """
        if i % 3 == 0
            a[i] = 15
        elif i % 3 == 1
            a[i] = 11
        else
            a[i] = 3
        end
        """
        )

    _evt, (result,) = kernel(queue, out_host=True)

    expected = np.empty(50)
    for remainder, value in ((0, 15), (1, 11), (2, 3)):
        expected[remainder::3] = value

    assert np.array_equal(expected, result)
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
def test_tight_loop_bounds(ctx_factory):
    """Run a kernel whose inner domain for ``j`` depends on ``i``.

    The result is asserted to equal ``np.arange(10)``.

    :arg ctx_factory: callable returning the context used to build the
        command queue.
    """
    queue = cl.CommandQueue(ctx_factory())

    domains = [
        "{ [i] : 0 <= i <= 5 }",
        "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }",
    ]
    kernel = lp.make_kernel(
        domains,
        """
        for i
            for j
                out[j] = j
            end
        end
        """,
        silenced_warnings="write_race(insn)")

    kernel = lp.split_iname(
        kernel, "i", 5, inner_tag="l.0", outer_tag="g.0")

    _evt, (result,) = kernel(queue, out_host=True)

    assert (result == np.arange(10)).all()
def test_tight_loop_bounds_codegen():
    """Check the exact ``for`` loop header emitted for a dependent domain.

    The kernel targets ``lp.OpenCLTarget()``; the expected loop header
    must appear verbatim in the generated device code.
    """
    knl = lp.make_kernel(
        ["{ [i] : 0 <= i <= 5 }",
         "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
        """
        for i
            for j
                out[j] = j
            end
        end
        """,
        silenced_warnings="write_race(insn)",
        target=lp.OpenCLTarget())

    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")

    cgr = lp.generate_code_v2(knl)
    #print(cgr.device_code())

    # Expected loop header, assembled from adjacent string literals.
    for_loop = \
        "for (int j = " \
        "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
        "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)"

    assert for_loop in cgr.device_code()
Andreas Klöckner
committed
# Script entry point: with an argument, execute it as Python code (e.g. a
# single test call); without arguments, run this file under pytest.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # NOTE(review): exec() runs the first command-line argument as
        # arbitrary Python code. Acceptable for a developer-invoked test
        # script; never expose this to untrusted input.
        exec(sys.argv[1])
    else:
        # ``py.test.cmdline`` is a legacy alias; ``pytest.main`` is the
        # documented entry point.
        from pytest import main
        main([__file__])