Newer
Older
lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
lp.GlobalArg("a", dtype, shape=lp.auto),
"..."
],
assumptions="n>0")
ref_knl = knl
knl = lp.split_iname(knl, "i", 512)
knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
def test_within_inames_and_reduction():
    """Check that a reduction pinned to no inames is kept out of the ``i`` loop.

    Builds a C instruction nested in ``i`` and a ``sum`` reduction over ``j``
    whose ``within_inames`` is the empty set and marked final, then asserts
    that ``i`` is not among the inames of the generated update instruction.
    """
    # See https://github.com/inducer/loopy/issues/24

    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.

    # NOTE(review): this call was left unterminated; ``assignees="phi"`` is
    # restored from the comment above and the read of ``phi`` below -- confirm.
    i1 = lp.CInstruction("i",
            "doSomethingToGetPhi();",
            assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    i2 = lp.Assignment("a",
            lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
            within_inames=frozenset(),
            within_inames_is_final=True)

    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
            [i1, i2],
            [
                lp.GlobalArg("a", dtype=np.float32, shape=()),
                lp.ValueArg("n", dtype=np.int32),
                lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
                ],
            target=lp.CTarget(),
            )

    k = lp.preprocess_kernel(k)

    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
def test_literal_local_barrier(ctx_factory):
    """Run a kernel whose loop body is only an explicit local barrier.

    The kernel is compared against itself as reference via
    ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            """
            for i
                ... lbarrier
            end
            """, seq_dependencies=True)

    knl = lp.fix_parameters(knl, n=128)

    ref_knl = knl

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting(ctx_factory):
    """Check that a global barrier yields two device programs.

    After splitting ``i`` onto group/local axes, the kernel is preprocessed
    and scheduled, code is generated, and the result is compared against the
    untransformed reference kernel.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            """
            for i
                c[i] = a[i + 1]
                ... gbarrier
                out[i] = c[i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    # Keep the untransformed kernel as the reference for the comparison below.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop(ctx_factory):
    """Check kernel splitting when the global barriers sit inside a loop nest.

    Asserts that code generation produces two device programs and compares
    the transformed kernel against the untransformed reference.
    """
    # ``ctx`` and ``ref_knl`` are needed by the final comparison; they are set
    # up the same way as in ``test_kernel_splitting``.
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            for i, k
                ... gbarrier
                c[k,i] = a[k, i + 1]
                ... gbarrier
                out[k,i] = c[k,i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    # Reference is the kernel before any iname splitting.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
    """Schedule *knl*, apply save/reload of temporaries, run it, check output.

    :arg queue: command queue handed to the kernel invocation.
    :arg knl: kernel to preprocess, schedule, transform and execute.
    :arg out_expect: value the single kernel output is compared against
        element-wise.
    :arg debug: if true, print the kernel and the generated device and host
        code, then stop via ``1/0`` before the kernel is run.
    """
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    knl = preprocess_kernel(knl)
    knl = get_one_scheduled_kernel(knl)

    from loopy.transform.save import save_and_reload_temporaries
    knl = save_and_reload_temporaries(knl)
    # Schedule once more after the save/reload transformation.
    knl = get_one_scheduled_kernel(knl)

    if debug:
        print(knl)
        cgr = lp.generate_code_v2(knl)
        print(cgr.device_code())
        print(cgr.host_code())
        # NOTE(review): presumably a deliberate abort so a debug run stops
        # after printing -- confirm.
        1/0

    _, (out,) = knl(queue, out_host=True)
    assert (out == out_expect).all(), (out, out_expect)
@pytest.mark.parametrize("hw_loop", [True, False])
def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
    """Save/reload of a private scalar temporary across a global barrier.

    :arg hw_loop: if true, ``i`` is tagged ``"g.0"`` before the check.
    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    The expected output is ``np.arange(8)``.
    """
    # NOTE(review): context/queue creation and the ``lp.make_kernel(`` opener
    # follow the pattern of the neighbouring save/reload tests -- confirm.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i]: 0<=i<8 }",
        """
        for i
            <>t = i
            ... gbarrier
            out[i] = t
        end
        """, seq_dependencies=True)

    if hw_loop:
        knl = lp.tag_inames(knl, dict(i="g.0"))

    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
# NOTE(review): fragment -- the ``def`` header and the opening
# ``knl = lp.make_kernel(`` line that these arguments belong to appear to be
# missing. As far as visible: temporary ``t`` is set to private scope and the
# expected output is np.arange(8). Restore the missing lines before use.
"{ [i]: 0<=i<8 }",
"""
for i
<>t[i] = i
... gbarrier
out[i] = t[i]
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "private")
save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
# NOTE(review): this test is incomplete as it stands -- ``knl`` is used below
# but never assigned (an opening ``knl = lp.make_kernel(`` line seems to be
# missing before the domain string), and the triple-quoted kernel text is
# never closed and has no ``end`` lines for its three ``for`` loops.
# Visible intent: private array ``t`` with ``i`` tagged "g.0"; the expected
# output is eight stacked copies of np.arange(8).
def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
"{ [i,j,k]: 0<=i,j,k<8 }",
"""
for i
for j
<>t[j] = j
... gbarrier
for k
out[i,k] = t[k]
knl = lp.tag_inames(knl, dict(i="g.0"))
knl = lp.set_temporary_scope(knl, "t", "private")
save_and_reload_temporaries_test(
queue, knl, np.vstack((8 * (np.arange(8),))), debug)
def test_save_of_private_multidim_array(ctx_factory, debug=False):
    """Save/reload of a two-dimensional private temporary across a barrier.

    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    The expected output has shape ``(8, 8, 8)`` with ``out[i, l, m] == m``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
        """
        for i
            for j, k
                <>t[j,k] = k
            end
            ... gbarrier
            for l, m
                out[i,l,m] = t[l,m]
            end
        end
        """, seq_dependencies=True)

    knl = lp.set_temporary_scope(knl, "t", "private")

    # Eight identical 8x8 slabs whose rows are all arange(8).
    result = np.array([np.vstack((8 * (np.arange(8),))) for _ in range(8)])
    save_and_reload_temporaries_test(queue, knl, result, debug)
def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False):
    """Like ``test_save_of_private_multidim_array`` but with ``i`` tagged "g.0".

    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    The expected output has shape ``(8, 8, 8)`` with ``out[i, l, m] == m``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
        """
        for i
            for j, k
                <>t[j,k] = k
            end
            ... gbarrier
            for l, m
                out[i,l,m] = t[l,m]
            end
        end
        """, seq_dependencies=True)

    knl = lp.set_temporary_scope(knl, "t", "private")
    knl = lp.tag_inames(knl, dict(i="g.0"))

    # Eight identical 8x8 slabs whose rows are all arange(8).
    result = np.array([np.vstack((8 * (np.arange(8),))) for _ in range(8)])
    save_and_reload_temporaries_test(queue, knl, result, debug)
# NOTE(review): this test is incomplete as it stands -- ``queue`` and ``knl``
# are used but never assigned, the kernel text has no opening triple quote or
# domain string, and it contains three ``for`` lines but only two ``end``
# lines. Restore the missing lines before relying on it.
# Visible intent: ``t_arr`` is given private scope, ``i`` is tagged "g.0" when
# ``hw_loop`` is true, and the expected output is
# [1, 10, 10, 10, 10, 10, 10, 10, 10, 9].
@pytest.mark.parametrize("hw_loop", [True, False])
def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False):
for i
for k
<> t_arr[k] = k
end
<> t_scalar = 1
for j
... gbarrier
out[j] = t_scalar
... gbarrier
t_scalar = 10
<> flag = i == 9
out[i] = t_arr[i] {if=flag}
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t_arr", "private")
if hw_loop:
knl = lp.tag_inames(knl, dict(i="g.0"))
result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9])
save_and_reload_temporaries_test(queue, knl, result, debug)
# NOTE(review): no ``def`` header precedes this body, so ``ctx_factory`` and
# ``debug`` have no visible binding here; a test-function header seems to be
# missing. Visible intent: temporary ``t`` gets local scope, ``i`` is tagged
# "g.0" and ``j`` "l.0", and the expected output is np.arange(8).
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i,j]: 0<=i,j<8 }",
"""
for i, j
<>t[2*j] = j
t[2*j+1] = j
... gbarrier
out[i] = t[2*i]
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "local")
knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
def test_save_of_local_array_with_explicit_local_barrier(ctx_factory, debug=False):
    """Save/reload of a local array when the kernel has its own local barrier.

    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    ``t`` is given local scope, ``i`` is tagged "g.0" and ``j`` "l.0"; the
    expected output is ``np.arange(8)``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i,j]: 0<=i,j<8 }",
        """
        for i, j
            <>t[2*j] = j
            ... lbarrier
            t[2*j+1] = t[2*j]
            ... gbarrier
            out[i] = t[2*i]
        end
        """, seq_dependencies=True)

    knl = lp.set_temporary_scope(knl, "t", "local")
    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))

    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
# NOTE(review): most of this test's body is missing -- ``queue`` and ``knl``
# are never assigned, there is no ``lp.make_kernel(`` opener, and the kernel
# text between the domain string and the final ``end`` is absent.
# Visible intent: ``t_local`` gets local scope, ``j`` is tagged "l.0" and
# ``i`` "g.0", and the output is compared against 1.
def test_save_local_multidim_array(ctx_factory, debug=False):
"{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}",
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t_local", "local")
knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
save_and_reload_temporaries_test(queue, knl, 1, debug)
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
def test_save_with_base_storage(ctx_factory, debug=False):
    """Save/reload of two local temporaries that alias each other.

    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    ``a`` and ``b`` are both local and aliased via ``lp.alias_temporaries``
    without synchronization for exclusive use; the expected output is
    ``np.arange(10)``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """
        <>a[i] = 0
        <>b[i] = i
        ... gbarrier
        out[i] = a[i]
        """,
        "...",
        seq_dependencies=True)

    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.set_temporary_scope(knl, "a", "local")
    knl = lp.set_temporary_scope(knl, "b", "local")

    knl = lp.alias_temporaries(knl, ["a", "b"],
            synchronize_for_exclusive_use=False)

    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
def test_save_ambiguous_storage_requirements():
    """Check that ``save_and_reload_temporaries`` raises ``LoopyError`` here.

    The local temporary ``a`` is written under ``j`` and read under a
    duplicate of ``j``, both tagged "l.0", with ``i`` tagged "g.0".
    """
    knl = lp.make_kernel(
        "{[i,j]: 0 <= i < 10 and 0 <= j < 10}",
        """
        <>a[j] = j
        ... gbarrier
        out[i,j] = a[j]
        """,
        seq_dependencies=True)

    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
    knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"})
    knl = lp.set_temporary_scope(knl, "a", "local")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    from loopy.diagnostic import LoopyError
    with pytest.raises(LoopyError):
        lp.save_and_reload_temporaries(knl)
def test_save_across_inames_with_same_tag(ctx_factory, debug=False):
    """Save/reload when writer and reader use distinct inames with one tag.

    :arg debug: forwarded to ``save_and_reload_temporaries_test``.

    ``i`` is tagged "l.0" and duplicated (same tag) for the instruction that
    reads ``a``; the expected output is ``np.arange(10)``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """
        <>a[i] = i
        ... gbarrier
        out[i] = a[i]
        """,
        "...",
        seq_dependencies=True)

    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"})

    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
def test_missing_temporary_definition_detection():
    """Check that code generation raises ``MissingDefinitionError``.

    The kernel reads temporary ``t`` after a global barrier without
    save/reload code having been inserted.
    """
    knl = lp.make_kernel(
            "{ [i]: 0<=i<10 }",
            """
            for i
                <> t = 1
                ... gbarrier
                out[i] = t
            end
            """, seq_dependencies=True)

    from loopy.diagnostic import MissingDefinitionError
    with pytest.raises(MissingDefinitionError):
        lp.generate_code_v2(knl)
def test_missing_definition_check_respects_aliases():
    """Check that code generation succeeds for temporaries sharing storage.

    ``a`` is written and ``b`` is read; both use ``base_storage="base"``. The
    test passes if ``lp.generate_code_v2`` does not raise.
    """
    # Based on https://github.com/inducer/loopy/issues/69
    knl = lp.make_kernel("{ [i] : 0<=i<n }",
            ["a[i] = 0",
             "c[i] = b[i]"],
            temporary_variables={
                "a": lp.TemporaryVariable("a",
                    dtype=np.float64, shape=("n",), base_storage="base"),
                "b": lp.TemporaryVariable("b",
                    dtype=np.float64, shape=("n",), base_storage="base")
            },
            target=lp.CTarget(),
            silenced_warnings=frozenset(["read_no_write(b)"]))

    lp.generate_code_v2(knl)
def test_global_temporary(ctx_factory):
    """Check a kernel whose temporary ``c`` lives in global scope.

    Asserts that code generation yields two device programs and compares the
    split kernel against the unsplit reference.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            """
            for i
                <> c[i] = a[i + 1]
                ... gbarrier
                out[i] = c[i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")

    # Reference is the kernel before iname splitting.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_assign_to_linear_subscript(ctx_factory):
    """Check that ``a[[i*n + i]] = 1`` writes the same entries as ``a[i,i] = 1``.

    Both kernels are run on zero-initialised 10x10 float32 arrays and the
    results are compared for equality.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl1 = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            "a[i,i] = 1")
    knl2 = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            "a[[i*n + i]] = 1",
            [lp.GlobalArg("a", shape="n,n"), "..."])

    a1 = cl.array.zeros(queue, (10, 10), np.float32)
    knl1(queue, a=a1)
    a2 = cl.array.zeros(queue, (10, 10), np.float32)
    knl2(queue, a=a2)

    assert np.array_equal(a1.get(), a2.get())
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel and transform it.

    The fused kernel is run as-is, again after ``f`` is turned into a
    substitution rule, and once more after an affine iname map, a split and a
    precompute. No numerical result is asserted; the test passes if every
    stage runs.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
            ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[
                ("f", 1, 0)
                ])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
# {{{ call without returned values
def test_call_with_no_returned_value(ctx_factory):
    """Exercise calls to a function ``f`` that returns no value.

    First a kernel built from an explicit ``lp.CallInstruction`` with no
    assignees is executed; then a kernel built from the text ``f() {id=init}``
    is passed through code generation and its device code is printed.
    """
    import pymbolic.primitives as p

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # The closing parenthesis of this call was missing.
    knl = lp.make_kernel(
        "{:}",
        [lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
        )

    from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen
    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
    knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen])

    evt, _ = knl(queue)

    knl = lp.make_kernel(
        "{:}",
        "f() {id=init}"
        )

    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])

    print(lp.generate_code_v2(knl).device_code())
Dominic Kempf
committed
def test_unschedulable_kernel_detection():
    """Check detection of iname nestings that cannot be scheduled.

    For a two-iname kernel, asserts that the nesting is reported as
    unschedulable, that four duplication options are offered, and that
    applying any one of them makes the nesting schedulable. For a five-iname
    kernel, asserts unschedulability and ten duplication options.
    """
    knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                         """
                         mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
                         mat2[j] = mat2[j] + 1 {inames=j, id=i2}
                         mat3[i] = mat3[i] + 1 {inames=i, id=i3}
                         """)

    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 4

    # Every offered duplication must resolve the problem on its own.
    for inames, insns in lp.get_iname_duplication_options(knl):
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert lp.has_schedulable_iname_nesting(fixed_knl)

    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
                         mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
                         mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
                         mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
                         """)

    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10
Andreas Klöckner
committed
def test_regression_no_ret_call_removal(ctx_factory):
    """Check that preprocessing ``f(sum(i, x[i]))`` leaves three instructions.

    Regression test for https://github.com/inducer/loopy/issues/32.
    """
    # https://github.com/inducer/loopy/issues/32
    knl = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "f(sum(i, x[i]))")
    knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
    knl = lp.preprocess_kernel(knl)
    assert len(knl.instructions) == 3
def test_regression_persistent_hash():
    """Check that kernels differing only in a subscript get different keys.

    The two kernels read ``d[2]`` and ``d[0]`` respectively; both the first
    instruction and the whole kernel must produce distinct
    ``LoopyKeyBuilder`` keys.
    """
    knl1 = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "cse_exprvar = d[2]*d[2]")

    knl2 = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "cse_exprvar = d[0]*d[0]")

    from loopy.tools import LoopyKeyBuilder
    lkb = LoopyKeyBuilder()
    assert lkb(knl1.instructions[0]) != lkb(knl2.instructions[0])
    assert lkb(knl1) != lkb(knl2)
def test_sequential_dependencies(ctx_factory):
    """Build a kernel with ``seq_dependencies=True`` and run it.

    The kernel is printed with its dependencies and then compared against
    itself as reference via ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            for i
                <> aa = 5jf
                <> bb = 5j
                a[i] = imag(aa)
                b[i] = imag(bb)
                c[i] = 5f
            end
            """, seq_dependencies=True)

    print(knl.stringify(with_dependencies=True))

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))
# NOTE(review): this test is incomplete as it stands -- ``ctx`` is used but
# never assigned, and the triple-quoted kernel text (and with it the
# ``lp.make_kernel(`` call) is never closed; the instructions with ids
# ``wr_z`` and ``wr_v`` that the ``nop`` depends on are not present either.
# Visible intent: fix ``n`` to 15, give ``z`` dtype float64, and compare the
# kernel against itself with ``ntrips=5``.
def test_nop(ctx_factory):
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for itrip,i
... nop {dep=wr_z:wr_v,id=yoink}
knl = lp.fix_parameters(knl, n=15)
knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})
lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
def test_global_barrier(ctx_factory):
    """Check that temporaries used across global barriers become global.

    After splitting ``i`` and preprocessing, ``z`` and ``v`` must have global
    scope. The result is compared against a reference kernel in which both
    temporaries were set to global scope explicitly.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
            """
            for i
                for itrip
                    ... gbarrier {id=top}
                    <> z[i] = z[i+1] + z[i] {id=wr_z,dep=top}
                    <> v[i] = 11 {id=wr_v,dep=top}
                    ... gbarrier {dep=wr_z:wr_v,id=yoink}
                    z[i] = z[i] - z[i+1] + v[i] {id=iupd}
                end
                ... gbarrier {dep=iupd,id=postloop}
                z[i] = z[i] - z[i+1] + v[i] {dep=postloop}
            end
            """)

    knl = lp.fix_parameters(knl, ntrips=3)
    knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})

    # The reference gets its temporary scopes assigned by hand.
    ref_knl = knl
    ref_knl = lp.set_temporary_scope(ref_knl, "z", "global")
    ref_knl = lp.set_temporary_scope(ref_knl, "v", "global")

    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
    print(knl)

    knl = lp.preprocess_kernel(knl)
    assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL
    assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL

    print(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(ntrips=5, n=10))
def test_missing_global_barrier():
    """Check that scheduling raises ``MissingBarrierError``.

    The kernel updates the global temporary ``z`` after the ``itrip`` loop
    without a global barrier in between.
    """
    knl = lp.make_kernel(
            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
            """
            for i
                for itrip
                    ... gbarrier {id=yoink}
                    <> z[i] = z[i] - z[i+1] {id=iupd,dep=yoink}
                end
                # This is where the barrier should be
                z[i] = z[i] - z[i+1] + v[i] {dep=iupd}
            end
            """)

    knl = lp.set_temporary_scope(knl, "z", "global")
    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0")
    knl = lp.preprocess_kernel(knl)

    from loopy.diagnostic import MissingBarrierError
    with pytest.raises(MissingBarrierError):
        lp.get_one_scheduled_kernel(knl)
def test_index_cse(ctx_factory):
    """Generate and print device code for a double reduction with ``l`` unrolled.

    No assertion is made on the generated code; the test passes if code
    generation completes.
    """
    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         for i
                             for j
                                 c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
                             end
                         end
                         """)
    knl = lp.tag_inames(knl, "l:unr")
    knl = lp.prioritize_loops(knl, "i,j,k,l")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
    knl = lp.fix_parameters(knl, n=5)
    print(lp.generate_code_v2(knl).device_code())
def test_ilp_and_conditionals(ctx_factory):
    """Compare an ``ilp``-tagged split of a conditional kernel to its reference.

    ``k`` is split by 2 with the inner iname tagged ``ilp``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond = T[k] < 0.5
             if Tcond
                 cp[k] = 2 * T[k] + Tcond
             end
         end
         """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    # Reference is the kernel before the iname split.
    ref_knl = knl

    knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp')

    lp.auto_test_vs_ref(ref_knl, ctx, knl)
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
def test_unr_and_conditionals(ctx_factory):
    """Compare a ``unr``-tagged split of a conditional kernel to its reference.

    ``k`` is split by 2 with the inner iname tagged ``unr``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond[k] = T[k] < 0.5
             if Tcond[k]
                 cp[k] = 2 * T[k] + Tcond[k]
             end
         end
         """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    # Reference is the kernel before the iname split.
    ref_knl = knl

    knl = lp.split_iname(knl, 'k', 2, inner_tag='unr')

    lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_constant_array_args(ctx_factory):
    """Run a conditional kernel whose input ``T`` is an ``lp.ConstantArg``.

    ``T`` is declared with shape ``(200,)`` and dtype float32; ``n`` is fixed
    to 200.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond[k] = T[k] < 0.5
             if Tcond[k]
                 cp[k] = 2 * T[k] + Tcond[k]
             end
         end
         """,
         [lp.ConstantArg('T', shape=(200,), dtype=np.float32),
          '...'])

    knl = lp.fix_parameters(knl, n=200)

    # NOTE(review): previously ``ctx`` was created and the kernel built but
    # never executed; the self-comparison below makes the test exercise the
    # kernel -- confirm this matches the intended check.
    lp.auto_test_vs_ref(knl, ctx, knl)
@pytest.mark.parametrize("src_order", ["C"])
@pytest.mark.parametrize("tmp_order", ["C", "F"])
def test_temp_initializer(ctx_factory, src_order, tmp_order):
    """Check that a read-only private temporary carries its initializer data.

    :arg src_order: memory order of the random 3x3 source array.
    :arg tmp_order: ``order`` passed to the temporary variable.

    The kernel copies the temporary to ``out``; the result must equal the
    source array.
    """
    a = np.random.randn(3, 3).copy(order=src_order)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "out[i,j] = tmp[i,j]",
        [
            lp.TemporaryVariable("tmp",
                initializer=a,
                shape=lp.auto,
                scope=lp.temp_var_scope.PRIVATE,
                read_only=True,
                order=tmp_order),
            "..."
            ])

    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
    knl = lp.fix_parameters(knl, n=a.shape[0])

    evt, (a2,) = knl(queue, out_host=True)

    assert np.array_equal(a, a2)
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
def test_const_temp_with_initializer_not_saved():
    """Check that save/reload adds no slot for a read-only initialized temporary.

    After ``lp.save_and_reload_temporaries`` the kernel must still have
    exactly one temporary variable.
    """
    knl = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        ... gbarrier
        out[i] = tmp[i]
        """,
        [
            lp.TemporaryVariable("tmp",
                initializer=np.arange(10),
                shape=lp.auto,
                scope=lp.temp_var_scope.PRIVATE,
                read_only=True),
            "..."
            ],
        seq_dependencies=True)

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    knl = lp.save_and_reload_temporaries(knl)

    # This ensures no save slot was added.
    assert len(knl.temporary_variables) == 1
def test_header_extract():
    """Check the generated kernel declaration for the C, CUDA and PyOpenCL targets.

    The first entry returned by ``lp.generate_header`` is compared, as a
    string, with the expected declaration for each target.
    """
    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             T[k] = k**2
         end
         """,
         [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
          '...'])

    knl = lp.fix_parameters(knl, n=200)

    #test C
    cknl = knl.copy(target=lp.CTarget())
    assert str(lp.generate_header(cknl)[0]) == (
            'void loopy_kernel(float *__restrict__ T);')

    # CUDA target
    cuknl = knl.copy(target=lp.CudaTarget())
    assert str(lp.generate_header(cuknl)[0]) == (
            'extern "C" __global__ void __launch_bounds__(1) '
            'loopy_kernel(float *__restrict__ T);')

    # PyOpenCL target
    oclknl = knl.copy(target=lp.PyOpenCLTarget())
    assert str(lp.generate_header(oclknl)[0]) == (
            '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
            'loopy_kernel(__global float *__restrict__ T);')
def test_scalars_with_base_storage(ctx_factory):
    """ Regression test for !50 """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # A shape-() temporary placed in named base storage; the test passes if
    # the kernel can be built and run.
    knl = lp.make_kernel(
            "{ [i]: 0<=i<1}",
            "a = 1",
            [lp.TemporaryVariable("a", dtype=np.float64,
                                  shape=(), base_storage="base")])

    knl(queue, out_host=True)
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
def test_if_else(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i]: 0<=i<50}",
"""
if i % 3 == 0
a[i] = 15
elif i % 3 == 1
a[i] = 11
else
a[i] = 3
end
"""
)
evt, (out,) = knl(queue, out_host=True)
out_ref = np.empty(50)
out_ref[::3] = 15
out_ref[1::3] = 11
out_ref[2::3] = 3
assert np.array_equal(out_ref, out)
knl = lp.make_kernel(
"{ [i]: 0<=i<50}",
"""
for i
if i % 2 == 0
if i % 3 == 0
a[i] = 15
elif i % 3 == 1
a[i] = 11
else
a[i] = 3
end