Newer
Older
lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
lp.GlobalArg("a", dtype, shape=lp.auto),
"..."
],
assumptions="n>0")
ref_knl = knl
knl = lp.split_iname(knl, "i", 512)
knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
def test_within_inames_and_reduction():
# See https://github.com/inducer/loopy/issues/24
# This is (purposefully) somewhat un-idiomatic, to replicate the conditions
# under which the above bug was found. If assignees were phi[i], then the
# iname propagation heuristic would not assume that dependent instructions
# need to run inside of 'i', and hence the forced_iname_* bits below would not
# be needed.
i1 = lp.CInstruction("i",
"doSomethingToGetPhi();",
from pymbolic.primitives import Subscript, Variable
i2 = lp.Assignment("a",
lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
within_inames=frozenset(),
within_inames_is_final=True)
k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
[i1, i2],
[
lp.GlobalArg("a", dtype=np.float32, shape=()),
lp.ValueArg("n", dtype=np.int32),
lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
],
target=lp.CTarget(),
)
k = lp.preprocess_kernel(k)
assert 'i' not in k.insn_inames("insn_0_j_update")
print(k.stringify(with_dependencies=True))
def test_kernel_splitting(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
"{ [i]: 0<=i<n }",
"""
for i
c[i] = a[i + 1]
... gbarrier
out[i] = c[i]
end
""", seq_dependencies=True)
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
knl = lp.add_and_infer_dtypes(knl,
{"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
ref_knl = knl
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)
from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)
# map schedule onto host or device
print(knl)
cgr = lp.generate_code_v2(knl)
assert len(cgr.device_programs) == 2
print(cgr.device_code())
print(cgr.host_code())
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop(ctx_factory):
knl = lp.make_kernel(
"{ [i,k]: 0<=i<n and 0<=k<3 }",
"""
for i, k
... gbarrier
c[k,i] = a[k, i + 1]
... gbarrier
out[k,i] = c[k,i]
end
""", seq_dependencies=True)
knl = lp.add_and_infer_dtypes(knl,
{"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)
from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)
# map schedule onto host or device
print(knl)
cgr = lp.generate_code_v2(knl)
assert len(cgr.device_programs) == 2
print(cgr.device_code())
print(cgr.host_code())
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
from loopy.preprocess import preprocess_kernel
from loopy.schedule import get_one_scheduled_kernel
knl = preprocess_kernel(knl)
knl = get_one_scheduled_kernel(knl)
from loopy.transform.save import save_and_reload_temporaries
knl = save_and_reload_temporaries(knl)
knl = get_one_scheduled_kernel(knl)
if debug:
print(knl)
cgr = lp.generate_code_v2(knl)
print(cgr.device_code())
print(cgr.host_code())
1/0
_, (out,) = knl(queue, out_host=True)
assert (out == out_expect).all()
@pytest.mark.parametrize("hw_loop", [True, False])
def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
"{ [i]: 0<=i<8 }",
"""
for i
<>t = i
... gbarrier
out[i] = t
end
""", seq_dependencies=True)
if hw_loop:
knl = lp.tag_inames(knl, dict(i="g.0"))
save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
"{ [i]: 0<=i<8 }",
"""
for i
<>t[i] = i
... gbarrier
out[i] = t[i]
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "private")
save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
"{ [i,j,k]: 0<=i,j,k<8 }",
"""
for i
for j
<>t[j] = j
... gbarrier
for k
out[i,k] = t[k]
knl = lp.tag_inames(knl, dict(i="g.0"))
knl = lp.set_temporary_scope(knl, "t", "private")
save_and_reload_temporaries_test(
queue, knl, np.vstack((8 * (np.arange(8),))), debug)
def test_save_of_private_multidim_array(ctx_factory, debug=False):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
"""
for i
for j, k
<>t[j,k] = k
end
... gbarrier
for l, m
out[i,l,m] = t[l,m]
end
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "private")
result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
save_and_reload_temporaries_test(queue, knl, result, debug)
def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
"""
for i
for j, k
<>t[j,k] = k
end
... gbarrier
for l, m
out[i,l,m] = t[l,m]
end
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "private")
knl = lp.tag_inames(knl, dict(i="g.0"))
result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
save_and_reload_temporaries_test(queue, knl, result, debug)
@pytest.mark.parametrize("hw_loop", [True, False])
def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False):
for i
for k
<> t_arr[k] = k
end
<> t_scalar = 1
for j
... gbarrier
out[j] = t_scalar
... gbarrier
t_scalar = 10
<> flag = i == 9
out[i] = t_arr[i] {if=flag}
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t_arr", "private")
if hw_loop:
knl = lp.tag_inames(knl, dict(i="g.0"))
result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9])
save_and_reload_temporaries_test(queue, knl, result, debug)
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i,j]: 0<=i,j<8 }",
"""
for i, j
<>t[2*j] = j
t[2*j+1] = j
... gbarrier
out[i] = t[2*i]
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t", "local")
knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
def test_save_local_multidim_array(ctx_factory, debug=False):
"{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}",
end
""", seq_dependencies=True)
knl = lp.set_temporary_scope(knl, "t_local", "local")
knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
save_and_reload_temporaries_test(queue, knl, 1, debug)
def test_missing_temporary_definition_detection():
knl = lp.make_kernel(
"{ [i]: 0<=i<10 }",
"""
for i
<> t = 1
... gbarrier
out[i] = t
end
""", seq_dependencies=True)
from loopy.diagnostic import MissingDefinitionError
with pytest.raises(MissingDefinitionError):
lp.generate_code_v2(knl)
def test_missing_definition_check_respects_aliases():
# Based on https://github.com/inducer/loopy/issues/69
knl = lp.make_kernel("{ [i] : 0<=i<n }",
["a[i] = 0",
"c[i] = b[i]"],
temporary_variables={
"a": lp.TemporaryVariable("a",
dtype=np.float64, shape=("n",), base_storage="base"),
"b": lp.TemporaryVariable("b",
dtype=np.float64, shape=("n",), base_storage="base")
},
target=lp.CTarget(),
silenced_warnings=frozenset(["read_no_write(b)"]))
lp.generate_code_v2(knl)
def test_global_temporary(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
"{ [i]: 0<=i<n}",
"""
for i
<> c[i] = a[i + 1]
... gbarrier
out[i] = c[i]
end
""", seq_dependencies=True)
knl = lp.add_and_infer_dtypes(knl,
{"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
knl = lp.set_temporary_scope(knl, "c", "global")
ref_knl = knl
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
cgr = lp.generate_code_v2(knl)
assert len(cgr.device_programs) == 2
#print(cgr.device_code())
#print(cgr.host_code())
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_assign_to_linear_subscript(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl1 = lp.make_kernel(
"{ [i]: 0<=i<n}",
"a[i,i] = 1")
knl2 = lp.make_kernel(
"{ [i]: 0<=i<n}",
"a[[i*n + i]] = 1",
[lp.GlobalArg("a", shape="n,n"), "..."])
a1 = cl.array.zeros(queue, (10, 10), np.float32)
knl1(queue, a=a1)
a2 = cl.array.zeros(queue, (10, 10), np.float32)
knl2(queue, a=a2)
assert np.array_equal(a1.get(), a2.get())
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
def test_finite_difference_expr_subst(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
h = grid[1] - grid[0]
u = cl.clmath.sin(cl.array.to_device(queue, grid))
fin_diff_knl = lp.make_kernel(
"{[i]: 1<=i<=n}",
"out[i] = -(f[i+1] - f[i-1])/h",
[lp.GlobalArg("out", shape="n+2"), "..."])
flux_knl = lp.make_kernel(
"{[j]: 1<=j<=n}",
"f[j] = u[j]**2/2",
[
lp.GlobalArg("f", shape="n+2"),
lp.GlobalArg("u", shape="n+2"),
])
fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
data_flow=[
("f", 1, 0)
])
fused_knl = lp.set_options(fused_knl, write_cl=True)
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
fused_knl = lp.assignment_to_subst(fused_knl, "f")
fused_knl = lp.set_options(fused_knl, write_cl=True)
# This is the real test here: The automatically generated
# shape expressions are '2+n' and the ones above are 'n+2'.
# Is loopy smart enough to understand that these are equal?
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
gpu_knl = lp.split_iname(
fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")
precomp_knl = lp.precompute(
gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
precomp_knl = lp.set_options(precomp_knl, return_dict=True)
evt, _ = precomp_knl(queue, u=u, h=h)
# {{{ call without returned values
def test_call_with_no_returned_value(ctx_factory):
import pymbolic.primitives as p
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{:}",
[lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen
knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen])
evt, _ = knl(queue)
Dominic Kempf
committed
def test_unschedulable_kernel_detection():
knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
"""
mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
mat2[j] = mat2[j] + 1 {inames=j, id=i2}
mat3[i] = mat3[i] + 1 {inames=i, id=i3}
""")
knl = lp.preprocess_kernel(knl)
# Check that loopy can detect the unschedulability of the kernel
assert not lp.has_schedulable_iname_nesting(knl)
assert len(list(lp.get_iname_duplication_options(knl))) == 4
Dominic Kempf
committed
for inames, insns in lp.get_iname_duplication_options(knl):
fixed_knl = lp.duplicate_inames(knl, inames, insns)
assert lp.has_schedulable_iname_nesting(fixed_knl)
Dominic Kempf
committed
knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
"""
mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
""")
assert not lp.has_schedulable_iname_nesting(knl)
assert len(list(lp.get_iname_duplication_options(knl))) == 10
Andreas Klöckner
committed
def test_regression_no_ret_call_removal(ctx_factory):
# https://github.com/inducer/loopy/issues/32
knl = lp.make_kernel(
"{[i] : 0<=i<n}",
"f(sum(i, x[i]))")
knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
knl = lp.preprocess_kernel(knl)
assert len(knl.instructions) == 3
def test_regression_persistent_hash():
knl1 = lp.make_kernel(
"{[i] : 0<=i<n}",
"cse_exprvar = d[2]*d[2]")
knl2 = lp.make_kernel(
"{[i] : 0<=i<n}",
"cse_exprvar = d[0]*d[0]")
from loopy.tools import LoopyKeyBuilder
lkb = LoopyKeyBuilder()
assert lkb(knl1.instructions[0]) != lkb(knl2.instructions[0])
assert lkb(knl1) != lkb(knl2)
def test_sequential_dependencies(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
"{[i]: 0<=i<n}",
"""
for i
<> aa = 5jf
<> bb = 5j
a[i] = imag(aa)
b[i] = imag(bb)
c[i] = 5f
end
""", seq_dependencies=True)
print(knl.stringify(with_dependencies=True))
lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))
def test_nop(ctx_factory):
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for itrip,i
... nop {dep=wr_z:wr_v,id=yoink}
knl = lp.fix_parameters(knl, n=15)
knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})
lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
def test_global_barrier(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for i
for itrip
... gbarrier {id=top}
<> z[i] = z[i+1] + z[i] {id=wr_z,dep=top}
<> v[i] = 11 {id=wr_v,dep=top}
... gbarrier {dep=wr_z:wr_v,id=yoink}
z[i] = z[i] - z[i+1] + v[i] {id=iupd}
end
... gbarrier {dep=iupd,id=postloop}
z[i] = z[i] - z[i+1] + v[i] {dep=postloop}
end
""")
knl = lp.fix_parameters(knl, ntrips=3)
knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})
ref_knl = knl
ref_knl = lp.set_temporary_scope(ref_knl, "z", "global")
ref_knl = lp.set_temporary_scope(ref_knl, "v", "global")
knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
print(knl)
knl = lp.preprocess_kernel(knl)
assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL
assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL
print(knl)
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(ntrips=5, n=10))
def test_missing_global_barrier():
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for i
for itrip
... gbarrier {id=yoink}
<> z[i] = z[i] - z[i+1] {id=iupd,dep=yoink}
end
# This is where the barrier should be
z[i] = z[i] - z[i+1] + v[i] {dep=iupd}
end
""")
knl = lp.set_temporary_scope(knl, "z", "global")
knl = lp.split_iname(knl, "i", 256, outer_tag="g.0")
knl = lp.preprocess_kernel(knl)
from loopy.diagnostic import MissingBarrierError
with pytest.raises(MissingBarrierError):
lp.get_one_scheduled_kernel(knl)
def test_index_cse(ctx_factory):
knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
"""
for i
for j
c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
end
end
""")
knl = lp.tag_inames(knl, "l:unr")
knl = lp.prioritize_loops(knl, "i,j,k,l")
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
knl = lp.fix_parameters(knl, n=5)
print(lp.generate_code_v2(knl).device_code())
def test_ilp_and_conditionals(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel('{[k]: 0<=k<n}}',
"""
for k
<> Tcond = T[k] < 0.5
if Tcond
cp[k] = 2 * T[k] + Tcond
end
end
""")
knl = lp.fix_parameters(knl, n=200)
knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})
ref_knl = knl
knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp')
lp.auto_test_vs_ref(ref_knl, ctx, knl)
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
def test_unr_and_conditionals(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel('{[k]: 0<=k<n}}',
"""
for k
<> Tcond[k] = T[k] < 0.5
if Tcond[k]
cp[k] = 2 * T[k] + Tcond[k]
end
end
""")
knl = lp.fix_parameters(knl, n=200)
knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})
ref_knl = knl
knl = lp.split_iname(knl, 'k', 2, inner_tag='unr')
lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_constant_array_args(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel('{[k]: 0<=k<n}}',
"""
for k
<> Tcond[k] = T[k] < 0.5
if Tcond[k]
cp[k] = 2 * T[k] + Tcond[k]
end
end
""",
[lp.ConstantArg('T', shape=(200,), dtype=np.float32),
'...'])
knl = lp.fix_parameters(knl, n=200)
@pytest.mark.parametrize("src_order", ["C"])
@pytest.mark.parametrize("tmp_order", ["C", "F"])
def test_temp_initializer(ctx_factory, src_order, tmp_order):
a = np.random.randn(3, 3).copy(order=src_order)
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{[i,j]: 0<=i,j<n}",
"out[i,j] = tmp[i,j]",
[
lp.TemporaryVariable("tmp",
initializer=a,
shape=lp.auto,
scope=lp.temp_var_scope.PRIVATE,
read_only=True,
order=tmp_order),
"..."
])
knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
knl = lp.fix_parameters(knl, n=a.shape[0])
evt, (a2,) = knl(queue, out_host=True)
assert np.array_equal(a, a2)
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
def test_const_temp_with_initializer_not_saved():
knl = lp.make_kernel(
"{[i]: 0<=i<10}",
"""
... gbarrier
out[i] = tmp[i]
""",
[
lp.TemporaryVariable("tmp",
initializer=np.arange(10),
shape=lp.auto,
scope=lp.temp_var_scope.PRIVATE,
read_only=True),
"..."
],
seq_dependencies=True)
knl = lp.preprocess_kernel(knl)
knl = lp.get_one_scheduled_kernel(knl)
knl = lp.save_and_reload_temporaries(knl)
# This ensures no save slot was added.
assert len(knl.temporary_variables) == 1
def test_header_extract():
knl = lp.make_kernel('{[k]: 0<=k<n}}',
"""
for k
T[k] = k**2
end
""",
[lp.GlobalArg('T', shape=(200,), dtype=np.float32),
'...'])
knl = lp.fix_parameters(knl, n=200)
#test C
cknl = knl.copy(target=lp.CTarget())
assert str(lp.generate_header(cknl)[0]) == (
'void loopy_kernel(float *__restrict__ T);')
cuknl = knl.copy(target=lp.CudaTarget())
assert str(lp.generate_header(cuknl)[0]) == (
'extern "C" __global__ void __launch_bounds__(1) '
'loopy_kernel(float *__restrict__ T);')
oclknl = knl.copy(target=lp.PyOpenCLTarget())
assert str(lp.generate_header(oclknl)[0]) == (
'__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
'loopy_kernel(__global float *__restrict__ T);')
def test_scalars_with_base_storage(ctx_factory):
""" Regression test for !50 """
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i]: 0<=i<1}",
"a = 1",
[lp.TemporaryVariable("a", dtype=np.float64,
shape=(), base_storage="base")])
knl(queue, out_host=True)
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
def test_if_else(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i]: 0<=i<50}",
"""
if i % 3 == 0
a[i] = 15
elif i % 3 == 1
a[i] = 11
else
a[i] = 3
end
"""
)
evt, (out,) = knl(queue, out_host=True)
out_ref = np.empty(50)
out_ref[::3] = 15
out_ref[1::3] = 11
out_ref[2::3] = 3
assert np.array_equal(out_ref, out)
knl = lp.make_kernel(
"{ [i]: 0<=i<50}",
"""
for i
if i % 2 == 0
if i % 3 == 0
a[i] = 15
elif i % 3 == 1
a[i] = 11
else
a[i] = 3
end
end
end
"""
)
evt, (out,) = knl(queue, out_host=True)
out_ref = np.zeros(50)
out_ref[1::2] = 4
out_ref[0::6] = 15
out_ref[4::6] = 11
out_ref[2::6] = 3
assert np.array_equal(out_ref, out)
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
def test_tight_loop_bounds(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
["{ [i] : 0 <= i <= 5 }",
"[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"],
"""
for i
for j
out[j] = j
end
end
""",
silenced_warnings="write_race(insn)")
knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
evt, (out,) = knl(queue, out_host=True)
assert (out == np.arange(10)).all()
def test_tight_loop_bounds_codegen():
knl = lp.make_kernel(
["{ [i] : 0 <= i <= 5 }",
"[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
"""
for i
for j
out[j] = j
end
end
""",
silenced_warnings="write_race(insn)",
target=lp.OpenCLTarget())
knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
cgr = lp.generate_code_v2(knl)
#print(cgr.device_code())
Andreas Klöckner
committed
for_loop = \
"for (int j = " \
"(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
"j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)"
Andreas Klöckner
committed
assert for_loop in cgr.device_code()
Andreas Klöckner
committed
def test_unscheduled_insn_detection():
knl = lp.make_kernel(
"{ [i]: 0 <= i < 10 }",
"""
out[i] = i {id=insn1}
""",
"...")
knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
insn1, = lp.find_instructions(knl, "id:insn1")
knl.instructions.append(insn1.copy(id="insn2"))
from loopy.diagnostic import UnscheduledInstructionError
with pytest.raises(UnscheduledInstructionError):
lp.generate_code(knl)
def test_integer_reduction(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
from loopy.kernel.data import temp_var_scope as scopes
var_int = np.random.randint(1000, size=n).astype(vtype)
var_lp = lp.TemporaryVariable('var', initializer=var_int,
read_only=True,
shape=lp.auto)
reductions = [('max', lambda x: x == np.max(var_int)),
('min', lambda x: x == np.min(var_int)),
('sum', lambda x: x == np.sum(var_int)),
('product', lambda x: x == np.prod(var_int)),
('argmax', lambda x: (x[0] == np.max(var_int) and
Nick Curtis
committed
var_int[out[1]] == np.max(var_int))),
('argmin', lambda x: (x[0] == np.min(var_int) and
Nick Curtis
committed
var_int[out[1]] == np.min(var_int)))]
kstr = ("out" if 'arg' not in reduction
else "out[0], out[1]")
kstr += ' = {0}(k, var[k])'.format(reduction)
kstr,
[var_lp, '...'])
knl = lp.fix_parameters(knl, n=200)
Nick Curtis
committed
_, (out,) = knl(queue, out_host=True)
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])