Newer
Older
)
evt, (out,) = knl(queue, out_host=True)
out_ref = np.zeros(50)
out_ref[1::2] = 4
out_ref[0::6] = 15
out_ref[4::6] = 11
out_ref[2::6] = 3
assert np.array_equal(out_ref, out)
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
def test_tight_loop_bounds(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
["{ [i] : 0 <= i <= 5 }",
"[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"],
"""
for i
for j
out[j] = j
end
end
""",
silenced_warnings="write_race(insn)")
knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
evt, (out,) = knl(queue, out_host=True)
assert (out == np.arange(10)).all()
def test_tight_loop_bounds_codegen():
knl = lp.make_kernel(
["{ [i] : 0 <= i <= 5 }",
"[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
"""
for i
for j
out[j] = j
end
end
""",
silenced_warnings="write_race(insn)",
target=lp.OpenCLTarget())
knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
cgr = lp.generate_code_v2(knl)
#print(cgr.device_code())
Andreas Klöckner
committed
for_loop = \
"(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \
"j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)"
Andreas Klöckner
committed
assert for_loop in cgr.device_code()
Andreas Klöckner
committed
def test_unscheduled_insn_detection():
knl = lp.make_kernel(
"{ [i]: 0 <= i < 10 }",
"""
out[i] = i {id=insn1}
""",
"...")
knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
insn1, = lp.find_instructions(knl, "id:insn1")
knl.instructions.append(insn1.copy(id="insn2"))
from loopy.diagnostic import UnscheduledInstructionError
with pytest.raises(UnscheduledInstructionError):
lp.generate_code(knl)
def test_integer_reduction(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
from loopy.kernel.data import temp_var_scope as scopes
var_int = np.random.randint(1000, size=n).astype(vtype)
var_lp = lp.TemporaryVariable('var', initializer=var_int,
read_only=True,
from collections import namedtuple
ReductionTest = namedtuple('ReductionTest', 'kind, check, args')
reductions = [
ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'),
ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'),
ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'),
ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'),
ReductionTest('argmax',
lambda x: (
x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)),
args='var[k], k'),
ReductionTest('argmin',
lambda x: (
x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)),
args='var[k], k')
]
for reduction, function, args in reductions:
kstr = ("out" if 'arg' not in reduction
else "out[0], out[1]")
kstr += ' = {0}(k, {1})'.format(reduction, args)
kstr,
[var_lp, '...'])
knl = lp.fix_parameters(knl, n=200)
Nick Curtis
committed
_, (out,) = knl(queue, out_host=True)
Matt Wala
committed
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
def test_complicated_argmin_reduction(ctx_factory):
cl_ctx = ctx_factory()
knl = lp.make_kernel(
"{[ictr,itgt,idim]: "
"0<=itgt<ntargets "
"and 0<=ictr<ncenters "
"and 0<=idim<ambient_dim}",
"""
for itgt
for ictr
<> dist_sq = sum(idim,
(tgt[idim,itgt] - center[idim,ictr])**2)
<> in_disk = dist_sq < (radius[ictr]*1.05)**2
<> matches = (
(in_disk
and qbx_forced_limit == 0)
or (in_disk
and qbx_forced_limit != 0
and qbx_forced_limit * center_side[ictr] > 0)
)
<> post_dist_sq = if(matches, dist_sq, HUGE)
end
<> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
end
""")
knl = lp.fix_parameters(knl, ambient_dim=2)
knl = lp.add_and_infer_dtypes(knl, {
"tgt,center,radius,HUGE": np.float32,
"center_side,qbx_forced_limit": np.int32,
})
lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
"HUGE": 1e20, "ncenters": 200, "ntargets": 300,
"qbx_forced_limit": 1})
def test_nosync_option_parsing():
knl = lp.make_kernel(
"{[i]: 0 <= i < 10}",
"""
<>t = 1 {id=insn1,nosync=insn1}
t = 2 {id=insn2,nosync=insn1:insn2}
t = 3 {id=insn3,nosync=insn1@local:insn2@global:insn3@any}
t = 4 {id=insn4,nosync_query=id:insn*@local}
t = 5 {id=insn5,nosync_query=id:insn1}
""",
options=lp.Options(allow_terminal_colors=False))
kernel_str = str(knl)
assert "# insn1,no_sync_with=insn1@any" in kernel_str
assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str
assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa
assert "# insn5,no_sync_with=insn1@any" in kernel_str
def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop)
watch_for_barrier = False
seen_barrier = False
loop_level = 0
for sched_item in knl.schedule:
if isinstance(sched_item, RunInstruction):
if sched_item.insn_id == id1:
watch_for_barrier = True
elif sched_item.insn_id == id2:
assert watch_for_barrier
assert seen_barrier
return
elif isinstance(sched_item, Barrier):
if watch_for_barrier and loop_level not in ignore_barriers_in_levels:
seen_barrier = True
elif isinstance(sched_item, EnterLoop):
loop_level += 1
elif isinstance(sched_item, LeaveLoop):
loop_level -= 1
raise RuntimeError("id2 was not seen")
def test_barrier_insertion_near_top_of_loop():
knl = lp.make_kernel(
"{[i,j]: 0 <= i,j < 10 }",
"""
for i
<>a[i] = i {id=ainit}
for j
<>t = a[(i + 1) % 10] {id=tcomp}
<>b[i,j] = a[i] + t {id=bcomp1}
b[i,j] = b[i,j] + 1 {id=bcomp2}
end
end
""",
seq_dependencies=True)
knl = lp.tag_inames(knl, dict(i="l.0"))
knl = lp.set_temporary_scope(knl, "a", "local")
knl = lp.set_temporary_scope(knl, "b", "local")
knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
print(knl)
assert_barrier_between(knl, "ainit", "tcomp")
assert_barrier_between(knl, "tcomp", "bcomp1")
assert_barrier_between(knl, "bcomp1", "bcomp2")
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
def test_barrier_insertion_near_bottom_of_loop():
knl = lp.make_kernel(
["{[i]: 0 <= i < 10 }",
"[jmax] -> {[j]: 0 <= j < jmax}"],
"""
for i
<>a[i] = i {id=ainit}
for j
<>b[i,j] = a[i] + t {id=bcomp1}
b[i,j] = b[i,j] + 1 {id=bcomp2}
end
a[i] = i + 1 {id=aupdate}
end
""",
seq_dependencies=True)
knl = lp.tag_inames(knl, dict(i="l.0"))
knl = lp.set_temporary_scope(knl, "a", "local")
knl = lp.set_temporary_scope(knl, "b", "local")
knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
print(knl)
assert_barrier_between(knl, "bcomp1", "bcomp2")
assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
def test_multi_argument_reduction_type_inference():
from loopy.type_inference import TypeInferenceMapper
from loopy.library.reduction import SegmentedSumReductionOperation
from loopy.types import to_loopy_type
op = SegmentedSumReductionOperation()
knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
int32 = to_loopy_type(np.int32)
expr = lp.symbolic.Reduction(
operation=op,
inames=("i",),
expr=lp.symbolic.Reduction(
operation=op,
inames="j",
expr=(1, 2),
allow_simultaneous=True),
allow_simultaneous=True)
t_inf_mapper = TypeInferenceMapper(knl)
assert (
t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
== [(int32, int32)])
def test_multi_argument_reduction_parsing():
from loopy.symbolic import parse, Reduction
assert isinstance(
parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr,
Reduction)
def test_global_barrier_order_finding():
knl = lp.make_kernel(
"{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
"""
for i
for itrip
... gbarrier {id=top}
<> z[i] = z[i+1] + z[i] {id=wr_z,dep=top}
<> v[i] = 11 {id=wr_v,dep=top}
... gbarrier {dep=wr_z:wr_v,id=yoink}
z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink}
end
... nop {id=nop}
... gbarrier {dep=iupd,id=postloop}
z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop}
end
""")
assert lp.get_global_barrier_order(knl) == ("top", "yoink", "postloop")
for insn, barrier in (
("nop", None),
("top", None),
("wr_z", "top"),
("wr_v", "top"),
("yoink", "top"),
("postloop", "yoink"),
("zzzv", "postloop")):
assert lp.find_most_recent_global_barrier(knl, insn) == barrier
def test_global_barrier_error_if_unordered():
# FIXME: Should be illegal to declare this
knl = lp.make_kernel("{[i]: 0 <= i < 10}",
"""
... gbarrier
... gbarrier
""")
from loopy.diagnostic import LoopyError
with pytest.raises(LoopyError):
lp.get_global_barrier_order(knl)
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
def test_struct_assignment(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
bbhit = np.dtype([
("tmin", np.float32),
("tmax", np.float32),
("bi", np.int32),
("hit", np.int32)])
bbhit, bbhit_c_decl = cl.tools.match_dtype_to_c_struct(
ctx.devices[0], "bbhit", bbhit)
bbhit = cl.tools.get_or_register_dtype('bbhit', bbhit)
preamble = bbhit_c_decl
knl = lp.make_kernel(
"{ [i]: 0<=i<N }",
"""
for i
result[i].hit = i % 2
result[i].tmin = i
result[i].tmax = i+10
result[i].bi = i
end
""",
[
lp.GlobalArg("result", shape=("N",), dtype=bbhit),
"..."],
preambles=[("000", preamble)])
knl = lp.set_options(knl, write_cl=True)
knl(queue, N=200)
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
def test_inames_conditional_generation(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
"{[i,j,k]: 0 < k < i and 0 < j < 10 and 0 < i < 10}",
"""
for k
... gbarrier
<>tmp1 = 0
end
for j
... gbarrier
<>tmp2 = i
end
""",
"...",
seq_dependencies=True)
knl = lp.tag_inames(knl, dict(i="g.0"))
with cl.CommandQueue(ctx) as queue:
knl(queue)
def test_kernel_var_name_generator():
knl = lp.make_kernel(
"{[i]: 0 <= i <= 10}",
"""
<>a = 0
<>b_s0 = 0
""")
vng = knl.get_var_name_generator()
assert vng("a_s0") != "a_s0"
assert vng("b") != "b"
def test_fixed_parameters(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"[n] -> {[i]: 0 <= i < n}",
"""
<>tmp[i] = i
tmp[0] = 0
""",
fixed_parameters=dict(n=1))
knl(queue)
def test_parameter_inference():
knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "")
assert knl.all_params() == set(["n"])
def test_execution_backend_can_cache_dtypes(ctx_factory):
# When the kernel is invoked, the execution backend uses it as a cache key
# for the type inference and scheduling cache. This tests to make sure that
# dtypes in the kernel can be cached, even though they may not have a
# target.
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i")
knl = lp.add_dtypes(knl, dict(tmp=int))
knl(queue)
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from py.test.cmdline import main
main([__file__])