Newer
Older
evt, (out,) = knl(queue, out_host=True)
out_ref = np.empty(50)
out_ref[::3] = 15
out_ref[1::3] = 11
out_ref[2::3] = 3
assert np.array_equal(out_ref, out)
knl = lp.make_kernel(
"{ [i]: 0<=i<50}",
"""
for i
if i % 2 == 0
if i % 3 == 0
a[i] = 15
elif i % 3 == 1
a[i] = 11
else
a[i] = 3
end
end
end
"""
)
evt, (out,) = knl(queue, out_host=True)
out_ref = np.zeros(50)
out_ref[1::2] = 4
out_ref[0::6] = 15
out_ref[4::6] = 11
out_ref[2::6] = 3
assert np.array_equal(out_ref, out)
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
def test_tight_loop_bounds(ctx_factory):
    """Execute a kernel whose ``j`` domain is parameterized by ``i``.

    ``i`` is split by 5 and tagged ``l.0``/``g.0``; the kernel writes
    ``out[j] = j`` and the result is compared against ``np.arange(10)``.
    """
    queue = cl.CommandQueue(ctx_factory())

    domains = [
        "{ [i] : 0 <= i <= 5 }",
        "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }",
    ]
    # Kernel text is kept flush-left so the literal stays unchanged.
    kernel = lp.make_kernel(
        domains,
        """
for i
for j
out[j] = j
end
end
""",
        silenced_warnings="write_race(insn)")
    kernel = lp.split_iname(kernel, "i", 5, inner_tag="l.0", outer_tag="g.0")

    _, (result,) = kernel(queue, out_host=True)

    assert (result == np.arange(10)).all()
def test_tight_loop_bounds_codegen():
    """Check the loop bounds emitted for an ``i``-dependent ``j`` domain.

    The kernel is built for ``lp.OpenCLTarget()``, ``i`` is split by 5 and
    tagged ``l.0``/``g.0``, and the generated device code must contain the
    expected bound expressions for the ``j`` loop.
    """
    knl = lp.make_kernel(
        ["{ [i] : 0 <= i <= 5 }",
         "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
        """
for i
for j
out[j] = j
end
end
""",
        silenced_warnings="write_race(insn)",
        target=lp.OpenCLTarget())

    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")

    cgr = lp.generate_code_v2(knl)
    #print(cgr.device_code())

    # Substring of the generated loop header that carries both bounds.
    for_loop = (
        "(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); "
        "j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)")

    assert for_loop in cgr.device_code()
Andreas Klöckner
committed
def test_unscheduled_insn_detection():
    """Code generation must reject an instruction added after scheduling.

    A copy of ``insn1`` (renamed ``insn2``) is appended to an already
    scheduled kernel; ``lp.generate_code`` is then expected to raise
    ``UnscheduledInstructionError``.
    """
    from loopy.diagnostic import UnscheduledInstructionError

    unscheduled = lp.make_kernel(
        "{ [i]: 0 <= i < 10 }",
        """
out[i] = i {id=insn1}
""",
        "...")
    preprocessed = lp.preprocess_kernel(unscheduled)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)

    (template,) = lp.find_instructions(scheduled, "id:insn1")
    scheduled.instructions.append(template.copy(id="insn2"))

    with pytest.raises(UnscheduledInstructionError):
        lp.generate_code(scheduled)
def test_integer_reduction(ctx_factory):
# Builds a random integer array `var_int`, wraps it in a read-only
# lp.TemporaryVariable, and pairs each reduction name (max, min, sum,
# product, argmax, argmin) with a lambda that checks a result against the
# corresponding NumPy value.
# NOTE(review): this copy of the function looks incomplete -- `n` and `vtype`
# are used without a visible definition, the lp.TemporaryVariable(...) call
# is never closed, and the `kstr, [var_lp, '...'])` lines further down have
# no visible opening call. Confirm against the upstream file before editing.
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
from loopy.kernel.data import temp_var_scope as scopes
var_int = np.random.randint(1000, size=n).astype(vtype)
var_lp = lp.TemporaryVariable('var', initializer=var_int,
read_only=True,
from collections import namedtuple
# One record per reduction: its name, a predicate on the kernel output, and
# the argument text spliced into the kernel statement.
ReductionTest = namedtuple('ReductionTest', 'kind, check, args')
reductions = [
ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'),
ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'),
ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'),
ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'),
# NOTE(review): the argmax/argmin checks index with `out[1]` rather than the
# lambda argument `x`; they read the enclosing-scope `out` assigned below.
# Presumably `x[1]` was intended -- confirm.
ReductionTest('argmax',
lambda x: (
x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)),
args='var[k], k'),
ReductionTest('argmin',
lambda x: (
x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)),
args='var[k], k')
]
for reduction, function, args in reductions:
# arg-reductions assign two outputs, the others a single one
kstr = ("out" if 'arg' not in reduction
else "out[0], out[1]")
kstr += ' = {0}(k, {1})'.format(reduction, args)
kstr,
[var_lp, '...'])
knl = lp.fix_parameters(knl, n=200)
# NOTE(review): the next two lines are not Python; they look like
# commit-attribution residue from a web view -- TODO remove once confirmed.
Nick Curtis
committed
_, (out,) = knl(queue, out_host=True)
Matt Wala
committed
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
def test_complicated_argmin_reduction(ctx_factory):
    """Auto-test a kernel that nests a ``sum`` inside an ``argmin`` reduction.

    ``ambient_dim`` is fixed to 2, dtypes are supplied for the inputs, and
    the kernel is passed to ``lp.auto_test_vs_ref`` as both the reference
    and the kernel under test.
    """
    cl_ctx = ctx_factory()

    domain = (
        "{[ictr,itgt,idim]: "
        "0<=itgt<ntargets "
        "and 0<=ictr<ncenters "
        "and 0<=idim<ambient_dim}")

    kernel = lp.make_kernel(
        domain,
        """
for itgt
for ictr
<> dist_sq = sum(idim,
(tgt[idim,itgt] - center[idim,ictr])**2)
<> in_disk = dist_sq < (radius[ictr]*1.05)**2
<> matches = (
(in_disk
and qbx_forced_limit == 0)
or (in_disk
and qbx_forced_limit != 0
and qbx_forced_limit * center_side[ictr] > 0)
)
<> post_dist_sq = if(matches, dist_sq, HUGE)
end
<> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
end
""")

    kernel = lp.fix_parameters(kernel, ambient_dim=2)

    arg_dtypes = {
        "tgt,center,radius,HUGE": np.float32,
        "center_side,qbx_forced_limit": np.int32,
    }
    kernel = lp.add_and_infer_dtypes(kernel, arg_dtypes)

    run_parameters = {
        "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
        "qbx_forced_limit": 1}
    lp.auto_test_vs_ref(kernel, cl_ctx, kernel, parameters=run_parameters)
def test_nosync_option_parsing():
    """Check how ``nosync``/``nosync_query`` options show up in ``str(knl)``.

    Each instruction's option is expected to appear as a
    ``no_sync_with=...`` annotation in the kernel's string form.
    """
    kernel = lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """
<>t = 1 {id=insn1,nosync=insn1}
t = 2 {id=insn2,nosync=insn1:insn2}
t = 3 {id=insn3,nosync=insn1@local:insn2@global:insn3@any}
t = 4 {id=insn4,nosync_query=id:insn*@local}
t = 5 {id=insn5,nosync_query=id:insn1}
""",
        options=lp.Options(allow_terminal_colors=False))
    rendered = str(kernel)

    expected_annotations = (
        "# insn1,no_sync_with=insn1@any",
        "# insn2,no_sync_with=insn1@any:insn2@any",
        "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any",
        "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local",  # noqa
        "# insn5,no_sync_with=insn1@any",
    )
    for annotation in expected_annotations:
        assert annotation in rendered
def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
    """Assert that ``knl.schedule`` has a barrier between *id1* and *id2*.

    Walks the schedule in order. Barriers only count once the instruction
    *id1* has been seen, and only when the current loop nesting level is not
    listed in *ignore_barriers_in_levels*.

    :raises AssertionError: if *id2* is reached before *id1*, or without a
        counted barrier in between.
    :raises RuntimeError: if *id2* never appears in the schedule.
    """
    from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop)

    armed = False
    barrier_found = False
    depth = 0

    for item in knl.schedule:
        if isinstance(item, RunInstruction):
            if item.insn_id == id1:
                armed = True
            elif item.insn_id == id2:
                assert armed
                assert barrier_found
                return
            continue

        if isinstance(item, Barrier):
            if armed and depth not in ignore_barriers_in_levels:
                barrier_found = True
            continue

        # Track loop nesting so barriers can be filtered by level.
        if isinstance(item, EnterLoop):
            depth += 1
        elif isinstance(item, LeaveLoop):
            depth -= 1

    raise RuntimeError("id2 was not seen")
def test_barrier_insertion_near_top_of_loop():
    """Expect barriers between consecutive instructions of the ``j`` loop.

    ``a`` and ``b`` are made local temporaries and ``i`` is tagged ``l.0``;
    after scheduling, a barrier must separate ``ainit``/``tcomp``,
    ``tcomp``/``bcomp1`` and ``bcomp1``/``bcomp2``.
    """
    kernel = lp.make_kernel(
        "{[i,j]: 0 <= i,j < 10 }",
        """
for i
<>a[i] = i {id=ainit}
for j
<>t = a[(i + 1) % 10] {id=tcomp}
<>b[i,j] = a[i] + t {id=bcomp1}
b[i,j] = b[i,j] + 1 {id=bcomp2}
end
end
""",
        seq_dependencies=True)

    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.set_temporary_scope(kernel, "a", "local")
    kernel = lp.set_temporary_scope(kernel, "b", "local")
    kernel = lp.get_one_scheduled_kernel(lp.preprocess_kernel(kernel))

    print(kernel)

    for earlier, later in (
            ("ainit", "tcomp"),
            ("tcomp", "bcomp1"),
            ("bcomp1", "bcomp2")):
        assert_barrier_between(kernel, earlier, later)
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
def test_barrier_insertion_near_bottom_of_loop():
    """Expect a barrier after the ``j`` loop, before ``a`` is updated.

    Checks a barrier between ``bcomp1`` and ``bcomp2``, and one between
    ``ainit`` and ``aupdate`` while ignoring barriers at loop level 1.
    """
    domains = [
        "{[i]: 0 <= i < 10 }",
        "[jmax] -> {[j]: 0 <= j < jmax}",
    ]
    kernel = lp.make_kernel(
        domains,
        """
for i
<>a[i] = i {id=ainit}
for j
<>b[i,j] = a[i] + t {id=bcomp1}
b[i,j] = b[i,j] + 1 {id=bcomp2}
end
a[i] = i + 1 {id=aupdate}
end
""",
        seq_dependencies=True)

    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    for temporary in ("a", "b"):
        kernel = lp.set_temporary_scope(kernel, temporary, "local")
    kernel = lp.get_one_scheduled_kernel(lp.preprocess_kernel(kernel))

    print(kernel)

    assert_barrier_between(kernel, "bcomp1", "bcomp2")
    assert_barrier_between(
        kernel, "ainit", "aupdate", ignore_barriers_in_levels=[1])
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
def test_barrier_in_overridden_get_grid_size_expanded_kernel():
    """Generate code for a barrier'd kernel with an overridden grid size.

    The kernel's ``i`` iname is split by a vector size larger than its
    domain, and the local grid size is then forced to that vector size via
    ``overridden_get_grid_sizes_for_insn_ids``. The test passes if
    ``lp.generate_code_v2`` completes.
    """
    from loopy.kernel.data import temp_var_scope as scopes

    # make simple barrier'd kernel
    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
        """
for i
a[i] = i {id=a}
... lbarrier {id=barrier}
b[i + 1] = a[i] {nosync=a}
end
""",
        [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C',
                              scope=scopes.LOCAL),
         lp.GlobalArg("b", np.float32, shape=(11,), order='C')],
        seq_dependencies=True)

    # split into kernel w/ vecsize larger than iname domain
    vecsize = 16
    knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')

    # artificially expand via overridden_get_grid_sizes_for_insn_ids
    class GridOverride(object):
        """Callable that reports ``clean``'s global size and a fixed local size."""

        def __init__(self, clean, vecsize=vecsize):
            self.clean = clean
            self.vecsize = vecsize

        def __call__(self, insn_ids, ignore_auto=True):
            # Keep the un-overridden kernel's global size; replace the local
            # size with the vector width.
            gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
            return gsize, (self.vecsize,)

    knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
        knl.copy(), vecsize))

    # make sure we can generate the code
    lp.generate_code_v2(knl)
def test_multi_argument_reduction_type_inference():
    """Type inference on a nested two-argument reduction yields two int32s.

    A segmented-sum reduction over ``j`` of the tuple ``(1, 2)`` is nested
    inside one over ``i``; the mapper must return ``[(int32, int32)]``.
    """
    from loopy.type_inference import TypeInferenceMapper
    from loopy.library.reduction import SegmentedSumReductionOperation
    from loopy.types import to_loopy_type

    op = SegmentedSumReductionOperation()
    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
    int32 = to_loopy_type(np.int32)

    inner_reduction = lp.symbolic.Reduction(
        operation=op,
        inames="j",
        expr=(1, 2),
        allow_simultaneous=True)
    nested_reduction = lp.symbolic.Reduction(
        operation=op,
        inames=("i",),
        expr=inner_reduction,
        allow_simultaneous=True)

    infer = TypeInferenceMapper(knl)
    inferred = infer(nested_reduction, return_tuple=True, return_dtype_set=True)

    assert inferred == [(int32, int32)]
def test_multi_argument_reduction_parsing():
    """The inner expression of a nested ``reduce`` must parse as a Reduction."""
    from loopy.symbolic import parse, Reduction

    parsed = parse("reduce(argmax, i, reduce(argmax, j, i, j))")

    assert isinstance(parsed.expr, Reduction)
def test_global_barrier_order_finding():
    """Check global barrier ordering and most-recent-barrier lookup.

    The kernel declares three global barriers (``top``, ``yoink``,
    ``postloop``); their order and the most recent barrier preceding each
    listed instruction are compared against fixed expectations.
    """
    kernel = lp.make_kernel(
        "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
        """
for i
for itrip
... gbarrier {id=top}
<> z[i] = z[i+1] + z[i] {id=wr_z,dep=top}
<> v[i] = 11 {id=wr_v,dep=top}
... gbarrier {dep=wr_z:wr_v,id=yoink}
z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink}
end
... nop {id=nop}
... gbarrier {dep=iupd,id=postloop}
z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop}
end
""")

    assert lp.get_global_barrier_order(kernel) == ("top", "yoink", "postloop")

    # (instruction id, id of the most recent global barrier before it)
    expected_predecessors = (
        ("nop", None),
        ("top", None),
        ("wr_z", "top"),
        ("wr_v", "top"),
        ("yoink", "top"),
        ("postloop", "yoink"),
        ("zzzv", "postloop"),
    )
    for insn_id, expected_barrier in expected_predecessors:
        found = lp.find_most_recent_global_barrier(kernel, insn_id)
        assert found == expected_barrier
def test_global_barrier_error_if_unordered():
    """Two global barriers with no stated ordering must raise ``LoopyError``."""
    from loopy.diagnostic import LoopyError

    # FIXME: Should be illegal to declare this
    kernel = lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """
... gbarrier
... gbarrier
""")

    with pytest.raises(LoopyError):
        lp.get_global_barrier_order(kernel)
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
def test_struct_assignment(ctx_factory):
    """Run a kernel that assigns to the fields of a struct-typed array.

    A four-field record dtype is matched to a C struct for the first device
    of the context, registered under the name ``bbhit``, and its C
    declaration is passed to the kernel as a preamble.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    record_fields = [
        ("tmin", np.float32),
        ("tmax", np.float32),
        ("bi", np.int32),
        ("hit", np.int32)]
    host_dtype = np.dtype(record_fields)

    matched_dtype, struct_decl = cl.tools.match_dtype_to_c_struct(
        ctx.devices[0], "bbhit", host_dtype)
    registered_dtype = cl.tools.get_or_register_dtype('bbhit', matched_dtype)

    kernel_args = [
        lp.GlobalArg("result", shape=("N",), dtype=registered_dtype),
        "..."]

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<N }",
        """
for i
result[i].hit = i % 2
result[i].tmin = i
result[i].tmax = i+10
result[i].bi = i
end
""",
        kernel_args,
        preambles=[("000", struct_decl)])

    kernel = lp.set_options(kernel, write_cl=True)
    kernel(queue, N=200)
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
def test_inames_conditional_generation(ctx_factory):
    """Run a kernel with global barriers inside loops under a ``g.0`` iname.

    The ``k`` domain depends on ``i``, which is tagged ``g.0``; the test
    passes if the kernel can be invoked.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j,k]: 0 < k < i and 0 < j < 10 and 0 < i < 10}",
        """
for k
... gbarrier
<>tmp1 = 0
end
for j
... gbarrier
<>tmp2 = i
end
""",
        "...",
        seq_dependencies=True)
    kernel = lp.tag_inames(kernel, {"i": "g.0"})

    with cl.CommandQueue(ctx) as queue:
        kernel(queue)
def test_kernel_var_name_generator():
    """The kernel's name generator must not hand back the requested names.

    The kernel defines temporaries ``a`` and ``b_s0``; requesting ``a_s0``
    and ``b`` must each produce a different name.
    """
    kernel = lp.make_kernel(
        "{[i]: 0 <= i <= 10}",
        """
<>a = 0
<>b_s0 = 0
""")

    make_name = kernel.get_var_name_generator()

    for requested in ("a_s0", "b"):
        assert make_name(requested) != requested
def test_fixed_parameters(ctx_factory):
    """Run a kernel whose domain parameter ``n`` is fixed to 1 at creation."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        "[n] -> {[i]: 0 <= i < n}",
        """
<>tmp[i] = i
tmp[0] = 0
""",
        fixed_parameters={"n": 1})

    kernel(queue)
def test_execution_backend_can_cache_dtypes(ctx_factory):
    """Invoke a kernel that carries a dtype added via ``lp.add_dtypes``."""
    # Invoking the kernel makes the execution backend use it as a cache key
    # for the type inference and scheduling cache. This checks that dtypes
    # stored in the kernel can be cached even though they may not have a
    # target.
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i")
    kernel = lp.add_dtypes(kernel, {"tmp": int})

    kernel(queue)
if __name__ == "__main__":
    # Script entry point: with an argument, run it as Python source (e.g. a
    # single test call); without one, run every test in this file via pytest.
    if len(sys.argv) > 1:
        # NOTE(review): executes arbitrary code from the command line; only
        # acceptable because this is a developer-invoked test script.
        exec(sys.argv[1])
    else:
        # pytest.main is the supported entry point; py.test.cmdline needs the
        # legacy ``py`` package.
        from pytest import main
        main([__file__])