Newer
Older
Andreas Klöckner
committed
def test_precompute_nested_subst(ctx_factory):
    """Precompute rule ``D`` (which uses rule ``E``) and compare with the reference.

    Also asserts that neither ``D`` (before precomputing) nor the single
    remaining ``E`` rule (afterwards) depends on ``i_inner``.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    # The untransformed kernel is what auto_test_vs_ref compares against.
    reference = kernel

    kernel = lp.tag_inames(kernel, {"j": "g.1"})
    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    d_deps = get_dependencies(kernel.substitutions["D"].expression)
    assert "i_inner" not in d_deps

    kernel = lp.precompute(kernel, "D", "i_inner")

    # Exactly one rule whose name starts with "E" may be left over.
    e_rule_count = sum(
        1 for name in kernel.substitutions if name.startswith("E"))
    assert e_rule_count == 1

    # The surviving rule is expected to refer to the newly created prefetch
    # inames rather than the earlier 'i_inner'.
    e_deps = get_dependencies(kernel.substitutions["E"].expression)
    assert "i_inner" not in e_deps

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 12345})
Andreas Klöckner
committed
def test_poisson(ctx_factory):
# NOTE(review): this block looks incomplete as extracted -- the make_kernel(
# call below is followed directly by bare constraint text (no opening quote
# for the domain string), the closing triple quote further down has no visible
# opener, and the left-hand side of the final assignment is not shown.
# Restore the missing lines from version control before editing; they are
# deliberately not reconstructed here.
# Stolen from Peter Coogan and Rob Kirby for FEM assembly
ctx = ctx_factory()
nbf = 5
nqp = 5
sdim = 3
knl = lp.make_kernel(
0 <= c < nels and \
0 <= i < nbf and \
0 <= j < nbf and \
0 <= k < nqp and \
dpsi(bf,k0,dir) := \
simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
""",
assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
ref_knl = knl
knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])
def variant_1(knl):
knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
knl = lp.set_loop_priority(knl, "c,i,j")
return knl
def variant_2(knl):
knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
knl = lp.set_loop_priority(knl, "c,i,j")
return knl
def add_types(knl):
return lp.add_and_infer_dtypes(knl, dict(
w=np.float32,
J=np.float32,
DPsi=np.float32,
DFinv=np.float32,
))
for variant in [
#variant_1,
variant_2
]:
knl = variant(knl)
lp.auto_test_vs_ref(
add_types(ref_knl), ctx, add_types(knl),
parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
def test_auto_test_can_detect_problems(ctx_factory):
    """Expect ``auto_test_vs_ref`` to raise ``AutomaticTestFailure``.

    The kernel under test has its inames ``i`` and ``j`` linked into ``i0``;
    the comparison against the unlinked reference is required to fail.
    """
    ctx = ctx_factory()

    reference = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            """
            a[i,j] = 25
            """),
        {"a": np.float32})

    linked = lp.link_inames(reference, "i,j", "i0")

    from loopy.diagnostic import AutomaticTestFailure
    with pytest.raises(AutomaticTestFailure):
        lp.auto_test_vs_ref(reference, ctx, linked, parameters={"n": 123})
def test_generate_c_snippet():
    """Build a two-instruction reduction kernel for the C target and print its body.

    The instructions are assembled from pymbolic expressions rather than
    parsed from a string. Nothing is asserted; the test passes when
    preprocessing, scheduling and body generation complete without raising.
    """
    from loopy.target.c import CTarget

    from pymbolic import var
    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    # 'Instr' was used below without any visible definition, which would be a
    # NameError. NOTE(review): bound to lp.Assignment, the assignment class
    # used elsewhere in this file -- confirm against version control.
    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
        ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
        ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute ``D1`` and ``D2`` into the same ``ii,jj`` inames and compare.

    Both substitution rules are swept over index ranges of equal size
    (``n`` is fixed to 13), and both precomputes name ``ii,jj`` as their
    precompute inames.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)
    kernel = lp.add_and_infer_dtypes(
        kernel, dict(u=np.float32, D1=np.float32, D2=np.float32))
    kernel = lp.fix_parameters(kernel, n=13)

    reference = kernel

    # First turn both matrix accesses into substitution rules ...
    for matrix in ("D1", "D2"):
        kernel = lp.extract_subst(
            kernel, matrix + "_subst", matrix + "[ii,jj]", parameters="ii,jj")

    # ... then precompute each rule, reusing 'ii,jj' for both.
    for rule, sweep in (("D1_subst", "i,j"), ("D2_subst", "i,k")):
        kernel = lp.precompute(
            kernel, rule, sweep, default_tag="for", precompute_inames="ii,jj")

    kernel = lp.set_loop_priority(kernel, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"E": 200})
def test_precompute_with_preexisting_inames_fail():
    """Expect ``LoopyError`` when a second precompute reuses ``ii,jj``.

    Here ``k`` ranges up to ``2*n`` while ``j`` only ranges up to ``n``, and
    the second precompute (sweeping ``i,k``) asks for the inames already
    used by the first one (sweeping ``i,j``).
    """
    kernel = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)
    kernel = lp.add_and_infer_dtypes(
        kernel, dict(u=np.float32, D1=np.float32, D2=np.float32))
    kernel = lp.fix_parameters(kernel, n=13)

    for matrix in ("D1", "D2"):
        kernel = lp.extract_subst(
            kernel, matrix + "_subst", matrix + "[ii,jj]", parameters="ii,jj")

    # The first precompute is expected to succeed.
    kernel = lp.precompute(
        kernel, "D1_subst", "i,j", default_tag="for",
        precompute_inames="ii,jj")

    # The second one must be rejected.
    with pytest.raises(lp.LoopyError):
        lp.precompute(
            kernel, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")
def test_vectorize(ctx_factory):
    """Compare a kernel with ``i_inner`` tagged ``vec`` against one tagged ``unr``."""
    ctx = ctx_factory()

    base = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    base = lp.add_and_infer_dtypes(base, {"b": np.float32})
    base = lp.set_array_dim_names(base, "a,b", "i")
    base = lp.split_array_dim(
        base, [("a", 0), ("b", 0)], 4,
        split_kwargs={"slabs": (0, 1)})
    base = lp.tag_data_axes(base, "a,b", "c,vec")

    # Reference and test kernel differ only in the tag given to 'i_inner'.
    reference = lp.tag_inames(base, {"i_inner": "unr"})
    vectorized = lp.tag_inames(base, {"i_inner": "vec"})

    vectorized = lp.preprocess_kernel(vectorized)
    vectorized = lp.get_one_scheduled_kernel(vectorized)

    # The generated code is not inspected; this only checks generation runs.
    _code, _info = lp.generate_code(vectorized)

    lp.auto_test_vs_ref(reference, ctx, vectorized, parameters={"n": 30})
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
def test_alias_temporaries(ctx_factory):
    """Precompute three rules, alias their temporaries, and compare with the reference."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]
        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)
    kernel = lp.add_and_infer_dtypes(kernel, dict(a=np.float32))

    reference = kernel

    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0", inner_tag="l.0")

    rule_names = ("times2", "times3", "times4")
    for rule in rule_names:
        kernel = lp.precompute(kernel, rule, "i_inner")

    # The temporaries passed to alias_temporaries carry a "_0" suffix.
    kernel = lp.alias_temporaries(
        kernel, [rule + "_0" for rule in rule_names])

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 30})
def test_fusion():
    """Fuse an element-wise kernel with a reduction kernel and print the result."""
    elementwise = lp.make_kernel(
        ''' { [i]: 0<=i<n } ''',
        ''' exp[i] = pow(E, z[i])''',
        assumptions="n>0")

    reduction = lp.make_kernel(
        '{ [j]: 0<=j<n }',
        'out2 = sum(j, exp[j])',
        assumptions='n>0')

    fused = lp.fuse_kernels([elementwise, reduction])

    # Nothing is asserted; the test passes if fusion and printing succeed.
    print(fused)
def test_sci_notation_literal(ctx_factory):
    """Check that the literal ``1e-12`` in kernel code reaches the output array."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    kernel = lp.set_options(
        lp.make_kernel(
            ''' { [i]: 0<=i<12 } ''',
            ''' out[i] = 1e-12'''),
        write_cl=True)

    _, (out,) = kernel(queue)

    deviation = np.abs(out.get() - 1e-12)
    assert (deviation < 1e-20).all()
def test_rename_argument(ctx_factory):
    """Rename argument ``a`` to ``b`` and run the kernel passing ``b``."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    renamed = lp.rename_argument(
        lp.make_kernel(
            '''{ [i]: 0<=i<n }''',
            '''out[i] = a + 2'''),
        "a", "b")

    # With b = 12 every entry of 'out' is expected to be 12 + 2.
    _, (out,) = renamed(queue, b=np.float32(12), n=20)

    deviation = np.abs(out.get() - 14)
    assert (deviation < 1e-8).all()
def test_to_batched(ctx_factory):
    """Batch a matrix-vector kernel over ``out`` and ``x`` and run it once.

    No result is checked; the test passes if the batched kernel executes.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    matvec = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j]*x[j])''')

    batched = lp.to_batched(matvec, "nbatches", "out,x")

    # 'a' is a single 5x5 matrix, 'x' carries 7 vectors of length 5.
    matrix = np.random.randn(5, 5)
    vectors = np.random.randn(7, 5)

    batched(queue, a=matrix, x=vectors)
def test_variable_size_temporary():
    """Generate code for every schedule of a kernel with a prefetch of ``a[:,:]``.

    The point is that code generation must succeed even when there are
    variable-length arrays.
    """
    kernel = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j])''')
    kernel = lp.add_and_infer_dtypes(kernel, dict(a=np.float32))
    kernel = lp.add_prefetch(kernel, "a[:,:]", default_tag=None)

    kernel = lp.preprocess_kernel(kernel)

    for scheduled in lp.generate_loop_schedules(kernel):
        lp.generate_code(scheduled)
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
def test_indexof(ctx_factory):
    """Fill a 5x5 array with ``indexof`` of each entry and compare with 0..24."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    kernel = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<5 } ''',
        ''' out[i,j] = indexof(out[i,j])''')
    kernel = lp.set_options(kernel, write_cl=True)

    _, (device_out,) = kernel(queue)
    host_out = device_out.get()

    # Flattened in C order, the array must read 0, 1, ..., 24.
    flat = host_out.ravel(order="C")
    assert np.array_equal(flat, np.arange(25))
def test_indexof_vec(ctx_factory):
    """Run a kernel using ``indexof_vec`` with iname ``i`` tagged ``vec``.

    Skipped when the first device's platform name starts with "Portable".
    The output is not checked.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    platform_name = ctx.devices[0].platform.name
    if platform_name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    kernel = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j,k<4 } ''',
        ''' out[i,j,k] = indexof_vec(out[i,j,k])''')
    kernel = lp.tag_inames(kernel, {"i": "vec"})
    kernel = lp.tag_data_axes(kernel, "out", "vec,c,c")
    kernel = lp.set_options(kernel, write_cl=True)

    _, (out,) = kernel(queue)

    # A result check existed here but is disabled:
    #out = out.get()
    #assert np.array_equal(out.ravel(order="C"), np.arange(25))
Andreas Klöckner
committed
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel and run three variants.

    The fused kernel is run as is, again after turning the assignment to
    ``f`` into a substitution rule, and finally after splitting the renamed
    iname and precomputing ``f_subst``. No numerical result is checked.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    points = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = points[1] - points[0]
    u = cl.clmath.sin(cl.array.to_device(queue, points))

    difference = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
        ])

    fused = lp.fuse_kernels(
        [difference, flux],
        data_flow=[("f", 1, 0)])

    fused = lp.set_options(fused, write_cl=True)
    _evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    fused = lp.assignment_to_subst(fused, "f")
    fused = lp.set_options(fused, write_cl=True)

    # The actual check: automatically generated shape expressions read
    # '2+n' while the ones declared above read 'n+2'. This run only works
    # if loopy recognizes the two as equal.
    _evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    shifted = lp.affine_map_inames(fused, "i", "inew", "inew+1=i")
    split = lp.split_iname(
        shifted, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomputed = lp.precompute(
        split, "f_subst", "inew_inner", fetch_bounding_box=True)
    precomputed = lp.tag_inames(precomputed, {"j_0_outer": "unr"})
    precomputed = lp.set_options(precomputed, return_dict=True)

    _evt, _ = precomputed(queue, u=u, h=h)
def test_is_expression_equal():
    """Check ``is_expression_equal`` on reordered and expanded polynomials."""
    from loopy.symbolic import is_expression_equal
    from pymbolic import var

    x = var("x")
    y = var("y")

    # Each pair holds two spellings of the same expression.
    equal_pairs = [
        (x+2, 2+x),
        ((x+2)**2, x**2 + 4*x + 4),
        ((x+y)**2, x**2 + 2*x*y + y**2),
    ]

    for left, right in equal_pairs:
        assert is_expression_equal(left, right)
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
def test_collect_common_factors(ctx_factory):
    """Apply ``collect_common_factors_on_increment`` to ``out_tmp`` and compare."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<n}",
        """
        <float32> out_tmp = 0 {id=out_init,inames=i}
        out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
        out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init}
        out[i] = out_tmp {dep=out_up1:out_up2}
        """)

    input_dtypes = {
        "a": np.float32,
        "alpha": np.float32,
        "b1": np.float32,
        "b2": np.float32,
    }
    kernel = lp.add_and_infer_dtypes(kernel, input_dtypes)

    reference = kernel

    kernel = lp.split_iname(kernel, "i", 256, outer_tag="g.0", inner_tag="l.0")
    kernel = lp.collect_common_factors_on_increment(kernel, "out_tmp")

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 13})
def test_ispc_target(occa_mode=False):
    """Generate and print device and host code for the ISPC target.

    :arg occa_mode: passed through to ``ISPCTarget(occa_mode=...)``.

    Nothing is asserted; the test passes if code generation completes.
    """
    from loopy.target.ispc import ISPCTarget

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [
            lp.GlobalArg("out,a", np.float32, shape=lp.auto),
            "..."
        ],
        target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    # The call chain had one closing parenthesis too many (a nested call line
    # was missing). Scheduling is restored here, mirroring test_cuda_target.
    codegen_result = lp.generate_code_v2(
        lp.get_one_scheduled_kernel(
            lp.preprocess_kernel(knl)))

    print(codegen_result.device_code())
    print(codegen_result.host_code())
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
def test_cuda_target():
    """Generate code for the CUDA target and print its first returned component.

    Nothing is asserted; the test passes if code generation completes.
    """
    from loopy.target.cuda import CudaTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "..."
    ]
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=CudaTarget())

    kernel = lp.split_iname(kernel, "i", 8, inner_tag="l.0")
    kernel = lp.split_iname(
        kernel, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    kernel = lp.add_prefetch(kernel, "a", ["i_inner", "i_outer_inner"])

    preprocessed = lp.preprocess_kernel(kernel)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    generated = lp.generate_code(scheduled)

    print(generated[0])
def test_chunk_iname(ctx_factory):
    """Chunk iname ``i`` into 3 pieces and compare against the unchunked kernel."""
    ctx = ctx_factory()

    reference = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [
            lp.GlobalArg("out,a", np.float32, shape=lp.auto),
            "..."
        ],
        assumptions="n>0")

    chunked = lp.chunk_iname(reference, "i", 3, inner_tag="l.0")
    chunked = lp.set_loop_priority(chunked, "i_outer, i_inner")

    lp.auto_test_vs_ref(reference, ctx, chunked, parameters={"n": 130})
@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
def test_atomic(ctx_factory, dtype):
    """Compare an atomic-update kernel against its untransformed version.

    Skipped for 8-byte dtypes when the first device lacks
    ``cl_khr_int64_base_atomics``, and for ``int64`` on PyOpenCL older
    than 2015.2.
    """
    ctx = ctx_factory()

    if np.dtype(dtype).itemsize == 8:
        if "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions:
            pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2015, 2):
        if dtype == np.int64:
            pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    kernel_args = [
        lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
        lp.GlobalArg("a", dtype, shape=lp.auto),
        "..."
    ]
    reference = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i%20] = out[i%20] + 2*a[i] {atomic}",
        kernel_args,
        assumptions="n>0")

    transformed = lp.split_iname(reference, "i", 512)
    transformed = lp.split_iname(
        transformed, "i_inner", 128, outer_tag="unr", inner_tag="g.0")

    lp.auto_test_vs_ref(
        reference, ctx, transformed, parameters={"n": 10000})
def test_clamp(ctx_factory):
    """Run a kernel calling ``clamp(x[i], a, b)`` on a random float32 array.

    The output is not checked; the test passes if the kernel executes.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 15 * 10**6
    x = cl.clrandom.rand(queue, n, dtype=np.float32)

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = clamp(x[i], a, b)")
    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")
    kernel = lp.set_options(kernel, write_cl=True)

    lower = np.float32(12)
    upper = np.float32(15)
    _evt, (_out,) = kernel(queue, x=x, a=lower, b=upper)
def test_forced_iname_deps_and_reduction():
    """Check that a reduction with empty forced iname deps stays outside ``i``.

    After preprocessing, the instruction ``insn_0_j_update`` must not have
    ``i`` among its inames.
    """
    # See https://github.com/inducer/loopy/issues/24

    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.

    # NOTE(review): this call was left unterminated in the text at hand. It is
    # closed here with assignees="phi", as suggested by the comment above and
    # by the 'phi' temporary declared below -- confirm against version control.
    i1 = lp.CInstruction("i",
        "doSomethingToGetPhi();",
        assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    i2 = lp.Assignment("a",
        lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
        forced_iname_deps=frozenset(),
        forced_iname_deps_is_final=True)

    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
        [i1, i2],
        [
            lp.GlobalArg("a", dtype=np.float32, shape=()),
            lp.ValueArg("n", dtype=np.int32),
            lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
        ],
        target=lp.CTarget(),
        )

    k = lp.preprocess_kernel(k)

    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
@pytest.mark.parametrize("tp", ["f32", "f64"])
def test_random123(ctx_factory, tp):
    """Run the philox/threefry generators and check the output range.

    Every entry of the output is asserted to satisfy ``0 <= out`` and
    ``out < 1``. Skipped on PyOpenCL older than 2016.2.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    n = 150000

    # "TYPE" is a placeholder that is replaced by the parametrized suffix.
    instruction_template = """
        <> key2 = make_uint2(i, 324830944) {inames=i}
        <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
        <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
        <> real, ctr = philox4x32_TYPE(ctr, key2) {dep=init_ctr}
        <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr}
        out[i, 0] = real.s0 + 1j * imag.s0
        out[i, 1] = real.s1 + 1j * imag.s1
        out[i, 2] = real.s2 + 1j * imag.s2
        out[i, 3] = real.s3 + 1j * imag.s3
        """
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        instruction_template.replace("TYPE", tp))

    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")
    kernel = lp.set_options(kernel, write_cl=True)

    _, (device_out,) = kernel(queue, n=n)

    out = device_out.get()
    assert (out < 1).all()
    assert (0 <= out).all()
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
def test_kernel_splitting(ctx_factory):
    """Expect two device programs from a write-then-read-shifted kernel.

    The scheduled kernel is also compared against the unsplit reference.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
        c[i] = a[i + 1]
        out[i] = c[i]
        """)

    dtypes = dict(a=np.float32, c=np.float32, out=np.float32, n=np.int32)
    kernel = lp.add_and_infer_dtypes(kernel, dtypes)

    reference = kernel

    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess and pick one schedule.
    from loopy.preprocess import preprocess_kernel
    kernel = preprocess_kernel(kernel)

    from loopy.schedule import get_one_scheduled_kernel
    kernel = get_one_scheduled_kernel(kernel)

    print(kernel)

    codegen_result = lp.generate_code_v2(kernel)
    device_program_count = len(codegen_result.device_programs)
    assert device_program_count == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 5})
def test_kernel_splitting_with_loop(ctx_factory):
    """Expect two device programs when the kernel has an extra iname ``k``.

    Only code generation is exercised; ``ctx_factory`` is not called and no
    comparison against a reference is run.
    """
    #ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)

    dtypes = dict(a=np.float32, c=np.float32, out=np.float32, n=np.int32)
    kernel = lp.add_and_infer_dtypes(kernel, dtypes)

    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess and pick one schedule.
    from loopy.preprocess import preprocess_kernel
    kernel = preprocess_kernel(kernel)

    from loopy.schedule import get_one_scheduled_kernel
    kernel = get_one_scheduled_kernel(kernel)

    print(kernel)

    codegen_result = lp.generate_code_v2(kernel)
    device_program_count = len(codegen_result.device_programs)
    assert device_program_count == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    # Doesn't yet work--not passing k
    #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
def test_global_temporary(ctx_factory):
    """Expect two device programs when temporary ``c`` is given "global" scope.

    The split kernel is also compared against the unsplit reference.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    dtypes = dict(a=np.float32, c=np.float32, out=np.float32, n=np.int32)
    kernel = lp.add_and_infer_dtypes(kernel, dtypes)
    kernel = lp.set_temporary_scope(kernel, "c", "global")

    reference = kernel

    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    codegen_result = lp.generate_code_v2(kernel)
    device_program_count = len(codegen_result.device_programs)
    assert device_program_count == 2

    #print(codegen_result.device_code())
    #print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 5})
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Developer convenience: the first command-line argument is executed
        # as Python code, e.g. to call a single test function by hand.
        # NOTE: exec() runs arbitrary code -- never feed it untrusted input.
        exec(sys.argv[1])
    else:
        # pytest.main is the documented programmatic entry point; the legacy
        # 'py.test.cmdline' alias is not available on current pytest installs.
        from pytest import main
        main([__file__])
# vim: foldmethod=marker