Skip to content
Snippets Groups Projects
Commit 12bd253a authored by Andreas Klöckner's avatar Andreas Klöckner
Browse files

Delete stale examples.

parent 396da6d1
No related branches found
No related tags found
No related merge requests found
% ------------------------------------------------------
% VOLUME KERNEL
% input: u,v,w,p are float, Np x K
% input: DrDsDt is float4, Np x Np
% input: dRdx, dRdy, dRdz are float4, Np x K
% output: rhsu, rhsv, rhsw, rhsp, Np x K
% loop domain: 1<= n,m <= Np, 1<= e <= K
% BODY >>>
% reduction on m (assume DrDsDt is float4 for convenience)
dudR = DrDsDt(n,m)*u(m,e)
dvdR = DrDsDt(n,m)*v(m,e)
dwdR = DrDsDt(n,m)*w(m,e)
dpdR = DrDsDt(n,m)*p(m,e)
% volume flux
rhsu(n,e) = dot4(dRdx(k),dpdR)
rhsv(n,e) = dot4(dRdy(k),dpdR)
rhsw(n,e) = dot4(dRdz(k),dpdR)
rhsp(n,e) = dot4(dRdx(k), dudR) + dot4(dRdy(k), dvdR) + dot4(dRdz(k), dwdR)
% BODY <<<
% ------------------------------------------------------
% ------------------------------------------------------
% SURFACE KERNEL
% input: u,v,w,p are float, Np x K
% input: LIFT is float, Np x (Nfp*Nfaces)
% input: nx,ny,nz,Fscale are float, (Nfp*Nfaces) x K
% input/output: rhsu, rhsv, rhsw, rhsp are float Np x K
% loop-domain 1<= m <= Nfp*Nfaces, 1<= n <= Np, 1<= e <= K
% BODY >>>
% find surface node indices at both traces
idP = vmapP(m,e)
idM = vmapM(m,e)
% can we bounce to single index (row/column major is important)
% can we use this indexing here for clarity ?
du = u(idP)-u(idM)
dv = v(idP)-v(idM)
dw = w(idP)-w(idM)
dp = bc(idM)*p(idP) - p(idM)
dQ = 0.5*Fscale(m,e)*(dp - nx(m,e)*du - ny(m,e)*dv - nz(m,e)*dw)
fluxu = -nx(m,e)*dQ
fluxv = -ny(m,e)*dQ
fluxw = -nz(m,e)*dQ
fluxp = dQ
% reduction here
rhsu(n,e) += LIFT(n,m)*fluxu
rhsv(n,e) += LIFT(n,m)*fluxv
rhsw(n,e) += LIFT(n,m)*fluxw
rhsp(n,e) += LIFT(n,m)*fluxp
% BODY <<<
% ------------------------------------------------------
% ------------------------------------------------------
% RK kernel here
% input/output: u,v,w,p are float, Np*K
% input/output: resu,resv,resw,resp are float, Np*K
% input parameters rk4a, rk4b, dt
% 1 <= n <= K*Np,
% BODY >>>
resu(n) = rk4a*resu(n) + dt*rhsu
resv(n) = rk4a*resv(n) + dt*rhsv
resw(n) = rk4a*resw(n) + dt*rhsw
resp(n) = rk4a*resp(n) + dt*rhsp
u(n) += rk4b*resu(n)
v(n) += rk4b*resv(n)
w(n) += rk4b*resw(n)
p(n) += rk4b*resp(n)
% BODY <<<
% ------------------------------------------------------
\ No newline at end of file
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
import loopy as lp
def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32,
order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0):
if isinstance(shape, int):
shape = (shape, shape)
l = max(shape)
eye_ish = id_factor*np.eye(l, k=od)
if inc_factor:
eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l)
ary = np.asarray(
ran_factor*np.random.randn(*shape)
+ eye_ish[:shape[0], :shape[1]],
dtype=dtype, order=order)
return cl_array.to_device(queue, ary)
def image_matrix_mul_ilp(ctx_factory=cl.create_some_context):
dtype = np.float32
ctx = ctx_factory()
order = "C"
queue = cl.CommandQueue(ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
n = 16*10
from pymbolic import var
a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
knl = lp.LoopKernel(ctx.devices[0],
"{[i,j,k]: 0<=i,j,k<%d}" % n,
[
(c[i, j], a[i, k]*b[k, j])
],
[
lp.ImageArg("a", dtype, 2),
lp.ImageArg("b", dtype, 2),
lp.GlobalArg("c", dtype, shape=(n, n), order=order),
],
name="matmul")
ilp = 4
knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
j_inner_split = 16
knl = lp.split_iname(knl, "j", ilp*j_inner_split, outer_tag="g.1")
knl = lp.split_iname(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0")
knl = lp.split_iname(knl, "k", 2)
knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
assert knl.get_problems({})[0] <= 2
kernel_gen = (lp.insert_register_prefetches(knl)
for knl in lp.generate_loop_schedules(knl))
a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
ran_factor=1, id_factor=5)
b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
ran_factor=1, id_factor=5, inc_factor=0)
c = cl_array.empty_like(a)
a_img = cl.image_from_array(ctx, a.get(), 1)
b_img = cl.image_from_array(ctx, b.get(), 1)
def launcher(kernel, gsize, lsize, check):
evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
g_times_l=True)
return evt
from pyopencl.characterize import get_fast_inaccurate_build_options
lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
options=get_fast_inaccurate_build_options(ctx.devices[0]))
if __name__ == "__main__":
image_matrix_mul_ilp()
...@@ -187,6 +187,8 @@ def test_tim3d_slab(ctx_factory): ...@@ -187,6 +187,8 @@ def test_tim3d_slab(ctx_factory):
#knl = lp.precompute(knl, "us", ["i", "j"], within="... < lapr") #knl = lp.precompute(knl, "us", ["i", "j"], within="... < lapr")
#knl = lp.precompute(knl, "ut", ["i", "j"], within="... < lapr") #knl = lp.precompute(knl, "ut", ["i", "j"], within="... < lapr")
knl = lp.precompute(knl, "lapt", ["", "j"])
# prefetch the derivative matrix # prefetch the derivative matrix
knl = lp.add_prefetch(knl, "D[:,:]") knl = lp.add_prefetch(knl, "D[:,:]")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment