diff --git a/examples/acoustics3d.lpy b/examples/acoustics3d.lpy deleted file mode 100644 index 919566deeaf65b7ee3e13eaac38bfaaf9c75ab2c..0000000000000000000000000000000000000000 --- a/examples/acoustics3d.lpy +++ /dev/null @@ -1,86 +0,0 @@ -% ------------------------------------------------------ -% VOLUME KERNEL -% input: u,v,w,p are float, Np x K -% input: DrDsDt is float4, Np x Np -% input: dRdx, dRdy, dRdz are float4, Np x K -% output: rhsu, rhsv, rhsw, rhsp, Np x K - -% loop domain: 1<= n,m <= Np, 1<= e <= K - -% BODY >>> - -% reduction on m (assume DrDsDt is float4 for convenience) -dudR = DrDsDt(n,m)*u(m,e) -dvdR = DrDsDt(n,m)*v(m,e) -dwdR = DrDsDt(n,m)*w(m,e) -dpdR = DrDsDt(n,m)*p(m,e) - -% volume flux -rhsu(n,e) = dot4(dRdx(k),dpdR) -rhsv(n,e) = dot4(dRdy(k),dpdR) -rhsw(n,e) = dot4(dRdz(k),dpdR) -rhsp(n,e) = dot4(dRdx(k), dudR) + dot4(dRdy(k), dvdR) + dot4(dRdz(k), dwdR) - -% BODY <<< -% ------------------------------------------------------ - -% ------------------------------------------------------ -% SURFACE KERNEL - -% input: u,v,w,p are float, Np x K -% input: LIFT is float, Np x (Nfp*Nfaces) -% input: nx,ny,nz,Fscale are float, (Nfp*Nfaces) x K -% input/output: rhsu, rhsv, rhsw, rhsp are float Np x K - -% loop-domain 1<= m <= Nfp*Nfaces, 1<= n <= Np, 1<= e <= K - -% BODY >>> -% find surface node indices at both traces -idP = vmapP(m,e) -idM = vmapM(m,e) - -% can we bounce to single index (row/column major is important) -% can we use this indexing here for clarity ? -du = u(idP)-u(idM) -dv = v(idP)-v(idM) -dw = w(idP)-w(idM) -dp = bc(idM)*p(idP) - p(idM) - -dQ = 0.5*Fscale(m,e)*(dp - nx(m,e)*du - ny(m,e)*dv - nz(m,e)*dw) - -fluxu = -nx(m,e)*dQ -fluxv = -ny(m,e)*dQ -fluxw = -nz(m,e)*dQ -fluxp = dQ - -% reduction here -rhsu(n,e) += LIFT(n,m)*fluxu -rhsv(n,e) += LIFT(n,m)*fluxv -rhsw(n,e) += LIFT(n,m)*fluxw -rhsp(n,e) += LIFT(n,m)*fluxp -% BODY <<< -% ------------------------------------------------------ - -% ------------------------------------------------------ -% RK kernel here - -% input/output: u,v,w,p are float, Np*K -% input/output: resu,resv,resw,resp are float, Np*K -% input parameters rk4a, rk4b, dt - -% 1 <= n <= K*Np, - -% BODY >>> - -resu(n) = rk4a*resu(n) + dt*rhsu -resv(n) = rk4a*resv(n) + dt*rhsv -resw(n) = rk4a*resw(n) + dt*rhsw -resp(n) = rk4a*resp(n) + dt*rhsp - -u(n) += rk4b*resu(n) -v(n) += rk4b*resv(n) -w(n) += rk4b*resw(n) -p(n) += rk4b*resp(n) - -% BODY <<< -% ------------------------------------------------------ \ No newline at end of file diff --git a/examples/matrix-mul.py b/examples/matrix-mul.py deleted file mode 100644 index 6c4d9e97e362d074c5d379928162ba69ff4cdcc9..0000000000000000000000000000000000000000 --- a/examples/matrix-mul.py +++ /dev/null @@ -1,86 +0,0 @@ -import numpy as np -import pyopencl as cl -import pyopencl.array as cl_array -import loopy as lp - - - - -def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, - order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): - if isinstance(shape, int): - shape = (shape, shape) - l = max(shape) - eye_ish = id_factor*np.eye(l, k=od) - if inc_factor: - eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) - ary = np.asarray( - ran_factor*np.random.randn(*shape) - + eye_ish[:shape[0], :shape[1]], - dtype=dtype, order=order) - - return cl_array.to_device(queue, ary) - - - - -def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - n = 16*10 - from pymbolic import var - a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] - - knl = lp.LoopKernel(ctx.devices[0], - "{[i,j,k]: 0<=i,j,k<%d}" % n, - [ - (c[i, j], a[i, k]*b[k, j]) - ], - [ - lp.ImageArg("a", dtype, 2), - lp.ImageArg("b", dtype, 2), - lp.GlobalArg("c", dtype, shape=(n, n), order=order), - ], - name="matmul") - - ilp = 4 - knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1") - j_inner_split = 16 - knl = lp.split_iname(knl, "j", ilp*j_inner_split, outer_tag="g.1") - knl = lp.split_iname(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 2) - - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) - assert knl.get_problems({})[0] <= 2 - - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl)) - - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, - ran_factor=1, id_factor=5) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, - ran_factor=1, id_factor=5, inc_factor=0) - c = cl_array.empty_like(a) - a_img = cl.image_from_array(ctx, a.get(), 1) - b_img = cl.image_from_array(ctx, b.get(), 1) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, - g_times_l=True) - - return evt - - from pyopencl.characterize import get_fast_inaccurate_build_options - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, - options=get_fast_inaccurate_build_options(ctx.devices[0])) - - - - -if __name__ == "__main__": - image_matrix_mul_ilp() diff --git a/examples/sem_reagan.py b/examples/sem_reagan.py index a420d70fb6d43edc5260944f959c7f9e1b1ba741..06d2072f6fcd8ad7ea3c2830880b07cd9643f471 100644 --- a/examples/sem_reagan.py +++ b/examples/sem_reagan.py @@ -187,6 +187,8 @@ def test_tim3d_slab(ctx_factory): #knl = lp.precompute(knl, "us", ["i", "j"], within="... < lapr") #knl = lp.precompute(knl, "ut", ["i", "j"], within="... < lapr") + knl = lp.precompute(knl, "lapt", ["", "j"]) + # prefetch the derivative matrix knl = lp.add_prefetch(knl, "D[:,:]")