diff --git a/test/test_advect.py b/test/test_advect.py index c5da3a66194c1774e717370e24c6dd547a366995..7932141d05c986268e9f0951198cd607f005aed0 100644 --- a/test/test_advect.py +++ b/test/test_advect.py @@ -3,6 +3,7 @@ def test_advect(ctx_factory): dtype = np.float32 ctx = ctx_factory() + order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) @@ -12,51 +13,59 @@ def test_advect(ctx_factory): from pymbolic import var K_sym = var("K") - field_shape = (N, N, N, K_sym) + field_shape = (K_sym, N, N, N) # 1. direction-by-direction similarity transform on u # 2. invert diagonal # 3. transform back (direction-by-direction) # K - run-time symbolic + + # A. updated for CSE: notation. + # B. fixed temp indexing and C ordering + # load: 3+9 fields + 1/N D entry + # store: 3 fields + # perform: N*2*6 + 3*5 + 3*5 flops + # ratio: (12*N+30)/15 flops per 4 bytes on bus + # ~ 8.4 FLOPS per 4 bytes at N=8 + # ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly knl = lp.make_kernel(ctx.devices[0], - "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" %N + "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" %N [ # differentiate u - "[|i,j,k] <float32> ur[i,j,k,e] = sum_float32(m, D[i,m]*u[m,j,k,e])", - "[|i,j,k] <float32> us[i,j,k,e] = sum_float32(m, D[j,m]*u[i,m,k,e])", - "[|i,j,k] <float32> ut[i,j,k,e] = sum_float32(m, D[k,m]*u[i,j,m,e])", + "CSE: ur[i,j,k] = sum_float32(@m, D[i,m]*u[e,m,j,k])", + "CSE: us[i,j,k] = sum_float32(@m, D[j,m]*u[e,i,m,k])", + "CSE: ut[i,j,k] = sum_float32(@m, D[k,m]*u[e,i,j,m])", # differentiate v - "[|i,j,k] <float32> vr[i,j,k,e] = sum_float32(m, D[i,m]*v[m,j,k,e])", - "[|i,j,k] <float32> vs[i,j,k,e] = sum_float32(m, D[j,m]*v[i,m,k,e])", - "[|i,j,k] <float32> vt[i,j,k,e] = sum_float32(m, D[k,m]*v[i,j,m,e])", + "CSE: vr[i,j,k] = sum_float32(@m, D[i,m]*v[e,m,j,k])", + "CSE: vs[i,j,k] = sum_float32(@m, D[j,m]*v[e,i,m,k])", + "CSE: vt[i,j,k] = sum_float32(@m, D[k,m]*v[e,i,j,m])", # differentiate w - "[|i,j,k] <float32> wr[i,j,k,e] = sum_float32(m, D[i,m]*w[m,j,k,e])", - "[|i,j,k] <float32> ws[i,j,k,e] = sum_float32(m, D[j,m]*w[i,m,k,e])", - "[|i,j,k] <float32> wt[i,j,k,e] = sum_float32(m, D[k,m]*w[i,j,m,e])", + "CSE: wr[i,j,k] = sum_float32(@m, D[i,m]*w[e,m,j,k])", + "CSE: ws[i,j,k] = sum_float32(@m, D[j,m]*w[e,i,m,k])", + "CSE: wt[i,j,k] = sum_float32(@m, D[k,m]*w[e,i,j,m])", # find velocity in (r,s,t) coordinates - "<float32> Vr[i,j,k,e] = " - "G[i,j,k,0,e]*u[i,j,k,e] + G[i,j,k,1,e]*v[i,j,k,e] + G[i,j,k,2,e]*w[i,j,k,e]", - "<float32> Vs[i,j,k,e] = " - "G[i,j,k,3,e]*u[i,j,k,e] + G[i,j,k,4,e]*v[i,j,k,e] + G[i,j,k,5,e]*w[i,j,k,e]", - "<float32> Vt[i,j,k,e] = " - "G[i,j,k,6,e]*u[i,j,k,e] + G[i,j,k,7,e]*v[i,j,k,e] + G[i,j,k,8,e]*w[i,j,k,e]", + # CSE? + "Vr[i,j,k] = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]", + "Vs[i,j,k] = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]", + "Vt[i,j,k] = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]", # form nonlinear term on integration nodes - "Nu[i,j,k,e] = Vr[i,j,k,e]*ur[i,j,k,e]+Vs[i,j,k,e]*us[i,j,k,e]+Vt[i,j,k,e]*ut[i,j,k,e]", - "Nv[i,j,k,e] = Vr[i,j,k,e]*vr[i,j,k,e]+Vs[i,j,k,e]*vs[i,j,k,e]+Vt[i,j,k,e]*vt[i,j,k,e]", - "Nw[i,j,k,e] = Vr[i,j,k,e]*wr[i,j,k,e]+Vs[i,j,k,e]*ws[i,j,k,e]+Vt[i,j,k,e]*wt[i,j,k,e]", + "Nu[e,i,j,k] = Vr[i,j,k]*ur[i,j,k]+Vs[i,j,k]*us[i,j,k]+Vt[i,j,k]*ut[i,j,k]", + "Nv[e,i,j,k] = Vr[i,j,k]*vr[i,j,k]+Vs[i,j,k]*vs[i,j,k]+Vt[i,j,k]*vt[i,j,k]", + "Nw[e,i,j,k] = Vr[i,j,k]*wr[i,j,k]+Vs[i,j,k]*ws[i,j,k]+Vt[i,j,k]*wt[i,j,k]", ], [ lp.ArrayArg("u", dtype, shape=field_shape, order=order), lp.ArrayArg("v", dtype, shape=field_shape, order=order), lp.ArrayArg("w", dtype, shape=field_shape, order=order), - lp.ArrayArg("Nu", dtype, shape=field_shape, order=order), - lp.ArrayArg("Nv", dtype, shape=field_shape, order=order), - lp.ArrayArg("Nw", dtype, shape=field_shape, order=order), + lp.ArrayArg("Nu", dtype, shape=field_shape, order=order), + lp.ArrayArg("Nv", dtype, shape=field_shape, order=order), + lp.ArrayArg("Nw", dtype, shape=field_shape, order=order), + lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order), lp.ArrayArg("D", dtype, shape=(N, N), order=order), lp.ScalarArg("K", np.int32, approximately=1000), ], diff --git a/test/test_advect_dealias.py b/test/test_advect_dealias.py index 072c840a6c5074f17b74ce4bc89eb76fee51e4cd..cd0df0da7d3274201eb758c7da2d11be7d541771 100644 --- a/test/test_advect_dealias.py +++ b/test/test_advect_dealias.py @@ -22,66 +22,66 @@ def test_advect(ctx_factory): # K - run-time symbolic knl = lp.make_kernel(ctx.devices[0], - "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=m,ip,jp,kp<%d 0<=e<K}" %M %N + "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}" %M %N [ # interpolate u to integration nodes - "[|i,jp,kp] <float32> u0[i,jp,kp,e] = sum_float32(ip, I[i,ip]*u[ip,jp,kp,e])", - "[|i,j ,kp] <float32> u1[i,j,kp,e] = sum_float32(jp, I[j,jp]*u0[i,jp,kp,e])", - "[|i,j ,k ] <float32> Iu[i,j,k,e] = sum_float32(kp, I[k,kp]*u1[i,j,kp,e])", + "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])", + "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])", + "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])", # differentiate u on integration nodes - "[|i,j ,k ] <float32> Iur[i,j,k,e] = sum_float32( m, D[i,m]*Iu[m,j,k,e])", - "[|i,j ,k ] <float32> Ius[i,j,k,e] = sum_float32( m, D[j,m]*Iu[i,m,k,e])", - "[|i,j ,k ] <float32> Iut[i,j,k,e] = sum_float32( m, D[k,m]*Iu[i,j,m,e])", + "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])", + "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])", + "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])", # interpolate v to integration nodes - "[|i,jp,kp] <float32> v0[i,jp,kp,e] = sum_float32(ip, I[i,ip]*v[ip,jp,kp,e])", - "[|i,j ,kp] <float32> v1[i,j,kp,e] = sum_float32(jp, I[j,jp]*v0[i,jp,kp,e])", - "[|i,j ,k ] <float32> Iv[i,j,k,e] = sum_float32(kp, I[k,kp]*v1[i,j,kp,e])", + "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])", + "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])", + "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])", # differentiate v on integration nodes - "[|i,j ,k ] <float32> Ivr[i,j,k,e] = sum_float32(ip, D[i,m]*Iv[m,j,k,e])", - "[|i,j ,k ] <float32> Ivs[i,j,k,e] = sum_float32(jp, D[j,m]*Iv[i,m,k,e])", - "[|i,j ,k ] <float32> Ivt[i,j,k,e] = sum_float32(kp, D[k,m]*Iv[i,j,m,e])", + "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])", + "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])", + "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])", # interpolate w to integration nodes - "[|i,jp,kp] <float32> w0[i,jp,kp,e] = sum_float32(ip, I[i,ip]*w[ip,jp,kp,e])", - "[|i,j ,kp] <float32> w1[i,j,kp,e] = sum_float32(jp, I[j,jp]*w0[i,jp,kp,e])", - "[|i,j ,k ] <float32> Iw[i,j,k,e] = sum_float32(kp, I[k,kp]*w1[i,j,kp,e])", + "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])", + "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])", + "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])", - # differentiate w on integration nodes - "[|i,j ,k ] <float32> Iwr[i,j,k,e] = sum_float32(ip, D[i,m]*Iw[m,j,k,e])", - "[|i,j ,k ] <float32> Iws[i,j,k,e] = sum_float32(jp, D[j,m]*Iw[i,m,k,e])", - "[|i,j ,k ] <float32> Iwt[i,j,k,e] = sum_float32(kp, D[k,m]*Iw[i,j,m,e])", + # differentiate v on integration nodes + "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])", + "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])", + "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])", # find velocity in (r,s,t) coordinates - "<float32> Vr[i,j,k,e] = " - "G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]", - "<float32> Vs[i,j,k,e] = " - "G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]", - "<float32> Vt[i,j,k,e] = " - "G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]", + # QUESTION: should I use CSE here ? + "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]", + "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]", + "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]", # form nonlinear term on integration nodes - "<float32> Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]", - "<float32> Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]", - "<float32> Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]", + # QUESTION: should I use CSE here ? + "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]", + "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]", + "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]", # L2 project Nu back to Lagrange basis - "[|ip,j,k] <float32> Nu2[ip,j,k,e] = sum_float32(i, V[ip,i]*Nu[i,j,k,e])", - "[|ip,jp,k] <float32> Nu1[ip,jp,k,e] = sum_float32(j, V[jp,j]*Nu2[ip,j,k,e])", - "INu[ip,jp,kp,e] = sum_float32(k, V[kp,k]*Nu1[ip,jp,k,e])", + "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])", + "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])", + "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])", # L2 project Nv back to Lagrange basis - "[|ip,j,k] <float32> Nv2[ip,j,k,e] = sum_float32(i, V[ip,i]*Nv[i,j,k,e])", - "[|ip,jp,k] <float32> Nv1[ip,jp,k,e] = sum_float32(j, V[jp,j]*Nv2[ip,j,k,e])", - "INv[ip,jp,kp,e] = sum_float32(k, V[kp,k]*Nv1[ip,jp,k,e])", + "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])", + "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])", + "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])", # L2 project Nw back to Lagrange basis - "[|ip,j,k] <float32> Nw2[ip,j,k,e] = sum_float32(i, V[ip,i]*Nw[i,j,k,e])", - "[|ip,jp,k] <float32> Nw1[ip,jp,k,e] = sum_float32(j, V[jp,j]*Nw2[ip,j,k,e])", - "INw[ip,jp,kp,e] = sum_float32(k, V[kp,k]*Nw1[ip,jp,k,e])", + "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])", + "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])", + "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])", + ], [ lp.ArrayArg("u", dtype, shape=field_shape, order=order), diff --git a/test/test_sem.py b/test/test_sem.py index d6641db23fc26c1125922a29a4c76ab516e4297a..3e593cb0ad8f3155dcb90319ff27642df05c131e 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -12,263 +12,6 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ -def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, - order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): - if isinstance(shape, int): - shape = (shape, shape) - l = max(shape) - eye_ish = id_factor*np.eye(l, k=od) - if inc_factor: - eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) - ary = np.asarray( - ran_factor*np.random.randn(*shape) - + eye_ish[:shape[0], :shape[1]], - dtype=dtype, order=order) - - return cl_array.to_device(queue, ary) - - - - -DO_CHECK = True - -DEBUG_PREAMBLE = r""" - #pragma OPENCL EXTENSION cl_amd_printf: enable - #define MY_J (j_outer*64+j_inner_outer*16+j_inner_inner) - #define MY_I (i_outer*16+i_inner) - #define IFDIAG if (MY_I == MY_J) - #define TST(S) if (MY_J == 144 && MY_I == 16-48) \ - for (int aa = 0; aa < 16: ++ab) \ - for (int bb = 0; bb < 16: ++bb) - """ - - - - -def check_error(refsol, sol): - if not DO_CHECK: - return - - if sol.shape == 2: - norm_order = "fro" - else: - norm_order = 2 - - rel_err = la.norm(refsol-sol, norm_order)/la.norm(refsol, norm_order) - if rel_err > 1e-5 or np.isinf(rel_err) or np.isnan(rel_err): - if 1: - import matplotlib.pyplot as pt - pt.imshow(refsol-sol) - pt.colorbar() - pt.show() - elif 0: - print "---------------------------" - print "ACTUAL" - print "---------------------------" - np.set_printoptions(threshold=1000000, linewidth=200) - print sol[:16,:16] - print "---------------------------" - print "CORRECT" - print "---------------------------" - print refsol[:16,:16] - raise RuntimeError("check failed, rel err=%g" % rel_err) - - - -def test_sem(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - # n = get_suitable_size(ctx) - - # 0<=i,j,k,m<=N AND 0<=k<K - - # ur(i,j,k,e) = D(i,m)*u(m,j,k,e) - # us(i,j,k,e) = D(j,m)*u(i,m,k,e) - # ut(i,j,k,e) = D(k,m)*u(i,j,m,e) - - # (grad phi, grad u) = (Dr' Ds' Dt')*(G)*(Dr; Ds; Dt) - # lap(i,j,k,e) = D(m,i)*(G(0,m,j,k,e)*ur(m,j,k,e) + G(1,m,j,k,e)*us(m,j,k,e) + G(2,m,j,k,e)*ut(m,j,k,e)); - # lap(i,j,k,e) += D(m,j)*(G(1,i,m,k,e)*ur(i,m,k,e) + G(3,i,m,k,e)*us(i,m,k,e) + G(4,i,m,k,e)*ut(i,m,k,e)); - # lap(i,j,k,e) += D(m,k)*(G(2,i,j,m,e)*ur(i,j,m,e) + G(4,i,j,m,e)*us(i,j,m,e) + G(5,i,j,m,e)*ut(i,j,m,e)); - - - # K - run-time symbolic - from pymbolic import var - K = var("K") - n = 8 - knl = lp.make_kernel(ctx.devices[0], - #"[K] -> {[i,j,k,ip,jp,kp,e,m,mp]: 0<=i,j,k,m,ip,jp,kp,mp<%d AND 0<=e<K}" % n, - #[ - #"ur[e, k, j, i] = sum_float32(m, D[i, m]*u[e, k, j, m])", - #"lap[e, kp, jp, ip] = sum_float32(mp, D[ip, mp]*(G[e, 0, kp, jp, mp]*ur[e, kp, jp, mp]))" - #], - "[K] -> {[i,j,k,e,m,mp]: 0<=i,j,k,m,mp<%d AND 0<=e<K}" % n, - [ - "lap[e, k, j, i] = " - "sum_float32(mp, D[i, mp]*(G[e, 0, k, j, mp]*" - "cse(sum_float32(m, D[mp, m]*u[e, k, j, m]), build_ur)))" - ], - [ - lp.ArrayArg("u", dtype, shape=(K, n, n, n), order=order), - lp.ArrayArg("ur", dtype, shape=(K, n, n, n), order=order), - lp.ArrayArg("lap", dtype, shape=(K, n, n, n), order=order), - lp.ArrayArg("G", dtype, shape=(K, 6, n, n, n), order=order), - lp.ArrayArg("D", dtype, shape=(n, n), order=order), - lp.ScalarArg("K", np.int32, approximately=1000), - ], - name="semlap", assumptions="K>=1") - - knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp") - knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) - #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"]) - knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"]) - print knl - #1/0 - - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) - - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - c = cl_array.empty_like(a) - refsol = np.dot(a.get(), b.get()) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) - - - - -def test_sem_nd(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - # n = get_suitable_size(ctx) - - # 0<=i,j,k,m<=N AND 0<=k<K - - # ur(i,j,k,e) = D(i,m)*u(m,j,k,e) - # us(i,j,k,e) = D(j,m)*u(i,m,k,e) - # ut(i,j,k,e) = D(k,m)*u(i,j,m,e) - - # (grad phi, grad u) = (Dr' Ds' Dt')*(G)*(Dr; Ds; Dt) - # lap(i,j,k,e) = D(m,i)*(G(0,m,j,k,e)*ur(m,j,k,e) + G(1,m,j,k,e)*us(m,j,k,e) + G(2,m,j,k,e)*ut(m,j,k,e)); - # lap(i,j,k,e) += D(m,j)*(G(1,i,m,k,e)*ur(i,m,k,e) + G(3,i,m,k,e)*us(i,m,k,e) + G(4,i,m,k,e)*ut(i,m,k,e)); - # lap(i,j,k,e) += D(m,k)*(G(2,i,j,m,e)*ur(i,j,m,e) + G(4,i,j,m,e)*us(i,j,m,e) + G(5,i,j,m,e)*ut(i,j,m,e)); - - from pymbolic import var - K_sym, G_sym, u_sym, D_sym, m_sym, i_sym, j_sym, k_sym, e_sym = [ - var(i) for i in "KGuDmijke"] - - sym_lookup = { - (0,0): 0, - (0,1): 1, - (0,2): 2, - (1,1): 3, - (1,2): 4, - (2,2): 5, - } - - for i, j in sym_lookup.keys(): - sym_lookup[j, i] = sym_lookup[i, j] - - dim = 3 - local_derivatives = [] - - ijk = [i_sym, j_sym, k_sym] - from loopy.symbolic import Reduction - from loopy.kernel import parse_reduction_op - for axis in range(dim): - u_index = [i_sym, j_sym, k_sym, e_sym] - u_index[axis] = m_sym - local_derivatives.append( - Reduction( - parse_reduction_op("sum_float32"), - ("m",), - D_sym[ijk[axis], m_sym] - * u_sym[tuple(u_index)])) - - #for axis in range(dim): - #div - for ld in local_derivatives: - print ld - - 1/0 - - - - - field_shape = (K_sym,) + dim*(n,) - # K - run-time symbolic - n = 8 - knl = lp.make_kernel(ctx.devices[0], - #"[K] -> {[i,j,k,ip,jp,kp,e,m,mp]: 0<=i,j,k,m,ip,jp,kp,mp<%d AND 0<=e<K}" % n, - #[ - #"ur[e, k, j, i] = sum_float32(m, D[i, m]*u[e, k, j, m])", - #"lap[e, kp, jp, ip] = sum_float32(mp, D[ip, mp]*(G[e, 0, kp, jp, mp]*ur[e, kp, jp, mp]))" - #], - "[K] -> {[i,j,k,e,m,mp]: 0<=i,j,k,m,mp<%d AND 0<=e<K}" % n, - [ - "lap[e, k, j, i] = " - "sum_float32(mp, D[i, mp]*(G[e, 0, k, j, mp]*" - "cse(sum_float32(m, D[mp, m]*u[e, k, j, m]), build_ur)))" - ], - [ - lp.ArrayArg("u", dtype, shape=field_shape, order=order), - lp.ArrayArg("ur", dtype, shape=field_shape, order=order), - lp.ArrayArg("lap", dtype, shape=field_shape, order=order), - lp.ArrayArg("G", dtype, shape=field_shape + (6,), order=order), - lp.ArrayArg("D", dtype, shape=(n, n), order=order), - lp.ScalarArg("K", np.int32, approximately=1000), - ], - name="semlap", assumptions="K>=1") - - knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp") - knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) - #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"]) - knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"]) - print knl - #1/0 - - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) - - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - c = cl_array.empty_like(a) - refsol = np.dot(a.get(), b.get()) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) - - - - def test_sem_3d(ctx_factory): dtype = np.float32 ctx = ctx_factory() @@ -281,6 +24,13 @@ def test_sem_3d(ctx_factory): field_shape = (K_sym, n, n, n) + # load: 1+6 fields + 1/N D entry + # store: 1 fields + # perform: N*2*6 + 3*5 flops + # ratio: (12*N+15)/8 flops per 4 bytes on bus + # ~ 14 FLOPS per 4 bytes at N=8 + # ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly + # K - run-time symbolic knl = lp.make_kernel(ctx.devices[0], "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,