Skip to content
Snippets Groups Projects
test_sem.py 19.9 KiB
Newer Older
  • Learn to ignore specific revisions
  • from __future__ import division
    
    import numpy as np
    import pyopencl as cl
    import loopy as lp
    
    from pyopencl.tools import pytest_generate_tests_for_pyopencl \
            as pytest_generate_tests
    
    
    
    
    
    Tim Warburton's avatar
    Tim Warburton committed
    def test_laplacian(ctx_factory):
    
        1/0 # not adapted to new language
    
    
        dtype = np.float32
        ctx = ctx_factory()
        order = "C"
    
        n = 8
    
        from pymbolic import var
        K_sym = var("K")
    
    
        field_shape = (K_sym, n, n, n)
    
    Tim Warburton's avatar
    Tim Warburton committed
        # load:     1+6 fields + 1/N D entry
        # store:    1   fields
        # perform:  N*2*6 + 3*5 flops
        # ratio:   (12*N+15)/8  flops per 4 bytes on bus 
        #          ~ 14 FLOPS per 4 bytes at N=8
        #          ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly
    
    
        # K - run-time symbolic
        knl = lp.make_kernel(ctx.devices[0],
    
    Tim Warburton's avatar
    Tim Warburton committed
                "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}" % n,
    
    Tim Warburton's avatar
    Tim Warburton committed
                    "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
                    "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
                    "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",
    
                    # define function
                    "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
                    "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
                    "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",
    
    Tim Warburton's avatar
    Tim Warburton committed
    
                    "lap[e,i,j,k]  = "
    
    Tim Warburton's avatar
    Tim Warburton committed
                    "  sum_float32(m, D[m,i]*Gu(m,j,k))"
                    "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
                    "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
    
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
    
                lp.ValueArg("K", np.int32, approximately=1000),
    
                ],
                name="semlap", assumptions="K>=1")
    
    
    Tim Warburton's avatar
    Tim Warburton committed
        #print lp.preprocess_kernel(knl, cse_ok=True)
        #1/0
        #
        #print knl
        #1/0
        knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
        knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
        knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])
    
        knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
        knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
        knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])
    
    Andreas Klöckner's avatar
    Andreas Klöckner committed
        knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
        knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
        knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
    
        if 0:
    
    Tim Warburton's avatar
    Tim Warburton committed
            pass
            #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
            #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
            #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    
    Andreas Klöckner's avatar
    Andreas Klöckner committed
        else:
            seq_knl = knl
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    
    Andreas Klöckner's avatar
    Andreas Klöckner committed
    
        knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        knl = lp.add_prefetch(knl, "D", ["m", "j"])
    
    Tim Warburton's avatar
    Tim Warburton committed
        #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    
    
        #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    
    Andreas Klöckner's avatar
    Andreas Klöckner committed
        #print lp.preprocess_kernel(knl)
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
    Tim Warburton's avatar
    Tim Warburton committed
        kernel_gen = lp.generate_loop_schedules(knl,
                loop_priority=["m_fetch_G", "i_fetch_u"])
    
    Tim Warburton's avatar
    Tim Warburton committed
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
    
    Tim Warburton's avatar
    Tim Warburton committed
        K = 1000
    
        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
    
    Tim Warburton's avatar
    Tim Warburton committed
                op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
                op_label="GFlops",
                parameters={"K": K}, print_seq_code=True)
    
    Tim Warburton's avatar
    Tim Warburton committed
    
    
    def test_laplacian_lmem(ctx_factory):
    
        1/0 # not adapted to new language
    
    
        dtype = np.float32
        ctx = ctx_factory()
        order = "C"
    
        n = 8
    
        from pymbolic import var
        K_sym = var("K")
    
        field_shape = (K_sym, n, n, n)
    
        # K - run-time symbolic
        knl = lp.make_kernel(ctx.devices[0],
                "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
                [
                    "CSE: ur(i,j,k) = sum_float32(@o, D[i,o]*u[e,o,j,k])",
                    "CSE: us(i,j,k) = sum_float32(@o, D[j,o]*u[e,i,o,k])",
                    "CSE: ut(i,j,k) = sum_float32(@o, D[k,o]*u[e,i,j,o])",
    
                    "lap[e,i,j,k]  = "
                    "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                    "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                    "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                    ],
                [
    
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
    
                lp.ValueArg("K", np.int32, approximately=1000),
    
                ],
                name="semlap", assumptions="K>=1")
    
    
        knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
        knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
        knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
    
        if 0:
            seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
            seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
            seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
        else:
            seq_knl = knl
    
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    
    
        knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        knl = lp.add_prefetch(knl, "D", ["m", "j"])
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    
        #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    
    
        #print seq_knl
        #print lp.preprocess_kernel(knl)
        #1/0
    
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
    
        kernel_gen = lp.generate_loop_schedules(knl)
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
    
        K = 1000
    
        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
    
                op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
                op_label="GFlops",
                parameters={"K": K}, print_seq_code=True)
    
    
    
    
    
    
    def test_laplacian_lmem_ilp(ctx_factory):
        # This does not lead to practical/runnable code (out of lmem), but it's an
        # excellent stress test for the code generator. :)
    
        dtype = np.float32
        ctx = ctx_factory()
        order = "C"
    
        n = 8
    
        from pymbolic import var
        K_sym = var("K")
    
        field_shape = (K_sym, n, n, n)
    
        # K - run-time symbolic
        knl = lp.make_kernel(ctx.devices[0],
                "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
                [
                    "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                    "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                    "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
    
                    "lap[e,i,j,k]  = "
                    "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                    "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                    "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                    ],
                [
    
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
    
                lp.ValueArg("K", np.int32, approximately=1000),
    
                ],
                name="semlap", assumptions="K>=1")
    
    
        # Must act on u first, otherwise stencil becomes crooked and
        # footprint becomes non-convex.
    
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
        knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    
    
        knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])
    
        knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
        knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
        knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])
    
        knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
        knl = lp.add_prefetch(knl, "D", ["m", "j"])
    
        #print seq_knl
        #1/0
    
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
    
        kernel_gen = lp.generate_loop_schedules(knl)
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
    
        for knl in kernel_gen:
    
            print(lp.generate_code(knl))
    
    Tim Warburton's avatar
    Tim Warburton committed
    def test_advect(ctx_factory):
    
    Tim Warburton's avatar
    Tim Warburton committed
        dtype = np.float32
        ctx = ctx_factory()
    
        order = "C"
    
        N = 8
    
        from pymbolic import var
        K_sym = var("K")
    
        field_shape = (K_sym, N, N, N)
    
        # 1. direction-by-direction similarity transform on u
        # 2. invert diagonal 
        # 3. transform back (direction-by-direction)
    
        # K - run-time symbolic
    
        # A. updated for CSE: notation. 
        # B. fixed temp indexing and C ordering
        # load:     3+9 fields + 1/N D entry
        # store:    3   fields
        # perform:  N*2*6 + 3*5 + 3*5 flops
        # ratio:   (12*N+30)/15  flops per 4 bytes on bus 
        #          ~ 8.4 FLOPS per 4 bytes at N=8
        #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly
        knl = lp.make_kernel(ctx.devices[0],
                "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N,
                [
                    # differentiate u
                    "CSE:  ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
                    "CSE:  us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
                    "CSE:  ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",
    
                    # differentiate v
                    "CSE:  vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
                    "CSE:  vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
                    "CSE:  vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",
    
                    # differentiate w
                    "CSE:  wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
                    "CSE:  ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
                    "CSE:  wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",
    
                    # find velocity in (r,s,t) coordinates
                    # CSE?
                    "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
                    "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
                    "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",
    
                    # form nonlinear term on integration nodes
                    "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
                    "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
                    "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
                    ],
                [
    
                lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("v",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("w",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("Nu",  dtype, shape=field_shape, order=order),
                lp.GlobalArg("Nv",  dtype, shape=field_shape, order=order),
                lp.GlobalArg("Nw",  dtype, shape=field_shape, order=order),
                lp.GlobalArg("G",   dtype, shape=(9,)+field_shape, order=order),
                lp.GlobalArg("D",   dtype, shape=(N, N),  order=order),
    
                lp.ValueArg("K",  np.int32, approximately=1000),
    
    Tim Warburton's avatar
    Tim Warburton committed
                ],
                name="sem_advect", assumptions="K>=1")
    
    
        print(knl)
    
    Tim Warburton's avatar
    Tim Warburton committed
        1/0
    
        seq_knl = knl
    
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    
    Tim Warburton's avatar
    Tim Warburton committed
    
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
    Tim Warburton's avatar
    Tim Warburton committed
    
    
        kernel_gen = lp.generate_loop_schedules(knl)
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
    
    
        K = 1000
    
        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
    
    Tim Warburton's avatar
    Tim Warburton committed
                op_count=0,
                op_label="GFlops",
                parameters={"K": K}, print_seq_code=True,)
    
    
    
    
    
    def test_advect_dealias(ctx_factory):
        1/0 # not ready
    
        dtype = np.float32
        ctx = ctx_factory()
        order = "C"
    
        N = 8
        M = 8
    
        from pymbolic import var
        K_sym = var("K")
    
        field_shape = (N, N, N, K_sym)
        interim_field_shape = (M, M, M, K_sym)
    
        # 1. direction-by-direction similarity transform on u
        # 2. invert diagonal 
        # 3. transform back (direction-by-direction)
    
        # K - run-time symbolic
        knl = lp.make_kernel(ctx.devices[0],
                "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}" %M %N
                [
    
                    # interpolate u to integration nodes
                    "CSE:  u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
                    "CSE:  u1[i,j,kp,e]  = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
                    "CSE:  Iu[i,j,k,e]   = sum_float32(@o, I[k,o]*u1[i,j,o,e])",
    
                    # differentiate u on integration nodes
                    "CSE:  Iur[i,j,k,e]  = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
                    "CSE:  Ius[i,j,k,e]  = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
                    "CSE:  Iut[i,j,k,e]  = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",
    
                    # interpolate v to integration nodes
                    "CSE:  v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
                    "CSE:  v1[i,j,kp,e]  = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
                    "CSE:  Iv[i,j,k,e]   = sum_float32(@o, I[k,o]*v1[i,j,o,e])",
    
                    # differentiate v on integration nodes
                    "CSE:  Ivr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
                    "CSE:  Ivs[i,j,k,e]  = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
                    "CSE:  Ivt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",
    
                    # interpolate w to integration nodes
                    "CSE:  w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
                    "CSE:  w1[i,j,kp,e]  = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
                    "CSE:  Iw[i,j,k,e]   = sum_float32(@o, I[k,o]*w1[i,j,o,e])",
    
                    # differentiate v on integration nodes
                    "CSE:  Iwr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
                    "CSE:  Iws[i,j,k,e]  = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
                    "CSE:  Iwt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",
    
                    # find velocity in (r,s,t) coordinates
                    # QUESTION: should I use CSE here ?
                    "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
                    "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
                    "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",
    
                    # form nonlinear term on integration nodes
                    # QUESTION: should I use CSE here ?
                    "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
                    "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
                    "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",
    
                    # L2 project Nu back to Lagrange basis
                    "CSE: Nu2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
                    "CSE: Nu1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
                    "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",
    
                    # L2 project Nv back to Lagrange basis
                    "CSE: Nv2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
                    "CSE: Nv1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
                    "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",
    
                    # L2 project Nw back to Lagrange basis
                    "CSE: Nw2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
                    "CSE: Nw1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
                    "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",
    
                    ],
                [
    
                lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("v",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("w",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("INu",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("INv",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("INw",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("D",   dtype, shape=(M,M),  order=order),
                lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
                lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
    
                lp.ValueArg("K",  np.int32, approximately=1000),
    
                ],
                name="sem_advect", assumptions="K>=1")
    
    
        print(knl)
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
        print(knl)
    
        #1/0
    
        kernel_gen = lp.generate_loop_schedules(knl)
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
    
    
        K = 1000
    
        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
    
                op_count=0,
                op_label="GFlops",
                parameters={"K": K}, print_seq_code=True,)
    
    
    
    
    def test_interp_diff(ctx_factory):
        1/0 # not ready
        dtype = np.float32
        ctx = ctx_factory()
        order = "C"
    
        N = 8
        M = 8
    
        from pymbolic import var
        K_sym = var("K")
    
        field_shape = (N, N, N, K_sym)
        interim_field_shape = (M, M, M, K_sym)
    
        # 1. direction-by-direction similarity transform on u
        # 2. invert diagonal 
        # 3. transform back (direction-by-direction)
    
        # K - run-time symbolic
        knl = lp.make_kernel(ctx.devices[0],
                "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N
                [
                    "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                    "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                    "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                    "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                    "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                    "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                    "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                    ],
                [
    
                lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
                lp.GlobalArg("P",   dtype, shape=interim_field_shape, order=order),
                lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
                lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
                lp.GlobalArg("Pu",  dtype, shape=field_shape, order=order),
    
                lp.ValueArg("K",  np.int32, approximately=1000),
    
                ],
                name="sem_lap_precon", assumptions="K>=1")
    
    
        print(knl)
    
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    
        print(knl)
    
        #1/0
    
        kernel_gen = lp.generate_loop_schedules(knl)
        kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
    
    
        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
    
                op_count=0,
                op_label="GFlops",
                parameters={"K": K}, print_seq_code=True,)
    
    
    
    
    
    if __name__ == "__main__":
        import sys
        if len(sys.argv) > 1:
            exec(sys.argv[1])
        else:
            from py.test.cmdline import main
            main([__file__])