diff --git a/WENO.F90 b/WENO.F90
index e8618ef65764d2f64d310a023855a0651cfe8652..829080e5f56556ffcc276e33a5ad8d0aa0628ab6 100644
--- a/WENO.F90
+++ b/WENO.F90
@@ -303,9 +303,9 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam
   if (im > 3) im = im - 3
 
   do j=1,2
-    u_orig(1,j) = states(ik+1,j)/states(1,j)
-    u_orig(2,j) = states(il+1,j)/states(1,j)
-    u_orig(3,j) = states(im+1,j)/states(1,j)
+    do i=1,ndim
+      u_orig(i,j) = states(i+1,j)/states(1,j)
+    end do
   end do
 
   do j=1,2
@@ -350,7 +350,7 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam
   c = sqrt((1.4 - 1.0)*(H - 0.5*q))
 
   b1 = (1.4 - 1.0)/(c**2)
-  b2 = 1.0 + b1*q**2 - b1*H
+  b2 = 1.0 + b1*q - b1*H
 
   u_tilde(1) = 0.0
   do i=1,ndim
@@ -370,11 +370,11 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam
   alpha = rho/(2.0*c)
   beta = 1.0/(2.0*alpha)
 
-  lambda_roe(1) = u(1)
-  lambda_roe(2) = u(1)
-  lambda_roe(3) = u(1)
-  lambda_roe(4) = u(1) + c
-  lambda_roe(5) = u(1) - c
+  lambda_roe(1) = u_tilde(1)*metric_norm(ik)
+  lambda_roe(2) = u_tilde(1)*metric_norm(ik)
+  lambda_roe(3) = u_tilde(1)*metric_norm(ik)
+  lambda_roe(4) = u_tilde(1)*metric_norm(ik) + c*metric_norm(ik)
+  lambda_roe(5) = u_tilde(1)*metric_norm(ik) - c*metric_norm(ik)
 
   R(1,1) = 1.0
   R(2,1) = u(1)
@@ -951,6 +951,20 @@ end subroutine
 !
 ! prg = lp.parse_fortran(lp.c_preprocess(SOURCE), FILENAME)
 ! prg = lp.fix_parameters(prg, ndim=3, nvars=5, _remove=False)
+!
+! cfd = prg["compute_flux_derivatives"]
+!
+! cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+!
+! cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
+!         lp.AddressSpace.GLOBAL)
+! cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
+!         lp.AddressSpace.GLOBAL)
+! cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
+!         lp.AddressSpace.GLOBAL)
+!
+! prg = prg.with_kernel(cfd)
+!
 ! RESULT = prg
 !
 !$loopy end
diff --git a/benchmark.py b/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..71859c74a50af454451a47b6bfa2a176114c66fb
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,96 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
+import pyopencl.clrandom  # noqa
+import loopy as lp  # noqa
+import sys
+
+import logging
+
+from utilities import *
+
+
+def benchmark_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
+    logging.basicConfig(level="INFO")
+
+    prg = get_weno_program()
+    prg = transform_weno_for_gpu(prg)
+
+    queue = get_queue(ctx_factory)
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+    prg = lp.set_options(prg, no_numpy=True)
+    prg = lp.set_options(prg, ignore_boostable_into=True)
+    #prg = lp.set_options(prg, write_wrapper=True)
+    #op_map = lp.get_op_map(prg, count_redundant_work=False)
+    #print(op_map)
+
+    ndim = 3
+    nvars = 5
+    n = 16*16
+    nx = n
+    ny = n
+    nz = n
+
+    print("ARRAY GEN")
+    states = random_array_on_device(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = random_array_on_device(queue, nx+6, ny+6, nz+6)
+    print("END ARRAY GEN")
+
+    flux_derivatives_dev = empty_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
+
+    if write_code:
+        write_target_device_code(prg)
+
+    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
+
+    from functools import partial
+    run = partial(prg, queue, nvars=nvars, ndim=ndim,
+            states=states, fluxes=fluxes, metrics=metrics,
+            metric_jacobians=metric_jacobians,
+            flux_derivatives=flux_derivatives_dev,
+            allocator=allocator)
+
+    # {{{ monkeypatch enqueue_nd_range_kernel to trace
+
+    if 0:
+        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel
+
+        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
+            print(f"Enqueueing {ker.function_name}")
+            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)
+
+        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper
+
+    # }}}
+
+    print("warmup")
+    for iwarmup_round in range(2):
+        run()
+
+    nrounds = 10
+
+    queue.finish()
+    print("timing")
+    from time import time
+    start = time()
+
+    for iround in range(nrounds):
+        run()
+
+    queue.finish()
+    one_round = (time() - start)/nrounds
+
+    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
+    print(f"elapsed per round: {one_round} s")
+    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        benchmark_compute_flux_derivatives_gpu(cl._csc)
diff --git a/build-and-test-py-project.sh b/build-and-test-py-project.sh
index 0810a3871a048bb83a6f317e6eba1fbc8dc00fa3..8bfbc06940aa87368d22708756076e22b9e615a8 100644
--- a/build-and-test-py-project.sh
+++ b/build-and-test-py-project.sh
@@ -55,9 +55,7 @@ export XDG_CACHE_HOME=$HOME/.cache/$CI_RUNNER_ID
 $PIP install --upgrade pip
 $PIP install setuptools
 
-# Pinned to 3.0.4 because of https://github.com/pytest-dev/pytest/issues/2434
-# Install before a newer version gets pulled in as a dependency
-$PIP install pytest==3.0.4 pytest-warnings==0.2.0
+$PIP install pytest
 
 if test "$EXTRA_INSTALL" != ""; then
   for i in $EXTRA_INSTALL ; do
@@ -87,6 +85,6 @@ if test -f $REQUIREMENTS_TXT; then
   $PIP install -r $REQUIREMENTS_TXT
 fi
 
-${PY_EXE} -m pytest -rw --durations=10 --tb=native  --junitxml=pytest.xml -rxsw test.py
+${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxsw --runslow test.py
 
 # vim: foldmethod=marker
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..29be9d5330e9401e60381cb48a17fd6de4ed5559
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,21 @@
+# setup to mark slow tests with @pytest.mark.slow, so that they don't run by
+# default, but can be forced to run with the command-line option --runslow
+# taken from
+# https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption("--runslow", action="store_true", default=False,
+            help="run slow tests")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runslow"):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+    for item in items:
+        if "slow" in item.keywords:
+            item.add_marker(skip_slow)
diff --git a/fixtures.py b/fixtures.py
deleted file mode 100644
index 257f58216af3fa307929dec6a4b79d34252ebfc5..0000000000000000000000000000000000000000
--- a/fixtures.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import numpy as np
-import numpy.linalg as la  # noqa
-import pyopencl as cl
-import pyopencl.array  # noqa
-import pyopencl.clrandom  # noqa
-import loopy as lp
-
-from pytest import approx
-
-
-def with_root_kernel(prg, root_name):
-    # FIXME This is a little less beautiful than it could be
-    new_prg = prg.copy(name=root_name)
-    for name in prg:
-        clbl = new_prg[name]
-        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
-            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
-
-    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
-    return new_prg
-
-
-class LoopyFixture:
-    _WENO_PRG = []
-    _QUEUE = []
-
-    def __init__(self):
-        self.prg = self.get_weno_program()
-
-    def get_weno_program(self):
-        if self._WENO_PRG:
-            return self._WENO_PRG[0]
-
-        fn = "WENO.F90"
-
-        with open(fn, "r") as infile:
-            infile_content = infile.read()
-
-        prg = lp.parse_transformed_fortran(infile_content, filename=fn)
-        self._WENO_PRG.append(prg)
-        return prg
-
-    def get_queue(self, ctx_factory):
-        if not self._QUEUE:
-            ctx = ctx_factory()
-            self._QUEUE.append(cl.CommandQueue(ctx))
-        return self._QUEUE[0]
-
-    def random_array(self, *args):
-        return np.random.random_sample(args).astype(np.float32).copy(order="F")
-
-    def mult_mat_vec(self, ctx_factory, alpha, a, b):
-        queue = self.get_queue(ctx_factory)
-
-        c_dev = cl.array.empty(queue, 10, dtype=np.float32)
-
-        prg = with_root_kernel(self.prg, "mult_mat_vec")
-        prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
-
-        return c_dev.get()
-
-    def compute_flux_derivatives(self, ctx_factory,
-            nvars, ndim, nx, ny, nz,
-            states, fluxes, metrics, metric_jacobians):
-
-        queue = self.get_queue(ctx_factory)
-
-        prg = self.prg
-        cfd = prg["compute_flux_derivatives"]
-
-        cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
-
-        cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-                lp.AddressSpace.GLOBAL)
-        cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-                lp.AddressSpace.GLOBAL)
-        cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-                lp.AddressSpace.GLOBAL)
-
-        prg = prg.with_kernel(cfd)
-
-        flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-            nz+6), dtype=np.float32, order="F")
-
-        prg(queue, nvars=nvars, ndim=ndim,
-                states=states, fluxes=fluxes, metrics=metrics,
-                metric_jacobians=metric_jacobians,
-                flux_derivatives=flux_derivatives_dev)
-        return flux_derivatives_dev.get()
diff --git a/test.py b/test.py
index 93890bcff9472def502c033cf228c3d563e14b84..97ccfe7cc1d668e3ccfeae5e4bb4dfd5410686ca 100644
--- a/test.py
+++ b/test.py
@@ -5,225 +5,123 @@ import pyopencl.array  # noqa
 import pyopencl.tools  # noqa
 import pyopencl.clrandom  # noqa
 import loopy as lp  # noqa
-import sys
 
+import sys
 import logging
 
 import pytest
+from pytest import approx
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-from fixtures import LoopyFixture
-
-
-f = LoopyFixture()
-
-
-def test_matvec(ctx_factory):
-    a = f.random_array(10, 10)
-    b = f.random_array(10)
+from utilities import *
+
+
+@pytest.mark.parametrize("states_str,fluxes_str,direction", [
+    ("2 4 4 4 20,1 1 1 1 5.5", "4 11.2 8 8 46.4,1 2.6 1 1 7.1", "x"),
+    ("2 4 4 4 20,1 1 1 1 5.5", "4 8 11.2 8 46.4,1 1 2.6 1 7.1", "y"),
+    ("2 4 4 4 20,1 1 1 1 5.5", "4 8 8 11.2 46.4,1 1 1 2.6 7.1", "z"),
+    ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 2.6 1 1 -7.1,-4 11.2 8 8 -46.4", "x"),
+    ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 1 2.6 1 -7.1,-4 8 11.2 8 -46.4", "y"),
+    ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 1 1 2.6 -7.1,-4 8 8 11.2 -46.4", "z"),
+    ("2 4 8 12 64,1 1 2 3 11", "4 11.2 16 24 134.4,1 2.6 2 3 12.6", "x"),
+    ("2 4 8 12 64,1 1 2 3 11", "8 16 35.2 48 268.8,2 2 5.6 6 25.2", "y"),
+    ("2 4 8 12 64,1 1 2 3 11", "12 24 48 75.2 403.2,3 3 6 10.6 37.8", "z"),
+    ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-1 2.6 2 3 -12.6,-4 11.2 16 24 -134.4", "x"),
+    ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-2 2 5.6 6 -25.2,-8 16 35.2 48 -268.8", "y"),
+    ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-3 3 6 10.6 -37.8,-12 24 48 75.2 -403.2", "z")
+    ])
+def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
+    def identity_matrix(n):
+        return np.identity(n).astype(np.float32).copy(order="F")
+
+    def check_roe_identity(states, R, Rinv):
+        dState = states[:,1] - states[:,0]
+        compare_arrays(R@(Rinv@dState), dState)
+
+    def check_roe_property(states, fluxes, R, Rinv, lam):
+        dState = states[:,1] - states[:,0]
+        dFlux = fluxes[:,1] - fluxes[:,0]
+
+        temp = Rinv@dState
+        temp = np.multiply(lam, temp)
+        compare_arrays(R@temp, dFlux)
+
+    prg = get_weno_program_with_root_kernel("roe_eigensystem")
+    queue = get_queue(ctx_factory)
 
-    c = f.mult_mat_vec(ctx_factory, a=a, b=b, alpha=1.0)
-
-    assert la.norm(a@b - c, 2)/la.norm(c) < 1e-5
-
-
-def test_compute_flux_derivatives(ctx_factory):
-    logging.basicConfig(level="INFO")
-
-    ndim = 3
     nvars = 5
-    nx = 10
-    ny = 10
-    nz = 10
-
-    states = f.random_array(nvars, nx+6, ny+6, nz+6)
-    fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
-
-    f.compute_flux_derivatives(ctx_factory,
-            nvars=nvars, ndim=ndim, nx=nx, ny=ny, nz=nz,
-            states=states, fluxes=fluxes, metrics=metrics,
-            metric_jacobians=metric_jacobians)
-
-
-def f_array(queue, *shape):
-    ary = np.random.random_sample(shape).astype(np.float32).copy(order="F")
-    return cl.array.to_device(queue, ary)
-
-
-def get_gpu_transformed_weno():
-    prg = f.prg
-
-    cfd = prg["compute_flux_derivatives"]
-
-    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
-
-    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-            lp.AddressSpace.GLOBAL)
+    ndim = 3
+    dirs = {"x" : 1, "y" : 2, "z" : 3}
 
-    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
-        cfd = lp.split_iname(cfd, "i"+suffix, 16,
-                outer_tag="g.0", inner_tag="l.0")
-        cfd = lp.split_iname(cfd, "j"+suffix, 16,
-                outer_tag="g.1", inner_tag="l.1")
+    states = transposed_array_from_string(states_str)
+    fluxes = transposed_array_from_string(fluxes_str)
+    metrics_frozen = identity_matrix(ndim)
 
-    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
-        cfd = lp.assignment_to_subst(cfd, var_name)
+    R_dev = empty_array_on_device(queue, nvars, nvars)
+    Rinv_dev = empty_array_on_device(queue, nvars, nvars)
+    lam_dev = empty_array_on_device(queue, nvars)
 
-    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
+    prg(queue, nvars=nvars, ndim=ndim, d=dirs[direction],
+            states=states, metrics_frozen=metrics_frozen,
+            R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev)
 
-    prg = prg.with_kernel(cfd)
+    R = R_dev.get()
+    Rinv = Rinv_dev.get()
+    lam = lam_dev.get()
 
-    # FIXME: These should work, but don't
-    # FIXME: Undo the hand-inlining in WENO.F90
-    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
-    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
+    print(lam)
 
-    if 0:
-        print(prg["convert_to_generalized_frozen"])
-        1/0
+    check_roe_identity(states, R, Rinv)
+    check_roe_property(states, fluxes, R, Rinv, lam)
 
-    return prg
 
+def test_matvec(ctx_factory):
+    prg = get_weno_program_with_root_kernel("mult_mat_vec")
+    queue = get_queue(ctx_factory)
 
-def test_compute_flux_derivatives_gpu(ctx_factory):
-    logging.basicConfig(level="INFO")
+    a = random_array_on_device(queue, 10, 10)
+    b = random_array_on_device(queue, 10)
 
-    prg = get_gpu_transformed_weno()
+    c = empty_array_on_device(queue, 10)
 
-    queue = f.get_queue(ctx_factory)
+    prg(queue, alpha=1.0, a=a, b=b, c=c)
 
-    ndim = 3
-    nvars = 5
-    nx = 10
-    ny = 10
-    nz = 10
+    compare_arrays(a.get()@b.get(), c.get())
 
-    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
 
-    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-        nz+6), dtype=np.float32, order="F")
+@pytest.mark.slow
+def test_compute_flux_derivatives(ctx_factory):
+    prg = get_weno_program()
 
+    queue = get_queue(ctx_factory)
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
-    if 1:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
-    prg = lp.set_options(prg, no_numpy=True)
-
-    prg(queue, nvars=nvars, ndim=ndim,
-            states=states, fluxes=fluxes, metrics=metrics,
-            metric_jacobians=metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
-
-    prg(queue, nvars=nvars, ndim=ndim,
-            states=states, fluxes=fluxes, metrics=metrics,
-            metric_jacobians=metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
-
-
-def benchmark_compute_flux_derivatives_gpu(ctx_factory):
-    logging.basicConfig(level="INFO")
-
-    prg = get_gpu_transformed_weno()
-
-    queue = f.get_queue(ctx_factory)
-
-    ndim = 3
-    nvars = 5
-    n = 16*16
-    nx = n
-    ny = n
-    nz = n
-
-    print("ARRAY GEN")
-    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
-    print("END ARRAY GEN")
+    lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1,
+            parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
-    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-        nz+6), dtype=np.float32, order="F")
 
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 0:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
+@pytest.mark.slow
+def test_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
+    prg = get_weno_program()
+    prg = transform_weno_for_gpu(prg)
 
+    queue = get_queue(ctx_factory)
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-    prg = lp.set_options(prg, ignore_boostable_into=True)
     prg = lp.set_options(prg, no_numpy=True)
-    #prg = lp.set_options(prg, write_wrapper=True)
-    #op_map = lp.get_op_map(prg, count_redundant_work=False)
-    #print(op_map)
-
-    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
-
-    from functools import partial
-    run = partial(prg, queue, nvars=nvars, ndim=ndim,
-            states=states, fluxes=fluxes, metrics=metrics,
-            metric_jacobians=metric_jacobians,
-            flux_derivatives=flux_derivatives_dev,
-            allocator=allocator)
-
-    # {{{ monkeypatch enqueue_nd_range_kernel to trace
-
-    if 0:
-        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel
-
-        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
-            print(f"Enqueueing {ker.function_name}")
-            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)
-
-        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper
-
-    # }}}
-
-    print("warmup")
-    for iwarmup_round in range(2):
-        run()
-
-    nrounds = 10
-
-    queue.finish()
-    print("timing")
-    from time import time
-    start = time()
-
-    for iround in range(nrounds):
-        run()
 
-    queue.finish()
-    one_round = (time() - start)/nrounds
+    if write_code:
+        write_target_device_code(prg)
 
-    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
-    print(f"elapsed per round: {one_round} s")
-    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")
+    lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1,
+            parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.
 if __name__ == "__main__":
     if len(sys.argv) > 1:
+        logging.basicConfig(level="INFO")
         exec(sys.argv[1])
     else:
-        from pytest import main
-        main([__file__])
+        pytest.main([__file__])
diff --git a/utilities.py b/utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..b825fcbe82ee7ef0d2a991edf9ed70f136b55f80
--- /dev/null
+++ b/utilities.py
@@ -0,0 +1,162 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
+import pyopencl.clrandom  # noqa
+import loopy as lp  # noqa
+from pytest import approx
+
+
+# {{{ arrays
+
+def compare_arrays(a, b):
+    assert a == approx(b)
+
+
+def random_array_on_device(queue, *shape):
+    ary = empty_array_on_device(queue, *shape)
+    cl.clrandom.fill_rand(ary)
+    return ary
+
+
+def empty_array_on_device(queue, *shape):
+    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
+
+
+def arrays_from_string(string_arrays):
+    return split_map_to_list(string_arrays, array_from_string, ":")
+
+
+def transposed_array_from_string(string_array):
+    return array_from_string(string_array).transpose().copy(order="F")
+
+
+def array_from_string(string_array):
+    def array_from_string_1d(string_array):
+        if string_array[0] == "i":
+            return np.array(split_map_to_list(string_array[1:], int, " "))
+        else:
+            return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32)
+
+    def array_from_string_2d(string_array):
+        if string_array[0] == ",":
+            return array_from_string_1d(string_array[1:]).reshape((-1, 1))
+        else:
+            return np.array(split_map_to_list(string_array, array_from_string_1d, ","))
+
+    def array_from_string_3d(string_array):
+        if string_array[0] == ";":
+            return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1))
+        else:
+            return np.array(split_map_to_list(string_array, array_from_string_2d, ";"))
+
+    if ";" not in string_array:
+        if "," not in string_array:
+            array = array_from_string_1d(string_array)
+        else:
+            array = array_from_string_2d(string_array)
+    else:
+        array = array_from_string_3d(string_array)
+    return array.copy(order="F")
+
+
+def split_map_to_list(string, map_func, splitter):
+    return list(map(map_func, string.split(splitter)))
+
+# }}}
+
+# {{{ device
+
+def get_queue(ctx_factory):
+    ctx = ctx_factory()
+    return cl.CommandQueue(ctx)
+
+# }}}
+
+# {{{ program / kernel
+
+_WENO_PRG = {}
+
+
+def parse_weno():
+    fn = "WENO.F90"
+
+    with open(fn, "r") as infile:
+        infile_content = infile.read()
+
+    prg = lp.parse_transformed_fortran(infile_content, filename=fn)
+    _WENO_PRG["default"] = prg
+
+
+def get_weno_program():
+    if not _WENO_PRG:
+        parse_weno()
+    return _WENO_PRG["default"]
+
+
+def with_root_kernel(prg, root_name):
+    # FIXME This is a little less beautiful than it could be
+    new_prg = prg.copy(name=root_name)
+    for name in prg:
+        clbl = new_prg[name]
+        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
+            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
+
+    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
+    return new_prg
+
+
+def setup_weno_program_with_root_kernel(knl):
+    prg = get_weno_program()
+    prg = with_root_kernel(prg, knl)
+    _WENO_PRG[knl] = prg
+
+
+def get_weno_program_with_root_kernel(knl):
+    if not knl in _WENO_PRG:
+        setup_weno_program_with_root_kernel(knl)
+    return _WENO_PRG[knl]
+
+
+def transform_weno_for_gpu(prg, print_kernel=False):
+    cfd = prg["compute_flux_derivatives"]
+
+    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
+        cfd = lp.split_iname(cfd, "i"+suffix, 16,
+                outer_tag="g.0", inner_tag="l.0")
+        cfd = lp.split_iname(cfd, "j"+suffix, 16,
+                outer_tag="g.1", inner_tag="l.1")
+
+    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
+        cfd = lp.assignment_to_subst(cfd, var_name)
+
+    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
+
+    prg = prg.with_kernel(cfd)
+
+    # FIXME: These should work, but don't
+    # FIXME: Undo the hand-inlining in WENO.F90
+    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
+    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
+
+    if print_kernel:
+        print(prg["convert_to_generalized_frozen"])
+        1/0
+
+    return prg
+
+
+def write_target_device_code(prg, outfilename="gen-code.cl"):
+    with open(outfilename, "w") as outf:
+        outf.write(lp.generate_code_v2(prg).device_code())
+
+# }}}
+
+# vim: foldmethod=marker