diff --git a/benchmark.py b/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87a2197a5a307475eab124c61bb6489d1d689d4
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,106 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
+import pyopencl.clrandom  # noqa
+import loopy as lp  # noqa
+import sys
+
+import logging
+
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+import fixtures
+
+def benchmark_compute_flux_derivatives_gpu(ctx_factory):
+    logging.basicConfig(level="INFO")
+
+    prg = fixtures.get_gpu_transformed_weno()
+
+    queue = fixtures.get_queue(ctx_factory)
+
+    ndim = 3
+    nvars = 5
+    n = 16*16
+    nx = n
+    ny = n
+    nz = n
+
+    print("ARRAY GEN")
+    states = fixtures.f_array(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = fixtures.f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = fixtures.f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = fixtures.f_array(queue, nx+6, ny+6, nz+6)
+    print("END ARRAY GEN")
+
+    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
+        nz+6), dtype=np.float32, order="F")
+
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+
+    if 0:
+        with open("gen-code.cl", "w") as outf:
+            outf.write(lp.generate_code_v2(prg).device_code())
+
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+    prg = lp.set_options(prg, ignore_boostable_into=True)
+    prg = lp.set_options(prg, no_numpy=True)
+    #prg = lp.set_options(prg, write_wrapper=True)
+    #op_map = lp.get_op_map(prg, count_redundant_work=False)
+    #print(op_map)
+
+    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
+
+    from functools import partial
+    run = partial(prg, queue, nvars=nvars, ndim=ndim,
+            states=states, fluxes=fluxes, metrics=metrics,
+            metric_jacobians=metric_jacobians,
+            flux_derivatives=flux_derivatives_dev,
+            allocator=allocator)
+
+    # {{{ monkeypatch enqueue_nd_range_kernel to trace
+
+    if 0:
+        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel
+
+        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
+            print(f"Enqueueing {ker.function_name}")
+            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)
+
+        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper
+
+    # }}}
+
+    print("warmup")
+    for iwarmup_round in range(2):
+        run()
+
+    nrounds = 10
+
+    queue.finish()
+    print("timing")
+    from time import time
+    start = time()
+
+    for iround in range(nrounds):
+        run()
+
+    queue.finish()
+    one_round = (time() - start)/nrounds
+
+    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
+    print(f"elapsed per round: {one_round} s")
+    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        benchmark_compute_flux_derivatives_gpu(cl._csc):
+        #from pytest import main
+        #main([__file__])
diff --git a/fixtures.py b/fixtures.py
index 73df2fefb9fddbc371eac6297cbdf25cf4245fb2..3ec8efc07a3f773f769e019c2854b01b0a68236f 100644
--- a/fixtures.py
+++ b/fixtures.py
@@ -11,6 +11,51 @@ from pytest import approx
 _WENO_PRG = []
 _QUEUE = []
 
+def get_gpu_transformed_weno():
+    prg = fixtures.get_weno_program()
+
+    cfd = prg["compute_flux_derivatives"]
+
+    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+
+    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
+            lp.AddressSpace.GLOBAL)
+
+    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
+        cfd = lp.split_iname(cfd, "i"+suffix, 16,
+                outer_tag="g.0", inner_tag="l.0")
+        cfd = lp.split_iname(cfd, "j"+suffix, 16,
+                outer_tag="g.1", inner_tag="l.1")
+
+    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
+        cfd = lp.assignment_to_subst(cfd, var_name)
+
+    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
+
+    prg = prg.with_kernel(cfd)
+
+    # FIXME: These should work, but don't
+    # FIXME: Undo the hand-inlining in WENO.F90
+    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
+    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
+
+    if 0:
+        print(prg["convert_to_generalized_frozen"])
+        1/0
+
+    return prg
+
+
 def get_queue(ctx_factory):
     if not _QUEUE:
         ctx = ctx_factory()
@@ -43,6 +88,10 @@ def get_weno_program():
     _WENO_PRG.append(prg)
     return prg
 
+def f_array(queue, *shape):
+    ary = np.random.random_sample(shape).astype(np.float32).copy(order="F")
+    return cl.array.to_device(queue, ary)
+
 def random_array(*args):
     return np.random.random_sample(args).astype(np.float32).copy(order="F")
 
diff --git a/test.py b/test.py
index 1f4afa31dc186e82e19afa7163a829461a40d02a..5db0b73509d167009a80075d8cc12f546efc7d38 100644
--- a/test.py
+++ b/test.py
@@ -46,60 +46,10 @@ def test_compute_flux_derivatives(ctx_factory):
             metric_jacobians=metric_jacobians)
 
 
-def f_array(queue, *shape):
-    ary = np.random.random_sample(shape).astype(np.float32).copy(order="F")
-    return cl.array.to_device(queue, ary)
-
-
-def get_gpu_transformed_weno():
-    prg = fixtures.get_weno_program()
-
-    cfd = prg["compute_flux_derivatives"]
-
-    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
-
-    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-            lp.AddressSpace.GLOBAL)
-
-    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
-        cfd = lp.split_iname(cfd, "i"+suffix, 16,
-                outer_tag="g.0", inner_tag="l.0")
-        cfd = lp.split_iname(cfd, "j"+suffix, 16,
-                outer_tag="g.1", inner_tag="l.1")
-
-    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
-        cfd = lp.assignment_to_subst(cfd, var_name)
-
-    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
-
-    prg = prg.with_kernel(cfd)
-
-    # FIXME: These should work, but don't
-    # FIXME: Undo the hand-inlining in WENO.F90
-    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
-    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
-
-    if 0:
-        print(prg["convert_to_generalized_frozen"])
-        1/0
-
-    return prg
-
-
 def test_compute_flux_derivatives_gpu(ctx_factory):
     logging.basicConfig(level="INFO")
 
-    prg = get_gpu_transformed_weno()
+    prg = fixtures.get_gpu_transformed_weno()
 
     queue = fixtures.get_queue(ctx_factory)
 
@@ -109,10 +59,10 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     ny = 10
     nz = 10
 
-    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
+    states = fixtures.f_array(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = fixtures.f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = fixtures.f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = fixtures.f_array(queue, nx+6, ny+6, nz+6)
 
     flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
         nz+6), dtype=np.float32, order="F")
@@ -136,87 +86,6 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
             flux_derivatives=flux_derivatives_dev)
 
 
-def benchmark_compute_flux_derivatives_gpu(ctx_factory):
-    logging.basicConfig(level="INFO")
-
-    prg = get_gpu_transformed_weno()
-
-    queue = fixtures.get_queue(ctx_factory)
-
-    ndim = 3
-    nvars = 5
-    n = 16*16
-    nx = n
-    ny = n
-    nz = n
-
-    print("ARRAY GEN")
-    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
-    print("END ARRAY GEN")
-
-    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-        nz+6), dtype=np.float32, order="F")
-
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 0:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-    prg = lp.set_options(prg, ignore_boostable_into=True)
-    prg = lp.set_options(prg, no_numpy=True)
-    #prg = lp.set_options(prg, write_wrapper=True)
-    #op_map = lp.get_op_map(prg, count_redundant_work=False)
-    #print(op_map)
-
-    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
-
-    from functools import partial
-    run = partial(prg, queue, nvars=nvars, ndim=ndim,
-            states=states, fluxes=fluxes, metrics=metrics,
-            metric_jacobians=metric_jacobians,
-            flux_derivatives=flux_derivatives_dev,
-            allocator=allocator)
-
-    # {{{ monkeypatch enqueue_nd_range_kernel to trace
-
-    if 0:
-        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel
-
-        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
-            print(f"Enqueueing {ker.function_name}")
-            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)
-
-        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper
-
-    # }}}
-
-    print("warmup")
-    for iwarmup_round in range(2):
-        run()
-
-    nrounds = 10
-
-    queue.finish()
-    print("timing")
-    from time import time
-    start = time()
-
-    for iround in range(nrounds):
-        run()
-
-    queue.finish()
-    one_round = (time() - start)/nrounds
-
-    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
-    print(f"elapsed per round: {one_round} s")
-    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")
-
-
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.
 if __name__ == "__main__":
     if len(sys.argv) > 1: