diff --git a/test.py b/test.py
index 0a73b472fe00649a513dfe68b3078548658b50f4..93890bcff9472def502c033cf228c3d563e14b84 100644
--- a/test.py
+++ b/test.py
@@ -2,6 +2,7 @@ import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
 import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
 import pyopencl.clrandom  # noqa
 import loopy as lp  # noqa
 import sys
@@ -48,6 +49,11 @@ def test_compute_flux_derivatives(ctx_factory):
             metric_jacobians=metric_jacobians)
 
 
+def f_array(queue, *shape):  # random float32 device array with dims `shape`
+    ary = np.random.random_sample(shape).astype(np.float32).copy(order="F")
+    return cl.array.to_device(queue, ary)  # upload the F-ordered host copy
+
+
 def get_gpu_transformed_weno():
     prg = f.prg
 
@@ -62,7 +68,7 @@ def get_gpu_transformed_weno():
     cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
             lp.AddressSpace.GLOBAL)
 
-    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6"]:
+    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
         cfd = lp.split_iname(cfd, "i"+suffix, 16,
                 outer_tag="g.0", inner_tag="l.0")
         cfd = lp.split_iname(cfd, "j"+suffix, 16,
@@ -106,10 +112,10 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     ny = 10
     nz = 10
 
-    states = f.random_array(nvars, nx+6, ny+6, nz+6)
-    fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
+    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
 
     flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
         nz+6), dtype=np.float32, order="F")
@@ -120,6 +126,13 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
         with open("gen-code.cl", "w") as outf:
             outf.write(lp.generate_code_v2(prg).device_code())
 
+    prg = lp.set_options(prg, no_numpy=True)
+
+    prg(queue, nvars=nvars, ndim=ndim,
+            states=states, fluxes=fluxes, metrics=metrics,
+            metric_jacobians=metric_jacobians,
+            flux_derivatives=flux_derivatives_dev)
+
     prg(queue, nvars=nvars, ndim=ndim,
             states=states, fluxes=fluxes, metrics=metrics,
             metric_jacobians=metric_jacobians,
@@ -135,15 +148,17 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
 
     ndim = 3
     nvars = 5
-    n = 100
+    n = 16*16
     nx = n
     ny = n
     nz = n
 
-    states = f.random_array(nvars, nx+6, ny+6, nz+6)
-    fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
+    print("ARRAY GEN")
+    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
+    print("END ARRAY GEN")
 
     flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
         nz+6), dtype=np.float32, order="F")
@@ -154,15 +169,34 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
         with open("gen-code.cl", "w") as outf:
             outf.write(lp.generate_code_v2(prg).device_code())
 
-    prg = lp.set_options(prg, ignore_boostable_into=True, write_wrapper=True)
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+    prg = lp.set_options(prg, ignore_boostable_into=True)
+    prg = lp.set_options(prg, no_numpy=True)
+    #prg = lp.set_options(prg, write_wrapper=True)
     #op_map = lp.get_op_map(prg, count_redundant_work=False)
     #print(op_map)
 
+    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
+
     from functools import partial
     run = partial(prg, queue, nvars=nvars, ndim=ndim,
             states=states, fluxes=fluxes, metrics=metrics,
             metric_jacobians=metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
+            flux_derivatives=flux_derivatives_dev,
+            allocator=allocator)
+
+    # {{{ monkeypatch enqueue_nd_range_kernel to trace
+
+    if 0:
+        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel
+
+        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
+            print(f"Enqueueing {ker.function_name}")
+            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)
+
+        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper
+
+    # }}}
 
     print("warmup")
     for iwarmup_round in range(2):
@@ -181,7 +215,9 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
     queue.finish()
     one_round = (time() - start)/nrounds
 
-    print(f"DOFs/s: {n**3/one_round}, elapsed per round: {one_round} s")
+    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
+    print(f"elapsed per round: {one_round} s")
+    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")
 
 
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.