diff --git a/examples/dg.py b/examples/dg.py
index 4e034c4f90daf1ecbf52c01ae18848153945eba6..85563ca1ff27d4bd875485f67ac99f1c67a72d8a 100644
--- a/examples/dg.py
+++ b/examples/dg.py
@@ -1,4 +1,4 @@
-
+# FIXME: This example has not yet been updated for new-style loopy.
 
 
 
diff --git a/examples/quadrature.py b/examples/quadrature.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bf4e6f21f6b1ecd5b234dd31a0864dba372f09a
--- /dev/null
+++ b/examples/quadrature.py
@@ -0,0 +1,107 @@
+
+import numpy as np
+import pyopencl as cl
+import pyopencl.array as cl_array
+import loopy as lp
+
+
+
+
+def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32,
+        order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0):
+    """Copy ran_factor*randn(*shape) + id_factor*eye(k=od) to the device via queue."""
+    if isinstance(shape, int):
+        shape = (shape, shape)
+    size = max(shape)
+    diag_part = id_factor*np.eye(size, k=od)
+    if inc_factor:  # nonzero inc_factor overwrites the main diagonal with a ramp
+        idx = np.arange(size)
+        diag_part[idx, idx] = inc_factor*idx
+    noise = ran_factor*np.random.randn(*shape)
+    host_ary = np.asarray(noise + diag_part[:shape[0], :shape[1]],
+        dtype=dtype, order=order)
+    return cl_array.to_device(queue, host_ary)
+
+
+
+
+def build_mass_mat_maker(ctx_factory=cl.create_some_context):
+    """Build the "mass_mat" loopy kernel and pass its schedules to lp.drive_timing_run."""
+    dtype = np.float32
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    Nb = 3  # upper bound of iname j
+    Nv = 3  # upper bound of iname i
+    Nq = 3*3  # upper bound of iname q
+
+    Nc = 1600  # leading extent declared for "m" and "det_j"; "ncells" is the runtime bound
+    from pymbolic import var
+    m, w, det_j, phi, c, i, j, q = [var(s) for s in "m w det_j phi c i j q".split()]
+    # One instruction over (c, i, j, q): m[c,i,j] <- w[q]*det_j[c]*phi[i,q]*phi[j,q].
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[ncells] -> {[c,i,j,q]: 0<=c<ncells and 0 <= i < %(Nv)s "
+            "and 0<=j<%(Nb)s and 0<=q<%(Nq)s}" % dict(
+                Nv=Nv, Nb=Nb, Nq=Nq),
+            [
+                (m[c, i, j], w[q]*det_j[c]*phi[i,q]*phi[j,q])
+                ],
+            [
+                lp.ArrayArg("m", dtype, shape=(Nc, Nv, Nb)),
+                lp.ArrayArg("w", dtype, shape=(Nq,)),
+                lp.ArrayArg("det_j", dtype, shape=(Nc,)),
+                lp.ArrayArg("phi", dtype, shape=(Nv, Nq,)),
+                lp.ScalarArg("ncells", np.int32, approximately=1000),
+                ],
+            name="mass_mat",
+            iname_to_tag=dict(i="l.0", j="l.1")
+            )
+    knl = lp.split_dimension(knl, "c", 8, outer_tag="g.0", inner_tag="l.2")
+    knl = lp.add_prefetch(knl, "det_j", ["c_inner"])
+
+    # TODO: fix register prefetch
+    # TODO: fix redundant slab generation
+    # FIXME: the alternatives below are disabled; 'k', 'a', 'b' do not exist in this kernel.
+    #knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2")
+    #knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0")
+
+    #ilp = 4
+    #knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
+    #j_inner_split = 16
+    #knl = lp.split_dimension(knl, "j", ilp*j_inner_split, outer_tag="g.1")
+    #knl = lp.split_dimension(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0")
+    #knl = lp.split_dimension(knl, "k", 2)
+
+    #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
+    #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
+    #assert knl.get_problems({})[0] <= 2
+
+    kernel_gen = (lp.insert_register_prefetches(knl)
+            for knl in lp.generate_loop_schedules(knl))
+
+    if False:  # never runs; 'n' and 'order' are not defined anywhere in this file
+        a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+                ran_factor=1, id_factor=5)
+        b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+                ran_factor=1, id_factor=5, inc_factor=0)
+        c = cl_array.empty_like(a)
+        a_img = cl.image_from_array(ctx, a.get(), 1)
+        b_img = cl.image_from_array(ctx, b.get(), 1)
+
+    def launcher(kernel, gsize, lsize, check):
+        1/0  # raises ZeroDivisionError at once; presumably a not-yet-implemented marker
+        evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
+                g_times_l=True)
+
+        return evt
+
+    from pyopencl.characterize import get_fast_inaccurate_build_options
+    lp.drive_timing_run(kernel_gen, queue, launcher, flop_count=0,
+            options=get_fast_inaccurate_build_options(ctx.devices[0]))
+
+
+
+
+if __name__ == "__main__":  # run the timing driver only when executed as a script
+    build_mass_mat_maker()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 390f9996e36e4b4195eb167e30e430512c6fc623..147efabf99ab20e57ee88c19cfc1bc06c9f96653 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -31,6 +31,9 @@ register_mpz_with_pymbolic()
 
 # TODO: Implement GT200 matmul, Fermi matmul, DG
 # TODO: DMA engine threads?
+# TODO: Specify initial implemented domain.
+#   (to filter away unnecessary conditions on parameters)
+# TODO: Deal with equalities that crop up.
 
 # Later:
 # ------
diff --git a/loopy/kernel.py b/loopy/kernel.py
index e5be621b01e13fb77389787bed1fc6788bbe3b2d..6eb67fde0b1dd257ee2b98f94fbee476e4d090d4 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -488,7 +488,7 @@ class LoopKernel(Record):
 
         for i in range(len(llens)):
             if llens[i] > self.device.max_work_item_sizes[i]:
-                msg(5, "group axis %d too big")
+                msg(5, "group axis %d too big" % i)
 
         from pytools import product
         if product(llens) > self.device.max_work_group_size:
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 1c6210f0d9c5c5e10636ae718ae04707c474693b..7aa72e646ac2aa800e390ee32245e103b25d24e4 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -127,10 +127,9 @@ def test_axpy(ctx_factory):
             #check_error(refsol, c.get())
 
         #return evt
-        1/0
+        pass
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
-            edit_code=True)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)