diff --git a/grudge/execution.py b/grudge/execution.py
index 7e0a083569c589b8cc6645873f47a425955463c2..3ba817e1261e0652516cac4cd1ff82e190d6a160 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -174,6 +174,7 @@ class ExecutionMapper(mappers.Evaluator,
                 default_offset=lp.auto, name="diff")
 
             knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
+            knl = lp.tag_array_axes(knl, "mat", "stride:auto,stride:auto")
             return lp.tag_inames(knl, dict(k="g.0"))
 
         in_discr = self.discrwb.discr_from_dd(op.dd_in)
@@ -287,8 +288,18 @@ class ExecutionMapper(mappers.Evaluator,
         face_discr = self.discrwb.discr_from_dd(expr.dd)
         assert face_discr.dim == 0
 
+        from meshmode.discretization.connection import \
+                ChainedDiscretizationConnection
         all_faces_conn = self.discrwb.connection_from_dds("vol", expr.dd)
-        field = face_discr.empty(self.queue, allocator=self.bound_op.allocator)
+        if isinstance(all_faces_conn, ChainedDiscretizationConnection):
+            # NOTE: this can happen if oversampling on a face. however, since
+            # the face_id stays the same, we can safely just look at the first
+            # connection
+            all_faces_conn = all_faces_conn.connections[0]
+
+        field = face_discr.empty(self.queue,
+                dtype=self.discrwb.real_dtype,
+                allocator=self.bound_op.allocator)
         field.fill(1)
 
         for grp in all_faces_conn.groups:
diff --git a/grudge/function_registry.py b/grudge/function_registry.py
index 2f56eba4ad26d066039f5ac373619ac4ac10a656..4d521560d8e6c83a8e9fa238b3c46955dc62e7a7 100644
--- a/grudge/function_registry.py
+++ b/grudge/function_registry.py
@@ -136,6 +136,7 @@ class CElementwiseBinaryFunction(Function):
             return func(arg0, arg1)
 
         from pymbolic.primitives import Variable
+
         @memoize_in(self, "map_call_knl_%s" % func_name)
         def knl():
             i = Variable("i")
diff --git a/grudge/symbolic/operators.py b/grudge/symbolic/operators.py
index c1360b1627a79ccafe3e52405a099f550012a410..8c2f7fd0fecad5ef4c4dcae4e2d708d317edc3a7 100644
--- a/grudge/symbolic/operators.py
+++ b/grudge/symbolic/operators.py
@@ -452,8 +452,7 @@ class RefMassOperator(RefMassOperatorBase):
         vand_inv_t = np.linalg.inv(vand).T
 
         weights = in_element_group.weights
-
-        return np.einsum('c,bz,acz->abc', weights, vand_inv_t, o_vand)
+        return np.einsum('j,ik,jk->ij', weights, vand_inv_t, o_vand)
 
     mapper_method = intern("map_ref_mass")
 
diff --git a/test/test_grudge.py b/test/test_grudge.py
index 65cd9d8975d49ebdd46c9029fd803aafb8089766..12b7b116e010006c957d34a210d3455e223c25c4 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -88,39 +88,70 @@ def test_inverse_metric(ctx_factory, dim):
             assert err < 1e-12, (i, j, err)
 
 
-def test_1d_mass_mat_trig(ctx_factory):
+@pytest.mark.parametrize("ambient_dim", [1, 2, 3])
+@pytest.mark.parametrize("quad_tag", [sym.QTAG_NONE, "OVSMP"])
+def test_1d_mass_mat_trig(ctx_factory, ambient_dim, quad_tag):
     """Check the integral of some trig functions on an interval using the mass
-    matrix
+    matrix.
     """
-
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    from meshmode.mesh.generation import generate_regular_rect_mesh
-
-    mesh = generate_regular_rect_mesh(a=(-4*np.pi,), b=(9*np.pi,),
-            n=(17,), order=1)
-
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=8)
-
-    x = sym.nodes(1)
-    f = bind(discr, sym.cos(x[0])**2)(queue)
-    ones = bind(discr, sym.Ones(sym.DD_VOLUME))(queue)
+    nelements = 17
+    order = 4
 
-    mass_op = bind(discr, sym.MassOperator()(sym.var("f")))
+    a = -4.0 * np.pi
+    b = +9.0 * np.pi
+    true_integral = 13*np.pi/2 * (b - a)**(ambient_dim - 1)
 
-    num_integral_1 = np.dot(ones.get(), mass_op(queue, f=f))
-    num_integral_2 = np.dot(f.get(), mass_op(queue, f=ones))
-    num_integral_3 = bind(discr, sym.integral(sym.var("f")))(queue, f=f)
-
-    true_integral = 13*np.pi/2
-    err_1 = abs(num_integral_1-true_integral)
-    err_2 = abs(num_integral_2-true_integral)
-    err_3 = abs(num_integral_3-true_integral)
+    from meshmode.discretization.poly_element import QuadratureSimplexGroupFactory
+    dd_quad = sym.DOFDesc(sym.DTAG_VOLUME_ALL, quad_tag)
+    if quad_tag is sym.QTAG_NONE:
+        quad_tag_to_group_factory = {}
+    else:
+        quad_tag_to_group_factory = {
+                quad_tag: QuadratureSimplexGroupFactory(order=2*order)
+                }
 
-    assert err_1 < 1e-10
-    assert err_2 < 1e-10
-    assert err_3 < 1e-10
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    mesh = generate_regular_rect_mesh(
+            a=(a,)*ambient_dim, b=(b,)*ambient_dim,
+            n=(nelements,)*ambient_dim, order=1)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order,
+            quad_tag_to_group_factory=quad_tag_to_group_factory)
+
+    def _get_variables_on(dd):
+        sym_f = sym.var("f", dd=dd)
+        sym_x = sym.nodes(ambient_dim, dd=dd)
+        sym_ones = sym.Ones(dd)
+
+        return sym_f, sym_x, sym_ones
+
+    sym_f, sym_x, sym_ones = _get_variables_on(sym.DD_VOLUME)
+    f_volm = bind(discr, sym.cos(sym_x[0])**2)(queue).get()
+    ones_volm = bind(discr, sym_ones)(queue).get()
+
+    sym_f, sym_x, sym_ones = _get_variables_on(dd_quad)
+    f_quad = bind(discr, sym.cos(sym_x[0])**2)(queue)
+    ones_quad = bind(discr, sym_ones)(queue)
+
+    mass_op = bind(discr, sym.MassOperator(dd_quad, sym.DD_VOLUME)(sym_f))
+
+    num_integral_1 = np.dot(ones_volm, mass_op(queue, f=f_quad).get())
+    err_1 = abs(num_integral_1 - true_integral)
+    assert err_1 < 5.0e-10, err_1
+
+    num_integral_2 = np.dot(f_volm, mass_op(queue, f=ones_quad).get())
+    err_2 = abs(num_integral_2 - true_integral)
+    assert err_2 < 5.0e-10, err_2
+
+    if quad_tag is sym.QTAG_NONE:
+        # NOTE: `integral` always makes a square mass matrix and
+        # `QuadratureSimplexGroupFactory` does not have a `mass_matrix` method.
+        num_integral_3 = bind(discr,
+                sym.integral(sym_f, dd=dd_quad))(queue, f=f_quad)
+        err_3 = abs(num_integral_3 - true_integral)
+        assert err_3 < 5.0e-10, err_3
 
 
 @pytest.mark.parametrize("dim", [1, 2, 3])