diff --git a/grudge/execution.py b/grudge/execution.py
index 59e32a0db78fd4218f369d0e6f2f412ccddb8e3c..dba0648d820581f0f47febc308370cfea93e4d97 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -25,7 +25,6 @@ from numbers import Number
 import numpy as np
 
 from pytools import memoize_in
-from pytools.obj_array import make_obj_array
 
 import loopy as lp
 import pyopencl as cl
@@ -254,13 +253,7 @@ class ExecutionMapper(mappers.Evaluator,
         raise NotImplementedError(
                 "differentiation should be happening in batched form")
 
-    def map_elementwise_linear(self, op, field_expr):
-        field = self.rec(field_expr)
-
-        from grudge.tools import is_zero
-        if is_zero(field):
-            return 0
-
+    def _elwise_linear_loopy_prg(self):
         @memoize_in(self.array_context, (ExecutionMapper, "elwise_linear_knl"))
         def prg():
             result = make_loopy_program(
@@ -269,18 +262,27 @@ class ExecutionMapper(mappers.Evaluator,
                         0<=idof<ndiscr_nodes_out and
                         0<=j<ndiscr_nodes_in}""",
                     "result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])",
-                    name="diff")
+                    name="elwise_linear")
 
             result = lp.tag_array_axes(result, "mat", "stride:auto,stride:auto")
             return result
 
+        return prg()
+
+    def map_elementwise_linear(self, op, field_expr):
+        field = self.rec(field_expr)
+
+        from grudge.tools import is_zero
+        if is_zero(field):
+            return 0
+
         in_discr = self.discrwb.discr_from_dd(op.dd_in)
         out_discr = self.discrwb.discr_from_dd(op.dd_out)
 
         result = out_discr.empty(self.array_context, dtype=field.entry_dtype)
 
+        prg = self._elwise_linear_loopy_prg()
         for in_grp, out_grp in zip(in_discr.groups, out_discr.groups):
-
             cache_key = "elwise_linear", in_grp, out_grp, op, field.entry_dtype
             try:
                 matrix = self.bound_op.operator_data_cache[cache_key]
@@ -294,7 +296,7 @@ class ExecutionMapper(mappers.Evaluator,
                 self.bound_op.operator_data_cache[cache_key] = matrix
 
             self.array_context.call_loopy(
-                    prg(),
+                    prg,
                     mat=matrix,
                     result=result[out_grp.index],
                     vec=field[in_grp.index])
@@ -485,70 +487,42 @@ class ExecutionMapper(mappers.Evaluator,
     def map_insn_diff_batch_assign(self, insn, profile_data=None):
         field = self.rec(insn.field)
         repr_op = insn.operators[0]
-        # FIXME: There's no real reason why differentiation is special,
-        # execution-wise.
-        # This should be unified with map_elementwise_linear, which should
-        # be extended to support batching.
 
         assert repr_op.dd_in.domain_tag == repr_op.dd_out.domain_tag
 
-        @memoize_in(self.array_context,
-                (ExecutionMapper, "reference_derivative_prg"))
-        def prg(nmatrices):
-            result = make_loopy_program(
-                    """{[imatrix, iel, idof, j]:
-                        0<=imatrix<nmatrices and
-                        0<=iel<nelements and
-                        0<=idof<nunit_nodes_out and
-                        0<=j<nunit_nodes_in}""",
-                    """
-                    result[imatrix, iel, idof] = sum(
-                            j, diff_mat[imatrix, idof, j] * vec[iel, j])
-                    """,
-                    name="diff")
+        in_discr = self.discrwb.discr_from_dd(repr_op.dd_in)
+        out_discr = self.discrwb.discr_from_dd(repr_op.dd_out)
 
-            result = lp.fix_parameters(result, nmatrices=nmatrices)
-            result = lp.tag_inames(result, "imatrix: unr")
-            result = lp.tag_array_axes(result, "result", "sep,c,c")
-            return result
+        prg = self._elwise_linear_loopy_prg()
 
-        noperators = len(insn.operators)
+        result = []
+        for name, op in zip(insn.names, insn.operators):
+            group_results = []
+            for in_grp, out_grp in zip(in_discr.groups, out_discr.groups):
+                assert in_grp.nelements == out_grp.nelements
 
-        in_discr = self.discrwb.discr_from_dd(repr_op.dd_in)
-        out_discr = self.discrwb.discr_from_dd(repr_op.dd_out)
+                if in_grp.nelements == 0:
+                    continue
 
-        result = make_obj_array([
-            out_discr.empty(self.array_context, dtype=field.entry_dtype)
-            for idim in range(noperators)])
+                # Cache operator
+                cache_key = "diff_batch", in_grp, out_grp, tuple(insn.operators),\
+                        field.entry_dtype
+                try:
+                    matrices_dev = self.bound_op.operator_data_cache[cache_key]
+                except KeyError:
+                    matrices_dev = [self.array_context.from_numpy(mat)
+                            for mat in repr_op.matrices(out_grp, in_grp)]
+                    self.bound_op.operator_data_cache[cache_key] = matrices_dev
 
-        for in_grp, out_grp in zip(in_discr.groups, out_discr.groups):
-            if in_grp.nelements == 0:
-                continue
+                group_results.append(self.array_context.call_loopy(
+                        prg,
+                        mat=matrices_dev[op.rst_axis],
+                        vec=field[in_grp.index])["result"])
 
-            # Cache operator
-            cache_key = "diff_batch", in_grp, out_grp, tuple(insn.operators),\
-                    field.entry_dtype
-            try:
-                matrices_ary_dev = self.bound_op.operator_data_cache[cache_key]
-            except KeyError:
-                matrices = repr_op.matrices(out_grp, in_grp)
-                matrices_ary = np.empty(
-                        (noperators, out_grp.nunit_dofs, in_grp.nunit_dofs),
-                        dtype=field.entry_dtype)
-                for i, op in enumerate(insn.operators):
-                    matrices_ary[i] = matrices[op.rst_axis]
-                matrices_ary_dev = self.array_context.from_numpy(matrices_ary)
-                self.bound_op.operator_data_cache[cache_key] = matrices_ary_dev
+            result.append(
+                    (name, DOFArray(self.array_context, tuple(group_results))))
 
-            self.array_context.call_loopy(
-                    prg(noperators),
-                    diff_mat=matrices_ary_dev,
-                    result=make_obj_array([
-                        result[iop][out_grp.index]
-                        for iop in range(noperators)
-                        ]), vec=field[in_grp.index])
-
-        return [(name, result[i]) for i, name in enumerate(insn.names)], []
+        return result, []
 
     # }}}
 
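
Note (illustrative, not part of the patch): per element group, the shared `elwise_linear` kernel applies a single matrix to every element's DOF vector, exactly as the loopy instruction `result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])` states. A minimal NumPy sketch of the same contraction, with made-up shapes:

    import numpy as np

    mat = np.random.rand(3, 4)     # (ndiscr_nodes_out, ndiscr_nodes_in)
    vec = np.random.rand(10, 4)    # (nelements, ndiscr_nodes_in)

    # result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])
    result = np.einsum("ij,ej->ei", mat, vec)   # (nelements, ndiscr_nodes_out)

map_insn_diff_batch_assign now reuses that one kernel per differentiation axis, passing mat=matrices_dev[op.rst_axis] for each operator in the batch, instead of keeping its own batched "reference_derivative_prg".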