diff --git a/examples/advection/var-velocity.py b/examples/advection/var-velocity.py
index b6beb7a2d493aa77a154b96b8e20f6461fb757cd..388c4ce576fd8e248432831f14efaadb248fb4a1 100644
--- a/examples/advection/var-velocity.py
+++ b/examples/advection/var-velocity.py
@@ -26,11 +26,11 @@ import os
 import numpy as np
 
 import pyopencl as cl
-import pyopencl.array
-import pyopencl.clmath
+
+from meshmode.array_context import PyOpenCLArrayContext
 
 from grudge import bind, sym
-from pytools.obj_array import join_fields
+from pytools.obj_array import flat_obj_array
 
 import logging
 logger = logging.getLogger(__name__)
@@ -92,6 +92,7 @@ class Plotter:
 def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     # {{{ parameters
 
@@ -119,7 +120,7 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
         c = sym_x
     else:
         # solid body rotation
-        c = join_fields(
+        c = flat_obj_array(
                 np.pi * (d/2 - sym_x[1]),
                 np.pi * (sym_x[0] - d/2),
                 0)[:dim]
@@ -145,7 +146,7 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
         quad_tag_to_group_factory = {}
 
     from grudge import DGDiscretizationWithBoundaries
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order,
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order,
             quad_tag_to_group_factory=quad_tag_to_group_factory)
 
     # }}}
@@ -176,10 +177,10 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
             flux_type=flux_type)
 
     bound_op = bind(discr, op.sym_operator())
-    u = bind(discr, f_gaussian(sym.nodes(dim)))(queue, t=0)
+    u = bind(discr, f_gaussian(sym.nodes(dim)))(actx, t=0)
 
     def rhs(t, u):
-        return bound_op(queue, t=t, u=u)
+        return bound_op(t=t, u=u)
 
     # }}}
 
@@ -197,7 +198,7 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
             continue
 
         if step % 10 == 0:
-            norm_u = norm(queue, u=event.state_component)
+            norm_u = norm(u=event.state_component)
             plot(event, "fld-var-velocity-%04d" % step)
 
         step += 1
diff --git a/examples/advection/weak.py b/examples/advection/weak.py
index 14ed08e1a757d149c1c96ecb9d84eb60d15400b1..88703a0e930ff703d15e3e9dee6d7be11974002a 100644
--- a/examples/advection/weak.py
+++ b/examples/advection/weak.py
@@ -22,8 +22,8 @@ import numpy as np
 import numpy.linalg as la
 
 import pyopencl as cl
-import pyopencl.array
-import pyopencl.clmath
+
+from meshmode.array_context import PyOpenCLArrayContext
 
 from grudge import bind, sym
 
@@ -88,6 +88,7 @@ class Plotter:
 def main(ctx_factory, dim=2, order=4, visualize=True):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     # {{{ parameters
 
@@ -123,7 +124,7 @@ def main(ctx_factory, dim=2, order=4, visualize=True):
             order=order)
 
     from grudge import DGDiscretizationWithBoundaries
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     # }}}
 
@@ -142,10 +143,10 @@ def main(ctx_factory, dim=2, order=4, visualize=True):
         flux_type=flux_type)
 
     bound_op = bind(discr, op.sym_operator())
-    u = bind(discr, u_analytic(sym.nodes(dim)))(queue, t=0)
+    u = bind(discr, u_analytic(sym.nodes(dim)))(actx, t=0)
 
     def rhs(t, u):
-        return bound_op(queue, t=t, u=u)
+        return bound_op(t=t, u=u)
 
     # }}}
 
@@ -165,7 +166,7 @@ def main(ctx_factory, dim=2, order=4, visualize=True):
             continue
 
         if step % 10 == 0:
-            norm_u = norm(queue, u=event.state_component)
+            norm_u = norm(u=event.state_component)
             plot(event, "fld-weak-%04d" % step)
 
         step += 1
diff --git a/examples/dagrt-fusion.py b/examples/dagrt-fusion.py
index fef376c4f94a32725b9e2537d151901a0667f00f..a493059e642e24867c2b6658722cad837189d3ae 100755
--- a/examples/dagrt-fusion.py
+++ b/examples/dagrt-fusion.py
@@ -34,7 +34,6 @@ import contextlib
 import logging
 import numpy as np
 import os
-import six
 import sys
 import pyopencl as cl
 import pyopencl.array  # noqa
@@ -42,6 +41,10 @@ import pytest
 
 import dagrt.language as lang
 import pymbolic.primitives as p
+
+from meshmode.dof_array import DOFArray
+from meshmode.array_context import PyOpenCLArrayContext
+
 import grudge.symbolic.mappers as gmap
 import grudge.symbolic.operators as sym_op
 from grudge.execution import ExecutionMapper
@@ -50,7 +53,7 @@ from pymbolic.mapper import Mapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
 from pytools import memoize
-from pytools.obj_array import join_fields
+from pytools.obj_array import flat_obj_array
 
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 from leap.rk import LSRK4MethodBuilder
@@ -82,6 +85,15 @@ def open_output_file(filename):
             outfile.close()
 
 
+def dof_array_nbytes(ary: np.ndarray):
+    if isinstance(ary, np.ndarray) and ary.dtype.char == "O":
+        return sum(
+                dof_array_nbytes(ary[idx])
+                for idx in np.ndindex(ary.shape))
+    else:
+        return ary.nbytes
+
+
 # {{{ topological sort
 
 def topological_sort(stmts, root_deps):
@@ -241,8 +253,7 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
 
 class RK4TimeStepperBase(object):
 
-    def __init__(self, queue, component_getter):
-        self.queue = queue
+    def __init__(self, component_getter):
         self.component_getter = component_getter
 
     def get_initial_context(self, fields, t_start, dt):
@@ -254,7 +265,7 @@ class RK4TimeStepperBase(object):
                 flattened_fields.extend(field)
             else:
                 flattened_fields.append(field)
-        flattened_fields = join_fields(*flattened_fields)
+        flattened_fields = flat_obj_array(*flattened_fields)
         del fields
 
         return {
@@ -286,7 +297,7 @@ class RK4TimeStepperBase(object):
         assert len(output_states) == num_fields
         assert len(output_states) == len(output_residuals)
 
-        flattened_results = join_fields(output_t, output_dt, *output_states)
+        flattened_results = flat_obj_array(output_t, output_dt, *output_states)
 
         self.bound_op = bind(
                 discr, flattened_results,
@@ -305,7 +316,6 @@ class RK4TimeStepperBase(object):
                 profile_data = None
 
             results = self.bound_op(
-                    self.queue,
                     profile_data=profile_data,
                     **context)
 
@@ -327,7 +337,7 @@ class RK4TimeStepperBase(object):
 
 class RK4TimeStepper(RK4TimeStepperBase):
 
-    def __init__(self, queue, discr, field_var_name, grudge_bound_op,
+    def __init__(self, discr, field_var_name, grudge_bound_op,
                  num_fields, component_getter, exec_mapper_factory=ExecutionMapper):
         """Arguments:
 
@@ -342,7 +352,7 @@ class RK4TimeStepper(RK4TimeStepperBase):
                its components
 
         """
-        super().__init__(queue, component_getter)
+        super().__init__(component_getter)
 
         # Construct sym_rhs to have the effect of replacing the RHS calls in the
         # dagrt code with calls of the grudge operator.
@@ -353,9 +363,8 @@ class RK4TimeStepper(RK4TimeStepperBase):
                     + tuple(
                         Variable(field_var_name, dd=sym.DD_VOLUME)[i]
                         for i in range(num_fields)))))
-        sym_rhs = join_fields(*(call[i] for i in range(num_fields)))
+        sym_rhs = flat_obj_array(*(call[i] for i in range(num_fields)))
 
-        self.queue = queue
         self.grudge_bound_op = grudge_bound_op
 
         from grudge.function_registry import register_external_function
@@ -373,12 +382,12 @@ class RK4TimeStepper(RK4TimeStepperBase):
 
         self.component_getter = component_getter
 
-    def _bound_op(self, queue, t, *args, profile_data=None):
+    def _bound_op(self, array_context, t, *args, profile_data=None):
         context = {
                 "t": t,
-                self.field_var_name: join_fields(*args)}
+                self.field_var_name: flat_obj_array(*args)}
         result = self.grudge_bound_op(
-                queue, profile_data=profile_data, **context)
+                array_context, profile_data=profile_data, **context)
         if profile_data is not None:
             result = result[0]
         return result
@@ -391,9 +400,9 @@ class RK4TimeStepper(RK4TimeStepperBase):
 
 class FusedRK4TimeStepper(RK4TimeStepperBase):
 
-    def __init__(self, queue, discr, field_var_name, sym_rhs, num_fields,
+    def __init__(self, discr, field_var_name, sym_rhs, num_fields,
                  component_getter, exec_mapper_factory=ExecutionMapper):
-        super().__init__(queue, component_getter)
+        super().__init__(component_getter)
         self.set_up_stepper(
                 discr, field_var_name, sym_rhs, num_fields,
                 base_function_registry,
@@ -404,7 +413,7 @@ class FusedRK4TimeStepper(RK4TimeStepperBase):
 
 # {{{ problem setup code
 
-def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
+def get_strong_wave_op_with_discr(actx, dims=2, order=4):
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
@@ -413,7 +422,7 @@ def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
 
     logger.debug("%d elements", mesh.nelements)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     source_center = np.array([0.1, 0.22, 0.33])[:dims]
     source_width = 0.05
@@ -448,22 +457,22 @@ def dg_flux(c, tpair):
     dims = len(v)
 
     normal = sym.normal(tpair.dd, dims)
-    flux_weak = join_fields(
+    flux_weak = flat_obj_array(
             np.dot(v.avg, normal),
             u.avg * normal)
 
-    flux_weak -= (1 if c > 0 else -1)*join_fields(
+    flux_weak -= (1 if c > 0 else -1)*flat_obj_array(
             0.5*(u.int-u.ext),
             0.5*(normal * np.dot(normal, v.int-v.ext)))
 
-    flux_strong = join_fields(
+    flux_strong = flat_obj_array(
             np.dot(v.int, normal),
             u.int * normal) - flux_weak
 
     return sym.project(tpair.dd, "all_faces")(c*flux_strong)
 
 
-def get_strong_wave_op_with_discr_direct(cl_ctx, dims=2, order=4):
+def get_strong_wave_op_with_discr_direct(actx, dims=2, order=4):
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
@@ -472,7 +481,7 @@ def get_strong_wave_op_with_discr_direct(cl_ctx, dims=2, order=4):
 
     logger.debug("%d elements", mesh.nelements)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     source_center = np.array([0.1, 0.22, 0.33])[:dims]
     source_width = 0.05
@@ -502,13 +511,13 @@ def get_strong_wave_op_with_discr_direct(cl_ctx, dims=2, order=4):
     rad_u = sym.cse(sym.project("vol", BTAG_ALL)(u))
     rad_v = sym.cse(sym.project("vol", BTAG_ALL)(v))
 
-    rad_bc = sym.cse(join_fields(
+    rad_bc = sym.cse(flat_obj_array(
             0.5*(rad_u - sign*np.dot(rad_normal, rad_v)),
             0.5*rad_normal*(np.dot(rad_normal, rad_v) - sign*rad_u)
             ), "rad_bc")
 
     sym_operator = (
-            - join_fields(
+            - flat_obj_array(
                 -c*np.dot(sym.nabla(dims), v) - source_f,
                 -c*(sym.nabla(dims)*u)
                 )
@@ -532,29 +541,30 @@ def get_strong_wave_component(state_component):
 def test_stepper_equivalence(ctx_factory, order=4):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
 
     sym_operator, _ = get_strong_wave_op_with_discr(
-            cl_ctx, dims=dims, order=order)
+            actx, dims=dims, order=order)
     sym_operator_direct, discr = get_strong_wave_op_with_discr_direct(
-            cl_ctx, dims=dims, order=order)
+            actx, dims=dims, order=order)
 
     if dims == 2:
         dt = 0.04
     elif dims == 3:
         dt = 0.02
 
-    ic = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    ic = flat_obj_array(discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     bound_op = bind(discr, sym_operator)
 
     stepper = RK4TimeStepper(
-            queue, discr, "w", bound_op, 1 + discr.dim, get_strong_wave_component)
+            discr, "w", bound_op, 1 + discr.dim, get_strong_wave_component)
 
     fused_stepper = FusedRK4TimeStepper(
-            queue, discr, "w", sym_operator_direct, 1 + discr.dim,
+            discr, "w", sym_operator_direct, 1 + discr.dim,
             get_strong_wave_component)
 
     t_start = 0
@@ -573,7 +583,7 @@ def test_stepper_equivalence(ctx_factory, order=4):
         logger.debug("step %d/%d", step, nsteps)
         t, (u, v) = next(fused_steps)
         assert t == t_ref, step
-        assert norm(queue, u=u, u_ref=u_ref) <= 1e-13, step
+        assert norm(u=u, u_ref=u_ref) <= 1e-13, step
 
 # }}}
 
@@ -582,9 +592,9 @@ def test_stepper_equivalence(ctx_factory, order=4):
 
 class ExecutionMapperWrapper(Mapper):
 
-    def __init__(self, queue, context, bound_op):
-        self.inner_mapper = ExecutionMapper(queue, context, bound_op)
-        self.queue = queue
+    def __init__(self, array_context, context, bound_op):
+        self.inner_mapper = ExecutionMapper(array_context, context, bound_op)
+        self.array_context = array_context
         self.context = context
         self.bound_op = bound_op
 
@@ -592,6 +602,9 @@ class ExecutionMapperWrapper(Mapper):
         # Needed, because bound op execution can ask for variable values.
         return self.inner_mapper.map_variable(expr)
 
+    def map_node_coordinate_component(self, expr):
+        return self.inner_mapper.map_node_coordinate_component(expr)
+
     def map_grudge_variable(self, expr):
         # See map_variable()
         return self.inner_mapper.map_grudge_variable(expr)
@@ -610,7 +623,7 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
     def map_profiled_call(self, expr, profile_data):
         args = [self.inner_mapper.rec(p) for p in expr.parameters]
         return self.inner_mapper.function_registry[expr.function.name](
-                self.queue, *args, profile_data=profile_data)
+                self.array_context, *args, profile_data=profile_data)
 
     def map_profiled_essentially_elementwise_linear(self, op, field_expr,
                                                     profile_data):
@@ -623,15 +636,19 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
 
             field = self.inner_mapper.rec(field_expr)
             profile_data["bytes_read"] = (
-                    profile_data.get("bytes_read", 0) + field.nbytes)
+                    profile_data.get("bytes_read", 0)
+                    + dof_array_nbytes(field))
             profile_data["bytes_written"] = (
-                    profile_data.get("bytes_written", 0) + result.nbytes)
+                    profile_data.get("bytes_written", 0)
+                    + dof_array_nbytes(result))
 
             if op.mapper_method == "map_projection":
                 profile_data["interp_bytes_read"] = (
-                        profile_data.get("interp_bytes_read", 0) + field.nbytes)
+                        profile_data.get("interp_bytes_read", 0)
+                        + dof_array_nbytes(field))
                 profile_data["interp_bytes_written"] = (
-                        profile_data.get("interp_bytes_written", 0) + result.nbytes)
+                        profile_data.get("interp_bytes_written", 0)
+                        + dof_array_nbytes(result))
 
         return result
 
@@ -672,45 +689,71 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
         return result, []
 
     def map_insn_loopy_kernel(self, insn, profile_data):
-        kwargs = {}
         kdescr = insn.kernel_descriptor
-        for name, expr in six.iteritems(kdescr.input_mappings):
-            val = self.inner_mapper.rec(expr)
-            kwargs[name] = val
-            assert not isinstance(val, np.ndarray)
-            if profile_data is not None and isinstance(val, pyopencl.array.Array):
-                profile_data["bytes_read"] = (
-                        profile_data.get("bytes_read", 0) + val.nbytes)
-                profile_data["bytes_read_by_scalar_assignments"] = (
-                        profile_data.get("bytes_read_by_scalar_assignments", 0)
-                        + val.nbytes)
-
         discr = self.inner_mapper.discrwb.discr_from_dd(kdescr.governing_dd)
+
+        dof_array_kwargs = {}
+        other_kwargs = {}
+
+        for name, expr in kdescr.input_mappings.items():
+            v = self.inner_mapper.rec(expr)
+            if isinstance(v, DOFArray):
+                dof_array_kwargs[name] = v
+
+                if profile_data is not None:
+                    size = dof_array_nbytes(v)
+                    profile_data["bytes_read"] = (
+                            profile_data.get("bytes_read", 0) + size)
+                    profile_data["bytes_read_by_scalar_assignments"] = (
+                            profile_data.get("bytes_read_by_scalar_assignments", 0)
+                            + size)
+            else:
+                assert not isinstance(v, np.ndarray)
+                other_kwargs[name] = v
+
         for name in kdescr.scalar_args():
-            v = kwargs[name]
+            v = other_kwargs[name]
             if isinstance(v, (int, float)):
-                kwargs[name] = discr.real_dtype.type(v)
+                other_kwargs[name] = discr.real_dtype.type(v)
             elif isinstance(v, complex):
-                kwargs[name] = discr.complex_dtype.type(v)
+                other_kwargs[name] = discr.complex_dtype.type(v)
             elif isinstance(v, np.number):
                 pass
             else:
                 raise ValueError("unrecognized scalar type for variable '%s': %s"
                         % (name, type(v)))
 
-        kwargs["grdg_n"] = discr.nnodes
-        evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
+        result = {}
+        for grp in discr.groups:
+            kwargs = other_kwargs.copy()
+            kwargs["nelements"] = grp.nelements
+            kwargs["nunit_dofs"] = grp.nunit_dofs
+
+            for name, ary in dof_array_kwargs.items():
+                kwargs[name] = ary[grp.index]
+
+            knl_result = self.inner_mapper.array_context.call_loopy(
+                    kdescr.loopy_kernel, **kwargs)
+
+            for name, val in knl_result.items():
+                result.setdefault(name, []).append(val)
 
-        for val in result_dict.values():
-            assert not isinstance(val, np.ndarray)
-            if profile_data is not None and isinstance(val, pyopencl.array.Array):
+        result = {
+                name: DOFArray.from_list(
+                    self.inner_mapper.array_context, val)
+                for name, val in result.items()}
+
+        for val in result.values():
+            assert isinstance(val, DOFArray)
+            if profile_data is not None:
+                size = dof_array_nbytes(val)
                 profile_data["bytes_written"] = (
-                        profile_data.get("bytes_written", 0) + val.nbytes)
+                        profile_data.get("bytes_written", 0) + size)
                 profile_data["bytes_written_by_scalar_assignments"] = (
                         profile_data.get("bytes_written_by_scalar_assignments", 0)
-                        + val.nbytes)
+                        + size)
 
-        return list(result_dict.items()), []
+        return list(result.items()), []
 
     def map_insn_assign_to_discr_scoped(self, insn, profile_data=None):
         assignments = []
@@ -743,11 +786,12 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
 
             field = self.inner_mapper.rec(insn.field)
             profile_data["bytes_read"] = (
-                    profile_data.get("bytes_read", 0) + field.nbytes)
+                    profile_data.get("bytes_read", 0) + dof_array_nbytes(field))
 
             for _, value in assignments:
                 profile_data["bytes_written"] = (
-                        profile_data.get("bytes_written", 0) + value.nbytes)
+                        profile_data.get("bytes_written", 0)
+                        + dof_array_nbytes(value))
 
         return assignments, futures
 
@@ -761,8 +805,9 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
 def test_assignment_memory_model(ctx_factory):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    _, discr = get_strong_wave_op_with_discr_direct(cl_ctx, dims=2, order=3)
+    _, discr = get_strong_wave_op_with_discr_direct(actx, dims=2, order=3)
 
     # Assignment instruction
     bound_op = bind(
@@ -771,35 +816,36 @@ def test_assignment_memory_model(ctx_factory):
             + sym.Variable("input1", sym.DD_VOLUME),
             exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
-    input0 = discr.zeros(queue)
-    input1 = discr.zeros(queue)
+    input0 = discr.zeros(actx)
+    input1 = discr.zeros(actx)
 
     result, profile_data = bound_op(
-            queue,
             profile_data={},
             input0=input0,
             input1=input1)
 
-    assert profile_data["bytes_read"] == input0.nbytes + input1.nbytes
-    assert profile_data["bytes_written"] == result.nbytes
+    assert profile_data["bytes_read"] == \
+            dof_array_nbytes(input0) + dof_array_nbytes(input1)
+    assert profile_data["bytes_written"] == dof_array_nbytes(result)
 
 
 @pytest.mark.parametrize("use_fusion", (True, False))
 def test_stepper_mem_ops(ctx_factory, use_fusion):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
 
     sym_operator, discr = get_strong_wave_op_with_discr_direct(
-            cl_ctx, dims=dims, order=3)
+            actx, dims=dims, order=3)
 
     t_start = 0
     dt = 0.04
     t_end = 0.2
 
-    ic = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    ic = flat_obj_array(discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     if not use_fusion:
         bound_op = bind(
@@ -807,13 +853,13 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
                 exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
         stepper = RK4TimeStepper(
-                queue, discr, "w", bound_op, 1 + discr.dim,
+                discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     else:
         stepper = FusedRK4TimeStepper(
-                queue, discr, "w", sym_operator, 1 + discr.dim,
+                discr, "w", sym_operator, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
@@ -886,9 +932,9 @@ def time_insn(f):
         if profile_data is None:
             return f(self, insn, profile_data)
 
-        start = cl.enqueue_marker(self.queue)
+        start = cl.enqueue_marker(self.array_context.queue)
         retval = f(self, insn, profile_data)
-        end = cl.enqueue_marker(self.queue)
+        end = cl.enqueue_marker(self.array_context.queue)
         profile_data\
                 .setdefault(time_field_name, TimingFutureList())\
                 .append(TimingFuture(start, end))
@@ -903,15 +949,15 @@ class ExecutionMapperWithTiming(ExecutionMapperWrapper):
     def map_profiled_call(self, expr, profile_data):
         args = [self.inner_mapper.rec(p) for p in expr.parameters]
         return self.inner_mapper.function_registry[expr.function.name](
-                self.queue, *args, profile_data=profile_data)
+                self.array_context, *args, profile_data=profile_data)
 
     def map_profiled_operator_binding(self, expr, profile_data):
         if profile_data is None:
             return self.inner_mapper.map_operator_binding(expr)
 
-        start = cl.enqueue_marker(self.queue)
+        start = cl.enqueue_marker(self.array_context.queue)
         retval = self.inner_mapper.map_operator_binding(expr)
-        end = cl.enqueue_marker(self.queue)
+        end = cl.enqueue_marker(self.array_context.queue)
         time_field_name = "time_op_%s" % expr.op.mapper_method
         profile_data\
                 .setdefault(time_field_name, TimingFutureList())\
@@ -958,18 +1004,19 @@ def test_stepper_timing(ctx_factory, use_fusion):
     queue = cl.CommandQueue(
             cl_ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 3
 
     sym_operator, discr = get_strong_wave_op_with_discr_direct(
-            cl_ctx, dims=dims, order=3)
+            actx, dims=dims, order=3)
 
     t_start = 0
     dt = 0.04
     t_end = 0.1
 
-    ic = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    ic = flat_obj_array(discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     if not use_fusion:
         bound_op = bind(
@@ -977,13 +1024,13 @@ def test_stepper_timing(ctx_factory, use_fusion):
                 exec_mapper_factory=ExecutionMapperWithTiming)
 
         stepper = RK4TimeStepper(
-                queue, discr, "w", bound_op, 1 + discr.dim,
+                discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=ExecutionMapperWithTiming)
 
     else:
         stepper = FusedRK4TimeStepper(
-                queue, discr, "w", sym_operator, 1 + discr.dim,
+                discr, "w", sym_operator, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=ExecutionMapperWithTiming)
 
@@ -1009,11 +1056,11 @@ def test_stepper_timing(ctx_factory, use_fusion):
 
 # {{{ paper outputs
 
-def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
+def get_example_stepper(actx, dims=2, order=3, use_fusion=True,
                         exec_mapper_factory=ExecutionMapper,
                         return_ic=False):
     sym_operator, discr = get_strong_wave_op_with_discr_direct(
-            queue.context, dims=dims, order=3)
+            actx, dims=dims, order=3)
 
     if not use_fusion:
         bound_op = bind(
@@ -1021,19 +1068,19 @@ def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
                 exec_mapper_factory=exec_mapper_factory)
 
         stepper = RK4TimeStepper(
-                queue, discr, "w", bound_op, 1 + discr.dim,
+                discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=exec_mapper_factory)
 
     else:
         stepper = FusedRK4TimeStepper(
-                queue, discr, "w", sym_operator, 1 + discr.dim,
+                discr, "w", sym_operator, 1 + discr.dim,
                 get_strong_wave_component,
                 exec_mapper_factory=exec_mapper_factory)
 
     if return_ic:
-        ic = join_fields(discr.zeros(queue),
-                [discr.zeros(queue) for i in range(discr.dim)])
+        ic = flat_obj_array(discr.zeros(actx),
+                [discr.zeros(actx) for i in range(discr.dim)])
         return stepper, ic
 
     return stepper
@@ -1079,21 +1126,23 @@ else:
 
 def problem_stats(order=3):
     cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     with open_output_file("grudge-problem-stats.txt") as outf:
         _, dg_discr_2d = get_strong_wave_op_with_discr_direct(
-            cl_ctx, dims=2, order=order)
+            actx, dims=2, order=order)
         print("Number of 2D elements:", dg_discr_2d.mesh.nelements, file=outf)
         vol_discr_2d = dg_discr_2d.discr_from_dd("vol")
-        dofs_2d = {group.nunit_nodes for group in vol_discr_2d.groups}
+        dofs_2d = {group.nunit_dofs for group in vol_discr_2d.groups}
         from pytools import one
         print("Number of DOFs per 2D element:", one(dofs_2d), file=outf)
 
         _, dg_discr_3d = get_strong_wave_op_with_discr_direct(
-            cl_ctx, dims=3, order=order)
+            actx, dims=3, order=order)
         print("Number of 3D elements:", dg_discr_3d.mesh.nelements, file=outf)
         vol_discr_3d = dg_discr_3d.discr_from_dd("vol")
-        dofs_3d = {group.nunit_nodes for group in vol_discr_3d.groups}
+        dofs_3d = {group.nunit_dofs for group in vol_discr_3d.groups}
         from pytools import one
         print("Number of DOFs per 3D element:", one(dofs_3d), file=outf)
 
@@ -1103,9 +1152,10 @@ def problem_stats(order=3):
 def statement_counts_table():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    fused_stepper = get_example_stepper(queue, use_fusion=True)
-    stepper = get_example_stepper(queue, use_fusion=False)
+    fused_stepper = get_example_stepper(actx, use_fusion=True)
+    stepper = get_example_stepper(actx, use_fusion=False)
 
     with open_output_file("statement-counts.tex") as outf:
         if not PAPER_OUTPUT:
@@ -1131,15 +1181,15 @@ def statement_counts_table():
 
 
 @memoize(key=lambda queue, dims: dims)
-def mem_ops_results(queue, dims):
+def mem_ops_results(actx, dims):
     fused_stepper = get_example_stepper(
-            queue,
+            actx,
             dims=dims,
             use_fusion=True,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     stepper, ic = get_example_stepper(
-            queue,
+            actx,
             dims=dims,
             use_fusion=False,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting,
@@ -1193,9 +1243,10 @@ def mem_ops_results(queue, dims):
 def scalar_assignment_percent_of_total_mem_ops_table():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    result2d = mem_ops_results(queue, 2)
-    result3d = mem_ops_results(queue, 3)
+    result2d = mem_ops_results(actx, 2)
+    result3d = mem_ops_results(actx, 3)
 
     with open_output_file("scalar-assignments-mem-op-percentage.tex") as outf:
         if not PAPER_OUTPUT:
diff --git a/examples/geometry.py b/examples/geometry.py
index 6e146f21dd31545b392d8006d45d94bed7a9db02..df81298d4865b0dce384952633ee4fd00717c72b 100644
--- a/examples/geometry.py
+++ b/examples/geometry.py
@@ -28,15 +28,18 @@ import numpy as np  # noqa
 import pyopencl as cl
 from grudge import sym, bind, DGDiscretizationWithBoundaries, shortcuts
 
+from meshmode.array_context import PyOpenCLArrayContext
+
 
 def main(write_output=True):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import generate_warped_rect_mesh
     mesh = generate_warped_rect_mesh(dim=2, order=4, n=6)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=4)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=4)
 
     sym_op = sym.normal(sym.BTAG_ALL, mesh.dim)
     #sym_op = sym.nodes(mesh.dim, where=sym.BTAG_ALL)
@@ -45,7 +48,7 @@ def main(write_output=True):
     print()
     print(op.eval_code)
 
-    vec = op(queue)
+    vec = op(actx)
 
     vis = shortcuts.make_visualizer(discr, 4)
     vis.write_vtk_file("geo.vtu", [
diff --git a/examples/maxwell/cavities.py b/examples/maxwell/cavities.py
index a58f27399ccfb6e32a16d6cf594ed2a3c78076b2..eb2ee28dae6b2ed74c2760d2359d9542a9df2f59 100644
--- a/examples/maxwell/cavities.py
+++ b/examples/maxwell/cavities.py
@@ -27,6 +27,9 @@ THE SOFTWARE.
 
 import numpy as np
 import pyopencl as cl
+
+from meshmode.array_context import PyOpenCLArrayContext
+
 from grudge.shortcuts import set_up_rk4
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 
@@ -39,6 +42,7 @@ STEPS = 60
 def main(dims, write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
@@ -46,7 +50,7 @@ def main(dims, write_output=True, order=4):
             b=(1.0,)*dims,
             n=(5,)*dims)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     if 0:
         epsilon0 = 8.8541878176e-12  # C**2 / (N m**2)
@@ -60,14 +64,12 @@ def main(dims, write_output=True, order=4):
     from grudge.models.em import MaxwellOperator
     op = MaxwellOperator(epsilon, mu, flux_type=0.5, dimensions=dims)
 
-    queue = cl.CommandQueue(discr.cl_context)
-
     if dims == 3:
         sym_mode = get_rectangular_cavity_mode(1, (1, 2, 2))
-        fields = bind(discr, sym_mode)(queue, t=0, epsilon=epsilon, mu=mu)
+        fields = bind(discr, sym_mode)(actx, t=0, epsilon=epsilon, mu=mu)
     else:
         sym_mode = get_rectangular_cavity_mode(1, (2, 3))
-        fields = bind(discr, sym_mode)(queue, t=0)
+        fields = bind(discr, sym_mode)(actx, t=0)
 
     # FIXME
     #dt = op.estimate_rk4_timestep(discr, fields=fields)
@@ -78,7 +80,7 @@ def main(dims, write_output=True, order=4):
     bound_op = bind(discr, op.sym_operator())
 
     def rhs(t, w):
-        return bound_op(queue, t=t, w=w)
+        return bound_op(t=t, w=w)
 
     if mesh.dim == 2:
         dt = 0.004
@@ -117,8 +119,8 @@ def main(dims, write_output=True, order=4):
 
             step += 1
 
-            print(step, event.t, norm(queue, u=e[0]), norm(queue, u=e[1]),
-                    norm(queue, u=h[0]), norm(queue, u=h[1]),
+            print(step, event.t, norm(u=e[0]), norm(u=e[1]),
+                    norm(u=h[0]), norm(u=h[1]),
                     time()-t_last_step)
             if step % 10 == 0:
                 e, h = op.split_eh(event.state_component)
diff --git a/examples/wave/var-propagation-speed.py b/examples/wave/var-propagation-speed.py
index a7a634f7a8b79f3006525059e70cc4379581127c..f392b420d4e06c6fb547c6461c0290726388cf29 100644
--- a/examples/wave/var-propagation-speed.py
+++ b/examples/wave/var-propagation-speed.py
@@ -30,10 +30,13 @@ import pyopencl as cl
 from grudge.shortcuts import set_up_rk4
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 
+from meshmode.array_context import PyOpenCLArrayContext
+
 
 def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
     from meshmode.mesh.generation import generate_regular_rect_mesh
@@ -42,7 +45,7 @@ def main(write_output=True, order=4):
             b=(0.5,)*dims,
             n=(20,)*dims)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     source_center = np.array([0.1, 0.22, 0.33])[:mesh.dim]
     source_width = 0.05
@@ -69,19 +72,18 @@ def main(write_output=True, order=4):
             radiation_tag=BTAG_ALL,
             flux_type="upwind")
 
-    queue = cl.CommandQueue(discr.cl_context)
-    from pytools.obj_array import join_fields
-    fields = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    from pytools.obj_array import flat_obj_array
+    fields = flat_obj_array(discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     op.check_bc_coverage(mesh)
 
-    c_eval = bind(discr, c)(queue)
+    c_eval = bind(discr, c)(actx)
 
     bound_op = bind(discr, op.sym_operator())
 
     def rhs(t, w):
-        return bound_op(queue, t=t, w=w)
+        return bound_op(t=t, w=w)
 
     if mesh.dim == 2:
         dt = 0.04 * 0.3
@@ -110,7 +112,7 @@ def main(write_output=True, order=4):
 
             step += 1
 
-            print(step, event.t, norm(queue, u=event.state_component[0]),
+            print(step, event.t, norm(u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
                 vis.write_vtk_file("fld-var-propogation-speed-%04d.vtu" % step,
diff --git a/examples/wave/wave-eager-mpi.py b/examples/wave/wave-eager-mpi.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b408bb599c5002a32325f5befd9ab66ab02cee9
--- /dev/null
+++ b/examples/wave/wave-eager-mpi.py
@@ -0,0 +1,206 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+
+from pytools.obj_array import flat_obj_array, make_obj_array
+
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw
+
+from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+
+from grudge.eager import (
+        EagerDGDiscretization, interior_trace_pair, cross_rank_trace_pairs)
+from grudge.shortcuts import make_visualizer
+from grudge.symbolic.primitives import TracePair
+from mpi4py import MPI
+
+
+# {{{ wave equation bits
+
+def scalar(arg):
+    return make_obj_array([arg])
+
+
+def wave_flux(discr, c, w_tpair):
+    u = w_tpair[0]
+    v = w_tpair[1:]
+
+    normal = thaw(u.int.array_context, discr.normal(w_tpair.dd))
+
+    flux_weak = flat_obj_array(
+            np.dot(v.avg, normal),
+            normal*scalar(u.avg),
+            )
+
+    # upwind
+    v_jump = np.dot(normal, v.int-v.ext)
+    flux_weak -= flat_obj_array(
+            0.5*(u.int-u.ext),
+            0.5*normal*scalar(v_jump),
+            )
+
+    return discr.project(w_tpair.dd, "all_faces", c*flux_weak)
+
+
+def wave_operator(discr, c, w):
+    u = w[0]
+    v = w[1:]
+
+    dir_u = discr.project("vol", BTAG_ALL, u)
+    dir_v = discr.project("vol", BTAG_ALL, v)
+    dir_bval = flat_obj_array(dir_u, dir_v)
+    dir_bc = flat_obj_array(-dir_u, dir_v)
+
+    return (
+            discr.inverse_mass(
+                flat_obj_array(
+                    c*discr.weak_div(v),
+                    c*discr.weak_grad(u)
+                    )
+                -  # noqa: W504
+                discr.face_mass(
+                    wave_flux(discr, c=c, w_tpair=interior_trace_pair(discr, w))
+                    + wave_flux(discr, c=c, w_tpair=TracePair(
+                        BTAG_ALL, dir_bval, dir_bc))
+                    + sum(
+                        wave_flux(discr, c=c, w_tpair=tpair)
+                        for tpair in cross_rank_trace_pairs(discr, w))
+                    )
+                )
+                )
+
+# }}}
+
+
+def rk4_step(y, t, h, f):
+    k1 = f(t, y)
+    k2 = f(t+h/2, y + h/2*k1)
+    k3 = f(t+h/2, y + h/2*k2)
+    k4 = f(t+h, y + h*k3)
+    return y + h/6*(k1 + 2*k2 + 2*k3 + k4)
+
+
+def bump(discr, actx, t=0):
+    source_center = np.array([0.2, 0.35, 0.1])[:discr.dim]
+    source_width = 0.05
+    source_omega = 3
+
+    nodes = thaw(actx, discr.nodes())
+    center_dist = flat_obj_array([
+        nodes[i] - source_center[i]
+        for i in range(discr.dim)
+        ])
+
+    return (
+        np.cos(source_omega*t)
+        * actx.np.exp(
+            -np.dot(center_dist, center_dist)
+            / source_width**2))
+
+
+def main():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
+
+    comm = MPI.COMM_WORLD
+    num_parts = comm.Get_size()
+
+    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
+    mesh_dist = MPIMeshDistributor(comm)
+
+    dim = 2
+    nel_1d = 16
+
+    if mesh_dist.is_mananger_rank():
+        from meshmode.mesh.generation import generate_regular_rect_mesh
+        mesh = generate_regular_rect_mesh(
+                a=(-0.5,)*dim,
+                b=(0.5,)*dim,
+                n=(nel_1d,)*dim)
+
+        print("%d elements" % mesh.nelements)
+
+        part_per_element = get_partition_by_pymetis(mesh, num_parts)
+
+        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
+
+        del mesh
+
+    else:
+        local_mesh = mesh_dist.receive_mesh_part()
+
+    order = 3
+
+    discr = EagerDGDiscretization(actx, local_mesh, order=order,
+                    mpi_communicator=comm)
+
+    if dim == 2:
+        # no deep meaning here, just a fudge factor
+        dt = 0.75/(nel_1d*order**2)
+    elif dim == 3:
+        # no deep meaning here, just a fudge factor
+        dt = 0.45/(nel_1d*order**2)
+    else:
+        raise ValueError("don't have a stable time step guesstimate")
+
+    fields = flat_obj_array(
+            bump(discr, actx),
+            [discr.zeros(actx) for i in range(discr.dim)]
+            )
+
+    vis = make_visualizer(discr, order+3 if dim == 2 else order)
+
+    def rhs(t, w):
+        return wave_operator(discr, c=1, w=w)
+
+    rank = comm.Get_rank()
+
+    t = 0
+    t_final = 3
+    istep = 0
+    while t < t_final:
+        fields = rk4_step(fields, t, dt, rhs)
+
+        if istep % 10 == 0:
+            print(istep, t, discr.norm(fields[0]))
+            vis.write_vtk_file("fld-wave-eager-mpi-%03d-%04d.vtu" % (rank, istep),
+                    [
+                        ("u", fields[0]),
+                        ("v", fields[1:]),
+                        ])
+
+        t += dt
+        istep += 1
+
+
+if __name__ == "__main__":
+    main()
+
+# vim: foldmethod=marker
diff --git a/examples/wave/wave-eager-var-velocity.py b/examples/wave/wave-eager-var-velocity.py
new file mode 100644
index 0000000000000000000000000000000000000000..721211314dfd5931374f0347e90435c624baf129
--- /dev/null
+++ b/examples/wave/wave-eager-var-velocity.py
@@ -0,0 +1,211 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+
+from pytools.obj_array import flat_obj_array, make_obj_array
+
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw
+
+from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+
+from grudge.eager import EagerDGDiscretization, interior_trace_pair
+from grudge.shortcuts import make_visualizer
+from grudge.symbolic.primitives import TracePair, QTAG_NONE, DOFDesc
+
+
+# {{{ wave equation bits
+
+def scalar(arg):
+    return make_obj_array([arg])
+
+
+def wave_flux(discr, c, w_tpair):
+    dd = w_tpair.dd
+    dd_quad = dd.with_qtag("vel_prod")
+
+    u = w_tpair[0]
+    v = w_tpair[1:]
+
+    normal = thaw(u.int.array_context, discr.normal(dd))
+
+    flux_weak = flat_obj_array(
+            np.dot(v.avg, normal),
+            normal*scalar(u.avg),
+            )
+
+    # upwind
+    v_jump = np.dot(normal, v.int-v.ext)
+    flux_weak -= flat_obj_array(
+            0.5*(u.int-u.ext),
+            0.5*normal*scalar(v_jump),
+            )
+
+    # FIMXE this flux is only correct for continuous c
+    dd_allfaces_quad = dd_quad.with_dtag("all_faces")
+    c_quad = discr.project("vol", dd_quad, c)
+    flux_quad = discr.project(dd, dd_quad, flux_weak)
+
+    return discr.project(dd_quad, dd_allfaces_quad, scalar(c_quad)*flux_quad)
+
+
+def wave_operator(discr, c, w):
+    u = w[0]
+    v = w[1:]
+
+    dir_u = discr.project("vol", BTAG_ALL, u)
+    dir_v = discr.project("vol", BTAG_ALL, v)
+    dir_bval = flat_obj_array(dir_u, dir_v)
+    dir_bc = flat_obj_array(-dir_u, dir_v)
+
+    dd_quad = DOFDesc("vol", "vel_prod")
+    c_quad = discr.project("vol", dd_quad, c)
+    w_quad = discr.project("vol", dd_quad, w)
+    u_quad = w_quad[0]
+    v_quad = w_quad[1:]
+
+    dd_allfaces_quad = DOFDesc("all_faces", "vel_prod")
+
+    # FIXME Fix sign issue
+    return (
+            discr.inverse_mass(
+                flat_obj_array(
+                    discr.weak_div(dd_quad, scalar(c_quad)*v_quad),
+                    discr.weak_grad(dd_quad, c_quad*u_quad)
+                    )
+                -  # noqa: W504
+                discr.face_mass(
+                    dd_allfaces_quad,
+                    wave_flux(discr, c=c, w_tpair=interior_trace_pair(discr, w))
+                    + wave_flux(discr, c=c, w_tpair=TracePair(
+                        BTAG_ALL, dir_bval, dir_bc))
+                    ))
+                )
+
+# }}}
+
+
+def rk4_step(y, t, h, f):
+    k1 = f(t, y)
+    k2 = f(t+h/2, y + h/2*k1)
+    k3 = f(t+h/2, y + h/2*k2)
+    k4 = f(t+h, y + h*k3)
+    return y + h/6*(k1 + 2*k2 + 2*k3 + k4)
+
+
+def bump(actx, discr, t=0, width=0.05, center=None):
+    if center is None:
+        center = np.array([0.2, 0.35, 0.1])
+
+    center = center[:discr.dim]
+    source_omega = 3
+
+    nodes = thaw(actx, discr.nodes())
+    center_dist = flat_obj_array([
+        nodes[i] - center[i]
+        for i in range(discr.dim)
+        ])
+
+    return (
+        np.cos(source_omega*t)
+        * actx.np.exp(
+            -np.dot(center_dist, center_dist)
+            / width**2))
+
+
+def main():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
+
+    dim = 2
+    nel_1d = 16
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    mesh = generate_regular_rect_mesh(
+            a=(-0.5,)*dim,
+            b=(0.5,)*dim,
+            n=(nel_1d,)*dim)
+
+    order = 3
+
+    if dim == 2:
+        # no deep meaning here, just a fudge factor
+        dt = 0.75/(nel_1d*order**2)
+    elif dim == 3:
+        # no deep meaning here, just a fudge factor
+        dt = 0.45/(nel_1d*order**2)
+    else:
+        raise ValueError("don't have a stable time step guesstimate")
+
+    print("%d elements" % mesh.nelements)
+
+    from meshmode.discretization.poly_element import \
+            QuadratureSimplexGroupFactory, \
+            PolynomialWarpAndBlendGroupFactory
+    discr = EagerDGDiscretization(actx, mesh,
+            quad_tag_to_group_factory={
+                QTAG_NONE: PolynomialWarpAndBlendGroupFactory(order),
+                "vel_prod": QuadratureSimplexGroupFactory(3*order),
+                })
+
+    # bounded above by 1
+    c = 0.2 + 0.8*bump(actx, discr, center=np.zeros(3), width=0.5)
+
+    fields = flat_obj_array(
+            bump(actx, discr, ),
+            [discr.zeros(actx) for i in range(discr.dim)]
+            )
+
+    vis = make_visualizer(discr, order+3 if dim == 2 else order)
+
+    def rhs(t, w):
+        return wave_operator(discr, c=c, w=w)
+
+    t = 0
+    t_final = 3
+    istep = 0
+    while t < t_final:
+        fields = rk4_step(fields, t, dt, rhs)
+
+        if istep % 10 == 0:
+            print(istep, t, discr.norm(fields[0]))
+            vis.write_vtk_file("fld-wave-eager-var-velocity-%04d.vtu" % istep,
+                    [
+                        ("c", c),
+                        ("u", fields[0]),
+                        ("v", fields[1:]),
+                        ])
+
+        t += dt
+        istep += 1
+
+
+if __name__ == "__main__":
+    main()
+
+# vim: foldmethod=marker
diff --git a/examples/wave/wave-eager.py b/examples/wave/wave-eager.py
index f060c364b558273d7ab97d678f021ac6e30eabd6..7450f435798c249f97dd11e03cd59dbd1e95b542 100644
--- a/examples/wave/wave-eager.py
+++ b/examples/wave/wave-eager.py
@@ -26,47 +26,41 @@ THE SOFTWARE.
 import numpy as np
 import numpy.linalg as la  # noqa
 import pyopencl as cl
-import pyopencl.array as cla  # noqa
-import pyopencl.clmath as clmath
-from pytools.obj_array import (
-        join_fields, make_obj_array,
-        with_object_array_or_scalar)
+
+from pytools.obj_array import flat_obj_array, make_obj_array
+
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw
+
 from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
-from grudge.eager import EagerDGDiscretization, with_queue
-from grudge.symbolic.primitives import TracePair
+
+from grudge.eager import EagerDGDiscretization, interior_trace_pair
 from grudge.shortcuts import make_visualizer
+from grudge.symbolic.primitives import TracePair
 
 
-def interior_trace_pair(discr, vec):
-    i = discr.project("vol", "int_faces", vec)
-    e = with_object_array_or_scalar(
-            lambda el: discr.opposite_face_connection()(el.queue, el),
-            i)
-    return TracePair("int_faces", i, e)
+# {{{ wave equation bits
 
+def scalar(arg):
+    return make_obj_array([arg])
 
-# {{{ wave equation bits
 
 def wave_flux(discr, c, w_tpair):
     u = w_tpair[0]
     v = w_tpair[1:]
 
-    normal = with_queue(u.int.queue, discr.normal(w_tpair.dd))
-
-    def normal_times(scalar):
-        # workaround for object array behavior
-        return make_obj_array([ni*scalar for ni in normal])
+    normal = thaw(u.int.array_context, discr.normal(w_tpair.dd))
 
-    flux_weak = join_fields(
+    flux_weak = flat_obj_array(
             np.dot(v.avg, normal),
-            normal_times(u.avg),
+            normal*scalar(u.avg),
             )
 
     # upwind
     v_jump = np.dot(normal, v.int-v.ext)
-    flux_weak -= join_fields(
+    flux_weak -= flat_obj_array(
             0.5*(u.int-u.ext),
-            0.5*normal_times(v_jump),
+            0.5*normal*scalar(v_jump),
             )
 
     return discr.project(w_tpair.dd, "all_faces", c*flux_weak)
@@ -78,12 +72,12 @@ def wave_operator(discr, c, w):
 
     dir_u = discr.project("vol", BTAG_ALL, u)
     dir_v = discr.project("vol", BTAG_ALL, v)
-    dir_bval = join_fields(dir_u, dir_v)
-    dir_bc = join_fields(-dir_u, dir_v)
+    dir_bval = flat_obj_array(dir_u, dir_v)
+    dir_bc = flat_obj_array(-dir_u, dir_v)
 
     return (
             discr.inverse_mass(
-                join_fields(
+                flat_obj_array(
                     c*discr.weak_div(v),
                     c*discr.weak_grad(u)
                     )
@@ -106,20 +100,20 @@ def rk4_step(y, t, h, f):
     return y + h/6*(k1 + 2*k2 + 2*k3 + k4)
 
 
-def bump(discr, queue, t=0):
+def bump(discr, actx, t=0):
     source_center = np.array([0.2, 0.35, 0.1])[:discr.dim]
     source_width = 0.05
     source_omega = 3
 
-    nodes = discr.nodes().with_queue(queue)
-    center_dist = join_fields([
+    nodes = thaw(actx, discr.nodes())
+    center_dist = flat_obj_array([
         nodes[i] - source_center[i]
         for i in range(discr.dim)
         ])
 
     return (
         np.cos(source_omega*t)
-        * clmath.exp(
+        * actx.np.exp(
             -np.dot(center_dist, center_dist)
             / source_width**2))
 
@@ -127,6 +121,7 @@ def bump(discr, queue, t=0):
 def main():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dim = 2
     nel_1d = 16
@@ -149,14 +144,14 @@ def main():
 
     print("%d elements" % mesh.nelements)
 
-    discr = EagerDGDiscretization(cl_ctx, mesh, order=order)
+    discr = EagerDGDiscretization(actx, mesh, order=order)
 
-    fields = join_fields(
-            bump(discr, queue),
-            [discr.zeros(queue) for i in range(discr.dim)]
+    fields = flat_obj_array(
+            bump(discr, actx),
+            [discr.zeros(actx) for i in range(discr.dim)]
             )
 
-    vis = make_visualizer(discr, discr.order+3 if dim == 2 else discr.order)
+    vis = make_visualizer(discr, order+3 if dim == 2 else order)
 
     def rhs(t, w):
         return wave_operator(discr, c=1, w=w)
@@ -168,7 +163,8 @@ def main():
         fields = rk4_step(fields, t, dt, rhs)
 
         if istep % 10 == 0:
-            print(istep, t, la.norm(fields[0].get()))
+            print(f"step: {istep} t: {t} L2: {discr.norm(fields[0])} "
+                    f"sol max: {discr.nodal_max('vol', fields[0])}")
             vis.write_vtk_file("fld-wave-eager-%04d.vtu" % istep,
                     [
                         ("u", fields[0]),
diff --git a/examples/wave/wave-min-mpi.py b/examples/wave/wave-min-mpi.py
index 08705655cf874bf7701bb1c3f998c0d6ea7a6e06..bca2610199ed1cf50ae0c178d28caf1579730e3c 100644
--- a/examples/wave/wave-min-mpi.py
+++ b/examples/wave/wave-min-mpi.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 import numpy as np
 import pyopencl as cl
+from meshmode.array_context import PyOpenCLArrayContext
 from grudge.shortcuts import set_up_rk4
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 from mpi4py import MPI
@@ -35,6 +36,7 @@ from mpi4py import MPI
 def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     comm = MPI.COMM_WORLD
     num_parts = comm.Get_size()
@@ -61,7 +63,7 @@ def main(write_output=True, order=4):
     else:
         local_mesh = mesh_dist.receive_mesh_part()
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, local_mesh, order=order,
+    discr = DGDiscretizationWithBoundaries(actx, local_mesh, order=order,
             mpi_communicator=comm)
 
     if local_mesh.dim == 2:
@@ -90,10 +92,10 @@ def main(write_output=True, order=4):
             radiation_tag=BTAG_ALL,
             flux_type="upwind")
 
-    queue = cl.CommandQueue(discr.cl_context)
-    from pytools.obj_array import join_fields
-    fields = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    from pytools.obj_array import flat_obj_array
+    fields = flat_obj_array(
+            discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     # FIXME
     #dt = op.estimate_rk4_timestep(discr, fields=fields)
@@ -104,7 +106,7 @@ def main(write_output=True, order=4):
     bound_op = bind(discr, op.sym_operator())
 
     def rhs(t, w):
-        return bound_op(queue, t=t, w=w)
+        return bound_op(t=t, w=w)
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
@@ -130,7 +132,7 @@ def main(write_output=True, order=4):
 
             step += 1
 
-            print(step, event.t, norm(queue, u=event.state_component[0]),
+            print(step, event.t, norm(u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
                 vis.write_vtk_file(
diff --git a/examples/wave/wave-min.py b/examples/wave/wave-min.py
index d793e3a09716b883d91c90f1fddfb2f96bacf1a2..de6c9f92e75d449e6a42de80f7b6fa7270df8a15 100644
--- a/examples/wave/wave-min.py
+++ b/examples/wave/wave-min.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 import numpy as np
 import pyopencl as cl
+from meshmode.array_context import PyOpenCLArrayContext
 from grudge.shortcuts import set_up_rk4
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 
@@ -34,6 +35,7 @@ from grudge import sym, bind, DGDiscretizationWithBoundaries
 def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
     from meshmode.mesh.generation import generate_regular_rect_mesh
@@ -49,7 +51,7 @@ def main(write_output=True, order=4):
 
     print("%d elements" % mesh.nelements)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
     source_center = np.array([0.1, 0.22, 0.33])[:mesh.dim]
     source_width = 0.05
@@ -72,10 +74,9 @@ def main(write_output=True, order=4):
             radiation_tag=BTAG_ALL,
             flux_type="upwind")
 
-    queue = cl.CommandQueue(discr.cl_context)
-    from pytools.obj_array import join_fields
-    fields = join_fields(discr.zeros(queue),
-            [discr.zeros(queue) for i in range(discr.dim)])
+    from pytools.obj_array import flat_obj_array
+    fields = flat_obj_array(discr.zeros(actx),
+            [discr.zeros(actx) for i in range(discr.dim)])
 
     # FIXME
     #dt = op.estimate_rk4_timestep(discr, fields=fields)
@@ -86,7 +87,7 @@ def main(write_output=True, order=4):
     bound_op = bind(discr, op.sym_operator())
 
     def rhs(t, w):
-        return bound_op(queue, t=t, w=w)
+        return bound_op(t=t, w=w)
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
@@ -110,7 +111,7 @@ def main(write_output=True, order=4):
 
             step += 1
 
-            print(step, event.t, norm(queue, u=event.state_component[0]),
+            print(step, event.t, norm(u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
                 vis.write_vtk_file("fld-wave-min-%04d.vtu" % step,
diff --git a/grudge/discretization.py b/grudge/discretization.py
index d24fb2c082ddf3585d7de13f58a7a0b371cc4ed0..7c0573c2e5ec3515843b6f8c377ad10301256aaa 100644
--- a/grudge/discretization.py
+++ b/grudge/discretization.py
@@ -25,9 +25,9 @@ THE SOFTWARE.
 
 import six
 from pytools import memoize_method
-import pyopencl as cl
 from grudge import sym
-import numpy as np
+import numpy as np  # noqa: F401
+from meshmode.array_context import ArrayContext
 
 
 # FIXME Naming not ideal
@@ -40,7 +40,6 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
     .. automethod :: discr_from_dd
     .. automethod :: connection_from_dds
 
-    .. autoattribute :: cl_context
     .. autoattribute :: dim
     .. autoattribute :: ambient_dim
     .. autoattribute :: mesh
@@ -49,8 +48,8 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
     .. automethod :: zeros
     """
 
-    def __init__(self, cl_ctx, mesh, order, quad_tag_to_group_factory=None,
-            mpi_communicator=None):
+    def __init__(self, array_context, mesh, order=None,
+            quad_tag_to_group_factory=None, mpi_communicator=None):
         """
         :param quad_tag_to_group_factory: A mapping from quadrature tags (typically
             strings--but may be any hashable/comparable object) to a
@@ -60,15 +59,34 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
             be carried out with the standard volume discretization.
         """
 
+        self._setup_actx = array_context
+
+        from meshmode.discretization.poly_element import \
+                PolynomialWarpAndBlendGroupFactory
+
         if quad_tag_to_group_factory is None:
-            quad_tag_to_group_factory = {}
+            if order is None:
+                raise TypeError("one of 'order' and "
+                        "'quad_tag_to_group_factory' must be given")
+
+            quad_tag_to_group_factory = {
+                    sym.QTAG_NONE: PolynomialWarpAndBlendGroupFactory(order=order)}
+        else:
+            if order is not None:
+                quad_tag_to_group_factory = quad_tag_to_group_factory.copy()
+                if sym.QTAG_NONE in quad_tag_to_group_factory:
+                    raise ValueError("if 'order' is given, "
+                            "'quad_tag_to_group_factory' must not have a "
+                            "key of QTAG_NONE")
+
+                quad_tag_to_group_factory[sym.QTAG_NONE] = \
+                        PolynomialWarpAndBlendGroupFactory(order=order)
 
-        self.order = order
         self.quad_tag_to_group_factory = quad_tag_to_group_factory
 
         from meshmode.discretization import Discretization
 
-        self._volume_discr = Discretization(cl_ctx, mesh,
+        self._volume_discr = Discretization(array_context, mesh,
                 self.group_factory_for_quadrature_tag(sym.QTAG_NONE))
 
         # {{{ management of discretization-scoped common subexpressions
@@ -81,9 +99,9 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
 
         # }}}
 
-        with cl.CommandQueue(cl_ctx) as queue:
-            self._dist_boundary_connections = \
-                    self._set_up_distributed_communication(mpi_communicator, queue)
+        self._dist_boundary_connections = \
+                self._set_up_distributed_communication(
+                        mpi_communicator, array_context)
 
         self.mpi_communicator = mpi_communicator
 
@@ -97,7 +115,7 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
             return self.mpi_communicator.Get_rank() \
                     == self._get_management_rank_index()
 
-    def _set_up_distributed_communication(self, mpi_communicator, queue):
+    def _set_up_distributed_communication(self, mpi_communicator, array_context):
         from_dd = sym.DOFDesc("vol", sym.QTAG_NONE)
 
         from meshmode.distributed import get_connected_partitions
@@ -118,7 +136,8 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
                     from_dd,
                     sym.DOFDesc(sym.BTAG_PARTITION(i_remote_part), sym.QTAG_NONE))
             setup_helper = setup_helpers[i_remote_part] = MPIBoundaryCommSetupHelper(
-                    mpi_communicator, queue, conn, i_remote_part, grp_factory)
+                    mpi_communicator, array_context, conn,
+                    i_remote_part, grp_factory)
             setup_helper.post_sends()
 
         for i_remote_part, setup_helper in six.iteritems(setup_helpers):
@@ -152,7 +171,7 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
 
             from meshmode.discretization import Discretization
             return Discretization(
-                    self._volume_discr.cl_context,
+                    self._setup_actx,
                     no_quad_discr.mesh,
                     self.group_factory_for_quadrature_tag(qtag))
 
@@ -186,6 +205,7 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
                     make_face_to_all_faces_embedding
 
             return make_face_to_all_faces_embedding(
+                    self._setup_actx,
                     faces_conn, self.discr_from_dd(to_dd),
                     self.discr_from_dd(from_dd))
 
@@ -224,6 +244,7 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
                     make_same_mesh_connection
 
             return make_same_mesh_connection(
+                    self._setup_actx,
                     self.discr_from_dd(to_dd),
                     self.discr_from_dd(from_dd))
 
@@ -264,19 +285,13 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
         if quadrature_tag is None:
             quadrature_tag = sym.QTAG_NONE
 
-        from meshmode.discretization.poly_element import \
-                PolynomialWarpAndBlendGroupFactory
-
-        if quadrature_tag is not sym.QTAG_NONE:
-            return self.quad_tag_to_group_factory[quadrature_tag]
-        else:
-            return PolynomialWarpAndBlendGroupFactory(order=self.order)
+        return self.quad_tag_to_group_factory[quadrature_tag]
 
     @memoize_method
     def _quad_volume_discr(self, quadrature_tag):
         from meshmode.discretization import Discretization
 
-        return Discretization(self._volume_discr.cl_context, self._volume_discr.mesh,
+        return Discretization(self._setup_actx, self._volume_discr.mesh,
                 self.group_factory_for_quadrature_tag(quadrature_tag))
 
     # {{{ boundary
@@ -285,9 +300,10 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
     def _boundary_connection(self, boundary_tag):
         from meshmode.discretization.connection import make_face_restriction
         return make_face_restriction(
-                        self._volume_discr,
-                        self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
-                        boundary_tag=boundary_tag)
+                self._setup_actx,
+                self._volume_discr,
+                self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
+                boundary_tag=boundary_tag)
 
     # }}}
 
@@ -298,21 +314,24 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
         from meshmode.discretization.connection import (
                 make_face_restriction, FACE_RESTR_INTERIOR)
         return make_face_restriction(
-                        self._volume_discr,
-                        self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
-                        FACE_RESTR_INTERIOR,
+                self._setup_actx,
+                self._volume_discr,
+                self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
+                FACE_RESTR_INTERIOR,
 
-                        # FIXME: This will need to change as soon as we support
-                        # pyramids or other elements with non-identical face
-                        # types.
-                        per_face_groups=False)
+                # FIXME: This will need to change as soon as we support
+                # pyramids or other elements with non-identical face
+                # types.
+                per_face_groups=False)
 
     @memoize_method
     def opposite_face_connection(self):
         from meshmode.discretization.connection import \
                 make_opposite_face_connection
 
-        return make_opposite_face_connection(self._interior_faces_connection())
+        return make_opposite_face_connection(
+                self._setup_actx,
+                self._interior_faces_connection())
 
     # }}}
 
@@ -323,21 +342,18 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
         from meshmode.discretization.connection import (
                 make_face_restriction, FACE_RESTR_ALL)
         return make_face_restriction(
-                        self._volume_discr,
-                        self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
-                        FACE_RESTR_ALL,
+                self._setup_actx,
+                self._volume_discr,
+                self.group_factory_for_quadrature_tag(sym.QTAG_NONE),
+                FACE_RESTR_ALL,
 
-                        # FIXME: This will need to change as soon as we support
-                        # pyramids or other elements with non-identical face
-                        # types.
-                        per_face_groups=False)
+                # FIXME: This will need to change as soon as we support
+                # pyramids or other elements with non-identical face
+                # types.
+                per_face_groups=False)
 
     # }}}
 
-    @property
-    def cl_context(self):
-        return self._volume_discr.cl_context
-
     @property
     def dim(self):
         return self._volume_discr.dim
@@ -358,13 +374,11 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
     def mesh(self):
         return self._volume_discr.mesh
 
-    def empty(self, queue=None, dtype=None, extra_dims=None, allocator=None):
-        return self._volume_discr.empty(queue, dtype, extra_dims=extra_dims,
-                allocator=allocator)
+    def empty(self, array_context: ArrayContext, dtype=None):
+        return self._volume_discr.empty(array_context, dtype)
 
-    def zeros(self, queue, dtype=None, extra_dims=None, allocator=None):
-        return self._volume_discr.zeros(queue, dtype, extra_dims=extra_dims,
-                allocator=allocator)
+    def zeros(self, array_context: ArrayContext, dtype=None):
+        return self._volume_discr.zeros(array_context, dtype)
 
     def is_volume_where(self, where):
         from grudge import sym
@@ -372,55 +386,16 @@ class DGDiscretizationWithBoundaries(DiscretizationBase):
                 where is None
                 or where == sym.VTAG_ALL)
 
-
-class PointsDiscretization(DiscretizationBase):
-    """Implements just enough of the discretization interface to be
-    able to smuggle some points into :func:`bind`.
-    """
-
-    def __init__(self, nodes):
-        self._nodes = nodes
-        self.real_dtype = np.dtype(np.float64)
-        self.complex_dtype = np.dtype({
-                np.float32: np.complex64,
-                np.float64: np.complex128
-        }[self.real_dtype.type])
-
-        self.mpi_communicator = None
-
-    def ambient_dim(self):
-        return self._nodes.shape[0]
-
-    @property
-    def mesh(self):
-        return self
-
-    @property
-    def nnodes(self):
-        return self._nodes.shape[-1]
-
-    def nodes(self):
-        return self._nodes
-
-    @property
-    def facial_adjacency_groups(self):
-        return []
-
-    def discr_from_dd(self, dd):
-        dd = sym.as_dofdesc(dd)
-
-        if dd.quadrature_tag is not sym.QTAG_NONE:
-            raise ValueError("quadrature discretization requested from "
-                    "PointsDiscretization")
-        if dd.domain_tag is not sym.DTAG_VOLUME_ALL:
-            raise ValueError("non-volume discretization requested from "
-                    "PointsDiscretization")
-
-        return self
-
     @property
-    def quad_tag_to_group_factory(self):
-        return {}
+    def order(self):
+        from warnings import warn
+        warn("DGDiscretizationWithBoundaries.order is deprecated, "
+                "consider the orders of element groups instead. "
+                "'order' will go away in 2021.",
+                DeprecationWarning, stacklevel=2)
+
+        from pytools import single_valued
+        return single_valued(egrp.order for egrp in self._volume_discr.groups)
 
 
 # vim: foldmethod=marker
diff --git a/grudge/eager.py b/grudge/eager.py
index 43d73a4db5ad6eef406bc75e66f4d1bc39b74bfb..71a01873001d56883d49754bd4264f18c3c20935 100644
--- a/grudge/eager.py
+++ b/grudge/eager.py
@@ -24,15 +24,16 @@ THE SOFTWARE.
 
 
 import numpy as np  # noqa
-from grudge.discretization import DGDiscretizationWithBoundaries
 from pytools import memoize_method
-from pytools.obj_array import (
-        with_object_array_or_scalar,
-        is_obj_array)
-import pyopencl as cl
+from pytools.obj_array import obj_array_vectorize, make_obj_array
 import pyopencl.array as cla  # noqa
 from grudge import sym, bind
-from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+
+from meshmode.mesh import BTAG_ALL, BTAG_NONE, BTAG_PARTITION  # noqa
+from meshmode.dof_array import freeze, DOFArray, flatten, unflatten
+
+from grudge.discretization import DGDiscretizationWithBoundaries
+from grudge.symbolic.primitives import TracePair
 
 
 __doc__ = """
@@ -40,15 +41,6 @@ __doc__ = """
 """
 
 
-def with_queue(queue, ary):
-    return with_object_array_or_scalar(
-            lambda x: x.with_queue(queue), ary)
-
-
-def without_queue(ary):
-    return with_queue(None, ary)
-
-
 class EagerDGDiscretization(DGDiscretizationWithBoundaries):
     """
     .. automethod:: project
@@ -70,66 +62,215 @@ class EagerDGDiscretization(DGDiscretizationWithBoundaries):
         return self.project(src, tgt, vec)
 
     def project(self, src, tgt, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
                     lambda el: self.project(src, tgt, el), vec)
 
-        return self.connection_from_dds(src, tgt)(vec.queue, vec)
+        return self.connection_from_dds(src, tgt)(vec)
 
     def nodes(self):
         return self._volume_discr.nodes()
 
     @memoize_method
     def _bound_grad(self):
-        return bind(self, sym.nabla(self.dim) * sym.Variable("u"))
+        return bind(self, sym.nabla(self.dim) * sym.Variable("u"), local_only=True)
 
     def grad(self, vec):
-        return self._bound_grad()(vec.queue, u=vec)
+        return self._bound_grad()(u=vec)
 
     def div(self, vecs):
         return sum(
                 self.grad(vec_i)[i] for i, vec_i in enumerate(vecs))
 
     @memoize_method
-    def _bound_weak_grad(self):
-        return bind(self, sym.stiffness_t(self.dim) * sym.Variable("u"))
-
-    def weak_grad(self, vec):
-        return self._bound_weak_grad()(vec.queue, u=vec)
+    def _bound_weak_grad(self, dd):
+        return bind(self,
+                sym.stiffness_t(self.dim, dd_in=dd) * sym.Variable("u", dd),
+                local_only=True)
+
+    def weak_grad(self, *args):
+        if len(args) == 1:
+            vec, = args
+            dd = sym.DOFDesc("vol", sym.QTAG_NONE)
+        elif len(args) == 2:
+            dd, vec = args
+        else:
+            raise TypeError("invalid number of arguments")
+
+        return self._bound_weak_grad(dd)(u=vec)
+
+    def weak_div(self, *args):
+        if len(args) == 1:
+            vecs, = args
+            dd = sym.DOFDesc("vol", sym.QTAG_NONE)
+        elif len(args) == 2:
+            dd, vecs = args
+        else:
+            raise TypeError("invalid number of arguments")
 
-    def weak_div(self, vecs):
         return sum(
-                self.weak_grad(vec_i)[i] for i, vec_i in enumerate(vecs))
+                self.weak_grad(dd, vec_i)[i] for i, vec_i in enumerate(vecs))
 
     @memoize_method
     def normal(self, dd):
-        with cl.CommandQueue(self.cl_context) as queue:
-            surface_discr = self.discr_from_dd(dd)
-            return without_queue(
-                    bind(self, sym.normal(
-                        dd, surface_discr.ambient_dim, surface_discr.dim))(queue))
+        surface_discr = self.discr_from_dd(dd)
+        actx = surface_discr._setup_actx
+        return freeze(
+                bind(self,
+                    sym.normal(dd, surface_discr.ambient_dim, surface_discr.dim),
+                    local_only=True)
+                (array_context=actx))
 
     @memoize_method
     def _bound_inverse_mass(self):
-        return bind(self, sym.InverseMassOperator()(sym.Variable("u")))
+        return bind(self, sym.InverseMassOperator()(sym.Variable("u")),
+                local_only=True)
 
     def inverse_mass(self, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
                     lambda el: self.inverse_mass(el), vec)
 
-        return self._bound_inverse_mass()(vec.queue, u=vec)
+        return self._bound_inverse_mass()(u=vec)
+
+    @memoize_method
+    def _bound_face_mass(self, dd):
+        u = sym.Variable("u", dd=dd)
+        return bind(self, sym.FaceMassOperator(dd_in=dd)(u), local_only=True)
+
+    def face_mass(self, *args):
+        if len(args) == 1:
+            vec, = args
+            dd = sym.DOFDesc("all_faces", sym.QTAG_NONE)
+        elif len(args) == 2:
+            dd, vec = args
+        else:
+            raise TypeError("invalid number of arguments")
+
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
+                    lambda el: self.face_mass(dd, el), vec)
+
+        return self._bound_face_mass(dd)(u=vec)
 
     @memoize_method
-    def _bound_face_mass(self):
-        u = sym.Variable("u", dd=sym.as_dofdesc("all_faces"))
-        return bind(self, sym.FaceMassOperator()(u))
+    def _norm(self, p):
+        return bind(self, sym.norm(p, sym.var("arg")), local_only=True)
+
+    def norm(self, vec, p=2):
+        return self._norm(p)(arg=vec)
+
+    @memoize_method
+    def _nodal_reduction(self, operator, dd):
+        return bind(self, operator(dd)(sym.var("arg")), local_only=True)
+
+    def nodal_sum(self, dd, vec):
+        return self._nodal_reduction(sym.NodalSum, dd)(arg=vec)
+
+    def nodal_min(self, dd, vec):
+        return self._nodal_reduction(sym.NodalMin, dd)(arg=vec)
+
+    def nodal_max(self, dd, vec):
+        return self._nodal_reduction(sym.NodalMax, dd)(arg=vec)
+
+    @memoize_method
+    def connected_ranks(self):
+        from meshmode.distributed import get_connected_partitions
+        return get_connected_partitions(self._volume_discr.mesh)
+
+
+def interior_trace_pair(discrwb, vec):
+    i = discrwb.project("vol", "int_faces", vec)
+
+    if (isinstance(vec, np.ndarray)
+            and vec.dtype.char == "O"
+            and not isinstance(vec, DOFArray)):
+        e = obj_array_vectorize(
+                lambda el: discrwb.opposite_face_connection()(el),
+                i)
+
+    return TracePair("int_faces", i, e)
+
+
+class RankBoundaryCommunication:
+    base_tag = 1273
+
+    def __init__(self, discrwb, remote_rank, vol_field, tag=None):
+        self.tag = self.base_tag
+        if tag is not None:
+            self.tag += tag
+
+        self.discrwb = discrwb
+        self.array_context = vol_field.array_context
+        self.remote_btag = BTAG_PARTITION(remote_rank)
+
+        self.bdry_discr = discrwb.discr_from_dd(self.remote_btag)
+        self.local_dof_array = discrwb.project("vol", self.remote_btag, vol_field)
+
+        local_data = self.array_context.to_numpy(flatten(self.local_dof_array))
+
+        comm = self.discrwb.mpi_communicator
+
+        self.send_req = comm.Isend(
+                local_data, remote_rank, tag=self.tag)
+
+        self.remote_data_host = np.empty_like(local_data)
+        self.recv_req = comm.Irecv(self.remote_data_host, remote_rank, self.tag)
+
+    def finish(self):
+        self.recv_req.Wait()
+
+        actx = self.array_context
+        remote_dof_array = unflatten(self.array_context, self.bdry_discr,
+                actx.from_numpy(self.remote_data_host))
+
+        bdry_conn = self.discrwb.get_distributed_boundary_swap_connection(
+                sym.as_dofdesc(sym.DTAG_BOUNDARY(self.remote_btag)))
+        swapped_remote_dof_array = bdry_conn(remote_dof_array)
+
+        self.send_req.Wait()
+
+        return TracePair(self.remote_btag, self.local_dof_array,
+                swapped_remote_dof_array)
+
+
+def _cross_rank_trace_pairs_scalar_field(discrwb, vec, tag=None):
+    rbcomms = [RankBoundaryCommunication(discrwb, remote_rank, vec, tag=tag)
+            for remote_rank in discrwb.connected_ranks()]
+    return [rbcomm.finish() for rbcomm in rbcomms]
+
+
+def cross_rank_trace_pairs(discrwb, vec, tag=None):
+    if (isinstance(vec, np.ndarray)
+            and vec.dtype.char == "O"
+            and not isinstance(vec, DOFArray)):
+
+        n, = vec.shape
+        result = {}
+        for ivec in range(n):
+            for rank_tpair in _cross_rank_trace_pairs_scalar_field(
+                    discrwb, vec[ivec]):
+                assert isinstance(rank_tpair.dd.domain_tag, sym.DTAG_BOUNDARY)
+                assert isinstance(rank_tpair.dd.domain_tag.tag, BTAG_PARTITION)
+                result[rank_tpair.dd.domain_tag.tag.part_nr, ivec] = rank_tpair
 
-    def face_mass(self, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
-                    lambda el: self.face_mass(el), vec)
+        return [
+            TracePair(
+                dd=sym.as_dofdesc(sym.DTAG_BOUNDARY(BTAG_PARTITION(remote_rank))),
+                interior=make_obj_array([
+                    result[remote_rank, i].int for i in range(n)]),
+                exterior=make_obj_array([
+                    result[remote_rank, i].ext for i in range(n)])
+                )
+            for remote_rank in discrwb.connected_ranks()]
+    else:
+        return _cross_rank_trace_pairs_scalar_field(discrwb, vec, tag=tag)
 
-        return self._bound_face_mass()(vec.queue, u=vec)
 
 # vim: foldmethod=marker
diff --git a/grudge/execution.py b/grudge/execution.py
index 7c3fbb07846880b64f01cab250379a2afb8e33db..1c25739d1964863fbda151851607aab6a746f09b 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -22,12 +22,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
+from typing import Optional, Union, Dict
+from numbers import Number
 import numpy as np
+
+from pytools import memoize_in
+from pytools.obj_array import make_obj_array
+
 import loopy as lp
 import pyopencl as cl
 import pyopencl.array  # noqa
-from pytools import memoize_in
+
+from meshmode.dof_array import DOFArray, thaw, flatten, unflatten
+from meshmode.array_context import ArrayContext, make_loopy_program
 
 import grudge.symbolic.mappers as mappers
 from grudge import sym
@@ -42,17 +49,20 @@ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa: F401
 MPI_TAG_SEND_TAGS = 1729
 
 
+ResultType = Union[DOFArray, Number]
+
+
 # {{{ exec mapper
 
 class ExecutionMapper(mappers.Evaluator,
         mappers.BoundOpMapperMixin,
         mappers.LocalOpReducerMixin):
-    def __init__(self, queue, context, bound_op):
+    def __init__(self, array_context, context, bound_op):
         super(ExecutionMapper, self).__init__(context)
         self.discrwb = bound_op.discrwb
         self.bound_op = bound_op
         self.function_registry = bound_op.function_registry
-        self.queue = queue
+        self.array_context = array_context
 
     # {{{ expression mappings
 
@@ -62,13 +72,14 @@ class ExecutionMapper(mappers.Evaluator,
 
         discr = self.discrwb.discr_from_dd(expr.dd)
 
-        result = discr.empty(self.queue, allocator=self.bound_op.allocator)
-        result.fill(1)
+        result = discr.empty(self.array_context)
+        for grp_ary in result:
+            grp_ary.fill(1)
         return result
 
     def map_node_coordinate_component(self, expr):
         discr = self.discrwb.discr_from_dd(expr.dd)
-        return discr.nodes()[expr.axis].with_queue(self.queue)
+        return thaw(self.array_context, discr.nodes()[expr.axis])
 
     def map_grudge_variable(self, expr):
         from numbers import Number
@@ -76,8 +87,9 @@ class ExecutionMapper(mappers.Evaluator,
         value = self.context[expr.name]
         if not expr.dd.is_scalar() and isinstance(value, Number):
             discr = self.discrwb.discr_from_dd(expr.dd)
-            ary = discr.empty(self.queue)
-            ary.fill(value)
+            ary = discr.empty(self.array_context)
+            for grp_ary in ary:
+                grp_ary.fill(value)
             value = ary
 
         return value
@@ -91,42 +103,42 @@ class ExecutionMapper(mappers.Evaluator,
             from numbers import Number
             if not dd.is_scalar() and isinstance(value, Number):
                 discr = self.discrwb.discr_from_dd(dd)
-                ary = discr.empty(self.queue)
-                ary.fill(value)
+                ary = discr.empty(self.array_context)
+                for grp_ary in ary:
+                    grp_ary.fill(value)
                 value = ary
         return value
 
     def map_call(self, expr):
         args = [self.rec(p) for p in expr.parameters]
-        return self.function_registry[expr.function.name](self.queue, *args)
+        return self.function_registry[expr.function.name](self.array_context, *args)
 
     # }}}
 
     # {{{ elementwise reductions
 
     def _map_elementwise_reduction(self, op_name, field_expr, dd):
-        @memoize_in(self, "elementwise_%s_knl" % op_name)
-        def knl():
-            knl = lp.make_kernel(
-                "{[el, idof, jdof]: 0<=el<nelements and 0<=idof, jdof<ndofs}",
+        @memoize_in(self.array_context,
+                (ExecutionMapper, "elementwise_%s_prg" % op_name))
+        def prg():
+            return make_loopy_program(
+                "{[iel, idof, jdof]: 0<=iel<nelements and 0<=idof, jdof<ndofs}",
                 """
-                result[el, idof] = %s(jdof, operand[el, jdof])
+                result[iel, idof] = %s(jdof, operand[iel, jdof])
                 """ % op_name,
-                default_offset=lp.auto,
-                name="elementwise_%s_knl" % op_name)
-
-            return lp.tag_inames(knl, "el:g.0,idof:l.0")
+                name="grudge_elementwise_%s" % op_name)
 
         field = self.rec(field_expr)
         discr = self.discrwb.discr_from_dd(dd)
-        assert field.shape == (discr.nnodes,)
+        assert field.shape == (len(discr.groups),)
 
-        result = discr.empty(queue=self.queue, dtype=field.dtype,
-                allocator=self.bound_op.allocator)
+        result = discr.empty(self.array_context, dtype=field.entry_dtype)
         for grp in discr.groups:
-            knl()(self.queue,
-                    operand=grp.view(field),
-                    result=grp.view(result))
+            assert field[grp.index].shape == (grp.nelements, grp.nunit_dofs)
+            self.array_context.call_loopy(
+                    prg(),
+                    operand=field[grp.index],
+                    result=result[grp.index])
 
         return result
 
@@ -145,57 +157,101 @@ class ExecutionMapper(mappers.Evaluator,
 
     def map_nodal_sum(self, op, field_expr):
         # FIXME: Could allow array scalars
-        return cl.array.sum(self.rec(field_expr)).get()[()]
+        # FIXME: Fix CL-specific-ness
+        return sum([
+                cl.array.sum(grp_ary).get()[()]
+                for grp_ary in self.rec(field_expr)
+                ])
 
     def map_nodal_max(self, op, field_expr):
         # FIXME: Could allow array scalars
-        return cl.array.max(self.rec(field_expr)).get()[()]
+        # FIXME: Fix CL-specific-ness
+        return np.max([
+            cl.array.max(grp_ary).get()[()]
+            for grp_ary in self.rec(field_expr)])
 
     def map_nodal_min(self, op, field_expr):
         # FIXME: Could allow array scalars
-        return cl.array.min(self.rec(field_expr)).get()[()]
+        # FIXME: Fix CL-specific-ness
+        return np.min([
+            cl.array.min(grp_ary).get()[()]
+            for grp_ary in self.rec(field_expr)])
 
     # }}}
 
     def map_if(self, expr):
         bool_crit = self.rec(expr.condition)
 
+        if isinstance(bool_crit,  DOFArray):
+            # continues below
+            pass
+        elif isinstance(bool_crit,  np.number):
+            if bool_crit:
+                return self.rec(expr.then)
+            else:
+                return self.rec(expr.else_)
+        else:
+            raise TypeError(
+                "Expected criterion to be of type np.number or DOFArray")
+
+        assert isinstance(bool_crit,  DOFArray)
+        ngroups = len(bool_crit)
+
+        from pymbolic import var
+        iel = var("iel")
+        idof = var("idof")
+        subscript = (iel, idof)
+
         then = self.rec(expr.then)
         else_ = self.rec(expr.else_)
 
         import pymbolic.primitives as p
         var = p.Variable
 
-        i = var("i")
-        if isinstance(then,  pyopencl.array.Array):
-            sym_then = var("a")[i]
-        elif isinstance(then,  np.number):
+        if isinstance(then,  DOFArray):
+            sym_then = var("a")[subscript]
+
+            def get_then(igrp):
+                return then[igrp]
+        elif isinstance(then, np.number):
             sym_then = var("a")
+
+            def get_then(igrp):
+                return then
         else:
             raise TypeError(
-                "Expected parameter to be of type np.number or pyopencl.array.Array")
+                "Expected 'then' to be of type np.number or DOFArray")
+
+        if isinstance(else_,  DOFArray):
+            sym_else = var("b")[subscript]
 
-        if isinstance(else_,  pyopencl.array.Array):
-            sym_else = var("b")[i]
+            def get_else(igrp):
+                return else_[igrp]
         elif isinstance(else_,  np.number):
             sym_else = var("b")
+
+            def get_else(igrp):
+                return else_
         else:
             raise TypeError(
-                "Expected parameter to be of type np.number or pyopencl.array.Array")
+                "Expected 'else' to be of type np.number or DOFArray")
 
-        @memoize_in(self.bound_op, "map_if_knl")
-        def knl():
-            knl = lp.make_kernel(
-                "{[i]: 0<=i<n}",
+        @memoize_in(self.array_context, (ExecutionMapper, "map_if_knl"))
+        def knl(sym_then, sym_else):
+            return make_loopy_program(
+                "{[iel, idof]: 0<=iel<nelements and 0<=idof<nunit_dofs}",
                 [
-                    lp.Assignment(var("out")[i],
-                        p.If(var("crit")[i], sym_then, sym_else))
+                    lp.Assignment(var("out")[iel, idof],
+                        p.If(var("crit")[iel, idof], sym_then, sym_else))
                 ])
-            return lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-        evt, (out,) = knl()(self.queue, crit=bool_crit, a=then, b=else_)
 
-        return out
+        return DOFArray.from_list(self.array_context, [
+            self.array_context.call_loopy(
+                knl(sym_then, sym_else),
+                crit=bool_crit[igrp],
+                a=get_then(igrp),
+                b=get_else(igrp))
+            for igrp in range(ngroups)])
 
     # {{{ elementwise linear operators
 
@@ -210,26 +266,23 @@ class ExecutionMapper(mappers.Evaluator,
         if is_zero(field):
             return 0
 
-        @memoize_in(self.bound_op, "elwise_linear_knl")
-        def knl():
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i<ndiscr_nodes_out and
+        @memoize_in(self.array_context, (ExecutionMapper, "elwise_linear_knl"))
+        def prg():
+            result = make_loopy_program(
+                """{[iel, idof, j]:
+                    0<=iel<nelements and
+                    0<=idof<ndiscr_nodes_out and
                     0<=j<ndiscr_nodes_in}""",
-                "result[k,i] = sum(j, mat[i, j] * vec[k, j])",
-                default_offset=lp.auto, name="diff")
+                "result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])",
+                name="diff")
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            knl = lp.tag_array_axes(knl, "mat", "stride:auto,stride:auto")
-            return lp.tag_inames(knl, dict(k="g.0"))
+            result = lp.tag_array_axes(result, "mat", "stride:auto,stride:auto")
+            return result
 
         in_discr = self.discrwb.discr_from_dd(op.dd_in)
         out_discr = self.discrwb.discr_from_dd(op.dd_out)
 
-        result = out_discr.empty(
-                queue=self.queue,
-                dtype=field.dtype, allocator=self.bound_op.allocator)
+        result = out_discr.empty(self.array_context, dtype=field.entry_dtype)
 
         for in_grp, out_grp in zip(in_discr.groups, out_discr.groups):
 
@@ -237,32 +290,34 @@ class ExecutionMapper(mappers.Evaluator,
             try:
                 matrix = self.bound_op.operator_data_cache[cache_key]
             except KeyError:
-                matrix = (
-                    cl.array.to_device(
-                        self.queue,
-                        np.asarray(op.matrix(out_grp, in_grp), dtype=field.dtype))
-                    .with_queue(None))
+                matrix = self.array_context.freeze(
+                        self.array_context.from_numpy(
+                            np.asarray(
+                                op.matrix(out_grp, in_grp),
+                                dtype=field.entry_dtype)))
 
                 self.bound_op.operator_data_cache[cache_key] = matrix
 
-            knl()(self.queue, mat=matrix, result=out_grp.view(result),
-                    vec=in_grp.view(field))
+            self.array_context.call_loopy(
+                    prg(),
+                    mat=matrix,
+                    result=result[out_grp.index],
+                    vec=field[in_grp.index])
 
         return result
 
     def map_projection(self, op, field_expr):
         conn = self.discrwb.connection_from_dds(op.dd_in, op.dd_out)
-        return conn(self.queue, self.rec(field_expr)).with_queue(self.queue)
+        return conn(self.rec(field_expr))
 
     def map_opposite_partition_face_swap(self, op, field_expr):
         assert op.dd_in == op.dd_out
         bdry_conn = self.discrwb.get_distributed_boundary_swap_connection(op.dd_in)
         remote_bdry_vec = self.rec(field_expr)  # swapped by RankDataSwapAssign
-        return bdry_conn(self.queue, remote_bdry_vec).with_queue(self.queue)
+        return bdry_conn(remote_bdry_vec)
 
     def map_opposite_interior_face_swap(self, op, field_expr):
-        return self.discrwb.opposite_face_connection()(
-                self.queue, self.rec(field_expr)).with_queue(self.queue)
+        return self.discrwb.opposite_face_connection()(self.rec(field_expr))
 
     # }}}
 
@@ -275,27 +330,24 @@ class ExecutionMapper(mappers.Evaluator,
         if is_zero(field):
             return 0
 
-        @memoize_in(self.bound_op, "face_mass_knl")
-        def knl():
-            knl = lp.make_kernel(
-                """{[k,i,f,j]:
-                    0<=k<nelements and
+        @memoize_in(self.array_context, (ExecutionMapper, "face_mass_knl"))
+        def prg():
+            return make_loopy_program(
+                """{[iel,idof,f,j]:
+                    0<=iel<nelements and
                     0<=f<nfaces and
-                    0<=i<nvol_nodes and
+                    0<=idof<nvol_nodes and
                     0<=j<nface_nodes}""",
-                "result[k,i] = sum(f, sum(j, mat[i, f, j] * vec[f, k, j]))",
-                default_offset=lp.auto, name="face_mass")
-
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+                """
+                result[iel,idof] = sum(f, sum(j, mat[idof, f, j] * vec[f, iel, j]))
+                """,
+                name="face_mass")
 
         all_faces_conn = self.discrwb.connection_from_dds("vol", op.dd_in)
         all_faces_discr = all_faces_conn.to_discr
         vol_discr = all_faces_conn.from_discr
 
-        result = vol_discr.empty(
-                queue=self.queue,
-                dtype=field.dtype, allocator=self.bound_op.allocator)
+        result = vol_discr.empty(self.array_context, dtype=field.entry_dtype)
 
         assert len(all_faces_discr.groups) == len(vol_discr.groups)
 
@@ -307,16 +359,18 @@ class ExecutionMapper(mappers.Evaluator,
             try:
                 matrix = self.bound_op.operator_data_cache[cache_key]
             except KeyError:
-                matrix = op.matrix(afgrp, volgrp, field.dtype)
-                matrix = (
-                        cl.array.to_device(self.queue, matrix)
-                        .with_queue(None))
+                matrix = op.matrix(afgrp, volgrp, field.entry_dtype)
+                matrix = self.array_context.freeze(
+                        self.array_context.from_numpy(matrix))
 
                 self.bound_op.operator_data_cache[cache_key] = matrix
 
-            input_view = afgrp.view(field).reshape(
-                    nfaces, volgrp.nelements, afgrp.nunit_nodes)
-            knl()(self.queue, mat=matrix, result=volgrp.view(result),
+            input_view = field[afgrp.index].reshape(
+                    nfaces, volgrp.nelements, afgrp.nunit_dofs)
+            self.array_context.call_loopy(
+                    prg(),
+                    mat=matrix,
+                    result=result[volgrp.index],
                     vec=input_view)
 
         return result
@@ -332,15 +386,16 @@ class ExecutionMapper(mappers.Evaluator,
                 sym.DD_VOLUME,
                 sym.DOFDesc(expr.dd.domain_tag))
 
-        field = face_discr.empty(self.queue,
-                dtype=self.discrwb.real_dtype,
-                allocator=self.bound_op.allocator)
-        field.fill(1)
+        field = face_discr.empty(self.array_context, dtype=self.discrwb.real_dtype)
+        for grp_ary in field:
+            grp_ary.fill(1)
 
-        for grp in all_faces_conn.groups:
+        for igrp, grp in enumerate(all_faces_conn.groups):
             for batch in grp.batches:
-                i = batch.to_element_indices.with_queue(self.queue)
-                field[i] = (2.0 * (batch.to_element_face % 2) - 1.0) * field[i]
+                i = self.array_context.thaw(batch.to_element_indices)
+                grp_field = field[igrp].reshape(-1)
+                grp_field[i] = \
+                        (2.0 * (batch.to_element_face % 2) - 1.0) * grp_field[i]
 
         return field
 
@@ -349,7 +404,7 @@ class ExecutionMapper(mappers.Evaluator,
     # {{{ instruction execution functions
 
     def map_insn_rank_data_swap(self, insn, profile_data=None):
-        local_data = self.rec(insn.field).get(self.queue)
+        local_data = self.array_context.to_numpy(flatten(self.rec(insn.field)))
         comm = self.discrwb.mpi_communicator
 
         # print("Sending data to rank %d with tag %d"
@@ -360,31 +415,60 @@ class ExecutionMapper(mappers.Evaluator,
         recv_req = comm.Irecv(remote_data_host, insn.i_remote_rank, insn.recv_tag)
 
         return [], [
-                MPIRecvFuture(recv_req, insn.name, remote_data_host, self.queue),
+                MPIRecvFuture(
+                    array_context=self.array_context,
+                    bdry_discr=self.discrwb.discr_from_dd(insn.dd_out),
+                    recv_req=recv_req,
+                    insn_name=insn.name,
+                    remote_data_host=remote_data_host),
                 MPISendFuture(send_req)]
 
     def map_insn_loopy_kernel(self, insn, profile_data=None):
-        kwargs = {}
         kdescr = insn.kernel_descriptor
-        for name, expr in six.iteritems(kdescr.input_mappings):
-            kwargs[name] = self.rec(expr)
-
         discr = self.discrwb.discr_from_dd(kdescr.governing_dd)
+
+        dof_array_kwargs = {}
+        other_kwargs = {}
+
+        for name, expr in kdescr.input_mappings.items():
+            v = self.rec(expr)
+            if isinstance(v, DOFArray):
+                dof_array_kwargs[name] = v
+            else:
+                other_kwargs[name] = v
+
         for name in kdescr.scalar_args():
-            v = kwargs[name]
+            v = other_kwargs[name]
             if isinstance(v, (int, float)):
-                kwargs[name] = discr.real_dtype.type(v)
+                other_kwargs[name] = discr.real_dtype.type(v)
             elif isinstance(v, complex):
-                kwargs[name] = discr.complex_dtype.type(v)
+                other_kwargs[name] = discr.complex_dtype.type(v)
             elif isinstance(v, np.number):
                 pass
             else:
                 raise ValueError("unrecognized scalar type for variable '%s': %s"
                         % (name, type(v)))
 
-        kwargs["grdg_n"] = discr.nnodes
-        evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
-        return list(result_dict.items()), []
+        result = {}
+        for grp in discr.groups:
+            kwargs = other_kwargs.copy()
+            kwargs["nelements"] = grp.nelements
+            kwargs["nunit_dofs"] = grp.nunit_dofs
+
+            for name, ary in dof_array_kwargs.items():
+                kwargs[name] = ary[grp.index]
+
+            knl_result = self.array_context.call_loopy(
+                    kdescr.loopy_kernel, **kwargs)
+
+            for name, val in knl_result.items():
+                result.setdefault(name, []).append(val)
+
+        result = {
+                name: DOFArray.from_list(self.array_context, val)
+                for name, val in result.items()}
+
+        return list(result.items()), []
 
     def map_insn_assign(self, insn, profile_data=None):
         return [(name, self.rec(expr))
@@ -413,35 +497,36 @@ class ExecutionMapper(mappers.Evaluator,
 
         assert repr_op.dd_in.domain_tag == repr_op.dd_out.domain_tag
 
-        @memoize_in(self.discrwb, "reference_derivative_knl")
-        def knl():
-            knl = lp.make_kernel(
-                """{[imatrix,k,i,j]:
+        @memoize_in(self.array_context,
+                (ExecutionMapper, "reference_derivative_prg"))
+        def prg(nmatrices):
+            result = make_loopy_program(
+                """{[imatrix, iel, idof, j]:
                     0<=imatrix<nmatrices and
-                    0<=k<nelements and
-                    0<=i<nunit_nodes_out and
+                    0<=iel<nelements and
+                    0<=idof<nunit_nodes_out and
                     0<=j<nunit_nodes_in}""",
                 """
-                result[imatrix, k, i] = sum(
-                        j, diff_mat[imatrix, i, j] * vec[k, j])
+                result[imatrix, iel, idof] = sum(
+                        j, diff_mat[imatrix, idof, j] * vec[iel, j])
                 """,
-                default_offset=lp.auto, name="diff")
+                name="diff")
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+            result = lp.fix_parameters(result, nmatrices=nmatrices)
+            result = lp.tag_inames(result, "imatrix: unr")
+            result = lp.tag_array_axes(result, "result", "sep,c,c")
+            return result
 
         noperators = len(insn.operators)
 
         in_discr = self.discrwb.discr_from_dd(repr_op.dd_in)
         out_discr = self.discrwb.discr_from_dd(repr_op.dd_out)
 
-        result = out_discr.empty(
-                queue=self.queue,
-                dtype=field.dtype, extra_dims=(noperators,),
-                allocator=self.bound_op.allocator)
+        result = make_obj_array([
+            out_discr.empty(self.array_context, dtype=field.entry_dtype)
+            for idim in range(noperators)])
 
         for in_grp, out_grp in zip(in_discr.groups, out_discr.groups):
-
             if in_grp.nelements == 0:
                 continue
 
@@ -449,13 +534,18 @@ class ExecutionMapper(mappers.Evaluator,
 
             # FIXME: Should transfer matrices to device and cache them
             matrices_ary = np.empty((
-                noperators, out_grp.nunit_nodes, in_grp.nunit_nodes))
+                noperators, out_grp.nunit_dofs, in_grp.nunit_dofs))
             for i, op in enumerate(insn.operators):
                 matrices_ary[i] = matrices[op.rst_axis]
+            matrices_ary_dev = self.array_context.from_numpy(matrices_ary)
 
-            knl()(self.queue,
-                    diff_mat=matrices_ary,
-                    result=out_grp.view(result), vec=in_grp.view(field))
+            self.array_context.call_loopy(
+                    prg(noperators),
+                    diff_mat=matrices_ary_dev,
+                    result=make_obj_array([
+                        result[iop][out_grp.index]
+                        for iop in range(noperators)
+                        ]), vec=field[in_grp.index])
 
         return [(name, result[i]) for i, name in enumerate(insn.names)], []
 
@@ -467,18 +557,22 @@ class ExecutionMapper(mappers.Evaluator,
 # {{{ futures
 
 class MPIRecvFuture(object):
-    def __init__(self, recv_req, insn_name, remote_data_host, queue):
+    def __init__(self, array_context, bdry_discr, recv_req, insn_name,
+            remote_data_host):
+        self.array_context = array_context
+        self.bdry_discr = bdry_discr
         self.receive_request = recv_req
         self.insn_name = insn_name
         self.remote_data_host = remote_data_host
-        self.queue = queue
 
     def is_ready(self):
         return self.receive_request.Test()
 
     def __call__(self):
         self.receive_request.Wait()
-        remote_data = cl.array.to_device(self.queue, self.remote_data_host)
+        actx = self.array_context
+        remote_data = unflatten(self.array_context, self.bdry_discr,
+                actx.from_numpy(self.remote_data_host))
         return [(self.insn_name, remote_data)], []
 
 
@@ -490,7 +584,7 @@ class MPISendFuture(object):
         return self.send_request.Test()
 
     def __call__(self):
-        self.send_request.wait()
+        self.send_request.Wait()
         return [], []
 
 # }}}
@@ -499,16 +593,14 @@ class MPISendFuture(object):
 # {{{ bound operator
 
 class BoundOperator(object):
-
     def __init__(self, discrwb, discr_code, eval_code, debug_flags,
-            function_registry, exec_mapper_factory, allocator=None):
+            function_registry, exec_mapper_factory):
         self.discrwb = discrwb
         self.discr_code = discr_code
         self.eval_code = eval_code
         self.operator_data_cache = {}
         self.debug_flags = debug_flags
         self.function_registry = function_registry
-        self.allocator = allocator
         self.exec_mapper_factory = exec_mapper_factory
 
     def __str__(self):
@@ -523,16 +615,50 @@ class BoundOperator(object):
                 + sep
                 + str(self.eval_code))
 
-    def __call__(self, queue, profile_data=None, log_quantities=None, **context):
-        import pyopencl.array as cl_array
-
-        def replace_queue(a):
-            if isinstance(a, cl_array.Array):
-                return a.with_queue(queue)
+    def __call__(self, array_context: Optional[ArrayContext] = None,
+            *, profile_data=None, log_quantities=None, **context):
+        """
+        :arg array_context: only needs to be supplied if no instances of
+            :class:`~meshmode.dof_array.DOFArray` with a
+            :class:`~meshmode.array_context.ArrayContext`
+            are supplied as part of *context*.
+        """
+
+        # {{{ figure array context
+
+        array_contexts = []
+        if array_context is not None:
+            if not isinstance(array_context, ArrayContext):
+                raise TypeError(
+                        "first positional argument (if supplied) must be "
+                        "an ArrayContext")
+
+            array_contexts.append(array_context)
+        del array_context
+
+        def look_for_array_contexts(ary):
+            if isinstance(ary, DOFArray):
+                if ary.array_context is not None:
+                    array_contexts.append(ary.array_context)
+            elif isinstance(ary, np.ndarray) and ary.dtype.char == "O":
+                for idx in np.ndindex(ary.shape):
+                    look_for_array_contexts(ary[idx])
             else:
-                return a
+                pass
+
+        for key, val in context.items():
+            look_for_array_contexts(val)
 
-        from pytools.obj_array import with_object_array_or_scalar
+        if array_contexts:
+            from pytools import is_single_valued
+            if not is_single_valued(array_contexts):
+                raise ValueError("arguments do not agree on an array context")
+
+            array_context = array_contexts[0]
+        else:
+            raise ValueError("no array context given or available from arguments")
+
+        # }}}
 
         # {{{ discrwb-scope evaluation
 
@@ -541,18 +667,15 @@ class BoundOperator(object):
                     self.discrwb._discr_scoped_subexpr_name_to_value)
                 for result_var in self.discr_code.result):
             # need to do discrwb-scope evaluation
-            discrwb_eval_context = {}
+            discrwb_eval_context: Dict[str, ResultType] = {}
             self.discr_code.execute(
-                    self.exec_mapper_factory(queue, discrwb_eval_context, self))
+                    self.exec_mapper_factory(
+                        array_context, discrwb_eval_context, self))
 
         # }}}
 
-        new_context = {}
-        for name, var in six.iteritems(context):
-            new_context[name] = with_object_array_or_scalar(replace_queue, var)
-
         return self.eval_code.execute(
-                self.exec_mapper_factory(queue, new_context, self),
+                self.exec_mapper_factory(array_context, context, self),
                 profile_data=profile_data,
                 log_quantities=log_quantities)
 
@@ -561,8 +684,14 @@ class BoundOperator(object):
 
 # {{{ process_sym_operator function
 
-def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
-        dumper=lambda name, sym_operator: None):
+def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None, dumper=None,
+        local_only=None):
+    if local_only is None:
+        local_only = False
+
+    if dumper is None:
+        def dumper(name, sym_operator):
+            return
 
     orig_sym_operator = sym_operator
     import grudge.symbolic.mappers as mappers
@@ -575,26 +704,26 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
     sym_operator = \
             mappers.OppositeInteriorFaceSwapUniqueIDAssigner()(sym_operator)
 
-    # {{{ broadcast root rank's symn_operator
+    if not local_only:
+        # {{{ broadcast root rank's symn_operator
 
-    # also make sure all ranks had same orig_sym_operator
+        # also make sure all ranks had same orig_sym_operator
 
-    if discrwb.mpi_communicator is not None:
-        (mgmt_rank_orig_sym_operator, mgmt_rank_sym_operator) = \
-                discrwb.mpi_communicator.bcast(
-                    (orig_sym_operator, sym_operator),
-                    discrwb.get_management_rank_index())
+        if discrwb.mpi_communicator is not None:
+            (mgmt_rank_orig_sym_operator, mgmt_rank_sym_operator) = \
+                    discrwb.mpi_communicator.bcast(
+                        (orig_sym_operator, sym_operator),
+                        discrwb.get_management_rank_index())
 
-        from pytools.obj_array import is_equal as is_oa_equal
-        if not is_oa_equal(mgmt_rank_orig_sym_operator, orig_sym_operator):
-            raise ValueError("rank %d received a different symbolic "
-                    "operator to bind from rank %d"
-                    % (discrwb.mpi_communicator.Get_rank(),
-                        discrwb.get_management_rank_index()))
+            if not np.array_equal(mgmt_rank_orig_sym_operator, orig_sym_operator):
+                raise ValueError("rank %d received a different symbolic "
+                        "operator to bind from rank %d"
+                        % (discrwb.mpi_communicator.Get_rank(),
+                            discrwb.get_management_rank_index()))
 
-        sym_operator = mgmt_rank_sym_operator
+            sym_operator = mgmt_rank_sym_operator
 
-    # }}}
+        # }}}
 
     if post_bind_mapper is not None:
         dumper("before-postbind", sym_operator)
@@ -630,12 +759,13 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
 
     dumper("before-distributed", sym_operator)
 
-    volume_mesh = discrwb.discr_from_dd("vol").mesh
-    from meshmode.distributed import get_connected_partitions
-    connected_parts = get_connected_partitions(volume_mesh)
+    if not local_only:
+        volume_mesh = discrwb.discr_from_dd("vol").mesh
+        from meshmode.distributed import get_connected_partitions
+        connected_parts = get_connected_partitions(volume_mesh)
 
-    if connected_parts:
-        sym_operator = mappers.DistributedMapper(connected_parts)(sym_operator)
+        if connected_parts:
+            sym_operator = mappers.DistributedMapper(connected_parts)(sym_operator)
 
     dumper("before-imass", sym_operator)
     sym_operator = mappers.InverseMassContractor()(sym_operator)
@@ -654,10 +784,16 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
 # }}}
 
 
-def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
+def bind(discr, sym_operator, *, post_bind_mapper=lambda x: x,
         function_registry=base_function_registry,
         exec_mapper_factory=ExecutionMapper,
-        debug_flags=frozenset(), allocator=None):
+        debug_flags=frozenset(), local_only=None):
+    """
+    :param local_only: If *True*, *sym_operator* should oly be evaluated on the
+        local part of the mesh. No inter-rank communication will take place.
+        (However rank boundaries, tagged :class:`~meshmode.mesh.BTAG_PARTITION`,
+        will not automatically be considered part of the domain boundary.)
+    """
     # from grudge.symbolic.mappers import QuadratureUpsamplerRemover
     # sym_operator = QuadratureUpsamplerRemover(self.quad_min_degrees)(
     #         sym_operator)
@@ -677,7 +813,8 @@ def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
             discr,
             sym_operator,
             post_bind_mapper=post_bind_mapper,
-            dumper=dump_sym_operator)
+            dumper=dump_sym_operator,
+            local_only=local_only)
 
     from grudge.symbolic.compiler import OperatorCompiler
     discr_code, eval_code = OperatorCompiler(discr, function_registry)(sym_operator)
@@ -685,8 +822,7 @@ def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
     bound_op = BoundOperator(discr, discr_code, eval_code,
             function_registry=function_registry,
             exec_mapper_factory=exec_mapper_factory,
-            debug_flags=debug_flags,
-            allocator=allocator)
+            debug_flags=debug_flags)
 
     if "dump_op_code" in debug_flags:
         from pytools.debug import open_unique_debug_file
diff --git a/grudge/function_registry.py b/grudge/function_registry.py
index 4d521560d8e6c83a8e9fa238b3c46955dc62e7a7..7b85d18fcfbd4626b8c6ee6055c421c021c538be 100644
--- a/grudge/function_registry.py
+++ b/grudge/function_registry.py
@@ -26,11 +26,10 @@ THE SOFTWARE.
 """
 
 
-import loopy as lp
 import numpy as np
 
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
-from pytools import RecordWithoutPickling, memoize_in
+from pytools import RecordWithoutPickling
 
 
 # {{{ helpers
@@ -82,76 +81,29 @@ class Function(RecordWithoutPickling):
         raise NotImplementedError
 
 
-class CElementwiseUnaryFunction(Function):
-
+class CElementwiseFunction(Function):
     supports_codegen = True
 
+    def __init__(self, identifier, nargs):
+        super().__init__(identifier=identifier, nargs=nargs)
+
     def get_result_dofdesc(self, arg_dds):
-        assert len(arg_dds) == 1
+        assert len(arg_dds) == self.nargs
         return arg_dds[0]
 
-    def __call__(self, queue, arg):
+    def __call__(self, array_context, *args):
         func_name = self.identifier
-        if should_use_numpy(arg):
+        from pytools import single_valued
+        if single_valued(should_use_numpy(arg) for arg in args):
             func = getattr(np, func_name)
-            return func(arg)
+            return func(*args)
 
-        cached_name = "map_call_knl_"
-
-        from pymbolic.primitives import Variable
-        i = Variable("i")
-
-        if self.identifier == "fabs":  # FIXME
+        if func_name == "fabs":  # FIXME
             # Loopy has a type-adaptive "abs", but no "fabs".
             func_name = "abs"
 
-        cached_name += func_name
-
-        @memoize_in(self, cached_name)
-        def knl():
-            knl = lp.make_kernel(
-                "{[i]: 0<=i<n}",
-                [
-                    lp.Assignment(Variable("out")[i],
-                        Variable(func_name)(Variable("a")[i]))
-                ], default_offset=lp.auto)
-            return lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-        evt, (out,) = knl()(queue, a=arg)
-        return out
-
-
-class CElementwiseBinaryFunction(Function):
-    supports_codegen = True
-
-    def get_result_dofdesc(self, arg_dds):
-        assert len(arg_dds) == 2
-        from pytools import single_valued
-        return single_valued(arg_dds)
-
-    def __call__(self, queue, arg0, arg1):
-        func_name = self.identifier
-        if should_use_numpy(arg0) and should_use_numpy(arg1):
-            func = getattr(np, cl_to_numpy_function_name(func_name))
-            return func(arg0, arg1)
-
-        from pymbolic.primitives import Variable
-
-        @memoize_in(self, "map_call_knl_%s" % func_name)
-        def knl():
-            i = Variable("i")
-            knl = lp.make_kernel(
-                "{[i]: 0<=i<n}",
-                [
-                    lp.Assignment(
-                        Variable("out")[i],
-                        Variable(func_name)(Variable("a")[i], Variable("b")[i]))
-                ], default_offset=lp.auto)
-
-            return lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-        evt, (out,) = knl()(queue, a=arg0, b=arg1)
-        return out
+        sfunc = getattr(array_context.np, func_name)
+        return sfunc(*args)
 
 
 class CBesselFunction(Function):
@@ -173,8 +125,8 @@ class FixedDOFDescExternalFunction(Function):
                 implementation=implementation,
                 dd=dd)
 
-    def __call__(self, queue, *args, **kwargs):
-        return self.implementation(queue, *args, **kwargs)
+    def __call__(self, array_context, *args, **kwargs):
+        return self.implementation(array_context, *args, **kwargs)
 
     def get_result_dofdesc(self, arg_dds):
         return self.dd
@@ -220,12 +172,12 @@ class FunctionRegistry(RecordWithoutPickling):
 def _make_bfr():
     bfr = FunctionRegistry()
 
-    bfr = bfr.register(CElementwiseUnaryFunction("sqrt"))
-    bfr = bfr.register(CElementwiseUnaryFunction("exp"))
-    bfr = bfr.register(CElementwiseUnaryFunction("fabs"))
-    bfr = bfr.register(CElementwiseUnaryFunction("sin"))
-    bfr = bfr.register(CElementwiseUnaryFunction("cos"))
-    bfr = bfr.register(CElementwiseBinaryFunction("atan2"))
+    bfr = bfr.register(CElementwiseFunction("sqrt", 1))
+    bfr = bfr.register(CElementwiseFunction("exp", 1))
+    bfr = bfr.register(CElementwiseFunction("fabs", 1))
+    bfr = bfr.register(CElementwiseFunction("sin", 1))
+    bfr = bfr.register(CElementwiseFunction("cos", 1))
+    bfr = bfr.register(CElementwiseFunction("atan2", 2))
     bfr = bfr.register(CBesselFunction("bessel_j"))
     bfr = bfr.register(CBesselFunction("bessel_y"))
 
diff --git a/grudge/models/em.py b/grudge/models/em.py
index f564196711a1b2749a1987d93cb6f2eeaf85eb74..4810b4d0ebd8f023adc2eebf01dd62bc67957d6d 100644
--- a/grudge/models/em.py
+++ b/grudge/models/em.py
@@ -34,7 +34,7 @@ from pytools import memoize_method
 from grudge.models import HyperbolicOperator
 from meshmode.mesh import BTAG_ALL, BTAG_NONE
 from grudge import sym
-from pytools.obj_array import join_fields, make_obj_array
+from pytools.obj_array import flat_obj_array, make_obj_array
 
 # TODO: Check PML
 
@@ -133,7 +133,7 @@ class MaxwellOperator(HyperbolicOperator):
             # if self.fixed_material:
             #     max_c = (self.epsilon*self.mu)**(-0.5)
 
-            return join_fields(
+            return flat_obj_array(
                     # flux e,
                     1/2*(
                         -self.space_cross_h(normal, h.int-h.ext)
@@ -148,7 +148,7 @@ class MaxwellOperator(HyperbolicOperator):
                     ))
         elif isinstance(self.flux_type, (int, float)):
             # see doc/maxima/maxwell.mac
-            return join_fields(
+            return flat_obj_array(
                     # flux e,
                     (
                         -1/(Z_int+Z_ext)*self.space_cross_h(normal,
@@ -182,7 +182,7 @@ class MaxwellOperator(HyperbolicOperator):
             return self.space_cross_h(nabla, field)
 
         # in conservation form: u_t + A u_x = 0
-        return join_fields(
+        return flat_obj_array(
                 (self.current - h_curl(h)),
                 e_curl(e)
                 )
@@ -194,7 +194,7 @@ class MaxwellOperator(HyperbolicOperator):
         pec_e = sym.cse(sym.project("vol", self.pec_tag)(e))
         pec_h = sym.cse(sym.project("vol", self.pec_tag)(h))
 
-        return join_fields(-pec_e, pec_h)
+        return flat_obj_array(-pec_e, pec_h)
 
     def pmc_bc(self, w):
         "Construct part of the flux operator template for PMC boundary conditions"
@@ -203,7 +203,7 @@ class MaxwellOperator(HyperbolicOperator):
         pmc_e = sym.cse(sym.project("vol", self.pmc_tag)(e))
         pmc_h = sym.cse(sym.project("vol", self.pmc_tag)(h))
 
-        return join_fields(pmc_e, -pmc_h)
+        return flat_obj_array(pmc_e, -pmc_h)
 
     def absorbing_bc(self, w):
         """Construct part of the flux operator template for 1st order
@@ -224,7 +224,7 @@ class MaxwellOperator(HyperbolicOperator):
         absorb_e = sym.cse(sym.project("vol", self.absorb_tag)(e))
         absorb_h = sym.cse(sym.project("vol", self.absorb_tag)(h))
 
-        bc = join_fields(
+        bc = flat_obj_array(
                 absorb_e + 1/2*(self.space_cross_h(absorb_normal, self.space_cross_e(
                     absorb_normal, absorb_e))
                     - absorb_Z*self.space_cross_h(absorb_normal, absorb_h)),
@@ -437,7 +437,7 @@ def get_rectangular_cavity_mode(E_0, mode_indices):  # noqa: N803
     if dims == 2:
         tfac = sym.ScalarVariable("t") * omega
 
-        result = join_fields(
+        result = flat_obj_array(
             0,
             0,
             sym.sin(kx * x) * sym.sin(ky * y) * sym.cos(tfac),  # ez
@@ -449,7 +449,7 @@ def get_rectangular_cavity_mode(E_0, mode_indices):  # noqa: N803
         tdep = sym.exp(-1j * omega * sym.ScalarVariable("t"))
 
         gamma_squared = ky**2 + kx**2
-        result = join_fields(
+        result = flat_obj_array(
             -kx * kz * E_0*cx*sy*sz*tdep / gamma_squared,  # ex
             -ky * kz * E_0*sx*cy*sz*tdep / gamma_squared,  # ey
             E_0 * sx*sy*cz*tdep,  # ez
diff --git a/grudge/models/wave.py b/grudge/models/wave.py
index 116cd329b0d648722b2537bb99545d783301b7b0..09a954e873573df374402c28762668573ae71882 100644
--- a/grudge/models/wave.py
+++ b/grudge/models/wave.py
@@ -30,7 +30,7 @@ import numpy as np
 from grudge.models import HyperbolicOperator
 from meshmode.mesh import BTAG_ALL, BTAG_NONE
 from grudge import sym
-from pytools.obj_array import join_fields
+from pytools.obj_array import flat_obj_array
 
 
 # {{{ constant-velocity
@@ -83,7 +83,7 @@ class StrongWaveOperator(HyperbolicOperator):
         v = w[1:]
         normal = sym.normal(w.dd, self.ambient_dim)
 
-        flux_weak = join_fields(
+        flux_weak = flat_obj_array(
                 np.dot(v.avg, normal),
                 u.avg * normal)
 
@@ -91,13 +91,13 @@ class StrongWaveOperator(HyperbolicOperator):
             pass
         elif self.flux_type == "upwind":
             # see doc/notes/grudge-notes.tm
-            flux_weak -= self.sign*join_fields(
+            flux_weak -= self.sign*flat_obj_array(
                     0.5*(u.int-u.ext),
                     0.5*(normal * np.dot(normal, v.int-v.ext)))
         else:
             raise ValueError("invalid flux type '%s'" % self.flux_type)
 
-        flux_strong = join_fields(
+        flux_strong = flat_obj_array(
                 np.dot(v.int, normal),
                 u.int * normal) - flux_weak
 
@@ -122,16 +122,16 @@ class StrongWaveOperator(HyperbolicOperator):
                     "are still having issues.")
 
             dir_g = sym.Field("dir_bc_u")
-            dir_bc = join_fields(2*dir_g - dir_u, dir_v)
+            dir_bc = flat_obj_array(2*dir_g - dir_u, dir_v)
         else:
-            dir_bc = join_fields(-dir_u, dir_v)
+            dir_bc = flat_obj_array(-dir_u, dir_v)
 
         dir_bc = sym.cse(dir_bc, "dir_bc")
 
         # neumann BCs ---------------------------------------------------------
         neu_u = sym.cse(sym.project("vol", self.neumann_tag)(u))
         neu_v = sym.cse(sym.project("vol", self.neumann_tag)(v))
-        neu_bc = sym.cse(join_fields(neu_u, -neu_v), "neu_bc")
+        neu_bc = sym.cse(flat_obj_array(neu_u, -neu_v), "neu_bc")
 
         # radiation BCs -------------------------------------------------------
         rad_normal = sym.normal(self.radiation_tag, d)
@@ -139,7 +139,7 @@ class StrongWaveOperator(HyperbolicOperator):
         rad_u = sym.cse(sym.project("vol", self.radiation_tag)(u))
         rad_v = sym.cse(sym.project("vol", self.radiation_tag)(v))
 
-        rad_bc = sym.cse(join_fields(
+        rad_bc = sym.cse(flat_obj_array(
                 0.5*(rad_u - self.sign*np.dot(rad_normal, rad_v)),
                 0.5*rad_normal*(np.dot(rad_normal, rad_v) - self.sign*rad_u)
                 ), "rad_bc")
@@ -152,7 +152,7 @@ class StrongWaveOperator(HyperbolicOperator):
                     self.flux(pair))
 
         result = (
-                - join_fields(
+                - flat_obj_array(
                     -self.c*np.dot(nabla, v),
                     -self.c*(nabla*u)
                     )
@@ -230,14 +230,14 @@ class WeakWaveOperator(HyperbolicOperator):
         v = w[1:]
         normal = sym.normal(w.dd, self.ambient_dim)
 
-        flux_weak = join_fields(
+        flux_weak = flat_obj_array(
                 np.dot(v.avg, normal),
                 u.avg * normal)
 
         if self.flux_type == "central":
             return -self.c*flux_weak
         elif self.flux_type == "upwind":
-            return -self.c*(flux_weak + self.sign*join_fields(
+            return -self.c*(flux_weak + self.sign*flat_obj_array(
                     0.5*(u.int-u.ext),
                     0.5*(normal * np.dot(normal, v.int-v.ext))))
         else:
@@ -262,16 +262,16 @@ class WeakWaveOperator(HyperbolicOperator):
                     "are still having issues.")
 
             dir_g = sym.Field("dir_bc_u")
-            dir_bc = join_fields(2*dir_g - dir_u, dir_v)
+            dir_bc = flat_obj_array(2*dir_g - dir_u, dir_v)
         else:
-            dir_bc = join_fields(-dir_u, dir_v)
+            dir_bc = flat_obj_array(-dir_u, dir_v)
 
         dir_bc = sym.cse(dir_bc, "dir_bc")
 
         # neumann BCs ---------------------------------------------------------
         neu_u = sym.cse(sym.project("vol", self.neumann_tag)(u))
         neu_v = sym.cse(sym.project("vol", self.neumann_tag)(v))
-        neu_bc = sym.cse(join_fields(neu_u, -neu_v), "neu_bc")
+        neu_bc = sym.cse(flat_obj_array(neu_u, -neu_v), "neu_bc")
 
         # radiation BCs -------------------------------------------------------
         rad_normal = sym.normal(self.radiation_tag, d)
@@ -279,7 +279,7 @@ class WeakWaveOperator(HyperbolicOperator):
         rad_u = sym.cse(sym.project("vol", self.radiation_tag)(u))
         rad_v = sym.cse(sym.project("vol", self.radiation_tag)(v))
 
-        rad_bc = sym.cse(join_fields(
+        rad_bc = sym.cse(flat_obj_array(
                 0.5*(rad_u - self.sign*np.dot(rad_normal, rad_v)),
                 0.5*rad_normal*(np.dot(rad_normal, rad_v) - self.sign*rad_u)
                 ), "rad_bc")
@@ -289,7 +289,7 @@ class WeakWaveOperator(HyperbolicOperator):
             return sym.project(pair.dd, "all_faces")(self.flux(pair))
 
         result = sym.InverseMassOperator()(
-                join_fields(
+                flat_obj_array(
                     -self.c*np.dot(sym.stiffness_t(self.ambient_dim), v),
                     -self.c*(sym.stiffness_t(self.ambient_dim)*u)
                     )
@@ -371,14 +371,16 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
         normal = sym.normal(w.dd, self.ambient_dim)
 
         if self.flux_type == "central":
-            return -0.5 * join_fields(
+            return -0.5 * flat_obj_array(
                 np.dot(v.int*c.int + v.ext*c.ext, normal),
                 (u.int * c.int + u.ext*c.ext) * normal)
 
         elif self.flux_type == "upwind":
-            return -0.5 * join_fields(np.dot(normal, c.ext * v.ext + c.int * v.int)
+            return -0.5 * flat_obj_array(
+                    np.dot(normal, c.ext * v.ext + c.int * v.int)
                     + c.ext*u.ext - c.int * u.int,
-                normal * (np.dot(normal, c.ext * v.ext - c.int * v.int)
+
+                    normal * (np.dot(normal, c.ext * v.ext - c.int * v.int)
                     + c.ext*u.ext + c.int * u.int))
 
         else:
@@ -390,7 +392,7 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
         w = sym.make_sym_array("w", d+1)
         u = w[0]
         v = w[1:]
-        flux_w = join_fields(self.c, w)
+        flux_w = flat_obj_array(self.c, w)
 
         # boundary conditions -------------------------------------------------
 
@@ -405,9 +407,9 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
                     "are still having issues.")
 
             dir_g = sym.Field("dir_bc_u")
-            dir_bc = join_fields(dir_c, 2*dir_g - dir_u, dir_v)
+            dir_bc = flat_obj_array(dir_c, 2*dir_g - dir_u, dir_v)
         else:
-            dir_bc = join_fields(dir_c, -dir_u, dir_v)
+            dir_bc = flat_obj_array(dir_c, -dir_u, dir_v)
 
         dir_bc = sym.cse(dir_bc, "dir_bc")
 
@@ -415,7 +417,7 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
         neu_c = sym.cse(sym.project("vol", self.neumann_tag)(self.c))
         neu_u = sym.cse(sym.project("vol", self.neumann_tag)(u))
         neu_v = sym.cse(sym.project("vol", self.neumann_tag)(v))
-        neu_bc = sym.cse(join_fields(neu_c, neu_u, -neu_v), "neu_bc")
+        neu_bc = sym.cse(flat_obj_array(neu_c, neu_u, -neu_v), "neu_bc")
 
         # radiation BCs -------------------------------------------------------
         rad_normal = sym.normal(self.radiation_tag, d)
@@ -424,7 +426,7 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
         rad_u = sym.cse(sym.project("vol", self.radiation_tag)(u))
         rad_v = sym.cse(sym.project("vol", self.radiation_tag)(v))
 
-        rad_bc = sym.cse(join_fields(rad_c,
+        rad_bc = sym.cse(flat_obj_array(rad_c,
                 0.5*(rad_u - sym.project("vol", self.radiation_tag)(self.sign)
                     * np.dot(rad_normal, rad_v)),
                 0.5*rad_normal*(np.dot(rad_normal, rad_v)
@@ -436,7 +438,7 @@ class VariableCoefficientWeakWaveOperator(HyperbolicOperator):
             return sym.project(pair.dd, "all_faces")(self.flux(pair))
 
         result = sym.InverseMassOperator()(
-                join_fields(
+                flat_obj_array(
                     -self.c*np.dot(sym.stiffness_t(self.ambient_dim), v),
                     -self.c*(sym.stiffness_t(self.ambient_dim)*u)
                     )
diff --git a/grudge/shortcuts.py b/grudge/shortcuts.py
index 475f72d24be204627b114813493ce623c431f9c4..94b8fb2ac6674667f8753aed501fbcc1806a1ffc 100644
--- a/grudge/shortcuts.py
+++ b/grudge/shortcuts.py
@@ -25,9 +25,6 @@ THE SOFTWARE.
 """
 
 
-import pyopencl as cl
-
-
 def set_up_rk4(field_var_name, dt, fields, rhs, t_start=0):
     from leap.rk import LSRK4MethodBuilder
     from dagrt.codegen import PythonCodeGenerator
@@ -46,14 +43,14 @@ def set_up_rk4(field_var_name, dt, fields, rhs, t_start=0):
 
 def make_visualizer(discrwb, vis_order):
     from meshmode.discretization.visualization import make_visualizer
-    with cl.CommandQueue(discrwb.cl_context) as queue:
-        return make_visualizer(queue, discrwb.discr_from_dd("vol"), vis_order)
+    return make_visualizer(
+            discrwb._setup_actx,
+            discrwb.discr_from_dd("vol"), vis_order)
 
 
 def make_boundary_visualizer(discrwb, vis_order):
     from meshmode.discretization.visualization import make_visualizer
     from grudge import sym
-    with cl.CommandQueue(discrwb.cl_context) as queue:
-        return make_visualizer(
-                queue, discrwb.discr_from_dd(sym.BTAG_ALL),
-                vis_order)
+    return make_visualizer(
+            discrwb._setup_actx, discrwb.discr_from_dd(sym.BTAG_ALL),
+            vis_order)
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index c90f61238508af66e0abf5b028608426211fc45d..9ff67b5c7791045f14a80028bfe314805c8a48a5 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -28,7 +28,10 @@ import numpy as np
 
 import six  # noqa
 from six.moves import zip, reduce
+
 from pytools import Record, memoize_method, memoize
+from pytools.obj_array import obj_array_vectorize
+
 from grudge import sym
 import grudge.symbolic.mappers as mappers
 from pymbolic.primitives import Variable, Subscript
@@ -40,7 +43,6 @@ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa: F401
 # {{{ instructions
 
 class Instruction(Record):
-    __slots__ = []
     priority = 0
     neglect_for_dofdesc_inference = False
 
@@ -87,7 +89,7 @@ class LoopyKernelDescriptor(object):
         import loopy as lp
         return [arg.name for arg in self.loopy_kernel.args
                 if isinstance(arg, lp.ValueArg)
-                and arg.name != "grdg_n"]
+                and arg.name not in ["nelements", "nunit_dofs"]]
 
 
 class LoopyKernelInstruction(Instruction):
@@ -364,9 +366,7 @@ def dot_dataflow_graph(code, max_node_label_length=30,
         for dep in insn.get_dependencies():
             gen_expr_arrow(dep, node_names[insn])
 
-    from pytools.obj_array import is_obj_array
-
-    if is_obj_array(code.result):
+    if isinstance(code.result, np.ndarray) and code.result.dtype.char == "O":
         for subexp in code.result:
             gen_expr_arrow(subexp, "result")
     else:
@@ -470,8 +470,6 @@ class Code(object):
 
         # {{{ make sure results do not get discarded
 
-        from pytools.obj_array import with_object_array_or_scalar
-
         dm = mappers.DependencyMapper(composite_leaves=False)
 
         def remove_result_variable(result_expr):
@@ -484,7 +482,7 @@ class Code(object):
                 assert isinstance(var, Variable)
                 discardable_vars.discard(var.name)
 
-        with_object_array_or_scalar(remove_result_variable, self.result)
+        obj_array_vectorize(remove_result_variable, self.result)
 
         # }}}
 
@@ -594,12 +592,11 @@ class Code(object):
 
         if log_quantities is not None:
             exec_sub_timer.stop().submit()
-        from pytools.obj_array import with_object_array_or_scalar
         if profile_data is not None:
             profile_data['total_time'] = time() - start_time
-            return (with_object_array_or_scalar(exec_mapper, self.result),
+            return (obj_array_vectorize(exec_mapper, self.result),
                     profile_data)
-        return with_object_array_or_scalar(exec_mapper, self.result)
+        return obj_array_vectorize(exec_mapper, self.result)
 
 # }}}
 
@@ -768,8 +765,7 @@ def aggregate_assignments(inf_mapper, instructions, result,
             for insn in processed_assigns + other_insns
             for expr in insn.get_dependencies())
 
-    from pytools.obj_array import is_obj_array
-    if is_obj_array(result):
+    if isinstance(result, np.ndarray) and result.dtype.char == "O":
         externally_used_names |= set(expr for expr in result)
     else:
         externally_used_names |= set([result])
@@ -846,13 +842,11 @@ def is_function_loopyable(function, function_registry):
 
 
 class ToLoopyExpressionMapper(mappers.IdentityMapper):
-    def __init__(self, dd_inference_mapper, temp_names, iname):
+    def __init__(self, dd_inference_mapper, temp_names, subscript):
         self.dd_inference_mapper = dd_inference_mapper
         self.function_registry = dd_inference_mapper.function_registry
         self.temp_names = temp_names
-        self.iname = iname
-        from pymbolic import var
-        self.iname_expr = var(iname)
+        self.subscript = subscript
 
         self.expr_to_name = {}
         self.used_names = set()
@@ -888,7 +882,7 @@ class ToLoopyExpressionMapper(mappers.IdentityMapper):
             return var(name)
         else:
             self.non_scalar_vars.append(name)
-            return var(name)[self.iname_expr]
+            return var(name)[self.subscript]
 
     def map_variable(self, expr):
         return self.map_variable_ref_expr(expr, expr.name)
@@ -1005,16 +999,19 @@ class ToLoopyInstructionMapper(object):
                         insn.exprs[0], self.function_registry))):
             return insn
 
-        iname = "grdg_i"
-        size_name = "grdg_n"
+        # FIXME: These names and the size names could clash with user-given names.
+        # Need better metadata tracking in loopy.
+        iel = "iel"
+        idof = "idof"
 
         temp_names = [
                 name
                 for name, dnr in zip(insn.names, insn.do_not_return)
                 if dnr]
 
+        from pymbolic import var
         expr_mapper = ToLoopyExpressionMapper(
-                self.dd_inference_mapper, temp_names, iname)
+                self.dd_inference_mapper, temp_names, (var(iel), var(idof)))
         insns = []
 
         import loopy as lp
@@ -1034,17 +1031,21 @@ class ToLoopyInstructionMapper(object):
             return insn
 
         knl = lp.make_kernel(
-                "{[%s]: 0 <= %s < %s}" % (iname, iname, size_name),
+                "{[%(iel)s, %(idof)s]: "
+                "0 <= %(iel)s < nelements and 0 <= %(idof)s < nunit_dofs}"
+                % {"iel": iel, "idof": idof},
                 insns,
-                default_offset=lp.auto,
 
                 name="grudge_assign_%d" % self.insn_count,
+
                 # Single-insn kernels may have their no_sync_with resolve to an
                 # empty set, that's OK.
-                options=lp.Options(check_dep_resolution=False))
-
-        knl = lp.set_options(knl, return_dict=True)
-        knl = lp.split_iname(knl, iname, 128, outer_tag="g.0", inner_tag="l.0")
+                options=lp.Options(
+                    check_dep_resolution=False,
+                    return_dict=True,
+                    no_numpy=True,
+                    )
+                )
 
         self.insn_count += 1
 
diff --git a/grudge/symbolic/operators.py b/grudge/symbolic/operators.py
index b57f22fd3b9fdb412c13fd01df5adc4045e0ea6c..b5d240606136a31764cfab68626e1fc78f401a8f 100644
--- a/grudge/symbolic/operators.py
+++ b/grudge/symbolic/operators.py
@@ -27,6 +27,8 @@ from six.moves import intern
 import numpy as np
 import pymbolic.primitives
 
+from typing import Tuple
+
 __doc__ = """
 
 Building blocks and mappers for operator expression trees.
@@ -111,7 +113,7 @@ class Operator(pymbolic.primitives.Expression):
         return StringifyMapper
 
     def __call__(self, expr):
-        from pytools.obj_array import with_object_array_or_scalar
+        from pytools.obj_array import obj_array_vectorize
         from grudge.tools import is_zero
 
         def bind_one(subexpr):
@@ -121,7 +123,7 @@ class Operator(pymbolic.primitives.Expression):
                 from grudge.symbolic.primitives import OperatorBinding
                 return OperatorBinding(self, subexpr)
 
-        return with_object_array_or_scalar(bind_one, expr)
+        return obj_array_vectorize(bind_one, expr)
 
     def with_dd(self, dd_in=None, dd_out=None):
         """Return a copy of *self*, modified to the given DOF descriptors.
@@ -131,7 +133,7 @@ class Operator(pymbolic.primitives.Expression):
                 dd_in=dd_in or self.dd_in,
                 dd_out=dd_out or self.dd_out)
 
-    init_arg_names = ("dd_in", "dd_out")
+    init_arg_names: Tuple[str, ...] = ("dd_in", "dd_out")
 
     def __getinitargs__(self):
         return (self.dd_in, self.dd_out,)
@@ -151,7 +153,7 @@ class ProjectionOperator(Operator):
         super(ProjectionOperator, self).__init__(dd_in, dd_out)
 
     def __call__(self, expr):
-        from pytools.obj_array import with_object_array_or_scalar
+        from pytools.obj_array import obj_array_vectorize
 
         def project_one(subexpr):
             from pymbolic.primitives import is_constant
@@ -164,7 +166,7 @@ class ProjectionOperator(Operator):
                 from grudge.symbolic.primitives import OperatorBinding
                 return OperatorBinding(self, subexpr)
 
-        return with_object_array_or_scalar(project_one, expr)
+        return obj_array_vectorize(project_one, expr)
 
     mapper_method = intern("map_projection")
 
@@ -640,9 +642,9 @@ class RefFaceMassOperator(ElementwiseLinearOperator):
         assert afgrp.nelements == nfaces * volgrp.nelements
 
         matrix = np.empty(
-                (volgrp.nunit_nodes,
+                (volgrp.nunit_dofs,
                     nfaces,
-                    afgrp.nunit_nodes),
+                    afgrp.nunit_dofs),
                 dtype=dtype)
 
         from modepy.tools import UNIT_VERTICES
diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index b1f121d209b6fa74170877170512a98681bb3a01..ef6ce5a7d38f92b3693b8132c305a500f7a8954f 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -363,8 +363,8 @@ class FunctionSymbol(ExpressionBase, VariableBase):
     """
 
     def __call__(self, *exprs):
-        from pytools.obj_array import with_object_array_or_scalar_n_args
-        return with_object_array_or_scalar_n_args(
+        from pytools.obj_array import obj_array_vectorize_n_args
+        return obj_array_vectorize_n_args(
                 super(FunctionSymbol, self).__call__, *exprs)
 
     mapper_method = "map_function_symbol"
@@ -397,10 +397,9 @@ class OperatorBinding(ExpressionBase):
         return self.op, self.field
 
     def is_equal(self, other):
-        from pytools.obj_array import obj_array_equal
         return (other.__class__ == self.__class__
                 and other.op == self.op
-                and obj_array_equal(other.field, self.field))
+                and np.array_equal(other.field, self.field))
 
     def get_hash(self):
         from pytools.obj_array import obj_array_to_hashable
diff --git a/grudge/tools.py b/grudge/tools.py
index 8c5f57b6f0704bc3a5d872cc77ddac5fe366fcf8..67107e48048b4e78727a97e5d828f9a53cffdcad 100644
--- a/grudge/tools.py
+++ b/grudge/tools.py
@@ -92,11 +92,11 @@ class SubsettableCrossProduct:
           used in place of the product *sign*xj*yk*. Defaults to just this
           product if not given.
         """
-        from pytools.obj_array import join_fields
+        from pytools.obj_array import flat_obj_array
         if three_mult is None:
-            return join_fields(*[f(x, y) for f in self.functions])
+            return flat_obj_array(*[f(x, y) for f in self.functions])
         else:
-            return join_fields(
+            return flat_obj_array(
                     *[sum(three_mult(lc, x[j], y[k]) for lc, j, k in lcjk)
                     for lcjk in self.component_lcjk])
 
diff --git a/requirements.txt b/requirements.txt
index deb093942eb18a06631afdde454793a5da1d2fb4..101166fb2a22e850d7698f14b16369bf083ca3d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
+git+https://github.com/inducer/pytools.git
 git+https://github.com/inducer/pymbolic.git
 git+https://github.com/inducer/islpy.git
 git+https://github.com/inducer/pyopencl.git
diff --git a/setup.cfg b/setup.cfg
index 5a9d9143c76ed52ca9dc4b326cf18d6b1f7f3de4..b2d98b73fe5e4ba7299a6e5aa4ce24b5e9591691 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -8,3 +8,10 @@ exclude=
   grudge/models/diffusion.py,
   grudge/models/nd_calculus.py,
   grudge/dt_finding.py
+
+[mypy]
+ignore_missing_imports = True
+
+[mypy-grudge.models.gas_dynamics]
+ignore_errors = True
+
diff --git a/setup.py b/setup.py
index 59dcb5c85c39006f335642f400b9f991778f985f..dd85b0dbc1f6b20bdae108442ba1bea58a7f2f1f 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ def main():
           author="Andreas Kloeckner",
           author_email="inform@tiker.net",
           license="MIT",
-          url="http://gitlab.tiker.net/inducer/grudge",
+          url="http://github.com/inducer/grudge",
           classifiers=[
               'Development Status :: 3 - Alpha',
               'Intended Audience :: Developers',
@@ -45,9 +45,9 @@ def main():
           python_requires="~=3.6",
           install_requires=[
               "pytest>=2.3",
-              "pytools>=2018.5.2",
+              "pytools>=2020.3",
               "modepy>=2013.3",
-              "meshmode>=2013.3",
+              "meshmode>=2020.2",
               "pyopencl>=2013.1",
               "pymbolic>=2013.2",
               "loo.py>=2013.1beta",
diff --git a/test/test_grudge.py b/test/test_grudge.py
index 4734a199551abe8bb625e9cd5744dce46610a9cd..b6d82a3f4b7e64f871b39988077b9c9ba64f0bea 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -25,14 +25,14 @@ THE SOFTWARE.
 
 import numpy as np
 import numpy.linalg as la
+import pytest  # noqa
 
-import pyopencl as cl
-import pyopencl.array
-import pyopencl.clmath
+from pytools.obj_array import flat_obj_array, make_obj_array
 
-from pytools.obj_array import join_fields, make_obj_array
+import pyopencl as cl
 
-import pytest  # noqa
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import unflatten, flatten, flat_norm
 
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
@@ -50,6 +50,7 @@ from grudge import sym, bind, DGDiscretizationWithBoundaries
 def test_inverse_metric(ctx_factory, dim):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(a=(-0.5,)*dim, b=(0.5,)*dim,
@@ -70,7 +71,7 @@ def test_inverse_metric(ctx_factory, dim):
     from meshmode.mesh.processing import map_mesh
     mesh = map_mesh(mesh, m)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=4)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=4)
 
     sym_op = (
             sym.forward_metric_derivative_mat(mesh.dim)
@@ -80,13 +81,13 @@ def test_inverse_metric(ctx_factory, dim):
             .reshape(-1))
 
     op = bind(discr, sym_op)
-    mat = op(queue).reshape(mesh.dim, mesh.dim)
+    mat = op(actx).reshape(mesh.dim, mesh.dim)
 
     for i in range(mesh.dim):
         for j in range(mesh.dim):
             tgt = 1 if i == j else 0
 
-            err = np.max(np.abs((mat[i, j] - tgt).get(queue=queue)))
+            err = flat_norm(mat[i, j] - tgt, np.inf)
             logger.info("error[%d, %d]: %.5e", i, j, err)
             assert err < 1.0e-12, (i, j, err)
 
@@ -99,6 +100,7 @@ def test_mass_mat_trig(ctx_factory, ambient_dim, quad_tag):
     """
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     nelements = 17
     order = 4
@@ -120,7 +122,7 @@ def test_mass_mat_trig(ctx_factory, ambient_dim, quad_tag):
     mesh = generate_regular_rect_mesh(
             a=(a,)*ambient_dim, b=(b,)*ambient_dim,
             n=(nelements,)*ambient_dim, order=1)
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order,
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=order,
             quad_tag_to_group_factory=quad_tag_to_group_factory)
 
     def _get_variables_on(dd):
@@ -131,20 +133,20 @@ def test_mass_mat_trig(ctx_factory, ambient_dim, quad_tag):
         return sym_f, sym_x, sym_ones
 
     sym_f, sym_x, sym_ones = _get_variables_on(sym.DD_VOLUME)
-    f_volm = bind(discr, sym.cos(sym_x[0])**2)(queue).get()
-    ones_volm = bind(discr, sym_ones)(queue).get()
+    f_volm = actx.to_numpy(flatten(bind(discr, sym.cos(sym_x[0])**2)(actx)))
+    ones_volm = actx.to_numpy(flatten(bind(discr, sym_ones)(actx)))
 
     sym_f, sym_x, sym_ones = _get_variables_on(dd_quad)
-    f_quad = bind(discr, sym.cos(sym_x[0])**2)(queue)
-    ones_quad = bind(discr, sym_ones)(queue)
+    f_quad = bind(discr, sym.cos(sym_x[0])**2)(actx)
+    ones_quad = bind(discr, sym_ones)(actx)
 
     mass_op = bind(discr, sym.MassOperator(dd_quad, sym.DD_VOLUME)(sym_f))
 
-    num_integral_1 = np.dot(ones_volm, mass_op(queue, f=f_quad).get())
+    num_integral_1 = np.dot(ones_volm, actx.to_numpy(flatten(mass_op(f=f_quad))))
     err_1 = abs(num_integral_1 - true_integral)
     assert err_1 < 1e-9, err_1
 
-    num_integral_2 = np.dot(f_volm, mass_op(queue, f=ones_quad).get())
+    num_integral_2 = np.dot(f_volm, actx.to_numpy(flatten(mass_op(f=ones_quad))))
     err_2 = abs(num_integral_2 - true_integral)
     assert err_2 < 1.0e-9, err_2
 
@@ -152,7 +154,7 @@ def test_mass_mat_trig(ctx_factory, ambient_dim, quad_tag):
         # NOTE: `integral` always makes a square mass matrix and
         # `QuadratureSimplexGroupFactory` does not have a `mass_matrix` method.
         num_integral_3 = bind(discr,
-                sym.integral(sym_f, dd=dd_quad))(queue, f=f_quad)
+                sym.integral(sym_f, dd=dd_quad))(f=f_quad)
         err_3 = abs(num_integral_3 - true_integral)
         assert err_3 < 5.0e-10, err_3
 
@@ -166,6 +168,7 @@ def test_tri_diff_mat(ctx_factory, dim, order=4):
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import generate_regular_rect_mesh
 
@@ -176,20 +179,20 @@ def test_tri_diff_mat(ctx_factory, dim, order=4):
         mesh = generate_regular_rect_mesh(a=(-0.5,)*dim, b=(0.5,)*dim,
                 n=(n,)*dim, order=4)
 
-        discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=4)
+        discr = DGDiscretizationWithBoundaries(actx, mesh, order=4)
         nabla = sym.nabla(dim)
 
         for axis in range(dim):
             x = sym.nodes(dim)
 
-            f = bind(discr, sym.sin(3*x[axis]))(queue)
-            df = bind(discr, 3*sym.cos(3*x[axis]))(queue)
+            f = bind(discr, sym.sin(3*x[axis]))(actx)
+            df = bind(discr, 3*sym.cos(3*x[axis]))(actx)
 
             sym_op = nabla[axis](sym.var("f"))
             bound_op = bind(discr, sym_op)
-            df_num = bound_op(queue, f=f)
+            df_num = bound_op(f=f)
 
-            linf_error = la.norm((df_num-df).get(), np.Inf)
+            linf_error = flat_norm(df_num-df, np.Inf)
             axis_eoc_recs[axis].add_data_point(1/n, linf_error)
 
     for axis, eoc_rec in enumerate(axis_eoc_recs):
@@ -217,11 +220,12 @@ def test_2d_gauss_theorem(ctx_factory):
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=2)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=2)
 
     def f(x):
-        return join_fields(
+        return flat_obj_array(
                 sym.sin(3*x[0])+sym.cos(3*x[1]),
                 sym.sin(2*x[0])+sym.cos(x[1]))
 
@@ -234,7 +238,7 @@ def test_2d_gauss_theorem(ctx_factory):
                 sym.project("vol", sym.BTAG_ALL)(f(sym.nodes(2)))
                 .dot(sym.normal(sym.BTAG_ALL, 2)),
                 dd=sym.BTAG_ALL)
-            )(queue)
+            )(actx)
 
     assert abs(gauss_err) < 1e-13
 
@@ -255,6 +259,7 @@ def test_convergence_advec(ctx_factory, mesh_name, mesh_pars, op_type, flux_type
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from pytools.convergence import EOCRecorder
     eoc_rec = EOCRecorder()
@@ -314,7 +319,7 @@ def test_convergence_advec(ctx_factory, mesh_name, mesh_pars, op_type, flux_type
 
         from grudge.models.advection import (
                 StrongAdvectionOperator, WeakAdvectionOperator)
-        discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+        discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
         op_class = {
                 "strong": StrongAdvectionOperator,
                 "weak": WeakAdvectionOperator,
@@ -325,17 +330,17 @@ def test_convergence_advec(ctx_factory, mesh_name, mesh_pars, op_type, flux_type
 
         bound_op = bind(discr, op.sym_operator())
 
-        u = bind(discr, u_analytic(sym.nodes(dim)))(queue, t=0)
+        u = bind(discr, u_analytic(sym.nodes(dim)))(actx, t=0)
 
         def rhs(t, u):
-            return bound_op(queue, t=t, u=u)
+            return bound_op(t=t, u=u)
 
         if dim == 3:
             final_time = 0.1
         else:
             final_time = 0.2
 
-        h_max = bind(discr, sym.h_max_from_volume(discr.ambient_dim))(queue)
+        h_max = bind(discr, sym.h_max_from_volume(discr.ambient_dim))(actx)
         dt = dt_factor * h_max/order**2
         nsteps = (final_time // dt) + 1
         dt = final_time/nsteps + 1e-15
@@ -364,7 +369,7 @@ def test_convergence_advec(ctx_factory, mesh_name, mesh_pars, op_type, flux_type
 
         error_l2 = bind(discr,
             sym.norm(2, sym.var("u")-u_analytic(sym.nodes(dim))))(
-                queue, t=last_t, u=last_u)
+                t=last_t, u=last_u)
         logger.info("h_max %.5e error %.5e", h_max, error_l2)
         eoc_rec.add_data_point(h_max, error_l2)
 
@@ -381,6 +386,7 @@ def test_convergence_maxwell(ctx_factory,  order):
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from pytools.convergence import EOCRecorder
     eoc_rec = EOCRecorder()
@@ -394,7 +400,7 @@ def test_convergence_maxwell(ctx_factory,  order):
                 b=(1.0,)*dims,
                 n=(n,)*dims)
 
-        discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+        discr = DGDiscretizationWithBoundaries(actx, mesh, order=order)
 
         epsilon = 1
         mu = 1
@@ -403,7 +409,7 @@ def test_convergence_maxwell(ctx_factory,  order):
         sym_mode = get_rectangular_cavity_mode(1, (1, 2, 2))
 
         analytic_sol = bind(discr, sym_mode)
-        fields = analytic_sol(queue, t=0, epsilon=epsilon, mu=mu)
+        fields = analytic_sol(actx, t=0, epsilon=epsilon, mu=mu)
 
         from grudge.models.em import MaxwellOperator
         op = MaxwellOperator(epsilon, mu, flux_type=0.5, dimensions=dims)
@@ -411,7 +417,7 @@ def test_convergence_maxwell(ctx_factory,  order):
         bound_op = bind(discr, op.sym_operator())
 
         def rhs(t, w):
-            return bound_op(queue, t=t, w=w)
+            return bound_op(t=t, w=w)
 
         dt = 0.002
         final_t = dt * 5
@@ -433,8 +439,8 @@ def test_convergence_maxwell(ctx_factory,  order):
                 step += 1
                 logger.debug("[%04d] t = %.5e", step, event.t)
 
-        sol = analytic_sol(queue, mu=mu, epsilon=epsilon, t=step * dt)
-        vals = [norm(queue, u=(esc[i] - sol[i])) / norm(queue, u=sol[i]) for i in range(5)] # noqa E501
+        sol = analytic_sol(actx, mu=mu, epsilon=epsilon, t=step * dt)
+        vals = [norm(u=(esc[i] - sol[i])) / norm(u=sol[i]) for i in range(5)] # noqa E501
         total_error = sum(vals)
         eoc_rec.add_data_point(1.0/n, total_error)
 
@@ -455,10 +461,11 @@ def test_improvement_quadrature(ctx_factory, order):
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
     sym_nds = sym.nodes(dims)
-    advec_v = join_fields(-1*sym_nds[1], sym_nds[0])
+    advec_v = flat_obj_array(-1*sym_nds[1], sym_nds[0])
 
     flux = "upwind"
     op = VariableCoefficientAdvectionOperator(advec_v, 0, flux_type=flux)
@@ -489,15 +496,15 @@ def test_improvement_quadrature(ctx_factory, order):
             else:
                 quad_tag_to_group_factory = {"product": None}
 
-            discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order,
+            discr = DGDiscretizationWithBoundaries(actx, mesh, order=order,
                     quad_tag_to_group_factory=quad_tag_to_group_factory)
 
             bound_op = bind(discr, op.sym_operator())
-            fields = bind(discr, gaussian_mode())(queue, t=0)
+            fields = bind(discr, gaussian_mode())(actx, t=0)
             norm = bind(discr, sym.norm(2, sym.var("u")))
 
-            esc = bound_op(queue, u=fields)
-            total_error = norm(queue, u=esc)
+            esc = bound_op(u=fields)
+            total_error = norm(u=esc)
             eoc_rec.add_data_point(1.0/n, total_error)
 
         logger.info("\n%s", eoc_rec.pretty_print(
@@ -514,22 +521,6 @@ def test_improvement_quadrature(ctx_factory, order):
     assert q_eoc > order
 
 
-def test_foreign_points(ctx_factory):
-    pytest.importorskip("sumpy")
-    import sumpy.point_calculus as pc
-
-    cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
-
-    dim = 2
-    cp = pc.CalculusPatch(np.zeros(dim))
-
-    from grudge.discretization import PointsDiscretization
-    pdiscr = PointsDiscretization(cl.array.to_device(queue, cp.points))
-
-    bind(pdiscr, sym.nodes(dim)**2)(queue)
-
-
 def test_op_collector_order_determinism():
     class TestOperator(sym.Operator):
 
@@ -558,6 +549,7 @@ def test_op_collector_order_determinism():
 def test_bessel(ctx_factory):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     dims = 2
 
@@ -567,7 +559,7 @@ def test_bessel(ctx_factory):
             b=(1.0,)*dims,
             n=(8,)*dims)
 
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=3)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=3)
 
     nodes = sym.nodes(dims)
     r = sym.cse(sym.sqrt(nodes[0]**2 + nodes[1]**2))
@@ -579,7 +571,7 @@ def test_bessel(ctx_factory):
             + sym.bessel_j(n-1, r)
             - 2*n/r * sym.bessel_j(n, r))
 
-    z = bind(discr, sym.norm(2, bessel_zero))(queue)
+    z = bind(discr, sym.norm(2, bessel_zero))(actx)
 
     assert z < 1e-15
 
@@ -587,6 +579,7 @@ def test_bessel(ctx_factory):
 def test_external_call(ctx_factory):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     def double(queue, x):
         return 2 * x
@@ -596,7 +589,7 @@ def test_external_call(ctx_factory):
     dims = 2
 
     mesh = generate_regular_rect_mesh(a=(0,) * dims, b=(1,) * dims, n=(4,) * dims)
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=1)
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=1)
 
     ones = sym.Ones(sym.DD_VOLUME)
     op = (
@@ -614,37 +607,41 @@ def test_external_call(ctx_factory):
 
     bound_op = bind(discr, op, function_registry=freg)
 
-    result = bound_op(queue, double=double)
-    assert (result == 5).get().all()
+    result = bound_op(actx, double=double)
+    assert actx.to_numpy(flatten(result) == 5).all()
 
 
 @pytest.mark.parametrize("array_type", ["scalar", "vector"])
 def test_function_symbol_array(ctx_factory, array_type):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import generate_regular_rect_mesh
     dim = 2
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dim, b=(0.5,)*dim,
             n=(8,)*dim, order=4)
-    discr = DGDiscretizationWithBoundaries(ctx, mesh, order=4)
-    nnodes = discr.discr_from_dd(sym.DD_VOLUME).nnodes
+    discr = DGDiscretizationWithBoundaries(actx, mesh, order=4)
+    volume_discr = discr.discr_from_dd(sym.DD_VOLUME)
+    ndofs = sum(grp.ndofs for grp in volume_discr.groups)
 
     import pyopencl.clrandom        # noqa: F401
     if array_type == "scalar":
         sym_x = sym.var("x")
-        x = cl.clrandom.rand(queue, nnodes, dtype=np.float)
+        x = unflatten(actx, volume_discr,
+                cl.clrandom.rand(queue, ndofs, dtype=np.float))
     elif array_type == "vector":
         sym_x = sym.make_sym_array("x", dim)
         x = make_obj_array([
-            cl.clrandom.rand(queue, nnodes, dtype=np.float)
+            unflatten(actx, volume_discr,
+                cl.clrandom.rand(queue, ndofs, dtype=np.float))
             for _ in range(dim)
             ])
     else:
         raise ValueError("unknown array type")
 
-    norm = bind(discr, sym.norm(2, sym_x))(queue, x=x)
+    norm = bind(discr, sym.norm(2, sym_x))(x=x)
     assert isinstance(norm, float)
 
 
diff --git a/test/test_mpi_communication.py b/test/test_mpi_communication.py
index 0b5ae492798b2783564f4afc75ec3051a43c3461..a5109737106279f0a76d147a9d77a4e35d2d6d84 100644
--- a/test/test_mpi_communication.py
+++ b/test/test_mpi_communication.py
@@ -31,6 +31,9 @@ import numpy as np
 import pyopencl as cl
 import logging
 
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import flat_norm
+
 logger = logging.getLogger(__name__)
 logging.basicConfig()
 logger.setLevel(logging.INFO)
@@ -42,6 +45,8 @@ from grudge.shortcuts import set_up_rk4
 def simple_mpi_communication_entrypoint():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
+
     from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
 
     from mpi4py import MPI
@@ -62,12 +67,12 @@ def simple_mpi_communication_entrypoint():
     else:
         local_mesh = mesh_dist.receive_mesh_part()
 
-    vol_discr = DGDiscretizationWithBoundaries(cl_ctx, local_mesh, order=5,
+    vol_discr = DGDiscretizationWithBoundaries(actx, local_mesh, order=5,
             mpi_communicator=comm)
 
     sym_x = sym.nodes(local_mesh.dim)
     myfunc_symb = sym.sin(np.dot(sym_x, [2, 3]))
-    myfunc = bind(vol_discr, myfunc_symb)(queue)
+    myfunc = bind(vol_discr, myfunc_symb)(actx)
 
     sym_all_faces_func = sym.cse(
         sym.project("vol", "all_faces")(sym.var("myfunc")))
@@ -84,9 +89,8 @@ def simple_mpi_communication_entrypoint():
             ) - (sym_all_faces_func - sym_bdry_faces_func)
             )
 
-    hopefully_zero = bound_face_swap(queue, myfunc=myfunc)
-    import numpy.linalg as la
-    error = la.norm(hopefully_zero.get())
+    hopefully_zero = bound_face_swap(myfunc=myfunc)
+    error = flat_norm(hopefully_zero, np.inf)
 
     print(__file__)
     with np.printoptions(threshold=100000000, suppress=True):
@@ -99,6 +103,7 @@ def simple_mpi_communication_entrypoint():
 def mpi_communication_entrypoint():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from mpi4py import MPI
     comm = MPI.COMM_WORLD
@@ -124,7 +129,7 @@ def mpi_communication_entrypoint():
     else:
         local_mesh = mesh_dist.receive_mesh_part()
 
-    vol_discr = DGDiscretizationWithBoundaries(cl_ctx, local_mesh, order=order,
+    vol_discr = DGDiscretizationWithBoundaries(actx, local_mesh, order=order,
                                                mpi_communicator=comm)
 
     source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
@@ -148,9 +153,9 @@ def mpi_communication_entrypoint():
             radiation_tag=BTAG_ALL,
             flux_type="upwind")
 
-    from pytools.obj_array import join_fields
-    fields = join_fields(vol_discr.zeros(queue),
-            [vol_discr.zeros(queue) for i in range(vol_discr.dim)])
+    from pytools.obj_array import flat_obj_array
+    fields = flat_obj_array(vol_discr.zeros(actx),
+            [vol_discr.zeros(actx) for i in range(vol_discr.dim)])
 
     # FIXME
     # dt = op.estimate_rk4_timestep(vol_discr, fields=fields)
@@ -189,7 +194,7 @@ def mpi_communication_entrypoint():
     bound_op = bind(vol_discr, op.sym_operator())
 
     def rhs(t, w):
-        val, rhs.profile_data = bound_op(queue, profile_data=rhs.profile_data,
+        val, rhs.profile_data = bound_op(profile_data=rhs.profile_data,
                                          log_quantities=log_quantities,
                                          t=t, w=w)
         return val
@@ -219,7 +224,7 @@ def mpi_communication_entrypoint():
             step += 1
             logger.debug("[%04d] t = %.5e |u| = %.5e ellapsed %.5e",
                     step, event.t,
-                    norm(queue, u=event.state_component[0]),
+                    norm(u=event.state_component[0]),
                     time() - t_last_step)
 
             # if step % 10 == 0: