diff --git a/doc/array.rst b/doc/array.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a722c4799ace3d3d6f4d10b38333b68404bbbcc0
--- /dev/null
+++ b/doc/array.rst
@@ -0,0 +1,9 @@
+DOF (Degree-of-Freedom) Arrays
+==============================
+
+.. automodule:: meshmode.dof_array
+
+Array Contexts
+==============
+
+.. automodule:: meshmode.array_context
diff --git a/doc/index.rst b/doc/index.rst
index 3a7a2ba71dab133f5f754ef2d213c01f23d97d32..a5e39233edd23cf2ae9fc7a3ec6757d5fffd8fe1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -7,6 +7,7 @@ Contents:
    :maxdepth: 2
 
    mesh
+   array
    discretization
    connection
    misc
diff --git a/examples/plot-connectivity.py b/examples/plot-connectivity.py
index 068983ca81672295af65cdf3c82112478edac1a5..48e6be76b7033c4a3f04f55c6958c7efd856f0e0 100644
--- a/examples/plot-connectivity.py
+++ b/examples/plot-connectivity.py
@@ -2,7 +2,8 @@ from __future__ import division
 
 import numpy as np  # noqa
 import pyopencl as cl
-
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw
 
 order = 4
 
@@ -10,6 +11,7 @@ order = 4
 def main():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.mesh.generation import (  # noqa
             generate_icosphere, generate_icosahedron,
@@ -23,13 +25,13 @@ def main():
             PolynomialWarpAndBlendGroupFactory
 
     discr = Discretization(
-            cl_ctx, mesh, PolynomialWarpAndBlendGroupFactory(order))
+            actx, mesh, PolynomialWarpAndBlendGroupFactory(order))
 
     from meshmode.discretization.visualization import make_visualizer
-    vis = make_visualizer(queue, discr, order)
+    vis = make_visualizer(actx, discr, order)
 
     vis.write_vtk_file("geometry.vtu", [
-        ("f", discr.nodes()[0]),
+        ("f", thaw(actx, discr.nodes()[0])),
         ])
 
     from meshmode.discretization.visualization import \
diff --git a/examples/simple-dg.py b/examples/simple-dg.py
index 8fe6f70600c3230bca4240b3c0808bf853ea650b..b945687758f172194d859560bc1723064d56971b 100644
--- a/examples/simple-dg.py
+++ b/examples/simple-dg.py
@@ -27,14 +27,13 @@ import numpy as np
 import numpy.linalg as la  # noqa
 import pyopencl as cl
 import pyopencl.array as cla  # noqa
-import pyopencl.clmath as clmath
 from pytools import memoize_method, memoize_in
 from pytools.obj_array import (
-        join_fields, make_obj_array,
-        with_object_array_or_scalar,
-        is_obj_array)
-import loopy as lp
+        flat_obj_array, make_obj_array,
+        obj_array_vectorize)
 from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+from meshmode.dof_array import DOFArray, freeze, thaw
+from meshmode.array_context import PyOpenCLArrayContext, make_loopy_program
 
 
 # Features lost vs. https://github.com/inducer/grudge:
@@ -44,42 +43,39 @@ from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
 # - distributed-memory
 
 
-def with_queue(queue, ary):
-    return with_object_array_or_scalar(
-            lambda x: x.with_queue(queue), ary)
-
-
-def without_queue(ary):
-    return with_queue(None, ary)
-
-
 # {{{ discretization
 
-def parametrization_derivative(queue, discr):
+def parametrization_derivative(actx, discr):
+    thawed_nodes = thaw(actx, discr.nodes())
+
     result = np.zeros((discr.ambient_dim, discr.dim), dtype=object)
     for iambient in range(discr.ambient_dim):
         for idim in range(discr.dim):
             result[iambient, idim] = discr.num_reference_derivative(
-                    queue, (idim,), discr.nodes()[iambient])
+                    (idim,), thawed_nodes[iambient])
 
     return result
 
 
 class DGDiscretization:
-    def __init__(self, cl_ctx, mesh, order):
+    def __init__(self, actx, mesh, order):
         self.order = order
 
         from meshmode.discretization import Discretization
         from meshmode.discretization.poly_element import \
                 PolynomialWarpAndBlendGroupFactory
         self.group_factory = PolynomialWarpAndBlendGroupFactory(order=order)
-        self.volume_discr = Discretization(cl_ctx, mesh, self.group_factory)
+        self.volume_discr = Discretization(actx, mesh, self.group_factory)
 
         assert self.volume_discr.dim == 2
 
     @property
-    def cl_context(self):
-        return self.volume_discr.cl_context
+    def _setup_actx(self):
+        return self.volume_discr._setup_actx
+
+    @property
+    def array_context(self):
+        return self.volume_discr.array_context
 
     @property
     def dim(self):
@@ -91,6 +87,7 @@ class DGDiscretization:
     def boundary_connection(self, boundary_tag):
         from meshmode.discretization.connection import make_face_restriction
         return make_face_restriction(
+                        self.volume_discr._setup_actx,
                         self.volume_discr,
                         self.group_factory,
                         boundary_tag=boundary_tag)
@@ -100,6 +97,7 @@ class DGDiscretization:
         from meshmode.discretization.connection import (
                 make_face_restriction, FACE_RESTR_INTERIOR)
         return make_face_restriction(
+                        self.volume_discr._setup_actx,
                         self.volume_discr,
                         self.group_factory,
                         FACE_RESTR_INTERIOR,
@@ -110,13 +108,15 @@ class DGDiscretization:
         from meshmode.discretization.connection import \
                 make_opposite_face_connection
 
-        return make_opposite_face_connection(self.interior_faces_connection())
+        return make_opposite_face_connection(
+                self._setup_actx, self.interior_faces_connection())
 
     @memoize_method
     def all_faces_connection(self):
         from meshmode.discretization.connection import (
                 make_face_restriction, FACE_RESTR_ALL)
         return make_face_restriction(
+                        self.volume_discr._setup_actx,
                         self.volume_discr,
                         self.group_factory,
                         FACE_RESTR_ALL,
@@ -129,7 +129,7 @@ class DGDiscretization:
 
         faces_conn = self.get_connection("vol", where)
         return make_face_to_all_faces_embedding(
-                faces_conn, self.get_discr("all_faces"))
+                self._setup_actx, faces_conn, self.get_discr("all_faces"))
 
     def get_connection(self, src, tgt):
         src_tgt = (src, tgt)
@@ -148,11 +148,13 @@ class DGDiscretization:
             raise ValueError(f"locations '{src}'->'{tgt}' not understood")
 
     def interp(self, src, tgt, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
                     lambda el: self.interp(src, tgt, el), vec)
 
-        return self.get_connection(src, tgt)(vec.queue, vec)
+        return self.get_connection(src, tgt)(vec)
 
     def get_discr(self, where):
         if where == "vol":
@@ -170,40 +172,35 @@ class DGDiscretization:
 
     @memoize_method
     def parametrization_derivative(self):
-        with cl.CommandQueue(self.cl_context) as queue:
-            return without_queue(
-                    parametrization_derivative(queue, self.volume_discr))
+        return freeze(
+                parametrization_derivative(self._setup_actx, self.volume_discr))
 
     @memoize_method
     def vol_jacobian(self):
-        with cl.CommandQueue(self.cl_context) as queue:
-            [a, b], [c, d] = with_queue(queue, self.parametrization_derivative())
-            return (a*d-b*c).with_queue(None)
+        [a, b], [c, d] = thaw(self._setup_actx, self.parametrization_derivative())
+        return freeze(a*d-b*c)
 
     @memoize_method
     def inverse_parametrization_derivative(self):
-        with cl.CommandQueue(self.cl_context) as queue:
-            [a, b], [c, d] = with_queue(queue, self.parametrization_derivative())
+        [a, b], [c, d] = thaw(self._setup_actx, self.parametrization_derivative())
 
-            result = np.zeros((2, 2), dtype=object)
-            det = a*d-b*c
-            result[0, 0] = d/det
-            result[0, 1] = -b/det
-            result[1, 0] = -c/det
-            result[1, 1] = a/det
+        result = np.zeros((2, 2), dtype=object)
+        det = a*d-b*c
+        result[0, 0] = d/det
+        result[0, 1] = -b/det
+        result[1, 0] = -c/det
+        result[1, 1] = a/det
 
-            return without_queue(result)
+        return freeze(result)
 
-    def zeros(self, queue):
-        return self.volume_discr.zeros(queue)
+    def zeros(self, actx):
+        return self.volume_discr.zeros(actx)
 
     def grad(self, vec):
         ipder = self.inverse_parametrization_derivative()
 
-        queue = vec.queue
         dref = [
-                self.volume_discr.num_reference_derivative(
-                    queue, (idim,), vec).with_queue(queue)
+                self.volume_discr.num_reference_derivative((idim,), vec)
                 for idim in range(self.volume_discr.dim)]
 
         return make_obj_array([
@@ -218,22 +215,18 @@ class DGDiscretization:
     def normal(self, where):
         bdry_discr = self.get_discr(where)
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            ((a,), (b,)) = with_queue(
-                    queue, parametrization_derivative(queue, bdry_discr))
+        ((a,), (b,)) = parametrization_derivative(self._setup_actx, bdry_discr)
 
-            nrm = 1/(a**2+b**2)**0.5
-            return without_queue(join_fields(b*nrm, -a*nrm))
+        nrm = 1/(a**2+b**2)**0.5
+        return freeze(flat_obj_array(b*nrm, -a*nrm))
 
     @memoize_method
     def face_jacobian(self, where):
         bdry_discr = self.get_discr(where)
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            ((a,), (b,)) = with_queue(queue,
-                    parametrization_derivative(queue, bdry_discr))
+        ((a,), (b,)) = parametrization_derivative(self._setup_actx, bdry_discr)
 
-            return ((a**2 + b**2)**0.5).with_queue(None)
+        return freeze((a**2 + b**2)**0.5)
 
     @memoize_method
     def get_inverse_mass_matrix(self, grp, dtype):
@@ -242,37 +235,36 @@ class DGDiscretization:
                 grp.basis(),
                 grp.unit_nodes)
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            return (cla.to_device(queue, matrix)
-                    .with_queue(None))
+        actx = self._setup_actx
+        return actx.freeze(actx.from_numpy(matrix))
 
     def inverse_mass(self, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
                     lambda el: self.inverse_mass(el), vec)
 
         @memoize_in(self, "elwise_linear_knl")
         def knl():
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i<ndiscr_nodes_out and
+            return make_loopy_program(
+                """{[iel,idof,j]:
+                    0<=iel<nelements and
+                    0<=idof<ndiscr_nodes_out and
                     0<=j<ndiscr_nodes_in}""",
-                "result[k,i] = sum(j, mat[i, j] * vec[k, j])",
-                default_offset=lp.auto, name="diff")
-
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+                "result[iel,idof] = sum(j, mat[idof, j] * vec[iel, j])",
+                name="diff")
 
         discr = self.volume_discr
 
-        result = discr.empty(queue=vec.queue, dtype=vec.dtype)
+        result = discr.empty_like(vec)
 
         for grp in discr.groups:
-            matrix = self.get_inverse_mass_matrix(grp, vec.dtype)
+            matrix = self.get_inverse_mass_matrix(grp, vec.entry_dtype)
 
-            knl()(vec.queue, mat=matrix, result=grp.view(result),
-                    vec=grp.view(vec))
+            vec.array_context.call_loopy(
+                    knl(),
+                    mat=matrix, result=result[grp.index], vec=vec[grp.index])
 
         return result/self.vol_jacobian()
 
@@ -282,9 +274,9 @@ class DGDiscretization:
         assert afgrp.nelements == nfaces * volgrp.nelements
 
         matrix = np.empty(
-                (volgrp.nunit_nodes,
+                (volgrp.nunit_dofs,
                     nfaces,
-                    afgrp.nunit_nodes),
+                    afgrp.nunit_dofs),
                 dtype=dtype)
 
         from modepy.tools import UNIT_VERTICES
@@ -297,34 +289,33 @@ class DGDiscretization:
                     volgrp.order,
                     face_vertices)
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            return (cla.to_device(queue, matrix)
-                    .with_queue(None))
+        actx = self._setup_actx
+        return actx.freeze(actx.from_numpy(matrix))
 
     def face_mass(self, vec):
-        if is_obj_array(vec):
-            return with_object_array_or_scalar(
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            return obj_array_vectorize(
                     lambda el: self.face_mass(el), vec)
 
         @memoize_in(self, "face_mass_knl")
         def knl():
-            knl = lp.make_kernel(
-                """{[k,i,f,j]:
-                    0<=k<nelements and
+            return make_loopy_program(
+                """{[iel,idof,f,j]:
+                    0<=iel<nelements and
                     0<=f<nfaces and
-                    0<=i<nvol_nodes and
+                    0<=idof<nvol_nodes and
                     0<=j<nface_nodes}""",
-                "result[k,i] = sum(f, sum(j, mat[i, f, j] * vec[f, k, j]))",
-                default_offset=lp.auto, name="face_mass")
-
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+                "result[iel,idof] = "
+                "sum(f, sum(j, mat[idof, f, j] * vec[f, iel, j]))",
+                name="face_mass")
 
         all_faces_conn = self.get_connection("vol", "all_faces")
         all_faces_discr = all_faces_conn.to_discr
         vol_discr = all_faces_conn.from_discr
 
-        result = vol_discr.empty(queue=vec.queue, dtype=vec.dtype)
+        result = vol_discr.empty_like(vec)
 
         fj = self.face_jacobian("all_faces")
         vec = vec*fj
@@ -334,12 +325,12 @@ class DGDiscretization:
         for afgrp, volgrp in zip(all_faces_discr.groups, vol_discr.groups):
             nfaces = volgrp.mesh_el_group.nfaces
 
-            matrix = self.get_local_face_mass_matrix(afgrp, volgrp, vec.dtype)
+            matrix = self.get_local_face_mass_matrix(afgrp, volgrp, vec.entry_dtype)
 
-            input_view = afgrp.view(vec).reshape(
-                    nfaces, volgrp.nelements, afgrp.nunit_nodes)
-            knl()(vec.queue, mat=matrix, result=volgrp.view(result),
-                    vec=input_view)
+            vec.array_context.call_loopy(knl(),
+                    mat=matrix, result=result[volgrp.index],
+                    vec=vec[afgrp.index].reshape(
+                        nfaces, volgrp.nelements, afgrp.nunit_dofs))
 
         return result
 
@@ -379,8 +370,8 @@ class TracePair:
 
 def interior_trace_pair(discr, vec):
     i = discr.interp("vol", "int_faces", vec)
-    e = with_object_array_or_scalar(
-            lambda el: discr.opposite_face_connection()(el.queue, el),
+    e = obj_array_vectorize(
+            lambda el: discr.opposite_face_connection()(el),
             i)
     return TracePair("int_faces", i, e)
 
@@ -389,26 +380,26 @@ def interior_trace_pair(discr, vec):
 
 # {{{ wave equation bits
 
-def wave_flux(discr, c, w_tpair):
+def wave_flux(actx, discr, c, w_tpair):
     u = w_tpair[0]
     v = w_tpair[1:]
 
-    normal = with_queue(u.int.queue, discr.normal(w_tpair.where))
+    normal = thaw(actx, discr.normal(w_tpair.where))
 
-    flux_weak = join_fields(
+    flux_weak = flat_obj_array(
             np.dot(v.avg, normal),
             normal[0] * u.avg,
             normal[1] * u.avg)
 
     # upwind
     v_jump = np.dot(normal, v.int-v.ext)
-    flux_weak -= join_fields(
+    flux_weak -= flat_obj_array(
             0.5*(u.int-u.ext),
             0.5*normal[0]*v_jump,
             0.5*normal[1]*v_jump,
             )
 
-    flux_strong = join_fields(
+    flux_strong = flat_obj_array(
             np.dot(v.int, normal),
             u.int * normal[0],
             u.int * normal[1],
@@ -417,26 +408,27 @@ def wave_flux(discr, c, w_tpair):
     return discr.interp(w_tpair.where, "all_faces", c*flux_strong)
 
 
-def wave_operator(discr, c, w):
+def wave_operator(actx, discr, c, w):
     u = w[0]
     v = w[1:]
 
     dir_u = discr.interp("vol", BTAG_ALL, u)
     dir_v = discr.interp("vol", BTAG_ALL, v)
-    dir_bval = join_fields(dir_u, dir_v)
-    dir_bc = join_fields(-dir_u, dir_v)
+    dir_bval = flat_obj_array(dir_u, dir_v)
+    dir_bc = flat_obj_array(-dir_u, dir_v)
 
     return (
-            - join_fields(
+            - flat_obj_array(
                 -c*discr.div(v),
                 -c*discr.grad(u)
                 )
             +  # noqa: W504
             discr.inverse_mass(
                 discr.face_mass(
-                    wave_flux(discr, c=c, w_tpair=interior_trace_pair(discr, w))
-                    + wave_flux(discr, c=c, w_tpair=TracePair(
-                        BTAG_ALL, dir_bval, dir_bc))
+                    wave_flux(actx, discr, c=c,
+                        w_tpair=interior_trace_pair(discr, w))
+                    + wave_flux(actx, discr, c=c,
+                        w_tpair=TracePair(BTAG_ALL, dir_bval, dir_bc))
                     ))
                 )
 
@@ -452,20 +444,20 @@ def rk4_step(y, t, h, f):
     return y + h/6*(k1 + 2*k2 + 2*k3 + k4)
 
 
-def bump(discr, queue, t=0):
+def bump(actx, discr, t=0):
     source_center = np.array([0.0, 0.05])
     source_width = 0.05
     source_omega = 3
 
-    nodes = discr.volume_discr.nodes().with_queue(queue)
-    center_dist = join_fields([
+    nodes = thaw(actx, discr.volume_discr.nodes())
+    center_dist = flat_obj_array([
         nodes[0] - source_center[0],
         nodes[1] - source_center[1],
         ])
 
     return (
         np.cos(source_omega*t)
-        * clmath.exp(
+        * actx.np.exp(
             -np.dot(center_dist, center_dist)
             / source_width**2))
 
@@ -474,6 +466,8 @@ def main():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
+    actx = PyOpenCLArrayContext(queue)
+
     nel_1d = 16
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
@@ -488,18 +482,18 @@ def main():
 
     print("%d elements" % mesh.nelements)
 
-    discr = DGDiscretization(cl_ctx, mesh, order=order)
+    discr = DGDiscretization(actx, mesh, order=order)
 
-    fields = join_fields(
-            bump(discr, queue),
-            [discr.zeros(queue) for i in range(discr.dim)]
+    fields = flat_obj_array(
+            bump(actx, discr),
+            [discr.zeros(actx) for i in range(discr.dim)]
             )
 
     from meshmode.discretization.visualization import make_visualizer
-    vis = make_visualizer(queue, discr.volume_discr, discr.order+3)
+    vis = make_visualizer(actx, discr.volume_discr, discr.order+3)
 
     def rhs(t, w):
-        return wave_operator(discr, c=1, w=w)
+        return wave_operator(actx, discr, c=1, w=w)
 
     t = 0
     t_final = 3
@@ -508,7 +502,10 @@ def main():
         fields = rk4_step(fields, t, dt, rhs)
 
         if istep % 10 == 0:
-            print(istep, t, la.norm(fields[0].get()))
+            # FIXME: Maybe an integral function to go with the
+            # DOFArray would be nice?
+            assert len(fields[0]) == 1
+            print(istep, t, la.norm(actx.to_numpy(fields[0][0])))
             vis.write_vtk_file("fld-wave-min-%04d.vtu" % istep,
                     [
                         ("u", fields[0]),
diff --git a/meshmode/array_context.py b/meshmode/array_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..6118e9d5530c1ee46aff3ff75ed4d0413698420b
--- /dev/null
+++ b/meshmode/array_context.py
@@ -0,0 +1,306 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+from pytools import memoize_method
+from pytools.obj_array import obj_array_vectorized_n_args
+
+__doc__ = """
+.. autofunction:: make_loopy_program
+.. autoclass:: ArrayContext
+.. autoclass:: PyOpenCLArrayContext
+"""
+
+
+def make_loopy_program(domains, statements, kernel_data=["..."],
+        name="mm_actx_kernel"):
+    """Return a :class:`loopy.Program` suitable for use with
+    :meth:`ArrayContext.call_loopy`.
+    """
+    return lp.make_kernel(
+            domains,
+            statements,
+            kernel_data=kernel_data,
+            options=lp.Options(
+                no_numpy=True,
+                return_dict=True),
+            default_offset=lp.auto,
+            name=name,
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+
+# {{{ ArrayContext
+
+class _BaseFakeNumpyNamespace:
+    def __init__(self, array_context):
+        self._array_context = array_context
+
+    def __getattr__(self, name):
+        def f(*args):
+            actx = self._array_context
+            # FIXME: Maybe involve loopy type inference?
+            result = actx.empty(args[0].shape, args[0].dtype)
+            prg = actx._get_scalar_func_loopy_program(
+                    name, nargs=len(args), naxes=len(args[0].shape))
+            actx.call_loopy(prg, out=result,
+                    **{"inp%d" % i: arg for i, arg in enumerate(args)})
+            return result
+
+        return obj_array_vectorized_n_args(f)
+
+
+class ArrayContext:
+    """An interface that allows a :class:`Discretization` to create and interact with
+    arrays of degrees of freedom without fully specifying their types.
+
+    .. automethod:: empty
+    .. automethod:: zeros
+    .. automethod:: empty_like
+    .. automethod:: zeros_like
+    .. automethod:: from_numpy
+    .. automethod:: to_numpy
+    .. automethod:: call_loopy
+    .. attribute:: np
+
+         Provides access to a namespace that serves as a work-alike to
+         :mod:`numpy`.  The actual level of functionality provided is up to the
+         individual array context implementation, however the functions and
+         objects available under this namespace must not behave differently
+         from :mod:`numpy`.
+
+         As a baseline, special functions available through :mod:`loopy`
+         (e.g. ``sin``, ``exp``) are accessible through this interface.
+
+         Callables accessible through this namespace vectorize over object
+         arrays, including :class:`meshmode.dof_array.DOFArray`.
+
+    .. automethod:: freeze
+    .. automethod:: thaw
+
+    .. versionadded:: 2020.2
+    """
+
+    def __init__(self):
+        self.np = self._get_fake_numpy_namespace()
+
+    def _get_fake_numpy_namespace(self):
+        return _BaseFakeNumpyNamespace(self)
+
+    def empty(self, shape, dtype):
+        raise NotImplementedError
+
+    def zeros(self, shape, dtype):
+        raise NotImplementedError
+
+    def empty_like(self, ary):
+        return self.empty(shape=ary.shape, dtype=ary.dtype)
+
+    def zeros_like(self, ary):
+        return self.zeros(shape=ary.shape, dtype=ary.dtype)
+
+    def from_numpy(self, array: np.ndarray):
+        r"""
+        :returns: the :class:`numpy.ndarray` *array* converted to the
+            array context's array type. The returned array will be
+            :meth:`thaw`\ ed.
+        """
+        raise NotImplementedError
+
+    def to_numpy(self, array):
+        r"""
+        :returns: *array*, an array recognized by the context, converted
+            to a :class:`numpy.ndarray`. *array* must be
+            :meth:`thaw`\ ed.
+        """
+        raise NotImplementedError
+
+    def call_loopy(self, program, **kwargs):
+        """Execute the :mod:`loopy` program *program* on the arguments
+        *kwargs*.
+
+        *program* is a :class:`loopy.LoopKernel` or :class:`loopy.Program`.
+        It is expected to not yet be transformed for execution speed.
+        It must have :class:`loopy.Options.return_dict` set.
+
+        :return: a :class:`dict` of outputs from the program, each an
+            array understood by the context.
+        """
+        raise NotImplementedError
+
+    @memoize_method
+    def _get_scalar_func_loopy_program(self, name, nargs, naxes):
+        from pymbolic import var
+
+        var_names = ["i%d" % i for i in range(naxes)]
+        size_names = ["n%d" % i for i in range(naxes)]
+        subscript = tuple(var(vname) for vname in var_names)
+        from islpy import make_zero_and_vars
+        v = make_zero_and_vars(var_names, params=size_names)
+        domain = v[0].domain()
+        for vname, sname in zip(var_names, size_names):
+            domain = domain & v[0].le_set(v[vname]) & v[vname].lt_set(v[sname])
+
+        domain_bset, = domain.get_basic_sets()
+
+        return make_loopy_program(
+                [domain_bset],
+                [
+                    lp.Assignment(
+                        var("out")[subscript],
+                        var(name)(*[
+                            var("inp%d" % i)[subscript] for i in range(nargs)]))
+                    ],
+                name="actx_special_%s" % name)
+
+    def freeze(self, array):
+        """Return a version of the context-defined array *array* that is
+        'frozen', i.e. suitable for long-term storage and reuse. Frozen arrays
+        do not support arithmetic. For example, in the context of
+        :class:`~pyopencl.array.Array`, this might mean stripping the array
+        of an associated command queue, whereas in a lazily-evaluated context,
+        it might mean that the array is evaluated and stored.
+
+        Freezing makes the array independent of this :class:`ArrayContext`;
+        it is permitted to :meth:`thaw` it in a different one, as long as that
+        context understands the array format.
+        """
+        raise NotImplementedError
+
+    def thaw(self, array):
+        """Take a 'frozen' array and return a new array representing the data in
+        *array* that is able to perform arithmetic and other operations, using
+        the execution resources of this context. In the context of
+        :class:`~pyopencl.array.Array`, this might mean that the array is
+        equipped with a command queue, whereas in a lazily-evaluated context,
+        it might mean that the returned array is a symbol bound to
+        the data in *array*.
+
+        The returned array may not be used with other contexts while thawed.
+        """
+        raise NotImplementedError
+
+# }}}
+
+
+# {{{ PyOpenCLArrayContext
+
+class _PyOpenCLFakeNumpyNamespace(_BaseFakeNumpyNamespace):
+    def __getattr__(self, name):
+        if name in ["minimum", "maximum"]:
+            import pyopencl.array as cl_array
+            return obj_array_vectorized_n_args(getattr(cl_array, name))
+
+        return super().__getattr__(name)
+
+
+class PyOpenCLArrayContext(ArrayContext):
+    """
+    A :class:`ArrayContext` that uses :class:`pyopencl.array.Array` instances
+    for DOF arrays.
+
+    .. attribute:: context
+
+        A :class:`pyopencl.Context`.
+
+    .. attribute:: queue
+
+        A :class:`pyopencl.CommandQueue`.
+
+    .. attribute:: allocator
+    """
+
+    def __init__(self, queue, allocator=None):
+        super().__init__()
+        self.context = queue.context
+        self.queue = queue
+        self.allocator = allocator
+
+    def _get_fake_numpy_namespace(self):
+        return _PyOpenCLFakeNumpyNamespace(self)
+
+    # {{{ ArrayContext interface
+
+    def empty(self, shape, dtype):
+        import pyopencl.array as cla
+        return cla.empty(self.queue, shape=shape, dtype=dtype,
+                allocator=self.allocator)
+
+    def zeros(self, shape, dtype):
+        import pyopencl.array as cla
+        return cla.zeros(self.queue, shape=shape, dtype=dtype,
+                allocator=self.allocator)
+
+    def from_numpy(self, np_array: np.ndarray):
+        import pyopencl.array as cla
+        return cla.to_device(self.queue, np_array, allocator=self.allocator)
+
+    def to_numpy(self, array):
+        return array.get(queue=self.queue)
+
+    def call_loopy(self, program, **kwargs):
+        program = self.transform_loopy_program(program)
+        assert program.options.return_dict
+        assert program.options.no_numpy
+
+        evt, result = program(self.queue, **kwargs, allocator=self.allocator)
+        return result
+
+    def freeze(self, array):
+        array.finish()
+        return array.with_queue(None)
+
+    def thaw(self, array):
+        return array.with_queue(self.queue)
+
+    # }}}
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+        # FIXME: This could be much smarter.
+        import loopy as lp
+        all_inames = program.all_inames()
+
+        inner_iname = None
+        if "iel" not in all_inames and "i0" in all_inames:
+            outer_iname = "i0"
+
+            if "i1" in all_inames:
+                inner_iname = "i1"
+        else:
+            outer_iname = "iel"
+
+            if "idof" in all_inames:
+                inner_iname = "idof"
+
+        if inner_iname is not None:
+            program = lp.split_iname(program, inner_iname, 16, inner_tag="l.0")
+        return lp.tag_inames(program, {outer_iname: "g.0"})
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/meshmode/discretization/__init__.py b/meshmode/discretization/__init__.py
index a3847f2c3ab6c262c390f3c805619d74c85ab401..d19e5b92e1f58bf21416e09e8e23b734f24db206 100644
--- a/meshmode/discretization/__init__.py
+++ b/meshmode/discretization/__init__.py
@@ -1,6 +1,4 @@
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+__copyright__ = "Copyright (C) 2013-2020 Andreas Kloeckner"
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -23,12 +21,13 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import loopy as lp
-import pyopencl as cl
-import pyopencl.array  # noqa
 
-from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-from pytools import memoize_method, memoize_in
+from pytools import memoize_in, memoize_method
+from pytools.obj_array import make_obj_array
+from meshmode.array_context import ArrayContext, make_loopy_program
+
+# underscored because it shouldn't be imported from here.
+from meshmode.dof_array import DOFArray as _DOFArray
 
 __doc__ = """
 .. autoclass:: ElementGroupBase
@@ -50,22 +49,21 @@ class ElementGroupBase(object):
 
     .. attribute :: mesh_el_group
     .. attribute :: order
-    .. attribute :: node_nr_base
+    .. attribute :: index
 
     .. autoattribute:: nelements
-    .. autoattribute:: nunit_nodes
-    .. autoattribute:: nnodes
+    .. autoattribute:: nunit_dofs
+    .. autoattribute:: ndofs
     .. autoattribute:: dim
-    .. automethod:: view
 
     .. method:: unit_nodes()
 
-        Returns a :class:`numpy.ndarray` of shape ``(dim, nunit_nodes)``
+        Returns a :class:`numpy.ndarray` of shape ``(dim, nunit_dofs)``
         of reference coordinates of interpolation nodes.
 
     .. method:: weights()
 
-        Returns an array of length :attr:`nunit_nodes` containing
+        Returns an array of length :attr:`nunit_dofs` containing
         quadrature weights.
 
     .. attribute:: is_affine
@@ -75,14 +73,14 @@ class ElementGroupBase(object):
         :attr:`meshmode.mesh.MeshElementGroup.is_affine`.
     """
 
-    def __init__(self, mesh_el_group, order, node_nr_base):
+    def __init__(self, mesh_el_group, order, index):
         """
         :arg mesh_el_group: an instance of
             :class:`meshmode.mesh.MeshElementGroup`
         """
         self.mesh_el_group = mesh_el_group
         self.order = order
-        self.node_nr_base = node_nr_base
+        self.index = index
 
     @property
     def is_affine(self):
@@ -93,12 +91,18 @@ class ElementGroupBase(object):
         return self.mesh_el_group.nelements
 
     @property
-    def nunit_nodes(self):
+    def nunit_dofs(self):
+        """The number of (for now: nodal) degrees of freedom ("DOFs") associated with a
+        single element.
+        """
         return self.unit_nodes.shape[-1]
 
     @property
-    def nnodes(self):
-        return self.nunit_nodes * self.nelements
+    def ndofs(self):
+        """The total number of (for now: nodal) degrees of freedom ("DOFs") associated with
+        the element group.
+        """
+        return self.nunit_dofs * self.nelements
 
     @property
     def dim(self):
@@ -113,27 +117,6 @@ class ElementGroupBase(object):
     grad_basis = basis
     diff_matrices = basis
 
-    def _nodes(self):
-        # Not cached, because the global nodes array is what counts.
-        # This is just used to build that.
-
-        return np.tensordot(
-                self.mesh_el_group.nodes,
-                self._from_mesh_interp_matrix(),
-                (-1, -1))
-
-    def view(self, global_array):
-        """Return a view of *global_array* of shape ``(..., nelements,
-        nunit_nodes)`` where *global_array* is of shape ``(..., nnodes)``,
-        where *nnodes* is the global (per-discretization) node count.
-        """
-
-        return global_array[
-                ..., self.node_nr_base:self.node_nr_base + self.nnodes] \
-                .reshape(
-                        global_array.shape[:-1]
-                        + (self.nelements, self.nunit_nodes))
-
 # }}}
 
 
@@ -195,13 +178,6 @@ class OrderBasedGroupFactory(ElementGroupFactory):
 class Discretization(object):
     """An unstructured composite discretization.
 
-    :param cl_ctx: A :class:`pyopencl.Context`
-    :param mesh: A :class:`meshmode.mesh.Mesh` over which the discretization is
-        built
-    :param group_factory: An :class:`ElementGroupFactory`
-    :param real_dtype: The :mod:`numpy` data type used for representing real
-        data, either :class:`numpy.float32` or :class:`numpy.float64`
-
     .. attribute:: real_dtype
 
     .. attribute:: complex_dtype
@@ -212,35 +188,44 @@ class Discretization(object):
 
     .. attribute:: ambient_dim
 
-    .. attribute :: nnodes
+    .. attribute:: ndofs
 
     .. attribute :: groups
 
     .. automethod:: empty
-
     .. automethod:: zeros
+    .. automethod:: empty_like
+    .. automethod:: zeros_like
 
-    .. method:: nodes()
-
-        shape: ``(ambient_dim, nnodes)``
+    .. automethod:: nodes()
 
-    .. method:: num_reference_derivative(queue, ref_axes, vec)
+    .. automethod:: num_reference_derivative
 
-    .. method:: quad_weights(queue)
-
-        shape: ``(nnodes)``
+    .. automethod:: quad_weights
     """
 
-    def __init__(self, cl_ctx, mesh, group_factory, real_dtype=np.float64):
-        self.cl_context = cl_ctx
+    def __init__(self, actx: ArrayContext, mesh, group_factory,
+            real_dtype=np.float64):
+        """
+        :param actx: A :class:`ArrayContext` used to perform computation needed
+            during initial set-up of the mesh.
+        :param mesh: A :class:`meshmode.mesh.Mesh` over which the discretization is
+            built.
+        :param group_factory: An :class:`ElementGroupFactory`.
+        :param real_dtype: The :mod:`numpy` data type used for representing real
+            data, either :class:`numpy.float32` or :class:`numpy.float64`.
+        """
+
+        if not isinstance(actx, ArrayContext):
+            raise TypeError("'actx' must be an ArrayContext")
 
         self.mesh = mesh
-        self.nnodes = 0
-        self.groups = []
+        groups = []
         for mg in mesh.groups:
-            ng = group_factory(mg, self.nnodes)
-            self.groups.append(ng)
-            self.nnodes += ng.nnodes
+            ng = group_factory(mg, len(groups))
+            groups.append(ng)
+
+        self.groups = groups
 
         self.real_dtype = np.dtype(real_dtype)
         self.complex_dtype = np.dtype({
@@ -248,6 +233,8 @@ class Discretization(object):
                 np.float64: np.complex128
                 }[self.real_dtype.type])
 
+        self._setup_actx = actx
+
     @property
     def dim(self):
         return self.mesh.dim
@@ -256,13 +243,11 @@ class Discretization(object):
     def ambient_dim(self):
         return self.mesh.ambient_dim
 
-    def empty(self, queue=None, dtype=None, extra_dims=None, allocator=None):
-        """Return an empty DOF vector.
+    @property
+    def ndofs(self):
+        return sum(grp.ndofs for grp in self.groups)
 
-        :arg dtype: type special value 'c' will result in a
-            vector of dtype :attr:`self.complex_dtype`. If
-            *None* (the default), a real vector will be returned.
-        """
+    def _new_array(self, actx, creation_func, dtype=None):
         if dtype is None:
             dtype = self.real_dtype
         elif dtype == "c":
@@ -270,45 +255,55 @@ class Discretization(object):
         else:
             dtype = np.dtype(dtype)
 
-        if queue is None:
-            first_arg = self.cl_context
-        else:
-            first_arg = queue
+        return _DOFArray.from_list(actx, [
+            creation_func(shape=(grp.nelements, grp.nunit_dofs), dtype=dtype)
+            for grp in self.groups])
 
-        shape = (self.nnodes,)
-        if extra_dims is not None:
-            shape = extra_dims + shape
+    def empty(self, actx: ArrayContext, dtype=None):
+        """Return an empty :class:`~meshmode.dof_array.DOFArray`.
 
-        return cl.array.empty(first_arg, shape, dtype=dtype, allocator=allocator)
+        :arg dtype: type special value 'c' will result in a
+            vector of dtype :attr:`self.complex_dtype`. If
+            *None* (the default), a real vector will be returned.
+        """
+        if not isinstance(actx, ArrayContext):
+            raise TypeError("'actx' must be an ArrayContext, not '%s'"
+                    % type(actx).__name__)
+
+        return self._new_array(actx, actx.empty, dtype=dtype)
+
+    def zeros(self, actx: ArrayContext, dtype=None):
+        """Return a zero-initialized :class:`~meshmode.dof_array.DOFArray`.
 
-    def zeros(self, queue, dtype=None, extra_dims=None, allocator=None):
-        return self.empty(queue, dtype=dtype, extra_dims=extra_dims,
-                allocator=allocator).fill(0)
+        :arg dtype: type special value 'c' will result in a
+            vector of dtype :attr:`self.complex_dtype`. If
+            *None* (the default), a real vector will be returned.
+        """
+        if not isinstance(actx, ArrayContext):
+            raise TypeError("'actx' must be an ArrayContext, not '%s'"
+                    % type(actx).__name__)
 
-    def num_reference_derivative(self, queue, ref_axes, vec):
-        @memoize_in(self, "reference_derivative_knl")
-        def knl():
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i,j<ndiscr_nodes}""",
-                "result[k,i] = sum(j, diff_mat[i, j] * vec[k, j])",
-                default_offset=lp.auto, name="diff",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+        return self._new_array(actx, actx.zeros, dtype=dtype)
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+    def empty_like(self, array: _DOFArray):
+        return self.empty(array.array_context, dtype=array.entry_dtype)
 
-        result = self.empty(dtype=vec.dtype)
+    def zeros_like(self, array: _DOFArray):
+        return self.zeros(array.array_context, dtype=array.entry_dtype)
 
-        # Check that we don't differentiate vectors belonging to other
-        # discretizations.
-        assert vec.shape == (self.nnodes,)
+    def num_reference_derivative(self, ref_axes, vec):
+        actx = vec.array_context
 
-        for grp in self.groups:
-            if grp.nelements == 0:
-                continue
+        @memoize_in(actx, (Discretization, "reference_derivative_prg"))
+        def prg():
+            return make_loopy_program(
+                """{[iel,idof,j]:
+                    0<=iel<nelements and
+                    0<=idof,j<nunit_dofs}""",
+                "result[iel,idof] = sum(j, diff_mat[idof, j] * vec[iel, j])",
+                name="diff")
 
+        def get_mat(grp):
             mat = None
             for ref_axis in ref_axes:
                 next_mat = grp.diff_matrices()[ref_axis]
@@ -317,68 +312,68 @@ class Discretization(object):
                 else:
                     mat = np.dot(next_mat, mat)
 
-            knl()(queue, diff_mat=mat, result=grp.view(result), vec=grp.view(vec))
-
-        return result
-
-    def quad_weights(self, queue):
-        @memoize_in(self, "quad_weights_knl")
-        def knl():
-            knl = lp.make_kernel(
-                "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
-                "result[k,i] = weights[i]",
-                name="quad_weights",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            return mat
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+        return _DOFArray.from_list(actx, [
+                actx.call_loopy(
+                    prg(), diff_mat=actx.from_numpy(get_mat(grp)), vec=vec[grp.index]
+                    )["result"]
+                for grp in self.groups])
 
-        result = self.empty(dtype=self.real_dtype)
-        for grp in self.groups:
-            if grp.nelements == 0:
-                continue
-
-            knl()(queue, result=grp.view(result), weights=grp.weights)
-        return result
+    @memoize_method
+    def quad_weights(self):
+        """:returns: A :class:`~meshmode.dof_array.DOFArray` with quadrature weights.
+        """
+        actx = self._setup_actx
+
+        @memoize_in(actx, (Discretization, "quad_weights_prg"))
+        def prg():
+            return make_loopy_program(
+                "{[iel,idof]: 0<=iel<nelements and 0<=idof<nunit_dofs}",
+                "result[iel,idof] = weights[idof]",
+                name="quad_weights")
+
+        return _DOFArray.from_list(None, [
+                actx.freeze(
+                    actx.call_loopy(
+                        prg(),
+                        weights=actx.from_numpy(grp.weights),
+                        nelements=grp.nelements,
+                        )["result"])
+                for grp in self.groups])
 
     @memoize_method
     def nodes(self):
-        @memoize_in(self, "nodes_knl")
-        def knl():
-            knl = lp.make_kernel(
-                """{[d,k,i,j]:
-                    0<=d<dims and
-                    0<=k<nelements and
-                    0<=i<ndiscr_nodes and
+        r"""
+        :returns: object array of shape ``(ambient_dim,)`` containing
+            :class:`~meshmode.dof_array.DOFArray`\ s of node coordinates.
+        """
+
+        actx = self._setup_actx
+
+        @memoize_in(actx, (Discretization, "nodes_prg"))
+        def prg():
+            return make_loopy_program(
+                """{[iel,idof,j]:
+                    0<=iel<nelements and
+                    0<=idof<ndiscr_nodes and
                     0<=j<nmesh_nodes}""",
                 """
-                    result[d, k, i] = \
-                        sum(j, resampling_mat[i, j] * nodes[d, k, j])
+                    result[iel, idof] = \
+                        sum(j, resampling_mat[idof, j] * nodes[iel, j])
                     """,
-                name="nodes",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            knl = lp.tag_inames(knl, dict(k="g.0"))
-            knl = lp.tag_array_axes(knl, "result",
-                    "stride:auto,stride:auto,stride:auto")
-            return knl
-
-        result = self.empty(dtype=self.real_dtype, extra_dims=(self.ambient_dim,))
-
-        with cl.CommandQueue(self.cl_context) as queue:
-            for grp in self.groups:
-                if grp.nelements == 0:
-                    continue
-
-                meg = grp.mesh_el_group
-                knl()(queue,
-                        resampling_mat=grp.from_mesh_interp_matrix(),
-                        result=grp.view(result), nodes=meg.nodes)
-
-        return result
-
+                name="nodes")
+
+        return make_obj_array([
+            _DOFArray.from_list(None, [
+                actx.freeze(
+                    actx.call_loopy(
+                        prg(),
+                        resampling_mat=actx.from_numpy(
+                            grp.from_mesh_interp_matrix()),
+                        nodes=actx.from_numpy(grp.mesh_el_group.nodes[iaxis])
+                        )["result"])
+                for grp in self.groups])
+            for iaxis in range(self.ambient_dim)])
 
 # vim: fdm=marker
diff --git a/meshmode/discretization/connection/__init__.py b/meshmode/discretization/connection/__init__.py
index 5942c7e2e1c7fc04dacd0a5309faac621bc27e9e..cf812738bbce21fa7c508399d9a4cdc9cc4e806b 100644
--- a/meshmode/discretization/connection/__init__.py
+++ b/meshmode/discretization/connection/__init__.py
@@ -25,9 +25,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import pyopencl as cl
-import pyopencl.array  # noqa
-
 from meshmode.discretization.connection.direct import (
         InterpolationBatch,
         DiscretizationConnectionElementGroup,
@@ -38,6 +35,7 @@ from meshmode.discretization.connection.chained import \
 from meshmode.discretization.connection.projection import \
         L2ProjectionInverseDiscretizationConnection
 
+from meshmode.array_context import ArrayContext
 from meshmode.discretization.connection.same_mesh import \
         make_same_mesh_connection
 from meshmode.discretization.connection.face import (
@@ -120,27 +118,26 @@ Implementation details
 
 # {{{ check connection
 
-def check_connection(connection):
+def check_connection(actx: ArrayContext, connection: DirectDiscretizationConnection):
     from_discr = connection.from_discr
     to_discr = connection.to_discr
 
     assert len(connection.groups) == len(to_discr.groups)
 
-    with cl.CommandQueue(to_discr.cl_context) as queue:
-        for cgrp, tgrp in zip(connection.groups, to_discr.groups):
-            for batch in cgrp.batches:
-                fgrp = from_discr.groups[batch.from_group_index]
-
-                from_element_indices = batch.from_element_indices.get(queue)
-                to_element_indices = batch.to_element_indices.get(queue)
+    for cgrp, tgrp in zip(connection.groups, to_discr.groups):
+        for batch in cgrp.batches:
+            fgrp = from_discr.groups[batch.from_group_index]
 
-                assert (0 <= from_element_indices).all()
-                assert (0 <= to_element_indices).all()
-                assert (from_element_indices < fgrp.nelements).all()
-                assert (to_element_indices < tgrp.nelements).all()
-                if batch.to_element_face is not None:
-                    assert 0 <= batch.to_element_face < fgrp.mesh_el_group.nfaces
+            from_element_indices = actx.to_numpy(
+                    actx.thaw(batch.from_element_indices))
+            to_element_indices = actx.to_numpy(actx.thaw(batch.to_element_indices))
 
+            assert (0 <= from_element_indices).all()
+            assert (0 <= to_element_indices).all()
+            assert (from_element_indices < fgrp.nelements).all()
+            assert (to_element_indices < tgrp.nelements).all()
+            if batch.to_element_face is not None:
+                assert 0 <= batch.to_element_face < fgrp.mesh_el_group.nfaces
 
 # }}}
 
diff --git a/meshmode/discretization/connection/chained.py b/meshmode/discretization/connection/chained.py
index bd1da1f4221f079ad081bba4d86c10742969f2bf..53e57e2ea02fbc9395244f10cd07f23752bbbeca 100644
--- a/meshmode/discretization/connection/chained.py
+++ b/meshmode/discretization/connection/chained.py
@@ -23,8 +23,6 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 
 from pytools import Record
 
@@ -65,9 +63,9 @@ class ChainedDiscretizationConnection(DiscretizationConnection):
 
         self.connections = connections
 
-    def __call__(self, queue, vec):
+    def __call__(self, vec):
         for cnx in self.connections:
-            vec = cnx(queue, vec)
+            vec = cnx(vec)
 
         return vec
 
@@ -86,13 +84,13 @@ def _iterbatches(groups):
             yield (igrp, ibatch), (grp, batch)
 
 
-def _build_element_lookup_table(queue, conn):
+def _build_element_lookup_table(actx, conn):
     el_table = [np.full(g.nelements, -1, dtype=np.int)
                 for g in conn.to_discr.groups]
 
     for (igrp, _), (_, batch) in _iterbatches(conn.groups):
-        el_table[igrp][batch.to_element_indices.get(queue)] = \
-                batch.from_element_indices.get(queue)
+        el_table[igrp][actx.to_numpy(batch.to_element_indices)] = \
+                actx.to_numpy(batch.from_element_indices)
 
     return el_table
 
@@ -146,12 +144,12 @@ def _build_new_group_table(from_conn, to_conn):
     return grp_to_grp, batch_info
 
 
-def _build_batches(queue, from_bins, to_bins, batch):
+def _build_batches(actx, from_bins, to_bins, batch):
     from meshmode.discretization.connection.direct import \
             InterpolationBatch
 
     def to_device(x):
-        return cl.array.to_device(queue, np.asarray(x))
+        return actx.freeze(actx.from_numpy(np.asarray(x)))
 
     for ibatch, (from_bin, to_bin) in enumerate(zip(from_bins, to_bins)):
         yield InterpolationBatch(
@@ -162,7 +160,7 @@ def _build_batches(queue, from_bins, to_bins, batch):
                 to_element_face=batch[ibatch].to_element_face)
 
 
-def flatten_chained_connection(queue, connection):
+def flatten_chained_connection(actx, connection):
     """Collapse a connection into a direct connection.
 
     If the given connection is already a
@@ -189,7 +187,7 @@ def flatten_chained_connection(queue, connection):
         If a large number of connections is chained, the number of groups and
         batches can become very large.
 
-    :arg queue: An instance of :class:`pyopencl.CommandQueue`.
+    :arg actx: An instance of :class:`meshmode.array_contex.ArrayContext`.
     :arg connection: An instance of
         :class:`~meshmode.discretization.connection.DiscretizationConnection`.
     :return: An instance of
@@ -204,19 +202,19 @@ def flatten_chained_connection(queue, connection):
         return connection
 
     if not connection.connections:
-        return make_same_mesh_connection(connection.to_discr,
+        return make_same_mesh_connection(actx, connection.to_discr,
                                          connection.from_discr)
 
     # recursively build direct connections
     connections = connection.connections
     direct_connections = []
     for conn in connections:
-        direct_connections.append(flatten_chained_connection(queue, conn))
+        direct_connections.append(flatten_chained_connection(actx, conn))
 
     # merge all the direct connections
     from_conn = direct_connections[0]
     for to_conn in direct_connections[1:]:
-        el_table = _build_element_lookup_table(queue, from_conn)
+        el_table = _build_element_lookup_table(actx, from_conn)
         grp_to_grp, batch_info = _build_new_group_table(from_conn, to_conn)
 
         # distribute the indices to new groups and batches
@@ -224,13 +222,13 @@ def flatten_chained_connection(queue, connection):
         to_bins = [[np.empty(0, dtype=np.int) for _ in g] for g in batch_info]
 
         for (igrp, ibatch), (_, from_batch) in _iterbatches(from_conn.groups):
-            from_to_element_indices = from_batch.to_element_indices.get(queue)
+            from_to_element_indices = actx.to_numpy(from_batch.to_element_indices)
 
             for (jgrp, jbatch), (_, to_batch) in _iterbatches(to_conn.groups):
                 igrp_new, ibatch_new = grp_to_grp[igrp, ibatch, jgrp, jbatch]
 
-                jfrom = to_batch.from_element_indices.get(queue)
-                jto = to_batch.to_element_indices.get(queue)
+                jfrom = actx.to_numpy(to_batch.from_element_indices)
+                jto = actx.to_numpy(to_batch.to_element_indices)
 
                 mask = np.isin(jfrom, from_to_element_indices)
                 from_bins[igrp_new][ibatch_new] = \
@@ -244,7 +242,7 @@ def flatten_chained_connection(queue, connection):
         groups = []
         for igrp, (from_bin, to_bin) in enumerate(zip(from_bins, to_bins)):
             groups.append(DiscretizationConnectionElementGroup(
-                list(_build_batches(queue, from_bin, to_bin,
+                list(_build_batches(actx, from_bin, to_bin,
                                     batch_info[igrp]))))
 
         from_conn = DirectDiscretizationConnection(
@@ -260,7 +258,7 @@ def flatten_chained_connection(queue, connection):
 
 # {{{ build chained resample matrix
 
-def make_full_resample_matrix(queue, connection):
+def make_full_resample_matrix(actx, connection):
     """Build a dense matrix representing the discretization connection.
 
     This is based on
@@ -273,7 +271,7 @@ def make_full_resample_matrix(queue, connection):
         This method will be very slow, both in terms of speed and memory
         usage, and should only be used for testing or if absolutely necessary.
 
-    :arg queue: a :class:`pyopencl.CommandQueue`.
+    :arg actx: a :class:`meshmode.array_context.ArrayContext`.
     :arg connection: a
         :class:`~meshmode.discretization.connection.DiscretizationConnection`.
     :return: a :class:`pyopencl.array.Array` of shape
@@ -281,21 +279,22 @@ def make_full_resample_matrix(queue, connection):
     """
 
     if hasattr(connection, "full_resample_matrix"):
-        return connection.full_resample_matrix(queue)
+        return connection.full_resample_matrix(actx)
 
     if not hasattr(connection, 'connections'):
         raise TypeError('connection is not chained')
 
     if not connection.connections:
         result = np.eye(connection.to_discr.nnodes)
-        return cl.array.to_device(queue, result)
+        return actx.from_numpy(result)
 
-    acc = make_full_resample_matrix(queue, connection.connections[0]).get(queue)
+    acc = actx.to_numpy(
+            make_full_resample_matrix(actx, connection.connections[0]))
     for conn in connection.connections[1:]:
-        resampler = make_full_resample_matrix(queue, conn).get(queue)
-        acc = resampler.dot(acc)
+        resampler = actx.to_numpy(make_full_resample_matrix(actx, conn))
+        acc = resampler @ acc
 
-    return cl.array.to_device(queue, acc)
+    return actx.from_numpy(acc)
 
 # }}}
 
diff --git a/meshmode/discretization/connection/direct.py b/meshmode/discretization/connection/direct.py
index 20f5cdb94023b15e42d713dffd538986d3bc823c..0fb957b8e92584519a665b277da0d14730a8d81f 100644
--- a/meshmode/discretization/connection/direct.py
+++ b/meshmode/discretization/connection/direct.py
@@ -25,11 +25,10 @@ THE SOFTWARE.
 from six.moves import range, zip
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 
-from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-from pytools import memoize_method, memoize_in
+import loopy as lp
+from pytools import memoize_in, keyed_memoize_method
+from meshmode.array_context import ArrayContext, make_loopy_program
 
 
 # {{{ interpolation batch
@@ -134,12 +133,6 @@ class DiscretizationConnection(object):
     .. automethod:: __call__
     """
     def __init__(self, from_discr, to_discr, is_surjective):
-        if from_discr.cl_context != to_discr.cl_context:
-            raise ValueError("from_discr and to_discr must live in the "
-                    "same OpenCL context")
-
-        self.cl_context = from_discr.cl_context
-
         if from_discr.mesh.vertex_id_dtype != to_discr.mesh.vertex_id_dtype:
             raise ValueError("from_discr and to_discr must agree on the "
                     "vertex_id_dtype")
@@ -153,7 +146,7 @@ class DiscretizationConnection(object):
 
         self.is_surjective = is_surjective
 
-    def __call__(self, queue, vec):
+    def __call__(self, ary):
         raise NotImplementedError()
 
 
@@ -188,8 +181,9 @@ class DirectDiscretizationConnection(DiscretizationConnection):
 
         self.groups = groups
 
-    @memoize_method
-    def _resample_matrix(self, to_group_index, ibatch_index):
+    @keyed_memoize_method(key=lambda actx, to_group_index, ibatch_index:
+            (to_group_index, ibatch_index))
+    def _resample_matrix(self, actx: ArrayContext, to_group_index, ibatch_index):
         import modepy as mp
         ibatch = self.groups[to_group_index].batches[ibatch_index]
         from_grp = self.from_discr.groups[ibatch.from_group_index]
@@ -214,11 +208,12 @@ class DirectDiscretizationConnection(DiscretizationConnection):
                     from_grp.basis(),
                     ibatch.result_unit_nodes, from_grp.unit_nodes)
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            return cl.array.to_device(queue, result).with_queue(None)
+        return actx.freeze(actx.from_numpy(result))
 
-    @memoize_method
-    def _resample_point_pick_indices(self, to_group_index, ibatch_index,
+    @keyed_memoize_method(lambda actx, to_group_index, ibatch_index,
+            tol_multiplier=None: (to_group_index, ibatch_index, tol_multiplier))
+    def _resample_point_pick_indices(self, actx: ArrayContext,
+            to_group_index, ibatch_index,
             tol_multiplier=None):
         """If :meth:`_resample_matrix` *R* is a row subset of a permutation matrix *P*,
         return the index subset I so that, loosely, ``x[I] == R @ x``.
@@ -227,9 +222,8 @@ class DirectDiscretizationConnection(DiscretizationConnection):
         :class:`pyopencl.array.Array` containing the index subset.
         """
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            mat = self._resample_matrix(to_group_index, ibatch_index).get(
-                    queue=queue)
+        mat = actx.to_numpy(actx.thaw(
+                self._resample_matrix(actx, to_group_index, ibatch_index)))
 
         nrows, ncols = mat.shape
         result = np.zeros(nrows, dtype=self.to_discr.mesh.element_id_dtype)
@@ -251,30 +245,34 @@ class DirectDiscretizationConnection(DiscretizationConnection):
             one_index, = one_indices
             result[irow] = one_index
 
-        with cl.CommandQueue(self.cl_context) as queue:
-            return cl.array.to_device(queue, result).with_queue(None)
+        return actx.freeze(actx.from_numpy(result))
 
-    def full_resample_matrix(self, queue):
+    def full_resample_matrix(self, actx):
         """Build a dense matrix representing this discretization connection.
 
         .. warning::
 
             On average, this will be exceedingly expensive (:math:`O(N^2)` in
             the number *N* of discretization points) in terms of memory usage
-            and thus not what you'd typically want.
+            and thus not what you'd typically want, other than maybe for
+            testing.
+
+        .. note::
+
+            This function assumes a flattened DOF array, as produced by
+            :class:`~meshmode.dof_array.flatten`.
         """
 
-        @memoize_in(self, "oversample_mat_knl")
+        @memoize_in(actx, (DirectDiscretizationConnection, "oversample_mat_knl"))
         def knl():
-            import loopy as lp
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i<n_to_nodes and
+            return make_loopy_program(
+                """{[iel, idof, j]:
+                    0<=iel<nelements and
+                    0<=idof<n_to_nodes and
                     0<=j<n_from_nodes}""",
-                "result[itgt_base + to_element_indices[k]*n_to_nodes + i, \
-                        isrc_base + from_element_indices[k]*n_from_nodes + j] \
-                    = resample_mat[i, j]",
+                "result[itgt_base + to_element_indices[iel]*n_to_nodes + idof, \
+                        isrc_base + from_element_indices[iel]*n_from_nodes + j] \
+                    = resample_mat[idof, j]",
                 [
                     lp.GlobalArg("result", None,
                         shape="nnodes_tgt, nnodes_src",
@@ -283,78 +281,87 @@ class DirectDiscretizationConnection(DiscretizationConnection):
                     lp.ValueArg("nnodes_tgt,nnodes_src", np.int32),
                     "...",
                     ],
-                name="oversample_mat",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+                name="oversample_mat")
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+        to_discr_ndofs = sum(grp.nelements*grp.nunit_dofs
+                for grp in self.to_discr.groups)
+        from_discr_ndofs = sum(grp.nelements*grp.nunit_dofs
+                for grp in self.from_discr.groups)
 
-        result = cl.array.zeros(
-                queue,
-                (self.to_discr.nnodes, self.from_discr.nnodes),
+        result = actx.zeros(
+                (to_discr_ndofs, from_discr_ndofs),
                 dtype=self.to_discr.real_dtype)
 
+        from_group_sizes = [
+                grp.nelements*grp.nunit_dofs
+                for grp in self.from_discr.groups]
+        from_group_starts = np.cumsum([0] + from_group_sizes)
+
+        tgt_node_nr_base = 0
         for i_tgrp, (tgrp, cgrp) in enumerate(
                 zip(self.to_discr.groups, self.groups)):
             for i_batch, batch in enumerate(cgrp.batches):
                 if not len(batch.from_element_indices):
                     continue
 
-                sgrp = self.from_discr.groups[batch.from_group_index]
+                actx.call_loopy(knl(),
+                        resample_mat=self._resample_matrix(actx, i_tgrp, i_batch),
+                        result=result,
+                        itgt_base=tgt_node_nr_base,
+                        isrc_base=from_group_starts[batch.from_group_index],
+                        from_element_indices=batch.from_element_indices,
+                        to_element_indices=batch.to_element_indices)
 
-                knl()(queue,
-                      resample_mat=self._resample_matrix(i_tgrp, i_batch),
-                      result=result,
-                      itgt_base=tgrp.node_nr_base,
-                      isrc_base=sgrp.node_nr_base,
-                      from_element_indices=batch.from_element_indices,
-                      to_element_indices=batch.to_element_indices)
+            tgt_node_nr_base += tgrp.nelements*tgrp.nunit_dofs
 
         return result
 
-    def __call__(self, queue, vec):
-        @memoize_in(self, "resample_by_mat_knl")
+    def __call__(self, ary):
+        from meshmode.dof_array import DOFArray
+        if not isinstance(ary, DOFArray):
+            raise TypeError("non-array passed to discretization connection")
+
+        actx = ary.array_context
+
+        @memoize_in(actx, (DirectDiscretizationConnection, "resample_by_mat_knl"))
         def mat_knl():
-            import loopy as lp
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i<n_to_nodes and
+            knl = make_loopy_program(
+                """{[iel, idof, j]:
+                    0<=iel<nelements and
+                    0<=idof<n_to_nodes and
                     0<=j<n_from_nodes}""",
-                "result[to_element_indices[k], i] \
-                    = sum(j, resample_mat[i, j] \
-                    * vec[from_element_indices[k], j])",
+                "result[to_element_indices[iel], idof] \
+                    = sum(j, resample_mat[idof, j] \
+                    * ary[from_element_indices[iel], j])",
                 [
                     lp.GlobalArg("result", None,
                         shape="nelements_result, n_to_nodes",
                         offset=lp.auto),
-                    lp.GlobalArg("vec", None,
+                    lp.GlobalArg("ary", None,
                         shape="nelements_vec, n_from_nodes",
                         offset=lp.auto),
                     lp.ValueArg("nelements_result", np.int32),
                     lp.ValueArg("nelements_vec", np.int32),
                     "...",
                     ],
-                name="resample_by_mat",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+                name="resample_by_mat")
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
+            return knl
 
-        @memoize_in(self, "resample_by_picking_knl")
+        @memoize_in(actx,
+                (DirectDiscretizationConnection, "resample_by_picking_knl"))
         def pick_knl():
-            import loopy as lp
-            knl = lp.make_kernel(
-                """{[k,i,j]:
-                    0<=k<nelements and
-                    0<=i<n_to_nodes}""",
-                "result[to_element_indices[k], i] \
-                    = vec[from_element_indices[k], pick_list[i]]",
+            knl = make_loopy_program(
+                """{[iel, idof, j]:
+                    0<=iel<nelements and
+                    0<=idof<n_to_nodes}""",
+                "result[to_element_indices[iel], idof] \
+                    = ary[from_element_indices[iel], pick_list[idof]]",
                 [
                     lp.GlobalArg("result", None,
                         shape="nelements_result, n_to_nodes",
                         offset=lp.auto),
-                    lp.GlobalArg("vec", None,
+                    lp.GlobalArg("ary", None,
                         shape="nelements_vec, n_from_nodes",
                         offset=lp.auto),
                     lp.ValueArg("nelements_result", np.int32),
@@ -362,45 +369,41 @@ class DirectDiscretizationConnection(DiscretizationConnection):
                     lp.ValueArg("n_from_nodes", np.int32),
                     "...",
                     ],
-                name="resample_by_picking",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+                name="resample_by_picking")
 
-            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
-            return lp.tag_inames(knl, dict(k="g.0"))
-
-        if not isinstance(vec, cl.array.Array):
-            raise TypeError("non-array passed to discretization connection")
+            return knl
 
         if self.is_surjective:
-            result = self.to_discr.empty(queue, dtype=vec.dtype)
+            result = self.to_discr.empty(actx, dtype=ary.entry_dtype)
         else:
-            result = self.to_discr.zeros(queue, dtype=vec.dtype)
+            result = self.to_discr.zeros(actx, dtype=ary.entry_dtype)
 
-        if vec.shape != (self.from_discr.nnodes,):
+        if ary.shape != (len(self.from_discr.groups),):
             raise ValueError("invalid shape of incoming resampling data")
 
         for i_tgrp, (tgrp, cgrp) in enumerate(
                 zip(self.to_discr.groups, self.groups)):
             for i_batch, batch in enumerate(cgrp.batches):
-                sgrp = self.from_discr.groups[batch.from_group_index]
-
                 if not len(batch.from_element_indices):
                     continue
 
                 point_pick_indices = self._resample_point_pick_indices(
-                        i_tgrp, i_batch)
+                        actx, i_tgrp, i_batch)
 
                 if point_pick_indices is None:
-                    mat_knl()(queue,
-                            resample_mat=self._resample_matrix(i_tgrp, i_batch),
-                            result=tgrp.view(result), vec=sgrp.view(vec),
+                    actx.call_loopy(mat_knl(),
+                            resample_mat=self._resample_matrix(
+                                actx, i_tgrp, i_batch),
+                            result=result[i_tgrp],
+                            ary=ary[batch.from_group_index],
                             from_element_indices=batch.from_element_indices,
                             to_element_indices=batch.to_element_indices)
 
                 else:
-                    pick_knl()(queue,
+                    actx.call_loopy(pick_knl(),
                             pick_list=point_pick_indices,
-                            result=tgrp.view(result), vec=sgrp.view(vec),
+                            result=result[i_tgrp],
+                            ary=ary[batch.from_group_index],
                             from_element_indices=batch.from_element_indices,
                             to_element_indices=batch.to_element_indices)
 
diff --git a/meshmode/discretization/connection/face.py b/meshmode/discretization/connection/face.py
index f45a95c1b72fe742505c90535aa783c6ae5fb971..bac1ccc9e59ff82716fbd4e87213db1d48798e2d 100644
--- a/meshmode/discretization/connection/face.py
+++ b/meshmode/discretization/connection/face.py
@@ -28,8 +28,6 @@ from six.moves import range, zip
 from pytools import Record
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 import modepy as mp
 
 import logging
@@ -63,7 +61,7 @@ class _ConnectionBatchData(Record):
     pass
 
 
-def _build_boundary_connection(queue, vol_discr, bdry_discr, connection_data,
+def _build_boundary_connection(actx, vol_discr, bdry_discr, connection_data,
         per_face_groups):
     from meshmode.discretization.connection.direct import (
             InterpolationBatch,
@@ -87,14 +85,10 @@ def _build_boundary_connection(queue, vol_discr, bdry_discr, connection_data,
             batches.append(
                 InterpolationBatch(
                     from_group_index=igrp,
-                    from_element_indices=cl.array.to_device(
-                        queue,
-                        data.group_source_element_indices)
-                    .with_queue(None),
-                    to_element_indices=cl.array.to_device(
-                        queue,
-                        data.group_target_element_indices)
-                    .with_queue(None),
+                    from_element_indices=actx.freeze(actx.from_numpy(
+                        data.group_source_element_indices)),
+                    to_element_indices=actx.freeze(actx.from_numpy(
+                        data.group_target_element_indices)),
                     result_unit_nodes=result_unit_nodes,
                     to_element_face=face_id
                     ))
@@ -160,7 +154,7 @@ def _get_face_vertices(mesh, boundary_tag):
 # }}}
 
 
-def make_face_restriction(discr, group_factory, boundary_tag,
+def make_face_restriction(actx, discr, group_factory, boundary_tag,
         per_face_groups=False):
     """Create a mesh, a discretization and a connection to restrict
     a function on *discr* to its values on the edges of element faces
@@ -373,12 +367,11 @@ def make_face_restriction(discr, group_factory, boundary_tag,
 
     from meshmode.discretization import Discretization
     bdry_discr = Discretization(
-            discr.cl_context, bdry_mesh, group_factory)
+            actx, bdry_mesh, group_factory)
 
-    with cl.CommandQueue(discr.cl_context) as queue:
-        connection = _build_boundary_connection(
-                queue, discr, bdry_discr, connection_data,
-                per_face_groups)
+    connection = _build_boundary_connection(
+            actx, discr, bdry_discr, connection_data,
+            per_face_groups)
 
     logger.info("building face restriction: done")
 
@@ -389,7 +382,7 @@ def make_face_restriction(discr, group_factory, boundary_tag,
 
 # {{{ face -> all_faces connection
 
-def make_face_to_all_faces_embedding(faces_connection, all_faces_discr,
+def make_face_to_all_faces_embedding(actx, faces_connection, all_faces_discr,
         from_discr=None):
     """Return a
     :class:`meshmode.discretization.connection.DiscretizationConnection`
@@ -437,61 +430,59 @@ def make_face_to_all_faces_embedding(faces_connection, all_faces_discr,
 
     i_faces_grp = 0
 
-    with cl.CommandQueue(vol_discr.cl_context) as queue:
-        groups = []
-        for ivol_grp, vol_grp in enumerate(vol_discr.groups):
-            batches = []
+    groups = []
+    for ivol_grp, vol_grp in enumerate(vol_discr.groups):
+        batches = []
 
-            nfaces = vol_grp.mesh_el_group.nfaces
-            for iface in range(nfaces):
-                all_faces_grp = all_faces_discr.groups[i_faces_grp]
+        nfaces = vol_grp.mesh_el_group.nfaces
+        for iface in range(nfaces):
+            all_faces_grp = all_faces_discr.groups[i_faces_grp]
 
-                if per_face_groups:
-                    assert len(faces_connection.groups[i_faces_grp].batches) == 1
-                else:
-                    assert (len(faces_connection.groups[i_faces_grp].batches)
-                            == nfaces)
+            if per_face_groups:
+                assert len(faces_connection.groups[i_faces_grp].batches) == 1
+            else:
+                assert (len(faces_connection.groups[i_faces_grp].batches)
+                        == nfaces)
 
-                assert np.array_equal(
-                        from_discr.groups[i_faces_grp].unit_nodes,
-                        all_faces_grp.unit_nodes)
+            assert np.array_equal(
+                    from_discr.groups[i_faces_grp].unit_nodes,
+                    all_faces_grp.unit_nodes)
 
-                # {{{ find src_batch
+            # {{{ find src_batch
 
-                src_batches = faces_connection.groups[i_faces_grp].batches
-                if per_face_groups:
-                    src_batch, = src_batches
-                else:
-                    src_batch = src_batches[iface]
-                del src_batches
+            src_batches = faces_connection.groups[i_faces_grp].batches
+            if per_face_groups:
+                src_batch, = src_batches
+            else:
+                src_batch = src_batches[iface]
+            del src_batches
 
-                # }}}
+            # }}}
 
-                if per_face_groups:
-                    to_element_indices = src_batch.from_element_indices
-                else:
-                    assert all_faces_grp.nelements == nfaces * vol_grp.nelements
-
-                    to_element_indices = (
-                            vol_grp.nelements*iface
-                            + src_batch.from_element_indices.with_queue(queue)
-                            ).with_queue(None)
-
-                batches.append(
-                        InterpolationBatch(
-                            from_group_index=i_faces_grp,
-                            from_element_indices=src_batch.to_element_indices,
-                            to_element_indices=to_element_indices,
-                            result_unit_nodes=all_faces_grp.unit_nodes,
-                            to_element_face=None))
-
-                is_last_face = iface + 1 == nfaces
-                if per_face_groups or is_last_face:
-                    groups.append(
-                            DiscretizationConnectionElementGroup(batches=batches))
-                    batches = []
-
-                    i_faces_grp += 1
+            if per_face_groups:
+                to_element_indices = src_batch.from_element_indices
+            else:
+                assert all_faces_grp.nelements == nfaces * vol_grp.nelements
+
+                to_element_indices = actx.freeze(
+                        vol_grp.nelements*iface
+                        + actx.thaw(src_batch.from_element_indices))
+
+            batches.append(
+                    InterpolationBatch(
+                        from_group_index=i_faces_grp,
+                        from_element_indices=src_batch.to_element_indices,
+                        to_element_indices=to_element_indices,
+                        result_unit_nodes=all_faces_grp.unit_nodes,
+                        to_element_face=None))
+
+            is_last_face = iface + 1 == nfaces
+            if per_face_groups or is_last_face:
+                groups.append(
+                        DiscretizationConnectionElementGroup(batches=batches))
+                batches = []
+
+                i_faces_grp += 1
 
     return DirectDiscretizationConnection(
             from_discr,
diff --git a/meshmode/discretization/connection/opposite_face.py b/meshmode/discretization/connection/opposite_face.py
index e084525ba9a6eb5001548f432c25a0a8f861173e..8f83cd4e7e1031a7e319132397899e0cc38ff5cc 100644
--- a/meshmode/discretization/connection/opposite_face.py
+++ b/meshmode/discretization/connection/opposite_face.py
@@ -26,43 +26,45 @@ from six.moves import range
 
 import numpy as np
 import numpy.linalg as la
-import pyopencl as cl
-import pyopencl.array  # noqa
 
 import logging
 logger = logging.getLogger(__name__)
 
 
+def freeze_from_numpy(actx, array):
+    return actx.freeze(actx.from_numpy(array))
+
+
+def thaw_to_numpy(actx, array):
+    return actx.to_numpy(actx.thaw(array))
+
+
 # {{{ _make_cross_face_batches
 
-def _make_cross_face_batches(queue,
+def _make_cross_face_batches(actx,
         tgt_bdry_discr, src_bdry_discr,
         i_tgt_grp, i_src_grp,
         tgt_bdry_element_indices, src_bdry_element_indices):
-    def to_dev(ary):
-        return cl.array.to_device(queue, ary, array_queue=None)
 
     from meshmode.discretization.connection.direct import InterpolationBatch
     if tgt_bdry_discr.dim == 0:
         yield InterpolationBatch(
             from_group_index=i_src_grp,
-            from_element_indices=to_dev(src_bdry_element_indices),
-            to_element_indices=to_dev(tgt_bdry_element_indices),
+            from_element_indices=freeze_from_numpy(actx, src_bdry_element_indices),
+            to_element_indices=freeze_from_numpy(actx, tgt_bdry_element_indices),
             result_unit_nodes=src_bdry_discr.groups[i_src_grp].unit_nodes,
             to_element_face=None)
         return
 
-    # FIXME: This should view-then-transfer
-    # (but PyOpenCL doesn't do non-contiguous transfers for now).
-    tgt_bdry_nodes = (tgt_bdry_discr.groups[i_tgt_grp]
-            .view(tgt_bdry_discr.nodes().get(queue=queue))
-            [:, tgt_bdry_element_indices])
+    tgt_bdry_nodes = np.array([
+        thaw_to_numpy(actx, ary[i_tgt_grp])[tgt_bdry_element_indices]
+        for ary in tgt_bdry_discr.nodes()
+        ])
 
-    # FIXME: This should view-then-transfer
-    # (but PyOpenCL doesn't do non-contiguous transfers for now).
-    src_bdry_nodes = (src_bdry_discr.groups[i_src_grp]
-            .view(src_bdry_discr.nodes().get(queue=queue))
-            [:, src_bdry_element_indices])
+    src_bdry_nodes = np.array([
+        thaw_to_numpy(actx, ary[i_src_grp])[src_bdry_element_indices]
+        for ary in src_bdry_discr.nodes()
+        ])
 
     tol = 1e4 * np.finfo(tgt_bdry_nodes.dtype).eps
 
@@ -255,8 +257,10 @@ def _make_cross_face_batches(queue,
         from meshmode.discretization.connection.direct import InterpolationBatch
         yield InterpolationBatch(
                 from_group_index=i_src_grp,
-                from_element_indices=to_dev(src_bdry_element_indices[close_els]),
-                to_element_indices=to_dev(tgt_bdry_element_indices[close_els]),
+                from_element_indices=freeze_from_numpy(
+                    actx, src_bdry_element_indices[close_els]),
+                to_element_indices=freeze_from_numpy(
+                    actx, tgt_bdry_element_indices[close_els]),
                 result_unit_nodes=template_unit_nodes,
                 to_element_face=None)
 
@@ -276,7 +280,7 @@ def _find_ibatch_for_face(vbc_tgt_grp_batches, iface):
     return vbc_tgt_grp_face_batch
 
 
-def _make_bdry_el_lookup_table(queue, connection, igrp):
+def _make_bdry_el_lookup_table(actx, connection, igrp):
     """Given a volume-to-boundary connection as *connection*, return
     a table of shape ``(from_nelements, nfaces)`` to look up the
     element number of the boundary element for that face.
@@ -289,9 +293,9 @@ def _make_bdry_el_lookup_table(queue, connection, igrp):
     iel_lookup.fill(-1)
 
     for ibatch, batch in enumerate(connection.groups[igrp].batches):
-        from_element_indices = batch.from_element_indices.get(queue=queue)
+        from_element_indices = thaw_to_numpy(actx, batch.from_element_indices)
         iel_lookup[from_element_indices, batch.to_element_face] = \
-                batch.to_element_indices.get(queue=queue)
+                thaw_to_numpy(actx, batch.to_element_indices)
 
     return iel_lookup
 
@@ -300,7 +304,7 @@ def _make_bdry_el_lookup_table(queue, connection, igrp):
 
 # {{{ make_opposite_face_connection
 
-def make_opposite_face_connection(volume_to_bdry_conn):
+def make_opposite_face_connection(actx, volume_to_bdry_conn):
     """Given a boundary restriction connection *volume_to_bdry_conn*,
     return a :class:`DirectDiscretizationConnection` that performs data
     exchange across opposite faces.
@@ -323,103 +327,104 @@ def make_opposite_face_connection(volume_to_bdry_conn):
     # One interpolation batch in this connection corresponds
     # to a key (i_tgt_grp,)  (i_src_grp, i_face_tgt,)
 
-    with cl.CommandQueue(vol_discr.cl_context) as queue:
-        # a list of batches for each group
-        groups = [[] for i_tgt_grp in range(ngrps)]
+    # a list of batches for each group
+    groups = [[] for i_tgt_grp in range(ngrps)]
 
-        for i_src_grp in range(ngrps):
-            src_grp_el_lookup = _make_bdry_el_lookup_table(
-                    queue, volume_to_bdry_conn, i_src_grp)
+    for i_src_grp in range(ngrps):
+        src_grp_el_lookup = _make_bdry_el_lookup_table(
+                actx, volume_to_bdry_conn, i_src_grp)
 
-            for i_tgt_grp in range(ngrps):
-                vbc_tgt_grp_batches = volume_to_bdry_conn.groups[i_tgt_grp].batches
+        for i_tgt_grp in range(ngrps):
+            vbc_tgt_grp_batches = volume_to_bdry_conn.groups[i_tgt_grp].batches
 
-                adj = vol_mesh.facial_adjacency_groups[i_tgt_grp][i_src_grp]
+            adj = vol_mesh.facial_adjacency_groups[i_tgt_grp][i_src_grp]
 
-                for i_face_tgt in range(vol_mesh.groups[i_tgt_grp].nfaces):
-                    vbc_tgt_grp_face_batch = _find_ibatch_for_face(
-                            vbc_tgt_grp_batches, i_face_tgt)
+            for i_face_tgt in range(vol_mesh.groups[i_tgt_grp].nfaces):
+                vbc_tgt_grp_face_batch = _find_ibatch_for_face(
+                        vbc_tgt_grp_batches, i_face_tgt)
 
-                    # {{{ index wrangling
+                # {{{ index wrangling
 
-                    # The elements in the adjacency group will be a subset of
-                    # the elements in the restriction interpolation batch:
-                    # Imagine an inter-group boundary. The volume-to-boundary
-                    # connection will include all faces as targets, whereas
-                    # there will be separate adjacency groups for intra- and
-                    # inter-group connections.
+                # The elements in the adjacency group will be a subset of
+                # the elements in the restriction interpolation batch:
+                # Imagine an inter-group boundary. The volume-to-boundary
+                # connection will include all faces as targets, whereas
+                # there will be separate adjacency groups for intra- and
+                # inter-group connections.
 
-                    adj_tgt_flags = adj.element_faces == i_face_tgt
-                    adj_els = adj.elements[adj_tgt_flags]
-                    if adj_els.size == 0:
-                        # NOTE: this case can happen for inter-group boundaries
-                        # when all elements are adjacent on the same face
-                        # index, so all other ones will be empty
-                        continue
+                adj_tgt_flags = adj.element_faces == i_face_tgt
+                adj_els = adj.elements[adj_tgt_flags]
+                if adj_els.size == 0:
+                    # NOTE: this case can happen for inter-group boundaries
+                    # when all elements are adjacent on the same face
+                    # index, so all other ones will be empty
+                    continue
 
-                    vbc_els = vbc_tgt_grp_face_batch.from_element_indices.get(queue)
+                vbc_els = thaw_to_numpy(actx,
+                        vbc_tgt_grp_face_batch.from_element_indices)
 
-                    if len(adj_els) == len(vbc_els):
-                        # Same length: assert (below) that the two use the same
-                        # ordering.
-                        vbc_used_els = slice(None)
+                if len(adj_els) == len(vbc_els):
+                    # Same length: assert (below) that the two use the same
+                    # ordering.
+                    vbc_used_els = slice(None)
 
-                    else:
-                        # Genuine subset: figure out an index mapping.
-                        vbc_els_sort_idx = np.argsort(vbc_els)
-                        vbc_used_els = vbc_els_sort_idx[np.searchsorted(
-                            vbc_els, adj_els, sorter=vbc_els_sort_idx
-                            )]
+                else:
+                    # Genuine subset: figure out an index mapping.
+                    vbc_els_sort_idx = np.argsort(vbc_els)
+                    vbc_used_els = vbc_els_sort_idx[np.searchsorted(
+                        vbc_els, adj_els, sorter=vbc_els_sort_idx
+                        )]
 
-                    assert np.array_equal(vbc_els[vbc_used_els], adj_els)
+                assert np.array_equal(vbc_els[vbc_used_els], adj_els)
 
-                    # find to_element_indices
+                # find to_element_indices
 
-                    tgt_bdry_element_indices = (
-                            vbc_tgt_grp_face_batch.to_element_indices
-                            .get(queue=queue)[vbc_used_els])
+                tgt_bdry_element_indices = thaw_to_numpy(
+                        actx,
+                        vbc_tgt_grp_face_batch.to_element_indices
+                        )[vbc_used_els]
 
-                    # find from_element_indices
+                # find from_element_indices
 
-                    src_vol_element_indices = adj.neighbors[adj_tgt_flags]
-                    src_element_faces = adj.neighbor_faces[adj_tgt_flags]
+                src_vol_element_indices = adj.neighbors[adj_tgt_flags]
+                src_element_faces = adj.neighbor_faces[adj_tgt_flags]
 
-                    src_bdry_element_indices = src_grp_el_lookup[
-                            src_vol_element_indices, src_element_faces]
+                src_bdry_element_indices = src_grp_el_lookup[
+                        src_vol_element_indices, src_element_faces]
 
-                    # }}}
+                # }}}
 
-                    # {{{ visualization (for debugging)
+                # {{{ visualization (for debugging)
 
-                    if 0:
-                        print("TVE", adj.elements[adj_tgt_flags])
-                        print("TBE", tgt_bdry_element_indices)
-                        print("FVE", src_vol_element_indices)
-                        from meshmode.mesh.visualization import draw_2d_mesh
-                        import matplotlib.pyplot as pt
-                        draw_2d_mesh(vol_discr.mesh, draw_element_numbers=True,
-                                set_bounding_box=True,
-                                draw_vertex_numbers=False,
-                                draw_face_numbers=True,
-                                fill=None)
-                        pt.figure()
+                if 0:
+                    print("TVE", adj.elements[adj_tgt_flags])
+                    print("TBE", tgt_bdry_element_indices)
+                    print("FVE", src_vol_element_indices)
+                    from meshmode.mesh.visualization import draw_2d_mesh
+                    import matplotlib.pyplot as pt
+                    draw_2d_mesh(vol_discr.mesh, draw_element_numbers=True,
+                            set_bounding_box=True,
+                            draw_vertex_numbers=False,
+                            draw_face_numbers=True,
+                            fill=None)
+                    pt.figure()
 
-                        draw_2d_mesh(bdry_discr.mesh, draw_element_numbers=True,
-                                set_bounding_box=True,
-                                draw_vertex_numbers=False,
-                                draw_face_numbers=True,
-                                fill=None)
+                    draw_2d_mesh(bdry_discr.mesh, draw_element_numbers=True,
+                            set_bounding_box=True,
+                            draw_vertex_numbers=False,
+                            draw_face_numbers=True,
+                            fill=None)
 
-                        pt.show()
+                    pt.show()
 
-                    # }}}
+                # }}}
 
-                    batches = _make_cross_face_batches(queue,
-                            bdry_discr, bdry_discr,
-                            i_tgt_grp, i_src_grp,
-                            tgt_bdry_element_indices,
-                            src_bdry_element_indices)
-                    groups[i_tgt_grp].extend(batches)
+                batches = _make_cross_face_batches(actx,
+                        bdry_discr, bdry_discr,
+                        i_tgt_grp, i_src_grp,
+                        tgt_bdry_element_indices,
+                        src_bdry_element_indices)
+                groups[i_tgt_grp].extend(batches)
 
     from meshmode.discretization.connection import (
             DirectDiscretizationConnection, DiscretizationConnectionElementGroup)
@@ -436,7 +441,7 @@ def make_opposite_face_connection(volume_to_bdry_conn):
 
 # {{{ make_partition_connection
 
-def make_partition_connection(local_bdry_conn, i_local_part,
+def make_partition_connection(actx, local_bdry_conn, i_local_part,
                               remote_bdry, remote_adj_groups,
                               remote_from_elem_faces, remote_from_elem_indices):
     """
@@ -472,52 +477,50 @@ def make_partition_connection(local_bdry_conn, i_local_part,
 
     part_batches = [[] for _ in local_groups]
 
-    with cl.CommandQueue(local_bdry_conn.cl_context) as queue:
-
-        for i_remote_grp, adj in enumerate(remote_adj_groups):
-            indices = (i_local_part == adj.neighbor_partitions)
-            if not np.any(indices):
-                # Skip because i_remote_grp is not connected to i_local_part.
-                continue
-            i_remote_faces = adj.element_faces[indices]
-            i_local_meshwide_elems = adj.global_neighbors[indices]
-            i_local_faces = adj.neighbor_faces[indices]
-
-            i_local_grps = find_group_indices(local_groups, i_local_meshwide_elems)
-
-            for i_local_grp in np.unique(i_local_grps):
-
-                elem_base = local_groups[i_local_grp].element_nr_base
-                local_el_lookup = _make_bdry_el_lookup_table(queue,
-                                                             local_bdry_conn,
-                                                             i_local_grp)
-
-                for i_remote_face in i_remote_faces:
-
-                    index_flags = np.logical_and(i_local_grps == i_local_grp,
-                                                 i_remote_faces == i_remote_face)
-                    if not np.any(index_flags):
-                        continue
-
-                    remote_bdry_indices = None
-                    for idxs, face in zip(remote_from_elem_indices[i_remote_grp],
-                                          remote_from_elem_faces[i_remote_grp]):
-                        if face == i_remote_face:
-                            remote_bdry_indices = idxs
-                            break
-                    assert remote_bdry_indices is not None
-
-                    elems = i_local_meshwide_elems[index_flags] - elem_base
-                    faces = i_local_faces[index_flags]
-                    local_bdry_indices = local_el_lookup[elems, faces]
-
-                    batches = _make_cross_face_batches(queue,
-                            local_bdry, remote_bdry,
-                            i_local_grp, i_remote_grp,
-                            local_bdry_indices,
-                            remote_bdry_indices)
-
-                    part_batches[i_local_grp].extend(batches)
+    for i_remote_grp, adj in enumerate(remote_adj_groups):
+        indices = (i_local_part == adj.neighbor_partitions)
+        if not np.any(indices):
+            # Skip because i_remote_grp is not connected to i_local_part.
+            continue
+        i_remote_faces = adj.element_faces[indices]
+        i_local_meshwide_elems = adj.global_neighbors[indices]
+        i_local_faces = adj.neighbor_faces[indices]
+
+        i_local_grps = find_group_indices(local_groups, i_local_meshwide_elems)
+
+        for i_local_grp in np.unique(i_local_grps):
+
+            elem_base = local_groups[i_local_grp].element_nr_base
+            local_el_lookup = _make_bdry_el_lookup_table(actx,
+                                                         local_bdry_conn,
+                                                         i_local_grp)
+
+            for i_remote_face in i_remote_faces:
+
+                index_flags = np.logical_and(i_local_grps == i_local_grp,
+                                             i_remote_faces == i_remote_face)
+                if not np.any(index_flags):
+                    continue
+
+                remote_bdry_indices = None
+                for idxs, face in zip(remote_from_elem_indices[i_remote_grp],
+                                      remote_from_elem_faces[i_remote_grp]):
+                    if face == i_remote_face:
+                        remote_bdry_indices = idxs
+                        break
+                assert remote_bdry_indices is not None
+
+                elems = i_local_meshwide_elems[index_flags] - elem_base
+                faces = i_local_faces[index_flags]
+                local_bdry_indices = local_el_lookup[elems, faces]
+
+                batches = _make_cross_face_batches(actx,
+                        local_bdry, remote_bdry,
+                        i_local_grp, i_remote_grp,
+                        local_bdry_indices,
+                        remote_bdry_indices)
+
+                part_batches[i_local_grp].extend(batches)
 
     return DirectDiscretizationConnection(
             from_discr=remote_bdry,
diff --git a/meshmode/discretization/connection/projection.py b/meshmode/discretization/connection/projection.py
index fae3f9b7435b526cd7342090bd31012885945ff9..575b07aa5a945ee346d7f67af03da001455a029a 100644
--- a/meshmode/discretization/connection/projection.py
+++ b/meshmode/discretization/connection/projection.py
@@ -23,12 +23,13 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 
-from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-from pytools import memoize_method, memoize_in
+from pytools import keyed_memoize_method, memoize_in
 
+import loopy as lp
+
+from meshmode.array_context import make_loopy_program
+from meshmode.dof_array import DOFArray
 from meshmode.discretization.connection.direct import (
         DiscretizationConnection,
         DirectDiscretizationConnection)
@@ -78,8 +79,8 @@ class L2ProjectionInverseDiscretizationConnection(DiscretizationConnection):
                 to_discr=self.conn.from_discr,
                 is_surjective=is_surjective)
 
-    @memoize_method
-    def _batch_weights(self):
+    @keyed_memoize_method(key=lambda actx: ())
+    def _batch_weights(self, actx):
         """Computes scaled quadrature weights for each interpolation batch in
         :attr:`conn`. The quadrature weights can be used to integrate over
         child elements in the domain of the parent element, by a change of
@@ -111,26 +112,32 @@ class L2ProjectionInverseDiscretizationConnection(DiscretizationConnection):
                     mat = grp.diff_matrices()[iaxis]
                     jac[iaxis] = mat.dot(batch.result_unit_nodes.T)
 
-                weights[igrp, ibatch] = det(jac) * grp.weights
+                weights[igrp, ibatch] = actx.freeze(actx.from_numpy(
+                    det(jac) * grp.weights))
 
         return weights
 
-    def __call__(self, queue, vec):
-        @memoize_in(self, "conn_projection_knl")
+    def __call__(self, vec):
+        if not isinstance(vec, DOFArray):
+            raise TypeError("non-array passed to discretization connection")
+
+        actx = vec.array_context
+
+        @memoize_in(actx, (L2ProjectionInverseDiscretizationConnection,
+            "conn_projection_knl"))
         def kproj():
-            import loopy as lp
-            knl = lp.make_kernel([
-                "{[k]: 0 <= k < nelements}",
-                "{[j]: 0 <= j < n_from_nodes}"
+            return make_loopy_program([
+                "{[iel]: 0 <= iel < nelements}",
+                "{[idof_quad]: 0 <= idof_quad < n_from_nodes}"
                 ],
                 """
-                for k
-                    <> element_dot = \
-                            sum(j, vec[from_element_indices[k], j] * \
-                                   basis[j] * weights[j])
+                for iel
+                    <> element_dot = sum(idof_quad,
+                                vec[from_element_indices[iel], idof_quad]
+                                * basis[idof_quad] * weights[idof_quad])
 
-                    result[to_element_indices[k], ibasis] = \
-                            result[to_element_indices[k], ibasis] + element_dot
+                    result[to_element_indices[iel], ibasis] = \
+                            result[to_element_indices[iel], ibasis] + element_dot
                 end
                 """,
                 [
@@ -148,21 +155,18 @@ class L2ProjectionInverseDiscretizationConnection(DiscretizationConnection):
                     lp.ValueArg("ibasis", np.int32),
                     '...'
                     ],
-                name="conn_projection_knl",
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-            return knl
+                name="conn_projection_knl")
 
-        @memoize_in(self, "conn_evaluation_knl")
+        @memoize_in(actx, (L2ProjectionInverseDiscretizationConnection,
+            "conn_evaluation_knl"))
         def keval():
-            import loopy as lp
-            knl = lp.make_kernel([
-                "{[k]: 0 <= k < nelements}",
-                "{[j]: 0 <= j < n_to_nodes}"
+            return make_loopy_program([
+                "{[iel]: 0 <= iel < nelements}",
+                "{[idof]: 0 <= idof < n_to_nodes}"
                 ],
                 """
-                    result[k, j] = result[k, j] + \
-                            coefficients[k, ibasis] * basis[j]
+                    result[iel, idof] = result[iel, idof] + \
+                            coefficients[iel, ibasis] * basis[idof]
                 """,
                 [
                     lp.GlobalArg("coefficients", None,
@@ -170,55 +174,49 @@ class L2ProjectionInverseDiscretizationConnection(DiscretizationConnection):
                     lp.ValueArg("ibasis", np.int32),
                     '...'
                     ],
-                name="conn_evaluate_knl",
-                default_offset=lp.auto,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
-
-            return knl
-
-        if not isinstance(vec, cl.array.Array):
-            raise TypeError("non-array passed to discretization connection")
-
-        if vec.shape != (self.from_discr.nnodes,):
-            raise ValueError("invalid shape of incoming resampling data")
+                name="conn_evaluate_knl")
 
         # compute weights on each refinement of the reference element
-        weights = self._batch_weights()
+        weights = self._batch_weights(actx)
 
         # perform dot product (on reference element) to get basis coefficients
-        c = self.to_discr.zeros(queue, dtype=vec.dtype)
+        c = self.to_discr.zeros(actx, dtype=vec.entry_dtype)
+
         for igrp, (tgrp, cgrp) in enumerate(
                 zip(self.to_discr.groups, self.conn.groups)):
             for ibatch, batch in enumerate(cgrp.batches):
                 sgrp = self.from_discr.groups[batch.from_group_index]
 
                 for ibasis, basis_fn in enumerate(sgrp.basis()):
-                    basis = basis_fn(batch.result_unit_nodes).flatten()
+                    basis = actx.from_numpy(
+                            basis_fn(batch.result_unit_nodes).flatten())
 
                     # NOTE: batch.*_element_indices are reversed here because
                     # they are from the original forward connection, but
                     # we are going in reverse here. a bit confusing, but
                     # saves on recreating the connection groups and batches.
-                    kproj()(queue,
+                    actx.call_loopy(kproj(),
                             ibasis=ibasis,
-                            vec=sgrp.view(vec),
+                            vec=vec[sgrp.index],
                             basis=basis,
                             weights=weights[igrp, ibatch],
-                            result=tgrp.view(c),
+                            result=c[igrp],
                             from_element_indices=batch.to_element_indices,
                             to_element_indices=batch.from_element_indices)
 
         # evaluate at unit_nodes to get the vector on to_discr
-        result = self.to_discr.zeros(queue, dtype=vec.dtype)
+        result = self.to_discr.zeros(actx, dtype=vec.entry_dtype)
         for igrp, grp in enumerate(self.to_discr.groups):
             for ibasis, basis_fn in enumerate(grp.basis()):
-                basis = basis_fn(grp.unit_nodes).flatten()
+                basis = actx.from_numpy(
+                        basis_fn(grp.unit_nodes).flatten())
 
-                keval()(queue,
+                actx.call_loopy(
+                        keval(),
                         ibasis=ibasis,
-                        result=grp.view(result),
+                        result=result[grp.index],
                         basis=basis,
-                        coefficients=grp.view(c))
+                        coefficients=c[grp.index])
 
         return result
 
diff --git a/meshmode/discretization/connection/refinement.py b/meshmode/discretization/connection/refinement.py
index 9a2d60d890f6b436e72151d66171d819cdce6a98..55026e59a453c5a6fd1738c381ac313863242d15 100644
--- a/meshmode/discretization/connection/refinement.py
+++ b/meshmode/discretization/connection/refinement.py
@@ -24,8 +24,6 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 
 import logging
 logger = logging.getLogger(__name__)
@@ -36,7 +34,7 @@ from pytools import log_process
 # {{{ Build interpolation batches for group
 
 def _build_interpolation_batches_for_group(
-        queue, group_idx, coarse_discr_group, fine_discr_group, record):
+        actx, group_idx, coarse_discr_group, fine_discr_group, record):
     r"""
     To map between discretizations, we sort each of the fine mesh
     elements into an interpolation batch.  Which batch they go
@@ -104,8 +102,8 @@ def _build_interpolation_batches_for_group(
             continue
         yield InterpolationBatch(
             from_group_index=group_idx,
-            from_element_indices=cl.array.to_device(queue, np.asarray(from_bin)),
-            to_element_indices=cl.array.to_device(queue, np.asarray(to_bin)),
+            from_element_indices=actx.from_numpy(np.asarray(from_bin)),
+            to_element_indices=actx.from_numpy(np.asarray(to_bin)),
             result_unit_nodes=unit_nodes,
             to_element_face=None)
 
@@ -113,7 +111,7 @@ def _build_interpolation_batches_for_group(
 
 
 @log_process(logger)
-def make_refinement_connection(refiner, coarse_discr, group_factory):
+def make_refinement_connection(actx, refiner, coarse_discr, group_factory):
     """Return a
     :class:`meshmode.discretization.connection.DiscretizationConnection`
     connecting `coarse_discr` to a discretization on the fine mesh.
@@ -142,21 +140,20 @@ def make_refinement_connection(refiner, coarse_discr, group_factory):
 
     from meshmode.discretization import Discretization
     fine_discr = Discretization(
-        coarse_discr.cl_context,
+        actx,
         fine_mesh,
         group_factory,
         real_dtype=coarse_discr.real_dtype)
 
     groups = []
-    with cl.CommandQueue(fine_discr.cl_context) as queue:
-        for group_idx, (coarse_discr_group, fine_discr_group, record) in \
-                enumerate(zip(coarse_discr.groups, fine_discr.groups,
-                              refiner.group_refinement_records)):
-            groups.append(
-                DiscretizationConnectionElementGroup(
-                    list(_build_interpolation_batches_for_group(
-                            queue, group_idx, coarse_discr_group,
-                            fine_discr_group, record))))
+    for group_idx, (coarse_discr_group, fine_discr_group, record) in \
+            enumerate(zip(coarse_discr.groups, fine_discr.groups,
+                          refiner.group_refinement_records)):
+        groups.append(
+            DiscretizationConnectionElementGroup(
+                list(_build_interpolation_batches_for_group(
+                        actx, group_idx, coarse_discr_group,
+                        fine_discr_group, record))))
 
     return DirectDiscretizationConnection(
         from_discr=coarse_discr,
diff --git a/meshmode/discretization/connection/same_mesh.py b/meshmode/discretization/connection/same_mesh.py
index e7b781cd8e2908e1ef9b5af7da202a009931a256..355d534944e905974ca61ec210f64bb48e91bfcd 100644
--- a/meshmode/discretization/connection/same_mesh.py
+++ b/meshmode/discretization/connection/same_mesh.py
@@ -23,13 +23,11 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
 
 
 # {{{ same-mesh constructor
 
-def make_same_mesh_connection(to_discr, from_discr):
+def make_same_mesh_connection(actx, to_discr, from_discr):
     from meshmode.discretization.connection.direct import (
             InterpolationBatch,
             DiscretizationConnectionElementGroup,
@@ -39,23 +37,22 @@ def make_same_mesh_connection(to_discr, from_discr):
         raise ValueError("from_discr and to_discr must be based on "
                 "the same mesh")
 
-    assert to_discr.cl_context == from_discr.cl_context
-
-    with cl.CommandQueue(to_discr.cl_context) as queue:
-        groups = []
-        for igrp, (fgrp, tgrp) in enumerate(zip(from_discr.groups, to_discr.groups)):
-            all_elements = cl.array.arange(queue,
-                    fgrp.nelements,
-                    dtype=np.intp).with_queue(None)
-            ibatch = InterpolationBatch(
-                    from_group_index=igrp,
-                    from_element_indices=all_elements,
-                    to_element_indices=all_elements,
-                    result_unit_nodes=tgrp.unit_nodes,
-                    to_element_face=None)
-
-            groups.append(
-                    DiscretizationConnectionElementGroup([ibatch]))
+    groups = []
+    for igrp, (fgrp, tgrp) in enumerate(zip(from_discr.groups, to_discr.groups)):
+        all_elements = actx.freeze(
+                actx.from_numpy(
+                    np.arange(
+                        fgrp.nelements,
+                        dtype=np.intp)))
+        ibatch = InterpolationBatch(
+                from_group_index=igrp,
+                from_element_indices=all_elements,
+                to_element_indices=all_elements,
+                result_unit_nodes=tgrp.unit_nodes,
+                to_element_face=None)
+
+        groups.append(
+                DiscretizationConnectionElementGroup([ibatch]))
 
     return DirectDiscretizationConnection(
             from_discr, to_discr, groups,
diff --git a/meshmode/discretization/poly_element.py b/meshmode/discretization/poly_element.py
index 436b1fb1229e2dc11bc56faa280cf0185763735c..d0f3823c0b9bb368fa215a7187f39594bb190def 100644
--- a/meshmode/discretization/poly_element.py
+++ b/meshmode/discretization/poly_element.py
@@ -53,15 +53,6 @@ Group factories
 .. autoclass:: LegendreGaussLobattoTensorProductGroupFactory
 """
 
-# FIXME Most of the loopy kernels will break as soon as we start using multiple
-# element groups. That's because then the dimension-to-dimension stride will no
-# longer just be the long axis of the array, but something entirely
-# independent.  The machinery for this on the loopy end is there, in the form
-# of the "stride:auto" dim tag, it just needs to be pushed through all the
-# kernels.  Fortunately, this will fail in an obvious and noisy way, because
-# loopy sees strides that it doesn't expect and complains.
-
-
 from meshmode.discretization import ElementGroupBase, InterpolatoryElementGroupBase
 
 
@@ -221,7 +212,11 @@ class PolynomialWarpAndBlendElementGroup(_MassMatrixQuadratureElementGroup):
     @memoize_method
     def unit_nodes(self):
         dim = self.mesh_el_group.dim
-        result = mp.warp_and_blend_nodes(dim, self.order)
+        if self.order == 0:
+            result = mp.warp_and_blend_nodes(dim, 1)
+            result = np.mean(result, axis=1).reshape(-1, 1)
+        else:
+            result = mp.warp_and_blend_nodes(dim, self.order)
 
         dim2, nunit_nodes = result.shape
         assert dim2 == dim
@@ -300,12 +295,12 @@ class HomogeneousOrderBasedGroupFactory(ElementGroupFactory):
     def __init__(self, order):
         self.order = order
 
-    def __call__(self, mesh_el_group, node_nr_base):
+    def __call__(self, mesh_el_group, index):
         if not isinstance(mesh_el_group, self.mesh_group_class):
             raise TypeError("only mesh element groups of type '%s' "
                     "are supported" % self.mesh_group_class.__name__)
 
-        return self.group_class(mesh_el_group, self.order, node_nr_base)
+        return self.group_class(mesh_el_group, self.order, index)
 
 
 class OrderAndTypeBasedGroupFactory(ElementGroupFactory):
@@ -314,7 +309,7 @@ class OrderAndTypeBasedGroupFactory(ElementGroupFactory):
         self.simplex_group_class = simplex_group_class
         self.tensor_product_group_class = tensor_product_group_class
 
-    def __call__(self, mesh_el_group, node_nr_base):
+    def __call__(self, mesh_el_group, index):
         if isinstance(mesh_el_group, _MeshSimplexElementGroup):
             group_class = self.simplex_group_class
         elif isinstance(mesh_el_group, _MeshTensorProductElementGroup):
@@ -325,7 +320,7 @@ class OrderAndTypeBasedGroupFactory(ElementGroupFactory):
                         _MeshSimplexElementGroup.__name__,
                         _MeshTensorProductElementGroup.__name__))
 
-        return group_class(mesh_el_group, self.order, node_nr_base)
+        return group_class(mesh_el_group, self.order, index)
 
 
 # {{{ group factories for simplices
diff --git a/meshmode/discretization/visualization.py b/meshmode/discretization/visualization.py
index e985f3ffcd9f077a2c6dcb97b6e94e4c8147c49c..24fe92f675293ef6ab048553778ac50a394d4173 100644
--- a/meshmode/discretization/visualization.py
+++ b/meshmode/discretization/visualization.py
@@ -1,5 +1,3 @@
-from __future__ import division, absolute_import
-
 __copyright__ = "Copyright (C) 2014 Andreas Kloeckner"
 
 __license__ = """
@@ -22,10 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from six.moves import range
 import numpy as np
 from pytools import memoize_method, Record
-import pyopencl as cl
+from meshmode.dof_array import DOFArray, flatten, thaw
+
 
 __doc__ = """
 
@@ -40,29 +38,28 @@ __doc__ = """
 # {{{ visualizer
 
 def separate_by_real_and_imag(data, real_only):
-    for name, field in data:
-        from pytools.obj_array import log_shape, is_obj_array
-        ls = log_shape(field)
+    # This function is called on numpy data that has already been
+    # merged into a single vector.
 
-        if is_obj_array(field):
-            assert len(ls) == 1
+    for name, field in data:
+        if isinstance(field, np.ndarray) and field.dtype.char == "O":
+            assert len(field.shape) == 1
             from pytools.obj_array import (
-                    oarray_real_copy, oarray_imag_copy,
-                    with_object_array_or_scalar)
+                    obj_array_real_copy, obj_array_imag_copy,
+                    obj_array_vectorize)
 
             if field[0].dtype.kind == "c":
                 if real_only:
                     yield (name,
-                            with_object_array_or_scalar(oarray_real_copy, field))
+                            obj_array_vectorize(obj_array_real_copy, field))
                 else:
                     yield (name+"_r",
-                            with_object_array_or_scalar(oarray_real_copy, field))
+                            obj_array_vectorize(obj_array_real_copy, field))
                     yield (name+"_i",
-                            with_object_array_or_scalar(oarray_imag_copy, field))
+                            obj_array_vectorize(obj_array_imag_copy, field))
             else:
                 yield (name, field)
         else:
-            # ls == ()
             if field.dtype.kind == "c":
                 yield (name+"_r", field.real.copy())
                 yield (name+"_i", field.imag.copy())
@@ -115,17 +112,29 @@ class Visualizer(object):
 
         self.element_shrink_factor = element_shrink_factor
 
-    def _resample_and_get(self, queue, vec):
-        from pytools.obj_array import with_object_array_or_scalar
-
-        def resample_and_get_one(fld):
-            from numbers import Number
-            if isinstance(fld, Number):
-                return np.ones(self.connection.to_discr.nnodes) * fld
-            else:
-                return self.connection(queue, fld).get(queue=queue)
+    def _resample_to_numpy(self, vec):
+        if (isinstance(vec, np.ndarray)
+                and vec.dtype.char == "O"
+                and not isinstance(vec, DOFArray)):
+            from pytools.obj_array import obj_array_vectorize
+            return obj_array_vectorize(self._resample_to_numpy, vec)
+
+        from numbers import Number
+        if isinstance(vec, Number):
+            nnodes = sum(grp.ndofs for grp in self.connection.to_discr.groups)
+            return np.ones(nnodes) * vec
+        else:
+            resampled = self.connection(vec)
+            actx = resampled.array_context
+            return actx.to_numpy(flatten(resampled))
 
-        return with_object_array_or_scalar(resample_and_get_one, vec)
+    @memoize_method
+    def _vis_nodes(self):
+        actx = self.vis_discr._setup_actx
+        return np.array([
+            actx.to_numpy(flatten(thaw(actx, ary)))
+            for ary in self.vis_discr.nodes()
+            ])
 
     # {{{ vis sub-element connectivity
 
@@ -148,14 +157,15 @@ class Visualizer(object):
                 VTK_QUAD, VTK_HEXAHEDRON)
 
         subel_nr_base = 0
+        node_nr_base = 0
 
         for group in self.vis_discr.groups:
             if isinstance(group.mesh_el_group, SimplexElementGroup):
                 node_tuples = list(gnitstam(group.order, group.dim))
 
-                from modepy.tools import submesh
+                from modepy.tools import simplex_submesh
                 el_connectivity = np.array(
-                        submesh(node_tuples),
+                        simplex_submesh(node_tuples),
                         dtype=np.intp)
                 vtk_cell_type = {
                         1: VTK_LINE,
@@ -202,10 +212,10 @@ class Visualizer(object):
                 raise NotImplementedError("visualization for element groups "
                         "of type '%s'" % type(group.mesh_el_group).__name__)
 
-            assert len(node_tuples) == group.nunit_nodes
+            assert len(node_tuples) == group.nunit_dofs
             vis_connectivity = (
-                    group.node_nr_base + np.arange(
-                        0, group.nelements*group.nunit_nodes, group.nunit_nodes
+                    node_nr_base + np.arange(
+                        0, group.nelements*group.nunit_dofs, group.nunit_dofs
                         )[:, np.newaxis, np.newaxis]
                     + el_connectivity).astype(np.intp)
 
@@ -216,6 +226,7 @@ class Visualizer(object):
             result.append(vgrp)
 
             subel_nr_base += vgrp.nsubelements
+            node_nr_base += group.ndofs
 
         return result
 
@@ -228,10 +239,8 @@ class Visualizer(object):
 
         do_show = kwargs.pop("do_show", True)
 
-        with cl.CommandQueue(self.vis_discr.cl_context) as queue:
-            nodes = self.vis_discr.nodes().with_queue(queue).get()
-
-            field = self._resample_and_get(queue, field)
+        nodes = self._vis_nodes()
+        field = self._resample_to_numpy(field)
 
         assert nodes.shape[0] == self.vis_discr.ambient_dim
         #mlab.points3d(nodes[0], nodes[1], 0*nodes[0])
@@ -286,12 +295,10 @@ class Visualizer(object):
                 AppendedDataXMLGenerator,
                 VF_LIST_OF_COMPONENTS)
 
-        with cl.CommandQueue(self.vis_discr.cl_context) as queue:
-            nodes = self.vis_discr.nodes().with_queue(queue).get()
-
-            names_and_fields = [
-                    (name, self._resample_and_get(queue, fld))
-                    for name, fld in names_and_fields]
+        nodes = self._vis_nodes()
+        names_and_fields = [
+                (name, self._resample_to_numpy(fld))
+                for name, fld in names_and_fields]
 
         vc_groups = self._vis_connectivity()
 
@@ -318,8 +325,12 @@ class Visualizer(object):
                         + (1-self.element_shrink_factor)
                         * el_centers[:, :, np.newaxis])
 
+        if len(self.vis_discr.groups) != 1:
+            raise NotImplementedError("visualization with multiple "
+                    "element groups")
+
         grid = UnstructuredGrid(
-                (self.vis_discr.nnodes,
+                (sum(grp.ndofs for grp in self.vis_discr.groups),
                     DataArray("points",
                         nodes.reshape(self.vis_discr.ambient_dim, -1),
                         vector_format=VF_LIST_OF_COMPONENTS)),
@@ -362,10 +373,8 @@ class Visualizer(object):
         vmax = kwargs.pop("vmax", None)
         norm = kwargs.pop("norm", None)
 
-        with cl.CommandQueue(self.vis_discr.cl_context) as queue:
-            nodes = self.vis_discr.nodes().with_queue(queue).get()
-
-            field = self._resample_and_get(queue, field)
+        nodes = self._vis_nodes()
+        field = self._resample_to_numpy(field)
 
         assert nodes.shape[0] == self.vis_discr.ambient_dim
 
@@ -420,14 +429,14 @@ class Visualizer(object):
     # }}}
 
 
-def make_visualizer(queue, discr, vis_order, element_shrink_factor=None):
+def make_visualizer(actx, discr, vis_order, element_shrink_factor=None):
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import (
             PolynomialWarpAndBlendElementGroup,
             LegendreGaussLobattoTensorProductElementGroup,
             OrderAndTypeBasedGroupFactory)
     vis_discr = Discretization(
-            discr.cl_context, discr.mesh,
+            actx, discr.mesh,
             OrderAndTypeBasedGroupFactory(
                 vis_order,
                 simplex_group_class=PolynomialWarpAndBlendElementGroup,
@@ -438,7 +447,7 @@ def make_visualizer(queue, discr, vis_order, element_shrink_factor=None):
             make_same_mesh_connection
 
     return Visualizer(
-            make_same_mesh_connection(vis_discr, discr),
+            make_same_mesh_connection(actx, vis_discr, discr),
             element_shrink_factor=element_shrink_factor)
 
 # }}}
@@ -453,16 +462,18 @@ def draw_curve(discr):
     plt.plot(mesh.vertices[0], mesh.vertices[1], "o")
 
     color = plt.cm.rainbow(np.linspace(0, 1, len(discr.groups)))
-    with cl.CommandQueue(discr.cl_context) as queue:
-        for i, group in enumerate(discr.groups):
-            group_nodes = group.view(discr.nodes()).get(queue=queue)
-            artist_handles = plt.plot(
-                    group_nodes[0].T,
-                    group_nodes[1].T, "-x",
-                    color=color[i])
-
-            if artist_handles:
-                artist_handles[0].set_label("Group %d" % i)
+    for igrp, group in enumerate(discr.groups):
+        group_nodes = np.array([
+            discr._setup_actx.to_numpy(discr.nodes()[iaxis][igrp])
+            for iaxis in range(discr.ambient_dim)
+            ])
+        artist_handles = plt.plot(
+                group_nodes[0].T,
+                group_nodes[1].T, "-x",
+                color=color[igrp])
+
+        if artist_handles:
+            artist_handles[0].set_label("Group %d" % igrp)
 
 # }}}
 
diff --git a/meshmode/distributed.py b/meshmode/distributed.py
index 54135fb061097ed8064b696b1b4b778e70dd02c4..bbe24bb0511b072a89192f2703c2e232f7dfcf7c 100644
--- a/meshmode/distributed.py
+++ b/meshmode/distributed.py
@@ -128,15 +128,14 @@ class MPIBoundaryCommSetupHelper(object):
     .. automethod:: __call__
     .. automethod:: is_ready
     """
-    def __init__(self, mpi_comm, queue, local_bdry_conn, i_remote_part,
+    def __init__(self, mpi_comm, actx, local_bdry_conn, i_remote_part,
             bdry_grp_factory):
         """
         :arg mpi_comm: A :class:`MPI.Intracomm`
-        :arg queue:
         :arg i_remote_part: The part number of the remote partition
         """
         self.mpi_comm = mpi_comm
-        self.queue = queue
+        self.array_context = actx
         self.i_local_part = mpi_comm.Get_rank()
         self.i_remote_part = i_remote_part
         self.local_bdry_conn = local_bdry_conn
@@ -151,9 +150,11 @@ class MPIBoundaryCommSetupHelper(object):
                          for i in range(len(local_mesh.groups))]
         local_to_elem_faces = [[batch.to_element_face for batch in grp_batches]
                                 for grp_batches in local_batches]
-        local_to_elem_indices = [[batch.to_element_indices.get(queue=self.queue)
-                                        for batch in grp_batches]
-                                    for grp_batches in local_batches]
+        local_to_elem_indices = [
+                [
+                    self.array_context.to_numpy(batch.to_element_indices)
+                    for batch in grp_batches]
+                for grp_batches in local_batches]
 
         local_data = {'bdry_mesh': local_bdry.mesh,
                       'adj': local_adj_groups,
@@ -190,7 +191,7 @@ class MPIBoundaryCommSetupHelper(object):
 
         from meshmode.discretization import Discretization
         remote_bdry_mesh = remote_data['bdry_mesh']
-        remote_bdry = Discretization(self.queue.context, remote_bdry_mesh,
+        remote_bdry = Discretization(self.array_context, remote_bdry_mesh,
                                      self.bdry_grp_factory)
         remote_adj_groups = remote_data['adj']
         remote_to_elem_faces = remote_data['to_elem_faces']
@@ -198,12 +199,11 @@ class MPIBoundaryCommSetupHelper(object):
 
         # Connect local_mesh to remote_mesh
         from meshmode.discretization.connection import make_partition_connection
-        remote_to_local_bdry_conn = make_partition_connection(self.local_bdry_conn,
-                                                              self.i_local_part,
-                                                              remote_bdry,
-                                                              remote_adj_groups,
-                                                              remote_to_elem_faces,
-                                                              remote_to_elem_indices)
+        remote_to_local_bdry_conn = make_partition_connection(
+                self.array_context, self.local_bdry_conn, self.i_local_part,
+                remote_bdry, remote_adj_groups, remote_to_elem_faces,
+                remote_to_elem_indices)
+
         self.send_req.wait()
         return remote_to_local_bdry_conn
 
diff --git a/meshmode/dof_array.py b/meshmode/dof_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..f74f2090672c4f94a13426f05cd71a2cfe583297
--- /dev/null
+++ b/meshmode/dof_array.py
@@ -0,0 +1,266 @@
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+from typing import Optional, Iterable, TYPE_CHECKING, Any
+from functools import partial
+
+from pytools import single_valued, memoize_in
+from pytools.obj_array import obj_array_vectorize
+
+from meshmode.array_context import ArrayContext, make_loopy_program
+
+if TYPE_CHECKING:
+    from meshmode.discretization import Discretization as _Discretization
+
+
+__doc__ = """
+.. autoclass:: DOFArray
+.. autofunction:: thaw
+.. autofunction:: freeze
+.. autofunction:: flatten
+.. autofunction:: unflatten
+.. autofunction:: flat_norm
+"""
+
+
+# {{{ DOFArray
+
+class DOFArray(np.ndarray):
+    """This array type is a subclass of :class:`numpy.ndarray` intended to hold
+    degree-of-freedom arrays for use with
+    :class:`~meshmode.discretization.Discretization`,
+    with one entry in the :class:`DOFArray` for each
+    :class:`~meshmode.discretization.ElementGroupBase`.
+    The arrays contained within a :class:`DOFArray`
+    are expected to be logically two-dimensional, with shape
+    ``(nelements, ndofs_per_element)``, where ``nelements`` is the same as
+    :attr:`~meshmode.discretization.ElementGroupBase.nelements`
+    of the associated group.
+    ``ndofs_per_element`` is typically, but not necessarily, the same as
+    :attr:`~meshmode.discretization.ElementGroupBase.nunit_dofs`
+    of the associated group.
+    This array is derived from :class:`numpy.ndarray` with dtype object ("an
+    object array").  The entries in this array are further arrays managed by
+    :attr:`array_context`.
+
+    One main purpose of this class is to describe the data structure,
+    i.e. when a :class:`DOFArray` occurs inside of further numpy object array,
+    the level representing the array of element groups can be recognized (by
+    people and programs).
+
+    .. attribute:: array_context
+
+        An :class:`meshmode.array_context.ArrayContext`.
+
+    .. attribute:: entry_dtype
+
+        The (assumed uniform) :class:`numpy.dtype` of the group arrays
+        contained in this instance.
+
+    .. automethod:: from_list
+    """
+
+    # Follows https://numpy.org/devdocs/user/basics.subclassing.html
+
+    def __new__(cls, actx: Optional[ArrayContext], input_array):
+        if not (actx is None or isinstance(actx, ArrayContext)):
+            raise TypeError("actx must be of type ArrayContext")
+
+        result = np.asarray(input_array).view(cls)
+        if len(result.shape) != 1:
+            raise ValueError("DOFArray instances must have one-dimensional "
+                    "shape, with one entry per element group")
+
+        result.array_context = actx
+        return result
+
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        self.array_context = getattr(obj, "array_context", None)
+
+    @property
+    def entry_dtype(self):
+        return single_valued(subary.dtype for subary in self.flat)
+
+    @classmethod
+    def from_list(cls, actx: Optional[ArrayContext], res_list) -> "DOFArray":
+        r"""Create a :class:`DOFArray` from a list of arrays
+        (one per :class:`~meshmode.discretization.ElementGroupBase`).
+
+        :arg actx: If *None*, the arrays in *res_list* must be
+            :meth:`~meshmode.array_context.ArrayContext.thaw`\ ed.
+        """
+        if not (actx is None or isinstance(actx, ArrayContext)):
+            raise TypeError("actx must be of type ArrayContext")
+
+        ngroups = len(res_list)
+
+        result = np.empty(ngroups, dtype=object).view(cls)
+        result.array_context = actx
+
+        # 'result[:] = res_list' may look tempting, however:
+        # https://github.com/numpy/numpy/issues/16564
+        for igrp in range(ngroups):
+            result[igrp] = res_list[igrp]
+
+        return result
+
+# }}}
+
+
+def thaw(actx: ArrayContext, ary: np.ndarray) -> np.ndarray:
+    r"""Call :meth:`~meshmode.array_context.ArrayContext.thaw` on the element
+    group arrays making up the :class:`DOFArray`, using *actx*.
+
+    Vectorizes over object arrays of :class:`DOFArray`\ s.
+    """
+    if (isinstance(ary, np.ndarray)
+            and ary.dtype.char == "O"
+            and not isinstance(ary, DOFArray)):
+        return obj_array_vectorize(partial(thaw, actx), ary)
+
+    if ary.array_context is not None:
+        raise ValueError("DOFArray passed to thaw is not frozen")
+
+    return DOFArray.from_list(actx, [
+        actx.thaw(subary)
+        for subary in ary
+        ])
+
+
+def freeze(ary: np.ndarray) -> np.ndarray:
+    r"""Call :meth:`~meshmode.array_context.ArrayContext.freeze` on the element
+    group arrays making up the :class:`DOFArray`, using the
+    :class:`~meshmode.array_context.ArrayContext` in *ary*.
+
+    Vectorizes over object arrays of :class:`DOFArray`\ s.
+    """
+    if (isinstance(ary, np.ndarray)
+            and ary.dtype.char == "O"
+            and not isinstance(ary, DOFArray)):
+        return obj_array_vectorize(freeze, ary)
+
+    if ary.array_context is None:
+        raise ValueError("DOFArray passed to freeze is already frozen")
+
+    return DOFArray.from_list(None, [
+        ary.array_context.freeze(subary)
+        for subary in ary
+        ])
+
+
+def flatten(ary: np.ndarray) -> Any:
+    r"""Convert a :class:`DOFArray` into a "flat" array of degrees of freedom,
+    where the resulting type of the array is given by the
+    :attr:`DOFArray.array_context`.
+
+    Array elements are laid out contiguously, with the element group
+    index varying slowest, element index next, and intra-element DOF
+    index fastest.
+
+    Vectorizes over object arrays of :class:`DOFArray`\ s.
+    """
+    if (isinstance(ary, np.ndarray)
+            and ary.dtype.char == "O"
+            and not isinstance(ary, DOFArray)):
+        return obj_array_vectorize(flatten, ary)
+
+    group_sizes = [grp_ary.shape[0] * grp_ary.shape[1] for grp_ary in ary]
+    group_starts = np.cumsum([0] + group_sizes)
+
+    actx = ary.array_context
+
+    @memoize_in(actx, (flatten, "flatten_prg"))
+    def prg():
+        return make_loopy_program(
+            "{[iel,idof]: 0<=iel<nelements and 0<=idof<ndofs_per_element}",
+            """result[grp_start + iel*ndofs_per_element + idof] \
+                = grp_ary[iel, idof]""",
+            name="flatten")
+
+    result = actx.empty(group_starts[-1], dtype=ary.entry_dtype)
+
+    for grp_start, grp_ary in zip(group_starts, ary):
+        actx.call_loopy(prg(), grp_ary=grp_ary, result=result, grp_start=grp_start)
+
+    return result
+
+
+def unflatten(actx: ArrayContext, discr: "_Discretization", ary,
+        ndofs_per_element_per_group: Optional[Iterable[int]] = None) -> np.ndarray:
+    r"""Convert a 'flat' array returned by :func:`flatten` back to a :class:`DOFArray`.
+
+    Vectorizes over object arrays of :class:`DOFArray`\ s.
+    """
+    if (isinstance(ary, np.ndarray)
+            and ary.dtype.char == "O"
+            and not isinstance(ary, DOFArray)):
+        return obj_array_vectorize(
+                lambda subary: unflatten(
+                    actx, discr, subary, ndofs_per_element_per_group),
+                ary)
+
+    @memoize_in(actx, (unflatten, "unflatten_prg"))
+    def prg():
+        return make_loopy_program(
+            "{[iel,idof]: 0<=iel<nelements and 0<=idof<ndofs_per_element}",
+            "result[iel, idof] = ary[grp_start + iel*ndofs_per_element + idof]",
+            name="unflatten")
+
+    if ndofs_per_element_per_group is None:
+        ndofs_per_element_per_group = [
+                grp.nunit_dofs for grp in discr.groups]
+
+    group_sizes = [
+            grp.nelements * ndofs_per_element
+            for grp, ndofs_per_element
+            in zip(discr.groups, ndofs_per_element_per_group)]
+
+    if ary.size != sum(group_sizes):
+        raise ValueError("array has size %d, expected %d"
+                % (ary.size, sum(group_sizes)))
+
+    group_starts = np.cumsum([0] + group_sizes)
+
+    return DOFArray.from_list(actx, [
+        actx.call_loopy(
+            prg(),
+            grp_start=grp_start, ary=ary,
+            nelements=grp.nelements,
+            ndofs_per_element=ndofs_per_element,
+            )["result"]
+        for grp_start, grp, ndofs_per_element in zip(
+            group_starts,
+            discr.groups,
+            ndofs_per_element_per_group)])
+
+
+def flat_norm(ary: DOFArray, ord=2):
+    # FIXME This could be done without flattening and copying
+    actx = ary.array_context
+    import numpy.linalg as la
+    return la.norm(actx.to_numpy(flatten(ary)), ord)
+
+
+# vim: foldmethod=marker
diff --git a/meshmode/mesh/generation.py b/meshmode/mesh/generation.py
index 4fa520bc4d7a62a4dfb27f55025ff9d906b52a0f..c524292c1fffd741951b5838729014ac38db64b5 100644
--- a/meshmode/mesh/generation.py
+++ b/meshmode/mesh/generation.py
@@ -321,6 +321,9 @@ def make_group_from_vertices(vertices, vertex_indices, order,
         group_factory = SimplexElementGroup
 
     if issubclass(group_factory, SimplexElementGroup):
+        if order < 1:
+            raise ValueError("can't represent simplices with mesh order < 1")
+
         el_origins = el_vertices[:, :, 0][:, :, np.newaxis]
         # ambient_dim, nelements, nspan_vectors
         spanning_vectors = (
@@ -811,10 +814,9 @@ def generate_box_mesh(axis_coords, order=1, coord_dtype=np.float64,
     axes = ["x", "y", "z", "w"]
 
     if boundary_tags:
-        from pytools import indices_in_shape
         vert_index_to_tuple = {
                 vertex_indices[itup]: itup
-                for itup in indices_in_shape(shape)}
+                for itup in np.ndindex(shape)}
 
     for tag_idx, tag in enumerate(boundary_tags):
         # Need to map the correct face vertices to the boundary tags
diff --git a/meshmode/version.py b/meshmode/version.py
index 5cea8857d9550183c2caabd0948032faac48e0ae..82ea6fb128de4adc8cddd1928ee3eb063bd70aa0 100644
--- a/meshmode/version.py
+++ b/meshmode/version.py
@@ -1,2 +1,2 @@
-VERSION = (2020, 1)
+VERSION = (2020, 2)
 VERSION_TEXT = ".".join(str(i) for i in VERSION)
diff --git a/requirements.txt b/requirements.txt
index 62920342df6e2e29d0772be0b2e7cc705f87a214..818dd86c6ca8ad96a2ce29de55edbcc1f2fcdcba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
+git+https://gitlab.tiker.net/inducer/pytools.git
 git+https://gitlab.tiker.net/inducer/gmsh_interop.git
 git+https://gitlab.tiker.net/inducer/modepy.git
 git+https://gitlab.tiker.net/inducer/pyopencl.git
@@ -14,7 +15,7 @@ git+https://gitlab.tiker.net/inducer/loopy.git
 # more pytential dependencies
 git+https://gitlab.tiker.net/inducer/boxtree.git
 git+https://gitlab.tiker.net/inducer/sumpy.git
-git+https://gitlab.tiker.net/inducer/pytential.git
+git+https://gitlab.tiker.net/inducer/pytential.git@array-context
 
 # requires pymetis for tests for partition_mesh
 git+https://gitlab.tiker.net/inducer/pymetis.git
diff --git a/setup.py b/setup.py
index 55a523ca5cf961abd0c8bfb9b70c35f0898b0b5e..810d2560b674251f4b6b72d617e4fba74f38c692 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,9 @@ def main():
               'Natural Language :: English',
               'Programming Language :: Python',
               'Programming Language :: Python :: 3',
+              'Programming Language :: Python :: 3.6',
+              'Programming Language :: Python :: 3.7',
+              'Programming Language :: Python :: 3.8',
               'Topic :: Scientific/Engineering',
               'Topic :: Scientific/Engineering :: Information Analysis',
               'Topic :: Scientific/Engineering :: Mathematics',
@@ -42,7 +45,7 @@ def main():
               "modepy",
               "gmsh_interop",
               "six",
-              "pytools>=2018.4",
+              "pytools>=2020.3",
               "pytest>=2.3",
               "loo.py>=2014.1",
               ],
diff --git a/test/test_chained.py b/test/test_chained.py
index 21ebbda4f202b5175f3d04e5c4eae2c95027b759..51c1e12c494f32af90e8a7fa98724188a04bc2ca 100644
--- a/test/test_chained.py
+++ b/test/test_chained.py
@@ -23,27 +23,25 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import numpy.linalg as la
 
 import pyopencl as cl
-import pyopencl.array  # noqa
-import pyopencl.clmath  # noqa
 
 import pytest
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw, flat_norm, flatten
+
 import logging
 logger = logging.getLogger(__name__)
 
 
-def create_discretization(queue, ndim,
+def create_discretization(actx, ndim,
                           nelements=42,
                           mesh_name=None,
                           order=4):
-    ctx = queue.context
-
     # construct mesh
     if ndim == 2:
         from functools import partial
@@ -80,13 +78,13 @@ def create_discretization(queue, ndim,
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import \
             InterpolatoryQuadratureSimplexGroupFactory
-    discr = Discretization(ctx, mesh,
+    discr = Discretization(actx, mesh,
             InterpolatoryQuadratureSimplexGroupFactory(order))
 
     return discr
 
 
-def create_refined_connection(queue, discr, threshold=0.3):
+def create_refined_connection(actx, discr, threshold=0.3):
     from meshmode.mesh.refinement import RefinerWithoutAdjacency
     from meshmode.discretization.connection import make_refinement_connection
     from meshmode.discretization.poly_element import \
@@ -97,20 +95,20 @@ def create_refined_connection(queue, discr, threshold=0.3):
     refiner.refine(flags)
 
     discr_order = discr.groups[0].order
-    connection = make_refinement_connection(refiner, discr,
+    connection = make_refinement_connection(actx, refiner, discr,
             InterpolatoryQuadratureSimplexGroupFactory(discr_order))
 
     return connection
 
 
-def create_face_connection(queue, discr):
+def create_face_connection(actx, discr):
     from meshmode.discretization.connection import FACE_RESTR_ALL
     from meshmode.discretization.connection import make_face_restriction
     from meshmode.discretization.poly_element import \
             InterpolatoryQuadratureSimplexGroupFactory
 
     discr_order = discr.groups[0].order
-    connection = make_face_restriction(discr,
+    connection = make_face_restriction(actx, discr,
             InterpolatoryQuadratureSimplexGroupFactory(discr_order),
             FACE_RESTR_ALL,
             per_face_groups=True)
@@ -126,19 +124,20 @@ def test_chained_batch_table(ctx_factory, ndim, visualize=False):
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = create_discretization(queue, ndim)
+    discr = create_discretization(actx, ndim)
     connections = []
-    conn = create_refined_connection(queue, discr)
+    conn = create_refined_connection(actx, discr)
     connections.append(conn)
-    conn = create_refined_connection(queue, conn.to_discr)
+    conn = create_refined_connection(actx, conn.to_discr)
     connections.append(conn)
 
     from meshmode.discretization.connection import ChainedDiscretizationConnection
     chained = ChainedDiscretizationConnection(connections)
 
     conn = chained.connections[0]
-    el_table = _build_element_lookup_table(queue, conn)
+    el_table = _build_element_lookup_table(actx, conn)
     for igrp, grp in enumerate(conn.groups):
         for ibatch, batch in enumerate(grp.batches):
             ifrom = batch.from_element_indices.get(queue)
@@ -156,15 +155,15 @@ def test_chained_new_group_table(ctx_factory, ndim, visualize=False):
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = create_discretization(queue, ndim,
+    discr = create_discretization(actx, ndim,
                                   nelements=8,
-                                  mesh_order=2,
-                                  discr_order=2)
+                                  order=2)
     connections = []
-    conn = create_refined_connection(queue, discr)
+    conn = create_refined_connection(actx, discr)
     connections.append(conn)
-    conn = create_refined_connection(queue, conn.to_discr)
+    conn = create_refined_connection(actx, conn.to_discr)
     connections.append(conn)
 
     grp_to_grp, grp_info = _build_new_group_table(connections[0],
@@ -202,13 +201,13 @@ def test_chained_new_group_table(ctx_factory, ndim, visualize=False):
 def test_chained_connection(ctx_factory, ndim, visualize=False):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = create_discretization(queue, ndim,
-                                  nelements=10)
+    discr = create_discretization(actx, ndim, nelements=10)
     connections = []
-    conn = create_refined_connection(queue, discr, threshold=np.inf)
+    conn = create_refined_connection(actx, discr, threshold=np.inf)
     connections.append(conn)
-    conn = create_refined_connection(queue, conn.to_discr, threshold=np.inf)
+    conn = create_refined_connection(actx, conn.to_discr, threshold=np.inf)
     connections.append(conn)
 
     from meshmode.discretization.connection import \
@@ -216,15 +215,15 @@ def test_chained_connection(ctx_factory, ndim, visualize=False):
     chained = ChainedDiscretizationConnection(connections)
 
     def f(x):
-        from six.moves import reduce
-        return 0.1 * reduce(lambda x, y: x * cl.clmath.sin(5 * y), x)
+        from functools import reduce
+        return 0.1 * reduce(lambda x, y: x * actx.np.sin(5 * y), x)
 
-    x = connections[0].from_discr.nodes().with_queue(queue)
+    x = thaw(actx, connections[0].from_discr.nodes())
     fx = f(x)
-    f1 = chained(queue, fx).get(queue)
-    f2 = connections[1](queue, connections[0](queue, fx)).get(queue)
+    f1 = chained(fx)
+    f2 = connections[1](connections[0](fx))
 
-    assert np.allclose(f1, f2)
+    assert flat_norm(f1-f2, np.inf) / flat_norm(f2) < 1e-11
 
 
 @pytest.mark.skip(reason='slow test')
@@ -235,12 +234,13 @@ def test_chained_full_resample_matrix(ctx_factory, ndim, visualize=False):
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = create_discretization(queue, ndim)
+    discr = create_discretization(actx, ndim)
     connections = []
-    conn = create_refined_connection(queue, discr)
+    conn = create_refined_connection(actx, discr)
     connections.append(conn)
-    conn = create_refined_connection(queue, conn.to_discr)
+    conn = create_refined_connection(actx, conn.to_discr)
     connections.append(conn)
 
     from meshmode.discretization.connection import \
@@ -249,15 +249,15 @@ def test_chained_full_resample_matrix(ctx_factory, ndim, visualize=False):
 
     def f(x):
         from six.moves import reduce
-        return 0.1 * reduce(lambda x, y: x * cl.clmath.sin(5 * y), x)
+        return 0.1 * reduce(lambda x, y: x * actx.np.sin(5 * y), x)
 
-    resample_mat = make_full_resample_matrix(queue, chained).get(queue)
+    resample_mat = actx.to_numpy(make_full_resample_matrix(actx, chained))
 
-    x = connections[0].from_discr.nodes().with_queue(queue)
+    x = thaw(actx, connections[0].from_discr.nodes())
     fx = f(x)
-    f1 = np.dot(resample_mat, fx.get(queue))
-    f2 = chained(queue, fx).get(queue)
-    f3 = connections[1](queue, connections[0](queue, fx)).get(queue)
+    f1 = resample_mat @ actx.to_numpy(flatten(fx))
+    f2 = actx.to_numpy(flatten(chained(fx)))
+    f3 = actx.to_numpy(flatten(connections[1](connections[0](fx))))
 
     assert np.allclose(f1, f2)
     assert np.allclose(f2, f3)
@@ -273,25 +273,26 @@ def test_chained_to_direct(ctx_factory, ndim, chain_type,
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
-    discr = create_discretization(queue, ndim, nelements=nelements)
+    discr = create_discretization(actx, ndim, nelements=nelements)
     connections = []
     if chain_type == 1:
-        conn = create_refined_connection(queue, discr)
+        conn = create_refined_connection(actx, discr)
         connections.append(conn)
-        conn = create_refined_connection(queue, conn.to_discr)
+        conn = create_refined_connection(actx, conn.to_discr)
         connections.append(conn)
     elif chain_type == 2:
-        conn = create_refined_connection(queue, discr)
+        conn = create_refined_connection(actx, discr)
         connections.append(conn)
-        conn = create_refined_connection(queue, conn.to_discr)
+        conn = create_refined_connection(actx, conn.to_discr)
         connections.append(conn)
-        conn = create_refined_connection(queue, conn.to_discr)
+        conn = create_refined_connection(actx, conn.to_discr)
         connections.append(conn)
     elif chain_type == 3 and ndim == 3:
-        conn = create_refined_connection(queue, discr, threshold=np.inf)
+        conn = create_refined_connection(actx, discr, threshold=np.inf)
         connections.append(conn)
-        conn = create_face_connection(queue, conn.to_discr)
+        conn = create_face_connection(actx, conn.to_discr)
         connections.append(conn)
     else:
         raise ValueError('unknown test case')
@@ -301,7 +302,7 @@ def test_chained_to_direct(ctx_factory, ndim, chain_type,
     chained = ChainedDiscretizationConnection(connections)
 
     t_start = time.time()
-    direct = flatten_chained_connection(queue, chained)
+    direct = flatten_chained_connection(actx, chained)
     t_end = time.time()
     if visualize:
         print('[TIME] Flatten: {:.5e}'.format(t_end - t_start))
@@ -317,19 +318,19 @@ def test_chained_to_direct(ctx_factory, ndim, chain_type,
 
     def f(x):
         from six.moves import reduce
-        return 0.1 * reduce(lambda x, y: x * cl.clmath.sin(5 * y), x)
+        return 0.1 * reduce(lambda x, y: x * actx.np.sin(5 * y), x)
 
-    x = connections[0].from_discr.nodes().with_queue(queue)
+    x = thaw(actx, connections[0].from_discr.nodes())
     fx = f(x)
 
     t_start = time.time()
-    f1 = direct(queue, fx).get(queue)
+    f1 = actx.to_numpy(flatten(direct(fx)))
     t_end = time.time()
     if visualize:
         print('[TIME] Direct: {:.5e}'.format(t_end - t_start))
 
     t_start = time.time()
-    f2 = chained(queue, fx).get(queue)
+    f2 = actx.to_numpy(flatten(chained(fx)))
     t_end = time.time()
     if visualize:
         print('[TIME] Chained: {:.5e}'.format(t_end - t_start))
@@ -351,27 +352,28 @@ def test_chained_to_direct(ctx_factory, ndim, chain_type,
 @pytest.mark.parametrize(("ndim", "mesh_name"), [
     (2, "starfish"),
     (3, "torus")])
-def test_reversed_chained_connection(ctx_factory, ndim, mesh_name, visualize=False):
+def test_reversed_chained_connection(ctx_factory, ndim, mesh_name):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     def run(nelements, order):
-        discr = create_discretization(queue, ndim,
+        discr = create_discretization(actx, ndim,
             nelements=nelements,
             order=order,
             mesh_name=mesh_name)
 
         threshold = 1.0
         connections = []
-        conn = create_refined_connection(queue,
+        conn = create_refined_connection(actx,
                 discr, threshold=threshold)
         connections.append(conn)
         if ndim == 2:
             # NOTE: additional refinement makes the 3D meshes explode in size
-            conn = create_refined_connection(queue,
+            conn = create_refined_connection(actx,
                     conn.to_discr, threshold=threshold)
             connections.append(conn)
-            conn = create_refined_connection(queue,
+            conn = create_refined_connection(actx,
                     conn.to_discr, threshold=threshold)
             connections.append(conn)
 
@@ -383,21 +385,19 @@ def test_reversed_chained_connection(ctx_factory, ndim, mesh_name, visualize=Fal
         reverse = L2ProjectionInverseDiscretizationConnection(chained)
 
         # create test vector
-        from_nodes = chained.from_discr.nodes().with_queue(queue)
-        to_nodes = chained.to_discr.nodes().with_queue(queue)
+        from_nodes = thaw(actx, chained.from_discr.nodes())
+        to_nodes = thaw(actx, chained.to_discr.nodes())
 
         from_x = 0
         to_x = 0
         for d in range(ndim):
-            from_x += cl.clmath.cos(from_nodes[d]) ** (d + 1)
-            to_x += cl.clmath.cos(to_nodes[d]) ** (d + 1)
-
-        from_interp = reverse(queue, to_x)
+            from_x += actx.np.cos(from_nodes[d]) ** (d + 1)
+            to_x += actx.np.cos(to_nodes[d]) ** (d + 1)
 
-        from_interp = from_interp.get(queue)
-        from_x = from_x.get(queue)
+        from_interp = reverse(to_x)
 
-        return 1.0 / nelements, la.norm(from_interp - from_x) / la.norm(from_x)
+        return (1.0 / nelements,
+                flat_norm(from_interp - from_x, np.inf) / flat_norm(from_x, np.inf))
 
     from pytools.convergence import EOCRecorder
     eoc = EOCRecorder()
diff --git a/test/test_meshmode.py b/test/test_meshmode.py
index 047362d20c1b297e22eed758fbe633bee01a82cc..a888beff918cc0ef1ccc002fa2f9878123999911 100644
--- a/test/test_meshmode.py
+++ b/test/test_meshmode.py
@@ -26,8 +26,6 @@ from six.moves import range
 import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
-import pyopencl.array  # noqa
-import pyopencl.clmath  # noqa
 
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
@@ -40,6 +38,8 @@ from meshmode.discretization.poly_element import (
         PolynomialEquidistantSimplexGroupFactory,
         )
 from meshmode.mesh import Mesh, BTAG_ALL
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw, flat_norm, flatten
 from meshmode.discretization.connection import \
         FACE_RESTR_ALL, FACE_RESTR_INTERIOR
 import meshmode.mesh.generation as mgen
@@ -59,7 +59,8 @@ def test_circle_mesh(do_plot=False):
             FileSource("circle.step"), 2, order=2,
             force_ambient_dim=2,
             other_options=[
-                "-string", "Mesh.CharacteristicLengthMax = 0.05;"]
+                "-string", "Mesh.CharacteristicLengthMax = 0.05;"],
+            target_unit="MM",
             )
     print("END GEN")
     print(mesh.nelements)
@@ -223,6 +224,7 @@ def test_boundary_interpolation(ctx_factory, group_factory, boundary_tag,
         mesh_name, dim, mesh_pars, per_face_groups):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.discretization import Discretization
     from meshmode.discretization.connection import (
@@ -234,7 +236,7 @@ def test_boundary_interpolation(ctx_factory, group_factory, boundary_tag,
     order = 4
 
     def f(x):
-        return 0.1*cl.clmath.sin(30*x)
+        return 0.1*actx.np.sin(30*x)
 
     for mesh_par in mesh_pars:
         # {{{ get mesh
@@ -267,32 +269,31 @@ def test_boundary_interpolation(ctx_factory, group_factory, boundary_tag,
 
         # }}}
 
-        vol_discr = Discretization(cl_ctx, mesh,
-                group_factory(order))
+        vol_discr = Discretization(actx, mesh, group_factory(order))
         print("h=%s -> %d elements" % (
                 h, sum(mgrp.nelements for mgrp in mesh.groups)))
 
-        x = vol_discr.nodes()[0].with_queue(queue)
+        x = thaw(actx, vol_discr.nodes()[0])
         vol_f = f(x)
 
         bdry_connection = make_face_restriction(
-                vol_discr, group_factory(order),
+                actx, vol_discr, group_factory(order),
                 boundary_tag, per_face_groups=per_face_groups)
-        check_connection(bdry_connection)
+        check_connection(actx, bdry_connection)
         bdry_discr = bdry_connection.to_discr
 
-        bdry_x = bdry_discr.nodes()[0].with_queue(queue)
+        bdry_x = thaw(actx, bdry_discr.nodes()[0])
         bdry_f = f(bdry_x)
-        bdry_f_2 = bdry_connection(queue, vol_f)
+        bdry_f_2 = bdry_connection(vol_f)
 
         if mesh_name == "blob" and dim == 2 and mesh.nelements < 500:
-            mat = bdry_connection.full_resample_matrix(queue).get(queue)
-            bdry_f_2_by_mat = mat.dot(vol_f.get())
+            mat = actx.to_numpy(bdry_connection.full_resample_matrix(actx))
+            bdry_f_2_by_mat = mat.dot(actx.to_numpy(flatten(vol_f)))
 
-            mat_error = la.norm(bdry_f_2.get(queue=queue) - bdry_f_2_by_mat)
+            mat_error = la.norm(actx.to_numpy(flatten(bdry_f_2)) - bdry_f_2_by_mat)
             assert mat_error < 1e-14, mat_error
 
-        err = la.norm((bdry_f-bdry_f_2).get(), np.inf)
+        err = flat_norm(bdry_f-bdry_f_2, np.inf)
         eoc_rec.add_data_point(h, err)
 
     order_slack = 0.75 if mesh_name == "blob" else 0.5
@@ -316,6 +317,7 @@ def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
         per_face_groups):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.discretization import Discretization
     from meshmode.discretization.connection import (
@@ -328,7 +330,7 @@ def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
     order = 4
 
     def f(x):
-        return 0.1*cl.clmath.sin(30*x)
+        return 0.1*actx.np.sin(30*x)
 
     for mesh_par in mesh_pars:
         # {{{ get mesh
@@ -344,7 +346,8 @@ def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
                     FileSource("blob-2d.step"), 2, order=order,
                     force_ambient_dim=2,
                     other_options=[
-                        "-string", "Mesh.CharacteristicLengthMax = %s;" % h]
+                        "-string", "Mesh.CharacteristicLengthMax = %s;" % h],
+                    target_unit="MM",
                     )
             print("END GEN")
         elif mesh_name == "warp":
@@ -357,20 +360,20 @@ def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
 
         # }}}
 
-        vol_discr = Discretization(cl_ctx, mesh,
+        vol_discr = Discretization(actx, mesh,
                 PolynomialWarpAndBlendGroupFactory(order))
         print("h=%s -> %d elements" % (
                 h, sum(mgrp.nelements for mgrp in mesh.groups)))
 
         all_face_bdry_connection = make_face_restriction(
-                vol_discr, PolynomialWarpAndBlendGroupFactory(order),
+                actx, vol_discr, PolynomialWarpAndBlendGroupFactory(order),
                 FACE_RESTR_ALL, per_face_groups=per_face_groups)
         all_face_bdry_discr = all_face_bdry_connection.to_discr
 
         for ito_grp, ceg in enumerate(all_face_bdry_connection.groups):
             for ibatch, batch in enumerate(ceg.batches):
                 assert np.array_equal(
-                        batch.from_element_indices.get(queue),
+                        actx.to_numpy(actx.thaw(batch.from_element_indices)),
                         np.arange(vol_discr.mesh.nelements))
 
                 if per_face_groups:
@@ -378,31 +381,31 @@ def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
                 else:
                     assert ibatch == batch.to_element_face
 
-        all_face_x = all_face_bdry_discr.nodes()[0].with_queue(queue)
+        all_face_x = thaw(actx, all_face_bdry_discr.nodes()[0])
         all_face_f = f(all_face_x)
 
-        all_face_f_2 = all_face_bdry_discr.zeros(queue)
+        all_face_f_2 = all_face_bdry_discr.zeros(actx)
 
         for boundary_tag in [
                 BTAG_ALL,
                 FACE_RESTR_INTERIOR,
                 ]:
             bdry_connection = make_face_restriction(
-                    vol_discr, PolynomialWarpAndBlendGroupFactory(order),
+                    actx, vol_discr, PolynomialWarpAndBlendGroupFactory(order),
                     boundary_tag, per_face_groups=per_face_groups)
             bdry_discr = bdry_connection.to_discr
 
-            bdry_x = bdry_discr.nodes()[0].with_queue(queue)
+            bdry_x = thaw(actx, bdry_discr.nodes()[0])
             bdry_f = f(bdry_x)
 
             all_face_embedding = make_face_to_all_faces_embedding(
-                    bdry_connection, all_face_bdry_discr)
+                    actx, bdry_connection, all_face_bdry_discr)
 
-            check_connection(all_face_embedding)
+            check_connection(actx, all_face_embedding)
 
-            all_face_f_2 += all_face_embedding(queue, bdry_f)
+            all_face_f_2 = all_face_f_2 + all_face_embedding(bdry_f)
 
-        err = la.norm((all_face_f-all_face_f_2).get(), np.inf)
+        err = flat_norm(all_face_f-all_face_f_2, np.inf)
         eoc_rec.add_data_point(h, err)
 
     print(eoc_rec)
@@ -431,6 +434,7 @@ def test_opposite_face_interpolation(ctx_factory, group_factory,
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.discretization import Discretization
     from meshmode.discretization.connection import (
@@ -443,7 +447,7 @@ def test_opposite_face_interpolation(ctx_factory, group_factory,
     order = 5
 
     def f(x):
-        return 0.1*cl.clmath.sin(30*x)
+        return 0.1*actx.np.sin(30*x)
 
     for mesh_par in mesh_pars:
         # {{{ get mesh
@@ -467,7 +471,8 @@ def test_opposite_face_interpolation(ctx_factory, group_factory,
                     FileSource("blob-2d.step"), 2, order=order,
                     force_ambient_dim=2,
                     other_options=[
-                        "-string", "Mesh.CharacteristicLengthMax = %s;" % h]
+                        "-string", "Mesh.CharacteristicLengthMax = %s;" % h],
+                    target_unit="MM",
                     )
             print("END GEN")
         elif mesh_name == "warp":
@@ -480,24 +485,24 @@ def test_opposite_face_interpolation(ctx_factory, group_factory,
 
         # }}}
 
-        vol_discr = Discretization(cl_ctx, mesh,
+        vol_discr = Discretization(actx, mesh,
                 group_factory(order))
         print("h=%s -> %d elements" % (
                 h, sum(mgrp.nelements for mgrp in mesh.groups)))
 
         bdry_connection = make_face_restriction(
-                vol_discr, group_factory(order),
+                actx, vol_discr, group_factory(order),
                 FACE_RESTR_INTERIOR)
         bdry_discr = bdry_connection.to_discr
 
-        opp_face = make_opposite_face_connection(bdry_connection)
-        check_connection(opp_face)
+        opp_face = make_opposite_face_connection(actx, bdry_connection)
+        check_connection(actx, opp_face)
 
-        bdry_x = bdry_discr.nodes()[0].with_queue(queue)
+        bdry_x = thaw(actx, bdry_discr.nodes()[0])
         bdry_f = f(bdry_x)
-        bdry_f_2 = opp_face(queue, bdry_f)
+        bdry_f_2 = opp_face(bdry_f)
 
-        err = la.norm((bdry_f-bdry_f_2).get(), np.inf)
+        err = flat_norm(bdry_f-bdry_f_2, np.inf)
         eoc_rec.add_data_point(h, err)
 
     print(eoc_rec)
@@ -518,7 +523,8 @@ def test_element_orientation():
     mesh = generate_gmsh(
             FileSource("blob-2d.step"), 2, order=mesh_order,
             force_ambient_dim=2,
-            other_options=["-string", "Mesh.CharacteristicLengthMax = 0.02;"]
+            other_options=["-string", "Mesh.CharacteristicLengthMax = 0.02;"],
+            target_unit="MM",
             )
 
     from meshmode.mesh.processing import (perform_flips,
@@ -555,13 +561,14 @@ def test_3d_orientation(ctx_factory, what, mesh_gen_func, visualize=False):
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     mesh = mesh_gen_func()
 
     logger.info("%d elements" % mesh.nelements)
 
     from meshmode.discretization import Discretization
-    discr = Discretization(ctx, mesh,
+    discr = Discretization(actx, mesh,
             PolynomialWarpAndBlendGroupFactory(1))
 
     from pytential import bind, sym
@@ -582,17 +589,18 @@ def test_3d_orientation(ctx_factory, what, mesh_gen_func, visualize=False):
         normal_outward_expr = (
                 sym.normal(mesh.ambient_dim) | sym.nodes(mesh.ambient_dim))
 
-    normal_outward_check = bind(discr, normal_outward_expr)(queue).as_scalar() > 0
+    normal_outward_check = actx.to_numpy(
+            flatten(bind(discr, normal_outward_expr)(actx).as_scalar())) > 0
 
-    assert normal_outward_check.get().all(), normal_outward_check.get()
+    assert normal_outward_check.all(), normal_outward_check
 
     # }}}
 
-    normals = bind(discr, sym.normal(mesh.ambient_dim).xproject(1))(queue)
+    normals = bind(discr, sym.normal(mesh.ambient_dim).xproject(1))(actx)
 
     if visualize:
         from meshmode.discretization.visualization import make_visualizer
-        vis = make_visualizer(queue, discr, 1)
+        vis = make_visualizer(actx, discr, 1)
 
         vis.write_vtk_file("normals.vtu", [
             ("normals", normals),
@@ -616,7 +624,8 @@ def test_merge_and_map(ctx_factory, visualize=False):
         mesh = generate_gmsh(
                 FileSource("blob-2d.step"), 2, order=mesh_order,
                 force_ambient_dim=2,
-                other_options=["-string", "Mesh.CharacteristicLengthMax = 0.02;"]
+                other_options=["-string", "Mesh.CharacteristicLengthMax = 0.02;"],
+                target_unit="MM",
                 )
 
         discr_grp_factory = PolynomialWarpAndBlendGroupFactory(3)
@@ -664,6 +673,7 @@ def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
 
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from modepy.tools import unit_vertices
     vertices = unit_vertices(dim).T.copy()
@@ -684,21 +694,20 @@ def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import \
             PolynomialWarpAndBlendGroupFactory
-    vol_discr = Discretization(cl_ctx, mesh,
+    vol_discr = Discretization(actx, mesh,
             PolynomialWarpAndBlendGroupFactory(order+3))
 
     # {{{ volume calculation check
 
-    vol_x = vol_discr.nodes().with_queue(queue)
+    vol_x = thaw(actx, vol_discr.nodes())
 
-    vol_one = vol_x[0].copy()
-    vol_one.fill(1)
+    vol_one = vol_x[0] * 0 + 1
     from pytential import norm, integral  # noqa
 
     from pytools import factorial
     true_vol = 1/factorial(dim) * 2**dim
 
-    comp_vol = integral(vol_discr, queue, vol_one)
+    comp_vol = integral(vol_discr, vol_one)
     rel_vol_err = abs(true_vol - comp_vol) / true_vol
 
     assert rel_vol_err < 1e-12
@@ -709,7 +718,7 @@ def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
 
     from meshmode.discretization.connection import make_face_restriction
     bdry_connection = make_face_restriction(
-            vol_discr, PolynomialWarpAndBlendGroupFactory(order + 3),
+            actx, vol_discr, PolynomialWarpAndBlendGroupFactory(order + 3),
             BTAG_ALL)
     bdry_discr = bdry_connection.to_discr
 
@@ -719,12 +728,12 @@ def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
 
     from meshmode.discretization.visualization import make_visualizer
     #vol_vis = make_visualizer(queue, vol_discr, 4)
-    bdry_vis = make_visualizer(queue, bdry_discr, 4)
+    bdry_vis = make_visualizer(actx, bdry_discr, 4)
 
     # }}}
 
     from pytential import bind, sym
-    bdry_normals = bind(bdry_discr, sym.normal(dim))(queue).as_vector(dtype=object)
+    bdry_normals = bind(bdry_discr, sym.normal(dim))(actx).as_vector(dtype=object)
 
     if visualize:
         bdry_vis.write_vtk_file("boundary.vtu", [
@@ -734,9 +743,10 @@ def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
     normal_outward_check = bind(bdry_discr,
             sym.normal(dim)
             | (sym.nodes(dim) + 0.5*sym.ones_vec(dim)),
-            )(queue).as_scalar() > 0
+            )(actx).as_scalar()
 
-    assert normal_outward_check.get().all(), normal_outward_check.get()
+    normal_outward_check = actx.to_numpy(flatten(normal_outward_check) > 0)
+    assert normal_outward_check.all(), normal_outward_check
 
 # }}}
 
@@ -752,6 +762,7 @@ def test_sanity_qhull_nd(ctx_factory, dim, order):
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from scipy.spatial import Delaunay
     verts = np.random.rand(1000, dim)
@@ -762,28 +773,28 @@ def test_sanity_qhull_nd(ctx_factory, dim, order):
             fix_orientation=True)
 
     from meshmode.discretization import Discretization
-    low_discr = Discretization(ctx, mesh,
+    low_discr = Discretization(actx, mesh,
             PolynomialEquidistantSimplexGroupFactory(order))
-    high_discr = Discretization(ctx, mesh,
+    high_discr = Discretization(actx, mesh,
             PolynomialEquidistantSimplexGroupFactory(order+1))
 
     from meshmode.discretization.connection import make_same_mesh_connection
-    cnx = make_same_mesh_connection(high_discr, low_discr)
+    cnx = make_same_mesh_connection(actx, high_discr, low_discr)
 
     def f(x):
-        return 0.1*cl.clmath.sin(x)
+        return 0.1*actx.np.sin(x)
 
-    x_low = low_discr.nodes()[0].with_queue(queue)
+    x_low = thaw(actx, low_discr.nodes()[0])
     f_low = f(x_low)
 
-    x_high = high_discr.nodes()[0].with_queue(queue)
+    x_high = thaw(actx, high_discr.nodes()[0])
     f_high_ref = f(x_high)
 
-    f_high_num = cnx(queue, f_low)
+    f_high_num = cnx(f_low)
 
-    err = (f_high_ref-f_high_num).get()
-
-    err = la.norm(err, np.inf)/la.norm(f_high_ref.get(), np.inf)
+    err = (
+            flat_norm(f_high_ref-f_high_num, np.inf)
+            / flat_norm(f_high_ref, np.inf))
 
     print(err)
     assert err < 1e-2
@@ -799,14 +810,14 @@ def test_sanity_qhull_nd(ctx_factory, dim, order):
     ("ball-radius-1.step", 3),
     ])
 @pytest.mark.parametrize("mesh_order", [1, 2])
-def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
-        visualize=False):
+def test_sanity_balls(ctx_factory, src_file, dim, mesh_order, visualize=False):
     pytest.importorskip("pytential")
 
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from pytools.convergence import EOCRecorder
     vol_eoc_rec = EOCRecorder()
@@ -822,18 +833,20 @@ def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
         mesh = generate_gmsh(
                 FileSource(src_file), dim, order=mesh_order,
                 other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h],
-                force_ambient_dim=dim)
+                force_ambient_dim=dim,
+                target_unit="MM")
 
         logger.info("%d elements" % mesh.nelements)
 
         # {{{ discretizations and connections
 
         from meshmode.discretization import Discretization
-        vol_discr = Discretization(ctx, mesh,
+        vol_discr = Discretization(actx, mesh,
                 InterpolatoryQuadratureSimplexGroupFactory(quad_order))
 
         from meshmode.discretization.connection import make_face_restriction
         bdry_connection = make_face_restriction(
+                actx,
                 vol_discr,
                 InterpolatoryQuadratureSimplexGroupFactory(quad_order),
                 BTAG_ALL)
@@ -844,8 +857,8 @@ def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
         # {{{ visualizers
 
         from meshmode.discretization.visualization import make_visualizer
-        vol_vis = make_visualizer(queue, vol_discr, 20)
-        bdry_vis = make_visualizer(queue, bdry_discr, 20)
+        vol_vis = make_visualizer(actx, vol_discr, 20)
+        bdry_vis = make_visualizer(actx, bdry_discr, 20)
 
         # }}}
 
@@ -853,27 +866,25 @@ def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
         true_surf = 2*np.pi**(dim/2)/gamma(dim/2)
         true_vol = true_surf/dim
 
-        vol_x = vol_discr.nodes().with_queue(queue)
+        vol_x = thaw(actx, vol_discr.nodes())
 
-        vol_one = vol_x[0].copy()
-        vol_one.fill(1)
+        vol_one = vol_x[0]*0 + 1
         from pytential import norm, integral  # noqa
 
-        comp_vol = integral(vol_discr, queue, vol_one)
+        comp_vol = integral(vol_discr, vol_one)
         rel_vol_err = abs(true_vol - comp_vol) / true_vol
         vol_eoc_rec.add_data_point(h, rel_vol_err)
         print("VOL", true_vol, comp_vol)
 
-        bdry_x = bdry_discr.nodes().with_queue(queue)
+        bdry_x = thaw(actx, bdry_discr.nodes())
 
-        bdry_one_exact = bdry_x[0].copy()
-        bdry_one_exact.fill(1)
+        bdry_one_exact = bdry_x[0] * 0 + 1
 
-        bdry_one = bdry_connection(queue, vol_one).with_queue(queue)
-        intp_err = norm(bdry_discr, queue, bdry_one-bdry_one_exact)
+        bdry_one = bdry_connection(vol_one)
+        intp_err = norm(bdry_discr, bdry_one-bdry_one_exact)
         assert intp_err < 1e-14
 
-        comp_surf = integral(bdry_discr, queue, bdry_one)
+        comp_surf = integral(bdry_discr, bdry_one)
         rel_surf_err = abs(true_surf - comp_surf) / true_surf
         surf_eoc_rec.add_data_point(h, rel_surf_err)
         print("SURF", true_surf, comp_surf)
@@ -884,7 +895,7 @@ def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
                 ("area_el", bind(
                     vol_discr,
                     sym.area_element(mesh.ambient_dim, mesh.ambient_dim))
-                    (queue)),
+                    (actx)),
                 ])
             bdry_vis.write_vtk_file("boundary-h=%g.vtu" % h, [("f", bdry_one)])
 
@@ -892,9 +903,10 @@ def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
 
         normal_outward_check = bind(bdry_discr,
                 sym.normal(mesh.ambient_dim) | sym.nodes(mesh.ambient_dim),
-                )(queue).as_scalar() > 0
+                )(actx).as_scalar()
 
-        assert normal_outward_check.get().all(), normal_outward_check.get()
+        normal_outward_check = actx.to_numpy(flatten(normal_outward_check) > 0)
+        assert normal_outward_check.all(), normal_outward_check
 
         # }}}
 
@@ -1044,6 +1056,7 @@ def test_quad_mesh_2d():
                 """,
                 ["blob-2d.step"]),
             force_ambient_dim=2,
+            target_unit="MM",
             )
     print("END GEN")
     print(mesh.nelements)
@@ -1135,7 +1148,7 @@ def test_quad_multi_element():
 
 # {{{ test_vtk_overwrite
 
-def test_vtk_overwrite(ctx_getter):
+def test_vtk_overwrite(ctx_factory):
     pytest.importorskip("pyvisfile")
 
     def _try_write_vtk(writer, obj):
@@ -1154,8 +1167,9 @@ def test_vtk_overwrite(ctx_getter):
         if os.path.exists(filename):
             os.remove(filename)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     target_order = 7
 
@@ -1166,7 +1180,7 @@ def test_vtk_overwrite(ctx_getter):
     from meshmode.discretization.poly_element import \
             InterpolatoryQuadratureSimplexGroupFactory
     discr = Discretization(
-            queue.context, mesh,
+            actx, mesh,
             InterpolatoryQuadratureSimplexGroupFactory(target_order))
 
     from meshmode.discretization.visualization import make_visualizer
@@ -1174,7 +1188,7 @@ def test_vtk_overwrite(ctx_getter):
             write_nodal_adjacency_vtk_file
     from meshmode.mesh.visualization import write_vertex_vtk_file
 
-    vis = make_visualizer(queue, discr, 1)
+    vis = make_visualizer(actx, discr, 1)
     _try_write_vtk(vis.write_vtk_file, discr)
 
     _try_write_vtk(lambda x, y, **kwargs:
@@ -1197,7 +1211,8 @@ def test_mesh_to_tikz():
             FileSource("../test/blob-2d.step"), 2, order=order,
             force_ambient_dim=2,
             other_options=[
-                "-string", "Mesh.CharacteristicLengthMax = %s;" % h]
+                "-string", "Mesh.CharacteristicLengthMax = %s;" % h],
+            target_unit="MM",
             )
 
     from meshmode.mesh.visualization import mesh_to_tikz
@@ -1227,6 +1242,7 @@ def test_affine_map():
 def test_mesh_without_vertices(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     # create a mesh
     from meshmode.mesh.generation import generate_icosphere
@@ -1246,11 +1262,11 @@ def test_mesh_without_vertices(ctx_factory):
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import \
             InterpolatoryQuadratureSimplexGroupFactory as GroupFactory
-    discr = Discretization(ctx, mesh, GroupFactory(4))
-    discr.nodes().with_queue(queue)
+    discr = Discretization(actx, mesh, GroupFactory(4))
+    thaw(actx, discr.nodes())
 
     from meshmode.discretization.visualization import make_visualizer
-    make_visualizer(queue, discr, 4)
+    make_visualizer(actx, discr, 4)
 
 
 @pytest.mark.parametrize("curve_name", ["ellipse", "arc"])
@@ -1352,6 +1368,7 @@ def test_is_affine_group_check(mesh_name):
 def test_mesh_multiple_groups(ctx_factory, ambient_dim, visualize=False):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     order = 4
 
@@ -1385,16 +1402,16 @@ def test_mesh_multiple_groups(ctx_factory, ambient_dim, visualize=False):
     from meshmode.discretization import Discretization
     from meshmode.discretization.poly_element import \
             PolynomialWarpAndBlendGroupFactory as GroupFactory
-    discr = Discretization(ctx, mesh, GroupFactory(order))
+    discr = Discretization(actx, mesh, GroupFactory(order))
 
     if visualize:
-        group_id = discr.empty(queue, dtype=np.int)
+        group_id = discr.empty(actx, dtype=np.int)
         for igrp, grp in enumerate(discr.groups):
             group_id_view = grp.view(group_id)
             group_id_view.fill(igrp)
 
         from meshmode.discretization.visualization import make_visualizer
-        vis = make_visualizer(queue, discr, vis_order=order)
+        vis = make_visualizer(actx, discr, vis_order=order)
         vis.write_vtk_file("test_mesh_multiple_groups.vtu", [
             ("group_id", group_id)
             ], overwrite=True)
@@ -1406,28 +1423,27 @@ def test_mesh_multiple_groups(ctx_factory, ambient_dim, visualize=False):
             make_opposite_face_connection,
             check_connection)
     for boundary_tag in [BTAG_ALL, FACE_RESTR_INTERIOR, FACE_RESTR_ALL]:
-        conn = make_face_restriction(discr, GroupFactory(order),
+        conn = make_face_restriction(actx, discr, GroupFactory(order),
                 boundary_tag=boundary_tag,
                 per_face_groups=False)
-        check_connection(conn)
+        check_connection(actx, conn)
 
-        bdry_f = conn.to_discr.empty(queue)
-        bdry_f.fill(1.0)
+        bdry_f = conn.to_discr.zeros(actx) + 1
 
         if boundary_tag == FACE_RESTR_INTERIOR:
-            opposite = make_opposite_face_connection(conn)
-            check_connection(opposite)
+            opposite = make_opposite_face_connection(actx, conn)
+            check_connection(actx, opposite)
 
-            op_bdry_f = opposite(queue, bdry_f)
-            error = abs(cl.array.sum(bdry_f - op_bdry_f).get(queue))
+            op_bdry_f = opposite(bdry_f)
+            error = flat_norm(bdry_f - op_bdry_f, np.inf)
             assert error < 1.0e-11, error
 
         if boundary_tag == FACE_RESTR_ALL:
-            embedding = make_face_to_all_faces_embedding(conn, conn.to_discr)
-            check_connection(embedding)
+            embedding = make_face_to_all_faces_embedding(actx, conn, conn.to_discr)
+            check_connection(actx, embedding)
 
-            em_bdry_f = embedding(queue, bdry_f)
-            error = abs(cl.array.sum(bdry_f - em_bdry_f).get(queue))
+            em_bdry_f = embedding(bdry_f)
+            error = flat_norm(bdry_f - em_bdry_f)
             assert error < 1.0e-11, error
 
 
diff --git a/test/test_partition.py b/test/test_partition.py
index 54b9d88923249b553c334a278744e2f51ab14c44..1e8e6c7fb5d556807375dc3e132b4e1b1c3f3a09 100644
--- a/test/test_partition.py
+++ b/test/test_partition.py
@@ -27,10 +27,10 @@ THE SOFTWARE.
 
 from six.moves import range
 import numpy as np
-import numpy.linalg as la
 import pyopencl as cl
-import pyopencl.array  # noqa
-import pyopencl.clmath  # noqa
+
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw, flatten, unflatten, flat_norm
 
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
@@ -69,6 +69,8 @@ def test_partition_interpolation(ctx_factory, dim, mesh_pars,
     group_factory = PolynomialWarpAndBlendGroupFactory
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
+
     order = 4
 
     from pytools.convergence import EOCRecorder
@@ -80,7 +82,7 @@ def test_partition_interpolation(ctx_factory, dim, mesh_pars,
             eoc_rec[i, j] = EOCRecorder()
 
     def f(x):
-        return 10.*cl.clmath.sin(50.*x)
+        return 10.*actx.np.sin(50.*x)
 
     for n in mesh_pars:
         from meshmode.mesh.generation import generate_warped_rect_mesh
@@ -107,7 +109,7 @@ def test_partition_interpolation(ctx_factory, dim, mesh_pars,
             partition_mesh(mesh, part_per_element, i)[0] for i in range(num_parts)]
 
         from meshmode.discretization import Discretization
-        vol_discrs = [Discretization(cl_ctx, part_meshes[i], group_factory(order))
+        vol_discrs = [Discretization(actx, part_meshes[i], group_factory(order))
                         for i in range(num_parts)]
 
         from meshmode.mesh import BTAG_PARTITION
@@ -120,23 +122,26 @@ def test_partition_interpolation(ctx_factory, dim, mesh_pars,
                 continue
 
             # Mark faces within local_mesh that are connected to remote_mesh
-            local_bdry_conn = make_face_restriction(vol_discrs[i_local_part],
+            local_bdry_conn = make_face_restriction(actx, vol_discrs[i_local_part],
                                                     group_factory(order),
                                                     BTAG_PARTITION(i_remote_part))
 
             # If these parts are not connected, don't bother checking the error
-            bdry_nodes = local_bdry_conn.to_discr.nodes()
-            if bdry_nodes.size == 0:
+            bdry_nelements = sum(
+                    grp.nelements for grp in local_bdry_conn.to_discr.groups)
+            if bdry_nelements == 0:
                 eoc_rec[i_local_part, i_remote_part] = None
                 continue
 
             # Mark faces within remote_mesh that are connected to local_mesh
-            remote_bdry_conn = make_face_restriction(vol_discrs[i_remote_part],
+            remote_bdry_conn = make_face_restriction(actx, vol_discrs[i_remote_part],
                                                      group_factory(order),
                                                      BTAG_PARTITION(i_local_part))
 
-            assert bdry_nodes.size == remote_bdry_conn.to_discr.nodes().size, \
-                        "partitions do not have the same number of connected nodes"
+            remote_bdry_nelements = sum(
+                    grp.nelements for grp in remote_bdry_conn.to_discr.groups)
+            assert bdry_nelements == remote_bdry_nelements, \
+                    "partitions do not have the same number of connected elements"
 
             # Gather just enough information for the connection
             local_bdry = local_bdry_conn.to_discr
@@ -166,27 +171,25 @@ def test_partition_interpolation(ctx_factory, dim, mesh_pars,
                                         for grp_batches in remote_batches]
 
             # Connect from remote_mesh to local_mesh
-            remote_to_local_conn = make_partition_connection(local_bdry_conn,
-                                                             i_local_part,
-                                                             remote_bdry,
-                                                             remote_adj_groups,
-                                                             remote_from_elem_faces,
-                                                            remote_from_elem_indices)
+            remote_to_local_conn = make_partition_connection(
+                    actx, local_bdry_conn, i_local_part, remote_bdry,
+                    remote_adj_groups, remote_from_elem_faces,
+                    remote_from_elem_indices)
+
             # Connect from local mesh to remote mesh
-            local_to_remote_conn = make_partition_connection(remote_bdry_conn,
-                                                             i_remote_part,
-                                                             local_bdry,
-                                                             local_adj_groups,
-                                                             local_from_elem_faces,
-                                                             local_from_elem_indices)
-            check_connection(remote_to_local_conn)
-            check_connection(local_to_remote_conn)
-
-            true_local_points = f(local_bdry.nodes()[0].with_queue(queue))
-            remote_points = local_to_remote_conn(queue, true_local_points)
-            local_points = remote_to_local_conn(queue, remote_points)
-
-            err = la.norm((true_local_points - local_points).get(), np.inf)
+            local_to_remote_conn = make_partition_connection(
+                    actx, remote_bdry_conn, i_remote_part, local_bdry,
+                    local_adj_groups, local_from_elem_faces,
+                    local_from_elem_indices)
+
+            check_connection(actx, remote_to_local_conn)
+            check_connection(actx, local_to_remote_conn)
+
+            true_local_points = f(thaw(actx, local_bdry.nodes()[0]))
+            remote_points = local_to_remote_conn(true_local_points)
+            local_points = remote_to_local_conn(remote_points)
+
+            err = flat_norm(true_local_points - local_points, np.inf)
             eoc_rec[i_local_part, i_remote_part].add_data_point(1./n, err)
 
     for (i, j), e in eoc_rec.items():
@@ -359,9 +362,10 @@ def _test_mpi_boundary_swap(dim, order, num_groups):
     group_factory = PolynomialWarpAndBlendGroupFactory(order)
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.discretization import Discretization
-    vol_discr = Discretization(cl_ctx, local_mesh, group_factory)
+    vol_discr = Discretization(actx, local_mesh, group_factory)
 
     from meshmode.distributed import get_connected_partitions
     connected_parts = get_connected_partitions(local_mesh)
@@ -373,11 +377,11 @@ def _test_mpi_boundary_swap(dim, order, num_groups):
     from meshmode.mesh import BTAG_PARTITION
     for i_remote_part in connected_parts:
         local_bdry_conns[i_remote_part] = make_face_restriction(
-                vol_discr, group_factory, BTAG_PARTITION(i_remote_part))
+                actx, vol_discr, group_factory, BTAG_PARTITION(i_remote_part))
 
         setup_helper = bdry_setup_helpers[i_remote_part] = \
                 MPIBoundaryCommSetupHelper(
-                        mpi_comm, queue, local_bdry_conns[i_remote_part],
+                        mpi_comm, actx, local_bdry_conns[i_remote_part],
                         i_remote_part, bdry_grp_factory=group_factory)
 
         setup_helper.post_sends()
@@ -389,14 +393,14 @@ def _test_mpi_boundary_swap(dim, order, num_groups):
             if setup_helper.is_setup_ready():
                 assert bdry_setup_helpers.pop(i_remote_part) is setup_helper
                 conn = setup_helper.complete_setup()
-                check_connection(conn)
+                check_connection(actx, conn)
                 remote_to_local_bdry_conns[i_remote_part] = conn
                 break
 
         # FIXME: Not ideal, busy-waits
 
     _test_data_transfer(mpi_comm,
-                        queue,
+                        actx,
                         local_bdry_conns,
                         remote_to_local_bdry_conns,
                         connected_parts)
@@ -405,12 +409,12 @@ def _test_mpi_boundary_swap(dim, order, num_groups):
 
 
 # TODO
-def _test_data_transfer(mpi_comm, queue, local_bdry_conns,
+def _test_data_transfer(mpi_comm, actx, local_bdry_conns,
                         remote_to_local_bdry_conns, connected_parts):
     from mpi4py import MPI
 
     def f(x):
-        return 10*cl.clmath.sin(20.*x)
+        return 10*actx.np.sin(20.*x)
 
     '''
     Here is a simplified example of what happens from
@@ -435,13 +439,13 @@ def _test_data_transfer(mpi_comm, queue, local_bdry_conns,
     for i_remote_part in connected_parts:
         conn = remote_to_local_bdry_conns[i_remote_part]
         bdry_discr = local_bdry_conns[i_remote_part].to_discr
-        bdry_x = bdry_discr.nodes()[0].with_queue(queue=queue)
+        bdry_x = thaw(actx, bdry_discr.nodes()[0])
 
         true_local_f = f(bdry_x)
-        remote_f = conn(queue, true_local_f)
+        remote_f = conn(true_local_f)
 
         # 2.
-        send_reqs.append(mpi_comm.isend(remote_f.get(queue=queue),
+        send_reqs.append(mpi_comm.isend(actx.to_numpy(flatten(remote_f)),
                                         dest=i_remote_part,
                                         tag=TAG_SEND_REMOTE_NODES))
 
@@ -471,12 +475,9 @@ def _test_data_transfer(mpi_comm, queue, local_bdry_conns,
     send_reqs = []
     for i_remote_part in connected_parts:
         conn = remote_to_local_bdry_conns[i_remote_part]
-        local_f_np = remote_to_local_f_data[i_remote_part]
-        local_f_cl = cl.array.Array(queue,
-                                    shape=local_f_np.shape,
-                                    dtype=local_f_np.dtype)
-        local_f_cl.set(local_f_np)
-        remote_f = conn(queue, local_f_cl).get(queue=queue)
+        local_f = unflatten(actx, conn.from_discr,
+                actx.from_numpy(remote_to_local_f_data[i_remote_part]))
+        remote_f = actx.to_numpy(flatten(conn(local_f)))
 
         # 5.
         send_reqs.append(mpi_comm.isend(remote_f,
@@ -508,9 +509,9 @@ def _test_data_transfer(mpi_comm, queue, local_bdry_conns,
     # 7.
     for i_remote_part in connected_parts:
         bdry_discr = local_bdry_conns[i_remote_part].to_discr
-        bdry_x = bdry_discr.nodes()[0].with_queue(queue=queue)
+        bdry_x = thaw(actx, bdry_discr.nodes()[0])
 
-        true_local_f = f(bdry_x).get(queue=queue)
+        true_local_f = actx.to_numpy(flatten(f(bdry_x)))
         local_f = local_f_data[i_remote_part]
 
         from numpy.linalg import norm
@@ -554,7 +555,7 @@ if __name__ == "__main__":
         if len(sys.argv) > 1:
             exec(sys.argv[1])
         else:
-            from py.test.cmdline import main
+            from pytest import main
             main([__file__])
 
 # vim: fdm=marker
diff --git a/test/test_refinement.py b/test/test_refinement.py
index 2bc29e89266f4e7a5ae824335d719cb259412ea6..94a4fe0879533163953ae831cb4541b68ca2a951 100644
--- a/test/test_refinement.py
+++ b/test/test_refinement.py
@@ -27,12 +27,13 @@ from functools import partial
 
 import pytest
 import pyopencl as cl
-import pyopencl.clmath  # noqa
 
 import numpy as np
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import thaw, flat_norm
 from meshmode.mesh.generation import (  # noqa
         generate_icosahedron, generate_box_mesh, make_curve_mesh, ellipse)
 from meshmode.mesh.refinement.utils import check_nodal_adj_against_geometry
@@ -191,6 +192,7 @@ def test_refinement_connection(
 
     cl_ctx = ctx_getter()
     queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(queue)
 
     from meshmode.discretization import Discretization
     from meshmode.discretization.connection import (
@@ -235,27 +237,27 @@ def test_refinement_connection(
                 factor = 9
 
             for iaxis in range(len(x)):
-                result = result * cl.clmath.sin(factor * (x[iaxis]/mesh_ext[iaxis]))
+                result = result * actx.np.sin(factor * (x[iaxis]/mesh_ext[iaxis]))
 
             return result
 
-        discr = Discretization(cl_ctx, mesh, group_factory(order))
+        discr = Discretization(actx, mesh, group_factory(order))
 
         refiner = refiner_cls(mesh)
         flags = refine_flags(mesh)
         refiner.refine(flags)
 
         connection = make_refinement_connection(
-            refiner, discr, group_factory(order))
-        check_connection(connection)
+            actx, refiner, discr, group_factory(order))
+        check_connection(actx, connection)
 
         fine_discr = connection.to_discr
 
-        x = discr.nodes().with_queue(queue)
-        x_fine = fine_discr.nodes().with_queue(queue)
+        x = thaw(actx, discr.nodes())
+        x_fine = thaw(actx, fine_discr.nodes())
         f_coarse = f(x)
-        f_interp = connection(queue, f_coarse).with_queue(queue)
-        f_true = f(x_fine).with_queue(queue)
+        f_interp = connection(f_coarse)
+        f_true = f(x_fine)
 
         if visualize == "dots":
             import matplotlib.pyplot as plt
@@ -279,8 +281,7 @@ def test_refinement_connection(
                         ("f_true", f_true),
                         ])
 
-        import numpy.linalg as la
-        err = la.norm((f_interp - f_true).get(queue), np.inf)
+        err = flat_norm(f_interp - f_true, np.inf)
         eoc_rec.add_data_point(h, err)
 
     order_slack = 0.5