diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fe686ac3d32eab7b2cc3a943d06c0c2c01d8e66a..a77b24a3ba3e0e21995941ef40e21a9e388777f5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -159,6 +159,7 @@ Docker Image w/t Firedrake:
   script:
   - docker info
   - docker build --no-cache -t xywei/volumential-firedrake:latest -f docker/Dockerfile.firedrake .
+  - bash -c 'if [ "$BRANCH_PROTECTED" = 1 ]; then exit 0; else exit 1; fi'
   - docker login --username "${DOCKERHUB_USERNAME}" --password "${DOCKERHUB_ACCESS_TOKEN}"
   - docker push xywei/volumential-firedrake:latest
   tags:
diff --git a/contrib/meshgen11_dealii/meshgen.cpp b/contrib/meshgen11_dealii/meshgen.cpp
index 90289920ea5f2fbbe3d61a71f08a9356d734f66e..6f3d15dfef97778bc1f9327e5b09603697887aff 100644
--- a/contrib/meshgen11_dealii/meshgen.cpp
+++ b/contrib/meshgen11_dealii/meshgen.cpp
@@ -133,31 +133,10 @@ public:
 
   // q: quad order
   // level: initial (uniform) level
-  MeshGenerator(
-      int q, int level,
-      unsigned int max_n_cells = std::numeric_limits<unsigned int>::max(),
-      unsigned int min_grid_level = std::numeric_limits<unsigned int>::min(),
-      unsigned int max_grid_level = std::numeric_limits<unsigned int>::max())
-      : triangulation(d::Triangulation<dimension>::limit_level_difference_at_vertices), fe(q),
-        dof_handler(triangulation), quadrature_formula(q), box_a(a), box_b(b),
-        max_n_cells(max_n_cells), min_grid_level(min_grid_level),
-        max_grid_level(max_grid_level) {
-    d::GridGenerator::hyper_cube(triangulation, box_a, box_b);
-
-    // initial number of levels must be compatible
-    assert(level > min_grid_level);
-    assert(level <= max_grid_level + 1);
-    assert(std::pow(std::pow(2, dimension), level - 1) <= max_n_cells);
-
-    if (level > 1) {
-      triangulation.refine_global(level - 1);
-    }
-    this->dof_handler.distribute_dofs(fe);
-  }
-
-  // customized bounding box
+  // bounding box: [aa, bb]^dim
   MeshGenerator(
       int q, int level, double aa, double bb,
+      py::args args, py::kwargs kwargs,
       unsigned int max_n_cells = std::numeric_limits<unsigned int>::max(),
       unsigned int min_grid_level = std::numeric_limits<unsigned int>::min(),
       unsigned int max_grid_level = std::numeric_limits<unsigned int>::max())
@@ -616,7 +595,8 @@ template <int dim> py::tuple make_uniform_cubic_grid_details(int q, int level) {
   return result;
 }
 
-py::tuple make_uniform_cubic_grid(int q, int level, int dim) {
+py::tuple make_uniform_cubic_grid(int q, int level, int dim,
+                                  py::args args, py::kwargs kwargs) {
   if (dim == 1) {
     return make_uniform_cubic_grid_details<1>(q, level);
   } else if (dim == 2) {
@@ -640,8 +620,7 @@ PYBIND11_MODULE(meshgen_dealii, m) {
         py::arg("dim") = 2);
 
   py::class_<MeshGen1D>(m, "MeshGen1D")
-      .def(py::init<int, int>(), py::arg("degree"), py::arg("level"))
-      .def(py::init<int, int, double, double>(), py::arg("degree"),
+      .def(py::init<int, int, double, double, py::args, py::kwargs>(), py::arg("degree"),
            py::arg("level"), py::arg("a"), py::arg("b"))
       .def("greet", &MeshGen1D::greet)
       .def("get_q_points", &MeshGen1D::get_q_points)
@@ -668,8 +647,7 @@ PYBIND11_MODULE(meshgen_dealii, m) {
       .def("write_vtu", &MeshGen1D::write_vtu);
 
   py::class_<MeshGen2D>(m, "MeshGen2D")
-      .def(py::init<int, int>(), py::arg("degree"), py::arg("level"))
-      .def(py::init<int, int, double, double>(), py::arg("degree"),
+      .def(py::init<int, int, double, double, py::args, py::kwargs>(), py::arg("degree"),
            py::arg("level"), py::arg("a"), py::arg("b"))
       .def("greet", &MeshGen2D::greet)
       .def("get_q_points", &MeshGen2D::get_q_points)
@@ -696,8 +674,7 @@ PYBIND11_MODULE(meshgen_dealii, m) {
       .def("write_vtu", &MeshGen2D::write_vtu);
 
   py::class_<MeshGen3D>(m, "MeshGen3D")
-      .def(py::init<int, int>(), py::arg("degree"), py::arg("level"))
-      .def(py::init<int, int, double, double>(), py::arg("degree"),
+      .def(py::init<int, int, double, double, py::args, py::kwargs>(), py::arg("degree"),
            py::arg("level"), py::arg("a"), py::arg("b"))
       .def("greet", &MeshGen3D::greet)
       .def("get_q_points", &MeshGen3D::get_q_points)
diff --git a/examples/volume_potential_3d.py b/examples/volume_potential_3d.py
index ec612fc653ed3fd53e701fe693c1ccbf9a9f921e..31e96153e26258578217512b0c79d21d6a304678 100644
--- a/examples/volume_potential_3d.py
+++ b/examples/volume_potential_3d.py
@@ -428,11 +428,13 @@ def main():
         from meshmode.discretization.poly_element import (
             LegendreGaussLobattoTensorProductGroupFactory,
         )
+        from meshmode.array_context import PyOpenCLArrayContext
         from meshmode.discretization import Discretization
 
+        actx = PyOpenCLArrayContext(queue)
         box_discr = Discretization(
-            ctx, modemesh, LegendreGaussLobattoTensorProductGroupFactory(q_order)
-        )
+            actx, modemesh,
+            LegendreGaussLobattoTensorProductGroupFactory(q_order))
 
         box_nodes_x = box_discr.nodes()[0].with_queue(queue).get()
         box_nodes_y = box_discr.nodes()[1].with_queue(queue).get()
diff --git a/experiments/table_urls.json b/experiments/table_urls.json
new file mode 120000
index 0000000000000000000000000000000000000000..f7d45bc2635c93cfeb5eb4eca64aa0fc93dc18b1
--- /dev/null
+++ b/experiments/table_urls.json
@@ -0,0 +1 @@
+../examples/table_urls.json
\ No newline at end of file
diff --git a/experiments/volume_potential_2d_derivatives.py b/experiments/volume_potential_2d_derivatives.py
index 192d51281402d02ba60cd15858df00351a1dd39f..548836c08f28d53558b620df4a34c6d0c2bc72c2 100644
--- a/experiments/volume_potential_2d_derivatives.py
+++ b/experiments/volume_potential_2d_derivatives.py
@@ -55,8 +55,8 @@ root_table_source_extent = 2
 
 print("Using table cache:", table_filename)
 
-q_order = 2   # quadrature order
-n_levels = 8  # 2^(n_levels-1) subintervals in 1D
+q_order = 6   # quadrature order
+n_levels = 6  # 2^(n_levels-1) subintervals in 1D
 
 use_multilevel_table = False
 
@@ -68,7 +68,7 @@ rratio_bot = 0.5
 
 dtype = np.float64
 
-m_order = 20  # multipole order
+m_order = 15  # multipole order
 force_direct_evaluation = False
 
 print("Multipole order =", m_order)
@@ -345,9 +345,11 @@ mpole_expn_class = expn_factory.get_multipole_expansion_class(knl)
 out_kernels = [knl, knl_dx, knl_dy]
 
 exclude_self = True
-from volumential.expansion_wrangler_interface import ExpansionWranglerCodeContainer
+from volumential.expansion_wrangler_fpnd import (
+    FPNDExpansionWranglerCodeContainer,
+    FPNDExpansionWrangler)
 
-wcc = ExpansionWranglerCodeContainer(
+wcc = FPNDExpansionWranglerCodeContainer(
     ctx,
     partial(mpole_expn_class, knl),
     partial(local_expn_class, knl),
@@ -361,8 +363,6 @@ if exclude_self:
 else:
     self_extra_kwargs = {}
 
-from volumential.expansion_wrangler_fpnd import FPNDExpansionWrangler
-
 wrangler = FPNDExpansionWrangler(
     code_container=wcc,
     queue=queue,
@@ -429,9 +429,9 @@ zs_dy = pot[2].get()
 
 print_error = True
 if print_error:
-    err = np.max(np.abs(ze - zs))
-    err_dx = np.max(np.abs(ze_dx - zs_dx))
-    err_dy = np.max(np.abs(ze_dy - zs_dy))
+    err = np.max(np.abs(ze - zs)) / exact_solu(0, 0)
+    err_dx = np.max(np.abs(ze_dx - zs_dx)) / np.max(np.abs(ze_dx))
+    err_dy = np.max(np.abs(ze_dy - zs_dy)) / np.max(np.abs(ze_dy))
     print("Error =", err)
     print("Error dx =", err_dx)
     print("Error dy =", err_dy)
diff --git a/test/conftest.py b/test/conftest.py
index 006a4203531709aaa0e788c7bc18b4af211fc3f0..b6e6b9bfce119470998cd41bfe45639aa9f0f25e 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -62,9 +62,14 @@ def pytest_sessionstart(session):
     subprocess.call(['rm', '-f', '/tmp/volumential-tests.hdf5'])
 
     # pre-compute a basic table that is re-used in many tests.
-    with NearFieldInteractionTableManager("/tmp/volumential-tests.hdf5") as tm:
-        table, _ = tm.get_table(2, "Laplace",
-                                q_order=1, force_recompute=False, queue=None)
+    # can be easily turned off to run individual tests that do not require
+    # the table cache.
+    if 0:
+        with NearFieldInteractionTableManager(
+                "/tmp/volumential-tests.hdf5") as tm:
+            table, _ = tm.get_table(
+                    2, "Laplace",
+                    q_order=1, force_recompute=False, queue=None)
 
 
 def pytest_sessionfinish(session, exitstatus):
diff --git a/test/test_interpolation.py b/test/test_interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..07fffa73b3088c41600b3295508a7d5e34c25159
--- /dev/null
+++ b/test/test_interpolation.py
@@ -0,0 +1,178 @@
+__copyright__ = "Copyright (C) 2020 Xiaoyu Wei"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+import pytest
+import numpy as np
+import pyopencl as cl
+
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import flatten, thaw
+from meshmode.mesh.generation import generate_regular_rect_mesh
+from meshmode.discretization import Discretization
+from meshmode.discretization.poly_element import (
+        PolynomialWarpAndBlendGroupFactory)
+from volumential.interpolation import (ElementsToSourcesLookupBuilder,
+        interpolate_from_meshmode)
+from volumential.geometry import BoundingBoxFactory, BoxFMMGeometryFactory
+
+
+# {{{ test data
+
+def random_polynomial_func(dim, degree, seed=None):
+    """Makes a random polynomial function.
+    """
+    if seed is not None:
+        np.random.seed(seed)
+    coefs = np.random.rand(*((degree + 1, ) * dim))
+
+    def poly_func(pts):
+        if dim == 1 and len(pts.shape) == 1:
+            pts = pts.reshape([1, -1])
+        assert pts.shape[0] == dim
+        npts = pts.shape[1]
+        res = np.zeros(npts)
+        for deg in np.ndindex(coefs.shape):
+            for iaxis in range(dim):
+                res += coefs[deg] * pts[iaxis, :]**deg[iaxis]
+        return res
+
+    return poly_func
+
+
+def eval_func_on_discr_nodes(queue, discr, func):
+    arr_ctx = PyOpenCLArrayContext(queue)
+    nodes = np.vstack([
+        c.get() for c in flatten(thaw(arr_ctx, discr.nodes()))])
+    fvals = func(nodes)
+    return cl.array.to_device(queue, fvals)
+
+# }}} End test data
+
+
+def drive_test_from_meshmode_exact_interpolation_2d(
+        cl_ctx, queue,
+        degree, nel_1d, n_levels, q_order,
+        a=-0.5, b=0.5, seed=0):
+    """
+    meshmode mesh control: nel_1d, degree
+    volumential mesh control: n_levels, q_order
+    """
+    dim = 2
+
+    mesh = generate_regular_rect_mesh(
+            a=(a, a),
+            b=(b, b),
+            n=(nel_1d, nel_1d))
+
+    arr_ctx = PyOpenCLArrayContext(queue)
+    group_factory = PolynomialWarpAndBlendGroupFactory(order=degree)
+    discr = Discretization(arr_ctx, mesh, group_factory)
+
+    bbox_fac = BoundingBoxFactory(dim=2)
+    boxfmm_fac = BoxFMMGeometryFactory(
+            cl_ctx, dim=dim, order=q_order,
+            nlevels=n_levels, bbox_getter=bbox_fac,
+            expand_to_hold_mesh=mesh, mesh_padding_factor=0.)
+    boxgeo = boxfmm_fac(queue)
+    lookup_fac = ElementsToSourcesLookupBuilder(
+            cl_ctx, tree=boxgeo.tree, discr=discr)
+    lookup, evt = lookup_fac(queue)
+
+    # algebraically exact interpolation
+    func = random_polynomial_func(dim, degree, seed)
+
+    dof_vec = eval_func_on_discr_nodes(queue, discr, func)
+    res = interpolate_from_meshmode(queue, dof_vec, lookup).get(queue)
+
+    tree = boxgeo.tree.get(queue)
+    ref = func(np.vstack(tree.sources))
+
+    return np.allclose(ref, res)
+
+
+def drive_test_from_meshmode_interpolation_2d(
+        cl_ctx, queue,
+        degree, nel_1d, n_levels, q_order,
+        a=-0.5, b=0.5, seed=0):
+    """
+    meshmode mesh control: nel_1d, degree
+    volumential mesh control: n_levels, q_order
+    """
+    dim = 2
+
+    mesh = generate_regular_rect_mesh(
+            a=(a, a),
+            b=(b, b),
+            n=(nel_1d, nel_1d))
+
+    arr_ctx = PyOpenCLArrayContext(queue)
+    group_factory = PolynomialWarpAndBlendGroupFactory(order=degree)
+    discr = Discretization(arr_ctx, mesh, group_factory)
+
+    bbox_fac = BoundingBoxFactory(dim=2)
+    boxfmm_fac = BoxFMMGeometryFactory(
+            cl_ctx, dim=dim, order=q_order,
+            nlevels=n_levels, bbox_getter=bbox_fac,
+            expand_to_hold_mesh=mesh, mesh_padding_factor=0.)
+    boxgeo = boxfmm_fac(queue)
+    lookup_fac = ElementsToSourcesLookupBuilder(
+            cl_ctx, tree=boxgeo.tree, discr=discr)
+    lookup, evt = lookup_fac(queue)
+
+    def func(pts):
+        x, y = pts
+        return np.sin(x + y) * x
+
+    dof_vec = eval_func_on_discr_nodes(queue, discr, func)
+    res = interpolate_from_meshmode(queue, dof_vec, lookup).get(queue)
+
+    tree = boxgeo.tree.get(queue)
+    ref = func(np.vstack(tree.sources))
+
+    resid = np.linalg.norm(ref - res, ord=np.inf)
+    return resid
+
+
+@pytest.mark.parametrize("params", [
+    [1, 8, 3, 1], [1, 8, 5, 2], [1, 8, 7, 3],
+    [2, 16, 3, 1], [2, 32, 5, 2], [2, 64, 7, 3],
+    ])
+def test_from_meshmode_interpolation_2d_exact(ctx_factory, params):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+    assert drive_test_from_meshmode_exact_interpolation_2d(
+            cl_ctx, queue, *params)
+
+
+@pytest.mark.parametrize("params", [
+    [1, 32, 5, 1], [2, 64, 5, 3], [3, 64, 5, 4]
+    ])
+def test_from_meshmode_interpolation_2d_nonexact(ctx_factory, params):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+    assert drive_test_from_meshmode_interpolation_2d(
+            cl_ctx, queue, *params) < 1e-3
+
+
+if __name__ == '__main__':
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+    drive_test_from_meshmode_interpolation_2d(cl_ctx, queue, 1, 8, 5, 3)
diff --git a/volumential/droste.py b/volumential/droste.py
index 960bda2e0aa46e23c3d047a5d58aeefaed10969b..55716ada303d6c33df9f0625409369a8abfdeae5 100644
--- a/volumential/droste.py
+++ b/volumential/droste.py
@@ -801,7 +801,6 @@ class DrosteFull(DrosteBase):
             self.integral_knl.__str__(),
             "quad_order-" + str(self.ntgt_points),
             "brick_order-" + str(self.nquad_points),
-            "brick_order-" + str(self.nquad_points),
         )
 
     def __call__(self, queue, **kwargs):
diff --git a/volumential/geometry.py b/volumential/geometry.py
index b00dc3ba3bf3840fda149ba5309fcbb612e67ec2..c3d334a9546e80f045d78dbf50bd7058396414b1 100644
--- a/volumential/geometry.py
+++ b/volumential/geometry.py
@@ -33,7 +33,7 @@ from boxtree.pyfmmlib_integration import FMMLibRotationData
 class BoundingBoxFactory():
     def __init__(self, dim,
             center=None, radius=None,
-            dtype=np.float64):
+            dtype=np.dtype("float64")):
         self.dim = dim
         self.dtype = dtype
 
@@ -78,8 +78,8 @@ class BoundingBoxFactory():
             box_center = self.center
             box_radius = self.radius
         else:
-            a = np.min(expand_to_hold_mesh.vertices, axis=0)
-            b = np.max(expand_to_hold_mesh.vertices, axis=0)
+            a = np.min(expand_to_hold_mesh.vertices, axis=1)
+            b = np.max(expand_to_hold_mesh.vertices, axis=1)
             c = (a + b) * 0.5
 
             if self.center is None:
@@ -142,8 +142,8 @@ class BoxFMMGeometryFactory():
         :class:`volumential.meshgen.MeshGenBase` interface.
     """
     def __init__(
-            self, cl_ctx, dim, quadrature_formula,
-            order, nlevels, bbox_getter,
+            self, cl_ctx, dim, order, nlevels, bbox_getter,
+            quadrature_formula=None,
             expand_to_hold_mesh=None, mesh_padding_factor=0.05):
         """
         :arg bbox_getter: A :class:`BoundingBoxFactory` object.
@@ -164,7 +164,12 @@ class BoxFMMGeometryFactory():
             self._engine_class = MeshGen3D
 
         self.dim = dim
-        self.quadrature_formula = quadrature_formula
+        if quadrature_formula is None:
+            from modepy import LegendreGaussQuadrature
+            # order = degree + 1
+            self.quadrature_formula = LegendreGaussQuadrature(order - 1)
+        else:
+            raise NotImplementedError()
 
         self.bbox_getter = bbox_getter
 
@@ -176,8 +181,8 @@ class BoxFMMGeometryFactory():
         self.nlevels = nlevels
 
         # the engine generates meshes centered at the origin
-        a = -1 * self.bbox_getter.radius
-        b = self.bbox_getter.radius
+        a = -1 * self.bbox_getter.box_radius
+        b = self.bbox_getter.box_radius
         self.engine = self._engine_class(self.order, self.nlevels, a, b)
 
     def reinit(
diff --git a/volumential/interpolation.py b/volumential/interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd3dfac0d593ea59a3536f01311145a3d7e89574
--- /dev/null
+++ b/volumential/interpolation.py
@@ -0,0 +1,506 @@
+__copyright__ = """
+Copyright (C) 2020 Xiaoyu Wei
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import pyopencl as cl
+import loopy as lp
+from pyopencl.algorithm import KeyValueSorter
+from pytools import memoize_method, ProcessLogger
+from pytools.obj_array import make_obj_array
+from boxtree.area_query import AreaQueryBuilder
+from boxtree.tools import DeviceDataRecord
+from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.dof_array import unflatten
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+__doc__ = r"""
+.. currentmodule:: volumential
+
+From :mod:`meshmode`
+-------------------------
+
+Interpolation from functions given by DoF vectors of :mod:`meshmode`.
+The underlying mesh on the :mod:`meshmode` side must be discretizing the
+same bounding box.
+
+The intersection testing assumes a boundedness property for the supported
+element types: each element is bounded inside the smallest :math:`l^\infty`
+ball that is centered at the element's center and covers all its vertices.
+This property might be broken, for example, by high order elements that
+warp the element boundary too much.
+
+.. autoclass:: ElementsToSourcesLookup
+
+.. autoclass:: LeavesToNodesLookup
+
+.. autofunction:: interpolate_from_meshmode
+
+To :mod:`meshmode`
+---------------------------
+
+"""
+
+
+# {{{ output
+
+class ElementsToSourcesLookup(DeviceDataRecord):
+    """
+    .. attribute:: tree
+
+        The :class:`boxtree.Tree` instance representing the box mesh.
+
+    .. attribute:: discr
+
+        The :class:`meshmode.discretization.Discretization` instance
+        representing the external mesh and DoF distribution.
+
+    .. attribute:: sources_in_element_starts
+
+        Indices into :attr:`sources_in_element_lists`.
+
+        .. code-block:: python
+
+            sources_in_element_lists[
+                sources_in_element_starts[global_iel]
+                :sources_in_element_starts[global_iel] + 1
+                ]
+
+        contains the list of source nodes residing in the given element.
+
+        .. note:: ``global_iel`` is the global element id in `meshmode`.
+            ``global_iel = mesh.groups[igrp].element_nr_base + iel``.
+
+    .. attribute:: sources_in_element_lists
+
+        Indices into :attr:`tree.sources`.
+
+    .. automethod:: get
+    """
+
+
+class LeavesToNodesLookup(DeviceDataRecord):
+    """
+    .. attribute:: tree
+
+        The :class:`boxtree.Tree` instance representing the box mesh.
+
+    .. attribute:: discr
+
+        The :class:`meshmode.discretization.Discretization` instance
+        representing the external mesh and DoF distribution.
+
+    .. attribute:: nodes_in_leaf_starts
+
+        Indices into :attr:`nodes_in_leaf_lists`.
+
+        .. code-block:: python
+
+            nodes_in_leaf_lists[
+                nodes_in_leaf_starts[box_id]:nodes_in_leaf_starts[box_id] + 1]
+
+        contains the list of discretization nodes residing in the given leaf box.
+
+        .. note:: Only leaf boxes have non-empty entries in this table.
+            Nonetheless, this list is indexed by the global box index.
+
+    .. attribute:: box_nodes_in_element_lists
+
+        Indices into :attr:`tree.sources`.
+
+    .. automethod:: get
+    """
+
+# }}} End output
+
+
+# {{{ elements-to-sources lookup builder
+
+class ElementsToSourcesLookupBuilder:
+    """Given a :mod:`meshmod` mesh and a :mod:`boxtree.Tree`, both discretizing
+    the same bounding box, this class helps to build a look-up table from
+    element to source nodes that are positioned inside the element.
+    """
+
+    def __init__(self, context, tree, discr):
+        """
+        :arg tree: a :class:`boxtree.Tree`
+        :arg discr: a :class: `meshmode.discretization.Discretization`
+
+        Boxes and elements can be non-aligned as long as the domains
+        (bounding boxes) are the same.
+        """
+        assert tree.dimensions == discr.dim
+        self.dim = discr.dim
+        self.context = context
+        self.tree = tree
+        self.discr = discr
+
+        self.key_value_sorter = KeyValueSorter(context)
+        self.area_query_builder = AreaQueryBuilder(self.context)
+
+    # {{{ kernel generation
+
+    @memoize_method
+    def get_simplex_lookup_kernel(self):
+        """Returns a loopy kernel that computes a potential vector
+        representing the (q_point --> element_id) relationship.
+        When a source q_point lies on the element boundary, it will be
+        assigned an element depending on code scheduling. This ensures
+        that the resulting lookup lists are disjoint.
+
+        The kernel assumes that the mesh uses one single group of simplex elements.
+        Also, the test only works for affine elements.
+        """
+        logger.debug("start building elements-to-sources lookup kernel")
+
+        if self.dim != 2:
+            raise NotImplementedError()
+
+        loopy_knl = lp.make_kernel(
+            ["{ [ iel ]: 0 <= iel < nelements }",
+             "{ [ ineighbor ]: nearby_leaves_beg <= ineighbor < nearby_leaves_end }",
+             "{ [ isrc ]: 0 <= isrc < n_box_sources }"
+             ],
+            ["""
+            for iel
+                <> nearby_leaves_beg = leaves_near_ball_starts[iel]
+                <> nearby_leaves_end = leaves_near_ball_starts[iel + 1]
+
+                <> Ax = mesh_vertices_0[mesh_vertex_indices[iel, 0]]
+                <> Ay = mesh_vertices_1[mesh_vertex_indices[iel, 0]]
+                <> Bx = mesh_vertices_0[mesh_vertex_indices[iel, 1]]
+                <> By = mesh_vertices_1[mesh_vertex_indices[iel, 1]]
+                <> Cx = mesh_vertices_0[mesh_vertex_indices[iel, 2]]
+                <> Cy = mesh_vertices_1[mesh_vertex_indices[iel, 2]]
+
+                for ineighbor
+                    <> ileaf = leaves_near_ball_lists[ineighbor]
+                    <> box_source_beg = box_source_starts[ileaf]
+                    <> n_box_sources = box_source_counts_cumul[ileaf]
+
+                    for isrc
+                        <> source_id = box_source_beg + isrc
+                        <> Px = source_points_0[source_id]
+                        <> Py = source_points_1[source_id]
+
+                        <> s0 = (Px - Ax) * (By - Ay) - (Py - Ay) * (Bx - Ax)
+                        <> s1 = (Px - Bx) * (Cy - By) - (Py - By) * (Cx - Bx)
+                        <> s2 = (Px - Cx) * (Ay - Cy) - (Py - Cy) * (Ax - Cx)
+
+                        result[source_id] = if(
+                            s0 * s1 >= -1e-12 and s1 * s2 >= -1e-12,
+                            iel,
+                            result[source_id])
+                    end
+                end
+            end
+            """],
+            [lp.ValueArg("nelements, dim, nboxes, nsources", np.int32),
+             lp.GlobalArg("mesh_vertex_indices", np.int32, "nelements, dim+1"),
+             lp.GlobalArg("box_source_starts", np.int32, "nboxes"),
+             lp.GlobalArg("box_source_counts_cumul", np.int32, "nboxes"),
+             lp.GlobalArg("leaves_near_ball_lists", np.int32, None),
+             lp.GlobalArg("result", np.int32, "nsources"),
+             "..."],
+            name="build_sources_in_simplex_lookup",
+            lang_version=(2018, 2),
+        )
+
+        logger.debug("done building elements-to-sources lookup kernel")
+        return loopy_knl
+
+    # }}} End kernel generation
+
+    def compute_short_lists(self, queue, wait_for=None):
+        """balls --> overlapping leaves
+        """
+        mesh = self.discr.mesh
+        if len(mesh.groups) > 1:
+            raise NotImplementedError("Mixed elements not supported")
+        melgrp = mesh.groups[0]
+        ball_centers_host = (np.max(melgrp.nodes, axis=2)
+                        + np.min(melgrp.nodes, axis=2)) / 2
+        ball_radii_host = np.max(
+                np.max(melgrp.nodes, axis=2) - np.min(melgrp.nodes, axis=2),
+                axis=0) / 2
+
+        ball_centers = make_obj_array([
+            cl.array.to_device(queue, center_coord_comp)
+            for center_coord_comp in ball_centers_host])
+        ball_radii = cl.array.to_device(queue, ball_radii_host)
+
+        area_query_result, evt = self.area_query_builder(
+            queue, self.tree, ball_centers, ball_radii,
+            peer_lists=None, wait_for=wait_for)
+        return area_query_result, evt
+
+    def __call__(self, queue, balls_to_leaves_lookup=None, wait_for=None):
+        """
+        :arg queue: a :class:`pyopencl.CommandQueue`
+        """
+        slk_plog = ProcessLogger(logger, "element-to-source lookup: run area query")
+
+        if balls_to_leaves_lookup is None:
+            balls_to_leaves_lookup, evt = \
+                self.compute_short_lists(queue, wait_for=wait_for)
+            wait_for = [evt]
+
+        # -----------------------------------------------------------------
+        # Refine the area query using point-in-simplex test
+
+        logger.debug("element-to-source lookup: refine starts")
+
+        element_lookup_kernel = self.get_simplex_lookup_kernel()
+
+        vertices_dev = make_obj_array([
+            cl.array.to_device(queue, verts)
+            for verts in self.discr.mesh.vertices])
+
+        evt, res = element_lookup_kernel(
+            queue, dim=self.dim, nboxes=self.tree.nboxes,
+            nelements=self.discr.mesh.nelements, nsources=self.tree.nsources,
+            result=cl.array.zeros(queue, self.tree.nsources, dtype=np.int32),
+            mesh_vertex_indices=self.discr.mesh.groups[0].vertex_indices,
+            mesh_vertices_0=vertices_dev[0],
+            mesh_vertices_1=vertices_dev[1],
+            source_points_0=self.tree.sources[0],
+            source_points_1=self.tree.sources[1],
+            box_source_starts=self.tree.box_source_starts,
+            box_source_counts_cumul=self.tree.box_source_counts_cumul,
+            leaves_near_ball_starts=balls_to_leaves_lookup.leaves_near_ball_starts,
+            leaves_near_ball_lists=balls_to_leaves_lookup.leaves_near_ball_lists,
+            wait_for=wait_for)
+
+        source_to_element_lookup, = res
+        wait_for = [evt]
+
+        # -----------------------------------------------------------------
+        # Invert the source-to-element lookup by a key-value sort
+
+        logger.debug("element-to-source lookup: key-value sort")
+
+        sources_in_element_starts, sources_in_element_lists, evt = \
+            self.key_value_sorter(
+                queue,
+                keys=source_to_element_lookup,
+                values=cl.array.arange(
+                    queue, self.tree.nsources, dtype=self.tree.box_id_dtype),
+                nkeys=self.discr.mesh.nelements,
+                starts_dtype=self.tree.box_id_dtype,
+                wait_for=wait_for)
+
+        slk_plog.done()
+
+        return ElementsToSourcesLookup(
+            tree=self.tree, discr=self.discr,
+            sources_in_element_starts=sources_in_element_starts,
+            sources_in_element_lists=sources_in_element_lists), evt
+
+# }}} End elements-to-sources lookup builder
+
+
+# {{{ transform helper
+
+def compute_affine_transform(source_simplex, target_simplex):
+    """Computes A and b for the affine transform :math:`y = A x + b`
+    that maps ``source_simplex`` to ``target_simplex``.
+
+    :param source_simplex: a dim-by-(dim+1) :mod:`numpy` array
+    :param target_simplex: a dim-by-(dim+1) :mod:`numpy: array
+    """
+    assert source_simplex.shape == target_simplex.shape
+    dim = source_simplex.shape[0]
+
+    if dim == 2:
+        assert source_simplex.shape == (2, 3)
+        mat = np.zeros([6, 6])
+        mat[:3, :2] = source_simplex.T
+        mat[-3:, 2:4] = source_simplex.T
+        mat[:3, -2] = 1
+        mat[-3:, -1] = 1
+        rhs = target_simplex.reshape(-1)
+        solu = np.linalg.solve(mat, rhs)
+        return solu[:4].reshape(2, 2), solu[-2:]
+
+    elif dim == 3:
+        assert source_simplex.shape == (3, 4)
+        mat = np.zeros([12, 12])
+        mat[:4, :3] = source_simplex.T
+        mat[4:8, 3:6] = source_simplex.T
+        mat[-4:, 6:9] = source_simplex.T
+        mat[:4, -3] = 1
+        mat[4:8, -2] = 1
+        mat[-4:, -1] = 1
+        rhs = target_simplex.reshape(-1)
+        solu = np.linalg.solve(mat, rhs)
+        return solu[:9].reshape(3, 3), solu[-3:]
+
+    else:
+        raise NotImplementedError()
+
+
+def invert_affine_transform(mat_a, disp_b):
+    """Inverts an affine transform given by :math:`y = A x + b`.
+
+    :param mat_A: a dim*(dim+1)-by-dim*(dim+1) :mod:`numpy` array
+    :param disp_b: a dim*(dim+1) :mod:`numpy` array
+    """
+    iva = np.linalg.inv(mat_a)
+    ivb = - iva @ disp_b
+    return iva, ivb
+
+# }}} End transform helper
+
+
+# {{{ from meshmode interpolation
+
+def interpolate_from_meshmode(queue, dof_vec, elements_to_sources_lookup):
+    """
+    :arg dof_vec: a DoF vector representing a field in :mod:`meshmode`
+        of shape ``(..., nnodes)``.
+    :arg elements_to_sources_lookup: a :class:`ElementsToSourcesLookup`
+
+    .. note:: This function currently supports meshes with just one element
+        group. Also, the element group must be simplex-based.
+
+    .. note:: This function does some heavy-lifting computation in Python,
+        which we intend to optimize in the future. In particular, we plan
+        to shift the batched linear solves and basis evaluations to
+        :mod:`loopy`.
+
+    TODO: make linear solvers available as :mod:`loopy` callables.
+    TODO: make :mod:`modepy` emit :mod:`loopy` callables for basis evaluation.
+    """
+    if not isinstance(dof_vec, cl.array.Array):
+        raise TypeError("non-array passed to interpolator")
+
+    assert len(elements_to_sources_lookup.discr.groups) == 1
+    assert len(elements_to_sources_lookup.discr.mesh.groups) == 1
+    degroup = elements_to_sources_lookup.discr.groups[0]
+    megroup = elements_to_sources_lookup.discr.mesh.groups[0]
+
+    if not degroup.is_affine:
+        raise ValueError(
+            "interpolation requires global-to-local map, "
+            "which is only available for affinely mapped elements")
+
+    mesh = elements_to_sources_lookup.discr.mesh
+    dim = elements_to_sources_lookup.discr.dim
+    template_simplex = mesh.groups[0].vertex_unit_coordinates().T
+
+    # -------------------------------------------------------
+    # Inversely map source points with a global-to-local map.
+    #
+    # 1. For each element, solve for the affine map.
+    #
+    # 2. Apply the map to corresponding source points.
+    #
+    # This step computes `unit_sources`, the list of inversely
+    # mapped source points.
+
+    sources_in_element_starts = \
+        elements_to_sources_lookup.sources_in_element_starts.get(queue)
+    sources_in_element_lists = \
+        elements_to_sources_lookup.sources_in_element_lists.get(queue)
+    tree = elements_to_sources_lookup.tree.get(queue)
+
+    unit_sources_host = make_obj_array(
+            [np.zeros_like(srccrd) for srccrd in tree.sources])
+
+    for iel in range(degroup.nelements):
+        vertex_ids = megroup.vertex_indices[iel]
+        vertices = mesh.vertices[:, vertex_ids]
+        afa, afb = compute_affine_transform(vertices, template_simplex)
+
+        beg = sources_in_element_starts[iel]
+        end = sources_in_element_starts[iel + 1]
+        source_ids_in_el = sources_in_element_lists[beg:end]
+        sources_in_el = np.vstack(
+            [tree.sources[iaxis][source_ids_in_el] for iaxis in range(dim)])
+
+        ivmapped_el_sources = afa @ sources_in_el + afb.reshape([dim, 1])
+        for iaxis in range(dim):
+            unit_sources_host[iaxis][source_ids_in_el] = \
+                ivmapped_el_sources[iaxis, :]
+
+    unit_sources = make_obj_array(
+        [cl.array.to_device(queue, usc) for usc in unit_sources_host])
+
+    # -----------------------------------------------------
+    # Carry out evaluations in the local (template) frames.
+    #
+    # 1. Assemble a resampling matrix for each element, with
+    #    the basis functions and the local source points.
+    #
+    # 2. For each element, perform matvec on the resampling
+    #    matrix and the local DoF coefficients.
+    #
+    # This step assumes `unit_sources` computed on device, so
+    # that the previous step can be swapped with a kernel without
+    # interrupting the followed computation.
+
+    mapped_sources = np.vstack(
+        [usc.get(queue) for usc in unit_sources])
+
+    basis_funcs = degroup.basis()
+
+    arr_ctx = PyOpenCLArrayContext(queue)
+    dof_vec_view = unflatten(
+            arr_ctx, elements_to_sources_lookup.discr, dof_vec)[0]
+    dof_vec_view = dof_vec_view.get()
+
+    sym_shape = dof_vec.shape[:-1]
+    source_vec = np.zeros(sym_shape + (tree.nsources, ))
+
+    for iel in range(degroup.nelements):
+        beg = sources_in_element_starts[iel]
+        end = sources_in_element_starts[iel + 1]
+        source_ids_in_el = sources_in_element_lists[beg:end]
+        mapped_sources_in_el = mapped_sources[:, source_ids_in_el]
+        local_dof_vec = dof_vec_view[..., iel, :]
+
+        # resampling matrix built from Vandermonde matrices
+        import modepy as mp
+        rsplm = mp.resampling_matrix(
+                basis=basis_funcs,
+                new_nodes=mapped_sources_in_el,
+                old_nodes=degroup.unit_nodes)
+
+        if len(sym_shape) == 0:
+            local_coeffs = local_dof_vec
+            source_vec[source_ids_in_el] = rsplm @ local_coeffs
+        else:
+            from pytools import indices_in_shape
+            for sym_id in indices_in_shape(sym_shape):
+                source_vec[sym_id + (source_ids_in_el, )] = \
+                    rsplm @ local_dof_vec[sym_id]
+
+    source_vec = cl.array.to_device(queue, source_vec)
+
+    return source_vec
+
+# }}} End from meshmode interpolation
diff --git a/volumential/nearfield_potential_table.py b/volumential/nearfield_potential_table.py
index 3f876496c1eba0c799363936e82387e43a86e7aa..d4a526d7be06fe0321eb584e878ffa5f59cf9e48 100644
--- a/volumential/nearfield_potential_table.py
+++ b/volumential/nearfield_potential_table.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import logging
 import numpy as np
 import loopy as lp
+import pyopencl as cl
 from scipy.interpolate import BarycentricInterpolator as Interpolator
 
 from functools import partial
@@ -1098,6 +1099,8 @@ class NearFieldInteractionTable(object):
                     ) * fl_scaling(k=self.dim, s=s)
             return
 
+        from meshmode.array_context import PyOpenCLArrayContext
+        from meshmode.dof_array import thaw, flatten
         from meshmode.mesh.io import read_gmsh
         from meshmode.discretization import Discretization
         from meshmode.discretization.poly_element import \
@@ -1165,7 +1168,8 @@ class NearFieldInteractionTable(object):
 
         # }}} End gmsh processing
 
-        discr = Discretization(cl_ctx, mesh,  # noqa
+        arr_ctx = PyOpenCLArrayContext(queue)
+        discr = Discretization(arr_ctx, mesh,
                 PolynomialWarpAndBlendGroupFactory(order=quad_order))
 
         from pytential import bind, sym
@@ -1274,7 +1278,9 @@ class NearFieldInteractionTable(object):
 
         # }}} End kernel evaluation
 
-        nodes = discr.nodes().with_queue(queue)[:self.dim]
+        node_coords = flatten(thaw(arr_ctx, discr.nodes()))
+        nodes = cl.array.to_device(queue,
+            np.vstack([crd.get() for crd in node_coords]))
 
         int_vals = []