diff --git a/experiments/sep.py b/experiments/sep.py new file mode 100644 index 0000000000000000000000000000000000000000..f4f94ad57ac9e9764f2d93f0b812ed18ae57fe07 --- /dev/null +++ b/experiments/sep.py @@ -0,0 +1,484 @@ +""" This example evaluates the volume potential over + [-1,1]^2 with the Laplace kernel. +""" +from __future__ import absolute_import, division, print_function + +__copyright__ = "Copyright (C) 2017 - 2018 Xiaoyu Wei" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import logging + +logger = logging.getLogger(__name__) + +if 0: + # verbose + logging.basicConfig(level=logging.INFO) +else: + # clean + logging.basicConfig(level=logging.CRITICAL) + +import numpy as np +import pyopencl as cl +from volumential.tools import ScalarFieldExpressionEvaluation as Eval + +import pymbolic as pmbl +from functools import partial + +from pytools.obj_array import make_obj_array + + +def main(): + + print("*************************") + print("* Setting up...") + print("*************************") + + dim = 2 + + # download precomputation results for the 2D Laplace kernel + download_table = True + table_filename = "nft_laplace2d.hdf5" + root_table_source_extent = 2 + + print("Using table cache:", table_filename) + + q_order = 4 # quadrature order (target / near-field order) + source_order = 9 # source order + n_levels = 6 # 2^(n_levels-1) subintervals in 1D + + use_multilevel_table = False + + adaptive_mesh = False + n_refinement_loops = 100 + refined_n_cells = 2000 + rratio_top = 0.2 + rratio_bot = 0.5 + + dtype = np.float64 + + m_order = 20 # multipole order + force_direct_evaluation = False + + print("Multipole order =", m_order) + + alpha = 160 + + x = pmbl.var("x") + y = pmbl.var("y") + expp = pmbl.var("exp") + + norm2 = x ** 2 + y ** 2 + source_expr = -(4 * alpha ** 2 * norm2 - 4 * alpha) * expp(-alpha * norm2) + solu_expr = expp(-alpha * norm2) + + logger.info("Source expr: " + str(source_expr)) + logger.info("Solu expr: " + str(solu_expr)) + + # bounding box + a = -0.5 + b = 0.5 + root_table_source_extent = 2 + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + source_eval = Eval(dim, source_expr, [x, y]) + + # {{{ generate quad points + + import volumential.meshgen as mg + + # Show meshgen info + mg.greet() + + mesh = mg.MeshGen2D(q_order, n_levels, a, b, queue=queue) + src_mesh = mg.MeshGen2D(source_order, n_levels, a, b, queue=queue) + if not adaptive_mesh: + mesh.print_info() + q_points = mesh.get_q_points() + q_weights = mesh.get_q_weights() + + else: + iloop = 
-1 + while mesh.n_active_cells() < refined_n_cells: + iloop += 1 + crtr = np.abs( + source_eval(mesh.get_cell_centers) + * mesh.get_cell_measures) + mesh.update_mesh(crtr, rratio_top, rratio_bot) + if iloop > n_refinement_loops: + print("Max number of refinement loops reached.") + break + + mesh.print_info() + q_points = mesh.get_q_points() + q_weights = mesh.get_q_weights() + + assert len(q_points) == len(q_weights) + assert q_points.shape[1] == dim + + q_points = np.ascontiguousarray(np.transpose(q_points)) + q_points = make_obj_array( + [cl.array.to_device(queue, q_points[i]) for i in range(dim)]) + + q_weights = cl.array.to_device(queue, q_weights) + # q_radii = cl.array.to_device(queue, q_radii) + + + src_q_points = src_mesh.get_q_points() + src_q_points = np.ascontiguousarray(np.transpose(src_q_points)) + src_q_points = make_obj_array( + [cl.array.to_device(queue, src_q_points[i]) for i in range(dim)]) + + src_q_weights = src_mesh.get_q_weights() + src_q_weights = cl.array.to_device(queue, src_q_weights) + + # }}} + + # {{{ discretize the source field + + source_vals = cl.array.to_device( + queue, source_eval(queue, np.array([coords.get() for coords in q_points])) + ) + + src_source_vals = cl.array.to_device( + queue, source_eval(queue, + np.array([coords.get() for coords in src_q_points])) + ) + + # particle_weigt = source_val * q_weight + + # }}} End discretize the source field + + # {{{ build tree and traversals + + from boxtree.tools import AXIS_NAMES + + axis_names = AXIS_NAMES[:dim] + + from pytools import single_valued + + coord_dtype = single_valued(coord.dtype for coord in q_points) + from boxtree.bounding_box import make_bounding_box_dtype + + bbox_type, _ = make_bounding_box_dtype(ctx.devices[0], dim, coord_dtype) + bbox = np.empty(1, bbox_type) + for ax in axis_names: + bbox["min_" + ax] = a + bbox["max_" + ax] = b + + # tune max_particles_in_box to reconstruct the mesh + # TODO: use points from FieldPlotter are used as target points for better + # visuals + from boxtree import TreeBuilder + + tb = TreeBuilder(ctx) + tree, _ = tb( + queue, + particles=src_q_points, + targets=q_points, + bbox=bbox, + max_particles_in_box=source_order ** 2 * 4 - 1, + kind="adaptive-level-restricted", + ) + + from boxtree.traversal import FMMTraversalBuilder + + tg = FMMTraversalBuilder(ctx) + trav, _ = tg(queue, tree) + + # }}} End build tree and traversals + + # {{{ build near field potential table + + from volumential.table_manager import NearFieldInteractionTableManager + import os + + if download_table and (not os.path.isfile(table_filename)): + import json + with open("table_urls.json", 'r') as fp: + urls = json.load(fp) + + print("Downloading table from %s" % urls['Laplace2D']) + import subprocess + subprocess.call(["wget", "-q", urls['Laplace2D'], table_filename]) + + tm = NearFieldInteractionTableManager( + table_filename, root_extent=root_table_source_extent, + queue=queue) + + if use_multilevel_table: + assert ( + abs( + int((b - a) + / root_table_source_extent) * root_table_source_extent + - (b - a)) + < 1e-15) + nftable = [] + for l in range(0, tree.nlevels + 1): + print("Getting table at level", l) + tb, _ = tm.get_table( + dim, + "Laplace", + q_order, + source_box_level=l, + compute_method="DrosteSum", + queue=queue, + n_brick_quad_poitns=100, + adaptive_level=False, + use_symmetry=True, + alpha=0.1, + nlevels=15, + ) + nftable.append(tb) + + print("Using table list of length", len(nftable)) + + else: + nftable, _ = tm.get_table( + dim, + "Laplace", + q_order, + 
force_recompute=False,
+            compute_method="DrosteSum",
+            queue=queue,
+            n_brick_quad_poitns=100,
+            adaptive_level=False,
+            use_symmetry=True,
+            alpha=0.1,
+            nlevels=15,
+        )
+
+    # }}} End build near field potential table
+
+    # {{{ sumpy expansion for laplace kernel
+
+    from sumpy.expansion import DefaultExpansionFactory
+    from sumpy.kernel import LaplaceKernel
+
+    knl = LaplaceKernel(dim)
+    out_kernels = [knl]
+
+    expn_factory = DefaultExpansionFactory()
+    local_expn_class = expn_factory.get_local_expansion_class(knl)
+    mpole_expn_class = expn_factory.get_multipole_expansion_class(knl)
+
+    exclude_self = True
+
+    from volumential.expansion_wrangler_fpnd import (
+        FPNDExpansionWranglerCodeContainer,
+        FPNDExpansionWrangler
+    )
+
+    wcc = FPNDExpansionWranglerCodeContainer(
+        ctx,
+        partial(mpole_expn_class, knl),
+        partial(local_expn_class, knl),
+        out_kernels,
+        exclude_self=exclude_self,
+    )
+
+    if exclude_self:
+        target_to_source = np.arange(tree.ntargets, dtype=np.int32)
+        self_extra_kwargs = {"target_to_source": target_to_source}
+    else:
+        self_extra_kwargs = {}
+
+    wrangler = FPNDExpansionWrangler(
+        code_container=wcc,
+        queue=queue,
+        tree=tree,
+        near_field_table=nftable,
+        dtype=dtype,
+        fmm_level_to_order=lambda kernel, kernel_args, tree, lev: m_order,
+        quad_order=q_order,
+        self_extra_kwargs=self_extra_kwargs,
+    )
+
+    # }}} End sumpy expansion for laplace kernel
+
+    print("*************************")
+    print("* Performing FMM ...")
+    print("*************************")
+
+    # {{{ conduct fmm computation
+
+    from volumential.volume_fmm import drive_volume_fmm
+
+    import time
+    queue.finish()
+
+    t0 = time.time()
+
+    pot, = drive_volume_fmm(
+        trav,
+        wrangler,
+        src_source_vals * src_q_weights,
+        source_vals,
+        direct_evaluation=force_direct_evaluation,
+    )
+    queue.finish()
+
+    t1 = time.time()
+
+    print("Finished in %.2f seconds."
% (t1 - t0)) + print("(%e points per second)" % ( + len(q_weights) / (t1 - t0) + )) + + # }}} End conduct fmm computation + + print("*************************") + print("* Postprocessing ...") + print("*************************") + + # {{{ postprocess and plot + + # print(pot) + + solu_eval = Eval(dim, solu_expr, [x, y]) + + x = q_points[0].get() + y = q_points[1].get() + ze = solu_eval(queue, np.array([x, y])) + zs = pot.get() + + print_error = True + if print_error: + err = np.max(np.abs(ze - zs)) + print("Error =", err) + + # Interpolated surface + if 0: + h = 0.005 + out_x = np.arange(a, b + h, h) + out_y = np.arange(a, b + h, h) + oxx, oyy = np.meshgrid(out_x, out_y) + out_targets = make_obj_array( + [ + cl.array.to_device(queue, oxx.flatten()), + cl.array.to_device(queue, oyy.flatten()), + ] + ) + + from volumential.volume_fmm import interpolate_volume_potential + + # src = source_field([q.get() for q in q_points]) + # src = cl.array.to_device(queue, src) + interp_pot = interpolate_volume_potential(out_targets, trav, wrangler, pot) + opot = interp_pot.get() + + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + + plt3d = plt.figure() + ax = Axes3D(plt3d) # noqa + surf = ax.plot_surface(oxx, oyy, opot.reshape(oxx.shape)) # noqa + # ax.scatter(x, y, src.get()) + # ax.set_zlim(-0.25, 0.25) + + plt.draw() + plt.show() + + # Boxtree + if 0: + import matplotlib.pyplot as plt + + if dim == 2: + # plt.plot(q_points[0].get(), q_points[1].get(), ".") + pass + + from boxtree.visualization import TreePlotter + + plotter = TreePlotter(tree.get(queue=queue)) + plotter.draw_tree(fill=False, edgecolor="black") + # plotter.draw_box_numbers() + plotter.set_bounding_box() + plt.gca().set_aspect("equal") + + plt.draw() + # plt.show() + plt.savefig("tree.png") + + # Direct p2p + if 0: + print("Performing P2P") + pot_direct, = drive_volume_fmm( + trav, wrangler, source_vals * q_weights, + source_vals, direct_evaluation=True) + zds = pot_direct.get() + zs = pot.get() + + print("P2P-FMM diff =", np.max(np.abs(zs - zds))) + + print("P2P Error =", np.max(np.abs(ze - zds))) + + """ + import matplotlib.pyplot as plt + import matplotlib.cm as cm + x = q_points[0].get() + y = q_points[1].get() + plt.scatter(x, y, c=np.log(abs(zs-zds)) / np.log(10), cmap=cm.jet) + plt.colorbar() + + plt.xlabel("Multipole order = " + str(m_order)) + + plt.draw() + plt.show() + """ + + # Scatter plot + if 0: + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + + x = q_points[0].get() + y = q_points[1].get() + ze = solu_eval(queue, np.array([x, y])) + zs = pot.get() + + plt3d = plt.figure() + ax = Axes3D(plt3d) + ax.scatter(x, y, zs, s=1) + # ax.scatter(x, y, source_field([q.get() for q in q_points]), s=1) + # import matplotlib.cm as cm + + # ax.scatter(x, y, zs, c=np.log(abs(zs-zds)), cmap=cm.jet) + # plt.gca().set_aspect("equal") + + # ax.set_xlim3d([-1, 1]) + # ax.set_ylim3d([-1, 1]) + # ax.set_zlim3d([np.min(z), np.max(z)]) + # ax.set_zlim3d([-0.002, 0.00]) + + plt.draw() + plt.show() + # plt.savefig("exact.png") + + # }}} End postprocess and plot + + +if __name__ == '__main__': + main() + + +# vim: filetype=pyopencl:foldmethod=marker diff --git a/requirements.txt b/requirements.txt index a40693e3e5b3ad55dcc0c78ed347776bebb92edf..9cd6c9ab0efdffbdedb335d336378eb4dc73c668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ git+https://gitlab.tiker.net/xywei/pypvfmm.git#egg=pypvfmm -e git+https://www.github.com/xywei/gmsh_interop.git#egg=gmsh_interop -e 
git+https://gitlab.tiker.net/xywei/pytential.git#egg=pytential
+-e git+https://gitlab.tiker.net/inducer/pytools.git#egg=pytools
 -e git+https://gitlab.tiker.net/inducer/cgen.git#egg=cgen
 -e git+https://gitlab.tiker.net/inducer/meshmode.git#egg=meshmode
 -e git+https://gitlab.tiker.net/inducer/genpy.git#egg=genpy
diff --git a/volumential/expansion_wrangler_fpnd.py b/volumential/expansion_wrangler_fpnd.py
index f313c9c0f4c21c872add283d476f14a4a9d0382a..330b96944b5bdb1739fdd281cbe709287f6cf155 100644
--- a/volumential/expansion_wrangler_fpnd.py
+++ b/volumential/expansion_wrangler_fpnd.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import numpy as np
 import logging
 import pyopencl as cl
+import pyopencl.array
 from pytools.obj_array import make_obj_array

 # from pytools import memoize_method
@@ -46,6 +47,24 @@ def level_to_rscale(tree, level):
     return tree.root_extent * (2 ** -level)


+def inverse_id_map(queue, mapped_ids):
+    """Given an index mapping specified by its mapped ids, compute the
+    inverse mapping and return its (inversely) mapped ids.
+    """
+    cl_array = False
+    if isinstance(mapped_ids, cl.array.Array):
+        cl_array = True
+        mapped_ids = mapped_ids.get(queue)
+
+    inv_ids = np.zeros_like(mapped_ids)
+    inv_ids[mapped_ids] = np.arange(len(mapped_ids))
+
+    if cl_array:
+        inv_ids = cl.array.to_device(queue, inv_ids)
+
+    return inv_ids
+
+
 # {{{ sumpy backend


@@ -267,6 +286,12 @@ class FPNDSumpyExpansionWrangler(
     def reorder_sources(self, source_array):
         return SumpyExpansionWrangler.reorder_sources(self, source_array)

+    def reorder_targets(self, target_array):
+        if not hasattr(self.tree, 'user_target_ids'):
+            self.tree.user_target_ids = inverse_id_map(
+                self.queue, self.tree.sorted_target_ids)
+        return target_array.with_queue(self.queue)[self.tree.user_target_ids]
+
     def reorder_potentials(self, potentials):
         return SumpyExpansionWrangler.reorder_potentials(self, potentials)

@@ -407,13 +432,16 @@ class FPNDSumpyExpansionWrangler(
         self.queue.finish()
         logger.info("sent table data to device")

+        # NOTE: box_source_* args below are filled with the tree's target data;
+        # the near-field table evaluation reads the density at the target points.
+
         res, evt = near_field(
             self.queue,
             result=out_pot,
             box_centers=self.tree.box_centers,
             box_levels=self.tree.box_levels,
-            box_source_counts_cumul=self.tree.box_source_counts_cumul,
-            box_source_starts=self.tree.box_source_starts,
+            box_source_counts_cumul=self.tree.box_target_counts_cumul,
+            box_source_starts=self.tree.box_target_starts,
             box_target_counts_cumul=self.tree.box_target_counts_cumul,
             box_target_starts=self.tree.box_target_starts,
             case_indices=case_indices_dev,
@@ -871,6 +899,12 @@ class FPNDFMMLibExpansionWrangler(
     def reorder_sources(self, source_array):
         return FMMLibExpansionWrangler.reorder_sources(self, source_array)

+    def reorder_targets(self, target_array):
+        if not hasattr(self.tree, 'user_target_ids'):
+            self.tree.user_target_ids = inverse_id_map(
+                self.queue, self.tree.sorted_target_ids)
+        return target_array[self.tree.user_target_ids]
+
     def reorder_potentials(self, potentials):
         return FMMLibExpansionWrangler.reorder_potentials(self, potentials)

@@ -973,6 +1007,18 @@ class FPNDFMMLibExpansionWrangler(
                 mode_nmlz_combined[lev, :] = self.near_field_table[kname][
                     lev
                 ].mode_normalizers
+            exterior_mode_nmlz_combined = np.zeros(
+                (
+                    len(self.near_field_table[kname]),
+                    len(self.near_field_table[kname][0].kernel_exterior_normalizers),
+                )
+            )
+            for lev in range(len(self.near_field_table[kname])):
+                table_data_combined[lev, :] = self.near_field_table[kname][lev].data
+                mode_nmlz_combined[lev, :] = \
+                    self.near_field_table[kname][lev].mode_normalizers
+                exterior_mode_nmlz_combined[lev, :] = \
+                    self.near_field_table[kname][lev].kernel_exterior_normalizers

         logger.info(
             "Table data for kernel "
@@ -999,14 +1045,15 @@ class FPNDFMMLibExpansionWrangler(
             result=out_pot,
             box_centers=self.tree.box_centers,
             box_levels=self.tree.box_levels,
-            box_source_counts_cumul=self.tree.box_source_counts_cumul,
-            box_source_starts=self.tree.box_source_starts,
+            box_source_counts_cumul=self.tree.box_target_counts_cumul,
+            box_source_starts=self.tree.box_target_starts,
             box_target_counts_cumul=self.tree.box_target_counts_cumul,
             box_target_starts=self.tree.box_target_starts,
             case_indices=case_indices_dev,
             encoding_base=base,
             encoding_shift=shift,
             mode_nmlz_combined=mode_nmlz_combined,
+            exterior_mode_nmlz_combined=exterior_mode_nmlz_combined,
             neighbor_source_boxes_starts=neighbor_source_boxes_starts,
             root_extent=self.tree.root_extent,
             neighbor_source_boxes_lists=neighbor_source_boxes_lists,
diff --git a/volumential/expansion_wrangler_interface.py b/volumential/expansion_wrangler_interface.py
index ea006ffffd968a0346468cd19c0502c7cc8d8c15..4418f13f8cb1e35344e146669a2e6ed8a40511e9 100644
--- a/volumential/expansion_wrangler_interface.py
+++ b/volumential/expansion_wrangler_interface.py
@@ -75,6 +75,14 @@ class ExpansionWranglerInterface(object):
         """
         pass

+    @abstractmethod
+    def reorder_targets(self, target_array):
+        """
+        Return a copy of *target_array* in tree target order.
+        *target_array* is in user target order.
+        """
+        pass
+
     @abstractmethod
     def reorder_potentials(self, potentials):
         """
diff --git a/volumential/volume_fmm.py b/volumential/volume_fmm.py
index 3e96bc9a889dd5c09c85b3692ac4fb36e957002f..5695b228207cad15a2cabcb6dd5d7d6f68232664 100644
--- a/volumential/volume_fmm.py
+++ b/volumential/volume_fmm.py
@@ -90,12 +90,12 @@ def drive_volume_fmm(

     if isinstance(src_weights, cl.array.Array):
         src_weights = src_weights.get(wrangler.queue)
     if isinstance(src_func, cl.array.Array):
-        src_func = src_func.get()
+        src_func = src_func.get(wrangler.queue)

     logger.debug("reorder source weights")
     src_weights = wrangler.reorder_sources(src_weights)
-    src_func = wrangler.reorder_sources(src_func)
+    src_func = wrangler.reorder_targets(src_func)

     # {{{ Construct local multipoles

@@ -168,6 +168,10 @@

     if direct_evaluation:
         print("Warning: NOT USING FMM (forcing global p2p)")
+        if len(src_weights) != len(src_func):
+            print("Using P2P with different src/tgt discretizations can be "
+                  "unstable when targets are close to the sources but do not "
+                  "coincide with them exactly")

         # list 2 and beyond
         # First call global p2p, then subtract list 1
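A note on the manufactured solution driving experiments/sep.py: the example solves -Laplacian(u) = f with u = exp(-alpha*(x**2 + y**2)) and alpha = 160, and source_expr is the matching right-hand side. The identity can be checked symbolically with the short sketch below; it uses sympy, which is not a dependency of this patch, and the variable names simply mirror the script.

    # Sanity check (illustration only): source_expr == -Laplacian(solu_expr)
    import sympy as sp

    x, y, alpha = sp.symbols("x y alpha", positive=True)
    norm2 = x**2 + y**2

    u = sp.exp(-alpha * norm2)                                        # solu_expr
    f = -(4 * alpha**2 * norm2 - 4 * alpha) * sp.exp(-alpha * norm2)  # source_expr

    # u solves -Laplacian(u) = f, so the two expressions are consistent
    laplace_u = sp.diff(u, x, 2) + sp.diff(u, y, 2)
    assert sp.simplify(-laplace_u - f) == 0

With alpha = 160 the solution is negligibly small on the boundary of the [-0.5, 0.5]^2 box, which is why the script can compare the computed volume potential directly against u at the quadrature points.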
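On the new reordering helpers in volumential/expansion_wrangler_fpnd.py: inverse_id_map inverts the permutation stored in tree.sorted_target_ids, and reorder_targets uses that inverse to bring an array given in user target order into tree target order. The index algebra is illustrated below with plain numpy and a made-up four-element permutation; only the two indexing lines are taken from the patch.

    import numpy as np

    # a made-up permutation standing in for tree.sorted_target_ids
    sorted_target_ids = np.array([2, 0, 3, 1])

    # what inverse_id_map computes: the inverse permutation
    user_target_ids = np.zeros_like(sorted_target_ids)
    user_target_ids[sorted_target_ids] = np.arange(len(sorted_target_ids))

    # a target-side density given in user order ...
    density_user = np.array([10.0, 11.0, 12.0, 13.0])

    # ... brought into tree target order, as reorder_targets does
    density_tree = density_user[user_target_ids]

    # indexing with sorted_target_ids undoes the reordering
    assert np.array_equal(density_tree[sorted_target_ids], density_user)

This is also why drive_volume_fmm now treats its two density arguments differently: src_weights lives on the source discretization and keeps using reorder_sources, while src_func lives on the (generally different) target discretization and goes through the new reorder_targets. With separate source and target quadratures the two arrays need not even have the same length, which is what the new warning in the direct-evaluation path checks for.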