diff --git a/examples/curve-pot.py b/examples/curve-pot.py
index 4a09b403f755d9bdf35a250aff75453a7b60047c..310a11315efb1fe4d2c92bd0ef97fcc20f22d8dc 100644
--- a/examples/curve-pot.py
+++ b/examples/curve-pot.py
@@ -3,7 +3,6 @@ import pyopencl as cl
 import numpy as np
 import numpy.linalg as la
 import matplotlib.pyplot as pt
-from pytools import Record
 
 
 def process_kernel(knl, what_operator):
@@ -16,8 +15,8 @@ def process_kernel(knl, what_operator):
         from sumpy.kernel import TargetDerivative
         knl = TargetDerivative(1, knl)
     elif what_operator == "D":
-        from sumpy.kernel import SourceDerivative
-        knl = SourceDerivative(knl)
+        from sumpy.kernel import DirectionalSourceDerivative
+        knl = DirectionalSourceDerivative(knl)
     # DirectionalTargetDerivative (temporarily?) removed
     # elif what_operator == "S'":
     #     from sumpy.kernel import DirectionalTargetDerivative
@@ -49,7 +48,7 @@ def draw_pot_figure(aspect_ratio,
 
     center = np.asarray([0, 0], dtype=np.float64)
     from sumpy.visualization import FieldPlotter
-    fp = FieldPlotter(center, points=1000, extent=3)
+    fp = FieldPlotter(center, npoints=1000, extent=3)
 
     # }}}
 
@@ -57,7 +56,7 @@ def draw_pot_figure(aspect_ratio,
 
     from sumpy.p2p import P2P
     from sumpy.kernel import LaplaceKernel, HelmholtzKernel
-    from sumpy.local_expansion import H2DLocalExpansion, LineTaylorLocalExpansion
+    from sumpy.expansion.local import H2DLocalExpansion, LineTaylorLocalExpansion
     if helmholtz_k:
         knl = HelmholtzKernel(2)
         expn_class = H2DLocalExpansion
@@ -73,14 +72,11 @@ def draw_pot_figure(aspect_ratio,
 
     lpot_knl = process_kernel(knl, what_operator_lpot)
 
-    from sumpy.layerpot import LayerPotential, JumpTermApplier
+    from sumpy.qbx import LayerPotential
     lpot = LayerPotential(ctx, [
         expn_class(lpot_knl, order=order)],
             value_dtypes=np.complex128)
 
-    jt = JumpTermApplier(ctx, [lpot_knl],
-            value_dtypes=np.complex128)
-
     # }}}
 
     # {{{ set up geometry
@@ -155,16 +151,7 @@ def draw_pot_figure(aspect_ratio,
             evt, (y,) = lpot(queue, native_curve.pos, ovsmp_curve.pos, centers,
                     [xovsmp], ovsmp_curve.speed, ovsmp_weights,
                     **lpot_kwargs)
-            if 0:
-                class JumpTermArgs(Record):
-                    pass
-                evt, (y,) = jt(queue, [y], JumpTermArgs(
-                    src_derivative_dir=native_curve.normal,
-                    tgt_derivative_dir=native_curve.normal,
-                    density_0=x,
-                    side=center_side,
-                    normal=native_curve.normal,
-                    ))
+
             return y
 
         op = LinearOperator((nsrc, nsrc), apply_lpot)
@@ -189,21 +176,6 @@ def draw_pot_figure(aspect_ratio,
 
     # }}}
 
-    if 0:
-        # {{{ apply jump terms
-
-        class JumpTermArgs(Record):
-            pass
-        evt, (curve_pot,) = jt(queue, [curve_pot], JumpTermArgs(
-            src_derivative_dir=native_curve.normal,
-            tgt_derivative_dir=native_curve.normal,
-            density_0=density,
-            side=center_side,
-            normal=native_curve.normal,
-            ))
-
-        # }}}
-
     if 0:
         # {{{ plot on-surface potential in 2D
 
@@ -273,6 +245,7 @@ def draw_pot_figure(aspect_ratio,
 
 
 if __name__ == "__main__":
+    1/0  # FIXME update to new conventions
     draw_pot_figure(aspect_ratio=1, nsrc=100, novsmp=100, helmholtz_k=0,
             what_operator="D", what_operator_lpot="D", force_center_side=1)
     pt.savefig("eigvals-ext-nsrc100-novsmp100.pdf")
diff --git a/sumpy/__init__.py b/sumpy/__init__.py
index 279f6683ae96480ca398bdc64c1464315a056950..372d58c88e3978052bcc9c781f7c39ae4b4412a0 100644
--- a/sumpy/__init__.py
+++ b/sumpy/__init__.py
@@ -22,9 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from sumpy.p2p import P2P
-from sumpy.p2e import P2E
-from sumpy.e2p import E2P
-from sumpy.e2e import E2E
+from sumpy.p2p import P2P, P2PFromCSR
+from sumpy.p2e import P2EFromLocal, P2EFromCSR
+from sumpy.e2p import E2PFromLocal, E2PFromCSR
+from sumpy.e2e import E2EFromCSR, E2EFromChildren, E2EFromParent
 
-__all__ = ["P2P", "P2E", "E2P", "E2E"]
+__all__ = [
+    "P2P", "P2PFromCSR",
+    "P2EFromLocal", "P2EFromCSR",
+    "E2PFromLocal", "E2PFromCSR",
+    "E2EFromCSR", "E2EFromChildren", "E2EFromParent"]
diff --git a/sumpy/e2e.py b/sumpy/e2e.py
index bc6f96f97805c5a48c040641b5c321b09078ea81..3168c87a7f135346bd7cc36804916e754998dc20 100644
--- a/sumpy/e2e.py
+++ b/sumpy/e2e.py
@@ -28,15 +28,17 @@ import sympy as sp
 from pytools import memoize_method
 
 
-class E2E(object):
+# {{{ translation base class
+
+class E2EBase(object):
     def __init__(self, ctx, src_expansion, tgt_expansion,
             options=[], name="e2e", device=None):
         """
         :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase`
         :arg strength_usage: A list of integers indicating which expression
-          uses which source strength indicator. This implicitly specifies the
-          number of strength arrays that need to be passed.
-          Default: all kernels use the same strength.
+            uses which source strength indicator. This implicitly specifies the
+            number of strength arrays that need to be passed.
+            Default: all kernels use the same strength.
         """
 
         if device is None:
@@ -55,16 +57,12 @@ class E2E(object):
 
         self.dim = src_expansion.dim
 
-    @memoize_method
-    def get_kernel(self):
+    def get_translation_loopy_insns(self):
         from sumpy.symbolic import make_sym_vector
         dvec = make_sym_vector("d", self.dim)
 
-        ncoeff_src = len(self.src_expansion.get_coefficient_identifiers())
-        ncoeff_tgt = len(self.tgt_expansion.get_coefficient_identifiers())
-
         src_coeff_exprs = [sp.Symbol("src_coeff%d" % i)
-                for i in xrange(ncoeff_src)]
+                for i in xrange(len(self.src_expansion))]
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
@@ -83,34 +81,49 @@ class E2E(object):
                 retain_names=tgt_coeff_names)
 
         from sumpy.codegen import to_loopy_insns
-        loopy_insns = to_loopy_insns(assignments,
+        return to_loopy_insns(
+                assignments,
                 vector_names=set(["d"]),
                 pymbolic_expr_maps=[self.tgt_expansion.transform_to_code],
                 complex_dtype=np.complex128  # FIXME
                 )
 
-        from sumpy.tools import gather_arguments
-        arguments = (
-                [
-                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
-                    lp.GlobalArg("src_expansions", None,
-                        shape=("nboxes", ncoeff_src)),
-                    lp.GlobalArg("tgt_expansions", None,
-                        shape=("nboxes", ncoeff_tgt)),
-                    lp.GlobalArg("src_box_starts, src_box_lists",
-                        None, shape=None, strides=(1,)),
-                    lp.ValueArg("nboxes", np.int32),
-                    "..."
-                ] + gather_arguments([self.src_expansion, self.tgt_expansion])
-                )
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
+        return knl
+
+# }}}
+
+
+# {{{ translation from "compressed sparse row"-like source box lists
+
+class E2EFromCSR(E2EBase):
+    """Implements translation from a "compressed sparse row"-like source box
+    list.
+    """
+
+    def get_kernel(self):
+        ncoeff_src = len(self.src_expansion)
+        ncoeff_tgt = len(self.tgt_expansion)
+
+        # To clarify terminology:
+        #
+        # isrc_box -> The index in a list of (in this case, source) boxes
+        # src_ibox -> The (global) box number for the (in this case, source) box
+        #
+        # (same for itgt_box, tgt_ibox)
 
+        from sumpy.tools import gather_arguments
         loopy_knl = lp.make_kernel(self.device,
                 [
                     "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                     "{[isrc_box]: isrc_start<=isrc_box<isrc_stop}",
                     "{[idim]: 0<=idim<dim}",
                     ],
-                loopy_insns
+                self.get_translation_loopy_insns()
                 + ["""
                     <> tgt_ibox = target_boxes[itgt_box]
                     <> tgt_center[idim] = centers[idim, tgt_ibox] \
@@ -123,13 +136,22 @@ class E2E(object):
                     <> src_center[idim] = centers[idim, src_ibox] \
                             {id=fetch_src_center}
                     <> d[idim] = tgt_center[idim] - src_center[idim]
-
                     <> src_coeff${SRC_COEFFIDX} = \
                         src_expansions[src_ibox, ${SRC_COEFFIDX}]
                     tgt_expansions[tgt_ibox, ${TGT_COEFFIDX}] = \
-                        coeff${TGT_COEFFIDX} {id_prefix=write_expn}
+                            coeff${TGT_COEFFIDX} {id_prefix=write_expn}
                     """],
-                arguments,
+                [
+                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
+                    lp.GlobalArg("src_box_starts, src_box_lists",
+                        None, shape=None, strides=(1,)),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.GlobalArg("src_expansions", None,
+                        shape=("nboxes", ncoeff_src)),
+                    lp.GlobalArg("tgt_expansions", None,
+                        shape=("nboxes", ncoeff_tgt)),
+                    "..."
+                ] + gather_arguments([self.src_expansion, self.tgt_expansion]),
                 name=self.name, assumptions="ntgt_boxes>=1",
                 defines=dict(
                     dim=self.dim,
@@ -147,12 +169,174 @@ class E2E(object):
 
         return loopy_knl
 
-    @memoize_method
-    def get_optimized_kernel(self):
-        # FIXME
-        knl = self.get_kernel()
-        knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
-        return knl
+    def __call__(self, queue, **kwargs):
+        """
+        :arg src_expansions:
+        :arg src_box_starts:
+        :arg src_box_lists:
+        :arg centers:
+        """
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, **kwargs)
+
+# }}}
+
+
+# {{{ translation from a box's children
+
+class E2EFromChildren(E2EBase):
+    def get_kernel(self):
+        if self.src_expansion is not self.tgt_expansion:
+            raise RuntimeError("%s requires that the source "
+                    "and target expansion are the same object")
+
+        ncoeffs = len(self.src_expansion)
+
+        # To clarify terminology:
+        #
+        # isrc_box -> The index in a list of (in this case, source) boxes
+        # src_ibox -> The (global) box number for the (in this case, source) box
+        #
+        # (same for itgt_box, tgt_ibox)
+
+        loopy_insns = [
+                insn.copy(predicates=insn.predicates
+                    | frozenset(["is_src_box_valid"]))
+                for insn in self.get_translation_loopy_insns()]
+
+        from sumpy.tools import gather_arguments
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
+                    "{[isrc_box]: 0<=isrc_box<nchildren}",
+                    "{[idim]: 0<=idim<dim}",
+                    ],
+                loopy_insns
+                + ["""
+                    <> tgt_ibox = target_boxes[itgt_box]
+                    <> tgt_center[idim] = centers[idim, tgt_ibox] \
+                        {id=fetch_tgt_center}
+
+                    <> src_ibox = box_child_ids[isrc_box,itgt_box]
+                    <> is_src_box_valid = src_ibox != 0
+
+                    <> src_center[idim] = centers[idim, src_ibox] \
+                        {id=fetch_src_center,if=is_src_box_valid}
+                    <> d[idim] = tgt_center[idim] - src_center[idim] \
+                        {if=is_src_box_valid}
+
+                    <> src_coeff${COEFFIDX} = \
+                        expansions[src_ibox, ${COEFFIDX}] \
+                        {if=is_src_box_valid}
+                    expansions[tgt_ibox, ${COEFFIDX}] = \
+                        expansions[tgt_ibox, ${COEFFIDX}] + coeff${COEFFIDX} \
+                        {id_prefix=write_expn,if=is_src_box_valid,dep=}
+                    """],
+                [
+                    lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
+                    lp.GlobalArg("box_child_ids", None,
+                        shape="nchildren, aligned_nboxes"),
+                    lp.GlobalArg("expansions", None,
+                        shape=("nboxes", ncoeffs)),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.ValueArg("aligned_nboxes", np.int32),
+                    "..."
+                ] + gather_arguments([self.src_expansion, self.tgt_expansion]),
+                name=self.name, assumptions="ntgt_boxes>=1",
+                defines=dict(
+                    dim=self.dim,
+                    nchildren=2**self.dim,
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)],
+                    ),
+                silenced_warnings="write_race(write_expn*)")
+
+        for expn in [self.src_expansion, self.tgt_expansion]:
+            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_tgt_center",
+                tags={"idim": "unr"})
+        loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr"))
+
+        return loopy_knl
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg src_expansions:
+        :arg src_box_starts:
+        :arg src_box_lists:
+        :arg centers:
+        """
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, **kwargs)
+
+# }}}
+
+
+# {{{ translation from a box's parent
+
+class E2EFromParent(E2EBase):
+    def get_kernel(self):
+        if self.src_expansion is not self.tgt_expansion:
+            raise RuntimeError("%s requires that the source "
+                    "and target expansion are the same object")
+
+        ncoeffs = len(self.src_expansion)
+
+        # To clarify terminology:
+        #
+        # isrc_box -> The index in a list of (in this case, source) boxes
+        # src_ibox -> The (global) box number for the (in this case, source) box
+        #
+        # (same for itgt_box, tgt_ibox)
+
+        from sumpy.tools import gather_arguments
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
+                    "{[idim]: 0<=idim<dim}",
+                    ],
+                self.get_translation_loopy_insns()
+                + ["""
+                    <> tgt_ibox = target_boxes[itgt_box]
+                    <> tgt_center[idim] = centers[idim, tgt_ibox] \
+                        {id=fetch_tgt_center}
+
+                    <> src_ibox = box_parent_ids[tgt_ibox]
+                    <> src_center[idim] = centers[idim, src_ibox] \
+                        {id=fetch_src_center}
+                    <> d[idim] = tgt_center[idim] - src_center[idim]
+
+                    <> src_coeff${COEFFIDX} = \
+                        expansions[src_ibox, ${COEFFIDX}]
+                    expansions[tgt_ibox, ${COEFFIDX}] = \
+                        expansions[tgt_ibox, ${COEFFIDX}] + coeff${COEFFIDX} \
+                        {id_prefix=write_expn,if=is_src_box_valid}
+                    """],
+                [
+                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.GlobalArg("expansions", None,
+                        shape=("nboxes", ncoeffs)),
+                    "..."
+                ] + gather_arguments([self.src_expansion, self.tgt_expansion]),
+                name=self.name, assumptions="ntgt_boxes>=1",
+                defines=dict(
+                    dim=self.dim,
+                    nchildren=2**self.dim,
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)],
+                    ),
+                silenced_warnings="write_race(write_expn*)")
+
+        for expn in [self.src_expansion, self.tgt_expansion]:
+            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_tgt_center",
+                tags={"idim": "unr"})
+        loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr"))
+
+        return loopy_knl
 
     def __call__(self, queue, **kwargs):
         """
@@ -164,3 +348,7 @@ class E2E(object):
         knl = self.get_optimized_kernel()
 
         return knl(queue, **kwargs)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/sumpy/e2p.py b/sumpy/e2p.py
index 00c760781c93c7b11d6d965985392acd6ed34ed0..18cc8f8ea764afc6ea2bc665a0519aed3b5bab3d 100644
--- a/sumpy/e2p.py
+++ b/sumpy/e2p.py
@@ -28,7 +28,9 @@ import sympy as sp
 from pytools import memoize_method
 
 
-class E2P(object):
+# {{{ E2P base class
+
+class E2PBase(object):
     def __init__(self, ctx, expansion, kernels,
             options=[], name="e2p", device=None):
         """
@@ -56,8 +58,7 @@ class E2P(object):
         for knl in kernels:
             assert tdr(knl) == expansion.kernel
 
-    @memoize_method
-    def get_kernel(self):
+    def get_loopy_insns_and_result_names(self):
         from sumpy.symbolic import make_sym_vector
         bvec = make_sym_vector("b", self.dim)
 
@@ -96,6 +97,20 @@ class E2P(object):
                     expression=sympy_conv(self.expansion.kernel.get_scaling()),
                     temp_var_type=lp.auto))
 
+        return loopy_insns, result_names
+
+# }}}
+
+
+# {{{ box-local E2P (L2P, likely)
+
+class E2PFromLocal(E2PBase):
+    @memoize_method
+    def get_kernel(self):
+        ncoeffs = len(self.expansion)
+
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+
         arguments = (
                 [
                     lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")),
@@ -103,7 +118,7 @@ class E2P(object):
                         None, shape=None),
                     lp.GlobalArg("centers", None, shape="dim, nboxes"),
                     lp.GlobalArg("expansions", None,
-                        shape=("nboxes", len(coeff_exprs))),
+                        shape=("nboxes", ncoeffs)),
                     lp.ValueArg("nboxes", np.int32),
                     lp.ValueArg("ntargets", np.int32),
                     "..."
@@ -133,7 +148,7 @@ class E2P(object):
                 name=self.name, assumptions="ntgt_boxes>=1",
                 defines=dict(
                     dim=self.dim,
-                    COEFFIDX=[str(i) for i in xrange(len(coeff_exprs))],
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)],
                     RESULTIDX=[str(i) for i in xrange(len(result_names))],
                     )
                 )
@@ -163,3 +178,89 @@ class E2P(object):
         knl = self.get_optimized_kernel()
 
         return knl(queue, **kwargs)
+
+# }}}
+
+
+# {{{ E2P from CSR-like interaction list
+
+class E2PFromCSR(E2PBase):
+    @memoize_method
+    def get_kernel(self):
+        ncoeffs = len(self.expansion)
+
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+
+        arguments = (
+                [
+                    lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")),
+                    lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
+                        None, shape=None),
+                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
+                    lp.GlobalArg("expansions", None,
+                        shape=("nboxes", ncoeffs)),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.ValueArg("ntargets", np.int32),
+                    "..."
+                ] + [
+                    lp.GlobalArg("result_%d" % i, None, shape="ntargets")
+                    for i in range(len(result_names))
+                ] + self.expansion.get_args()
+                )
+
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
+                    "{[itgt]: itgt_start<=itgt<itgt_end}",
+                    "{[isrc_box,idim]: isrc_box_start<=isrc_box<=isrc_box_end \
+                            and 0<=idim<dim}",
+                    ],
+                loopy_insns
+                + ["""
+                    <> tgt_ibox = target_boxes[itgt_box]
+                    <> itgt_start = box_target_starts[itgt_box]
+                    <> itgt_end = itgt_start+box_target_counts_nonchild[itgt_box]
+                    <> tgt[idim] = targets[idim] {id=fetch_tgt}
+
+                    <> isrc_box_start = source_box_starts[isrc_box]
+                    <> isrc_box_end = source_box_starts[isrc_box+1]
+
+                    <> src_ibox = source_box_lists[isrc_box]
+                    <> coeff${COEFFIDX} = expansions[src_ibox, ${COEFFIDX}]
+                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}
+
+                    <> b[idim] = tgt[idim] - center[idim]
+                    result[${RESULTIDX, itgt] = \
+                            kernel_scaling * sum(isrc_box, result_${RESULTIDX}_p)
+                """],
+                arguments,
+                name=self.name, assumptions="ntgt_boxes>=1",
+                defines=dict(
+                    dim=self.dim,
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)],
+                    RESULTIDX=[str(i) for i in xrange(len(result_names))],
+                    )
+                )
+
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_tgt",
+                tags={"idim": "unr"})
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center",
+                tags={"idim": "unr"})
+        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        #knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, **kwargs):
+        knl = self.get_optimized_kernel()
+        return knl(queue, **kwargs)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py
index 6477b9ab4d9b7de6ba5b3a096ee57de42e29b51d..6d1d540c643be004d48c6809b62bf95243302229 100644
--- a/sumpy/expansion/__init__.py
+++ b/sumpy/expansion/__init__.py
@@ -63,6 +63,9 @@ class ExpansionBase(object):
 
     # }}}
 
+    def __len__(self):
+        return len(self.get_coefficient_identifiers())
+
     def coefficients_from_source(self, expr, avec, bvec):
         """
         :arg bvec: vector from center to target. Not usually necessary,
diff --git a/sumpy/fmm.py b/sumpy/fmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..10de0de3745a65013255010e73de668ba543e8e3
--- /dev/null
+++ b/sumpy/fmm.py
@@ -0,0 +1,280 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+__doc__ = """Integrates :mod:`boxtree` with :mod:`sumpy`.
+"""
+
+
+import pyopencl as cl
+import pyopencl.array  # noqa
+
+
+class SumpyExpansionWranglerCodeContainer(object):
+    """Objects of this type serve as a place to keep the code needed
+    for :class:`SumpyExpansionWrangler`. Since :class:`SumpyExpansionWrangler`
+    necessarily must have a :class:`pyopencl.CommandQueue`, but this queue
+    is allowed to be more ephemeral than the code, the code's lifetime
+    is decoupled by storing it in this object.
+    """
+
+    def __init__(self, cl_context, tree,
+            multipole_expansion, local_expansion, out_kernels):
+        self.tree = tree
+        m_expn = self.multipole_expansion = multipole_expansion
+        l_expn = self.local_expansion = local_expansion
+
+        self.cl_context = cl_context
+
+        from sumpy import (
+                P2EFromLocal, P2EFromCSR,
+                E2PFromLocal, E2PFromCSR,
+                P2PFromCSR,
+                E2EFromCSR, E2EFromChildren, E2EFromParent)
+        self.p2m = P2EFromLocal(cl_context, m_expn)
+        self.p2l = P2EFromCSR(cl_context, l_expn)
+        self.m2m = E2EFromChildren(cl_context, m_expn, m_expn)
+        self.m2l = E2EFromCSR(cl_context, m_expn, l_expn)
+        self.l2l = E2EFromParent(cl_context, l_expn, l_expn)
+        self.m2p = E2PFromCSR(cl_context, m_expn, out_kernels)
+        self.l2p = E2PFromLocal(cl_context, l_expn, out_kernels)
+        self.p2p = P2PFromCSR(cl_context, out_kernels, exclude_self=False)
+
+    def get_wrangler(self, queue, dtype, extra_kwargs={}):
+        return SumpyExpansionWrangler(self, queue, dtype, extra_kwargs)
+
+
+class SumpyExpansionWrangler(object):
+    """Implements the :class:`boxtree.fmm.ExpansionWranglerInterface`
+    by using :mod:`sumpy` expansions/translations.
+    """
+
+    def __init__(self, code_container, queue, dtype, extra_kwargs):
+        self.code = code_container
+        self.queue = queue
+        self.dtype = dtype
+        self.extra_kwargs = extra_kwargs
+
+    @property
+    def tree(self):
+        return self.code.tree
+
+    @property
+    def multipole_expansion(self):
+        return self.code.multipole_expansion
+
+    @property
+    def local_expansion(self):
+        return self.code.local_expansion
+
+    def multipole_expansion_zeros(self):
+        return cl.array.zeros(
+                self.queue,
+                (self.tree.nboxes, len(self.multipole_expansion)),
+                dtype=self.dtype)
+
+    def local_expansion_zeros(self):
+        return cl.array.zeros(
+                self.queue,
+                (self.tree.nboxes, len(self.local_expansion)),
+                dtype=self.dtype)
+
+    def potential_zeros(self):
+        return tuple(
+                cl.array.zeros(
+                    self.queue,
+                    self.tree.ntargets,
+                    dtype=self.dtype)
+                for k in self.out_kernels)
+
+    def reorder_src_weights(self, src_weights):
+        return src_weights.with_queue(self.queue)[self.tree.user_source_ids]
+
+    def reorder_potentials(self, potentials):
+        return potentials.with_queue(self.queue)[self.tree.sorted_target_ids]
+
+    def form_multipoles(self, source_boxes, src_weights):
+        mpoles = self.multipole_expansion_zeros()
+
+        evt, (mpoles_res,) = self.code.p2m(self.queue,
+                source_boxes=source_boxes,
+                box_source_starts=self.tree.box_source_starts,
+                box_source_counts_nonchild=self.tree.box_source_counts_nonchild,
+                centers=self.tree.box_centers,
+                sources=self.tree.sources,
+                strengths=src_weights,
+                expansions=mpoles,
+
+                **self.extra_kwargs)
+
+        assert mpoles_res is mpoles
+
+        return mpoles
+
+    def coarsen_multipoles(self, parent_boxes, mpoles):
+        evt, (mpoles_res,) = self.code.m2m(self.queue,
+                expansions=mpoles,
+                target_boxes=parent_boxes,
+                box_child_ids=self.tree.box_child_ids,
+                centers=self.tree.box_centers,
+                **self.extra_kwargs)
+
+        assert mpoles_res is mpoles
+        return mpoles
+
+    def eval_direct(self, target_boxes, source_box_starts,
+            source_box_lists, src_weights):
+        pot = self.potential_zeros()
+
+        evt, pot_res = self.code.p2p(self.queue,
+                target_boxes=target_boxes,
+                source_box_starts=source_box_starts,
+                source_box_lists=source_box_lists,
+                strengths=(src_weights,),
+                box_target_starts=self.tree.box_target_starts,
+                box_target_counts_nonchild=self.tree.box_target_counts_nonchild,
+                sources=self.tree.sources,
+                targets=self.tree.targets,
+                result=pot,
+
+                **self.extra_kwargs)
+
+        for pot_i, pot_res_i in zip(pot, pot_res):
+            assert pot_i is pot_res_i
+
+        return pot_res
+
+    def multipole_to_local(self, target_or_target_parent_boxes,
+            starts, lists, mpole_exps):
+        local_exps = self.local_expansion_zeros()
+
+        evt, (local_exps_res,) = self.m2l(self.queue,
+                src_expansions=mpole_exps,
+                target_boxes=target_or_target_parent_boxes,
+                src_box_starts=starts,
+                src_box_lists=lists,
+                centers=self.tree.centers,
+                tgt_expansions=local_exps,
+
+                **self.extra_kwargs)
+
+        assert local_exps_res is local_exps
+
+        return local_exps
+
+    def eval_multipoles(self, target_boxes, sep_smaller_nonsiblings_starts,
+            sep_smaller_nonsiblings_lists, mpole_exps):
+        pot = self.potential_zeros()
+
+        evt, pot_res = self.m2p(self.queue,
+                expansions=mpole_exps,
+                target_boxes=target_boxes,
+                box_target_starts=self.tree.box_target_starts,
+                box_target_counts_nonchild=self.tree.box_target_counts_nonchild,
+                source_box_starts=sep_smaller_nonsiblings_starts,
+                source_box_lists=sep_smaller_nonsiblings_lists,
+                centers=self.tree.centers,
+                targets=self.tree.targets,
+
+                **self.extra_kwargs)
+
+        for pot_i, pot_res_i in zip(pot, pot_res):
+            assert pot_i is pot_res_i
+
+        return pot
+
+    def form_locals(self, target_or_target_parent_boxes, starts, lists, src_weights):
+        local_exps = self.expansion_zeros()
+
+        evt, (local_exps,) = self.p2l(self.queue,
+                source_boxes=source_boxes,
+                box_source_starts=box_source_starts,
+                box_source_counts_nonchild=box_source_counts_nonchild,
+                centers=self.tree.centers,
+                sources=self.tree.sources,
+                strengths=src_weights,
+
+                **self.extra_kwargs)
+        from pyfmmlib import h2dformta
+
+        for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
+            start, end = starts[itgt_box:itgt_box+2]
+
+            contrib = 0
+
+            for src_ibox in lists[start:end]:
+                src_pslice = self._get_source_slice(src_ibox)
+                tgt_center = self.tree.box_centers[:, tgt_ibox]
+
+                if src_pslice.stop - src_pslice.start == 0:
+                    continue
+
+                ier, mpole = h2dformta(
+                        self.helmholtz_k, rscale,
+                        self._get_sources(src_pslice), src_weights[src_pslice],
+                        tgt_center, self.nterms)
+                if ier:
+                    raise RuntimeError("h2dformta failed")
+
+                contrib = contrib + mpole
+
+            local_exps[tgt_ibox] = contrib
+
+        return local_exps
+
+    def refine_locals(self, child_boxes, local_exps):
+        from pyfmmlib import h2dlocloc_vec
+
+        for tgt_ibox in child_boxes:
+            tgt_center = self.tree.box_centers[:, tgt_ibox]
+            src_ibox = self.tree.box_parent_ids[tgt_ibox]
+            src_center = self.tree.box_centers[:, src_ibox]
+
+            tmp_loc_exp = h2dlocloc_vec(
+                        self.helmholtz_k,
+                        rscale, src_center, local_exps[src_ibox],
+                        rscale, tgt_center, self.nterms)[:, 0]
+
+            local_exps[tgt_ibox] += tmp_loc_exp
+
+        return local_exps
+
+    def eval_locals(self, target_boxes, local_exps):
+        pot = self.potential_zeros()
+        rscale = 1  # FIXME
+
+        from pyfmmlib import h2dtaeval_vec
+
+        for tgt_ibox in target_boxes:
+            tgt_pslice = self._get_target_slice(tgt_ibox)
+
+            if tgt_pslice.stop - tgt_pslice.start == 0:
+                continue
+
+            tmp_pot, _, _ = h2dtaeval_vec(self.helmholtz_k, rscale,
+                    self.tree.box_centers[:, tgt_ibox], local_exps[tgt_ibox],
+                    self._get_targets(tgt_pslice), ifgrad=False, ifhess=False)
+
+            pot[tgt_pslice] += tmp_pot
+
+        return pot
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index 0f9a9a474251584b04885d76e2ac34e56550656a..9ec5acf13681e8bfa0e8e15f12b32e66520ebb00 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -389,7 +389,7 @@ class DirectionalDerivative(DerivativeBase):
 
     def get_source_args(self):
         return [
-            lp.GlobalArg(self.dir_vec_name, None, shape=(self.dim, "nsrc"),
+            lp.GlobalArg(self.dir_vec_name, None, shape=(self.dim, "nsources"),
                 dim_tags="sep,C")] + self.inner_kernel.get_source_args()
 
 
diff --git a/sumpy/p2e.py b/sumpy/p2e.py
index 6f9287619c35e18b5ddd28d6a1f1d88f358b2f9f..9bd731d6a7e3bdab6beb8e4563d47d0abb277f94 100644
--- a/sumpy/p2e.py
+++ b/sumpy/p2e.py
@@ -27,7 +27,9 @@ import loopy as lp
 from pytools import memoize_method
 
 
-class P2E(object):
+# {{{ P2E base class
+
+class P2EBase(object):
     def __init__(self, ctx, expansion,
             options=[], name="p2e", device=None):
         """
@@ -49,8 +51,7 @@ class P2E(object):
 
         self.dim = expansion.dim
 
-    @memoize_method
-    def get_kernel(self):
+    def get_looy_instructions(self):
         from sumpy.symbolic import make_sym_vector
         avec = make_sym_vector("a", self.dim)
 
@@ -71,55 +72,147 @@ class P2E(object):
                 retain_names=coeff_names)
 
         from sumpy.codegen import to_loopy_insns
-        loopy_insns = to_loopy_insns(assignments,
+        return to_loopy_insns(
+                assignments,
                 vector_names=set(["a"]),
                 pymbolic_expr_maps=[self.expansion.transform_to_code],
                 complex_dtype=np.complex128  # FIXME
                 )
 
+# }}}
+
+
+# {{{ P2E from local boxes
+
+class P2EFromLocal(P2EBase):
+    @memoize_method
+    def get_kernel(self):
+        ncoeffs = len(self.expansion)
+
+        from sumpy.tools import gather_source_arguments
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
+                    "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
+                    ],
+                self.get_looy_instructions()
+                + ["""
+                    <> src_ibox = source_boxes[isrc_box]
+                    <> isrc_start = box_source_starts[isrc_box]
+                    <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box]
+                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}
+                    <> a[idim] = center[idim] - sources[idim, isrc] \
+                            {id=compute_a}
+                    <> strength = strengths[isrc]
+                    expansions[src_ibox, ${COEFFIDX}] = \
+                            sum(isrc, strength*coeff${COEFFIDX}) \
+                            {id_prefix=write_expn}
+                    """],
+                [
+                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources")),
+                    lp.GlobalArg("strengths", None, shape="nsources"),
+                    lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
+                        None, shape=None),
+                    lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
+                    lp.GlobalArg("expansions", None,
+                        shape=("nboxes", ncoeffs)),
+                    lp.ValueArg("nboxes,aligned_nboxes", np.int32),
+                    lp.ValueArg("nsources", np.int32),
+                    "..."
+                ] + gather_source_arguments([self.expansion]),
+                name=self.name, assumptions="nsrc_boxes>=1",
+                defines=dict(
+                    dim=self.dim,
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)]
+                    ),
+                silenced_warnings="write_race(write_expn*)")
+
+        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center",
+                tags={"idim": "unr"})
+        loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr"))
+        loopy_knl = lp.tag_data_axes(loopy_knl, "sources", "sep,c")
+
+        return loopy_knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg expansions:
+        :arg source_boxes:
+        :arg box_source_starts:
+        :arg box_source_counts_nonchild:
+        :arg centers:
+        :arg sources:
+        :arg strengths:
+        """
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, **kwargs)
+
+# }}}
+
+
+# {{{ P2E from CSR-like interaction list
+
+class P2EFromCSR(P2EBase):
+    @memoize_method
+    def get_kernel(self):
+        ncoeffs = len(self.expansion)
+
+        # FIXXME
         from sumpy.tools import gather_source_arguments
         arguments = (
                 [
-                    lp.GlobalArg("sources", None, shape=(self.dim, "nsrc")),
-                    lp.GlobalArg("strengths", None, shape="nsrc"),
+                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources")),
+                    lp.GlobalArg("strengths", None, shape="nsources"),
                     lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                         None, shape=None),
                     lp.GlobalArg("centers", None, shape="dim, nboxes"),
                     lp.GlobalArg("expansions", None,
-                        shape=("nboxes", len(coeff_names))),
+                        shape=("nboxes", ncoeffs)),
                     lp.ValueArg("nboxes", np.int32),
-                    lp.ValueArg("nsrc", np.int32),
+                    lp.ValueArg("nsources", np.int32),
                     "..."
                 ] + gather_source_arguments([self.expansion]))
 
         loopy_knl = lp.make_kernel(self.device,
                 [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                     "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
                     "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
                     ],
-                loopy_insns
-                + [
-                    "<> src_ibox = source_boxes[isrc_box]",
-                    "<> isrc_start = box_source_starts[isrc_box]",
-                    "<> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box]",
-                    "<> center[idim] = centers[idim, src_ibox] {id=fetch_center}",
-                    "<> a[idim] = center[idim] - sources[idim, isrc]",
-                    "<> strength = strengths[isrc]",
-                    "expansions[src_ibox, ${COEFFIDX}] = "
-                            "sum(isrc, strength*coeff${COEFFIDX})"
-                            "{id_prefix=write_expn}"
-                ],
+                self.get_looy_instructions()
+                + ["""
+                    <> src_ibox = source_boxes[isrc_box]
+                    <> isrc_start = box_source_starts[isrc_box]
+                    <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box]
+                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}
+                    <> a[idim] = center[idim] - sources[idim, isrc] {id=compute_a}
+                    <> strength = strengths[isrc]
+                    expansions[src_ibox, ${COEFFIDX}] = \
+                            sum(isrc, strength*coeff${COEFFIDX}) \
+                            {id_prefix=write_expn}
+                    """],
                 arguments,
                 name=self.name, assumptions="nsrc_boxes>=1",
                 defines=dict(
                     dim=self.dim,
-                    COEFFIDX=[str(i) for i in xrange(len(coeff_names))]
+                    COEFFIDX=[str(i) for i in xrange(ncoeffs)]
                     ),
                 silenced_warnings="write_race(write_expn*)")
 
         loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
         loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center",
                 tags={"idim": "unr"})
+        loopy_knl = lp.tag_inames(loopy_knl, dict(idim="unr"))
+        loopy_knl = lp.tag_data_axes(loopy_knl, "sources", "sep,c")
 
         return loopy_knl
 
@@ -143,3 +236,7 @@ class P2E(object):
         knl = self.get_optimized_kernel()
 
         return knl(queue, **kwargs)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index e7129684e3336fdbe268b4e52fd98acacf082d07..20c133650388d5da639891c7d3e22ddab772bd94 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -32,9 +32,11 @@ from sumpy.tools import KernelComputation
 # LATER:
 # - Optimization for source == target (postpone)
 
-class P2P(KernelComputation):
+# {{{ p2p base class
+
+class P2PBase(KernelComputation):
     def __init__(self, ctx, kernels,  exclude_self, strength_usage=None,
-            value_dtypes=None, strength_dtypes=None,
+            value_dtypes=None,
             options=[], name="p2p", device=None):
         """
         :arg kernels: list of :class:`sumpy.kernel.Kernel` instances
@@ -44,7 +46,7 @@ class P2P(KernelComputation):
           Default: all kernels use the same strength.
         """
         KernelComputation.__init__(self, ctx, kernels, strength_usage,
-                value_dtypes, strength_dtypes,
+                value_dtypes,
                 name, options, device)
 
         self.exclude_self = exclude_self
@@ -52,8 +54,7 @@ class P2P(KernelComputation):
         from pytools import single_valued
         self.dim = single_valued(knl.dim for knl in self.kernels)
 
-    @memoize_method
-    def get_kernel(self):
+    def get_loopy_insns_and_result_names(self):
         from sumpy.symbolic import make_sym_vector
         dvec = make_sym_vector("d", self.dim)
 
@@ -77,6 +78,18 @@ class P2P(KernelComputation):
                 complex_dtype=np.complex128  # FIXME
                 )
 
+        return loopy_insns, result_names
+
+# }}}
+
+
+# {{{ P2P with list of sources and list of targets
+
+class P2P(P2PBase):
+    @memoize_method
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+
         from pymbolic import var
         exprs = [
                 var(name)
@@ -87,16 +100,16 @@ class P2P(KernelComputation):
         arguments = (
                 [
                     lp.GlobalArg("src", None,
-                        shape=(self.dim, "nsrc"), order="C"),
+                        shape=(self.dim, "nsources"), order="C"),
                     lp.GlobalArg("tgt", None,
-                        shape=(self.dim, "ntgt"), order="C"),
-                    lp.ValueArg("nsrc", None),
-                    lp.ValueArg("ntgt", np.int32),
+                        shape=(self.dim, "ntargets"), order="C"),
+                    lp.ValueArg("nsources", None),
+                    lp.ValueArg("ntargets", np.int32),
                 ]+[
-                    lp.GlobalArg("strength_%d" % i, None, shape="nsrc", order="C")
-                    for i, dtype in enumerate(self.strength_dtypes)
+                    lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C")
+                    for i in xrange(self.strength_count)
                 ]+[
-                    lp.GlobalArg("result_%d" % i, dtype, shape="ntgt", order="C")
+                    lp.GlobalArg("result_%d" % i, dtype, shape="ntargets", order="C")
                     for i, dtype in enumerate(self.value_dtypes)
                 ] + gather_source_arguments(self.kernels))
 
@@ -109,8 +122,7 @@ class P2P(KernelComputation):
                     for expr in exprs]
 
         loopy_knl = lp.make_kernel(self.device,
-                "{[isrc,itgt,idim]: 0<=itgt<ntgt and 0<=isrc<nsrc "
-                "and 0<=idim<%d}" % self.dim,
+                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources and 0<=idim<dim}",
                 self.get_kernel_scaling_assignments()
                 + loopy_insns
                 + [
@@ -119,14 +131,17 @@ class P2P(KernelComputation):
                     lp.ExpressionInstruction(id=None,
                         assignee="pair_result_%d" % i, expression=expr,
                         temp_var_type=lp.auto)
-                    for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
+                    for i, expr in enumerate(exprs)
                 ]+[
                     "result_${KNLIDX}[itgt] = knl_${KNLIDX}_scaling \
                             * sum(isrc, pair_result_${KNLIDX})"
                 ],
                 arguments,
-                name=self.name, assumptions="nsrc>=1 and ntgt>=1",
-                defines=dict(KNLIDX=range(len(exprs))))
+                name=self.name, assumptions="nsources>=1 and ntargets>=1",
+                defines=dict(
+                    KNLIDX=range(len(exprs)),
+                    dim=self.dim,
+                    ))
 
         for where in ["compute_a", "compute_b"]:
             loopy_knl = lp.duplicate_inames(loopy_knl, "idim", where)
@@ -149,3 +164,105 @@ class P2P(KernelComputation):
             kwargs["strength_%d" % i] = sstr
 
         return knl(queue, src=sources, tgt=targets, **kwargs)
+
+# }}}
+
+
+# {{{ P2P from CSR-like interaction list
+
+class P2PFromCSR(P2PBase):
+    # FIXME: exclude_self ...?
+
+    @memoize_method
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+
+        from pymbolic import var
+        exprs = [
+                var(name)
+                * var("strength_%d" % self.strength_usage[i])[var("isrc")]
+                for i, name in enumerate(result_names)]
+
+        from sumpy.tools import gather_source_arguments
+        arguments = (
+                [
+                    lp.GlobalArg("box_target_starts,box_target_counts_nonchild,"
+                        "box_source_starts,box_source_counts_nonchild",
+                        None, shape=None)
+                ]+[
+                    lp.GlobalArg("strength_%d" % i, None, shape="nsources")
+                    for i in xrange(self.strength_count)
+                ]+[
+                    lp.GlobalArg("result_%d" % i, dtype, shape="ntargets")
+                    for i, dtype in enumerate(self.value_dtypes)
+                ] + gather_source_arguments(self.kernels))
+
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
+                    "{[isrc_box]: isrc_box_start<=isrc_box<=isrc_box_end}",
+                    "{[itgt,isrc,idim]: \
+                            itgt_start<=itgt<itgt_end and \
+                            isrc_start<=isrc<isrc_end and \
+                            0<=idim<dim }",
+                    ],
+                self.get_kernel_scaling_assignments()
+                + loopy_insns + [
+                    """
+                    <> tgt_ibox = target_boxes[itgt_box]
+                    <> itgt_start = box_target_starts[itgt_box]
+                    <> itgt_end = itgt_start+box_target_counts_nonchild[itgt_box]
+
+                    <> isrc_box_start = source_box_starts[itgt_box]
+                    <> isrc_box_end = source_box_starts[itgt_box+1]
+
+                    <> src_ibox = source_box_lists[isrc_box]
+                    <> isrc_start = box_source_starts[isrc_box]
+                    <> isrc_end = isrc_start+box_source_counts_nonchild[isrc_box]
+
+                    <> d[idim] = targets[idim,itgt] - sources[idim,isrc] \
+                            {id=compute_d}
+
+                    result[${KNLIDX}, itgt] = result[${KNLIDX}, itgt] + \
+                            knl_${KNLIDX}_scaling \
+                            * sum(isrc, pair_result_${KNLIDX})
+                    """
+                ]+[
+                    lp.ExpressionInstruction(id=None,
+                        assignee="pair_result_%d" % i, expression=expr,
+                        temp_var_type=lp.auto)
+                    for i, expr in enumerate(exprs)
+                ]+[
+                ],
+                arguments,
+                name=self.name, assumptions="nsources>=1 and ntargets>=1",
+                defines=dict(
+                    KNLIDX=range(len(exprs)),
+                    dim=self.dim,
+                    ))
+
+        for where in ["compute_a", "compute_b"]:
+            loopy_knl = lp.duplicate_inames(loopy_knl, "idim", where)
+
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
+
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, targets, sources, src_strengths, **kwargs):
+        knl = self.get_optimized_kernel()
+
+        for i, sstr in enumerate(src_strengths):
+            kwargs["strength_%d" % i] = sstr
+
+        return knl(queue, src=sources, tgt=targets, **kwargs)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 8ed6382711fda7e1db472c27daac86f247c61edd..99ad35e31571e8d7e7bc49a896af3b8cffa3866c 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -66,10 +66,10 @@ def expand(expansion_nr, sac, expansion, avec, bvec):
 
 class LayerPotentialBase(KernelComputation):
     def __init__(self, ctx, expansions, strength_usage=None,
-            value_dtypes=None, strength_dtypes=None,
+            value_dtypes=None,
             options=[], name="layerpot", device=None):
         KernelComputation.__init__(self, ctx, expansions, strength_usage,
-                value_dtypes, strength_dtypes,
+                value_dtypes,
                 name, options, device)
 
         from pytools import single_valued
@@ -122,18 +122,18 @@ class LayerPotentialBase(KernelComputation):
         arguments = (
                 [
                     lp.GlobalArg("src", None,
-                        shape=(self.dim, "nsrc"), order="C"),
+                        shape=(self.dim, "nsources"), order="C"),
                     lp.GlobalArg("tgt", None,
-                        shape=(self.dim, "ntgt"), order="C"),
+                        shape=(self.dim, "ntargets"), order="C"),
                     lp.GlobalArg("center", None,
-                        shape=(self.dim, "ntgt"), order="C"),
-                    lp.ValueArg("nsrc", None),
-                    lp.ValueArg("ntgt", None),
+                        shape=(self.dim, "ntargets"), order="C"),
+                    lp.ValueArg("nsources", None),
+                    lp.ValueArg("ntargets", None),
                 ] + self.get_input_and_output_arguments()
                 + gather_source_arguments(self.kernels))
 
         loopy_knl = lp.make_kernel(self.device,
-                "{[isrc,itgt,idim]: 0<=itgt<ntgt and 0<=isrc<nsrc "
+                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
                 "and 0<=idim<%d}" % self.dim,
                 [
                 "<> a[idim] = center[idim,itgt] - src[idim,isrc] {id=compute_a}",
@@ -146,7 +146,7 @@ class LayerPotentialBase(KernelComputation):
                 ]+self.get_result_store_instructions(),
                 arguments,
                 defines=dict(KNLIDX=range(len(exprs))),
-                name=self.name, assumptions="nsrc>=1 and ntgt>=1",
+                name=self.name, assumptions="nsources>=1 and ntargets>=1",
                 )
 
         for where in ["compute_a", "compute_b"]:
@@ -193,11 +193,11 @@ class LayerPotential(LayerPotentialBase):
 
     def get_input_and_output_arguments(self):
         return [
-                lp.GlobalArg("strength_%d" % i, None, shape="nsrc", order="C")
-                for i, dtype in enumerate(self.strength_dtypes)
+                lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C")
+                for i in xrange(self.strength_count)
             ]+[
-                lp.GlobalArg("result_%d" % i, None, shape="ntgt", order="C")
-                for i, dtype in enumerate(self.value_dtypes)
+                lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C")
+                for i in xrange(len(self.kernels))
             ]
 
     def get_result_store_instructions(self):
@@ -232,7 +232,7 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 
     def get_input_and_output_arguments(self):
         return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntgt,nsrc", order="C")
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
                 for i, dtype in enumerate(self.value_dtypes)
             ]
 
@@ -361,7 +361,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def density(self):
         self.arguments[self.density_var_name] = \
                 lp.GlobalArg(self.density_var_name, self.density_dtype,
-                        shape="ntgt", order="C")
+                        shape="ntargets", order="C")
         return parse("%s[itgt]" % self.density_var_name)
 
     @property
@@ -370,14 +370,14 @@ class _JumpTermSymbolicArgumentProvider(object):
         prime_var_name = self.density_var_name+"_prime"
         self.arguments[prime_var_name] = \
                 lp.GlobalArg(prime_var_name, self.density_dtype,
-                        shape="ntgt", order="C")
+                        shape="ntargets", order="C")
         return parse("%s[itgt]" % prime_var_name)
 
     @property
     @memoize_method
     def side(self):
         self.arguments["side"] = \
-                lp.GlobalArg("side", self.geometry_dtype, shape="ntgt", order="C")
+                lp.GlobalArg("side", self.geometry_dtype, shape="ntargets")
         return parse("side[itgt]")
 
     @property
@@ -385,7 +385,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def normal(self):
         self.arguments["normal"] = \
                 lp.GlobalArg("normal", self.geometry_dtype,
-                        shape=("ntgt", self.dim), order="C")
+                        shape=("ntargets", self.dim), order="C")
         from pytools.obj_array import make_obj_array
         return make_obj_array([
             parse("normal[itgt, %d]" % i)
@@ -396,7 +396,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def tangent(self):
         self.arguments["tangent"] = \
                 lp.GlobalArg("tangent", self.geometry_dtype,
-                        shape=("ntgt", self.dim), order="C")
+                        shape=("ntargets", self.dim), order="C")
         from pytools.obj_array import make_obj_array
         return make_obj_array([
             parse("tangent[itgt, %d]" % i)
@@ -407,7 +407,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def mean_curvature(self):
         self.arguments["mean_curvature"] = \
                 lp.GlobalArg("mean_curvature",
-                        self.geometry_dtype, shape="ntgt",
+                        self.geometry_dtype, shape="ntargets",
                         order="C")
         return parse("mean_curvature[itgt]")
 
@@ -416,7 +416,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def src_derivative_dir(self):
         self.arguments["src_derivative_dir"] = \
                 lp.GlobalArg("src_derivative_dir",
-                        self.geometry_dtype, shape=("ntgt", self.dim),
+                        self.geometry_dtype, shape=("ntargets", self.dim),
                         order="C")
         from pytools.obj_array import make_obj_array
         return make_obj_array([
@@ -428,7 +428,7 @@ class _JumpTermSymbolicArgumentProvider(object):
     def tgt_derivative_dir(self):
         self.arguments["tgt_derivative_dir"] = \
                 lp.GlobalArg("tgt_derivative_dir",
-                        self.geometry_dtype, shape=("ntgt", self.dim),
+                        self.geometry_dtype, shape=("ntargets", self.dim),
                         order="C")
         from pytools.obj_array import make_obj_array
         return make_obj_array([
diff --git a/sumpy/tools.py b/sumpy/tools.py
index f45a443ef5590b71e77f947a08bbb6a34c3f3bbc..df6dbf426ae1f597aa763f74030cf86ebd125d32 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -110,16 +110,15 @@ class KernelComputation(object):
     """Common input processing for kernel computations."""
 
     def __init__(self, ctx, kernels, strength_usage,
-            value_dtypes, strength_dtypes,
-            name="kernel", options=[], device=None):
+            value_dtypes, name="kernel", options=[], device=None):
         """
         :arg kernels: list of :class:`sumpy.kernel.Kernel` instances
             :class:`sumpy.kernel.TargetDerivative` wrappers should be
             the outermost kernel wrappers, if present.
         :arg strength_usage: A list of integers indicating which expression
-          uses which density. This implicitly specifies the
-          number of density arrays that need to be passed.
-          Default: all kernels use the same density.
+            uses which density. This implicitly specifies the
+            number of density arrays that need to be passed.
+            Default: all kernels use the same density.
         """
 
         # {{{ process value_dtypes
@@ -149,21 +148,6 @@ class KernelComputation(object):
 
         # }}}
 
-        # {{{ process strength_dtypes
-
-        if strength_dtypes is None:
-            strength_dtypes = value_dtypes[0]
-
-        if not isinstance(strength_dtypes, (list, tuple)):
-            strength_dtypes = [np.dtype(strength_dtypes)] * strength_count
-
-        if len(strength_dtypes) != strength_count:
-            raise ValueError("exprs and strength_usage must have the same length")
-
-        strength_dtypes = [np.dtype(dtype) for dtype in strength_dtypes]
-
-        # }}}
-
         if device is None:
             device = ctx.devices[0]
 
@@ -173,7 +157,6 @@ class KernelComputation(object):
         self.kernels = kernels
         self.value_dtypes = value_dtypes
         self.strength_usage = strength_usage
-        self.strength_dtypes = strength_dtypes
         self.strength_count = strength_count
 
         self.name = name
diff --git a/test/test_fmm.py b/test/test_fmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..800682af99f38b21558f7a3e7747617a72c78ae6
--- /dev/null
+++ b/test/test_fmm.py
@@ -0,0 +1,115 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def no_test_sumpy_fmm(ctx_getter):
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_getter()
+    queue = cl.CommandQueue(ctx)
+
+    nsources = 3000
+    ntargets = 1000
+    dim = 2
+    dtype = np.float64
+
+    from boxtree.tools import (
+            make_normal_particle_array as p_normal)
+
+    sources = p_normal(queue, nsources, dim, dtype, seed=15)
+    targets = (
+            p_normal(queue, ntargets, dim, dtype, seed=18)
+            + np.array([2, 0]))
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(ctx)
+
+    tree, _ = tb(queue, sources, targets=targets,
+            max_particles_in_box=30, debug=True)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(ctx)
+    trav, _ = tbuild(queue, tree, debug=True)
+
+    trav = trav.get()
+
+    from pyopencl.clrandom import RanluxGenerator
+    rng = RanluxGenerator(queue, seed=20)
+
+    weights = rng.uniform(queue, nsources, dtype=np.float64)
+    #weights = np.ones(nsources)
+
+    logger.info("computing direct (reference) result")
+
+    from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
+    from sumpy.expansion.local import VolumeTaylorLocalExpansion
+    from sumpy.kernel import LaplaceKernel
+
+    order = 3
+    knl = LaplaceKernel(dim)
+    mpole_expn = VolumeTaylorMultipoleExpansion(knl, order)
+    local_expn = VolumeTaylorLocalExpansion(knl, order)
+    out_kernels = [knl]
+
+    from sumpy.fmm import SumpyExpansionWranglerCodeContainer
+    wcc = SumpyExpansionWranglerCodeContainer(
+            ctx, tree, mpole_expn, local_expn, out_kernels)
+    wrangler = wcc.get_wrangler(queue, np.float64)
+
+    from boxtree.fmm import drive_fmm
+    pot = drive_fmm(trav, wrangler, weights)
+
+    from sumpy import P2P
+    p2p = P2P(ctx, out_kernels, exclude_self=False)
+    ref_pot = p2p(queue, sources, targets)
+
+    pot = pot.get()
+    ref_pot = ref_pot.get()
+
+    rel_err = la.norm(pot - ref_pot) / la.norm(ref_pot)
+    logger.info("relative l2 error: %g" % rel_err)
+    assert rel_err < 1e-5
+
+
+# You can test individual routines by typing
+# $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context)'
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: fdm=marker
diff --git a/test/test_kernels.py b/test/test_kernels.py
index d40911446bd570cc3bb310748dbfa6c126b24190..6ab925436ba0374e77c1b9cc95aff3c8d34281e7 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 import numpy as np
 import numpy.linalg as la
 import sys
-import pytools.test
 
 import pytest
 import pyopencl as cl
@@ -48,7 +47,6 @@ else:
     faulthandler.enable()
 
 
-@pytest.mark.opencl
 def test_p2p(ctx_getter):
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
@@ -88,7 +86,6 @@ def test_p2p(ctx_getter):
     assert rel_err < 1e-3
 
 
-@pytools.test.mark_test.opencl
 @pytest.mark.parametrize("order", [2, 3, 4, 5])
 @pytest.mark.parametrize(("knl", "expn_class"), [
     (LaplaceKernel(2), VolumeTaylorLocalExpansion),
@@ -129,9 +126,9 @@ def test_p2e2p(ctx_getter, knl, expn_class, order, with_source_derivative):
             ]
     expn = expn_class(knl, order=order)
 
-    from sumpy import P2E, E2P, P2P
-    p2e = P2E(ctx, expn, out_kernels)
-    e2p = E2P(ctx, expn, out_kernels)
+    from sumpy import P2EFromLocal, E2PFromLocal, P2P
+    p2e = P2EFromLocal(ctx, expn, out_kernels)
+    e2p = E2PFromLocal(ctx, expn, out_kernels)
     p2p = P2P(ctx, out_kernels, exclude_self=False)
 
     from pytools.convergence import EOCRecorder
@@ -185,6 +182,8 @@ def test_p2e2p(ctx_getter, knl, expn_class, order, with_source_derivative):
                 centers=centers,
                 sources=sources,
                 strengths=strengths,
+                nboxes=1,
+
                 #flags="print_hl_cl",
                 out_host=True, **extra_source_kwargs)
 
@@ -325,7 +324,6 @@ def test_p_convergence_verifier():
         pconv_verifier()
 
 
-@pytools.test.mark_test.opencl
 @pytest.mark.parametrize("knl", [
     LaplaceKernel(2),
     HelmholtzKernel(2)
@@ -385,17 +383,18 @@ def test_translations(ctx_getter, knl):
         orders = [2, 3]
     else:
         orders = [2, 3, 4, 5]
+    nboxes = centers.shape[-1]
 
     for order in orders:
         m_expn = VolumeTaylorMultipoleExpansion(knl, order=order)
         l_expn = VolumeTaylorLocalExpansion(knl, order=order)
 
-        from sumpy import P2E, E2P, P2P, E2E
-        p2m = P2E(ctx, m_expn)
-        m2m = E2E(ctx, m_expn, m_expn)
-        m2l = E2E(ctx, m_expn, l_expn)
-        l2l = E2E(ctx, l_expn, l_expn)
-        l2p = E2P(ctx, l_expn, out_kernels)
+        from sumpy import P2EFromLocal, E2PFromLocal, P2P, E2EFromCSR
+        p2m = P2EFromLocal(ctx, m_expn)
+        m2m = E2EFromCSR(ctx, m_expn, m_expn)
+        m2l = E2EFromCSR(ctx, m_expn, l_expn)
+        l2l = E2EFromCSR(ctx, l_expn, l_expn)
+        l2p = E2PFromLocal(ctx, l_expn, out_kernels)
         p2p = P2P(ctx, out_kernels, exclude_self=False)
 
         fp = FieldPlotter(centers[:, -1], extent=0.3, npoints=res)
@@ -414,6 +413,8 @@ def test_translations(ctx_getter, knl):
                 centers=centers,
                 sources=sources,
                 strengths=strengths,
+                nboxes=nboxes,
+
                 #flags="print_hl_wrapper",
                 out_host=True, **extra_kwargs)