diff --git a/sumpy/p2e.py b/sumpy/p2e.py
index b1214603d3a84f3c67ca274d798cd943b28a539b..8105ac5388f7f6afb6b6ef9c92a4ba2afc1c1bc3 100644
--- a/sumpy/p2e.py
+++ b/sumpy/p2e.py
@@ -32,29 +32,35 @@ logger = logging.getLogger(__name__)
 
 __doc__ = """
 
-Particle-to-expansion
+Particle-to-Expansion
 ---------------------
 
 .. autoclass:: P2EBase
 .. autoclass:: P2EFromSingleBox
 .. autoclass:: P2EFromCSR
-
 """
 
 
 # {{{ P2E base class
 
 class P2EBase(KernelComputation, KernelCacheWrapper):
-    """Common input processing for kernel computations."""
+    """Common input processing for kernel computations.
+
+    .. automethod:: __init__
+    """
 
     def __init__(self, ctx, expansion, kernels=None,
             name=None, device=None, strength_usage=None):
         """
-        :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase`
-        :arg strength_usage: A list of integers indicating which expression
-          uses which source strength indicator. This implicitly specifies the
-          number of strength arrays that need to be passed.
-          Default: all kernels use the same strength.
+        :arg expansion: a subclass of :class:`sumpy.expansion.ExpansionBase`
+        :arg kernels: if not provided, the kernel of the *expansion* is used.
+            The base kernel (after source or target transformation
+            removal) of each kernel in the list should match the base kernel of
+            the expansion.
+        :arg strength_usage: a list of integers indicating which expression
+            uses which source strength indicator. This implicitly specifies the
+            number of strength arrays that need to be passed in.
+            By default all kernels use the same strength.
         """
         from sumpy.kernel import (TargetTransformationRemover,
                 SourceTransformationRemover)
@@ -117,59 +123,88 @@ class P2EBase(KernelComputation, KernelCacheWrapper):
         return (type(self).__name__, self.name, self.expansion,
                 tuple(self.source_kernels), tuple(self.strength_usage))
 
+    def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array):
+        knl = self.get_kernel()
+
+        if sources_is_obj_array:
+            knl = lp.tag_array_axes(knl, "sources", "sep,C")
+        if centers_is_obj_array:
+            knl = lp.tag_array_axes(knl, "centers", "sep,C")
+
+        knl = self._allow_redundant_execution_of_knl_scaling(knl)
+        return knl
+
+    def __call__(self, queue, **kwargs):
+        from sumpy.tools import is_obj_array_like
+        sources = kwargs.pop("sources")
+        centers = kwargs.pop("centers")
+        knl = self.get_cached_optimized_kernel(
+                sources_is_obj_array=is_obj_array_like(sources),
+                centers_is_obj_array=is_obj_array_like(centers))
+
+        # "1" may be passed for rscale, which won't have its type
+        # meaningfully inferred. Make the type of rscale explicit.
+        rscale = centers.dtype.type(kwargs.pop("rscale"))
+
+        return knl(queue, sources=sources, centers=centers, rscale=rscale, **kwargs)
 
 # }}}
 
+
 # {{{ P2E from single box (P2M, likely)
 
 class P2EFromSingleBox(P2EBase):
+    """
+    .. automethod:: __call__
+    """
+
     default_name = "p2e_from_single_box"
 
     def get_kernel(self):
         ncoeffs = len(self.expansion)
 
         from sumpy.tools import gather_loopy_source_arguments
-        loopy_knl = lp.make_kernel(
-                [
-                    "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
-                    "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
-                    ],
-                ["""
+        loopy_knl = lp.make_kernel([
+                "{[isrc_box]: 0 <= isrc_box < nsrc_boxes}",
+                "{[isrc, idim]: isrc_start <= isrc < isrc_end and 0 <= idim < dim}",
+                ], ["""
                 for isrc_box
                     <> src_ibox = source_boxes[isrc_box]
                     <> isrc_start = box_source_starts[src_ibox]
-                    <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
+                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
 
                     <> center[idim] = centers[idim, src_ibox] {id=fetch_center}
 
                     for isrc
                         <> a[idim] = center[idim] - sources[idim, isrc] {dup=idim}
-                        """] + self.get_loopy_instructions() + ["""
+                        """] + [
+                        f"<> strength_{i} = strengths[{i}, isrc]"
+                        for i in set(self.strength_usage)
+                        ] + self.get_loopy_instructions() + ["""
                     end
-                    """] + [f"<> strength_{i} = strengths[{i}, isrc]" for
-                        i in set(self.strength_usage)] + [f"""
-                    tgt_expansions[src_ibox-tgt_base_ibox, {coeffidx}] = \
+                    """] + [f"""
+                    tgt_expansions[src_ibox - tgt_base_ibox, {coeffidx}] = \
                         simul_reduce(sum, isrc, coeff{coeffidx}) \
                             {{id_prefix=write_expn}}
                     """ for coeffidx in range(ncoeffs)] + ["""
                 end
                 """],
                 [
-                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources"),
-                        dim_tags="sep,c"),
+                    lp.GlobalArg("sources", None,
+                        shape=(self.dim, "nsources"), order="C"),
                     lp.GlobalArg("strengths", None,
-                        shape="strength_count, nsources", dim_tags="sep,C"),
-                    lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
+                        shape=("strength_count", "nsources"), dim_tags="sep,C"),
+                    lp.GlobalArg("box_source_starts, box_source_counts_nonchild",
                         None, shape=None),
                     lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                     lp.ValueArg("rscale", None),
                     lp.GlobalArg("tgt_expansions", None,
                         shape=("nboxes", ncoeffs), offset=lp.auto),
-                    lp.ValueArg("nboxes,aligned_nboxes,tgt_base_ibox", np.int32),
+                    lp.ValueArg("nboxes, aligned_nboxes, tgt_base_ibox", np.int32),
                     lp.ValueArg("nsources", np.int32),
-                    "..."
-                ] + gather_loopy_source_arguments(self.source_kernels
-                        + (self.expansion,)),
+                    ...
+                ] + gather_loopy_source_arguments(
+                    self.source_kernels + (self.expansion,)),
                 name=self.name,
                 assumptions="nsrc_boxes>=1",
                 silenced_warnings="write_race(write_expn*)",
@@ -184,33 +219,36 @@ class P2EFromSingleBox(P2EBase):
 
         return loopy_knl
 
-    def get_optimized_kernel(self):
+    def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array):
+        knl = super().get_optimized_kernel(
+                sources_is_obj_array=sources_is_obj_array,
+                centers_is_obj_array=centers_is_obj_array)
+
         # FIXME
-        knl = self.get_kernel()
         knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0")
-        knl = self._allow_redundant_execution_of_knl_scaling(knl)
-
         return knl
 
     def __call__(self, queue, **kwargs):
         """
-        :arg expansions:
-        :arg source_boxes:
-        :arg box_source_starts:
-        :arg box_source_counts_nonchild:
-        :arg centers:
-        :arg sources:
-        :arg strengths:
-        :arg rscale:
+        :arg source_boxes: an array of integer indices into *box_source_starts*
+            and *box_source_counts_nonchild*.
+        :arg box_source_starts: an array of integer indices into *sources*.
+        :arg box_source_counts_nonchild: an array of integer sizes of each box.
+        :arg centers: expansion centers.
+        :arg sources: source points.
+        :arg strengths: strengths at each source point. the strength count
+            is given by the *strength_usage* list passed in to
+            :meth:`P2EBase.__init__`.
+        :arg nboxes: number of boxes.
+        :arg tgt_base_ibox: integer for the base index of the target box.
+        :arg rscale: expansion scale.
+        :arg tgt_expansions: if given as an input, the array will be filled
+            in with the expansions at boxes indexed by
+            ``source_boxes[i] - tgt_base_ibox``.
+
+        :returns: an array of *tgt_expansions*.
         """
-        centers = kwargs.pop("centers")
-        # "1" may be passed for rscale, which won't have its type
-        # meaningfully inferred. Make the type of rscale explicit.
-        rscale = centers.dtype.type(kwargs.pop("rscale"))
-
-        knl = self.get_cached_optimized_kernel()
-
-        return knl(queue, centers=centers, rscale=rscale, **kwargs)
+        return super().__call__(queue, **kwargs)
 
 # }}}
 
@@ -218,6 +256,10 @@ class P2EFromSingleBox(P2EBase):
 # {{{ P2E from CSR-like interaction list
 
 class P2EFromCSR(P2EBase):
+    """
+    .. automethod:: __call__
+    """
+
     default_name = "p2e_from_csr"
 
     def get_kernel(self):
@@ -226,10 +268,10 @@ class P2EFromCSR(P2EBase):
         from sumpy.tools import gather_loopy_source_arguments
         arguments = (
                 [
-                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources"),
-                        dim_tags="sep,c"),
-                    lp.GlobalArg("strengths", None, shape="strength_count, nsources",
-                        dim_tags="sep,C"),
+                    lp.GlobalArg("sources", None,
+                        shape=(self.dim, "nsources"), order="C"),
+                    lp.GlobalArg("strengths", None,
+                        shape=("strength_count", "nsources"), dim_tags="sep,C"),
                     lp.GlobalArg("source_box_starts,source_box_lists",
                         None, shape=None, offset=lp.auto),
                     lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
@@ -240,37 +282,39 @@ class P2EFromCSR(P2EBase):
                     lp.ValueArg("naligned_boxes,ntgt_level_boxes,tgt_base_ibox",
                         np.int32),
                     lp.ValueArg("nsources", np.int32),
-                    "..."
-                ] + gather_loopy_source_arguments(self.source_kernels
-                        + (self.expansion,)))
+                    ...
+                ] + gather_loopy_source_arguments(
+                    self.source_kernels + (self.expansion,)))
 
         loopy_knl = lp.make_kernel(
                 [
-                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
-                    "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_stop}",
-                    "{[isrc]: isrc_start<=isrc<isrc_end}",
-                    "{[idim]: 0<=idim<dim}",
+                    "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
+                    "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_stop}",
+                    "{[isrc]: isrc_start <= isrc < isrc_end}",
+                    "{[idim]: 0 <= idim < dim}",
                     ],
                 ["""
                 for itgt_box
                     <> tgt_ibox = target_boxes[itgt_box]
-
                     <> center[idim] = centers[idim, tgt_ibox] {id=fetch_center}
 
                     <> isrc_box_start = source_box_starts[itgt_box]
-                    <> isrc_box_stop = source_box_starts[itgt_box+1]
+                    <> isrc_box_stop = source_box_starts[itgt_box + 1]
 
                     for isrc_box
                         <> src_ibox = source_box_lists[isrc_box]
                         <> isrc_start = box_source_starts[src_ibox]
-                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
+                        <> isrc_end = isrc_start \
+                                + box_source_counts_nonchild[src_ibox]
 
                         for isrc
                             <> a[idim] = center[idim] - sources[idim, isrc] \
                                     {dup=idim}
-                    """] + [f"""
+                    """] + [
+                        f"""
                              <> strength_{i} = strengths[{i}, isrc]
-                             """ for i in set(self.strength_usage)] + self.get_loopy_instructions() + ["""
+                             """ for i in set(self.strength_usage)
+                             ] + self.get_loopy_instructions() + ["""
                         end
                     end"""] + [f"""
                     tgt_expansions[tgt_ibox - tgt_base_ibox, {coeffidx}] = \
@@ -294,33 +338,35 @@ class P2EFromCSR(P2EBase):
 
         return loopy_knl
 
-    def get_optimized_kernel(self):
+    def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array):
+        knl = super().get_optimized_kernel(
+                sources_is_obj_array=sources_is_obj_array,
+                centers_is_obj_array=centers_is_obj_array)
+
         # FIXME
-        knl = self.get_kernel()
         knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
-        knl = self._allow_redundant_execution_of_knl_scaling(knl)
-
         return knl
 
     def __call__(self, queue, **kwargs):
         """
-        :arg expansions:
-        :arg source_boxes:
-        :arg box_source_starts:
-        :arg box_source_counts_nonchild:
-        :arg centers:
-        :arg sources:
-        :arg strengths:
-        :arg rscale:
+        :arg target_boxes: array of integer indices into *source_box_starts*
+            and *centers*.
+        :arg source_box_starts: array of integer indices into *source_box_lists*,
+            i.e. `source_box_starts[i]:source_box_starts[i + 1]` gives all the
+            source boxes for a given target box ``i = target_boxes[itgt_box]``.
+        :arg source_box_lists: an array of integer indices into *box_source_starts*
+            and *box_source_counts_nonchild*.
+
+        :arg box_source_starts: see :meth:`P2EFromSingleBox.__call__`.
+        :arg box_source_counts_nonchild: see :meth:`P2EFromSingleBox.__call__`.
+        :arg centers: see :meth:`P2EFromSingleBox.__call__`.
+        :arg sources: see :meth:`P2EFromSingleBox.__call__`.
+        :arg strengths: see :meth:`P2EFromSingleBox.__call__`.
+        :arg rscale: see :meth:`P2EFromSingleBox.__call__`.
+        :arg tgt_base_ibox: see :meth:`P2EFromSingleBox.__call__`.
+        :arg tgt_expansion: see :meth:`P2EFromSingleBox.__call__`.
         """
-        knl = self.get_cached_optimized_kernel()
-
-        centers = kwargs.pop("centers")
-        # "1" may be passed for rscale, which won't have its type
-        # meaningfully inferred. Make the type of rscale explicit.
-        rscale = centers.dtype.type(kwargs.pop("rscale"))
-
-        return knl(queue, centers=centers, rscale=rscale, **kwargs)
+        return super().__call__(queue, **kwargs)
 
 # }}}
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 6fd929e5567ac31c00e2ec9eede330fbad53b67e..a6bc52d5aefbfca85ac6e6aeaa2d21683b192c6c 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -37,31 +37,27 @@ from pyopencl.tools import (  # noqa
 import logging
 logger = logging.getLogger(__name__)
 
-try:
-    import faulthandler
-except ImportError:
-    pass
-else:
-    faulthandler.enable()
+import faulthandler
+faulthandler.enable()
 
 
-def _build_geometry(queue, n, mode, target_radius=1.0):
-    # parametrize circle
-    t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
-    unit_circle = np.exp(1j * t)
-    unit_circle = np.array([unit_circle.real, unit_circle.imag])
+def _build_geometry(queue, ntargets, nsources, mode, target_radius=1.0):
+    # source points
+    t = np.linspace(0.0, 2.0 * np.pi, nsources, endpoint=False)
+    sources = np.array([np.cos(t), np.sin(t)])
 
-    # create density
+    # density
     sigma = np.cos(mode * t)
 
-    # create sources and targets
-    h = 2.0 * np.pi / n
-    targets = target_radius * unit_circle
-    sources = unit_circle
+    # target points
+    t = np.linspace(0.0, 2.0 * np.pi, ntargets, endpoint=False)
+    targets = target_radius * np.array([np.cos(t), np.sin(t)])
 
+    # target centers and expansion radii
+    h = 2.0 * np.pi * target_radius / ntargets
     radius = 7.0 * h
-    centers = unit_circle * (1.0 - radius)
-    expansion_radii = radius * np.ones(n)
+    centers = (1.0 - radius) * targets
+    expansion_radii = np.full(ntargets, radius)
 
     return (cl.array.to_device(queue, targets),
             cl.array.to_device(queue, sources),
@@ -84,9 +80,10 @@ def _build_subset_indices(queue, ntargets, nsources, factor):
         rng.shuffle(tgtindices)
         rng.shuffle(srcindices)
 
+    tgtindices, srcindices = np.meshgrid(tgtindices, srcindices)
     return (
-            cl.array.to_device(queue, tgtindices).with_queue(None),
-            cl.array.to_device(queue, srcindices).with_queue(None))
+            cl.array.to_device(queue, tgtindices.ravel()).with_queue(None),
+            cl.array.to_device(queue, srcindices.ravel()).with_queue(None))
 
 
 @pytest.mark.parametrize("factor", [1.0, 0.6])
@@ -128,7 +125,7 @@ def test_qbx_direct(ctx_factory, factor, lpot_id):
 
     for n in [200, 300, 400]:
         targets, sources, centers, expansion_radii, sigma = \
-                _build_geometry(queue, n, mode_nr, target_radius=1.2)
+                _build_geometry(queue, n, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
@@ -206,7 +203,7 @@ def test_p2p_direct(ctx_factory, exclude_self, factor, lpot_id):
 
     for n in [200, 300, 400]:
         targets, sources, _, _, sigma = \
-            _build_geometry(queue, n, mode_nr, target_radius=1.2)
+            _build_geometry(queue, n, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)