Compare revisions

sumpy/p2p.py
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = """
Copyright (C) 2012 Andreas Kloeckner
@@ -25,15 +26,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import six
from six.moves import range
import numpy as np
import loopy as lp
from loopy.version import MOST_RECENT_LANGUAGE_VERSION
from pymbolic import var
from sumpy.tools import KernelComputation, KernelCacheWrapper
from sumpy.codegen import register_optimization_preambles
from sumpy.tools import KernelCacheMixin, KernelComputation, is_obj_array_like
__doc__ = """
@@ -44,7 +43,7 @@ Particle-to-particle
.. autoclass:: P2PBase
.. autoclass:: P2P
.. autoclass:: P2PMatrixGenerator
.. autoclass:: P2PMatrixBlockGenerator
.. autoclass:: P2PMatrixSubsetGenerator
.. autoclass:: P2PFromCSR
"""
@@ -55,78 +54,118 @@ Particle-to-particle
# {{{ p2p base class
class P2PBase(KernelComputation, KernelCacheWrapper):
def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
value_dtypes=None,
options=[], name=None, device=None):
class P2PBase(KernelCacheMixin, KernelComputation):
def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None,
value_dtypes=None, name=None, device=None, source_kernels=None):
"""
:arg kernels: list of :class:`sumpy.kernel.Kernel` instances
:arg target_kernels: list of :class:`sumpy.kernel.Kernel` instances
with only target derivatives.
:arg source_kernels: list of :class:`sumpy.kernel.Kernel` instances
with only source derivatives.
:arg strength_usage: A list of integers indicating which expression
uses which source strength indicator. This implicitly specifies the
number of strength arrays that need to be passed.
Default: all kernels use the same strength.
"""
KernelComputation.__init__(self, ctx, kernels, strength_usage,
value_dtypes,
name, options, device)
from pytools import single_valued
from sumpy.kernel import (
SourceTransformationRemover,
TargetTransformationRemover,
)
txr = TargetTransformationRemover()
sxr = SourceTransformationRemover()
if source_kernels is None:
source_kernels = [single_valued(txr(knl) for knl in target_kernels)]
target_kernels = [sxr(knl) for knl in target_kernels]
else:
for knl in source_kernels:
assert txr(knl) == knl
for knl in target_kernels:
assert sxr(knl) == knl
base_source_kernel = single_valued(sxr(knl) for knl in source_kernels)
base_target_kernel = single_valued(txr(knl) for knl in target_kernels)
assert base_source_kernel == base_target_kernel
KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels,
source_kernels=source_kernels, strength_usage=strength_usage,
value_dtypes=value_dtypes, name=name, device=device)
import pyopencl as cl
self.is_gpu = not (self.device.type & cl.device_type.CPU)
self.exclude_self = exclude_self
from pytools import single_valued
self.dim = single_valued(knl.dim for knl in self.kernels)
self.dim = single_valued(knl.dim for knl in
list(self.target_kernels) + list(self.source_kernels))
def get_cache_key(self):
return (type(self).__name__, tuple(self.kernels), self.exclude_self,
tuple(self.strength_usage), tuple(self.value_dtypes))
return (type(self).__name__, tuple(self.target_kernels), self.exclude_self,
tuple(self.strength_usage), tuple(self.value_dtypes),
tuple(self.source_kernels),
self.device.hashable_model_and_version_identifier)
def get_loopy_insns_and_result_names(self):
from pymbolic import var
from sumpy.symbolic import make_sym_vector
dvec = make_sym_vector("d", self.dim)
from sumpy.assignment_collection import SymbolicAssignmentCollection
sac = SymbolicAssignmentCollection()
isrc_sym = var("isrc")
exprs = []
for out_knl in self.target_kernels:
expr_sum = 0
for j, in_knl in enumerate(self.source_kernels):
expr = in_knl.postprocess_at_source(
in_knl.get_expression(dvec),
dvec)
expr *= self.get_strength_or_not(isrc_sym, j)
expr_sum += expr
expr_sum = out_knl.postprocess_at_target(expr_sum, dvec)
exprs.append(expr_sum)
result_name_prefix = "pair_result_tmp" if self.exclude_self else "pair_result"
result_names = [
sac.assign_unique("knl%d" % i,
knl.postprocess_at_target(
knl.postprocess_at_source(
knl.get_expression(dvec),
dvec),
dvec)
)
for i, knl in enumerate(self.kernels)]
sac.add_assignment(f"{result_name_prefix}_{i}", expr)
for i, expr in enumerate(exprs)
]
sac.run_global_cse()
from sumpy.codegen import to_loopy_insns
loopy_insns = to_loopy_insns(six.iteritems(sac.assignments),
vector_names=set(["d"]),
pymbolic_expr_maps=[
knl.get_code_transformer() for knl in self.kernels],
loopy_insns = to_loopy_insns(sac.assignments.items(),
vector_names={"d"},
pymbolic_expr_maps=(
[knl.get_code_transformer() for knl in self.source_kernels]
+ [knl.get_code_transformer() for knl in self.target_kernels]),
retain_names=result_names,
complex_dtype=np.complex128 # FIXME
)
return loopy_insns, result_names
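# Sketch of the generated instructions (hypothetical setup): with
# source_kernels = [K, dK/dx_src], strength_usage = [0, 1] and one target
# kernel T, the retained result is
#     pair_result_0 = T.postprocess_at_target(
#         strength_0*K(d) + strength_1*dK/dx_src(d), d)
# where d is the target-source vector assembled by the caller.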
def get_strength_or_not(self, isrc, kernel_idx):
return var("strength").index((self.strength_usage[kernel_idx], isrc))
def get_kernel_exprs(self, result_names):
from pymbolic import var
isrc_sym = var("isrc")
exprs = [var(name) * self.get_strength_or_not(isrc_sym, i)
for i, name in enumerate(result_names)]
if self.exclude_self:
assignments = [lp.Assignment(id=None,
assignee=f"pair_result_{i}", expression=var(name),
temp_var_type=lp.Optional(None))
for i, name in enumerate(result_names)]
from pymbolic.primitives import If, Variable
exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
assignments = [assign.copy(expression=If(Variable("is_self"), 0,
assign.expression)) for assign in assignments]
else:
assignments = []
return [lp.Assignment(id=None,
assignee="pair_result_%d" % i, expression=expr,
temp_var_type=lp.Optional(None))
for i, expr in enumerate(exprs)]
return assignments + loopy_insns, result_names
def get_strength_or_not(self, isrc, kernel_idx):
from sumpy.symbolic import Symbol
return Symbol(f"strength_{self.strength_usage[kernel_idx]}")
def get_default_src_tgt_arguments(self):
from sumpy.tools import gather_loopy_source_arguments
@@ -139,10 +178,7 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
lp.ValueArg("ntargets", None)]
+ ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))]
if self.exclude_self else [])
+ gather_loopy_source_arguments(self.kernels))
def get_kernel(self):
raise NotImplementedError
+ gather_loopy_source_arguments(self.source_kernels))
def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
# FIXME
@@ -154,6 +190,12 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
knl = lp.tag_array_axes(knl, "targets", "sep,C")
knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
knl = self._allow_redundant_execution_of_knl_scaling(knl)
knl = lp.set_options(knl,
enforce_variable_access_ordered="no_check")
knl = register_optimization_preambles(knl, self.device)
return knl
@@ -165,19 +207,19 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
class P2P(P2PBase):
"""Direct applier for P2P interactions."""
default_name = "p2p_apply"
@property
def default_name(self):
return "p2p_apply"
def get_kernel(self):
loopy_insns, result_names = self.get_loopy_insns_and_result_names()
kernel_exprs = self.get_kernel_exprs(result_names)
arguments = (
self.get_default_src_tgt_arguments()
+ [
loopy_insns, _result_names = self.get_loopy_insns_and_result_names()
arguments = [
*self.get_default_src_tgt_arguments(),
lp.GlobalArg("strength", None,
shape="nstrengths, nsources", dim_tags="sep,C"),
lp.GlobalArg("result", None,
shape="nresults, ntargets", dim_tags="sep,C")
])
]
loopy_knl = lp.make_kernel(["""
{[itgt, isrc, idim]: \
@@ -190,36 +232,35 @@ class P2P(P2PBase):
+ ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+ ["<> is_self = (isrc == target_to_source[itgt])"
if self.exclude_self else ""]
+ loopy_insns + kernel_exprs
+ ["""
result[{i}, itgt] = knl_{i}_scaling * \
simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}}
""".format(i=iknl)
for iknl in range(len(self.kernels))]
+ [f"<> strength_{i} = strength[{i}, isrc]" for
i in set(self.strength_usage)]
+ loopy_insns
+ [f"""
result[{iknl}, itgt] = knl_{iknl}_scaling * \
simul_reduce(sum, isrc, pair_result_{iknl}) {{inames=itgt}}
""" for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
assumptions="nsources>=1 and ntargets>=1",
name=self.name,
fixed_parameters=dict(
dim=self.dim,
nstrengths=self.strength_count,
nresults=len(self.kernels)),
default_offset=lp.auto,
fixed_parameters={
"dim": self.dim,
"nstrengths": self.strength_count,
"nresults": len(self.target_kernels)},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
for knl in self.kernels:
for knl in self.target_kernels + self.source_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
def __call__(self, queue, targets, sources, strength, **kwargs):
from pytools.obj_array import is_obj_array
knl = self.get_cached_optimized_kernel(
targets_is_obj_array=(
is_obj_array(targets) or isinstance(targets, (tuple, list))),
sources_is_obj_array=(
is_obj_array(sources) or isinstance(sources, (tuple, list))))
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources))
return knl(queue, sources=sources, targets=targets, strength=strength,
**kwargs)
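# Illustrative usage sketch (not from this diff; shapes follow the loopy
# arguments above, LaplaceKernel is sumpy's standard Laplace kernel):
import numpy as np
import pyopencl as cl
from sumpy.kernel import LaplaceKernel
from sumpy.p2p import P2P

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
sources = np.random.rand(2, 500)  # (dim, nsources)
targets = np.random.rand(2, 300)  # (dim, ntargets)
strength = np.random.rand(500)    # one array per strength slot

p2p = P2P(ctx, [LaplaceKernel(2)], exclude_self=False)
evt, (pot,) = p2p(queue, targets, sources, [strength])  # pot: (ntargets,)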
@@ -232,19 +273,21 @@ class P2P(P2PBase):
class P2PMatrixGenerator(P2PBase):
"""Generator for P2P interaction matrix entries."""
default_name = "p2p_matrix"
@property
def default_name(self):
return "p2p_matrix"
def get_strength_or_not(self, isrc, kernel_idx):
return 1
def get_kernel(self):
loopy_insns, result_names = self.get_loopy_insns_and_result_names()
kernel_exprs = self.get_kernel_exprs(result_names)
loopy_insns, _result_names = self.get_loopy_insns_and_result_names()
arguments = (
self.get_default_src_tgt_arguments()
+ [lp.GlobalArg("result_%d" % i, dtype,
shape="ntargets,nsources")
for i, dtype in enumerate(self.value_dtypes)])
+ [
lp.GlobalArg(f"result_{i}", dtype, shape="ntargets,nsources")
for i, dtype in enumerate(self.value_dtypes)
])
loopy_knl = lp.make_kernel(["""
{[itgt, isrc, idim]: \
@@ -257,55 +300,55 @@ class P2PMatrixGenerator(P2PBase):
+ ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+ ["<> is_self = (isrc == target_to_source[itgt])"
if self.exclude_self else ""]
+ loopy_insns + kernel_exprs
+ ["""
result_{i}[itgt, isrc] = \
knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}}
""".format(i=iknl)
for iknl in range(len(self.kernels))]
+ loopy_insns
+ [f"""
result_{iknl}[itgt, isrc] = knl_{iknl}_scaling \
* pair_result_{iknl} {{inames=isrc:itgt}}
""" for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
assumptions="nsources>=1 and ntargets>=1",
name=self.name,
fixed_parameters=dict(dim=self.dim),
fixed_parameters={"dim": self.dim},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
for knl in self.kernels:
for knl in self.target_kernels + self.source_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
def __call__(self, queue, targets, sources, **kwargs):
from pytools.obj_array import is_obj_array
knl = self.get_cached_optimized_kernel(
targets_is_obj_array=(
is_obj_array(targets) or isinstance(targets, (tuple, list))),
sources_is_obj_array=(
is_obj_array(sources) or isinstance(sources, (tuple, list))))
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources))
return knl(queue, sources=sources, targets=targets, **kwargs)
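# Sketch: the dense generator fills one (ntargets, nsources) matrix per
# target kernel; applying it to a strength vector reproduces the P2P apply:
#     gen = P2PMatrixGenerator(ctx, [LaplaceKernel(2)], exclude_self=False)
#     evt, (mat,) = gen(queue, targets, sources)
#     pot = mat @ strength  # matches P2P for strength_usage=[0]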
# }}}
# {{{ P2P matrix block writer
# {{{ P2P matrix subset generator
class P2PMatrixBlockGenerator(P2PBase):
class P2PMatrixSubsetGenerator(P2PBase):
"""Generator for a subset of P2P interaction matrix entries.
This generator evaluates a generic set of entries in the matrix. See
:class:`P2PFromCSR` for when a compressed row storage format is available.
.. automethod:: __call__
"""
default_name = "p2p_block"
@property
def default_name(self):
return "p2p_subset"
def get_strength_or_not(self, isrc, kernel_idx):
return 1
def get_kernel(self):
loopy_insns, result_names = self.get_loopy_insns_and_result_names()
kernel_exprs = self.get_kernel_exprs(result_names)
loopy_insns, _result_names = self.get_loopy_insns_and_result_names()
arguments = (
self.get_default_src_tgt_arguments()
+ [
@@ -313,8 +356,10 @@ class P2PMatrixBlockGenerator(P2PBase):
lp.GlobalArg("tgtindices", None, shape="nresult"),
lp.ValueArg("nresult", None)
]
+ [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
for i, dtype in enumerate(self.value_dtypes)])
+ [
lp.GlobalArg(f"result_{i}", dtype, shape="nresult")
for i, dtype in enumerate(self.value_dtypes)
])
loopy_knl = lp.make_kernel(
"{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
@@ -332,26 +377,25 @@ class P2PMatrixBlockGenerator(P2PBase):
+ ["""
<> is_self = (isrc == target_to_source[itgt])
""" if self.exclude_self else ""]
+ loopy_insns + kernel_exprs
+ ["""
result_{i}[imat] = \
knl_{i}_scaling * pair_result_{i} \
+ loopy_insns
+ [f"""
result_{iknl}[imat] = \
knl_{iknl}_scaling * pair_result_{iknl} \
{{id_prefix=write_p2p}}
""".format(i=iknl)
for iknl in range(len(self.kernels))]
""" for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
assumptions="nresult>=1",
silenced_warnings="write_race(write_p2p*)",
name=self.name,
fixed_parameters=dict(dim=self.dim),
fixed_parameters={"dim": self.dim},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
loopy_knl = lp.add_dtypes(loopy_knl,
dict(nsources=np.int32, ntargets=np.int32))
loopy_knl = lp.add_dtypes(
loopy_knl, {"nsources": np.int32, "ntargets": np.int32})
for knl in self.kernels:
for knl in self.target_kernels + self.source_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
@@ -366,42 +410,38 @@ class P2PMatrixBlockGenerator(P2PBase):
knl = lp.tag_array_axes(knl, "targets", "sep,C")
knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
return knl
knl = self._allow_redundant_execution_of_knl_scaling(knl)
knl = lp.set_options(knl,
enforce_variable_access_ordered="no_check")
knl = register_optimization_preambles(knl, self.device)
def __call__(self, queue, targets, sources, index_set, **kwargs):
"""Construct a set of blocks of the full P2P interaction matrix.
The blocks are returned as one-dimensional arrays, for performance
and storage reasons. If the two-dimensional form is desired, it can
be obtained using the information in the `index_set` for a block
:math:`i` in the following way:
return knl
.. code-block:: python
def __call__(self, queue, targets, sources, tgtindices, srcindices, **kwargs):
"""Evaluate a subset of the P2P matrix interactions.
blkranges = index_set.linear_ranges()
blkshape = index_set.block_shape(i)
:arg targets: target point coordinates, which can be an object
:class:`~numpy.ndarray`, :class:`list` or :class:`tuple` of
coordinates or a single stacked array.
:arg sources: source point coordinates, which can also be in any of the
formats of the *targets*.
block2d = result[blkranges[i]:blkranges[i + 1]].reshape(*blkshape)
:arg srcindices: an array of indices into *sources*.
:arg tgtindices: an array of indices into *targets*, of the same size
as *srcindices*.
:arg targets: target point coordinates.
:arg sources: source point coordinates.
:arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
to define the blocks.
:return: a tuple of one-dimensional arrays of kernel evaluations at
target-source pairs described by `index_set`.
:returns: a one-dimensional array of interactions, one for each index
pair in (*srcindices*, *tgtindices*).
"""
from pytools.obj_array import is_obj_array
knl = self.get_cached_optimized_kernel(
targets_is_obj_array=(
is_obj_array(targets) or isinstance(targets, (tuple, list))),
sources_is_obj_array=(
is_obj_array(sources) or isinstance(sources, (tuple, list))))
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources))
return knl(queue,
targets=targets,
sources=sources,
tgtindices=index_set.linear_row_indices,
srcindices=index_set.linear_col_indices, **kwargs)
tgtindices=tgtindices,
srcindices=srcindices, **kwargs)
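# Sketch: entry i of each result array is the dense-matrix entry at
# (tgtindices[i], srcindices[i]), so a rectangular block can be gathered as:
#     tgt_idx, src_idx = np.meshgrid(tgt_block, src_block, indexing="ij")
#     evt, (vals,) = gen(queue, targets, sources,
#             tgtindices=tgt_idx.ravel(), srcindices=src_idx.ravel())
#     block = vals.reshape(len(tgt_block), len(src_block))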
# }}}
@@ -409,14 +449,15 @@ class P2PMatrixBlockGenerator(P2PBase):
# {{{ P2P from CSR-like interaction list
class P2PFromCSR(P2PBase):
default_name = "p2p_from_csr"
def get_kernel(self):
loopy_insns, result_names = self.get_loopy_insns_and_result_names()
kernel_exprs = self.get_kernel_exprs(result_names)
arguments = (
self.get_default_src_tgt_arguments()
+ [
@property
def default_name(self):
return "p2p_from_csr"
def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box,
work_items_per_group=32):
loopy_insns, _result_names = self.get_loopy_insns_and_result_names()
arguments = [
*self.get_default_src_tgt_arguments(),
lp.GlobalArg("box_target_starts",
None, shape=None),
lp.GlobalArg("box_target_counts_nonchild",
@@ -432,20 +473,121 @@ class P2PFromCSR(P2PBase):
lp.GlobalArg("strength", None,
shape="nstrengths, nsources", dim_tags="sep,C"),
lp.GlobalArg("result", None,
shape="nkernels, ntargets", dim_tags="sep,C"),
shape="noutputs, ntargets", dim_tags="sep,C"),
lp.TemporaryVariable("tgt_center", shape=(self.dim,)),
"..."
])
]
loopy_knl = lp.make_kernel([
domains = [
"{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
"{[iknl]: 0 <= iknl < noutputs}",
"{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
"{[itgt, isrc, idim]: \
itgt_start <= itgt < itgt_end and \
isrc_start <= isrc < isrc_end and \
0 <= idim < dim}",
],
self.get_kernel_scaling_assignments()
+ ["""
"{[idim]: 0 <= idim < dim}",
]
tgt_outer_limit = (max_ntargets_in_one_box - 1) // work_items_per_group
if self.is_gpu:
arguments += [
lp.TemporaryVariable("local_isrc",
shape=(self.dim, max_nsources_in_one_box)),
lp.TemporaryVariable("local_isrc_strength",
shape=(self.strength_count, max_nsources_in_one_box)),
]
domains += [
"{[istrength]: 0 <= istrength < nstrengths}",
"{[inner]: 0 <= inner < work_items_per_group}",
"{[itgt_offset_outer]: 0 <= itgt_offset_outer <= tgt_outer_limit}",
"{[isrc_prefetch]: 0 <= isrc_prefetch < max_nsources_in_one_box}",
"{[isrc_offset]: 0 <= isrc_offset < max_nsources_in_one_box"
" and isrc_offset < isrc_end - isrc_start}",
]
else:
domains += [
"{[itgt]: itgt_start <= itgt < itgt_end}",
"{[isrc]: isrc_start <= isrc < isrc_end}"
]
# There are two algorithms here because pocl-pthread 1.9 miscompiles
# the "gpu" kernel with prefetching.
if self.is_gpu:
instructions = (self.get_kernel_scaling_assignments()
+ ["""
for itgt_box
<> tgt_ibox = target_boxes[itgt_box] {id=init_0}
<> itgt_start = box_target_starts[tgt_ibox] {id=init_1}
<> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] \
{id=init_2}
<> isrc_box_start = source_box_starts[itgt_box] {id=init_3}
<> isrc_box_end = source_box_starts[itgt_box+1] {id=init_4}
for itgt_offset_outer
for inner
<> itgt_offset = itgt_offset_outer * work_items_per_group + inner
<> itgt = itgt_offset + itgt_start
<> cond_itgt = itgt < itgt_end
<> acc[iknl] = 0 {id=init_acc}
if cond_itgt
tgt_center[idim] = targets[idim, itgt] {id=set_tgt,dup=idim}
end
end
for isrc_box
<> src_ibox = source_box_lists[isrc_box] {id=src_box_insn_0}
<> isrc_start = box_source_starts[src_ibox] {id=src_box_insn_1}
<> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] \
{id=src_box_insn_2}
for isrc_prefetch
<> cond_isrc_prefetch = isrc_prefetch < isrc_end - isrc_start \
{id=cond_isrc_prefetch}
if cond_isrc_prefetch
local_isrc[idim, isrc_prefetch] = sources[idim,
isrc_prefetch + isrc_start] {id=prefetch_src, dup=idim}
local_isrc_strength[istrength, isrc_prefetch] = strength[
istrength, isrc_prefetch + isrc_start] {id=prefetch_charge}
end
end
for inner
if cond_itgt
for isrc_offset
<> isrc = isrc_offset + isrc_start
<> d[idim] = (tgt_center[idim] - local_isrc[idim,
isrc_offset]) \
{id=set_d,dep=prefetch_src:set_tgt}
"""] + ["""
<> is_self = (isrc == target_to_source[itgt])
""" if self.exclude_self else ""]
+ [f"""
<> strength_{i} = local_isrc_strength[{i}, isrc_offset] \
{{id=set_strength{i},dep=prefetch_charge}}
""" for
i in set(self.strength_usage)]
+ loopy_insns
+ [f"""
acc[{iknl}] = acc[{iknl}] + \
pair_result_{iknl} \
{{id=update_acc_{iknl}, dep=init_acc}}
""" for iknl in range(len(self.target_kernels))]
+ ["""
end
end
end
end
"""]
+ [f"""
for inner
if cond_itgt
result[{iknl}, itgt] = knl_{iknl}_scaling * acc[{iknl}] \
{{id_prefix=write_csr,dep=update_acc_{iknl} }}
end
end
""" for iknl in range(len(self.target_kernels))]
+ ["""
end
end
"""])
else:
instructions = (self.get_kernel_scaling_assignments()
+ ["""
for itgt_box
<> tgt_ibox = target_boxes[itgt_box]
<> itgt_start = box_target_starts[tgt_ibox]
@@ -454,68 +596,178 @@ class P2PFromCSR(P2PBase):
<> isrc_box_start = source_box_starts[itgt_box]
<> isrc_box_end = source_box_starts[itgt_box+1]
for isrc_box
<> src_ibox = source_box_lists[isrc_box]
<> isrc_start = box_source_starts[src_ibox]
<> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
for itgt
for itgt
<> acc[iknl] = 0 {id=init_acc}
tgt_center[idim] = targets[idim, itgt] {id=prefetch_tgt,dup=idim}
for isrc_box
<> src_ibox = source_box_lists[isrc_box] {id=src_box_insn_0}
<> isrc_start = box_source_starts[src_ibox] {id=src_box_insn_1}
<> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] \
{id=src_box_insn_2}
for isrc
<> d[idim] = \
targets[idim, itgt] - sources[idim, isrc] {dup=idim}
"""] + ["""
<> d[idim] = (tgt_center[idim] - sources[idim,
isrc]) {dep=prefetch_tgt}
"""] + ["""
<> is_self = (isrc == target_to_source[itgt])
""" if self.exclude_self else ""]
+ loopy_insns + kernel_exprs
+ [" end"]
+ ["""
result[{i}, itgt] = result[{i}, itgt] + \
knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
{{id_prefix=write_csr}}
""".format(i=iknl)
for iknl in range(len(self.kernels))]
+ ["""
+ [f"<> strength_{i} = strength[{i}, isrc]" for
i in set(self.strength_usage)]
+ loopy_insns
+ [f"""
acc[{iknl}] = acc[{iknl}] + \
pair_result_{iknl} \
{{id=update_acc_{iknl}, dep=init_acc}}
""" for iknl in range(len(self.target_kernels))]
+ ["""
end
end
"""]
+ [f"""
result[{iknl}, itgt] = knl_{iknl}_scaling * acc[{iknl}] \
{{id_prefix=write_csr,dep=update_acc_{iknl} }}
""" for iknl in range(len(self.target_kernels))]
+ ["""
end
end
"""],
"""])
loopy_knl = lp.make_kernel(
domains,
instructions,
arguments,
assumptions="ntgt_boxes>=1",
name=self.name,
silenced_warnings="write_race(write_csr*)",
fixed_parameters=dict(
dim=self.dim,
nstrengths=self.strength_count,
nkernels=len(self.kernels)),
silenced_warnings=["write_race(write_csr*)", "write_race(prefetch_src)",
"write_race(prefetch_charge)"],
fixed_parameters={
"dim": self.dim,
"nstrengths": self.strength_count,
"max_nsources_in_one_box": max_nsources_in_one_box,
"max_ntargets_in_one_box": max_ntargets_in_one_box,
"work_items_per_group": work_items_per_group,
"tgt_outer_limit": tgt_outer_limit,
"noutputs": len(self.target_kernels)},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.add_dtypes(loopy_knl,
dict(nsources=np.int32, ntargets=np.int32))
loopy_knl = lp.add_dtypes(
loopy_knl, {"nsources": np.int32, "ntargets": np.int32})
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr")
loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
for knl in self.kernels:
for knl in self.target_kernels + self.source_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
def get_optimized_kernel(self):
# FIXME
knl = self.get_kernel()
import pyopencl as cl
dev = self.context.devices[0]
if dev.type & cl.device_type.CPU:
def get_optimized_kernel(self, max_nsources_in_one_box,
max_ntargets_in_one_box, source_dtype, strength_dtype):
if not self.is_gpu:
knl = self.get_kernel(max_nsources_in_one_box,
max_ntargets_in_one_box)
knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
knl = self._allow_redundant_execution_of_knl_scaling(knl)
else:
knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
dtype_size = np.dtype(strength_dtype).alignment
work_items_per_group = min(256, max_ntargets_in_one_box)
total_local_mem = max_nsources_in_one_box * \
(self.dim + self.strength_count) * dtype_size
# multiplying by 2 here to make sure at least 2 work groups
# can be scheduled at the same time for latency hiding
nprefetch = (2 * total_local_mem - 1) // self.device.local_mem_size + 1
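# e.g. (illustrative numbers): dim=3, strength_count=1,
# max_nsources_in_one_box=512 and float64 strengths (dtype_size=8) give
# total_local_mem = 512*(3+1)*8 = 16384 bytes; with 48 KiB of local
# memory, nprefetch = (2*16384 - 1)//49152 + 1 = 1, i.e. each box's
# sources fit in local memory without chunking the prefetch loop.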
knl = self.get_kernel(max_nsources_in_one_box,
max_ntargets_in_one_box,
work_items_per_group=work_items_per_group)
knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"})
knl = lp.set_temporary_address_space(knl,
["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL)
local_arrays = ["local_isrc", "local_isrc_strength"]
local_array_isrc_axis = [1, 1]
local_array_sizes = [self.dim, self.strength_count]
local_array_dtypes = [source_dtype, strength_dtype]
# By concatenating the temporaries into one memory layout
# and marking the first axis as vec, we transpose the
# arrays and make the access of the source
# coordinates and the strength for each source a coalesced
# 256-bit access (assuming double precision), which is
# optimized for NVIDIA GPUs. On an NVIDIA Titan V, this
# optimization led to an 8% speedup.
if strength_dtype == source_dtype:
knl = lp.concatenate_arrays(knl, local_arrays, "local_isrc")
local_arrays = ["local_isrc"]
local_array_sizes = [self.dim + self.strength_count]
local_array_dtypes = [source_dtype]
# We try to mark the local arrays (sources, strengths)
# as vec for the first dimension
for i, (array_name, array_size, array_dtype) in \
enumerate(zip(local_arrays, local_array_sizes,
local_array_dtypes, strict=True)):
if issubclass(array_dtype.type, np.complexfloating):
# pyopencl does not support complex data type vectors
continue
if array_size in [2, 3, 4, 8, 16]:
knl = lp.tag_array_axes(knl, array_name, "vec,C")
else:
# FIXME: check if CUDA
n = 16 // dtype_size
if n in [1, 2, 4, 8]:
knl = lp.split_array_axis(knl, array_name, 0, n)
knl = lp.tag_array_axes(knl, array_name, "C,vec,C")
local_array_isrc_axis[i] = 2
# We need to split isrc_prefetch and isrc_offset into chunks.
nsources = (max_nsources_in_one_box + nprefetch - 1) // nprefetch
for local_array, axis in zip(local_arrays, local_array_isrc_axis,
strict=True):
knl = lp.split_array_axis(knl, local_array, axis, nsources)
knl = lp.split_iname(knl, "isrc_prefetch", nsources,
outer_iname="iprefetch")
knl = lp.split_iname(knl, "isrc_prefetch_inner", work_items_per_group)
knl = lp.tag_inames(knl, {"isrc_prefetch_inner_inner": "l.0"})
knl = lp.split_iname(knl, "isrc_offset", nsources,
outer_iname="iprefetch")
# After splitting, the temporary array local_isrc need not
# be as large as before; simplify the indices before unprivatizing.
knl = lp.simplify_indices(knl)
knl = lp.unprivatize_temporaries_with_inames(knl,
"iprefetch", only_var_names=local_arrays)
knl = lp.add_inames_to_insn(knl,
"inner", "id:init_* or id:*_scaling or id:src_box_insn_*")
knl = lp.add_inames_to_insn(knl, "itgt_box", "id:*_scaling")
# knl = lp.set_options(knl, write_code=True)
knl = lp.set_options(knl,
enforce_variable_access_ordered="no_check")
knl = register_optimization_preambles(knl, self.device)
return knl
def __call__(self, queue, **kwargs):
knl = self.get_cached_optimized_kernel()
max_nsources_in_one_box = kwargs.pop("max_nsources_in_one_box")
max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box")
if self.is_gpu:
source_dtype = kwargs.get("sources")[0].dtype
strength_dtype = kwargs.get("strength").dtype
else:
# these are unused on non-GPU devices and would defeat the caching;
# set them to None to keep the caching across dtypes
source_dtype = None
strength_dtype = None
knl = self.get_cached_kernel_executor(
max_nsources_in_one_box=max_nsources_in_one_box,
max_ntargets_in_one_box=max_ntargets_in_one_box,
source_dtype=source_dtype,
strength_dtype=strength_dtype,
)
return knl(queue, **kwargs)
sumpy/point_calculus.py
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Andreas Kloeckner"
@@ -24,8 +25,10 @@ THE SOFTWARE.
import numpy as np
import numpy.linalg as la
from pytools import memoize_method
__doc__ = """
.. autoclass:: CalculusPatch
@@ -33,13 +36,13 @@ __doc__ = """
"""
class CalculusPatch(object):
class CalculusPatch:
"""Sets up a grid of points on which derivatives can be calculated. Useful
to verify that an evaluated potential actually solves a PDE.
.. attribute: dim
.. attribute:: dim
.. attribute: points
.. attribute:: points
shape: ``(dim, npoints_total)``
@@ -48,7 +51,7 @@ class CalculusPatch(object):
.. automethod:: diff
.. automethod:: dx
.. automethod:: dy
.. automethod:: dy
.. automethod:: dz
.. automethod:: laplace
.. automethod:: div
.. automethod:: curl
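# Illustrative sketch (constructor signature assumed to be
# CalculusPatch(center, h=...) as suggested by the attributes above):
# verify that a point potential is harmonic away from its source.
import numpy as np
from sumpy.point_calculus import CalculusPatch

patch = CalculusPatch(np.array([0., 0., 0.]), h=0.1)
x0 = np.array([5., 0., 0.])  # source point, well away from the patch
dist = np.sqrt(np.sum((patch.points - x0[:, np.newaxis])**2, axis=0))
u = 1.0/dist                 # 3D Laplace point potential
assert patch.norm(patch.laplace(u), np.inf) < 1e-3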
@@ -80,7 +83,7 @@ class CalculusPatch(object):
weights_1d = weights_1d * (h/2)
else:
raise ValueError("invalid node set: %s" % nodes)
raise ValueError(f"invalid node set: {nodes}")
self.h = h
self.npoints = npoints
@@ -116,13 +119,14 @@ class CalculusPatch(object):
return self._vandermonde_1d()[0]
def basis(self):
""""
"""
:returns: a :class:`list` containing functions that realize
a high-order interpolation basis on the :attr:`points`.
a high-order interpolation basis on the :py:attr:`points`.
"""
from scipy.special import eval_chebyt # pylint: disable=no-name-in-module
from pytools import indices_in_shape
from scipy.special import eval_chebyt
def eval_basis(ind, x):
result = 1
@@ -161,7 +165,7 @@ class CalculusPatch(object):
coeff_diff_mat = np.diag(np.arange(1, npoints), 1)
n_diff_mat = np.eye(npoints)
for i in range(nderivs):
for _ in range(nderivs):
n_diff_mat = n_diff_mat.dot(coeff_diff_mat)
deriv_coeffs_mat = la.solve(vandermonde.T, n_diff_mat.T).T
@@ -175,7 +179,7 @@ class CalculusPatch(object):
"""
from numbers import Number
if isinstance(f_values, (np.number, Number)):
if isinstance(f_values, np.number | Number):
# constants differentiate to 0
return 0
@@ -188,7 +192,7 @@ class CalculusPatch(object):
tgt_axes = (axes[:axis] + "i" + axes[axis:])[:dim]
return np.einsum(
"ij,%s->%s" % (src_axes, tgt_axes),
f"ij,{src_axes}->{tgt_axes}",
self._diff_mat_1d(nderivs),
f_values.reshape(*self._pshape)).reshape(-1)
@@ -211,9 +215,9 @@ class CalculusPatch(object):
return sum(self.diff(iaxis, f_values, 2) for iaxis in range(self.dim))
def div(self, arg):
"""
r"""
:arg arg: an object array containing
:class:`numpy.ndarrays` with shape ``(npoints_total,)``.
:class:`numpy.ndarray`\ s with shape ``(npoints_total,)``.
"""
result = 0
for i, arg_i in enumerate(arg):
@@ -222,18 +226,18 @@ class CalculusPatch(object):
return result
def curl(self, arg):
"""Take the curl of the vector quantity *arg*.
r"""Take the curl of the vector quantity *arg*.
:arg arg: an object array of shape ``(3,)`` containing
:class:`numpy.ndarrays` with shape ``(npoints_total,)``.
:class:`numpy.ndarray`\ s with shape ``(npoints_total,)``.
"""
from pytools import levi_civita
from pytools.obj_array import make_obj_array
return make_obj_array([
sum(
levi_civita((l, m, n)) * self.diff(m, arg[n])
levi_civita((k, m, n)) * self.diff(m, arg[n])
for m in range(3) for n in range(3))
for l in range(3)])
for k in range(3)])
def eval_at_center(self, f_values):
"""Interpolate *f_values* to the center point.
@@ -242,8 +246,10 @@ class CalculusPatch(object):
:returns: a scalar.
"""
f_values = f_values.reshape(*self._pshape)
for i in range(self.dim):
f_values = self._zero_eval_vec_1d.dot(f_values)
zero_eval_vec_1d = self._zero_eval_vec_1d
for _ in range(self.dim):
f_values = zero_eval_vec_1d @ f_values
return f_values
@@ -261,7 +267,7 @@ class CalculusPatch(object):
def norm(self, arg, p):
if p == np.inf:
if arg.dtype == np.object:
if arg.dtype == object:
return max(
la.norm(x_i, p)
for x_i in arg)
@@ -297,7 +303,7 @@ def frequency_domain_maxwell(cpatch, e, h, k):
# https://en.wikipedia.org/w/index.php?title=Maxwell%27s_equations&oldid=798940325#Macroscopic_formulation
# assumed time dependence exp(-1j*omega*t)
# Agrees with Jackson, Third Ed., (8.16)
# This agrees with Jackson, Third Ed., (8.16)
resid_faraday = cpatch.curl(e) - 1j * omega * b
resid_ampere = cpatch.curl(h) + 1j * omega * d
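# In this convention the residuals correspond to
#     curl E - i*omega*B = 0   (Faraday's law)
#     curl H + i*omega*D = 0   (Ampere's law, source-free)
# so both vanish for a genuine frequency-domain Maxwell solution.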
sumpy/qbx.py
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = """
Copyright (C) 2012 Andreas Kloeckner
@@ -26,18 +27,19 @@ THE SOFTWARE.
"""
import six
from six.moves import range
import logging
import numpy as np
import loopy as lp
from loopy.version import MOST_RECENT_LANGUAGE_VERSION
import sumpy.symbolic as sym
from pytools import memoize_method
from pymbolic import parse, var
from pytools import memoize_method
import sumpy.symbolic as sym
from sumpy.tools import KernelCacheMixin, KernelComputation, is_obj_array_like
from sumpy.tools import KernelComputation, KernelCacheWrapper
import logging
logger = logging.getLogger(__name__)
@@ -49,7 +51,7 @@ QBX for Layer Potentials
.. autoclass:: LayerPotentialBase
.. autoclass:: LayerPotential
.. autoclass:: LayerPotentialMatrixGenerator
.. autoclass:: LayerPotentialMatrixBlockGenerator
.. autoclass:: LayerPotentialMatrixSubsetGenerator
"""
@@ -60,49 +62,67 @@ def stringify_expn_index(i):
else:
assert isinstance(i, int)
if i < 0:
return "m%d" % (-i)
return f"m{-i}"
else:
return str(i)
def expand(expansion_nr, sac, expansion, avec, bvec):
rscale = sym.Symbol("rscale")
coefficients = expansion.coefficients_from_source(avec, bvec, rscale)
assigned_coeffs = [
sym.Symbol(
sac.assign_unique("expn%dcoeff%s" % (
expansion_nr, stringify_expn_index(i)),
coefficients[expansion.get_storage_index(i)]))
for i in expansion.get_coefficient_identifiers()]
return sac.assign_unique("expn%d_result" % expansion_nr,
expansion.evaluate(assigned_coeffs, bvec, rscale))
# {{{ layer potential computation
# {{{ base class
class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
def __init__(self, ctx, expansions, strength_usage=None,
value_dtypes=None,
options=[], name=None, device=None):
KernelComputation.__init__(self, ctx, expansions, strength_usage,
value_dtypes,
name, options, device)
class LayerPotentialBase(KernelCacheMixin, KernelComputation):
def __init__(self, ctx, expansion, strength_usage=None,
value_dtypes=None, name=None, device=None,
source_kernels=None, target_kernels=None):
from pytools import single_valued
self.dim = single_valued(knl.dim for knl in self.expansions)
KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels,
strength_usage=strength_usage, source_kernels=source_kernels,
value_dtypes=value_dtypes, name=name, device=device)
self.dim = single_valued(knl.dim for knl in self.target_kernels)
self.expansion = expansion
def get_cache_key(self):
return (type(self).__name__, tuple(self.kernels),
tuple(self.strength_usage), tuple(self.value_dtypes))
return (type(self).__name__, self.expansion, tuple(self.target_kernels),
tuple(self.source_kernels), tuple(self.strength_usage),
tuple(self.value_dtypes),
self.device.hashable_model_and_version_identifier)
def _expand(self, sac, avec, bvec, rscale, isrc):
from sumpy.symbolic import PymbolicToSympyMapper
conv = PymbolicToSympyMapper()
strengths = [conv(self.get_strength_or_not(isrc, idx))
for idx in range(len(self.source_kernels))]
coefficients = self.expansion.coefficients_from_source_vec(
self.source_kernels, avec, bvec, rscale=rscale, weights=strengths,
sac=sac)
return coefficients
def _evaluate(self, sac, avec, bvec, rscale, expansion_nr, coefficients):
from sumpy.expansion.local import LineTaylorLocalExpansion
tgt_knl = self.target_kernels[expansion_nr]
if isinstance(self.expansion, LineTaylorLocalExpansion):
# In LineTaylorLocalExpansion.evaluate, we can't run
# postprocess_at_target because the coefficients are assigned
# symbols and postprocess with a derivative will make them zero.
# Instead run postprocess here before the coefficients are assigned.
coefficients = [tgt_knl.postprocess_at_target(coeff, bvec) for
coeff in coefficients]
assigned_coeffs = [
sym.Symbol(
sac.assign_unique(
f"expn{expansion_nr}coeff{stringify_expn_index(i)}",
coefficients[self.expansion.get_storage_index(i)])
)
for i in self.expansion.get_coefficient_identifiers()]
@property
def expansions(self):
return self.kernels
return sac.assign_unique(f"expn{expansion_nr}_result",
self.expansion.evaluate(tgt_knl, assigned_coeffs, bvec, rscale))
def get_loopy_insns_and_result_names(self):
from sumpy.symbolic import make_sym_vector
@@ -114,19 +134,25 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
logger.info("compute expansion expressions: start")
result_names = [expand(i, sac, expn, avec, bvec)
for i, expn in enumerate(self.expansions)]
rscale = sym.Symbol("rscale")
isrc_sym = var("isrc")
coefficients = self._expand(sac, avec, bvec, rscale, isrc_sym)
result_names = [self._evaluate(sac, avec, bvec, rscale, i, coefficients)
for i in range(len(self.target_kernels))]
logger.info("compute expansion expressions: done")
sac.run_global_cse()
pymbolic_expr_maps = [knl.get_code_transformer() for knl in (
self.target_kernels + self.source_kernels)]
from sumpy.codegen import to_loopy_insns
loopy_insns = to_loopy_insns(
six.iteritems(sac.assignments),
vector_names=set(["a", "b"]),
pymbolic_expr_maps=[
expn.kernel.get_code_transformer() for expn in self.expansions],
sac.assignments.items(),
vector_names={"a", "b"},
pymbolic_expr_maps=pymbolic_expr_maps,
retain_names=result_names,
complex_dtype=np.complex128 # FIXME
)
@@ -134,52 +160,66 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
return loopy_insns, result_names
def get_strength_or_not(self, isrc, kernel_idx):
return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc)
return var(f"strength_{self.strength_usage[kernel_idx]}_isrc")
def get_kernel_exprs(self, result_names):
isrc_sym = var("isrc")
exprs = [var(name) * self.get_strength_or_not(isrc_sym, i)
for i, name in enumerate(result_names)]
exprs = [var(name) for name in result_names]
return [lp.Assignment(id=None,
assignee="pair_result_%d" % i, expression=expr,
assignee=f"pair_result_{i}",
expression=expr,
temp_var_type=lp.Optional(None))
for i, expr in enumerate(exprs)]
def get_default_src_tgt_arguments(self):
from sumpy.tools import gather_loopy_source_arguments
return ([
lp.GlobalArg("src", None,
return [
lp.GlobalArg("sources", None,
shape=(self.dim, "nsources"), order="C"),
lp.GlobalArg("tgt", None,
lp.GlobalArg("targets", None,
shape=(self.dim, "ntargets"), order="C"),
lp.GlobalArg("center", None,
shape=(self.dim, "ntargets"), dim_tags="sep,C"),
shape=(self.dim, "ntargets"), order="C"),
lp.GlobalArg("expansion_radii",
None, shape="ntargets"),
lp.ValueArg("nsources", None),
lp.ValueArg("ntargets", None)]
+ gather_loopy_source_arguments(self.kernels))
lp.ValueArg("ntargets", None),
*gather_loopy_source_arguments(self.source_kernels)
]
def get_kernel(self):
raise NotImplementedError
def get_optimized_kernel(self):
def get_optimized_kernel(self,
targets_is_obj_array, sources_is_obj_array, centers_is_obj_array,
# Used by pytential to override the name of the loop to be
# parallelized. In the case of QBX, that's the loop over QBX
# targets (not global targets).
itgt_name="itgt"):
# FIXME specialize/tune for GPU/CPU
loopy_knl = self.get_kernel()
if targets_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
if sources_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
if centers_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")
import pyopencl as cl
dev = self.context.devices[0]
if dev.type & cl.device_type.CPU:
loopy_knl = lp.split_iname(loopy_knl, "itgt", 16, outer_tag="g.0",
loopy_knl = lp.split_iname(loopy_knl, itgt_name, 16, outer_tag="g.0",
inner_tag="l.0")
loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
loopy_knl = lp.prioritize_loops(loopy_knl,
["isrc_outer", "itgt_inner"])
["isrc_outer", f"{itgt_name}_inner"])
else:
from warnings import warn
warn("don't know how to tune layer potential computation for '%s'" % dev)
loopy_knl = lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")
warn(f"don't know how to tune layer potential computation for '{dev}'",
stacklevel=1)
loopy_knl = lp.split_iname(loopy_knl, itgt_name, 128, outer_tag="g.0")
loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl)
return loopy_knl
@@ -194,7 +234,9 @@ class LayerPotential(LayerPotentialBase):
.. automethod:: __call__
"""
default_name = "qbx_apply"
@property
def default_name(self):
return "qbx_apply"
@memoize_method
def get_kernel(self):
@@ -202,12 +244,14 @@ class LayerPotential(LayerPotentialBase):
kernel_exprs = self.get_kernel_exprs(result_names)
arguments = (
self.get_default_src_tgt_arguments()
+ [lp.GlobalArg("strength_%d" % i,
None, shape="nsources", order="C")
for i in range(self.strength_count)]
+ [lp.GlobalArg("result_%d" % i,
None, shape="ntargets", order="C")
for i in range(len(self.kernels))])
+ [
lp.GlobalArg(f"strength_{i}", None, shape="nsources", order="C")
for i in range(self.strength_count)
]
+ [
lp.GlobalArg(f"result_{i}", None, shape="ntargets", order="C")
for i in range(len(self.target_kernels))
])
loopy_knl = lp.make_kernel(["""
{[itgt, isrc, idim]: \
@@ -217,28 +261,30 @@ class LayerPotential(LayerPotentialBase):
"""],
self.get_kernel_scaling_assignments()
+ ["for itgt, isrc"]
+ ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"]
+ ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"]
+ ["<> a[idim] = center[idim, itgt] - sources[idim, isrc]"]
+ ["<> b[idim] = targets[idim, itgt] - center[idim, itgt] {dup=idim}"]
+ ["<> rscale = expansion_radii[itgt]"]
+ [f"<> strength_{i}_isrc = strength_{i}[isrc]" for i in
range(self.strength_count)]
+ loopy_insns + kernel_exprs
+ ["""
result_{i}[itgt] = knl_{i}_scaling * \
simul_reduce(sum, isrc, pair_result_{i}) \
+ [f"""
result_{iknl}[itgt] = knl_{iknl}_scaling * \
simul_reduce(sum, isrc, pair_result_{iknl}) \
{{id_prefix=write_lpot,inames=itgt}}
""".format(i=iknl)
for iknl in range(len(self.expansions))]
"""
for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
name=self.name,
assumptions="ntargets>=1 and nsources>=1",
default_offset=lp.auto,
silenced_warnings="write_race(write_lpot*)",
fixed_parameters=dict(dim=self.dim),
fixed_parameters={"dim": self.dim},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
for expn in self.expansions:
loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
for knl in self.target_kernels + self.source_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
@@ -249,12 +295,15 @@ class LayerPotential(LayerPotentialBase):
already multiplied in.
"""
knl = self.get_cached_optimized_kernel()
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources),
centers_is_obj_array=is_obj_array_like(centers))
for i, dens in enumerate(strengths):
kwargs["strength_%d" % i] = dens
kwargs[f"strength_{i}"] = dens
return knl(queue, src=sources, tgt=targets, center=centers,
return knl(queue, sources=sources, targets=targets, center=centers,
expansion_radii=expansion_radii, **kwargs)
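# Illustrative sketch (argument order of __call__ assumed from the call
# above; ctx, queue, geometry arrays, density sigma and quadrature
# weights are given, not shown):
from sumpy.expansion.local import LineTaylorLocalExpansion
from sumpy.kernel import LaplaceKernel
from sumpy.qbx import LayerPotential

knl = LaplaceKernel(2)
lpot = LayerPotential(ctx, expansion=LineTaylorLocalExpansion(knl, order=4),
        source_kernels=(knl,), target_kernels=(knl,))
evt, (pot,) = lpot(queue, targets, sources, centers,
        [sigma * weights],  # strengths, quadrature weights multiplied in
        expansion_radii)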
# }}}
@@ -265,7 +314,9 @@ class LayerPotential(LayerPotentialBase):
class LayerPotentialMatrixGenerator(LayerPotentialBase):
"""Generator for layer potential matrix entries."""
default_name = "qbx_matrix"
@property
def default_name(self):
return "qbx_matrix"
def get_strength_or_not(self, isrc, kernel_idx):
return 1
@@ -276,9 +327,11 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
kernel_exprs = self.get_kernel_exprs(result_names)
arguments = (
self.get_default_src_tgt_arguments()
+ [lp.GlobalArg("result_%d" % i,
dtype, shape="ntargets, nsources", order="C")
for i, dtype in enumerate(self.value_dtypes)])
+ [
lp.GlobalArg(f"result_{i}",
dtype, shape="ntargets, nsources", order="C")
for i, dtype in enumerate(self.value_dtypes)
])
loopy_knl = lp.make_kernel(["""
{[itgt, isrc, idim]: \
@@ -288,48 +341,53 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
"""],
self.get_kernel_scaling_assignments()
+ ["for itgt, isrc"]
+ ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"]
+ ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"]
+ ["<> a[idim] = center[idim, itgt] - sources[idim, isrc]"]
+ ["<> b[idim] = targets[idim, itgt] - center[idim, itgt] {dup=idim}"]
+ ["<> rscale = expansion_radii[itgt]"]
+ loopy_insns + kernel_exprs
+ ["""
result_{i}[itgt, isrc] = \
knl_{i}_scaling * pair_result_{i} \
+ [f"""
result_{iknl}[itgt, isrc] = \
knl_{iknl}_scaling * pair_result_{iknl} \
{{inames=isrc:itgt}}
""".format(i=iknl)
for iknl in range(len(self.expansions))]
"""
for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
name=self.name,
assumptions="ntargets>=1 and nsources>=1",
default_offset=lp.auto,
fixed_parameters=dict(dim=self.dim),
fixed_parameters={"dim": self.dim},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
for expn in self.expansions:
for expn in self.source_kernels + self.target_kernels:
loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
return loopy_knl
def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs):
knl = self.get_cached_optimized_kernel()
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources),
centers_is_obj_array=is_obj_array_like(centers))
return knl(queue, src=sources, tgt=targets, center=centers,
return knl(queue, sources=sources, targets=targets, center=centers,
expansion_radii=expansion_radii, **kwargs)
# }}}
# {{{
# {{{ matrix subset generator
class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
class LayerPotentialMatrixSubsetGenerator(LayerPotentialBase):
"""Generator for a subset of the layer potential matrix entries.
.. automethod:: __call__
"""
default_name = "qbx_block"
@property
def default_name(self):
return "qbx_subset"
def get_strength_or_not(self, isrc, kernel_idx):
return 1
@@ -345,8 +403,10 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
lp.GlobalArg("tgtindices", None, shape="nresult"),
lp.ValueArg("nresult", None)
]
+ [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
for i, dtype in enumerate(self.value_dtypes)])
+ [
lp.GlobalArg(f"result_{i}", dtype, shape="nresult")
for i, dtype in enumerate(self.value_dtypes)
])
loopy_knl = lp.make_kernel([
"{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
@@ -360,62 +420,84 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
<> itgt = tgtindices[imat]
<> isrc = srcindices[imat]
<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}
<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}
<> a[idim] = center[idim, itgt] - sources[idim, isrc]
<> b[idim] = targets[idim, itgt] - center[idim, itgt] {dup=idim}
<> rscale = expansion_radii[itgt]
"""]
+ loopy_insns + kernel_exprs
+ ["""
result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \
+ [f"""
result_{iknl}[imat] = knl_{iknl}_scaling * pair_result_{iknl} \
{{id_prefix=write_lpot}}
""".format(i=iknl)
for iknl in range(len(self.expansions))]
"""
for iknl in range(len(self.target_kernels))]
+ ["end"],
arguments,
name=self.name,
assumptions="nresult>=1",
default_offset=lp.auto,
silenced_warnings="write_race(write_lpot*)",
fixed_parameters=dict(dim=self.dim),
fixed_parameters={"dim": self.dim},
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
loopy_knl = lp.add_dtypes(loopy_knl,
dict(nsources=np.int32, ntargets=np.int32))
loopy_knl = lp.add_dtypes(
loopy_knl, {"nsources": np.int32, "ntargets": np.int32})
for expn in self.expansions:
loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
for knl in self.source_kernels + self.target_kernels:
loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
return loopy_knl
def get_optimized_kernel(self):
def get_optimized_kernel(self,
targets_is_obj_array, sources_is_obj_array, centers_is_obj_array):
loopy_knl = self.get_kernel()
if targets_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
if sources_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
if centers_is_obj_array:
loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")
loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0")
loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl)
return loopy_knl
def __call__(self, queue, targets, sources, centers, expansion_radii,
index_set, **kwargs):
"""
:arg targets: target point coordinates.
:arg sources: source point coordinates.
:arg centers: QBX target expansion centers.
tgtindices, srcindices, **kwargs):
"""Evaluate a subset of the QBX matrix interactions.
:arg targets: target point coordinates, which can be an object
:class:`~numpy.ndarray`, :class:`list` or :class:`tuple` of
coordinates or a single stacked array.
:arg sources: source point coordinates, which can also be in any of the
formats of the *targets*.
:arg centers: QBX target expansion center coordinates, which can also
be in any of the formats of the *targets*. The number of centers
must match the number of targets.
:arg expansion_radii: radii for each expansion center.
:arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
to define the blocks.
:return: a tuple of one-dimensional arrays of kernel evaluations at
target-source pairs described by `index_set`.
:arg srcindices: an array of indices into *sources*.
:arg tgtindices: an array of indices into *targets*, of the same size
as *srcindices*.
:returns: a one-dimensional array of interactions, one for each index
pair in (*srcindices*, *tgtindices*).
"""
knl = self.get_cached_optimized_kernel()
knl = self.get_cached_kernel_executor(
targets_is_obj_array=is_obj_array_like(targets),
sources_is_obj_array=is_obj_array_like(sources),
centers_is_obj_array=is_obj_array_like(centers))
return knl(queue,
src=sources,
tgt=targets,
sources=sources,
targets=targets,
center=centers,
expansion_radii=expansion_radii,
tgtindices=index_set.linear_row_indices,
srcindices=index_set.linear_col_indices, **kwargs)
tgtindices=tgtindices,
srcindices=srcindices, **kwargs)
# }}}
@@ -426,10 +508,12 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
def find_jump_term(kernel, arg_provider):
from sumpy.kernel import (
AxisTargetDerivative,
DirectionalSourceDerivative,
DirectionalTargetDerivative,
DerivativeBase)
AxisSourceDerivative,
AxisTargetDerivative,
DerivativeBase,
DirectionalSourceDerivative,
DirectionalTargetDerivative,
)
tgt_derivatives = []
src_derivatives = []
@@ -441,12 +525,15 @@ def find_jump_term(kernel, arg_provider):
elif isinstance(kernel, DirectionalTargetDerivative):
tgt_derivatives.append(kernel.dir_vec_name)
kernel = kernel.kernel
elif isinstance(kernel, AxisSourceDerivative):
src_derivatives.append(kernel.axis)
kernel = kernel.kernel
elif isinstance(kernel, DirectionalSourceDerivative):
src_derivatives.append(kernel.dir_vec_name)
kernel = kernel.kernel
else:
raise RuntimeError("derivative type '%s' not understood"
% type(kernel))
raise RuntimeError(
f"derivative type '{type(kernel).__name__}' not understood")
tgt_count = len(tgt_derivatives)
src_count = len(src_derivatives)
@@ -495,7 +582,7 @@ def find_jump_term(kernel, arg_provider):
elif tgt_count == 1:
from warnings import warn
warn("jump terms for mixed derivatives (1 src+1 tgt) only available "
"for the double-layer potential")
"for the double-layer potential", stacklevel=1)
i, = tgt_derivatives
assert isinstance(i, int)
return (
@@ -503,13 +590,14 @@ def find_jump_term(kernel, arg_provider):
* info.tangent[i]
* info.density_prime)
raise ValueError("don't know jump term for %d "
"target and %d source derivatives" % (tgt_count, src_count))
raise ValueError(
f"Do not know jump term for {tgt_count} target "
f"and {src_count} source derivatives")
# {{{ symbolic argument provider
class _JumpTermSymbolicArgumentProvider(object):
class _JumpTermSymbolicArgumentProvider:
"""This class answers requests by :func:`find_jump_term` for symbolic values
of quantities needed for the computation of the jump terms and tracks what
data was requested. This tracking allows assembling the argument list of the
@@ -531,77 +619,77 @@ class _JumpTermSymbolicArgumentProvider(object):
self.arguments[self.density_var_name] = \
lp.GlobalArg(self.density_var_name, self.density_dtype,
shape="ntargets", order="C")
return parse("%s[itgt]" % self.density_var_name)
return parse(f"{self.density_var_name}[itgt]")
@property
@memoize_method
def density_prime(self):
prime_var_name = self.density_var_name+"_prime"
self.arguments[prime_var_name] = \
prime_var_name = f"{self.density_var_name}_prime"
self.arguments[prime_var_name] = (
lp.GlobalArg(prime_var_name, self.density_dtype,
shape="ntargets", order="C")
return parse("%s[itgt]" % prime_var_name)
shape="ntargets", order="C"))
return parse(f"{prime_var_name}[itgt]")
@property
@memoize_method
def side(self):
self.arguments["side"] = \
lp.GlobalArg("side", self.geometry_dtype, shape="ntargets")
self.arguments["side"] = (
lp.GlobalArg("side", self.geometry_dtype, shape="ntargets"))
return parse("side[itgt]")
@property
@memoize_method
def normal(self):
self.arguments["normal"] = \
self.arguments["normal"] = (
lp.GlobalArg("normal", self.geometry_dtype,
shape=("ntargets", self.dim), order="C")
shape=("ntargets", self.dim), order="C"))
from pytools.obj_array import make_obj_array
return make_obj_array([
parse("normal[itgt, %d]" % i)
parse(f"normal[itgt, {i}]")
for i in range(self.dim)])
@property
@memoize_method
def tangent(self):
self.arguments["tangent"] = \
self.arguments["tangent"] = (
lp.GlobalArg("tangent", self.geometry_dtype,
shape=("ntargets", self.dim), order="C")
shape=("ntargets", self.dim), order="C"))
from pytools.obj_array import make_obj_array
return make_obj_array([
parse("tangent[itgt, %d]" % i)
parse(f"tangent[itgt, {i}]")
for i in range(self.dim)])
@property
@memoize_method
def mean_curvature(self):
self.arguments["mean_curvature"] = \
self.arguments["mean_curvature"] = (
lp.GlobalArg("mean_curvature",
self.geometry_dtype, shape="ntargets",
order="C")
self.geometry_dtype, shape="ntargets",
order="C"))
return parse("mean_curvature[itgt]")
@property
@memoize_method
def src_derivative_dir(self):
self.arguments["src_derivative_dir"] = \
self.arguments["src_derivative_dir"] = (
lp.GlobalArg("src_derivative_dir",
self.geometry_dtype, shape=("ntargets", self.dim),
order="C")
self.geometry_dtype, shape=("ntargets", self.dim),
order="C"))
from pytools.obj_array import make_obj_array
return make_obj_array([
parse("src_derivative_dir[itgt, %d]" % i)
parse(f"src_derivative_dir[itgt, {i}]")
for i in range(self.dim)])
@property
@memoize_method
def tgt_derivative_dir(self):
self.arguments["tgt_derivative_dir"] = \
self.arguments["tgt_derivative_dir"] = (
lp.GlobalArg("tgt_derivative_dir",
self.geometry_dtype, shape=("ntargets", self.dim),
order="C")
self.geometry_dtype, shape=("ntargets", self.dim),
order="C"))
from pytools.obj_array import make_obj_array
return make_obj_array([
parse("tgt_derivative_dir[itgt, %d]" % i)
parse(f"tgt_derivative_dir[itgt, {i}]")
for i in range(self.dim)])
# }}}
sumpy/symbolic.py
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -22,17 +23,35 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
__doc__ = """
Symbolic Tools
==============
.. class:: Basic
The expression base class for the "heavy-duty" computer algebra toolkit
in use. Either :class:`sympy.core.basic.Basic` or :class:`symengine.Basic`.
.. autoclass:: SpatialConstant
"""
import six
from six.moves import range
import logging
import math
from typing import TYPE_CHECKING, ClassVar
import numpy as np
from pymbolic.mapper import IdentityMapper as IdentityMapperBase
import sympy as sp
import pymbolic.primitives as prim
from pymbolic.mapper import IdentityMapper as IdentityMapperBase
import logging
logger = logging.getLogger(__name__)
USE_SYMENGINE = False
# {{{ symbolic backend
@@ -40,28 +59,28 @@ def _find_symbolic_backend():
global USE_SYMENGINE
try:
import symengine # noqa
import symengine # noqa: F401
symengine_found = True
symengine_error = None
except ImportError as import_error:
symengine_found = False
symengine_error = import_error
ALLOWED_BACKENDS = ("sympy", "symengine") # noqa
BACKEND_ENV_VAR = "SUMPY_FORCE_SYMBOLIC_BACKEND" # noqa
allowed_backends = ("sympy", "symengine")
backend_env_var = "SUMPY_FORCE_SYMBOLIC_BACKEND"
import os
backend = os.environ.get(BACKEND_ENV_VAR)
backend = os.environ.get(backend_env_var)
if backend is not None:
if backend not in ALLOWED_BACKENDS:
if backend not in allowed_backends:
raise RuntimeError(
"%s value is unrecognized: '%s' "
"(allowed values are %s)" % (
BACKEND_ENV_VAR,
backend,
", ".join("'%s'" % val for val in ALLOWED_BACKENDS)))
f"{backend_env_var} value is unrecognized: '{backend}' "
"(allowed values are {})".format(
", ".join(f"'{val}'" for val in allowed_backends))
)
if backend == "symengine" and not symengine_found:
raise RuntimeError("could not find SymEngine: %s" % symengine_error)
raise RuntimeError(f"could not find SymEngine: {symengine_error}")
USE_SYMENGINE = (backend == "symengine")
else:
@@ -72,24 +91,47 @@ _find_symbolic_backend()
# }}}
# Symbolic API common to SymEngine and sympy.
# Before adding a function here, make sure it's present in both modules.
SYMBOLIC_API = """
Add Basic Mul Pow exp sqrt log symbols sympify cos sin atan2 Function Symbol
Derivative Integer Matrix Subs I pi functions""".split()
if USE_SYMENGINE:
import symengine as sym
from pymbolic.interop.symengine import (
PymbolicToSymEngineMapper as PymbolicToSympyMapper,
SymEngineToPymbolicMapper as SympyToPymbolicMapper)
PymbolicToSymEngineMapper as PymbolicToSympyMapperBase,
SymEngineToPymbolicMapper as SympyToPymbolicMapperBase,
)
else:
import sympy as sym
from pymbolic.interop.sympy import (
PymbolicToSympyMapper, SympyToPymbolicMapper)
for _apifunc in SYMBOLIC_API:
globals()[_apifunc] = getattr(sym, _apifunc)
from pymbolic.interop.sympy import ( # type: ignore[assignment]
PymbolicToSympyMapper as PymbolicToSympyMapperBase,
SympyToPymbolicMapper as SympyToPymbolicMapperBase,
)
# Symbolic API common to SymEngine and sympy.
# Before adding a function here, make sure it's present in both modules.
Add = sym.Add
Basic = sym.Basic
Mul = sym.Mul
Pow = sym.Pow
exp = sym.exp
sqrt = sym.sqrt
log = sym.log
symbols = sym.symbols
sympify = sym.sympify
cos = sym.cos
sin = sym.sin
atan2 = sym.atan2
Function = sym.Function
Symbol = sym.Symbol
Derivative = sym.Derivative
Integer = sym.Integer
Rational = sym.Rational
Matrix = sym.Matrix
Subs = sym.Subs
I = sym.I # noqa: E741
pi = sym.pi
functions = sym.functions
Number = sym.Number
Float = sym.Float
def _coeff_isneg(a):
@@ -98,23 +140,27 @@ def _coeff_isneg(a):
return a.is_Number and a.is_negative
have_unevaluated_expr = False
if not USE_SYMENGINE:
if TYPE_CHECKING or USE_SYMENGINE:
def UnevaluatedExpr(x): # noqa: N802
return x
else:
try:
from sympy import UnevaluatedExpr
have_unevaluated_expr = True
except ImportError:
pass
if not have_unevaluated_expr:
def UnevaluatedExpr(x): # noqa
return x
def UnevaluatedExpr(x): # noqa: N802
return x
if USE_SYMENGINE:
def doit(expr):
return expr
def unevaluated_pow(a, b):
return sym.Pow(a, b)
else:
def doit(expr):
return expr.doit()
def unevaluated_pow(a, b):
return sym.Pow(a, b, evaluate=False)
@@ -124,18 +170,18 @@ else:
class _DerivativeKiller(IdentityMapperBase):
def map_derivative(self, expr):
from pymbolic import var
return var("d_"+"_".join(expr.variables))(expr.child)
return var("d_{}".format("_".join(expr.variables)))(expr.child)
def map_substitution(self, expr):
return self.rec(expr.child)
def _get_assignments_in_maxima(assignments, prefix=""):
my_variable_names = set(six.iterkeys(assignments))
my_variable_names = set(assignments.keys())
written_assignments = set()
prefix_subst_dict = dict(
(vn, prefix+vn) for vn in my_variable_names)
prefix_subst_dict = {
vn: prefix+vn for vn in my_variable_names}
from pymbolic.maxima import MaximaStringifyMapper
mstr = MaximaStringifyMapper()
@@ -153,12 +199,12 @@ def _get_assignments_in_maxima(assignments, prefix=""):
if symb.name not in written_assignments:
write_assignment(symb.name)
result.append("%s%s : %s;" % (
result.append("{}{} : {};".format(
prefix, name, mstr(dkill(s2p(
assignments[name].subs(prefix_subst_dict))))))
written_assignments.add(name)
for name in six.iterkeys(assignments):
for name in assignments:
if name not in written_assignments:
write_assignment(name)
@@ -172,26 +218,26 @@ def checked_cse(exprs, symbols=None):
new_assignments, new_exprs = sym.cse(exprs, **kwargs)
max_old = _get_assignments_in_maxima(dict(
("old_expr%d" % i, expr)
for i, expr in enumerate(exprs)))
new_ass_dict = dict(
("new_expr%d" % i, expr)
for i, expr in enumerate(new_exprs))
max_old = _get_assignments_in_maxima({
f"old_expr{i}": expr
for i, expr in enumerate(exprs)})
new_ass_dict = {
f"new_expr{i}": expr
for i, expr in enumerate(new_exprs)}
for name, val in new_assignments:
new_ass_dict[name.name] = val
max_new = _get_assignments_in_maxima(new_ass_dict)
with open("check.mac", "w") as outf:
outf.write("ratprint:false;\n")
outf.write("%s\n\n" % max_old)
outf.write("%s\n" % max_new)
outf.write(f"{max_old}\n\n")
outf.write(f"{max_new}\n")
for i in range(len(exprs)):
outf.write("print(\"diff in expr %d:\n\");\n" % i)
outf.write("print(ratsimp(old_expr%d - new_expr%d));\n" % (i, i))
outf.write(f'print("diff in expr {i}:\n");\n')
outf.write(f"print(ratsimp(old_expr{i} - new_expr{i}));\n")
from subprocess import check_call
check_call(["maxima", "--very-quiet", "-r", "load(\"check.mac\");"])
check_call(["maxima", "--very-quiet", "-r", 'load("check.mac");'])
return new_assignments, new_exprs
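# For illustration: checked_cse is a debugging aid. It runs the backend's
# cse() and then writes a Maxima script, "check.mac", that symbolically
# checks that each old_expr_i - new_expr_i simplifies to zero. It assumes
# a "maxima" executable is available on PATH.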
@@ -208,8 +254,7 @@ def pymbolic_real_norm_2(x):
def make_sym_vector(name, components):
return sym.Matrix(
[sym.Symbol("%s%d" % (name, i)) for i in range(components)])
return sym.Matrix([sym.Symbol(f"{name}{i}") for i in range(components)])
def vector_xreplace(expr, from_vec, to_vec):
@@ -231,6 +276,64 @@ def find_power_of(base, prod):
return result[power]
@prim.expr_dataclass()
class SpatialConstant(prim.Variable):
"""A symbolic constant to represent a symbolic variable that is spatially constant.
For example the wave-number :math:`k` in the setting of a constant-coefficient
Helmholtz problem. For use in :attr:`sumpy.kernel.ExpressionKernel.expression`.
Any variable occurring there that is not a :class:`~sumpy.symbolic.SpatialConstant`
is assumed to have a spatial dependency.
.. autoattribute:: prefix
.. automethod:: as_sympy
.. automethod:: from_sympy
"""
prefix: ClassVar[str] = "_spatial_constant_"
"""Prefix used in code generation for variables of this type."""
def as_sympy(self) -> sp.Symbol:
"""Convert variable to a :mod:`sympy` expression."""
return sym.Symbol(f"{self.prefix}{self.name}")
@classmethod
def from_sympy(cls, expr: sp.Symbol) -> SpatialConstant:
"""Convert :mod:`sympy` expression to a constant."""
return cls(expr.name[len(cls.prefix):])
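# Illustrative sketch: a SpatialConstant survives the round trip through
# the sympy layer, so e.g. a Helmholtz wave number stays a single symbol
# rather than being treated as spatially varying.
#
#     >>> k = SpatialConstant("k")
#     >>> SpatialConstant.from_sympy(k.as_sympy()) == k
#     True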
class PymbolicToSympyMapper(PymbolicToSympyMapperBase):
def map_spatial_constant(self, expr):
return expr.as_sympy()
class SympyToPymbolicMapper(SympyToPymbolicMapperBase):
def map_Symbol(self, expr): # noqa: N802
if expr.name.startswith(SpatialConstant.prefix):
return SpatialConstant.from_sympy(expr)
return SympyToPymbolicMapperBase.map_Symbol(self, expr)
def map_Pow(self, expr): # noqa: N802
if expr.exp == -1:
return 1/self.rec(expr.base)
else:
return SympyToPymbolicMapperBase.map_Pow(self, expr)
def map_Mul(self, expr): # noqa: N802
num_args = []
den_args = []
for child in expr.args:
if (isinstance(child, Pow)
and isinstance(child.exp, Integer)
and child.exp < 0):
den_args.append(self.rec(child.base)**(-self.rec(child.exp)))
else:
num_args.append(self.rec(child))
return math.prod(num_args) / math.prod(den_args)
class PymbolicToSympyMapperWithSymbols(PymbolicToSympyMapper):
def map_variable(self, expr):
if expr.name == "I":
@@ -242,8 +345,58 @@ class PymbolicToSympyMapperWithSymbols(PymbolicToSympyMapper):
def map_subscript(self, expr):
if isinstance(expr.aggregate, prim.Variable) and isinstance(expr.index, int):
return sym.Symbol("%s%d" % (expr.aggregate.name, expr.index))
return sym.Symbol(f"{expr.aggregate.name}{expr.index}")
else:
self.raise_conversion_error(expr)
def map_call(self, expr):
if expr.function.name == "hankel_1":
args = [self.rec(param) for param in expr.parameters]
args.append(0)
return Hankel1(*args)
elif expr.function.name == "bessel_j":
args = [self.rec(param) for param in expr.parameters]
args.append(0)
return BesselJ(*args)
else:
return PymbolicToSympyMapper.map_call(self, expr)
import sympy
class _BesselOrHankel(sympy.Function):
"""A symbolic function for BesselJ or Hankel1 functions
that keeps track of how many derivatives of the function have been taken.
Arguments are ``(order, z, nderivs)``.
"""
nargs = (3,)
def fdiff(self, argindex=1):
if argindex in (1, 3):
# we do not differentiate w.r.t. order or nderivs
raise ValueError(f"invalid argindex: {argindex}")
order, z, nderivs = self.args
return self.func(order, z, nderivs+1)
class BesselJ(_BesselOrHankel):
pass
class Hankel1(_BesselOrHankel):
pass
_SympyBesselJ = BesselJ
_SympyHankel1 = Hankel1
if not TYPE_CHECKING and USE_SYMENGINE:
def BesselJ(*args): # noqa: N802 # pylint: disable=function-redefined
return sym.sympify(_SympyBesselJ(*args))
def Hankel1(*args): # noqa: N802 # pylint: disable=function-redefined
return sym.sympify(_SympyHankel1(*args))
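# Illustrative sketch (assuming the sympy backend): differentiating only
# bumps the nderivs counter; the actual Bessel/Hankel derivative formulas
# are substituted later by the code generator.
#
#     >>> z = Symbol("z")
#     >>> Hankel1(0, z, 0).diff(z)
#     Hankel1(0, z, 1)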
# vim: fdm=marker
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = """
Copyright (C) 2012 Andreas Kloeckner
Copyright (C) 2018 Alexandru Fikl
Copyright (C) 2020 Isuru Fernando
"""
__license__ = """
@@ -25,39 +27,127 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import six
from six.moves import range, zip
from pytools import memoize_method, memoize_in
import enum
import logging
import warnings
from abc import ABC, abstractmethod
from collections.abc import Hashable, Sequence
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
import numpy as np
import sumpy.symbolic as sym
import loopy as lp
import pyopencl as cl
import pyopencl.array # noqa
from pymbolic.mapper import WalkMapper
from pytools import memoize_method
from pytools.tag import Tag, tag_dataclass
import loopy as lp
from loopy.version import MOST_RECENT_LANGUAGE_VERSION
import sumpy.symbolic as sym
if TYPE_CHECKING:
import numpy
import pyopencl
from sumpy.kernel import Kernel
import logging
logger = logging.getLogger(__name__)
__doc__ = """
Tools
=====
.. autofunction:: to_complex_dtype
.. autofunction:: is_obj_array_like
.. autofunction:: vector_to_device
.. autofunction:: vector_from_device
.. autoclass:: OrderedSet
Multi-index Helpers
-------------------
.. autofunction:: add_mi
.. autofunction:: mi_factorial
.. autofunction:: mi_increment_axis
.. autofunction:: mi_set_axis
.. autofunction:: mi_power
Symbolic Helpers
----------------
.. autofunction:: add_to_sac
.. autofunction:: gather_arguments
.. autofunction:: gather_source_arguments
.. autofunction:: gather_loopy_arguments
.. autofunction:: gather_loopy_source_arguments
.. autoclass:: ScalingAssignmentTag
.. autoclass:: KernelComputation
.. autoclass:: KernelCacheMixin
.. autofunction:: reduced_row_echelon_form
.. autofunction:: nullspace
FFT
---
.. autofunction:: fft
.. autofunction:: fft_toeplitz_upper_triangular
.. autofunction:: matvec_toeplitz_upper_triangular
.. autoclass:: FFTBackend
:members:
.. autofunction:: loopy_fft
.. autofunction:: get_opencl_fft_app
.. autofunction:: run_opencl_fft
Profiling
---------
.. autofunction:: get_native_event
.. autoclass:: ProfileGetter
.. autoclass:: AggregateProfilingEvent
.. autoclass:: MarkerBasedProfilingEvent
"""
# {{{ multi_index helpers
def add_mi(mi1, mi2):
return tuple(mi1i+mi2i for mi1i, mi2i in zip(mi1, mi2))
def add_mi(mi1: Sequence[int], mi2: Sequence[int]) -> tuple[int, ...]:
# NOTE: these are used a lot and `tuple([])` is faster
return tuple([mi1i + mi2i for mi1i, mi2i in zip(mi1, mi2, strict=True)]) # noqa: C409
def mi_factorial(mi):
from pytools import factorial
def mi_factorial(mi: Sequence[int]) -> int:
import math
result = 1
for mi_i in mi:
result *= factorial(mi_i)
result *= math.factorial(mi_i)
return result
def mi_power(vector, mi, evaluate=True):
def mi_increment_axis(
mi: Sequence[int], axis: int, increment: int
) -> tuple[int, ...]:
new_mi = list(mi)
new_mi[axis] += increment
return tuple(new_mi)
def mi_set_axis(mi: Sequence[int], axis: int, value: int) -> tuple[int, ...]:
new_mi = list(mi)
new_mi[axis] = value
return tuple(new_mi)
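# Illustrative sketch of the multi-index helpers:
#
#     >>> add_mi((1, 0, 2), (0, 1, 1))
#     (1, 1, 3)
#     >>> mi_factorial((2, 3))  # 2! * 3!
#     12
#     >>> mi_increment_axis((1, 0), axis=1, increment=2)
#     (1, 2)
#     >>> mi_set_axis((1, 0), axis=0, value=5)
#     (5, 0)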
def mi_power(
vector: Sequence[Any], mi: Sequence[int],
evaluate: bool = True) -> Any:
result = 1
for mi_i, vec_i in zip(mi, vector):
for mi_i, vec_i in zip(mi, vector, strict=True):
if mi_i == 1:
result *= vec_i
elif evaluate:
@@ -67,91 +157,35 @@ def mi_power(vector, mi, evaluate=True):
return result
class MiDerivativeTaker(object):
def __init__(self, expr, var_list):
assert isinstance(expr, sym.Basic)
self.var_list = var_list
empty_mi = (0,) * len(var_list)
self.cache_by_mi = {empty_mi: expr}
def mi_dist(self, a, b):
return np.array(a, dtype=int) - np.array(b, dtype=int)
def diff(self, mi):
try:
expr = self.cache_by_mi[mi]
except KeyError:
current_mi = self.get_closest_cached_mi(mi)
expr = self.cache_by_mi[current_mi]
for next_deriv, next_mi in self.get_derivative_taking_sequence(
current_mi, mi):
expr = expr.diff(next_deriv)
self.cache_by_mi[next_mi] = expr
def add_to_sac(sac, expr):
if sac is None:
return expr
from numbers import Number
if isinstance(expr, Number | sym.Number | sym.Symbol):
return expr
def get_derivative_taking_sequence(self, start_mi, end_mi):
current_mi = np.array(start_mi, dtype=int)
for idx, (mi_i, vec_i) in enumerate(
zip(self.mi_dist(end_mi, start_mi), self.var_list)):
for i in range(1, 1 + mi_i):
current_mi[idx] += 1
yield vec_i, tuple(current_mi)
name = sac.assign_temp("temp", expr)
return sym.Symbol(name)
def get_closest_cached_mi(self, mi):
return min((other_mi
for other_mi in self.cache_by_mi.keys()
if (np.array(mi) >= np.array(other_mi)).all()),
key=lambda other_mi: sum(self.mi_dist(mi, other_mi)))
# }}}
class LinearRecurrenceBasedMiDerivativeTaker(MiDerivativeTaker):
"""
The derivative taker for expansions that use
:class:`sumpy.expansion.LinearRecurrenceBasedDerivativeWrangler`
"""
def __init__(self, expr, var_list, wrangler):
super(LinearRecurrenceBasedMiDerivativeTaker, self).__init__(
expr, var_list)
self.wrangler = wrangler
# {{{ get variables
@memoize_method
def diff(self, mi):
"""
:arg mi: a multi-index (tuple) indicating how many x/y derivatives are
to be taken.
"""
try:
expr = self.cache_by_mi[mi]
except KeyError:
from six import iteritems
from sumpy.symbolic import Add
closest_mi = self.get_closest_cached_mi(mi)
expr = self.cache_by_mi[closest_mi]
# Try to reduce the derivative using recurrences first, and if that
# fails fall back to derivative taking.
for next_deriv, next_mi in (
self.get_derivative_taking_sequence(closest_mi, mi)):
recurrence = (
self.wrangler.try_get_recurrence_for_derivative(
next_mi, self.cache_by_mi))
if recurrence is not None:
expr = Add(*tuple(
coeff * self.cache_by_mi[ident]
for ident, coeff in iteritems(recurrence)))
else:
expr = expr.diff(next_deriv)
class GatherAllVariables(WalkMapper):
def __init__(self):
self.vars = set()
self.cache_by_mi[next_mi] = expr
def map_variable(self, expr):
self.vars.add(expr)
return expr
def get_all_variables(expr):
mapper = GatherAllVariables()
mapper(expr)
return mapper.vars
# }}}
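# Illustrative sketch: GatherAllVariables walks a pymbolic expression and
# collects every Variable node it encounters.
#
#     >>> from pymbolic import parse
#     >>> sorted(v.name for v in get_all_variables(parse("a*x + b")))
#     ['a', 'b', 'x']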
@@ -160,7 +194,7 @@ def build_matrix(op, dtype=None, shape=None):
dtype = dtype or op.dtype
from pytools import ProgressBar
shape = shape or op.shape
rows, cols = shape
_rows, cols = shape
pb = ProgressBar("matrix", cols)
mat = np.zeros(shape, dtype)
@@ -181,48 +215,52 @@ def build_matrix(op, dtype=None, shape=None):
def vector_to_device(queue, vec):
from pytools.obj_array import with_object_array_or_scalar
from pyopencl.array import to_device
from pytools.obj_array import obj_array_vectorize
def to_dev(ary):
return to_device(queue, ary)
return with_object_array_or_scalar(to_dev, vec)
return obj_array_vectorize(to_dev, vec)
def vector_from_device(queue, vec):
from pytools.obj_array import with_object_array_or_scalar
from pytools.obj_array import obj_array_vectorize
def from_dev(ary):
from numbers import Number
if isinstance(ary, (np.number, Number)):
if isinstance(ary, np.number | Number):
# zero, most likely
return ary
return ary.get(queue=queue)
return with_object_array_or_scalar(from_dev, vec)
return obj_array_vectorize(from_dev, vec)
def _merge_kernel_arguments(dictionary, arg):
# Check for strict equality until there's a usecase
if dictionary.setdefault(arg.name, arg) != arg:
msg = "Merging two different kernel arguments {} and {} with the same name"
raise ValueError(msg.format(arg.loopy_arg, dictionary[arg].loopy_arg))
def gather_arguments(kernel_likes):
result = {}
for knl in kernel_likes:
for arg in knl.get_args():
result[arg.name] = arg
# FIXME: possibly check that arguments match before overwriting
_merge_kernel_arguments(result, arg)
return sorted(six.itervalues(result), key=lambda arg: arg.name)
return sorted(result.values(), key=lambda arg: arg.name)
def gather_source_arguments(kernel_likes):
result = {}
for knl in kernel_likes:
for arg in knl.get_args() + knl.get_source_args():
result[arg.name] = arg
# FIXME: possibly check that arguments match before overwriting
_merge_kernel_arguments(result, arg)
return sorted(six.itervalues(result), key=lambda arg: arg.name)
return sorted(result.values(), key=lambda arg: arg.name)
def gather_loopy_arguments(kernel_likes):
@@ -235,16 +273,37 @@ def gather_loopy_source_arguments(kernel_likes):
# {{{ KernelComputation
class KernelComputation(object):
"""Common input processing for kernel computations."""
@tag_dataclass
class ScalingAssignmentTag(Tag):
pass
class KernelComputation(ABC):
"""Common input processing for kernel computations.
def __init__(self, ctx, kernels, strength_usage,
value_dtypes, name, options=[], device=None):
.. attribute:: name
.. attribute:: target_kernels
.. attribute:: source_kernels
.. attribute:: strength_usage
.. automethod:: get_kernel
"""
def __init__(self, ctx: Any,
target_kernels: list[Kernel],
source_kernels: list[Kernel],
strength_usage: list[int] | None = None,
value_dtypes: list[numpy.dtype[Any]] | None = None,
name: str | None = None,
device: Any | None = None) -> None:
"""
:arg kernels: list of :class:`sumpy.kernel.Kernel` instances
:class:`sumpy.kernel.TargetDerivative` wrappers should be
:arg target_kernels: list of :class:`~sumpy.kernel.Kernel` instances,
with :class:`sumpy.kernel.DirectionalTargetDerivative` as
the outermost kernel wrappers, if present.
:arg strength_usage: A list of integers indicating which expression
:arg source_kernels: list of :class:`~sumpy.kernel.Kernel` instances
with :class:`~sumpy.kernel.DirectionalSourceDerivative` as the
outermost kernel wrappers, if present.
:arg strength_usage: list of integers indicating which expression
uses which density. This implicitly specifies the
number of density arrays that need to be passed.
Default: all kernels use the same density.
@@ -254,14 +313,14 @@ class KernelComputation(object):
if value_dtypes is None:
value_dtypes = []
for knl in kernels:
for knl in target_kernels:
if knl.is_complex_valued:
value_dtypes.append(np.complex128)
value_dtypes.append(np.dtype(np.complex128))
else:
value_dtypes.append(np.float64)
value_dtypes.append(np.dtype(np.float64))
if not isinstance(value_dtypes, (list, tuple)):
value_dtypes = [np.dtype(value_dtypes)] * len(kernels)
if not isinstance(value_dtypes, list | tuple):
value_dtypes = [np.dtype(value_dtypes)] * len(target_kernels)
value_dtypes = [np.dtype(vd) for vd in value_dtypes]
# }}}
@@ -269,9 +328,9 @@ class KernelComputation(object):
# {{{ process strength_usage
if strength_usage is None:
strength_usage = [0] * len(kernels)
strength_usage = list(range(len(source_kernels)))
if len(kernels) != len(strength_usage):
if len(source_kernels) != len(strength_usage):
raise ValueError("exprs and strength_usage must have the same length")
strength_count = max(strength_usage)+1
@@ -283,286 +342,47 @@ class KernelComputation(object):
self.context = ctx
self.device = device
self.kernels = kernels
self.source_kernels = tuple(source_kernels)
self.target_kernels = tuple(target_kernels)
self.value_dtypes = value_dtypes
self.strength_usage = strength_usage
self.strength_count = strength_count
self.name = name or self.default_name
@property
@abstractmethod
def default_name(self):
pass
def get_kernel_scaling_assignments(self):
from sumpy.symbolic import SympyToPymbolicMapper
sympy_conv = SympyToPymbolicMapper()
import loopy as lp
return [
lp.Assignment(id=None,
assignee="knl_%d_scaling" % i,
lp.Assignment(id=f"knl_{i}_scaling",
assignee=f"knl_{i}_scaling",
expression=sympy_conv(kernel.get_global_scaling_const()),
temp_var_type=lp.Optional(dtype))
temp_var_type=lp.Optional(dtype),
tags=frozenset([ScalingAssignmentTag()]))
for i, (kernel, dtype) in enumerate(
zip(self.kernels, self.value_dtypes))]
# }}}
# {{{
def _to_host(x, queue=None):
if isinstance(x, cl.array.Array):
queue = queue or x.queue
return x.get(queue)
return x
class BlockIndexRanges(object):
"""Convenience class for working with blocks of a global array.
.. attribute:: indices
A list of (not necessarily consecutive or increasing) integers
representing the indices of a global array. The individual blocks are
delimited using :attr:`ranges`.
.. attribute:: ranges
A list of nondecreasing integers used to index into :attr:`indices`.
A block :math:`i` can be retrieved using
`indices[ranges[i]:ranges[i + 1]]`.
.. automethod:: block_shape
.. automethod:: get
.. automethod:: take
"""
def __init__(self, cl_context, indices, ranges):
self.cl_context = cl_context
self.indices = indices
self.ranges = ranges
@property
@memoize_method
def _ranges(self):
with cl.CommandQueue(self.cl_context) as queue:
return _to_host(self.ranges, queue=queue)
@property
def nblocks(self):
return self.ranges.shape[0] - 1
def block_shape(self, i):
return (self._ranges[i + 1] - self._ranges[i],)
def block_indices(self, i):
return self.indices[self._ranges[i]:self._ranges[i + 1]]
def get(self, queue=None):
return BlockIndexRanges(self.cl_context,
_to_host(self.indices, queue=queue),
_to_host(self.ranges, queue=queue))
def take(self, x, i):
"""Return the subset of a global array `x` that is defined by
the :attr:`indices` in block :math:`i`.
"""
return x[self.block_indices(i)]
class MatrixBlockIndexRanges(object):
"""Keep track of different ways to index into matrix blocks.
.. attribute:: row
A :class:`BlockIndexRanges` encapsulating row block indices.
.. attribute:: col
A :class:`BlockIndexRanges` encapsulating column block indices.
.. automethod:: block_shape
.. automethod:: block_take
.. automethod:: get
.. automethod:: take
"""
def __init__(self, cl_context, row, col):
self.cl_context = cl_context
self.row = row
self.col = col
assert self.row.nblocks == self.col.nblocks
self.blkranges = np.cumsum([0] + [
self.row.block_shape(i)[0] * self.col.block_shape(i)[0]
for i in range(self.row.nblocks)])
if isinstance(self.row.indices, cl.array.Array):
with cl.CommandQueue(self.cl_context) as queue:
self.blkranges = \
cl.array.to_device(queue, self.blkranges).with_queue(None)
@property
def nblocks(self):
return self.row.nblocks
def block_shape(self, i):
return self.row.block_shape(i) + self.col.block_shape(i)
def block_indices(self, i):
return (self.row.block_indices(i),
self.col.block_indices(i))
@property
def linear_row_indices(self):
r, _ = self._linear_indices()
return r
@property
def linear_col_indices(self):
_, c = self._linear_indices()
return c
@property
def linear_ranges(self):
return self.blkranges
def get(self, queue=None):
"""Transfer data to the host. Only the initial given data is
transfered, not the arrays returned by :meth:`linear_row_indices` and
friends.
:return: a copy of `self` in which all data lives on the host, i.e.
all :class:`pyopencl.array.Array` instances are replaces by
:class:`numpy.ndarray` instances.
"""
return MatrixBlockIndexRanges(self.cl_context,
row=self.row.get(queue=queue),
col=self.col.get(queue=queue))
def take(self, x, i):
"""Retrieve a block from a global matrix.
:arg x: a 2D :class:`numpy.ndarray`.
:arg i: block index.
:return: requested block from the matrix.
"""
if isinstance(self.row.indices, cl.array.Array) or \
isinstance(self.col.indices, cl.array.Array):
raise ValueError("CL `Array`s are not supported."
"Use MatrixBlockIndexRanges.get() and then view into matrices.")
irow, icol = self.block_indices(i)
return x[np.ix_(irow, icol)]
def block_take(self, x, i):
"""Retrieve a block from a linear representation of the matrix blocks.
A linear representation of the matrix blocks can be obtained, or
should be consistent with
.. code-block:: python
i = index.linear_row_indices()
j = index.linear_col_indices()
linear_blks = global_mat[i, j]
for k in range(index.nblocks):
assert np.allclose(index.block_take(linear_blks, k),
index.take(global_mat, k))
:arg x: a 1D :class:`numpy.ndarray`.
:arg i: block index.
:return: requested block, reshaped into a 2D array.
"""
iblk = np.s_[self.blkranges[i]:self.blkranges[i + 1]]
return x[iblk].reshape(*self.block_shape(i))
@memoize_method
def _linear_indices(self):
"""
:return: a tuple of `(rowindices, colindices)` that can be
used to provide linear indexing into a set of matrix blocks. These
index arrays are just the concatenated Cartesian products of all
the block arrays described by :attr:`row` and :attr:`col`.
They can be used to index directly into a matrix as follows:
.. code-block:: python
mat[rowindices[blkranges[i]:blkranges[i + 1]],
colindices[blkranges[i]:blkranges[i + 1]]]
The same block can be obtained more easily using
zip(self.target_kernels, self.value_dtypes, strict=True))]
.. code-block:: python
index.view(mat, i).reshape(-1)
"""
@memoize_in(self, "block_index_knl")
def _build_index():
loopy_knl = lp.make_kernel([
"{[irange]: 0 <= irange < nranges}",
"{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
],
"""
for irange
<> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
<> nsrcblock = srcranges[irange + 1] - srcranges[irange]
for itgt, isrc
<> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
{id_prefix=write_index}
colindices[imat] = srcindices[srcranges[irange] + isrc] \
{id_prefix=write_index}
end
end
""",
[
lp.GlobalArg('blkranges', None, shape="nranges + 1"),
lp.GlobalArg("rowindices", None, shape="nresults"),
lp.GlobalArg("colindices", None, shape="nresults"),
lp.ValueArg("nresults", None),
'...'
],
name="block_index_knl",
default_offset=lp.auto,
assumptions='nranges>=1',
silenced_warnings="write_race(write_index*)",
lang_version=MOST_RECENT_LANGUAGE_VERSION)
loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
return loopy_knl
with cl.CommandQueue(self.cl_context) as queue:
_, (rowindices, colindices) = _build_index()(queue,
tgtindices=self.row.indices,
srcindices=self.col.indices,
tgtranges=self.row.ranges,
srcranges=self.col.ranges,
blkranges=self.blkranges,
nresults=_to_host(self.blkranges[-1], queue=queue))
return (rowindices.with_queue(None),
colindices.with_queue(None))
@abstractmethod
def get_kernel(self):
pass
# }}}
# {{{ OrderedSet
# Source: http://code.activestate.com/recipes/576694-orderedset/
# Source: https://code.activestate.com/recipes/576694-orderedset/
# Author: Raymond Hettinger
# License: MIT
try:
from collections.abc import MutableSet
except ImportError:
from collections import MutableSet
from collections.abc import MutableSet
class OrderedSet(MutableSet):
@@ -608,15 +428,15 @@ class OrderedSet(MutableSet):
def pop(self, last=True):
if not self:
raise KeyError('set is empty')
raise KeyError("set is empty")
key = self.end[1][0] if last else self.end[2][0]
self.discard(key)
return key
def __repr__(self):
if not self:
return '%s()' % (self.__class__.__name__,)
return '%s(%r)' % (self.__class__.__name__, list(self))
return f"{self.__class__.__name__}()"
return f"{self.__class__.__name__}({list(self)!r})"
def __eq__(self, other):
if isinstance(other, OrderedSet):
@@ -626,33 +446,50 @@ class OrderedSet(MutableSet):
# }}}
class KernelCacheWrapper(object):
class KernelCacheMixin(ABC):
context: cl.Context
name: str
@abstractmethod
def get_cache_key(self) -> tuple[Hashable, ...]:
...
@abstractmethod
def get_kernel(self, **kwargs: Any) -> lp.TranslationUnit:
...
@abstractmethod
def get_optimized_kernel(self, **kwargs: Any) -> lp.TranslationUnit:
...
@memoize_method
def get_cached_optimized_kernel(self, **kwargs):
from sumpy import code_cache, CACHING_ENABLED, OPT_ENABLED
def get_cached_kernel_executor(self, **kwargs) -> lp.ExecutorBase:
from sumpy import CACHING_ENABLED, NO_CACHE_KERNELS, OPT_ENABLED, code_cache
if CACHING_ENABLED:
if CACHING_ENABLED and not (
NO_CACHE_KERNELS and self.name in NO_CACHE_KERNELS):
import loopy.version
from sumpy.version import KERNEL_VERSION
cache_key = (
self.get_cache_key()
+ tuple(sorted(six.iteritems(kwargs)))
+ tuple(sorted(kwargs.items()))
+ (loopy.version.DATA_MODEL_VERSION,)
+ (KERNEL_VERSION,)
+ (OPT_ENABLED,))
try:
result = code_cache[cache_key]
logger.debug("%s: kernel cache hit [key=%s]" % (
self.name, cache_key))
return result
logger.debug("%s: kernel cache hit [key=%s]", self.name, cache_key)
return result.executor(self.context)
except KeyError:
pass
logger.info("%s: kernel cache miss" % self.name)
if CACHING_ENABLED:
logger.info("%s: kernel cache miss [key=%s]" % (
self.name, cache_key))
logger.info("%s: kernel cache miss", self.name)
if CACHING_ENABLED and not (
NO_CACHE_KERNELS and self.name in NO_CACHE_KERNELS):
logger.info("%s: kernel cache miss [key=%s]",
self.name, cache_key)
from pytools import MinRecursionLimit
with MinRecursionLimit(3000):
@@ -661,55 +498,561 @@ class KernelCacheWrapper(object):
else:
knl = self.get_kernel()
if CACHING_ENABLED:
if CACHING_ENABLED and not (
NO_CACHE_KERNELS and self.name in NO_CACHE_KERNELS):
code_cache.store_if_not_present(cache_key, knl)
return knl
return knl.executor(self.context)
@staticmethod
def _allow_redundant_execution_of_knl_scaling(knl):
from loopy.match import ObjTagged
return lp.add_inames_for_unused_hw_axes(
knl, within=ObjTagged(ScalingAssignmentTag()))
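# For illustration: concrete computations (e.g. sumpy.p2p.P2P) implement
# get_cache_key/get_kernel/get_optimized_kernel; their __call__ then goes
# through get_cached_kernel_executor(), so the expensive loopy compile
# happens at most once per cache key (which folds in the loopy data model
# version, the sumpy kernel version, and whether optimization is enabled).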
def my_syntactic_subs(expr, subst_dict):
# Workaround for differing substitution semantics between sympy and symengine.
# FIXME: This is a hack.
from sumpy.symbolic import Basic, Subs, Derivative
if not isinstance(expr, Basic):
return expr
KernelCacheWrapper = KernelCacheMixin
elif expr.is_Symbol:
return subst_dict.get(expr, expr)
elif isinstance(expr, Subs):
new_point = tuple(my_syntactic_subs(p, subst_dict) for p in expr.point)
def is_obj_array_like(ary):
return (
isinstance(ary, tuple | list)
or (isinstance(ary, np.ndarray) and ary.dtype.char == "O"))
import six
new_subst_dict = dict(
(var, subs) for var, subs in six.iteritems(subst_dict)
if var not in expr.variables)
new_expr = my_syntactic_subs(expr.expr, new_subst_dict)
# {{{ matrices
if new_point != expr.point or new_expr != expr.expr:
return Subs(new_expr, expr.variables, new_point)
def reduced_row_echelon_form(m, atol=0):
"""Calculates a reduced row echelon form of a
matrix `m`.
return expr
:arg m: a 2D :class:`numpy.ndarray` or a list of lists or a sympy Matrix
:arg atol: absolute tolerance for values to be considered zero
:return: reduced row echelon form as a 2D :class:`numpy.ndarray`
and a list of pivots
"""
elif isinstance(expr, Derivative):
new_expr = my_syntactic_subs(expr.expr, subst_dict)
new_variables = my_syntactic_subs(expr.variables, subst_dict)
mat = np.array(m, dtype=object)
index = 0
nrows = mat.shape[0]
ncols = mat.shape[1]
pivot_cols = []
for i in range(ncols):
if index == nrows:
break
pivot = nrows
for k in range(index, nrows):
symbolic = isinstance(mat[k, i], sym.Basic) and not mat[k, i].is_number
if (symbolic or abs(mat[k, i]) > atol) and pivot == nrows:
pivot = k
# If there's a pivot that's close to 1 use that as it avoids
# having to divide.
# When checking for a number close to 1, we shouldn't consider
# symbolic values
if not symbolic and abs(mat[k, i] - 1) <= atol:
pivot = k
break
if pivot == nrows:
# no nonzero pivot found, next column
continue
if pivot != index:
mat[[pivot, index], :] = mat[[index, pivot], :]
pivot_cols.append(i)
scale = mat[index, i]
if isinstance(scale, int | sym.Integer):
scale = int(scale)
for j in range(mat.shape[1]):
elem = mat[index, j]
if isinstance(scale, int) and isinstance(elem, int | sym.Integer):
quo = int(elem) // scale
if quo * scale == elem:
mat[index, j] = quo
continue
mat[index, j] = sym.sympify(elem)/scale
for j in range(nrows):
if (j == index):
continue
scale = mat[j, i]
if scale != 0:
mat[j, :] = mat[j, :] - mat[index, :]*scale
index = index + 1
return mat, pivot_cols
def nullspace(m, atol=0):
"""Calculates the nullspace of a matrix `m`.
:arg m: a 2D :class:`numpy.ndarray` or a list of lists or a sympy Matrix
:arg atol: absolute tolerance for values to be considered zero
:return: nullspace of `m` as a 2D :class:`numpy.ndarray`
"""
mat, pivot_cols = reduced_row_echelon_form(m, atol=atol)
pivot_cols = list(pivot_cols)
cols = mat.shape[1]
free_vars = [i for i in range(cols) if i not in pivot_cols]
n = []
for free_var in free_vars:
vec = [0]*cols
vec[free_var] = 1
for piv_row, piv_col in enumerate(pivot_cols):
for pos in pivot_cols[piv_row+1:] + [free_var]:
if isinstance(mat[piv_row, pos], sym.Integer):
vec[piv_col] -= int(mat[piv_row, pos])
else:
vec[piv_col] -= mat[piv_row, pos]
n.append(vec)
return np.array(n, dtype=object).T
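# Illustrative sketch: the rows of [[1, 2], [2, 4]] are linearly
# dependent, so the nullspace is spanned by (-2, 1).
#
#     >>> ns = nullspace([[1, 2], [2, 4]])
#     >>> list(ns[:, 0])
#     [-2, 1]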
if new_expr != expr.expr or any(new_var != var for new_var, var in
zip(new_variables, expr.variables)):
return Derivative(new_expr, *new_variables)
# }}}
return expr
# {{{ FFT
def fft(seq, inverse=False, sac=None):
"""
Return the discrete Fourier transform of the sequence *seq*.
*seq* should be a Python iterable with tuples of length 2
corresponding to the real and imaginary parts.
"""
from pymbolic.algorithm import fft as _fft, ifft as _ifft
def wrap(level, expr):
if isinstance(expr, np.ndarray):
res = [wrap(level, a) for a in expr]
return np.array(res, dtype=object).reshape(expr.shape)
return add_to_sac(sac, expr)
if inverse:
return _ifft(np.array(seq), wrap_intermediate_with_level=wrap,
complex_dtype=np.complex128).tolist()
else:
new_args = tuple(my_syntactic_subs(arg, subst_dict) for arg in expr.args)
return _fft(np.array(seq), wrap_intermediate_with_level=wrap,
complex_dtype=np.complex128).tolist()
if any(new_arg != arg for arg, new_arg in zip(expr.args, new_args)):
return expr.func(*new_args)
return expr
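# For illustration: fft() mirrors pymbolic.algorithm.fft, but routes
# every intermediate value through the assignment collection "sac" so
# that shared subexpressions of a symbolic input sequence are named once;
# with sac=None the intermediates are left inline.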
def fft_toeplitz_upper_triangular(first_row, x, sac=None):
"""
Returns the matvec of the Toeplitz matrix given by
the first row and the vector x using a Fourier transform
"""
assert len(first_row) == len(x)
n = len(first_row)
v = list(first_row)
v += [0]*(n-1)
x = list(reversed(x))
x += [0]*(n-1)
v_fft = fft(v, sac=sac)
x_fft = fft(x, sac=sac)
res_fft = [add_to_sac(sac, a * b) for a, b in zip(v_fft, x_fft, strict=True)]
res = fft(res_fft, inverse=True, sac=sac)
return list(reversed(res[:n]))
def matvec_toeplitz_upper_triangular(first_row, vector):
n = len(first_row)
assert len(vector) == n
output = [0]*n
for row in range(n):
terms = tuple(first_row[col-row]*vector[col] for col in range(row, n))
output[row] = sym.Add(*terms)
return output
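# Illustrative sketch: for first_row = (a0, a1, a2) the implied
# upper-triangular Toeplitz matrix is
#
#     [a0 a1 a2]
#     [ 0 a0 a1]
#     [ 0  0 a0]
#
# so the last entry of matvec_toeplitz_upper_triangular((a0, a1, a2),
# (x0, x1, x2)) is a0*x2. fft_toeplitz_upper_triangular computes the
# same product via circular convolution in the Fourier domain.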
to_complex_type_dict = {
np.complex64: np.complex64,
np.complex128: np.complex128,
np.float32: np.complex64,
np.float64: np.complex128,
}
def to_complex_dtype(dtype):
np_type = np.dtype(dtype).type
try:
return to_complex_type_dict[np_type]
except KeyError as err:
raise RuntimeError(f"Unknown dtype: {dtype}") from err
@dataclass(frozen=True)
class ProfileGetter:
start: int
end: int
def get_native_event(evt):
from pyopencl import Event
return evt if isinstance(evt, Event) else evt.native_event
class AggregateProfilingEvent:
"""An object to hold a list of events and provides compatibility
with some of the functionality of :class:`pyopencl.Event`.
Assumes that the last event waits on all of the previous events.
"""
def __init__(self, events):
self.events = events[:]
self.native_event = get_native_event(events[-1])
@property
def profile(self):
total = sum(evt.profile.end - evt.profile.start for evt in self.events)
end = self.native_event.profile.end
return ProfileGetter(start=end - total, end=end)
def wait(self):
return self.native_event.wait()
class MarkerBasedProfilingEvent:
"""An object to hold two marker events and provides compatibility
with some of the functionality of :class:`pyopencl.Event`.
"""
def __init__(self, *, end_event, start_event):
self.native_event = end_event
self.start_event = start_event
@property
def profile(self):
return ProfileGetter(start=self.start_event.profile.start,
end=self.native_event.profile.end)
def wait(self):
return self.native_event.wait()
def loopy_fft(shape, inverse, complex_dtype, index_dtype=None,
name=None):
from math import pi
from pymbolic import var
from pymbolic.algorithm import find_factors
sign = 1 if not inverse else -1
n = shape[-1]
m = n
factors = []
while m != 1:
N1, m = find_factors(m) # noqa: N806
factors.append(N1)
nfft = n
broadcast_dims = tuple(var(f"j{d}") for d in range(len(shape) - 1))
domains = [
"{[i]: 0<=i<n}",
"{[i2]: 0<=i2<n}",
]
domains += [f"{{[j{d}]: 0<=j{d}<{shape[d]} }}" for d in range(len(shape) - 1)]
x = var("x")
y = var("y")
i = var("i")
i2 = var("i2")
i3 = var("i3")
fixed_parameters = {"const": complex_dtype(sign*(-2j)*pi/n), "n": n}
index = (*broadcast_dims, i2)
insns = [
"exp_table[i] = exp(const * i) {id=exp_table}",
lp.Assignment(
assignee=x[index],
expression=y[index],
id="copy",
happens_after=frozenset(["exp_table"]),
),
]
for ilev, N1 in enumerate(list(reversed(factors))): # noqa: N806
nfft //= N1
N2 = n // (nfft * N1) # noqa: N806
init_happens_after = "copy" if ilev == 0 else f"update_{ilev-1}"
temp = var("temp")
exp_table = var("exp_table")
i = var(f"i_{ilev}")
i2 = var(f"i2_{ilev}")
ifft = var(f"ifft_{ilev}")
iN1 = var(f"iN1_{ilev}") # noqa: N806
iN1_sum = var(f"iN1_sum_{ilev}") # noqa: N806
iN2 = var(f"iN2_{ilev}") # noqa: N806
table_idx = var(f"table_idx_{ilev}")
exp = var(f"exp_{ilev}")
i_bcast = (*broadcast_dims, i)
i2_bcast = (*broadcast_dims, i2)
iN_bcast = (*broadcast_dims, ifft + nfft * (iN1 * N2 + iN2)) # noqa: N806
insns += [
lp.Assignment(
assignee=temp[i],
expression=x[i_bcast],
id=f"copy_{ilev}",
happens_after=frozenset([init_happens_after]),
),
lp.Assignment(
assignee=x[i2_bcast],
expression=0,
id=f"reset_{ilev}",
happens_after=frozenset([f"copy_{ilev}"])),
lp.Assignment(
assignee=table_idx,
expression=nfft*iN1_sum*(iN2 + N2*iN1),
id=f"idx_{ilev}",
happens_after=frozenset([f"reset_{ilev}"]),
temp_var_type=lp.Optional(np.uint32)),
lp.Assignment(
assignee=exp,
expression=exp_table[table_idx % n],
id=f"exp_{ilev}",
happens_after=frozenset([f"idx_{ilev}"]),
within_inames=frozenset({x.name for x in
[*broadcast_dims, iN1_sum, iN1, iN2]}),
temp_var_type=lp.Optional(complex_dtype)),
lp.Assignment(
assignee=x[iN_bcast],
expression=(x[iN_bcast]
+ exp * temp[ifft + nfft * (iN2*N1 + iN1_sum)]),
id=f"update_{ilev}",
happens_after=frozenset([f"exp_{ilev}"])),
]
domains += [
f"[ifft_{ilev}]: 0<=ifft_{ilev}<{nfft}",
f"[iN1_{ilev}]: 0<=iN1_{ilev}<{N1}",
f"[iN1_sum_{ilev}]: 0<=iN1_sum_{ilev}<{N1}",
f"[iN2_{ilev}]: 0<=iN2_{ilev}<{N2}",
f"[i_{ilev}]: 0<=i_{ilev}<{n}",
f"[i2_{ilev}]: 0<=i2_{ilev}<{n}",
]
for idom, dom in enumerate(domains):
if not dom.startswith("{"):
domains[idom] = "{" + dom + "}"
kernel_data = [
lp.GlobalArg("x", shape=shape, is_input=False, is_output=True,
dtype=complex_dtype),
lp.GlobalArg("y", shape=shape, is_input=True, is_output=False,
dtype=complex_dtype),
lp.TemporaryVariable("exp_table", shape=(n,),
dtype=complex_dtype),
lp.TemporaryVariable("temp", shape=(n,),
dtype=complex_dtype),
...
]
if n == 1:
domains = domains[2:]
index = (*broadcast_dims, 0)
insns = [
lp.Assignment(
assignee=x[index],
expression=y[index],
),
]
kernel_data = kernel_data[:2]
elif inverse:
domains += ["{[i3]: 0<=i3<n}"]
index = (*broadcast_dims, i3)
insns += [
lp.Assignment(
assignee=x[index],
expression=x[index] / n,
happens_after=frozenset([f"update_{len(factors) - 1}"]),
),
]
if name is None:
name = f"ifft_{n}" if inverse else f"fft_{n}"
knl = lp.make_kernel(
domains, insns,
kernel_data=kernel_data,
name=name,
fixed_parameters=fixed_parameters,
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
index_dtype=index_dtype,
)
if broadcast_dims:
knl = lp.split_iname(knl, "j0", 32, inner_tag="l.0", outer_tag="g.0")
knl = lp.add_inames_for_unused_hw_axes(knl)
return knl
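# For illustration: the kernel built above is a mixed-radix Cooley-Tukey
# FFT. "factors" is the radix sequence obtained from
# pymbolic.algorithm.find_factors, and each level recombines nfft
# sub-transforms using twiddle factors read from the precomputed exp_table.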
class FFTBackend(enum.Enum):
#: FFT backend based on the vkFFT library.
pyvkfft = 1
#: FFT backend based on :mod:`loopy` used as a fallback.
loopy = 2
def _get_fft_backend(queue: pyopencl.CommandQueue) -> FFTBackend:
import os
env_val = os.environ.get("SUMPY_FFT_BACKEND")
if env_val:
if env_val not in ["loopy", "pyvkfft"]:
raise ValueError("Expected 'loopy' or 'pyvkfft' for SUMPY_FFT_BACKEND. "
f"Found {env_val}.")
return FFTBackend[env_val]
try:
import pyvkfft.opencl # noqa: F401
except ImportError:
warnings.warn("VkFFT not found. FFT runs will be slower.", stacklevel=3)
return FFTBackend.loopy
from pyopencl import command_queue_properties
if queue.properties & command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE:
warnings.warn(
"VkFFT does not support out of order queues yet. "
"Falling back to slower implementation.", stacklevel=3)
return FFTBackend.loopy
import platform
import sys
if (sys.platform == "darwin"
and platform.machine() == "x86_64"
and queue.context.devices[0].platform.name
== "Portable Computing Language"):
warnings.warn(
"PoCL miscompiles some VkFFT kernels. "
"See https://github.com/inducer/sumpy/issues/129. "
"Falling back to slower implementation.", stacklevel=3)
return FFTBackend.loopy
return FFTBackend.pyvkfft
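# For illustration, the backend can be forced from the environment before
# any FFT app is created ("driver.py" is a hypothetical script):
#
#     SUMPY_FFT_BACKEND=loopy python driver.py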
def get_opencl_fft_app(
queue: pyopencl.CommandQueue,
shape: tuple[int, ...],
dtype: numpy.dtype[Any],
inverse: bool) -> Any:
"""Setup an object for out-of-place FFT on with given shape and dtype
on given queue.
"""
assert dtype.type in (np.float32, np.float64, np.complex64,
np.complex128)
backend = _get_fft_backend(queue)
if backend == FFTBackend.loopy:
return loopy_fft(shape, inverse=inverse, complex_dtype=dtype.type), backend
elif backend == FFTBackend.pyvkfft:
from pyvkfft.opencl import VkFFTApp
app = VkFFTApp(shape=shape, dtype=dtype, queue=queue, ndim=1, inplace=False)
return app, backend
else:
raise RuntimeError(f"Unsupported FFT backend {backend}")
def run_opencl_fft(
fft_app: tuple[Any, FFTBackend],
queue: pyopencl.CommandQueue,
input_vec: Any,
inverse: bool = False,
wait_for: list[pyopencl.Event] | None = None
) -> tuple[pyopencl.Event, Any]:
"""Runs an FFT on input_vec and returns a :class:`MarkerBasedProfilingEvent`
that indicate the end and start of the operations carried out and the output
vector.
Only supports in-order queues.
"""
app, backend = fft_app
if backend == FFTBackend.loopy:
evt, (output_vec,) = app(queue, y=input_vec, wait_for=wait_for)
return (evt, output_vec)
elif backend == FFTBackend.pyvkfft:
if wait_for is None:
wait_for = []
import pyopencl as cl
import pyopencl.array as cla
if queue.device.platform.name == "NVIDIA CUDA":
# NVIDIA OpenCL gives wrong event profile values with wait_for.
# Not passing wait_for waits for all previously enqueued events,
# so correctness is preserved as long as it is the same queue.
for evt in wait_for:
if evt.command_queue != queue:
raise RuntimeError(
"Different queues not supported with NVIDIA CUDA")
start_evt = cl.enqueue_marker(queue)
else:
start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:])
if app.inplace:
raise RuntimeError("inplace fft is not supported")
else:
output_vec = cla.empty_like(input_vec, queue)
# FIXME: use the public API once
# https://github.com/vincefn/pyvkfft/pull/17 is in
from pyvkfft.opencl import _vkfft_opencl
if inverse: # noqa: SIM108
meth = _vkfft_opencl.ifft
else:
meth = _vkfft_opencl.fft
meth(app.app, int(input_vec.data.int_ptr),
int(output_vec.data.int_ptr), int(queue.int_ptr))
if queue.device.platform.name == "NVIDIA CUDA":
end_evt = cl.enqueue_marker(queue)
else:
end_evt = cl.enqueue_marker(queue, wait_for=[start_evt])
output_vec.add_event(end_evt)
return (MarkerBasedProfilingEvent(end_event=end_evt, start_event=start_evt),
output_vec)
else:
raise RuntimeError(f"Unsupported FFT backend {backend}")
# }}}
# {{{ deprecations
_depr_name_to_replacement_and_obj = {
"KernelCacheWrapper": ("KernelCacheMixin", 2023),
}
def __getattr__(name):
replacement_and_obj = _depr_name_to_replacement_and_obj.get(name)
if replacement_and_obj is not None:
replacement, obj, year = replacement_and_obj
from warnings import warn
warn(f"'sumpy.tools.{name}' is deprecated. "
f"Use '{replacement}' instead. "
f"'sumpy.tools.{name}' will continue to work until {year}.",
DeprecationWarning, stacklevel=2)
return obj
else:
raise AttributeError(name)
# }}}
# vim: fdm=marker
from __future__ import division, absolute_import
from __future__ import annotations
from sumpy.expansion.m2l import M2LTranslationClassFactoryBase
__copyright__ = """
Copyright (C) 2017 Andreas Kloeckner
@@ -25,18 +28,31 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import six # noqa: F401
from six.moves import range # noqa: F401
from collections.abc import Sequence
from functools import partial
from numbers import Number
from typing import TYPE_CHECKING
from pytools import memoize_method
from numbers import Number
from sumpy.kernel import TargetDerivativeRemover
import numpy as np # noqa: F401
from sumpy.kernel import TargetTransformationRemover
if TYPE_CHECKING:
import pyopencl
from sumpy.kernel import Kernel
from sumpy.visualization import FieldPlotter
import logging
import numpy as np
import loopy as lp # noqa: F401
import pyopencl as cl
import pyopencl.array
import logging
logger = logging.getLogger(__name__)
__doc__ = """
@@ -57,6 +73,7 @@ These functions manipulate these potentials:
.. autofunction:: combine_inner_outer
.. autofunction:: combine_halfspace
.. autofunction:: combine_halfspace_and_outer
.. autofunction:: l_inf
These functions help with plotting:
@@ -73,6 +90,7 @@ by users:
.. autoclass:: ExpansionPotentialSource
.. autoclass:: MultipoleExpansion
.. autoclass:: LocalExpansion
.. autoclass:: PotentialExpressionNode
.. autoclass:: Sum
.. autoclass:: Product
.. autoclass:: SchematicVisitor
@@ -82,24 +100,24 @@ by users:
# {{{ context
class ToyContext(object):
class ToyContext:
"""This class functions as a container for generated code and 'behind-the-scenes'
information.
.. automethod:: __init__
"""
def __init__(self, cl_context, kernel,
def __init__(self, cl_context: pyopencl.Context, kernel: Kernel,
mpole_expn_class=None,
local_expn_class=None,
expansion_factory=None,
extra_source_kwargs=None,
extra_kernel_kwargs=None):
extra_kernel_kwargs=None, m2l_use_fft=None):
self.cl_context = cl_context
self.queue = cl.CommandQueue(self.cl_context)
self.kernel = kernel
self.no_target_deriv_kernel = TargetDerivativeRemover()(kernel)
self.no_target_deriv_kernel = TargetTransformationRemover()(kernel)
if expansion_factory is None:
from sumpy.expansion import DefaultExpansionFactory
@@ -108,8 +126,25 @@ class ToyContext(object):
mpole_expn_class = \
expansion_factory.get_multipole_expansion_class(kernel)
if local_expn_class is None:
from sumpy.expansion.m2l import (
FFTM2LTranslationClassFactory,
NonFFTM2LTranslationClassFactory,
)
if m2l_use_fft:
m2l_translation_class_factory: M2LTranslationClassFactoryBase = \
FFTM2LTranslationClassFactory()
else:
m2l_translation_class_factory = NonFFTM2LTranslationClassFactory()
local_expn_class = \
expansion_factory.get_local_expansion_class(kernel)
m2l_translation_class = \
m2l_translation_class_factory.get_m2l_translation_class(
kernel, local_expn_class)
local_expn_class = partial(local_expn_class,
m2l_translation=m2l_translation_class())
elif m2l_use_fft is not None:
raise ValueError("local_expn_class and m2l_use_fft are both supplied. "
"Use only one of these arguments")
self.mpole_expn_class = mpole_expn_class
self.local_expn_class = local_expn_class
@@ -129,35 +164,35 @@ class ToyContext(object):
@memoize_method
def get_p2p(self):
from sumpy.p2p import P2P
return P2P(self.cl_context, [self.kernel], exclude_self=False)
return P2P(self.cl_context, (self.kernel,), exclude_self=False)
@memoize_method
def get_p2m(self, order):
from sumpy import P2EFromSingleBox
return P2EFromSingleBox(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, order),
[self.kernel])
kernels=(self.kernel,))
@memoize_method
def get_p2l(self, order):
from sumpy import P2EFromSingleBox
return P2EFromSingleBox(self.cl_context,
self.local_expn_class(self.no_target_deriv_kernel, order),
[self.kernel])
kernels=(self.kernel,))
@memoize_method
def get_m2p(self, order):
from sumpy import E2PFromSingleBox
return E2PFromSingleBox(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, order),
[self.kernel])
(self.kernel,))
@memoize_method
def get_l2p(self, order):
from sumpy import E2PFromSingleBox
return E2PFromSingleBox(self.cl_context,
self.local_expn_class(self.no_target_deriv_kernel, order),
[self.kernel])
(self.kernel,))
@memoize_method
def get_m2m(self, from_order, to_order):
@@ -166,10 +201,51 @@ class ToyContext(object):
self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
self.mpole_expn_class(self.no_target_deriv_kernel, to_order))
@memoize_method
def use_translation_classes_dependent_data(self):
l_expn = self.local_expn_class(self.no_target_deriv_kernel, 2)
return l_expn.m2l_translation.use_preprocessing
@memoize_method
def use_fft(self):
l_expn = self.local_expn_class(self.no_target_deriv_kernel, 2)
return l_expn.m2l_translation.use_fft
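# For illustration: both probes above build a throwaway order-2 local
# expansion purely to inspect its m2l_translation flags; the expansion
# order is irrelevant for that purpose.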
@memoize_method
def get_m2l(self, from_order, to_order):
from sumpy import E2EFromCSR
return E2EFromCSR(self.cl_context,
from sumpy import E2EFromCSR, M2LUsingTranslationClassesDependentData
if self.use_translation_classes_dependent_data():
m2l_class = M2LUsingTranslationClassesDependentData
else:
m2l_class = E2EFromCSR
return m2l_class(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
self.local_expn_class(self.no_target_deriv_kernel, to_order))
@memoize_method
def get_m2l_translation_class_dependent_data_kernel(self, from_order, to_order):
from sumpy import M2LGenerateTranslationClassesDependentData
return M2LGenerateTranslationClassesDependentData(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
self.local_expn_class(self.no_target_deriv_kernel, to_order))
@memoize_method
def get_m2l_expansion_size(self, from_order, to_order):
m_expn = self.mpole_expn_class(self.no_target_deriv_kernel, from_order)
l_expn = self.local_expn_class(self.no_target_deriv_kernel, to_order)
return l_expn.m2l_translation.preprocess_multipole_nexprs(l_expn, m_expn)
@memoize_method
def get_m2l_preprocess_mpole_kernel(self, from_order, to_order):
from sumpy import M2LPreprocessMultipole
return M2LPreprocessMultipole(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
self.local_expn_class(self.no_target_deriv_kernel, to_order))
@memoize_method
def get_m2l_postprocess_local_kernel(self, from_order, to_order):
from sumpy import M2LPostprocessLocal
return M2LPostprocessLocal(self.cl_context,
self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
self.local_expn_class(self.no_target_deriv_kernel, to_order))
@@ -186,49 +262,59 @@ class ToyContext(object):
# {{{ helpers
def _p2e(psource, center, rscale, order, p2e, expn_class, expn_kwargs):
source_boxes = np.array([0], dtype=np.int32)
box_source_starts = np.array([0], dtype=np.int32)
box_source_counts_nonchild = np.array(
[psource.points.shape[-1]], dtype=np.int32)
toy_ctx = psource.toy_ctx
queue = toy_ctx.queue
source_boxes = cl.array.to_device(
queue, np.array([0], dtype=np.int32))
box_source_starts = cl.array.to_device(
queue, np.array([0], dtype=np.int32))
box_source_counts_nonchild = cl.array.to_device(
queue, np.array([psource.points.shape[-1]], dtype=np.int32))
center = np.asarray(center)
centers = np.array(center, dtype=np.float64).reshape(
toy_ctx.kernel.dim, 1)
centers = cl.array.to_device(
queue,
np.array(center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1))
evt, (coeffs,) = p2e(toy_ctx.queue,
_evt, (coeffs,) = p2e(toy_ctx.queue,
source_boxes=source_boxes,
box_source_starts=box_source_starts,
box_source_counts_nonchild=box_source_counts_nonchild,
centers=centers,
sources=psource.points,
strengths=psource.weights,
sources=cl.array.to_device(queue, psource.points),
strengths=(cl.array.to_device(queue, psource.weights),),
rscale=rscale,
nboxes=1,
tgt_base_ibox=0,
#flags="print_hl_cl",
out_host=True,
**toy_ctx.extra_source_and_kernel_kwargs)
return expn_class(toy_ctx, center, rscale, order, coeffs[0],
return expn_class(toy_ctx, center, rscale, order, coeffs[0].get(queue),
derived_from=psource, **expn_kwargs)
def _e2p(psource, targets, e2p):
toy_ctx = psource.toy_ctx
queue = toy_ctx.queue
ntargets = targets.shape[-1]
boxes = cl.array.to_device(
queue, np.array([0], dtype=np.int32))
box_target_starts = cl.array.to_device(
queue, np.array([0], dtype=np.int32))
box_target_counts_nonchild = cl.array.to_device(
queue, np.array([ntargets], dtype=np.int32))
boxes = np.array([0], dtype=np.int32)
centers = cl.array.to_device(
queue,
np.array(psource.center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1))
box_target_starts = np.array([0], dtype=np.int32)
box_target_counts_nonchild = np.array([ntargets], dtype=np.int32)
from pytools.obj_array import make_obj_array
toy_ctx = psource.toy_ctx
centers = np.array(psource.center, dtype=np.float64).reshape(
toy_ctx.kernel.dim, 1)
from sumpy.tools import vector_to_device
coeffs = np.array([psource.coeffs])
evt, (pot,) = e2p(
coeffs = cl.array.to_device(queue, np.array([psource.coeffs]))
_evt, (pot,) = e2p(
toy_ctx.queue,
src_expansions=coeffs,
src_base_ibox=0,
@@ -237,60 +323,169 @@ def _e2p(psource, targets, e2p):
box_target_counts_nonchild=box_target_counts_nonchild,
centers=centers,
rscale=psource.rscale,
targets=targets,
#flags="print_hl_cl",
out_host=True, **toy_ctx.extra_kernel_kwargs)
targets=vector_to_device(queue, make_obj_array(targets)),
**toy_ctx.extra_kernel_kwargs)
return pot
return pot.get(queue)
def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs):
def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs,
extra_kernel_kwargs):
toy_ctx = psource.toy_ctx
target_boxes = np.array([1], dtype=np.int32)
src_box_starts = np.array([0, 1], dtype=np.int32)
src_box_lists = np.array([0], dtype=np.int32)
centers = (np.array(
queue = toy_ctx.queue
target_boxes = cl.array.to_device(
queue, np.array([1], dtype=np.int32))
src_box_starts = cl.array.to_device(
queue, np.array([0, 1], dtype=np.int32))
src_box_lists = cl.array.to_device(
queue, np.array([0], dtype=np.int32))
centers = cl.array.to_device(
queue,
np.array(
[
# box 0: source
psource.center,
# box 1: target
to_center,
],
dtype=np.float64)).T.copy()
coeffs = np.array([psource.coeffs])
evt, (to_coeffs,) = e2e(
toy_ctx.queue,
src_expansions=coeffs,
src_base_ibox=0,
tgt_base_ibox=0,
ntgt_level_boxes=2,
target_boxes=target_boxes,
],
dtype=np.float64).T.copy()
)
coeffs = cl.array.to_device(queue, np.array([psource.coeffs]))
args = {
"queue": toy_ctx.queue,
"src_expansions": coeffs,
"src_base_ibox": 0,
"tgt_base_ibox": 0,
"ntgt_level_boxes": 2,
"target_boxes": target_boxes,
"src_box_starts": src_box_starts,
"src_box_lists": src_box_lists,
"centers": centers,
"src_rscale": psource.rscale,
"tgt_rscale": to_rscale,
**extra_kernel_kwargs,
**toy_ctx.extra_kernel_kwargs,
}
_evt, (to_coeffs,) = e2e(**args)
return expn_class(
toy_ctx, to_center, to_rscale, to_order, to_coeffs[1].get(queue),
derived_from=psource, **expn_kwargs)
src_box_starts=src_box_starts,
src_box_lists=src_box_lists,
centers=centers,
src_rscale=psource.rscale,
tgt_rscale=to_rscale,
def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs,
translation_classes_kwargs):
toy_ctx = psource.toy_ctx
queue = toy_ctx.queue
coeffs = cl.array.to_device(queue, np.array([psource.coeffs]))
m2l_use_translation_classes_dependent_data = \
toy_ctx.use_translation_classes_dependent_data()
if m2l_use_translation_classes_dependent_data:
data_kernel = translation_classes_kwargs["data_kernel"]
preprocess_kernel = translation_classes_kwargs["preprocess_kernel"]
postprocess_kernel = translation_classes_kwargs["postprocess_kernel"]
expn_size = translation_classes_kwargs["m2l_expn_size"]
# Preprocess the mpole expansion
preprocessed_src_expansions = cl.array.zeros(queue, (1, expn_size),
dtype=np.complex128)
evt, _ = preprocess_kernel(queue,
src_expansions=coeffs,
preprocessed_src_expansions=preprocessed_src_expansions,
src_rscale=np.float64(psource.rscale),
**toy_ctx.extra_kernel_kwargs)
from sumpy.tools import get_native_event, get_opencl_fft_app, run_opencl_fft
if toy_ctx.use_fft:
fft_app = get_opencl_fft_app(queue, (expn_size,),
dtype=preprocessed_src_expansions.dtype, inverse=False)
ifft_app = get_opencl_fft_app(queue, (expn_size,),
dtype=preprocessed_src_expansions.dtype, inverse=True)
evt, preprocessed_src_expansions = run_opencl_fft(fft_app, queue,
preprocessed_src_expansions, inverse=False, wait_for=[evt])
# Compute translation classes data
m2l_translation_classes_lists = cl.array.to_device(queue,
np.array([0], dtype=np.int32))
dist = np.array(to_center - psource.center, dtype=np.float64)
dim = toy_ctx.kernel.dim
m2l_translation_vectors = cl.array.to_device(queue, dist.reshape(dim, 1))
m2l_translation_classes_dependent_data = cl.array.zeros(queue,
(1, expn_size), dtype=np.complex128)
evt, _ = data_kernel(
queue,
src_rscale=np.float64(psource.rscale),
ntranslation_classes=1,
translation_classes_level_start=0,
m2l_translation_vectors=m2l_translation_vectors,
m2l_translation_classes_dependent_data=(
m2l_translation_classes_dependent_data),
ntranslation_vectors=1,
wait_for=[get_native_event(evt)],
**toy_ctx.extra_kernel_kwargs)
if toy_ctx.use_fft:
evt, m2l_translation_classes_dependent_data = run_opencl_fft(fft_app,
queue, m2l_translation_classes_dependent_data, inverse=False,
wait_for=[evt])
ret = _e2e(psource, to_center, to_rscale, to_order,
e2e, expn_class, expn_kwargs,
{
"src_expansions": preprocessed_src_expansions,
"m2l_translation_classes_lists": m2l_translation_classes_lists,
"m2l_translation_classes_dependent_data": (
m2l_translation_classes_dependent_data),
"translation_classes_level_start": 0,
}
)
# Postprocess the local expansion
local_before = cl.array.to_device(queue, np.array([ret.coeffs]))
to_coeffs = cl.array.zeros(queue, (1, len(data_kernel.tgt_expansion)),
dtype=coeffs.dtype)
if toy_ctx.use_fft:
evt, local_before = run_opencl_fft(ifft_app, queue,
local_before, inverse=True, wait_for=[get_native_event(evt)])
evt, _ = postprocess_kernel(queue=queue,
tgt_expansions_before_postprocessing=local_before,
tgt_expansions=to_coeffs,
src_rscale=np.float64(psource.rscale),
tgt_rscale=np.float64(to_rscale),
wait_for=[get_native_event(evt)],
**toy_ctx.extra_kernel_kwargs)
return expn_class(
toy_ctx, to_center, to_rscale, to_order, to_coeffs.get(queue)[0],
derived_from=psource, **expn_kwargs)
else:
ret = _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class,
expn_kwargs, {})
#flags="print_hl_cl",
out_host=True, **toy_ctx.extra_kernel_kwargs)
return ret
return expn_class(toy_ctx, to_center, to_rscale, to_order, to_coeffs[1],
derived_from=psource, **expn_kwargs)
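# The translation-classes M2L path above follows the usual FFT-accelerated
# pipeline: (1) preprocess the multipole coefficients into the
# translation-friendly representation, (2) forward-FFT them if requested,
# (3) build (and FFT) the translation-classes-dependent data for the single
# translation vector, (4) apply the now-pointwise M2L product via _e2e, and
# (5) inverse-FFT and postprocess back to ordinary local coefficients.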
# }}}
# {{{ potential source classes
class PotentialSource(object):
Number_ish = int | float | complex | np.number
class PotentialSource:
"""A base class for all classes representing potentials that can be
evaluated anywhere in space.
......@@ -307,17 +502,22 @@ class PotentialSource(object):
.. automethod:: __rmul__
"""
def __init__(self, toy_ctx):
def __init__(self, toy_ctx: ToyContext):
self.toy_ctx = toy_ctx
def eval(self, targets):
def eval(self, targets: np.ndarray) -> np.ndarray:
"""
:param targets: An array of shape ``(dim, ntargets)``.
:returns: an array of shape ``(ntargets,)``.
"""
raise NotImplementedError()
def __neg__(self):
def __neg__(self) -> PotentialSource:
return -1*self
def __add__(self, other):
if isinstance(other, (Number, np.number)):
def __add__(self, other: Number_ish | PotentialSource
) -> PotentialSource:
if isinstance(other, Number | np.number):
other = ConstantPotential(self.toy_ctx, other)
elif not isinstance(other, PotentialSource):
return NotImplemented
......@@ -326,14 +526,20 @@ class PotentialSource(object):
__radd__ = __add__
def __sub__(self, other):
def __sub__(self,
other: Number_ish | PotentialSource) -> PotentialSource:
return self.__add__(-other)
def __rsub__(self, other):
# FIXME: mypy says " Forward operator "__sub__" is not callable"
# I don't know what it means. -AK, 2024-07-18
def __rsub__(self, # type:ignore[misc]
other: Number_ish | PotentialSource
) -> PotentialSource:
return (-self).__add__(other)
def __mul__(self, other):
if isinstance(other, (Number, np.number)):
def __mul__(self,
other: Number_ish | PotentialSource) -> PotentialSource:
if isinstance(other, Number | np.number):
other = ConstantPotential(self.toy_ctx, other)
elif not isinstance(other, PotentialSource):
return NotImplemented
......@@ -345,14 +551,16 @@ class PotentialSource(object):
class ConstantPotential(PotentialSource):
"""
Inherits from :class:`PotentialSource`.
.. automethod:: __init__
"""
def __init__(self, toy_ctx, value):
super(ConstantPotential, self).__init__(toy_ctx)
def __init__(self, toy_ctx: ToyContext, value):
super().__init__(toy_ctx)
self.value = np.array(value)
def eval(self, targets):
def eval(self, targets: np.ndarray) -> np.ndarray:
pot = np.empty(targets.shape[-1], dtype=self.value.dtype)
pot.fill(self.value)
return pot
......@@ -360,29 +568,37 @@ class ConstantPotential(PotentialSource):
class OneOnBallPotential(PotentialSource):
"""
A potential that is the characteristic function on a ball.
Inherits from :class:`PotentialSource`.
.. automethod:: __init__
"""
def __init__(self, toy_ctx, center, radius):
super(OneOnBallPotential, self).__init__(toy_ctx)
def __init__(self,
toy_ctx: ToyContext, center: np.ndarray, radius: float) -> None:
super().__init__(toy_ctx)
self.center = np.asarray(center)
self.radius = radius
def eval(self, targets):
def eval(self, targets: np.ndarray) -> np.ndarray:
dist_vec = targets - self.center[:, np.newaxis]
return (np.sum(dist_vec**2, axis=0) < self.radius**2).astype(np.float64)
class HalfspaceOnePotential(PotentialSource):
"""
A potential that is the characteristic function of a halfspace.
.. automethod:: __init__
"""
def __init__(self, toy_ctx, center, axis, side=1):
super(HalfspaceOnePotential, self).__init__(toy_ctx)
def __init__(self, toy_ctx: ToyContext, center: np.ndarray,
axis: int, side: int = 1) -> None:
super().__init__(toy_ctx)
self.center = np.asarray(center)
self.axis = axis
self.side = side
def eval(self, targets):
def eval(self, targets: np.ndarray) -> np.ndarray:
return (
(self.side*(targets[self.axis] - self.center[self.axis])) >= 0
).astype(np.float64)
......@@ -390,6 +606,8 @@ class HalfspaceOnePotential(PotentialSource):
class PointSources(PotentialSource):
"""
Inherits from :class:`PotentialSource`.
.. attribute:: points
``[ndim, npoints]``
......@@ -397,20 +615,25 @@ class PointSources(PotentialSource):
.. automethod:: __init__
"""
def __init__(self, toy_ctx, points, weights, center=None):
super(PointSources, self).__init__(toy_ctx)
def __init__(self,
toy_ctx: ToyContext, points: np.ndarray, weights: np.ndarray,
center: np.ndarray | None = None):
super().__init__(toy_ctx)
self.points = points
self.weights = weights
self._center = center
def eval(self, targets):
evt, (potential,) = self.toy_ctx.get_p2p()(
self.toy_ctx.queue, targets, self.points, [self.weights],
out_host=True,
def eval(self, targets: np.ndarray) -> np.ndarray:
queue = self.toy_ctx.queue
_evt, (potential,) = self.toy_ctx.get_p2p()(
queue,
cl.array.to_device(queue, targets),
cl.array.to_device(queue, self.points),
[cl.array.to_device(queue, self.weights)],
**self.toy_ctx.extra_source_and_kernel_kwargs)
return potential
return potential.get(queue)
@property
def center(self):
......@@ -422,19 +645,23 @@ class PointSources(PotentialSource):
class ExpansionPotentialSource(PotentialSource):
"""
Inherits from :class:`PotentialSource`.
.. attribute:: radius
Not used mathematically. Just for visualization, purely advisory.
.. attribute:: text_kwargs
Passed to :method:`matplotlib.pyplot.annotate`. Used for customizing the
Passed to :func:`matplotlib.pyplot.annotate`. Used for customizing the
expansion label. Changing the label text is supported by passing the
kwarg *s*. Just for visualization, purely advisory.
.. automethod:: __init__
"""
def __init__(self, toy_ctx, center, rscale, order, coeffs, derived_from,
radius=None, expn_style=None, text_kwargs=None):
super(ExpansionPotentialSource, self).__init__(toy_ctx)
super().__init__(toy_ctx)
self.center = np.asarray(center)
self.rscale = rscale
self.order = order
......@@ -452,28 +679,42 @@ class ExpansionPotentialSource(PotentialSource):
class MultipoleExpansion(ExpansionPotentialSource):
def eval(self, targets):
"""
Inherits from :class:`ExpansionPotentialSource`.
"""
def eval(self, targets: np.ndarray) -> np.ndarray:
return _e2p(self, targets, self.toy_ctx.get_m2p(self.order))
class LocalExpansion(ExpansionPotentialSource):
def eval(self, targets):
"""
Inherits from :class:`ExpansionPotentialSource`.
"""
def eval(self, targets: np.ndarray) -> np.ndarray:
return _e2p(self, targets, self.toy_ctx.get_l2p(self.order))
class PotentialExpressionNode(PotentialSource):
def __init__(self, psources):
"""
Inherits from :class:`PotentialSource`.
.. automethod:: __init__
"""
def __init__(self, psources: Sequence[PotentialSource]) -> None:
from pytools import single_valued
super(PotentialExpressionNode, self).__init__(
super().__init__(
single_valued(psource.toy_ctx for psource in psources))
self.psources = psources
@property
def center(self):
def center(self) -> np.ndarray:
for psource in self.psources:
try:
return psource.center
return psource.center # type: ignore[attr-defined]
except AttributeError:
pass
......@@ -481,8 +722,12 @@ class PotentialExpressionNode(PotentialSource):
class Sum(PotentialExpressionNode):
def eval(self, targets):
result = 0
"""
Inherits from :class:`PotentialExpressionNode`.
"""
def eval(self, targets: np.ndarray) -> np.ndarray:
result = np.zeros(targets.shape[1])
for psource in self.psources:
result = result + psource.eval(targets)
......@@ -490,8 +735,12 @@ class Sum(PotentialExpressionNode):
class Product(PotentialExpressionNode):
def eval(self, targets):
result = 1
"""
Inherits from :class:`PotentialExpressionNode`.
"""
def eval(self, targets: np.ndarray) -> np.ndarray:
result = np.ones(targets.shape[1])
for psource in self.psources:
result = result * psource.eval(targets)
......@@ -500,7 +749,9 @@ class Product(PotentialExpressionNode):
# }}}
def multipole_expand(psource, center, order=None, rscale=1, **expn_kwargs):
def multipole_expand(psource: PotentialSource,
center: np.ndarray, order: int | None = None,
rscale: float = 1, **expn_kwargs) -> MultipoleExpansion:
if isinstance(psource, PointSources):
if order is None:
raise ValueError("order may not be None")
......@@ -514,14 +765,16 @@ def multipole_expand(psource, center, order=None, rscale=1, **expn_kwargs):
return _e2e(psource, center, rscale, order,
psource.toy_ctx.get_m2m(psource.order, order),
MultipoleExpansion, expn_kwargs)
MultipoleExpansion, expn_kwargs, {})
else:
raise TypeError("do not know how to expand '%s'"
% type(psource).__name__)
raise TypeError(f"do not know how to expand '{type(psource).__name__}'")
def local_expand(psource, center, order=None, rscale=1, **expn_kwargs):
def local_expand(
psource: PotentialSource,
center: np.ndarray, order: int | None = None, rscale: Number_ish = 1,
**expn_kwargs) -> LocalExpansion:
if isinstance(psource, PointSources):
if order is None:
raise ValueError("order may not be None")
......@@ -533,9 +786,28 @@ def local_expand(psource, center, order=None, rscale=1, **expn_kwargs):
if order is None:
order = psource.order
return _e2e(psource, center, rscale, order,
psource.toy_ctx.get_m2l(psource.order, order),
LocalExpansion, expn_kwargs)
toy_ctx = psource.toy_ctx
translation_classes_kwargs = {}
m2l_use_translation_classes_dependent_data = \
toy_ctx.use_translation_classes_dependent_data()
if m2l_use_translation_classes_dependent_data:
data_kernel = toy_ctx.get_m2l_translation_class_dependent_data_kernel(
psource.order, order)
preprocess_kernel = toy_ctx.get_m2l_preprocess_mpole_kernel(
psource.order, order)
postprocess_kernel = toy_ctx.get_m2l_postprocess_local_kernel(
psource.order, order)
translation_classes_kwargs["data_kernel"] = data_kernel
translation_classes_kwargs["preprocess_kernel"] = preprocess_kernel
translation_classes_kwargs["postprocess_kernel"] = postprocess_kernel
translation_classes_kwargs["m2l_expn_size"] = \
toy_ctx.get_m2l_expansion_size(psource.order, order)
return _m2l(psource, center, rscale, order,
toy_ctx.get_m2l(psource.order, order),
LocalExpansion, expn_kwargs,
translation_classes_kwargs)
elif isinstance(psource, LocalExpansion):
if order is None:
......@@ -543,22 +815,27 @@ def local_expand(psource, center, order=None, rscale=1, **expn_kwargs):
return _e2e(psource, center, rscale, order,
psource.toy_ctx.get_l2l(psource.order, order),
LocalExpansion, expn_kwargs)
LocalExpansion, expn_kwargs, {})
else:
raise TypeError("do not know how to expand '%s'"
% type(psource).__name__)
raise TypeError(f"do not know how to expand '{type(psource).__name__}'")
def logplot(fp, psource, **kwargs):
def logplot(fp: FieldPlotter, psource: PotentialSource, **kwargs) -> None:
fp.show_scalar_in_matplotlib(
np.log10(np.abs(psource.eval(fp.points) + 1e-15)), **kwargs)
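# For example (sketch, continuing the names from above):
#
#   fp = FieldPlotter(np.zeros(2), extent=8, npoints=200)
#   logplot(fp, lexp2, cmap="jet", vmin=-3, vmax=0)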
def combine_inner_outer(psource_inner, psource_outer, radius, center=None):
def combine_inner_outer(
psource_inner: PotentialSource,
psource_outer: PotentialSource,
radius: float | None,
center: np.ndarray | None = None) -> PotentialSource:
if center is None:
assert isinstance(psource_inner, ExpansionPotentialSource)
center = psource_inner.center
if radius is None:
assert isinstance(psource_inner, ExpansionPotentialSource)
radius = psource_inner.radius
ball_one = OneOnBallPotential(psource_inner.toy_ctx, center, radius)
......@@ -567,8 +844,11 @@ def combine_inner_outer(psource_inner, psource_outer, radius, center=None):
+ psource_outer * (1 - ball_one))
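# Sketch: patch together a local expansion inside its disk of validity with
# the direct potential outside it (names illustrative):
#
#   combined = combine_inner_outer(lexp2, src, radius=1.0, center=lexp2.center)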
def combine_halfspace(psource_pos, psource_neg, axis, center=None):
def combine_halfspace(psource_pos: PotentialSource,
psource_neg: PotentialSource, axis: int,
center: np.ndarray | None = None) -> PotentialSource:
if center is None:
assert isinstance(psource_pos, ExpansionPotentialSource)
center = psource_pos.center
halfspace_one = HalfspaceOnePotential(psource_pos.toy_ctx, center, axis)
......@@ -577,12 +857,18 @@ def combine_halfspace(psource_pos, psource_neg, axis, center=None):
+ psource_neg * (1-halfspace_one))
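# Sketch, analogous to combine_inner_outer: psource_pos is used on the
# positive side of the given axis, psource_neg on the negative side
# (names illustrative):
#
#   combined = combine_halfspace(pot_right, pot_left, axis=0,
#                                center=np.zeros(2))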
def combine_halfspace_and_outer(psource_pos, psource_neg, psource_outer,
axis, radius=None, center=None):
def combine_halfspace_and_outer(
psource_pos: PotentialSource,
psource_neg: PotentialSource,
psource_outer: PotentialSource,
axis: int, radius: float | None = None,
center: np.ndarray | None = None) -> PotentialSource:
if center is None:
assert isinstance(psource_pos, ExpansionPotentialSource)
center = psource_pos.center
if radius is None:
assert isinstance(psource_pos, ExpansionPotentialSource)
radius = psource_pos.radius
return combine_inner_outer(
......@@ -590,8 +876,11 @@ def combine_halfspace_and_outer(psource_pos, psource_neg, psource_outer,
psource_outer, radius, center)
def l_inf(psource, radius, center=None, npoints=100, debug=False):
def l_inf(psource: PotentialSource, radius: float,
center: np.ndarray | None = None, npoints: int = 100,
debug: bool = False) -> np.number:
if center is None:
assert isinstance(psource, ExpansionPotentialSource)
center = psource.center
restr = psource * OneOnBallPotential(psource.toy_ctx, center, radius)
......@@ -613,8 +902,8 @@ def l_inf(psource, radius, center=None, npoints=100, debug=False):
# {{{ schematic visualization
def draw_box(el, eh, **kwargs):
import matplotlib.pyplot as pt
import matplotlib.patches as mpatches
import matplotlib.pyplot as pt
from matplotlib.path import Path
pathdata = [
......@@ -625,7 +914,7 @@ def draw_box(el, eh, **kwargs):
(Path.CLOSEPOLY, (el[0], el[1])),
]
codes, verts = zip(*pathdata)
codes, verts = zip(*pathdata, strict=True)
path = Path(verts, codes)
patch = mpatches.PathPatch(path, **kwargs)
pt.gca().add_patch(patch)
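# Sketch: draw a unit box with a point marked at its center (values
# illustrative):
#
#   draw_box(np.array([0., 0.]), np.array([1., 1.]), fill=None)
#   draw_point(np.array([0.5, 0.5]), color="black")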
......@@ -643,7 +932,7 @@ def draw_point(loc, **kwargs):
plt.plot(*loc, marker="o", **kwargs)
def draw_annotation(to_pt, from_pt, label, arrowprops={}, **kwargs):
def draw_annotation(to_pt, from_pt, label, arrowprops=None, **kwargs):
"""
:arg to_pt: Head of arrow
:arg from_pt: Tail of arrow
......@@ -651,13 +940,15 @@ def draw_annotation(to_pt, from_pt, label, arrowprops={}, **kwargs):
:arg arrowprops: Merged into the default arrow properties and passed to
    annotate
:arg kwargs: Passed to annotate
"""
if arrowprops is None:
arrowprops = {}
import matplotlib.pyplot as plt
my_arrowprops = dict(
facecolor="black",
edgecolor="black",
arrowstyle="->")
my_arrowprops = {
"facecolor": "black",
"edgecolor": "black",
"arrowstyle": "->"}
my_arrowprops.update(arrowprops)
......@@ -665,7 +956,7 @@ def draw_annotation(to_pt, from_pt, label, arrowprops={}, **kwargs):
arrowprops=my_arrowprops, **kwargs)
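# Sketch: draw a labeled arrow whose head sits at a target center and whose
# tail sits at a source center (coordinates illustrative):
#
#   draw_annotation((3.0, 0.0), (0.0, 0.0), r"$m2l$",
#                   arrowprops={"linestyle": "dashed"})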
class SchematicVisitor(object):
class SchematicVisitor:
def __init__(self, default_expn_style="circle"):
self.default_expn_style = default_expn_style
......@@ -694,7 +985,7 @@ class SchematicVisitor(object):
elif expn_style == "circle":
draw_circle(psource.center, psource.radius, fill=None)
else:
raise ValueError("unknown expn_style: %s" % self.expn_style)
raise ValueError(f"unknown expn_style: {expn_style}")
if psource.derived_from is None:
return
......@@ -703,17 +994,17 @@ class SchematicVisitor(object):
#
# ------> M
text_kwargs = dict(
verticalalignment="center",
horizontalalignment="center")
text_kwargs = {
"verticalalignment": "center",
"horizontalalignment": "center"}
label = "$%s_{%s}$" % (
label = "${}_{{{}}}$".format(
type(psource).__name__[0].lower().replace("l", "\\ell"),
psource.order)
if psource.text_kwargs is not None:
psource_text_kwargs_copy = psource.text_kwargs.copy()
label = psource_text_kwargs_copy.pop('s', label)
label = psource_text_kwargs_copy.pop("s", label)
text_kwargs.update(psource_text_kwargs_copy)
shrinkB = 0 # noqa
......@@ -721,10 +1012,10 @@ class SchematicVisitor(object):
# Avoid overlapping the tail of the arrow with any expansion labels that
# are present at the tail.
import matplotlib as mpl
font_size = mpl.rcParams['font.size']
font_size = mpl.rcParams["font.size"]
shrinkB = 7/8 * font_size # noqa
arrowprops = dict(shrinkB=shrinkB, arrowstyle="<|-")
arrowprops = {"shrinkB": shrinkB, "arrowstyle": "<|-"}
draw_annotation(psource.derived_from.center, psource.center, label,
arrowprops, **text_kwargs)
......
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = "Copyright (C) 2014 Andreas Kloeckner"
......@@ -22,29 +23,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
# {{{ find install- or run-time git revision
from importlib import metadata
from pytools import find_module_git_revision
import os
if os.environ.get("AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY") is not None:
# We're just being exec'd by setup.py. We can't import anything.
_git_rev = None
else:
import sumpy._git_rev as _git_rev_mod
_git_rev = _git_rev_mod.GIT_REVISION
def _parse_version(version: str) -> tuple[tuple[int, ...], str]:
import re
# If we're running from a dev tree, the last install (and hence the most
# recent update of the above git rev) could have taken place very long ago.
from pytools import find_module_git_revision
_runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1)
if _runtime_git_rev is not None:
_git_rev = _runtime_git_rev
m = re.match(r"^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)
assert m is not None
# }}}
return tuple(int(nr) for nr in m.group(1).split(".")), m.group(2)
VERSION = (2016, 1)
VERSION_STATUS = "beta1"
VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
VERSION_TEXT = metadata.version("sumpy")
VERSION, VERSION_STATUS = _parse_version(VERSION_TEXT)
KERNEL_VERSION = (VERSION, _git_rev, 0)
_GIT_REVISION = find_module_git_revision(__file__, n_levels_up=1)
KERNEL_VERSION = (*VERSION, _GIT_REVISION, 0)
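# Example (sketch) of what _parse_version computes:
#
#   _parse_version("2016.1beta1")  # -> ((2016, 1), "beta1")
#   _parse_version("2024.1")       # -> ((2024, 1), "")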
from __future__ import division, absolute_import
from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
......@@ -29,38 +30,25 @@ __doc__ = """
"""
import numpy as np
from six.moves import range
def separate_by_real_and_imag(data, real_only):
from pytools.obj_array import obj_array_imag_copy, obj_array_real_copy
for name, field in data:
from pytools.obj_array import log_shape
ls = log_shape(field)
if ls != () and ls[0] > 1:
assert len(ls) == 1
from pytools.obj_array import (
oarray_real_copy, oarray_imag_copy,
with_object_array_or_scalar)
if field[0].dtype.kind == "c":
if real_only:
yield (name,
with_object_array_or_scalar(oarray_real_copy, field))
else:
yield (name+"_r",
with_object_array_or_scalar(oarray_real_copy, field))
yield (name+"_i",
with_object_array_or_scalar(oarray_imag_copy, field))
else:
yield (name, field)
try:
# Look inside object arrays to get the entry dtype.
entry_dtype = field[0].dtype
except AttributeError:
entry_dtype = field.dtype
assert entry_dtype.kind != "O"
if real_only or entry_dtype.kind != "c":
yield (name, obj_array_real_copy(field))
else:
# ls == ()
if field.dtype.kind == "c":
yield (name+"_r", field.real.copy())
yield (name+"_i", field.imag.copy())
else:
yield (name, field)
yield (f"{name}_r", obj_array_real_copy(field))
yield (f"{name}_i", obj_array_imag_copy(field))
def make_field_plotter_from_bbox(bbox, h, extend_factor=0):
......@@ -88,14 +76,12 @@ def make_field_plotter_from_bbox(bbox, h, extend_factor=0):
from math import ceil
npoints = tuple(
int(ceil(extent[i] / h[i]))
for i in range(dimensions))
npoints = tuple(ceil(extent[i] / h[i]) for i in range(dimensions))
return FieldPlotter(center, extent, npoints)
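# Sketch (assuming a per-axis spacing h, as used above): cover the square
# [-1, 1]^2 at spacing 0.02:
#
#   fp = make_field_plotter_from_bbox(
#       (np.array([-1., -1.]), np.array([1., 1.])), h=(0.02, 0.02))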
class FieldPlotter(object):
class FieldPlotter:
"""
.. automethod:: set_matplotlib_limits
.. automethod:: show_scalar_in_matplotlib
......@@ -123,7 +109,9 @@ class FieldPlotter(object):
slice(a[i], b[i], 1j*npoints[i])
for i in range(dim))
mgrid = np.mgrid[mgrid_index]
# np.asarray is technically unneeded, used to placate pylint
# https://github.com/pylint-dev/pylint/issues/9989
mgrid = np.asarray(np.mgrid[mgrid_index])
# (axis, point x idx, point y idx, ...)
self.nd_points = mgrid
......@@ -180,17 +168,19 @@ class FieldPlotter(object):
def show_vector_in_mayavi(self, fld, do_show=True, **kwargs):
c = self.points
from mayavi import mlab
from mayavi import mlab # pylint: disable=import-error
mlab.quiver3d(c[0], c[1], c[2], fld[0], fld[1], fld[2],
**kwargs)
if do_show:
mlab.show()
def write_vtk_file(self, file_name, data, real_only=False):
def write_vtk_file(self, file_name, data, real_only=False, overwrite=False):
from pyvisfile.vtk import write_structured_grid
write_structured_grid(file_name, self.nd_points,
point_data=list(separate_by_real_and_imag(data, real_only)))
point_data=list(separate_by_real_and_imag(data, real_only)),
overwrite=overwrite)
def show_scalar_in_mayavi(self, fld, max_val=None, **kwargs):
if max_val is not None:
......@@ -203,7 +193,7 @@ class FieldPlotter(object):
nd_points = self.nd_points.squeeze()[self._get_nontrivial_dims()]
squeezed_fld = fld.squeeze()
from mayavi import mlab
from mayavi import mlab # pylint: disable=import-error
mlab.surf(nd_points[0], nd_points[1], squeezed_fld, **kwargs)
# vim: foldmethod=marker
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Matt Wala"
......@@ -22,44 +23,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
import logging
logger = logging.getLogger(__name__)
def test_kill_trivial_assignments():
from pymbolic import var
x, y, t0, t1, t2 = [var(s) for s in "x y t0 t1 t2".split()]
assignments = (
("t0", 6),
("t1", -t0),
("t2", 6*x),
("nt", x**y),
# users of trivial assignments
("u0", t0 + 1),
("u1", t1 + 1),
("u2", t2 + 1),
)
import sys
from sumpy.codegen import kill_trivial_assignments
result = kill_trivial_assignments(
assignments,
retain_names=("u0", "u1", "u2"))
import pytest
from pymbolic.primitives import Sum
def _s(*vals):
return Sum(vals)
logger = logging.getLogger(__name__)
assert result == [
('nt', x**y),
('u0', _s(6, 1)),
('u1', _s(-6, 1)),
('u2', _s(6*x, 1))]
# {{{ test_symbolic_assignment_name_uniqueness
def test_symbolic_assignment_name_uniqueness():
# https://gitlab.tiker.net/inducer/sumpy/issues/13
......@@ -77,21 +50,25 @@ def test_symbolic_assignment_name_uniqueness():
assert len(sac.assignments) == 3
# }}}
# {{{ test_line_taylor_coeff_growth
def test_line_taylor_coeff_growth():
# Regression test for LineTaylorLocalExpansion.
# See https://gitlab.tiker.net/inducer/pytential/merge_requests/12
from sumpy.kernel import LaplaceKernel
from sumpy.expansion.local import LineTaylorLocalExpansion
from sumpy.symbolic import make_sym_vector, SympyToPymbolicMapper
import numpy as np
from sumpy.expansion.local import LineTaylorLocalExpansion
from sumpy.kernel import LaplaceKernel
from sumpy.symbolic import SympyToPymbolicMapper, make_sym_vector
order = 10
expn = LineTaylorLocalExpansion(LaplaceKernel(2), order)
avec = make_sym_vector("a", 2)
bvec = make_sym_vector("b", 2)
coeffs = expn.coefficients_from_source(avec, bvec, rscale=1)
coeffs = expn.coefficients_from_source(expn.kernel, avec, bvec, rscale=1)
sym2pymbolic = SympyToPymbolicMapper()
coeffs_pymbolic = [sym2pymbolic(c) for c in coeffs]
......@@ -104,15 +81,16 @@ def test_line_taylor_coeff_growth():
max_order = 2
assert np.polyfit(np.log(indices), np.log(counts), deg=1)[0] < max_order
# }}}
# You can test individual routines by typing
# $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context)'
# $ python test_codegen.py 'test_line_taylor_coeff_growth()'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import print_function, division
from __future__ import annotations
__copyright__ = """
Copyright (C) 2017 Matt Wala
......@@ -66,29 +67,29 @@ DAMAGE.
# }}}
import pytest
import sys
from sumpy.symbolic import (
Add, Pow, exp, sqrt, symbols, sympify, cos, sin, Function, USE_SYMENGINE)
import pytest
import sumpy.symbolic as sym
from sumpy.cse import cse, postprocess_for_cse, preprocess_for_cse
if not USE_SYMENGINE:
from sympy.simplify.cse_opts import sub_pre, sub_post
if not sym.USE_SYMENGINE:
from sympy.functions.special.hyper import meijerg
from sympy.simplify import cse_opts
from sympy.simplify.cse_opts import sub_post, sub_pre
from sumpy.cse import (
cse, preprocess_for_cse, postprocess_for_cse)
import logging
w, x, y, z = symbols('w,x,y,z')
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 = symbols('x:13')
sympyonly = (
pytest.mark.skipif(USE_SYMENGINE, reason="uses a sympy-only feature"))
logger = logging.getLogger(__name__)
w, x, y, z = sym.symbols("w,x,y,z")
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 = sym.symbols("x:13")
# Dummy "optimization" functions for testing.
sympyonly = (
pytest.mark.skipif(sym.USE_SYMENGINE, reason="uses a sympy-only feature"))
def opt1(expr):
......@@ -99,6 +100,8 @@ def opt2(expr):
return expr*z
# {{{ test_preprocess_for_cse
def test_preprocess_for_cse():
assert preprocess_for_cse(x, [(opt1, None)]) == x + y
assert preprocess_for_cse(x, [(None, opt1)]) == x
......@@ -107,6 +110,10 @@ def test_preprocess_for_cse():
assert preprocess_for_cse(
x, [(opt1, None), (opt2, None)]) == (x + y)*z
# }}}
# {{{ test_postprocess_for_cse
def test_postprocess_for_cse():
assert postprocess_for_cse(x, [(opt1, None)]) == x
......@@ -117,56 +124,78 @@ def test_postprocess_for_cse():
assert postprocess_for_cse(
x, [(None, opt1), (None, opt2)]) == x*z + y
# }}}
# {{{ test_cse_single
def test_cse_single():
# Simple substitution.
e = Add(Pow(x + y, 2), sqrt(x + y))
e = sym.Add(sym.Pow(x + y, 2), sym.sqrt(x + y))
substs, reduced = cse([e])
assert substs == [(x0, x + y)]
assert reduced == [sqrt(x0) + x0**2]
assert reduced == [sym.sqrt(x0) + x0**2]
# }}}
# {{{
@sympyonly
def test_cse_not_possible():
# No substitution possible.
e = Add(x, y)
e = sym.Add(x, y)
substs, reduced = cse([e])
assert substs == []
assert reduced == [x + y]
# issue 6329
eq = (meijerg((1, 2), (y, 4), (5,), [], x)
+ meijerg((1, 3), (y, 4), (5,), [], x))
eq = (
meijerg((1, 2), (y, 4), (5,), [], x) # pylint: disable=possibly-used-before-assignment
+ meijerg((1, 3), (y, 4), (5,), [], x))
assert cse(eq) == ([], [eq])
# }}}
# {{{ test_nested_substitution
def test_nested_substitution():
# Substitution within a substitution.
e = Add(Pow(w*x + y, 2), sqrt(w*x + y))
e = sym.Add(sym.Pow(w*x + y, 2), sym.sqrt(w*x + y))
substs, reduced = cse([e])
assert substs == [(x0, w*x + y)]
assert reduced == [sqrt(x0) + x0**2]
assert reduced == [sym.sqrt(x0) + x0**2]
# }}}
# {{{ test_subtraction_opt
@sympyonly
def test_subtraction_opt():
# Make sure subtraction is optimized.
e = (x - y)*(z - y) + exp((x - y)*(z - y))
e = (x - y)*(z - y) + sym.exp((x - y)*(z - y))
substs, reduced = cse(
[e], optimizations=[(cse_opts.sub_pre, cse_opts.sub_post)])
[e], optimizations=[(cse_opts.sub_pre, cse_opts.sub_post)]) # pylint: disable=possibly-used-before-assignment
assert substs == [(x0, (x - y)*(y - z))]
assert reduced == [-x0 + exp(-x0)]
e = -(x - y)*(z - y) + exp(-(x - y)*(z - y))
assert reduced == [-x0 + sym.exp(-x0)]
e = -(x - y)*(z - y) + sym.exp(-(x - y)*(z - y))
substs, reduced = cse(
[e], optimizations=[(cse_opts.sub_pre, cse_opts.sub_post)])
assert substs == [(x0, (x - y)*(y - z))]
assert reduced == [x0 + exp(x0)]
assert reduced == [x0 + sym.exp(x0)]
# issue 4077
n = -1 + 1/x
e = n/x/(-n)**2 - 1/n/x
assert cse(e, optimizations=[(cse_opts.sub_pre, cse_opts.sub_post)]) == \
([], [0])
assert cse(e, optimizations=[
(cse_opts.sub_pre, cse_opts.sub_post)]
) == ([], [0])
# }}}
# {{{ test_multiple_expressions
def test_multiple_expressions():
e1 = (x + y)*z
e2 = (x + y)*w
......@@ -183,7 +212,7 @@ def test_multiple_expressions():
rsubsts, _ = cse(reversed(l_))
assert substs == rsubsts
assert reduced == [x1, x1 + z, x0]
f = Function("f")
f = sym.Function("f")
l_ = [f(x - z, y - z), x - z, y - z]
substs, reduced = cse(l_)
rsubsts, _ = cse(reversed(l_))
......@@ -197,31 +226,46 @@ def test_multiple_expressions():
assert cse([x*y, z + x*y, x*y*z + 3]) == \
([(x0, x*y)], [x0, z + x0, 3 + x0*z])
# }}}
# {{{ test_issue_4203
def test_issue_4203():
assert cse(sin(x**x)/x**x) == ([(x0, x**x)], [sin(x0)/x0])
assert cse(sym.sin(x**x)/x**x) == ([(x0, x**x)], [sym.sin(x0)/x0])
# }}}
# {{{ test_dont_cse_subs
def test_dont_cse_subs():
f = Function("f")
g = Function("g")
f = sym.Function("f")
g = sym.Function("g")
name_val, (expr,) = cse(f(x+y).diff(x) + g(x+y).diff(x))
name_val, (_expr,) = cse(f(x+y).diff(x) + g(x+y).diff(x))
assert name_val == []
# }}}
# {{{ test_dont_cse_derivative
def test_dont_cse_derivative():
from sumpy.symbolic import Derivative
f = Function("f")
f = sym.Function("f")
deriv = Derivative(f(x+y), x)
deriv = sym.Derivative(f(x+y), x)
name_val, (expr,) = cse(x + y + deriv)
assert name_val == []
assert expr == x + y + deriv
# }}}
# {{{ test_pow_invpow
def test_pow_invpow():
assert cse(1/x**2 + x**2) == \
......@@ -230,39 +274,64 @@ def test_pow_invpow():
([(x0, x**2), (x1, 1/x0)], [x0 + x1*(x1 + 1)])
assert cse(1/x**2 + (1 + 1/x**2)*x**2) == \
([(x0, x**2), (x1, 1/x0)], [x0*(x1 + 1) + x1])
assert cse(cos(1/x**2) + sin(1/x**2)) == \
([(x0, x**(-2))], [sin(x0) + cos(x0)])
assert cse(cos(x**2) + sin(x**2)) == \
([(x0, x**2)], [sin(x0) + cos(x0)])
assert cse(sym.cos(1/x**2) + sym.sin(1/x**2)) == \
([(x0, x**(-2))], [sym.sin(x0) + sym.cos(x0)])
assert cse(sym.cos(x**2) + sym.sin(x**2)) == \
([(x0, x**2)], [sym.sin(x0) + sym.cos(x0)])
assert cse(y/(2 + x**2) + z/x**2/y) == \
([(x0, x**2)], [y/(x0 + 2) + z/(x0*y)])
assert cse(exp(x**2) + x**2*cos(1/x**2)) == \
([(x0, x**2)], [x0*cos(1/x0) + exp(x0)])
assert cse(sym.exp(x**2) + x**2*sym.cos(1/x**2)) == \
([(x0, x**2)], [x0*sym.cos(1/x0) + sym.exp(x0)])
assert cse((1 + 1/x**2)/x**2) == \
([(x0, x**(-2))], [x0*(x0 + 1)])
assert cse(x**(2*y) + x**(-2*y)) == \
([(x0, x**(2*y))], [x0 + 1/x0])
# }}}
# {{{ test_issue_4499
@sympyonly
def test_issue_4499():
# previously, this gave 16 constants
from sympy import S, Tuple
from sympy.abc import a, b
from sympy import Tuple, S
B = Function('B') # noqa
G = Function('G') # noqa
t = Tuple(
*(a, a + S(1)/2, 2*a, b, 2*a - b + 1, (sqrt(z)/2)**(-2*a + 1)*B(2*a
- b, sqrt(z))*B(b - 1, sqrt(z))*G(b)*G(2*a - b + 1),
sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b,
sqrt(z))*G(b)*G(2*a - b + 1), sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b - 1,
sqrt(z))*B(2*a - b + 1, sqrt(z))*G(b)*G(2*a - b + 1),
(sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b + 1,
sqrt(z))*G(b)*G(2*a - b + 1), 1, 0, S(1)/2, z/2, -b + 1, -2*a + b,
-2*a)) # noqa
B = sym.Function("B") # noqa: N806
G = sym.Function("G") # noqa: N806
t = Tuple(*(
a,
a + S(1)/2,
2*a,
b,
2*a - b + 1,
(sym.sqrt(z)/2)**(-2*a + 1)
* B(2*a-b, sym.sqrt(z))
* B(b - 1, sym.sqrt(z))*G(b)*G(2*a - b + 1),
sym.sqrt(z)*(sym.sqrt(z)/2)**(-2*a + 1)
* B(b, sym.sqrt(z))
* B(2*a - b, sym.sqrt(z))*G(b)*G(2*a - b + 1),
sym.sqrt(z)*(sym.sqrt(z)/2)**(-2*a + 1)
* B(b - 1, sym.sqrt(z))
* B(2*a - b + 1, sym.sqrt(z))*G(b)*G(2*a - b + 1),
(sym.sqrt(z)/2)**(-2*a + 1)
* B(b, sym.sqrt(z))
* B(2*a - b + 1, sym.sqrt(z))*G(b)*G(2*a - b + 1),
1,
0,
S(1)/2,
z/2,
-b + 1,
-2*a + b,
-2*a))
c = cse(t)
assert len(c[0]) == 11
# }}}
# {{{ test_issue_6169
@sympyonly
def test_issue_6169():
......@@ -271,72 +340,95 @@ def test_issue_6169():
assert cse(r) == ([], [r])
# and a check that the right thing is done with the new
# mechanism
assert sub_post(sub_pre((-x - y)*z - x - y)) == -z*(x + y) - x - y
assert sub_post(sub_pre((-x - y)*z - x - y)) == -z*(x + y) - x - y # pylint: disable=possibly-used-before-assignment
# }}}
# {{{ test_cse_Indexed
@sympyonly
def test_cse_Indexed(): # noqa
from sympy import IndexedBase, Idx
def test_cse_indexed():
from sympy import Idx, IndexedBase
len_y = 5
y = IndexedBase('y', shape=(len_y,))
x = IndexedBase('x', shape=(len_y,))
Dy = IndexedBase('Dy', shape=(len_y-1,)) # noqa
i = Idx('i', len_y-1)
y = IndexedBase("y", shape=(len_y,))
x = IndexedBase("x", shape=(len_y,))
i = Idx("i", len_y-1)
expr1 = (y[i+1]-y[i])/(x[i+1]-x[i])
expr2 = 1/(x[i+1]-x[i])
replacements, reduced_exprs = cse([expr1, expr2])
replacements, _reduced_exprs = cse([expr1, expr2])
assert len(replacements) > 0
# }}}
# {{{ test_Piecewise
@sympyonly
def test_Piecewise(): # noqa
from sympy import Piecewise, Eq
def test_piecewise():
from sympy import Eq, Piecewise
f = Piecewise((-z + x*y, Eq(y, 0)), (-z - x*y, True))
ans = cse(f)
actual_ans = ([(x0, -z), (x1, x*y)],
[Piecewise((x0+x1, Eq(y, 0)), (x0 - x1, True))])
assert ans == actual_ans
# }}}
# {{{ test_name_conflict
def test_name_conflict():
z1 = x0 + y
z2 = x2 + x3
l_ = [cos(z1) + z1, cos(z2) + z2, x0 + x2]
l_ = [sym.cos(z1) + z1, sym.cos(z2) + z2, x0 + x2]
substs, reduced = cse(l_)
assert [e.subs(dict(substs)) for e in reduced] == l_
# }}}
# {{{ test_name_conflict_cust_symbols
def test_name_conflict_cust_symbols():
z1 = x0 + y
z2 = x2 + x3
l_ = [cos(z1) + z1, cos(z2) + z2, x0 + x2]
substs, reduced = cse(l_, symbols("x:10"))
l_ = [sym.cos(z1) + z1, sym.cos(z2) + z2, x0 + x2]
substs, reduced = cse(l_, sym.symbols("x:10"))
assert [e.subs(dict(substs)) for e in reduced] == l_
# }}}
# {{{ test_symbols_exhausted_error
def test_symbols_exhausted_error():
l_ = cos(x+y)+x+y+cos(w+y)+sin(w+y)
sym = [x, y, z]
l_ = sym.cos(x+y)+x+y+sym.cos(w+y)+sym.sin(w+y)
s = [x, y, z]
with pytest.raises(ValueError):
print(cse(l_, symbols=sym))
logger.info("cse:\n%s", cse(l_, symbols=s))
# }}}
# {{{ test_issue_7840
@sympyonly
def test_issue_7840():
# daveknippers' example
C393 = sympify( # noqa
'Piecewise((C391 - 1.65, C390 < 0.5), (Piecewise((C391 - 1.65, \
C391 > 2.35), (C392, True)), True))'
C393 = sym.sympify( # noqa: N806
"Piecewise((C391 - 1.65, C390 < 0.5), (Piecewise((C391 - 1.65, \
C391 > 2.35), (C392, True)), True))"
)
C391 = sympify( # noqa
'Piecewise((2.05*C390**(-1.03), C390 < 0.5), (2.5*C390**(-0.625), True))'
C391 = sym.sympify( # noqa: N806
"Piecewise((2.05*C390**(-1.03), C390 < 0.5), (2.5*C390**(-0.625), True))"
)
C393 = C393.subs('C391',C391) # noqa
C393 = C393.subs("C391", C391) # noqa: N806
# simple substitution
sub = {}
sub['C390'] = 0.703451854
sub['C392'] = 1.01417794
sub["C390"] = 0.703451854
sub["C392"] = 1.01417794
ss_answer = C393.subs(sub)
# cse
substitutions, new_eqn = cse(C393)
......@@ -347,7 +439,7 @@ def test_issue_7840():
assert ss_answer == cse_answer
# GitRay's example
expr = sympify(
expr = sym.sympify(
"Piecewise((Symbol('ON'), Equality(Symbol('mode'), Symbol('ON'))), \
(Piecewise((Piecewise((Symbol('OFF'), StrictLessThan(Symbol('x'), \
Symbol('threshold'))), (Symbol('ON'), S.true)), Equality(Symbol('mode'), \
......@@ -359,6 +451,10 @@ def test_issue_7840():
# there should not be any replacements
assert len(substitutions) < 1
# }}}
# {{{ test_recursive_matching
def test_recursive_matching():
assert cse([x+y, 2+x+y, x+y+z, 3+x+y+z]) == \
......@@ -374,12 +470,16 @@ def test_recursive_matching():
assert cse([2*x*x, x*x*y, x*x*y*w, x*x*y*w*x0, x*x*y*w*x2]) == \
([(x1, x**2), (x3, x1*y), (x4, w*x3)], [2*x1, x3, x4, x0*x4, x2*x4])
# }}}
# You can test individual routines by typing
# $ python test_cse.py 'test_recursive_matching()'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import annotations
__copyright__ = "Copyright (C) 2022 Hao Gao"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import os
from functools import partial
import numpy as np
import pytest
import pyopencl as cl
# Note: Do not import the mpi4py.MPI module at module level, because OpenMPI
# does not support recursive invocations.
def set_cache_dir(mpirank):
"""Make each rank use a different cache location to avoid conflict."""
import platformdirs
cache_dir = platformdirs.user_cache_dir("sumpy", "sumpy")
# FIXME: should clean up this directory after running the tests
os.environ["XDG_CACHE_HOME"] = os.path.join(cache_dir, str(mpirank))
# {{{ _test_against_single_rank
def _test_against_single_rank(
dims, nsources, ntargets, dtype, communicate_mpoles_via_allreduce=False):
from mpi4py import MPI
# Get the current rank
comm = MPI.COMM_WORLD
mpi_rank = comm.Get_rank()
set_cache_dir(mpi_rank)
# Configure array context
cl_context = cl.create_some_context()
queue = cl.CommandQueue(cl_context)
def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level):
return max(level, 3)
from boxtree.traversal import FMMTraversalBuilder
traversal_builder = FMMTraversalBuilder(cl_context, well_sep_is_n_away=2)
from sumpy.expansion import DefaultExpansionFactory
from sumpy.kernel import LaplaceKernel
kernel = LaplaceKernel(dims)
expansion_factory = DefaultExpansionFactory()
local_expansion_factory = expansion_factory.get_local_expansion_class(kernel)
local_expansion_factory = partial(local_expansion_factory, kernel)
multipole_expansion_factory = \
expansion_factory.get_multipole_expansion_class(kernel)
multipole_expansion_factory = partial(multipole_expansion_factory, kernel)
from sumpy.fmm import SumpyTreeIndependentDataForWrangler
tree_indep = SumpyTreeIndependentDataForWrangler(
cl_context, multipole_expansion_factory, local_expansion_factory, [kernel])
global_tree_dev = None
sources_weights = cl.array.empty(queue, 0, dtype=dtype)
if mpi_rank == 0:
# Generate random particles and source weights
from boxtree.tools import make_normal_particle_array as p_normal
sources = p_normal(queue, nsources, dims, dtype, seed=15)
targets = p_normal(queue, ntargets, dims, dtype, seed=18)
# FIXME: Use arraycontext instead of raw PyOpenCL arrays
from pyopencl.clrandom import PhiloxGenerator
rng = PhiloxGenerator(cl_context, seed=20)
sources_weights = rng.uniform(queue, nsources, dtype=np.float64)
rng = PhiloxGenerator(cl_context, seed=22)
target_radii = rng.uniform(
queue, ntargets, a=0, b=0.05, dtype=np.float64)
# Build the tree and interaction lists
from boxtree import TreeBuilder
tb = TreeBuilder(cl_context)
global_tree_dev, _ = tb(
queue, sources, targets=targets, target_radii=target_radii,
stick_out_factor=0.25, max_particles_in_box=30, debug=True)
global_trav_dev, _ = traversal_builder(queue, global_tree_dev, debug=True)
from sumpy.fmm import SumpyExpansionWrangler
wrangler = SumpyExpansionWrangler(tree_indep, global_trav_dev, dtype,
fmm_level_to_order)
# Compute FMM with one MPI rank
from boxtree.fmm import drive_fmm
shmem_potential = drive_fmm(wrangler, [sources_weights])
# Compute FMM using the distributed implementation
def wrangler_factory(local_traversal, global_traversal):
from sumpy.distributed import DistributedSumpyExpansionWrangler
return DistributedSumpyExpansionWrangler(
cl_context, comm, tree_indep, local_traversal, global_traversal, dtype,
fmm_level_to_order,
communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
from boxtree.distributed import DistributedFMMRunner
distributed_fmm_info = DistributedFMMRunner(
queue, global_tree_dev, traversal_builder, wrangler_factory, comm=comm)
timing_data = {}
distributed_potential = distributed_fmm_info.drive_dfmm(
[sources_weights], timing_data=timing_data)
assert timing_data
if mpi_rank == 0:
assert shmem_potential.shape == (1,)
assert distributed_potential.shape == (1,)
shmem_potential = shmem_potential[0].get()
distributed_potential = distributed_potential[0].get()
error = (np.linalg.norm(distributed_potential - shmem_potential, ord=np.inf)
/ np.linalg.norm(shmem_potential, ord=np.inf))
print(error)
assert error < 1e-14
@pytest.mark.mpi
@pytest.mark.parametrize(
"num_processes, dims, nsources, ntargets, communicate_mpoles_via_allreduce", [
(4, 3, 10000, 10000, True),
(4, 3, 10000, 10000, False)
]
)
def test_against_single_rank(
num_processes, dims, nsources, ntargets, communicate_mpoles_via_allreduce):
pytest.importorskip("mpi4py")
from boxtree.tools import run_mpi
run_mpi(__file__, num_processes, {
"OMP_NUM_THREADS": 1,
"_SUMPY_TEST_NAME": "against_single_rank",
"_SUMPY_TEST_DIMS": dims,
"_SUMPY_TEST_NSOURCES": nsources,
"_SUMPY_TEST_NTARGETS": ntargets,
"_SUMPY_TEST_MPOLES_ALLREDUCE": communicate_mpoles_via_allreduce
})
# }}}
if __name__ == "__main__":
if "_SUMPY_TEST_NAME" in os.environ:
name = os.environ["_SUMPY_TEST_NAME"]
if name == "against_single_rank":
# Run "test_against_single_rank" test case
dims = int(os.environ["_SUMPY_TEST_DIMS"])
nsources = int(os.environ["_SUMPY_TEST_NSOURCES"])
ntargets = int(os.environ["_SUMPY_TEST_NTARGETS"])
communicate_mpoles_via_allreduce = (
os.environ["_SUMPY_TEST_MPOLES_ALLREDUCE"] == "True")
_test_against_single_rank(
dims, nsources, ntargets, np.float64,
communicate_mpoles_via_allreduce)
else:
raise ValueError(f"Invalid test name: {name!r}")
else:
# You can test individual routines by typing
# $ python test_distributed.py
# 'test_against_single_rank(4, 3, 10000, 10000, False)'
import sys
exec(sys.argv[1])
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
......@@ -22,107 +23,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from six.moves import range
import logging
import os
import sys
from functools import partial
import numpy as np
import numpy.linalg as la
import pyopencl as cl
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl as pytest_generate_tests)
from sumpy.kernel import LaplaceKernel, HelmholtzKernel, YukawaKernel
from sumpy.expansion.multipole import (
VolumeTaylorMultipoleExpansion,
H2DMultipoleExpansion, Y2DMultipoleExpansion,
LaplaceConformingVolumeTaylorMultipoleExpansion,
HelmholtzConformingVolumeTaylorMultipoleExpansion)
import pytest
from arraycontext import pytest_generate_tests_for_array_contexts
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
from sumpy.expansion.local import (
H2DLocalExpansion,
LinearPDEConformingVolumeTaylorLocalExpansion,
VolumeTaylorLocalExpansion,
H2DLocalExpansion, Y2DLocalExpansion,
LaplaceConformingVolumeTaylorLocalExpansion,
HelmholtzConformingVolumeTaylorLocalExpansion)
Y2DLocalExpansion,
)
from sumpy.expansion.multipole import (
H2DMultipoleExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion,
VolumeTaylorMultipoleExpansion,
Y2DMultipoleExpansion,
)
from sumpy.fmm import SumpyExpansionWrangler, SumpyTreeIndependentDataForWrangler
from sumpy.kernel import BiharmonicKernel, HelmholtzKernel, LaplaceKernel, YukawaKernel
import pytest
import logging
logger = logging.getLogger(__name__)
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
# {{{ test_sumpy_fmm
try:
import faulthandler
except ImportError:
pass
else:
faulthandler.enable()
@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class", [
(LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(LaplaceKernel(2), LaplaceConformingVolumeTaylorLocalExpansion,
LaplaceConformingVolumeTaylorMultipoleExpansion),
(LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(LaplaceKernel(3), LaplaceConformingVolumeTaylorLocalExpansion,
LaplaceConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), HelmholtzConformingVolumeTaylorLocalExpansion,
HelmholtzConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion),
(HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(HelmholtzKernel(3), HelmholtzConformingVolumeTaylorLocalExpansion,
HelmholtzConformingVolumeTaylorMultipoleExpansion),
(YukawaKernel(2), Y2DLocalExpansion, Y2DMultipoleExpansion),
@pytest.mark.parametrize(
("use_translation_classes", "use_fft", "fft_backend"), [
(False, False, None),
(True, False, None),
(True, True, "loopy"),
(True, True, "pyvkfft"),
])
def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
logging.basicConfig(level=logging.INFO)
@pytest.mark.parametrize(
("knl", "local_expn_class", "mpole_expn_class",
"order_varies_with_level"), [
(LaplaceKernel(2), VolumeTaylorLocalExpansion,
VolumeTaylorMultipoleExpansion, False),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(LaplaceKernel(3), VolumeTaylorLocalExpansion,
VolumeTaylorMultipoleExpansion, False),
(LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(2), VolumeTaylorLocalExpansion,
VolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion, False),
(HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion, True),
(HelmholtzKernel(3), VolumeTaylorLocalExpansion,
VolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(YukawaKernel(2), Y2DLocalExpansion, Y2DMultipoleExpansion,
False),
])
def test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class,
order_varies_with_level, use_translation_classes, use_fft,
fft_backend, visualize=False):
if fft_backend == "pyvkfft":
pytest.importorskip("pyvkfft")
if visualize:
logging.basicConfig(level=logging.INFO)
if local_expn_class == VolumeTaylorLocalExpansion and use_fft:
pytest.skip("VolumeTaylorExpansion with FFT takes a lot of resources.")
if local_expn_class in [H2DLocalExpansion, Y2DLocalExpansion] and use_fft:
pytest.skip("Fourier/Bessel based expansions with FFT is not supported yet.")
if use_fft:
from unittest.mock import patch
with patch.dict(os.environ, {"SUMPY_FFT_BACKEND": fft_backend}):
_test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class,
order_varies_with_level, use_translation_classes, use_fft,
fft_backend)
else:
_test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class,
order_varies_with_level, use_translation_classes, use_fft,
fft_backend)
def _test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class,
order_varies_with_level, use_translation_classes, use_fft,
fft_backend):
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
actx = actx_factory()
nsources = 1000
ntargets = 300
dtype = np.float64
from boxtree.tools import (
make_normal_particle_array as p_normal)
from boxtree.tools import make_normal_particle_array as p_normal
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
sources = p_normal(queue, nsources, knl.dim, dtype, seed=15)
if 1:
offset = np.zeros(knl.dim)
offset[0] = 0.1
targets = (
p_normal(queue, ntargets, knl.dim, dtype, seed=18)
+ offset)
targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18)
del offset
else:
from sumpy.visualization import FieldPlotter
fp = FieldPlotter(np.array([0.5, 0]), extent=3, npoints=200)
from pytools.obj_array import make_obj_array
targets = make_obj_array(
[fp.points[i] for i in range(knl.dim)])
targets = make_obj_array([fp.points[i] for i in range(knl.dim)])
from boxtree import TreeBuilder
tb = TreeBuilder(ctx)
tb = TreeBuilder(actx.context)
tree, _ = tb(queue, sources, targets=targets,
tree, _ = tb(actx.queue, sources, targets=targets,
max_particles_in_box=30, debug=True)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(ctx)
trav, _ = tbuild(queue, tree, debug=True)
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
# {{{ plot tree
if 0:
host_tree = tree.get()
host_trav = trav.get()
host_tree = tree.get(actx.queue)
host_trav = trav.get(actx.queue)
if 1:
print("src_box", host_tree.find_box_nr_for_source(403))
print("tgt_box", host_tree.find_box_nr_for_target(28))
print(list(host_trav.target_or_target_parent_boxes).index(37))
print(host_trav.get_box_list("sep_bigger", 22))
if 0:
logger.info("src_box: %s", host_tree.find_box_nr_for_source(403))
logger.info("tgt_box: %s", host_tree.find_box_nr_for_target(28))
logger.info("%s",
list(host_trav.target_or_target_parent_boxes).index(37))
logger.info("%s", host_trav.get_box_list("sep_bigger", 22))
from boxtree.visualization import TreePlotter
plotter = TreePlotter(host_tree)
......@@ -135,14 +179,11 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
# }}}
from pyopencl.clrandom import PhiloxGenerator
rng = PhiloxGenerator(ctx, seed=44)
weights = rng.uniform(queue, nsources, dtype=np.float64)
rng = np.random.default_rng(44)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
logger.info("computing direct (reference) result")
from pytools.convergence import PConvergenceVerifier
pconv_verifier = PConvergenceVerifier()
extra_kwargs = {}
......@@ -155,7 +196,7 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
if knl.dim == 3:
order_values = [1, 2]
elif knl.dim == 2 and issubclass(local_expn_class, H2DLocalExpansion):
order_values = [10, 12]
order_values = [4, 5]
elif isinstance(knl, YukawaKernel):
extra_kwargs["lam"] = 2
......@@ -166,175 +207,522 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
elif knl.dim == 2 and issubclass(local_expn_class, Y2DLocalExpansion):
order_values = [10, 12]
from functools import partial
for order in order_values:
out_kernels = [knl]
target_kernels = [knl]
if use_fft:
from sumpy.expansion.m2l import FFTM2LTranslationClassFactory
m2l_translation_factory = FFTM2LTranslationClassFactory()
else:
from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
m2l_translation_factory = NonFFTM2LTranslationClassFactory()
from sumpy.fmm import SumpyExpansionWranglerCodeContainer
wcc = SumpyExpansionWranglerCodeContainer(
ctx,
m2l_translation = m2l_translation_factory.get_m2l_translation_class(
knl, local_expn_class)()
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
out_kernels)
wrangler = wcc.get_wrangler(queue, tree, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order,
kernel_extra_kwargs=extra_kwargs)
partial(local_expn_class, knl, m2l_translation=m2l_translation),
target_kernels)
if order_varies_with_level:
def fmm_level_to_order(kernel, kernel_args, tree, lev):
return order + lev % 2 # noqa: B023
else:
def fmm_level_to_order(kernel, kernel_args, tree, lev):
return order # noqa: B023
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=fmm_level_to_order,
kernel_extra_kwargs=extra_kwargs,
_disable_translation_classes=not use_translation_classes)
from boxtree.fmm import drive_fmm
pot, = drive_fmm(trav, wrangler, weights)
pot, = drive_fmm(wrangler, (weights,))
from sumpy import P2P
p2p = P2P(ctx, out_kernels, exclude_self=False)
evt, (ref_pot,) = p2p(queue, targets, sources, (weights,),
p2p = P2P(actx.context, target_kernels, exclude_self=False)
_evt, (ref_pot,) = p2p(actx.queue, targets, sources, (weights,),
**extra_kwargs)
pot = pot.get()
ref_pot = ref_pot.get()
pot = actx.to_numpy(pot)
ref_pot = actx.to_numpy(ref_pot)
rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
logger.info("order %d -> relative l2 error: %g" % (order, rel_err))
logger.info("order %d -> relative l2 error: %g", order, rel_err)
pconv_verifier.add_data_point(order, rel_err)
print(pconv_verifier)
logger.info("\n%s", pconv_verifier)
pconv_verifier()
# }}}
# {{{ test_coeff_magnitude_rscale
@pytest.mark.parametrize("knl", [LaplaceKernel(2), BiharmonicKernel(2)])
def test_coeff_magnitude_rscale(actx_factory, knl):
"""Checks that the rscale used keeps the coefficient magnitude
difference small
"""
local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion
mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion
actx = actx_factory()
nsources = 1000
ntargets = 300
dtype = np.float64
from boxtree.tools import make_normal_particle_array as p_normal
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
offset = np.zeros(knl.dim)
offset[0] = 0.1
targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18)
from boxtree import TreeBuilder
tb = TreeBuilder(actx.context)
tree, _ = tb(actx.queue, sources, targets=targets,
max_particles_in_box=30, debug=True)
def test_sumpy_fmm_timing_data_collection(ctx_getter):
logging.basicConfig(level=logging.INFO)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
ctx = ctx_getter()
queue = cl.CommandQueue(
ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
rng = np.random.default_rng(31)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
extra_kwargs = {}
dtype = np.float64
order = 10
if isinstance(knl, HelmholtzKernel):
extra_kwargs["k"] = 0.05
dtype = np.complex128
elif isinstance(knl, YukawaKernel):
extra_kwargs["lam"] = 2
dtype = np.complex128
target_kernels = [knl]
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
target_kernels)
def fmm_level_to_order(kernel, kernel_args, tree, lev):
return order
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=fmm_level_to_order,
kernel_extra_kwargs=extra_kwargs)
weights = wrangler.reorder_sources(weights)
(weights,) = wrangler.distribute_source_weights((weights,), None)
local_result, _ = wrangler.form_locals(
trav.level_start_target_or_target_parent_box_nrs,
trav.target_or_target_parent_boxes,
trav.from_sep_bigger_starts,
trav.from_sep_bigger_lists,
(weights,))
result = actx.to_numpy(
actx.np.abs(wrangler.local_expansions_view(local_result, 5)[1][0])
)
result_ratio = np.max(result) / np.min(result)
assert result_ratio < 10**6, result_ratio
# }}}
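
# Why rscale matters here (a numpy-only sketch under a simplified model,
# not part of the test above): an order-k Taylor coefficient of a potential
# smooth on a box of size r behaves roughly like (r / rscale)**k, so
# rscale ~ r keeps all orders comparable, while rscale = 1 spreads them
# over many decades for small boxes.
def _rscale_spread_demo(r=0.01, order=8):
    unscaled = [r**k for k in range(order + 1)]       # rscale = 1
    scaled = [(r / r)**k for k in range(order + 1)]   # rscale = r
    # roughly (1e16, 1.0): rscaling collapses the magnitude spread
    return max(unscaled) / min(unscaled), max(scaled) / min(scaled)
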
# {{{ test_unified_single_and_double
def test_unified_single_and_double(actx_factory, visualize=False):
"""
Test that running a single FMM for the single-layer and double-layer
potentials together gives the same result as running a separate FMM for
each and summing the results at the end.
"""
if visualize:
logging.basicConfig(level=logging.INFO)
actx = actx_factory()
knl = LaplaceKernel(2)
local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion
mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion
nsources = 1000
ntargets = 300
dtype = np.float64
from boxtree.tools import make_normal_particle_array as p_normal
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
offset = np.zeros(knl.dim)
offset[0] = 0.1
targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18)
del offset
from boxtree import TreeBuilder
tb = TreeBuilder(actx.context)
tree, _ = tb(actx.queue, sources, targets=targets,
max_particles_in_box=30, debug=True)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
rng = np.random.default_rng(44)
weights = (
actx.from_numpy(rng.random(nsources, dtype=np.float64)),
actx.from_numpy(rng.random(nsources, dtype=np.float64))
)
logger.info("computing direct (reference) result")
dtype = np.float64
order = 3
from sumpy.kernel import AxisTargetDerivative, DirectionalSourceDerivative
deriv_knl = DirectionalSourceDerivative(knl, "dir_vec")
target_kernels = [knl, AxisTargetDerivative(0, knl)]
source_kernel_vecs = [[knl], [deriv_knl], [knl, deriv_knl]]
strength_usages = [[0], [1], [0, 1]]
alpha = np.linspace(0, 2*np.pi, nsources, dtype=np.float64)
dir_vec = actx.from_numpy(np.vstack([np.cos(alpha), np.sin(alpha)]))
results = []
for source_kernels, strength_usage in zip(
source_kernel_vecs, strength_usages, strict=True):
source_extra_kwargs = {}
if deriv_knl in source_kernels:
source_extra_kwargs["dir_vec"] = dir_vec
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
target_kernels=target_kernels, source_kernels=source_kernels,
strength_usage=strength_usage)
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order,
source_extra_kwargs=source_extra_kwargs)
from boxtree.fmm import drive_fmm
pot = drive_fmm(wrangler, weights)
results.append(np.array([actx.to_numpy(pot[0]), actx.to_numpy(pot[1])]))
ref_pot = results[0] + results[1]
pot = results[2]
rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
assert rel_err < 1e-12
# }}}
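
# The identity behind the test above, in plain numpy (a sketch): the
# potential is linear in the per-kernel strengths, so one pass with the
# stacked interaction matrix [A | B] equals the sum of two separate
# evaluations. a and b below are stand-ins for the single- and
# double-layer interaction matrices, not sumpy objects.
def _linearity_demo():
    rng = np.random.default_rng(0)
    a = rng.random((5, 7))
    b = rng.random((5, 7))
    sigma, mu = rng.random(7), rng.random(7)
    one_pass = np.hstack([a, b]) @ np.concatenate([sigma, mu])
    two_passes = a @ sigma + b @ mu
    assert np.allclose(one_pass, two_passes)
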
# {{{ test_sumpy_fmm_timing_data_collection
@pytest.mark.parametrize("use_fft", [True, False])
def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
import pyopencl as cl
from sumpy.array_context import PyOpenCLArrayContext
ctx = ctx_factory()
queue = cl.CommandQueue(ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
actx = PyOpenCLArrayContext(queue)
nsources = 500
dtype = np.float64
from boxtree.tools import (
make_normal_particle_array as p_normal)
from boxtree.tools import make_normal_particle_array as p_normal
knl = LaplaceKernel(2)
local_expn_class = VolumeTaylorLocalExpansion
mpole_expn_class = VolumeTaylorMultipoleExpansion
order = 1
sources = p_normal(queue, nsources, knl.dim, dtype, seed=15)
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
from boxtree import TreeBuilder
tb = TreeBuilder(ctx)
tb = TreeBuilder(actx.context)
tree, _ = tb(queue, sources,
max_particles_in_box=30, debug=True)
tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(ctx)
trav, _ = tbuild(queue, tree, debug=True)
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
rng = np.random.default_rng(44)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
from pyopencl.clrandom import PhiloxGenerator
rng = PhiloxGenerator(ctx)
weights = rng.uniform(queue, nsources, dtype=np.float64)
target_kernels = [knl]
out_kernels = [knl]
if use_fft:
from sumpy.expansion.m2l import FFTM2LTranslationClassFactory
m2l_translation_factory = FFTM2LTranslationClassFactory()
else:
from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
m2l_translation_factory = NonFFTM2LTranslationClassFactory()
from functools import partial
m2l_translation = m2l_translation_factory.get_m2l_translation_class(
knl, local_expn_class)()
from sumpy.fmm import SumpyExpansionWranglerCodeContainer
wcc = SumpyExpansionWranglerCodeContainer(
ctx,
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
out_kernels)
partial(local_expn_class, knl, m2l_translation=m2l_translation),
target_kernels)
wrangler = wcc.get_wrangler(queue, tree, dtype,
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order)
from boxtree.fmm import drive_fmm
timing_data = {}
pot, = drive_fmm(trav, wrangler, weights, timing_data=timing_data)
print(timing_data)
_pot, = drive_fmm(wrangler, (weights,), timing_data=timing_data)
logger.info("timing_data:\n%s", timing_data)
assert timing_data
def test_sumpy_fmm_exclude_self(ctx_getter):
logging.basicConfig(level=logging.INFO)
def test_sumpy_fmm_exclude_self(actx_factory, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
actx = actx_factory()
nsources = 500
dtype = np.float64
from boxtree.tools import (
make_normal_particle_array as p_normal)
from boxtree.tools import make_normal_particle_array as p_normal
knl = LaplaceKernel(2)
local_expn_class = VolumeTaylorLocalExpansion
mpole_expn_class = VolumeTaylorMultipoleExpansion
order = 10
sources = p_normal(queue, nsources, knl.dim, dtype, seed=15)
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
from boxtree import TreeBuilder
tb = TreeBuilder(ctx)
tb = TreeBuilder(actx.context)
tree, _ = tb(queue, sources,
max_particles_in_box=30, debug=True)
tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(ctx)
trav, _ = tbuild(queue, tree, debug=True)
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
from pyopencl.clrandom import PhiloxGenerator
rng = PhiloxGenerator(ctx)
weights = rng.uniform(queue, nsources, dtype=np.float64)
rng = np.random.default_rng(44)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
target_to_source = np.arange(tree.ntargets, dtype=np.int32)
target_to_source = actx.from_numpy(np.arange(tree.ntargets, dtype=np.int32))
self_extra_kwargs = {"target_to_source": target_to_source}
out_kernels = [knl]
from functools import partial
target_kernels = [knl]
from sumpy.fmm import SumpyExpansionWranglerCodeContainer
wcc = SumpyExpansionWranglerCodeContainer(
ctx,
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
out_kernels,
target_kernels,
exclude_self=True)
wrangler = wcc.get_wrangler(queue, tree, dtype,
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order,
self_extra_kwargs=self_extra_kwargs)
from boxtree.fmm import drive_fmm
pot, = drive_fmm(trav, wrangler, weights)
pot, = drive_fmm(wrangler, (weights,))
from sumpy import P2P
p2p = P2P(ctx, out_kernels, exclude_self=True)
evt, (ref_pot,) = p2p(queue, sources, sources, (weights,),
p2p = P2P(actx.context, target_kernels, exclude_self=True)
_evt, (ref_pot,) = p2p(actx.queue, sources, sources, (weights,),
**self_extra_kwargs)
pot = pot.get()
ref_pot = ref_pot.get()
pot = actx.to_numpy(pot)
ref_pot = actx.to_numpy(ref_pot)
rel_err = la.norm(pot - ref_pot) / la.norm(ref_pot)
logger.info("order %d -> relative l2 error: %g" % (order, rel_err))
logger.info("order %d -> relative l2 error: %g", order, rel_err)
assert np.isclose(rel_err, 0, atol=1e-7)
# }}}
# {{{ test_sumpy_axis_source_derivative
def test_sumpy_axis_source_derivative(actx_factory, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
actx = actx_factory()
nsources = 500
dtype = np.float64
from boxtree.tools import make_normal_particle_array as p_normal
knl = LaplaceKernel(2)
local_expn_class = VolumeTaylorLocalExpansion
mpole_expn_class = VolumeTaylorMultipoleExpansion
order = 10
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
from boxtree import TreeBuilder
tb = TreeBuilder(actx.context)
tree, _ = tb(actx.queue, sources,
max_particles_in_box=30, debug=True)
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
rng = np.random.default_rng(12)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
target_to_source = actx.from_numpy(np.arange(tree.ntargets, dtype=np.int32))
self_extra_kwargs = {"target_to_source": target_to_source}
from sumpy.kernel import AxisSourceDerivative, AxisTargetDerivative
pots = []
for tgt_knl, src_knl in [
(AxisTargetDerivative(0, knl), knl),
(knl, AxisSourceDerivative(0, knl))]:
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
target_kernels=[tgt_knl],
source_kernels=[src_knl],
exclude_self=True)
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order,
self_extra_kwargs=self_extra_kwargs)
from boxtree.fmm import drive_fmm
pot, = drive_fmm(wrangler, (weights,))
pots.append(actx.to_numpy(pot))
rel_err = la.norm(pots[0] + pots[1]) / la.norm(pots[0])
logger.info("order %d -> relative l2 error: %g", order, rel_err)
assert np.isclose(rel_err, 0, atol=1e-5)
# }}}
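
# The cancellation checked above rests on the kernel identity
# d/dx_tgt G(x - y) = -d/dy_src G(x - y). A finite-difference check of
# that identity in 1D (a sketch; the log kernel stands in for the 2D
# Laplace kernel):
def _derivative_antisymmetry_demo():
    def g(x, y):
        return -np.log(abs(x - y))

    x, y, h = 0.7, 0.2, 1.0e-6
    d_tgt = (g(x + h, y) - g(x - h, y)) / (2*h)
    d_src = (g(x, y + h) - g(x, y - h)) / (2*h)
    assert np.isclose(d_tgt, -d_src)
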
# {{{ test_sumpy_target_point_multiplier
@pytest.mark.parametrize("deriv_axes", [(), (0,), (1,)])
def test_sumpy_target_point_multiplier(actx_factory, deriv_axes, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
actx = actx_factory()
nsources = 500
dtype = np.float64
from boxtree.tools import make_normal_particle_array as p_normal
knl = LaplaceKernel(2)
local_expn_class = VolumeTaylorLocalExpansion
mpole_expn_class = VolumeTaylorMultipoleExpansion
order = 5
sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15)
from boxtree import TreeBuilder
tb = TreeBuilder(actx.context)
tree, _ = tb(actx.queue, sources,
max_particles_in_box=30, debug=True)
# You can test individual routines by typing
# $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context)'
from boxtree.traversal import FMMTraversalBuilder
tbuild = FMMTraversalBuilder(actx.context)
trav, _ = tbuild(actx.queue, tree, debug=True)
rng = np.random.default_rng(12)
weights = actx.from_numpy(rng.random(nsources, dtype=np.float64))
target_to_source = actx.from_numpy(np.arange(tree.ntargets, dtype=np.int32))
self_extra_kwargs = {"target_to_source": target_to_source}
from sumpy.kernel import AxisTargetDerivative, TargetPointMultiplier
tgt_knls = [TargetPointMultiplier(0, knl), knl, knl]
for axis in deriv_axes:
tgt_knls[0] = AxisTargetDerivative(axis, tgt_knls[0])
tgt_knls[1] = AxisTargetDerivative(axis, tgt_knls[1])
tree_indep = SumpyTreeIndependentDataForWrangler(
actx.context,
partial(mpole_expn_class, knl),
partial(local_expn_class, knl),
target_kernels=tgt_knls,
source_kernels=[knl],
exclude_self=True)
wrangler = SumpyExpansionWrangler(tree_indep, trav, dtype,
fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order,
self_extra_kwargs=self_extra_kwargs)
from boxtree.fmm import drive_fmm
pot0, pot1, pot2 = drive_fmm(wrangler, (weights,))
pot0, pot1, pot2 = actx.to_numpy(pot0), actx.to_numpy(pot1), actx.to_numpy(pot2)
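# The reference below is the product rule applied to T = x0 * G:
#   d/dx0 (x0 * G) = x0 * dG/dx0 + G  -> pot1 * x0 + pot2,
# while for no derivative or a d/dx1 derivative the extra G term does
# not appear, leaving just pot1 * x0.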
if deriv_axes == (0,):
ref_pot = pot1 * actx.to_numpy(sources[0]) + pot2
else:
ref_pot = pot1 * actx.to_numpy(sources[0])
rel_err = la.norm(pot0 - ref_pot) / la.norm(ref_pot)
logger.info("order %d -> relative l2 error: %g", order, rel_err)
assert np.isclose(rel_err, 0, atol=1e-5)
# }}}
"""
You can test individual routines by typing
$ python test/test_fmm.py 'test_sumpy_fmm(_acf, LaplaceKernel(2),
VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
order_varies_with_level=False, use_translation_classes=True, use_fft=True,
fft_backend="pyvkfft", visualize=True)'
"""
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -22,71 +23,88 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
from six.moves import range
import logging
import sys
from functools import partial
import numpy as np
import numpy.linalg as la
import sys
import pytest
import pyopencl as cl
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl as pytest_generate_tests)
from sumpy.expansion.multipole import (
VolumeTaylorMultipoleExpansion, H2DMultipoleExpansion,
VolumeTaylorMultipoleExpansionBase,
LaplaceConformingVolumeTaylorMultipoleExpansion,
HelmholtzConformingVolumeTaylorMultipoleExpansion)
from sumpy.expansion.local import (
VolumeTaylorLocalExpansion, H2DLocalExpansion,
LaplaceConformingVolumeTaylorLocalExpansion,
HelmholtzConformingVolumeTaylorLocalExpansion)
from sumpy.kernel import (LaplaceKernel, HelmholtzKernel, AxisTargetDerivative,
DirectionalSourceDerivative)
from arraycontext import pytest_generate_tests_for_array_contexts
from pytools.convergence import PConvergenceVerifier
from pytools.obj_array import make_obj_array
import sumpy.symbolic as sym
import sumpy.toys as t
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
from sumpy.expansion.local import (
H2DLocalExpansion,
LinearPDEConformingVolumeTaylorLocalExpansion,
VolumeTaylorLocalExpansion,
)
from sumpy.expansion.m2l import (
FFTM2LTranslationClassFactory,
NonFFTM2LTranslationClassFactory,
)
from sumpy.expansion.multipole import (
H2DMultipoleExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion,
VolumeTaylorMultipoleExpansion,
VolumeTaylorMultipoleExpansionBase,
)
from sumpy.kernel import (
AxisTargetDerivative,
BiharmonicKernel,
DirectionalSourceDerivative,
HelmholtzKernel,
LaplaceKernel,
StokesletKernel,
)
import logging
logger = logging.getLogger(__name__)
try:
import faulthandler
except ImportError:
pass
else:
faulthandler.enable()
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
# {{{ test_p2p
@pytest.mark.parametrize("exclude_self", (True, False))
def test_p2p(ctx_getter, exclude_self):
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
def test_p2p(actx_factory, exclude_self):
actx = actx_factory()
dimensions = 3
n = 5000
from sumpy.p2p import P2P
lknl = LaplaceKernel(dimensions)
knl = P2P(ctx,
knl = P2P(actx.context,
[lknl, AxisTargetDerivative(0, lknl)],
exclude_self=exclude_self)
targets = np.random.rand(dimensions, n)
sources = targets if exclude_self else np.random.rand(dimensions, n)
rng = np.random.default_rng(42)
targets = rng.random(size=(dimensions, n))
sources = targets if exclude_self else rng.random(size=(dimensions, n))
strengths = np.ones(n, dtype=np.float64)
extra_kwargs = {}
if exclude_self:
extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
extra_kwargs["target_to_source"] = (
actx.from_numpy(np.arange(n, dtype=np.int32)))
evt, (potential, x_derivative) = knl(
queue, targets, sources, [strengths],
out_host=True, **extra_kwargs)
_evt, (potential, _x_derivative) = knl(
actx.queue,
actx.from_numpy(targets),
actx.from_numpy(sources),
[actx.from_numpy(strengths)],
**extra_kwargs)
potential = actx.to_numpy(potential)
potential_ref = np.empty_like(potential)
targets = targets.T
sources = sources.T
for itarg in range(n):
@@ -103,30 +121,153 @@ def test_p2p(ctx_getter, exclude_self):
potential_ref *= 1/(4*np.pi)
rel_err = la.norm(potential - potential_ref)/la.norm(potential_ref)
print(rel_err)
logger.info("error: %.12e", rel_err)
assert rel_err < 1e-3
# }}}
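
# What the (elided) reference loop above computes, as a standalone numpy
# sketch (assuming the 3D Laplace kernel and targets disjoint from the
# sources):
#     pot(x_i) = 1/(4*pi) * sum_j strength_j / |x_i - y_j|
def _direct_laplace_3d(targets, sources, strengths):
    # targets: (n, 3), sources: (m, 3), strengths: (m,)
    pot = np.empty(len(targets))
    for i, tgt in enumerate(targets):
        pot[i] = np.sum(strengths / la.norm(tgt - sources, axis=1))
    return pot / (4*np.pi)
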
# {{{ test_p2e_multiple
@pytest.mark.parametrize(("base_knl", "expn_class"), [
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorMultipoleExpansion),
])
def test_p2e_multiple(actx_factory, base_knl, expn_class):
order = 4
actx = actx_factory()
nsources = 100
extra_kwargs = {}
if isinstance(base_knl, HelmholtzKernel):
if base_knl.allow_evanescent:
extra_kwargs["k"] = 0.2 * (0.707 + 0.707j)
else:
extra_kwargs["k"] = 0.2
if isinstance(base_knl, StokesletKernel):
extra_kwargs["mu"] = 0.2
source_kernels = [
DirectionalSourceDerivative(base_knl, "dir_vec"),
base_knl,
]
knl = base_knl
expn = expn_class(knl, order=order)
from sumpy import P2EFromSingleBox
rng = np.random.default_rng(14)
center = np.array([2, 1, 0][:knl.dim], np.float64)
sources = actx.from_numpy(
0.7 * (-0.5 + rng.random(size=(knl.dim, nsources), dtype=np.float64))
+ center[:, np.newaxis])
strengths = [
actx.from_numpy(np.ones(nsources, dtype=np.float64) * (1/nsources)),
actx.from_numpy(np.ones(nsources, dtype=np.float64) * (2/nsources))
]
source_boxes = actx.from_numpy(np.array([0], dtype=np.int32))
box_source_starts = actx.from_numpy(np.array([0], dtype=np.int32))
box_source_counts_nonchild = (
actx.from_numpy(np.array([nsources], dtype=np.int32)))
alpha = np.linspace(0, 2*np.pi, nsources, dtype=np.float64)
dir_vec = actx.from_numpy(np.vstack([np.cos(alpha), np.sin(alpha)]))
from sumpy.expansion.local import LocalExpansionBase
if issubclass(expn_class, LocalExpansionBase):
loc_center = np.array([5.5, 0.0, 0.0][:knl.dim]) + center
centers = np.array(loc_center, dtype=np.float64).reshape(knl.dim, 1)
else:
centers = (np.array([0.0, 0.0, 0.0][:knl.dim],
dtype=np.float64).reshape(knl.dim, 1)
+ center[:, np.newaxis])
centers = actx.from_numpy(centers)
rscale = 0.5 # pick something non-1
# apply p2e at the same time
p2e = P2EFromSingleBox(actx.context, expn,
kernels=source_kernels,
strength_usage=[0, 1])
_evt, (mpoles,) = p2e(actx.queue,
source_boxes=source_boxes,
box_source_starts=box_source_starts,
box_source_counts_nonchild=box_source_counts_nonchild,
centers=centers,
sources=sources,
strengths=strengths,
nboxes=1,
tgt_base_ibox=0,
rscale=rscale,
dir_vec=dir_vec,
**extra_kwargs)
actual_result = actx.to_numpy(mpoles)
# apply p2e separately
expected_result = np.zeros_like(actual_result)
for i, source_kernel in enumerate(source_kernels):
extra_source_kwargs = extra_kwargs.copy()
if isinstance(source_kernel, DirectionalSourceDerivative):
extra_source_kwargs["dir_vec"] = dir_vec
p2e = P2EFromSingleBox(actx.context, expn,
kernels=[source_kernel], strength_usage=[i])
_evt, (mpoles,) = p2e(actx.queue,
source_boxes=source_boxes,
box_source_starts=box_source_starts,
box_source_counts_nonchild=box_source_counts_nonchild,
centers=centers,
sources=sources,
strengths=strengths,
nboxes=1,
tgt_base_ibox=0,
rscale=rscale,
**extra_source_kwargs)
mpoles = actx.to_numpy(mpoles)
expected_result += mpoles
norm = la.norm(actual_result - expected_result)/la.norm(expected_result)
assert norm < 1e-12
# }}}
# {{{ test_p2e2p
@pytest.mark.parametrize("order", [4])
@pytest.mark.parametrize(("base_knl", "expn_class"), [
(LaplaceKernel(2), VolumeTaylorLocalExpansion),
(LaplaceKernel(2), VolumeTaylorMultipoleExpansion),
(LaplaceKernel(2), LaplaceConformingVolumeTaylorLocalExpansion),
(LaplaceKernel(2), LaplaceConformingVolumeTaylorMultipoleExpansion),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), VolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), VolumeTaylorLocalExpansion),
(HelmholtzKernel(2), HelmholtzConformingVolumeTaylorLocalExpansion),
(HelmholtzKernel(2), HelmholtzConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion),
(HelmholtzKernel(2), LinearPDEConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), H2DLocalExpansion),
(HelmholtzKernel(2), H2DMultipoleExpansion),
(DirectionalSourceDerivative(BiharmonicKernel(2), "dir_vec"),
VolumeTaylorMultipoleExpansion),
(DirectionalSourceDerivative(BiharmonicKernel(2), "dir_vec"),
VolumeTaylorLocalExpansion),
(HelmholtzKernel(2, allow_evanescent=True), VolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2, allow_evanescent=True), VolumeTaylorLocalExpansion),
(HelmholtzKernel(2, allow_evanescent=True),
HelmholtzConformingVolumeTaylorLocalExpansion),
LinearPDEConformingVolumeTaylorLocalExpansion),
(HelmholtzKernel(2, allow_evanescent=True),
HelmholtzConformingVolumeTaylorMultipoleExpansion),
LinearPDEConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2, allow_evanescent=True), H2DLocalExpansion),
(HelmholtzKernel(2, allow_evanescent=True), H2DMultipoleExpansion),
])
@@ -134,17 +275,8 @@ def test_p2p(ctx_getter, exclude_self):
False,
True
])
# Sample: test_p2e2p(cl._csc, LaplaceKernel(2), VolumeTaylorLocalExpansion, 4, False)
def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
#logging.basicConfig(level=logging.INFO)
from sympy.core.cache import clear_cache
clear_cache()
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
np.random.seed(17)
def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative):
actx = actx_factory()
res = 100
nsources = 100
@@ -155,22 +287,24 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
extra_kwargs["k"] = 0.2 * (0.707 + 0.707j)
else:
extra_kwargs["k"] = 0.2
if isinstance(base_knl, StokesletKernel):
extra_kwargs["mu"] = 0.2
if with_source_derivative:
knl = DirectionalSourceDerivative(base_knl, "dir_vec")
else:
knl = base_knl
out_kernels = [
target_kernels = [
knl,
AxisTargetDerivative(0, knl),
]
expn = expn_class(knl, order=order)
from sumpy import P2EFromSingleBox, E2PFromSingleBox, P2P
p2e = P2EFromSingleBox(ctx, expn, out_kernels)
e2p = E2PFromSingleBox(ctx, expn, out_kernels)
p2p = P2P(ctx, out_kernels, exclude_self=False)
from sumpy import P2P, E2PFromSingleBox, P2EFromSingleBox
p2e = P2EFromSingleBox(actx.context, expn, kernels=[knl])
e2p = E2PFromSingleBox(actx.context, expn, kernels=target_kernels)
p2p = P2P(actx.context, target_kernels, exclude_self=False)
from pytools.convergence import EOCRecorder
eoc_rec_pot = EOCRecorder()
@@ -182,67 +316,70 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
else:
h_values = [1/2, 1/3, 1/5]
center = np.array([2, 1], np.float64)
sources = (0.7*(-0.5+np.random.rand(knl.dim, nsources).astype(np.float64))
+ center[:, np.newaxis])
rng = np.random.default_rng(19)
center = np.array([2, 1, 0][:knl.dim], np.float64)
sources = actx.from_numpy(
0.7 * (-0.5 + rng.random((knl.dim, nsources), dtype=np.float64))
+ center[:, np.newaxis])
strengths = np.ones(nsources, dtype=np.float64) * (1/nsources)
strengths = actx.from_numpy(np.ones(nsources, dtype=np.float64) / nsources)
source_boxes = np.array([0], dtype=np.int32)
box_source_starts = np.array([0], dtype=np.int32)
box_source_counts_nonchild = np.array([nsources], dtype=np.int32)
source_boxes = actx.from_numpy(np.array([0], dtype=np.int32))
box_source_starts = actx.from_numpy(np.array([0], dtype=np.int32))
box_source_counts_nonchild = (
actx.from_numpy(np.array([nsources], dtype=np.int32)))
extra_source_kwargs = extra_kwargs.copy()
if with_source_derivative:
if isinstance(knl, DirectionalSourceDerivative):
alpha = np.linspace(0, 2*np.pi, nsources, dtype=np.float64)
dir_vec = np.vstack([np.cos(alpha), np.sin(alpha)])
extra_source_kwargs["dir_vec"] = dir_vec
extra_source_kwargs["dir_vec"] = actx.from_numpy(dir_vec)
from sumpy.visualization import FieldPlotter
for h in h_values:
if issubclass(expn_class, LocalExpansionBase):
loc_center = np.array([5.5, 0.0]) + center
loc_center = np.array([5.5, 0.0, 0.0][:knl.dim]) + center
centers = np.array(loc_center, dtype=np.float64).reshape(knl.dim, 1)
fp = FieldPlotter(loc_center, extent=h, npoints=res)
else:
eval_center = np.array([1/h, 0.0]) + center
eval_center = np.array([1/h, 0.0, 0.0][:knl.dim]) + center
fp = FieldPlotter(eval_center, extent=0.1, npoints=res)
centers = (
np.array([0.0, 0.0], dtype=np.float64).reshape(knl.dim, 1)
+ center[:, np.newaxis])
centers = (np.array([0.0, 0.0, 0.0][:knl.dim],
dtype=np.float64).reshape(knl.dim, 1)
+ center[:, np.newaxis])
targets = fp.points
centers = actx.from_numpy(centers)
targets = actx.from_numpy(make_obj_array(fp.points))
rscale = 0.5 # pick something non-1
# {{{ apply p2e
evt, (mpoles,) = p2e(queue,
_evt, (mpoles,) = p2e(actx.queue,
source_boxes=source_boxes,
box_source_starts=box_source_starts,
box_source_counts_nonchild=box_source_counts_nonchild,
centers=centers,
sources=sources,
strengths=strengths,
strengths=(strengths,),
nboxes=1,
tgt_base_ibox=0,
rscale=rscale,
#flags="print_hl_cl",
out_host=True, **extra_source_kwargs)
**extra_source_kwargs)
# }}}
# {{{ apply e2p
ntargets = targets.shape[-1]
ntargets = fp.points.shape[-1]
box_target_starts = np.array([0], dtype=np.int32)
box_target_counts_nonchild = np.array([ntargets], dtype=np.int32)
box_target_starts = actx.from_numpy(np.array([0], dtype=np.int32))
box_target_counts_nonchild = (
actx.from_numpy(np.array([ntargets], dtype=np.int32)))
evt, (pot, grad_x, ) = e2p(
queue,
_evt, (pot, grad_x, ) = e2p(
actx.queue,
src_expansions=mpoles,
src_base_ibox=0,
target_boxes=source_boxes,
@@ -251,19 +388,20 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
centers=centers,
targets=targets,
rscale=rscale,
#flags="print_hl_cl",
out_host=True, **extra_kwargs)
**extra_kwargs)
pot = actx.to_numpy(pot)
grad_x = actx.to_numpy(grad_x)
# }}}
# {{{ compute (direct) reference solution
evt, (pot_direct, grad_x_direct, ) = p2p(
queue,
_evt, (pot_direct, grad_x_direct, ) = p2p(
actx.queue,
targets, sources, (strengths,),
out_host=True,
**extra_source_kwargs)
pot_direct = actx.to_numpy(pot_direct)
grad_x_direct = actx.to_numpy(grad_x_direct)
err_pot = la.norm((pot - pot_direct)/res**2)
err_grad_x = la.norm((grad_x - grad_x_direct)/res**2)
@@ -297,11 +435,11 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
eoc_rec_pot.add_data_point(h, err_pot)
eoc_rec_grad_x.add_data_point(h, err_grad_x)
print(expn_class, knl, order)
print("POTENTIAL:")
print(eoc_rec_pot)
print("X TARGET DERIVATIVE:")
print(eoc_rec_grad_x)
logger.info("expn_cls %s knl %s order %d", expn_class, knl, order)
logger.info("POTENTIAL:")
logger.info("%s", eoc_rec_pot)
logger.info("X TARGET DERIVATIVE:")
logger.info("%s", eoc_rec_grad_x)
tgt_order = order + 1
if issubclass(expn_class, LocalExpansionBase):
@@ -322,6 +460,10 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
slack += 1
grad_slack += 2
if isinstance(base_knl, DirectionalSourceDerivative):
slack += 1
grad_slack += 2
if isinstance(base_knl, HelmholtzKernel):
if base_knl.allow_evanescent:
slack += 0.5
@@ -334,40 +476,60 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
assert eoc_rec_pot.order_estimate() > tgt_order - slack
assert eoc_rec_grad_x.order_estimate() > tgt_order_grad - grad_slack
@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class", [
(LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(LaplaceKernel(2), LaplaceConformingVolumeTaylorLocalExpansion,
LaplaceConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), HelmholtzConformingVolumeTaylorLocalExpansion,
HelmholtzConformingVolumeTaylorMultipoleExpansion),
(HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion)
# }}}
# {{{ test_translations
@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class, use_fft", [
(LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
False),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, True),
(LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
False),
(LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, True),
(HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
False),
(HelmholtzKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
False),
(HelmholtzKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
(HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion, False),
(StokesletKernel(2, 0, 0), VolumeTaylorLocalExpansion,
VolumeTaylorMultipoleExpansion, False),
(StokesletKernel(2, 0, 0), LinearPDEConformingVolumeTaylorLocalExpansion,
LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
])
def test_translations(ctx_getter, knl, local_expn_class, mpole_expn_class):
logging.basicConfig(level=logging.INFO)
def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class,
use_fft, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
from sympy.core.cache import clear_cache
clear_cache()
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
np.random.seed(17)
actx = actx_factory()
res = 20
nsources = 15
out_kernels = [knl]
extra_kwargs = {}
if isinstance(knl, HelmholtzKernel):
extra_kwargs["k"] = 0.05
if isinstance(knl, StokesletKernel):
extra_kwargs["mu"] = 0.05
# Just to make sure things also work away from the origin
origin = np.array([2, 1], np.float64)
sources = (0.7*(-0.5+np.random.rand(knl.dim, nsources).astype(np.float64))
+ origin[:, np.newaxis])
rng = np.random.default_rng(18)
origin = np.array([2, 1, 0][:knl.dim], np.float64)
sources = (
0.7 * (-0.5 + rng.random((knl.dim, nsources), dtype=np.float64))
+ origin[:, np.newaxis])
strengths = np.ones(nsources, dtype=np.float64) * (1/nsources)
pconv_verifier_p2m2p = PConvergenceVerifier()
@@ -377,18 +539,20 @@ def test_translations(ctx_getter, knl, local_expn_class, mpole_expn_class):
from sumpy.visualization import FieldPlotter
eval_offset = np.array([5.5, 0.0])
eval_offset = np.array([5.5, 0.0, 0][:knl.dim])
fp = FieldPlotter(eval_offset + origin, extent=0.3, npoints=res)
targets = fp.points
centers = (np.array(
[
# box 0: particles, first mpole here
[0, 0],
[0, 0, 0][:knl.dim],
# box 1: second mpole here
np.array([-0.2, 0.1], np.float64),
np.array([-0.2, 0.1, 0][:knl.dim], np.float64),
# box 2: first local here
eval_offset + np.array([0.3, -0.2], np.float64),
eval_offset + np.array([0.3, -0.2, 0][:knl.dim], np.float64),
# box 3: second local and eval here
eval_offset
@@ -397,228 +561,287 @@ def test_translations(ctx_getter, knl, local_expn_class, mpole_expn_class):
del eval_offset
from sumpy.expansion import VolumeTaylorExpansionBase
if isinstance(knl, HelmholtzKernel) and \
issubclass(local_expn_class, VolumeTaylorExpansionBase):
# FIXME: Embarrassing--but we run out of memory for higher orders.
orders = [2, 3]
else:
if knl.dim == 2: # noqa: SIM108
orders = [2, 3, 4]
nboxes = centers.shape[-1]
def eval_at(e2p, source_box_nr, rscale):
e2p_target_boxes = np.array([source_box_nr], dtype=np.int32)
# These are indexed by global box numbers.
e2p_box_target_starts = np.array([0, 0, 0, 0], dtype=np.int32)
e2p_box_target_counts_nonchild = np.array([0, 0, 0, 0],
dtype=np.int32)
e2p_box_target_counts_nonchild[source_box_nr] = ntargets
evt, (pot,) = e2p(
queue,
src_expansions=mpoles,
src_base_ibox=0,
target_boxes=e2p_target_boxes,
box_target_starts=e2p_box_target_starts,
box_target_counts_nonchild=e2p_box_target_counts_nonchild,
centers=centers,
targets=targets,
rscale=rscale,
out_host=True, **extra_kwargs
)
else:
orders = [3, 4, 5]
return pot
if use_fft:
m2l_factory = FFTM2LTranslationClassFactory()
else:
m2l_factory = NonFFTM2LTranslationClassFactory()
m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)()
toy_ctx = t.ToyContext(
actx.context,
kernel=knl,
local_expn_class=partial(local_expn_class,
m2l_translation=m2l_translation),
mpole_expn_class=mpole_expn_class,
extra_kernel_kwargs=extra_kwargs,
)
p = t.PointSources(toy_ctx, sources, weights=strengths)
p2p = p.eval(targets)
m1_rscale = 0.5
m2_rscale = 0.25
l1_rscale = 0.5
l2_rscale = 0.25
for order in orders:
m_expn = mpole_expn_class(knl, order=order)
l_expn = local_expn_class(knl, order=order)
print(centers[:, 0].shape)
p2m = t.multipole_expand(p, centers[:, 0], order, m1_rscale)
p2m2p = p2m.eval(targets)
err = la.norm((p2m2p - p2p) / res**2)
err = err / (la.norm(p2p) / res**2)
pconv_verifier_p2m2p.add_data_point(order, err)
from sumpy import P2EFromSingleBox, E2PFromSingleBox, P2P, E2EFromCSR
p2m = P2EFromSingleBox(ctx, m_expn)
m2m = E2EFromCSR(ctx, m_expn, m_expn)
m2p = E2PFromSingleBox(ctx, m_expn, out_kernels)
m2l = E2EFromCSR(ctx, m_expn, l_expn)
l2l = E2EFromCSR(ctx, l_expn, l_expn)
l2p = E2PFromSingleBox(ctx, l_expn, out_kernels)
p2p = P2P(ctx, out_kernels, exclude_self=False)
p2m2m = t.multipole_expand(p2m, centers[:, 1], order, m2_rscale)
p2m2m2p = p2m2m.eval(targets)
err = la.norm((p2m2m2p - p2p)/res**2)
err = err / (la.norm(p2p) / res**2)
pconv_verifier_p2m2m2p.add_data_point(order, err)
fp = FieldPlotter(centers[:, -1], extent=0.3, npoints=res)
targets = fp.points
p2m2m2l = t.local_expand(p2m2m, centers[:, 2], order, l1_rscale)
p2m2m2l2p = p2m2m2l.eval(targets)
err = la.norm((p2m2m2l2p - p2p)/res**2)
err = err / (la.norm(p2p) / res**2)
pconv_verifier_p2m2m2l2p.add_data_point(order, err)
# {{{ compute (direct) reference solution
p2m2m2l2l = t.local_expand(p2m2m2l, centers[:, 3], order, l2_rscale)
p2m2m2l2l2p = p2m2m2l2l.eval(targets)
err = la.norm((p2m2m2l2l2p - p2p)/res**2)
err = err / (la.norm(p2p) / res**2)
pconv_verifier_full.add_data_point(order, err)
evt, (pot_direct,) = p2p(
queue,
targets, sources, (strengths,),
out_host=True, **extra_kwargs)
for name, verifier in [
("p2m2p", pconv_verifier_p2m2p),
("p2m2m2p", pconv_verifier_p2m2m2p),
("p2m2m2l2p", pconv_verifier_p2m2m2l2p),
("full", pconv_verifier_full),
]:
logger.info(30*"-")
logger.info("name: %s", name)
logger.info(30*"-")
logger.info("result: %s", verifier)
logger.info(30*"-")
# }}}
verifier()
m1_rscale = 0.5
m2_rscale = 0.25
l1_rscale = 0.5
l2_rscale = 0.25
# }}}
# {{{ apply P2M
p2m_source_boxes = np.array([0], dtype=np.int32)
# {{{ test_m2m_and_l2l_exprs_simpler
# These are indexed by global box numbers.
p2m_box_source_starts = np.array([0, 0, 0, 0], dtype=np.int32)
p2m_box_source_counts_nonchild = np.array([nsources, 0, 0, 0],
dtype=np.int32)
@pytest.mark.parametrize("order", [4])
@pytest.mark.parametrize(("base_knl", "local_expn_class", "mpole_expn_class"), [
(LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
])
@pytest.mark.parametrize("with_source_derivative", [
False,
True
])
def test_m2m_and_l2l_exprs_simpler(base_knl, local_expn_class, mpole_expn_class,
order, with_source_derivative):
extra_kwargs = {}
if isinstance(base_knl, HelmholtzKernel):
if base_knl.allow_evanescent:
extra_kwargs["k"] = 0.2 * (0.707 + 0.707j)
else:
extra_kwargs["k"] = 0.2
if isinstance(base_knl, StokesletKernel):
extra_kwargs["mu"] = 0.2
evt, (mpoles,) = p2m(queue,
source_boxes=p2m_source_boxes,
box_source_starts=p2m_box_source_starts,
box_source_counts_nonchild=p2m_box_source_counts_nonchild,
centers=centers,
sources=sources,
strengths=strengths,
nboxes=nboxes,
rscale=m1_rscale,
if with_source_derivative:
knl = DirectionalSourceDerivative(base_knl, "dir_vec")
else:
knl = base_knl
tgt_base_ibox=0,
mpole_expn = mpole_expn_class(knl, order=order)
local_expn = local_expn_class(knl, order=order)
#flags="print_hl_wrapper",
out_host=True, **extra_kwargs)
dvec = sym.make_sym_vector("d", knl.dim)
src_coeff_exprs = [sym.Symbol(f"src_coeff{i}") for i in range(len(mpole_expn))]
# }}}
src_rscale = 3
tgt_rscale = 2
ntargets = targets.shape[-1]
faster_m2m = mpole_expn.translate_from(mpole_expn, src_coeff_exprs, src_rscale,
dvec, tgt_rscale)
slower_m2m = mpole_expn.translate_from(mpole_expn, src_coeff_exprs, src_rscale,
dvec, tgt_rscale, _fast_version=False)
pot = eval_at(m2p, 0, m1_rscale)
for expr1, expr2 in zip(faster_m2m, slower_m2m, strict=True):
assert float(sym.doit(expr1 - expr2).expand()) == 0.0
err = la.norm((pot - pot_direct)/res**2)
err = err / (la.norm(pot_direct) / res**2)
faster_l2l = local_expn.translate_from(local_expn, src_coeff_exprs, src_rscale,
dvec, tgt_rscale)
slower_l2l = local_expn.translate_from(local_expn, src_coeff_exprs, src_rscale,
dvec, tgt_rscale, _fast_version=False)
for expr1, expr2 in zip(faster_l2l, slower_l2l, strict=True):
assert float(sym.doit(expr1 - expr2).expand()) == 0.0
pconv_verifier_p2m2p.add_data_point(order, err)
# }}}
# {{{ apply M2M
m2m_target_boxes = np.array([1], dtype=np.int32)
m2m_src_box_starts = np.array([0, 1], dtype=np.int32)
m2m_src_box_lists = np.array([0], dtype=np.int32)
# {{{ test toeplitz
evt, (mpoles,) = m2m(queue,
src_expansions=mpoles,
src_base_ibox=0,
tgt_base_ibox=0,
ntgt_level_boxes=mpoles.shape[0],
def _m2l_translate_simple(tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
dvec, tgt_rscale):
if not tgt_expansion.use_rscale:
src_rscale = 1
tgt_rscale = 1
target_boxes=m2m_target_boxes,
from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansionBase
if not isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase):
return 1
src_box_starts=m2m_src_box_starts,
src_box_lists=m2m_src_box_lists,
centers=centers,
# We know the general form of the multipole expansion is:
#
# coeff0 * diff(kernel, mi0) + coeff1 * diff(kernel, mi1) + ...
#
# To get the local expansion coefficients, we take derivatives of
# the multipole expansion.
taker = src_expansion.kernel.get_derivative_taker(dvec, src_rscale, sac=None)
src_rscale=m1_rscale,
tgt_rscale=m2_rscale,
from sumpy.tools import add_mi
#flags="print_hl_cl",
out_host=True, **extra_kwargs)
result = []
for deriv in tgt_expansion.get_coefficient_identifiers():
local_result = []
for coeff, term in zip(
src_coeff_exprs,
src_expansion.get_coefficient_identifiers(), strict=True):
# }}}
kernel_deriv = taker.diff(add_mi(deriv, term)) / src_rscale**sum(deriv)
pot = eval_at(m2p, 1, m2_rscale)
local_result.append(
coeff * kernel_deriv * tgt_rscale**sum(deriv))
result.append(sym.Add(*local_result))
err = la.norm((pot - pot_direct)/res**2)
err = err / (la.norm(pot_direct) / res**2)
return result
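
# In symbols, the loop above assembles (roughly, with G the kernel and d
# the multipole-to-local separation vector):
#     L_beta = (tgt_rscale**|beta| / src_rscale**|beta|)
#              * sum_alpha c_alpha * D**(alpha + beta) G(d)
# The multi-index bookkeeping is componentwise addition, e.g. (a sketch;
# add_mi comes from sumpy.tools, as used above):
#     add_mi((1, 0), (0, 2)) == (1, 2)
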
pconv_verifier_p2m2m2p.add_data_point(order, err)
# {{{ apply M2L
def test_m2l_toeplitz():
dim = 3
knl = LaplaceKernel(dim)
m2l_target_boxes = np.array([2], dtype=np.int32)
m2l_src_box_starts = np.array([0, 1], dtype=np.int32)
m2l_src_box_lists = np.array([1], dtype=np.int32)
local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion
mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion
m2l_factory = NonFFTM2LTranslationClassFactory()
m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)()
evt, (mpoles,) = m2l(queue,
src_expansions=mpoles,
src_base_ibox=0,
tgt_base_ibox=0,
ntgt_level_boxes=mpoles.shape[0],
local_expn = local_expn_class(knl, order=5, m2l_translation=m2l_translation)
mpole_expn = mpole_expn_class(knl, order=5)
target_boxes=m2l_target_boxes,
src_box_starts=m2l_src_box_starts,
src_box_lists=m2l_src_box_lists,
centers=centers,
dvec = sym.make_sym_vector("d", dim)
src_rscale=m2_rscale,
tgt_rscale=l1_rscale,
rng = np.random.default_rng(44)
src_coeff_exprs = list(1 + rng.standard_normal(len(mpole_expn)))
src_rscale = 2.0
tgt_rscale = 1.0
#flags="print_hl_cl",
out_host=True, **extra_kwargs)
expected_output = _m2l_translate_simple(
local_expn, mpole_expn, src_coeff_exprs,
src_rscale, dvec, tgt_rscale)
actual_output = local_expn.translate_from(
mpole_expn, src_coeff_exprs,
src_rscale, dvec, tgt_rscale, sac=None)
# }}}
replace_dict = {d: rng.random() for d in dvec}
for sym_a, sym_b in zip(expected_output, actual_output, strict=True):
num_a = sym_a.xreplace(replace_dict)
num_b = sym_b.xreplace(replace_dict)
pot = eval_at(l2p, 2, l1_rscale)
assert abs(num_a - num_b)/abs(num_a) < 1e-10
err = la.norm((pot - pot_direct)/res**2)
err = err / (la.norm(pot_direct) / res**2)
# }}}
pconv_verifier_p2m2m2l2p.add_data_point(order, err)
# {{{ apply L2L
# {{{ test_m2m_compressed
l2l_target_boxes = np.array([3], dtype=np.int32)
l2l_src_box_starts = np.array([0, 1], dtype=np.int32)
l2l_src_box_lists = np.array([2], dtype=np.int32)
@pytest.mark.parametrize("dim", [2, 3])
@pytest.mark.parametrize("order", [2, 4, 6])
def test_m2m_compressed_error_helmholtz(actx_factory, dim, order):
from sumpy import toys
evt, (mpoles,) = l2l(queue,
src_expansions=mpoles,
src_base_ibox=0,
tgt_base_ibox=0,
ntgt_level_boxes=mpoles.shape[0],
actx = actx_factory()
target_boxes=l2l_target_boxes,
src_box_starts=l2l_src_box_starts,
src_box_lists=l2l_src_box_lists,
centers=centers,
knl = HelmholtzKernel(dim)
extra_kernel_kwargs = {"k": 5}
src_rscale=l1_rscale,
tgt_rscale=l2_rscale,
mpole_expn_classes = [LinearPDEConformingVolumeTaylorMultipoleExpansion,
VolumeTaylorMultipoleExpansion]
local_expn_classes = [LinearPDEConformingVolumeTaylorLocalExpansion,
VolumeTaylorLocalExpansion]
#flags="print_hl_wrapper",
out_host=True, **extra_kwargs)
eval_center = np.array([0.1, 0.0, 0.0][:dim]).reshape(dim, 1)
mpole_center = np.array([0.0, 0.0, 0.0][:dim]).reshape(dim, 1)
# }}}
ntargets_per_dim = 10
nsources_per_dim = 10
pot = eval_at(l2p, 3, l2_rscale)
sources_grid = np.meshgrid(*[np.linspace(0, 1, nsources_per_dim)
for _ in range(dim)])
sources_grid = np.ndarray.flatten(np.array(sources_grid)).reshape(dim, -1)
err = la.norm((pot - pot_direct)/res**2)
err = err / (la.norm(pot_direct) / res**2)
targets_grid = np.meshgrid(*[np.linspace(0, 1, ntargets_per_dim)
for _ in range(dim)])
targets_grid = np.ndarray.flatten(np.array(targets_grid)).reshape(dim, -1)
pconv_verifier_full.add_data_point(order, err)
targets = eval_center - 0.1*(targets_grid - 0.5)
for name, verifier in [
("p2m2p", pconv_verifier_p2m2p),
("p2m2m2p", pconv_verifier_p2m2m2p),
("p2m2m2l2p", pconv_verifier_p2m2m2l2p),
("full", pconv_verifier_full),
]:
print(30*"-")
print(name)
print(30*"-")
print(verifier)
print(30*"-")
verifier()
from pytools.convergence import EOCRecorder
eoc_rec = EOCRecorder()
h_values = 2.0**np.arange(-7, -3)
for h in h_values:
sources = (h*(-0.5+sources_grid.astype(np.float64)) + mpole_center)
second_center = mpole_center + h/2
furthest_source = np.max(np.abs(sources - mpole_center))
m2m_vals = [0, 0]
for i, (mpole_expn_class, local_expn_class) in \
enumerate(zip(mpole_expn_classes, local_expn_classes, strict=True)):
tctx = toys.ToyContext(
actx.context,
knl,
extra_kernel_kwargs=extra_kernel_kwargs,
local_expn_class=local_expn_class,
mpole_expn_class=mpole_expn_class,
)
pt_src = toys.PointSources(
tctx,
sources,
np.ones(sources.shape[-1])
)
mexp = toys.multipole_expand(pt_src,
center=mpole_center.reshape(dim),
order=order,
rscale=h)
mexp2 = toys.multipole_expand(mexp,
center=second_center.reshape(dim),
order=order,
rscale=h)
m2m_vals[i] = mexp2.eval(targets)
err = np.linalg.norm(m2m_vals[1] - m2m_vals[0]) \
/ np.linalg.norm(m2m_vals[1])
eoc_rec.add_data_point(furthest_source, err)
logger.info("\n%s", eoc_rec)
assert eoc_rec.order_estimate() >= order + 1
# }}}
# You can test individual routines by typing
# $ python test_kernels.py 'test_p2p(cl.create_some_context)'
# $ python test_kernels.py 'test_p2p(_acf, True)'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2018 Alexandru Fikl"
@@ -22,247 +23,252 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import sys
import numpy as np
import numpy.linalg as la
import pytest
import pyopencl as cl
import pyopencl.array # noqa
from arraycontext import pytest_generate_tests_for_array_contexts
from sumpy.tools import vector_to_device
from sumpy.tools import MatrixBlockIndexRanges
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
import pytest
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl as pytest_generate_tests)
import logging
logger = logging.getLogger(__name__)
try:
import faulthandler
except ImportError:
pass
else:
faulthandler.enable()
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
def _build_geometry(queue, n, mode, target_radius=1.0):
# parametrize circle
t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
unit_circle = np.exp(1j * t)
unit_circle = np.array([unit_circle.real, unit_circle.imag])
def _build_geometry(actx, ntargets, nsources, mode, target_radius=1.0):
# source points
t = np.linspace(0.0, 2.0 * np.pi, nsources, endpoint=False)
sources = np.array([np.cos(t), np.sin(t)])
# create density
# density
sigma = np.cos(mode * t)
# create sources and targets
h = 2.0 * np.pi / n
targets = target_radius * unit_circle
sources = unit_circle
# target points
t = np.linspace(0.0, 2.0 * np.pi, ntargets, endpoint=False)
targets = target_radius * np.array([np.cos(t), np.sin(t)])
# target centers and expansion radii
h = 2.0 * np.pi * target_radius / ntargets
radius = 7.0 * h
centers = unit_circle * (1.0 - radius)
expansion_radii = radius * np.ones(n)
return (cl.array.to_device(queue, targets),
cl.array.to_device(queue, sources),
vector_to_device(queue, centers),
cl.array.to_device(queue, expansion_radii),
cl.array.to_device(queue, sigma))
def _build_block_index(queue, nnodes, nblks, factor):
indices = np.arange(0, nnodes)
ranges = np.arange(0, nnodes + 1, nnodes // nblks)
if abs(factor - 1.0) < 1.0e-14:
ranges_ = ranges
indices_ = indices
centers = (1.0 - radius) * targets
expansion_radii = np.full(ntargets, radius)
return (actx.from_numpy(targets),
actx.from_numpy(sources),
actx.from_numpy(centers),
actx.from_numpy(expansion_radii),
actx.from_numpy(sigma))
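
# Geometry sketch for the helper above: sources and targets sit on circles
# (the targets scaled by target_radius), sigma is a single Fourier mode on
# the source circle, QBX centers are pulled inside to (1 - radius) times
# the target points, and every target shares an expansion radius of seven
# mesh widths (this file's convention, not a universal choice).
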
def _build_subset_indices(actx, ntargets, nsources, factor):
tgtindices = np.arange(0, ntargets)
srcindices = np.arange(0, nsources)
rng = np.random.default_rng()
if abs(factor - 1.0) > 1.0e-14:
tgtindices = rng.choice(tgtindices,
size=int(factor * ntargets), replace=False)
srcindices = rng.choice(srcindices,
size=int(factor * nsources), replace=False)
else:
indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
for i in range(ranges.shape[0] - 1):
iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
indices_[i] = np.sort(np.random.choice(iidx,
size=int(factor * len(iidx)), replace=False))
rng.shuffle(tgtindices)
rng.shuffle(srcindices)
ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
indices_ = np.hstack(indices_)
tgtindices, srcindices = np.meshgrid(tgtindices, srcindices)
return (
actx.freeze(actx.from_numpy(tgtindices.ravel())),
actx.freeze(actx.from_numpy(srcindices.ravel())))
from sumpy.tools import BlockIndexRanges
return BlockIndexRanges(queue.context,
cl.array.to_device(queue, indices_).with_queue(None),
cl.array.to_device(queue, ranges_).with_queue(None))
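
# Index convention produced by _build_subset_indices (a sketch): meshgrid
# plus ravel enumerates every (target, source) pair, so entry k of a
# subset result corresponds to the dense matrix entry mat[tgt[k], src[k]]
# -- which is exactly what the asserts below compare against.
def _subset_pairs_demo():
    tgt, src = np.meshgrid(np.array([0, 2]), np.array([1, 3]))
    pairs = sorted(zip(tgt.ravel(), src.ravel()))
    assert pairs == [(0, 1), (0, 3), (2, 1), (2, 3)]
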
# {{{ test_qbx_direct
@pytest.mark.parametrize('factor', [1.0, 0.6])
@pytest.mark.parametrize('lpot_id', [1, 2])
def test_qbx_direct(ctx_getter, factor, lpot_id):
logging.basicConfig(level=logging.INFO)
@pytest.mark.parametrize("factor", [1.0, 0.6])
@pytest.mark.parametrize("lpot_id", [1, 2])
def test_qbx_direct(actx_factory, factor, lpot_id, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
actx = actx_factory()
ndim = 2
nblks = 10
order = 12
mode_nr = 25
from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
from sumpy.kernel import DirectionalSourceDerivative, LaplaceKernel
if lpot_id == 1:
knl = LaplaceKernel(ndim)
base_knl = LaplaceKernel(ndim)
knl = base_knl
elif lpot_id == 2:
knl = LaplaceKernel(ndim)
knl = DirectionalSourceDerivative(knl, dir_vec_name="dsource_vec")
base_knl = LaplaceKernel(ndim)
knl = DirectionalSourceDerivative(base_knl, dir_vec_name="dsource_vec")
else:
raise ValueError("unknow lpot_id")
raise ValueError(f"unknown lpot_id: {lpot_id}")
from sumpy.expansion.local import LineTaylorLocalExpansion
lknl = LineTaylorLocalExpansion(knl, order)
expn = LineTaylorLocalExpansion(knl, order)
from sumpy.qbx import LayerPotential
lpot = LayerPotential(ctx, [lknl])
lpot = LayerPotential(actx.context, expansion=expn, source_kernels=(knl,),
target_kernels=(base_knl,))
from sumpy.qbx import LayerPotentialMatrixGenerator
mat_gen = LayerPotentialMatrixGenerator(ctx, [lknl])
mat_gen = LayerPotentialMatrixGenerator(actx.context,
expansion=expn,
source_kernels=(knl,),
target_kernels=(base_knl,))
from sumpy.qbx import LayerPotentialMatrixBlockGenerator
blk_gen = LayerPotentialMatrixBlockGenerator(ctx, [lknl])
from sumpy.qbx import LayerPotentialMatrixSubsetGenerator
blk_gen = LayerPotentialMatrixSubsetGenerator(actx.context,
expansion=expn,
source_kernels=(knl,),
target_kernels=(base_knl,))
for n in [200, 300, 400]:
targets, sources, centers, expansion_radii, sigma = \
_build_geometry(queue, n, mode_nr, target_radius=1.2)
_build_geometry(actx, n, n, mode_nr, target_radius=1.2)
h = 2 * np.pi / n
strengths = (sigma * h,)
tgtindices = _build_block_index(queue, n, nblks, factor)
srcindices = _build_block_index(queue, n, nblks, factor)
index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
tgtindices, srcindices = _build_subset_indices(actx,
ntargets=n, nsources=n, factor=factor)
extra_kwargs = {}
if lpot_id == 2:
extra_kwargs["dsource_vec"] = \
vector_to_device(queue, np.ones((ndim, n)))
from pytools.obj_array import make_obj_array
extra_kwargs["dsource_vec"] = (
actx.from_numpy(make_obj_array(np.ones((ndim, n))))
)
_, (result_lpot,) = lpot(queue,
_, (result_lpot,) = lpot(actx.queue,
targets=targets,
sources=sources,
centers=centers,
expansion_radii=expansion_radii,
strengths=strengths, **extra_kwargs)
result_lpot = result_lpot.get()
result_lpot = actx.to_numpy(result_lpot)
_, (mat,) = mat_gen(queue,
_, (mat,) = mat_gen(actx.queue,
targets=targets,
sources=sources,
centers=centers,
expansion_radii=expansion_radii, **extra_kwargs)
mat = mat.get()
result_mat = mat.dot(strengths[0].get())
mat = actx.to_numpy(mat)
result_mat = mat @ actx.to_numpy(strengths[0])
_, (blk,) = blk_gen(queue,
_, (blk,) = blk_gen(actx.queue,
targets=targets,
sources=sources,
centers=centers,
expansion_radii=expansion_radii,
index_set=index_set, **extra_kwargs)
blk = blk.get()
tgtindices=tgtindices,
srcindices=srcindices, **extra_kwargs)
blk = actx.to_numpy(blk)
rowindices = index_set.linear_row_indices.get(queue)
colindices = index_set.linear_col_indices.get(queue)
tgtindices = actx.to_numpy(tgtindices)
srcindices = actx.to_numpy(srcindices)
eps = 1.0e-10 * la.norm(result_lpot)
assert la.norm(result_mat - result_lpot) < eps
assert la.norm(blk - mat[rowindices, colindices]) < eps
assert la.norm(blk - mat[tgtindices, srcindices]) < eps
# }}}
# {{{ test_p2p_direct
@pytest.mark.parametrize("exclude_self", [True, False])
@pytest.mark.parametrize("factor", [1.0, 0.6])
@pytest.mark.parametrize('lpot_id', [1, 2])
def test_p2p_direct(ctx_getter, exclude_self, factor, lpot_id):
logging.basicConfig(level=logging.INFO)
@pytest.mark.parametrize("lpot_id", [1, 2])
def test_p2p_direct(actx_factory, exclude_self, factor, lpot_id, visualize=False):
if visualize:
logging.basicConfig(level=logging.INFO)
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
actx = actx_factory()
ndim = 2
nblks = 10
mode_nr = 25
from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
from sumpy.kernel import DirectionalSourceDerivative, LaplaceKernel
if lpot_id == 1:
lknl = LaplaceKernel(ndim)
elif lpot_id == 2:
lknl = LaplaceKernel(ndim)
lknl = DirectionalSourceDerivative(lknl, dir_vec_name="dsource_vec")
else:
raise ValueError("unknow lpot_id")
raise ValueError(f"unknown lpot_id: '{lpot_id}'")
from sumpy.p2p import P2P
lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
lpot = P2P(actx.context, [lknl], exclude_self=exclude_self)
from sumpy.p2p import P2PMatrixGenerator
mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
mat_gen = P2PMatrixGenerator(actx.context, [lknl], exclude_self=exclude_self)
from sumpy.p2p import P2PMatrixBlockGenerator
blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
from sumpy.p2p import P2PMatrixSubsetGenerator
blk_gen = P2PMatrixSubsetGenerator(
actx.context, [lknl], exclude_self=exclude_self)
for n in [200, 300, 400]:
targets, sources, _, _, sigma = \
_build_geometry(queue, n, mode_nr, target_radius=1.2)
targets, sources, _, _, sigma = (
_build_geometry(actx, n, n, mode_nr, target_radius=1.2))
h = 2 * np.pi / n
strengths = (sigma * h,)
tgtindices = _build_block_index(queue, n, nblks, factor)
srcindices = _build_block_index(queue, n, nblks, factor)
index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
tgtindices, srcindices = _build_subset_indices(actx,
ntargets=n, nsources=n, factor=factor)
extra_kwargs = {}
if exclude_self:
extra_kwargs["target_to_source"] = \
cl.array.arange(queue, 0, n, dtype=np.int)
extra_kwargs["target_to_source"] = (
actx.from_numpy(np.arange(n, dtype=np.int32))
)
if lpot_id == 2:
extra_kwargs["dsource_vec"] = \
vector_to_device(queue, np.ones((ndim, n)))
from pytools.obj_array import make_obj_array
extra_kwargs["dsource_vec"] = (
actx.from_numpy(make_obj_array(np.ones((ndim, n)))))
_, (result_lpot,) = lpot(queue,
_, (result_lpot,) = lpot(actx.queue,
targets=targets,
sources=sources,
strength=strengths, **extra_kwargs)
result_lpot = result_lpot.get()
result_lpot = actx.to_numpy(result_lpot)
_, (mat,) = mat_gen(queue,
_, (mat,) = mat_gen(actx.queue,
targets=targets,
sources=sources, **extra_kwargs)
mat = mat.get()
result_mat = mat.dot(strengths[0].get())
mat = actx.to_numpy(mat)
result_mat = mat @ actx.to_numpy(strengths[0])
_, (blk,) = blk_gen(queue,
_, (blk,) = blk_gen(actx.queue,
targets=targets,
sources=sources,
index_set=index_set, **extra_kwargs)
blk = blk.get()
tgtindices=tgtindices,
srcindices=srcindices, **extra_kwargs)
blk = actx.to_numpy(blk)
tgtindices = actx.to_numpy(tgtindices)
srcindices = actx.to_numpy(srcindices)
eps = 1.0e-10 * la.norm(result_lpot)
assert la.norm(result_mat - result_lpot) < eps
assert la.norm(blk - mat[tgtindices, srcindices]) < eps
index_set = index_set.get(queue)
for i in range(index_set.nblocks):
assert la.norm(index_set.block_take(blk, i)
- index_set.take(mat, i)) < eps
# }}}
# You can test individual routines by typing
# $ python test_kernels.py 'test_p2p(cl.create_some_context)'
# $ python test_matrixgen.py 'test_p2p_direct(_acf, True, 1.0, 1, visualize=True)'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Andreas Kloeckner"
@@ -22,64 +23,121 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
import numpy.linalg as la
import logging
import sys
import sumpy.toys as t
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any
import numpy as np
import numpy.linalg as la
import pytest
import pyopencl as cl # noqa: F401
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl as pytest_generate_tests)
from pytools import Record
from sumpy.kernel import (LaplaceKernel, HelmholtzKernel,
BiharmonicKernel, YukawaKernel)
from arraycontext import pytest_generate_tests_for_array_contexts
import sumpy.symbolic as sym
import sumpy.toys as t
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
from sumpy.expansion import (
FullExpansionTermsWrangler,
LinearPDEBasedExpansionTermsWrangler,
)
from sumpy.expansion.diff_op import (
as_scalar_pde,
concat,
curl,
diff,
divergence,
gradient,
laplacian,
make_identity_diff_op,
)
from sumpy.kernel import (
BiharmonicKernel,
ElasticityKernel,
ExpressionKernel,
HelmholtzKernel,
LaplaceKernel,
LineOfCompressionKernel,
StokesletKernel,
StressletKernel,
YukawaKernel,
)
# {{{ pde check for kernels
class BiharmonicKernelInfo:
def __init__(self, dim):
self.kernel = BiharmonicKernel(dim)
self.extra_kwargs = {}
logger = logging.getLogger(__name__)
@staticmethod
def pde_func(cp, pot):
return cp.laplace(cp.laplace(pot))
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
nderivs = 4
# {{{ pde check for kernels
class YukawaKernelInfo:
def __init__(self, dim, lam):
self.kernel = YukawaKernel(dim)
self.lam = lam
self.extra_kwargs = {"lam": lam}
class KernelInfo:
def __init__(self, kernel, **kwargs):
self.kernel = kernel
self.extra_kwargs = kwargs
diff_op = self.kernel.get_pde_as_diff_op()
assert len(diff_op.eqs) == 1
eq = diff_op.eqs[0]
self.eq = eq
def pde_func(self, cp, pot):
return cp.laplace(pot) - self.lam**2*pot
subs_dict = {sym.Symbol(k): v for k, v in self.extra_kwargs.items()}
result = 0
for ident, coeff in self.eq.items():
lresult = pot
for axis, nderivs in enumerate(ident.mi):
lresult = cp.diff(axis, lresult, nderivs)
result += lresult*float(sym.sympify(coeff).xreplace(subs_dict))
return result
nderivs = 2
@property
def nderivs(self):
return max(sum(ident.mi) for ident in self.eq)
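# A sympy-only sketch (independent of sumpy) of what pde_func above computes: a
# PDE stored as {multi_index: coeff} is applied to a potential by taking
# mi[axis] derivatives along each axis and summing the weighted terms.
#
# import sympy as sp
#
# x, y = sp.symbols("x y")
# pot = sp.exp(x) * sp.sin(y)           # any smooth test potential
# pde = {(2, 0): 1, (0, 2): 1}          # the 2D Laplacian as {multi_index: coeff}
# result = sum(c * sp.diff(pot, x, mi[0], y, mi[1]) for mi, c in pde.items())
# assert sp.simplify(result) == 0       # exp(x)*sin(y) is harmonic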
@pytest.mark.parametrize("knl_info", [
BiharmonicKernelInfo(2),
BiharmonicKernelInfo(3),
YukawaKernelInfo(2, 5),
KernelInfo(BiharmonicKernel(2)),
KernelInfo(BiharmonicKernel(3)),
KernelInfo(YukawaKernel(2), lam=5),
KernelInfo(YukawaKernel(3), lam=5),
KernelInfo(LaplaceKernel(2)),
KernelInfo(LaplaceKernel(3)),
KernelInfo(HelmholtzKernel(2), k=5),
KernelInfo(HelmholtzKernel(3), k=5),
KernelInfo(StokesletKernel(2, 0, 1), mu=5),
KernelInfo(StokesletKernel(2, 1, 1), mu=5),
KernelInfo(StokesletKernel(3, 0, 1), mu=5),
KernelInfo(StokesletKernel(3, 1, 1), mu=5),
KernelInfo(StressletKernel(2, 0, 0, 0), mu=5),
KernelInfo(StressletKernel(2, 0, 0, 1), mu=5),
KernelInfo(StressletKernel(3, 0, 0, 0), mu=5),
KernelInfo(StressletKernel(3, 0, 0, 1), mu=5),
KernelInfo(StressletKernel(3, 0, 1, 2), mu=5),
KernelInfo(ElasticityKernel(2, 0, 1), mu=5, nu=0.2),
KernelInfo(ElasticityKernel(2, 0, 0), mu=5, nu=0.2),
KernelInfo(ElasticityKernel(3, 0, 1), mu=5, nu=0.2),
KernelInfo(ElasticityKernel(3, 0, 0), mu=5, nu=0.2),
KernelInfo(LineOfCompressionKernel(3, 0), mu=5, nu=0.2),
KernelInfo(LineOfCompressionKernel(3, 1), mu=5, nu=0.2),
])
def test_pde_check_kernels(ctx_factory, knl_info, order=5):
def test_pde_check_kernels(actx_factory, knl_info, order=5):
actx = actx_factory()
dim = knl_info.kernel.dim
tctx = t.ToyContext(ctx_factory(), knl_info.kernel,
tctx = t.ToyContext(actx.context, knl_info.kernel,
extra_source_kwargs=knl_info.extra_kwargs)
rng = np.random.default_rng(42)
pt_src = t.PointSources(
tctx,
np.random.rand(dim, 50) - 0.5,
rng.random(size=(dim, 50)) - 0.5,
np.ones(50))
from pytools.convergence import EOCRecorder
from sumpy.point_calculus import CalculusPatch
eoc_rec = EOCRecorder()
@@ -92,17 +150,20 @@ def test_pde_check_kernels(ctx_factory, knl_info, order=5):
err = la.norm(pde)
eoc_rec.add_data_point(h, err)
print(eoc_rec)
logger.info("eoc:\n%s", eoc_rec)
assert eoc_rec.order_estimate() > order - knl_info.nderivs + 1 - 0.1
# }}}
# {{{ test_pde_check
@pytest.mark.parametrize("dim", [1, 2, 3])
def test_pde_check(dim, order=4):
from sumpy.point_calculus import CalculusPatch
from pytools.convergence import EOCRecorder
from sumpy.point_calculus import CalculusPatch
for iaxis in range(dim):
eoc_rec = EOCRecorder()
for h in [0.1, 0.01, 0.001]:
@@ -113,15 +174,19 @@ def test_pde_check(dim, order=4):
err = la.norm(df_num-df_true)
eoc_rec.add_data_point(h, err)
print(eoc_rec)
logger.info("eoc:\n%s", eoc_rec)
assert eoc_rec.order_estimate() > order-2-0.1
# }}}
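# A numpy-only sketch (independent of CalculusPatch) of the kind of convergence
# check above: a centered difference has O(h^2) error, so the slope of a
# log-log fit of error against h approaches 2.
#
# import numpy as np
#
# hs = np.array([0.1, 0.05, 0.025])
# errs = [abs((np.sin(0.3 + h) - np.sin(0.3 - h)) / (2*h) - np.cos(0.3))
#         for h in hs]
# order = np.polyfit(np.log(hs), np.log(errs), deg=1)[0]
# assert abs(order - 2) < 0.1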
# {{{ test_order_finder
@dataclass(frozen=True)
class FakeTree:
def __init__(self, dimensions, root_extent, stick_out_factor):
self.dimensions = dimensions
self.root_extent = root_extent
self.stick_out_factor = stick_out_factor
dimensions: int
root_extent: float
stick_out_factor: float
@pytest.mark.parametrize("knl", [
@@ -136,7 +201,7 @@ def test_order_finder(knl):
orders = [
ofind(knl, frozenset([("k", 5)]), tree, level)
for level in range(30)]
print(orders)
logger.info("orders: %s", orders)
# Order should not increase with level
assert (np.diff(orders) <= 0).all()
@@ -155,11 +220,13 @@ def test_fmmlib_order_finder(knl):
orders = [
ofind(knl, frozenset([("k", 5)]), tree, level)
for level in range(30)]
print(orders)
logger.info("orders: %s", orders)
# Order should not increase with level
assert (np.diff(orders) <= 0).all()
# }}}
# {{{ expansion toys p2e2e2p test cases
@@ -168,39 +235,21 @@ def approx_convergence_factor(orders, errors):
return np.exp(poly[0])
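# A numpy illustration (synthetic data) of approx_convergence_factor above: if
# err ~ C * f**order, fitting log(err) against order gives log(f) as the
# slope, and np.exp(poly[0]) recovers the factor f.
#
# import numpy as np
#
# orders = np.array([3, 4, 5])
# errors = 0.7 * 0.25**orders                       # manufactured factor 0.25
# poly = np.polyfit(orders, np.log(errors), deg=1)  # poly[0] is the slope
# assert abs(np.exp(poly[0]) - 0.25) < 1e-12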
class P2E2E2PTestCase(Record):
@dataclass(frozen=True)
class P2E2E2PTestCase:
source: np.ndarray
target: np.ndarray
center1: np.ndarray
center2: np.ndarray
expansion1: Callable[..., Any]
expansion2: Callable[..., Any]
conv_factor: str
m2l_use_fft: bool = False
@property
def dim(self):
return len(self.source)
@staticmethod
def eval(expr, source, center1, center2, target):
from pymbolic import parse, evaluate
context = {
"s": source,
"c1": center1,
"c2": center2,
"t": target,
"norm": la.norm}
return evaluate(parse(expr), context)
def __init__(self,
source, center1, center2, target, expansion1, expansion2, conv_factor):
if isinstance(conv_factor, str):
conv_factor = self.eval(conv_factor, source, center1, center2, target)
Record.__init__(self,
source=source,
center1=center1,
center2=center2,
target=target,
expansion1=expansion1,
expansion2=expansion2,
conv_factor=conv_factor)
P2E2E2P_TEST_CASES = (
# local to local, 3D
@@ -232,34 +281,55 @@ P2E2E2P_TEST_CASES = (
expansion1=t.multipole_expand,
expansion2=t.local_expand,
conv_factor="norm(t-c2)/(norm(c2-c1)-norm(c1-s))"),
# multipole to local, 3D with FFT
P2E2E2PTestCase(
source=np.array([-2., 2., 1.]),
center1=np.array([-2., 5., 3.]),
center2=np.array([0., 0., 0.]),
target=np.array([0., 0., -1]),
expansion1=t.multipole_expand,
expansion2=t.local_expand,
m2l_use_fft=True,
conv_factor="norm(t-c2)/(norm(c2-c1)-norm(c1-s))"),
)
# }}}
# {{{ test_toy_p2e2e2p
ORDERS_P2E2E2P = (3, 4, 5)
RTOL_P2E2E2P = 1e-2
@pytest.mark.parametrize("case", P2E2E2P_TEST_CASES)
def test_toy_p2e2e2p(ctx_getter, case):
def test_toy_p2e2e2p(actx_factory, case):
dim = case.dim
src = case.source.reshape(dim, -1)
tgt = case.target.reshape(dim, -1)
if not 0 <= case.conv_factor <= 1:
from pymbolic import evaluate, parse
case_conv_factor = evaluate(parse(case.conv_factor), {
"s": case.source,
"c1": case.center1,
"c2": case.center2,
"t": case.target,
"norm": la.norm,
})
if not 0 <= case_conv_factor <= 1:
raise ValueError(
"convergence factor not in valid range: %e" % case.conv_factor)
f"convergence factor not in valid range: {case_conv_factor}")
from sumpy.expansion.local import VolumeTaylorLocalExpansion
from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
from sumpy.expansion import VolumeTaylorExpansionFactory
cl_ctx = ctx_getter()
ctx = t.ToyContext(cl_ctx,
actx = actx_factory()
ctx = t.ToyContext(actx.context,
LaplaceKernel(dim),
VolumeTaylorMultipoleExpansion,
VolumeTaylorLocalExpansion)
expansion_factory=VolumeTaylorExpansionFactory(),
m2l_use_fft=case.m2l_use_fft)
errors = []
@@ -273,18 +343,257 @@ def test_toy_p2e2e2p(ctx_getter, case):
errors.append(np.abs(pot_actual - pot_p2e2e2p))
conv_factor = approx_convergence_factor(1 + np.array(ORDERS_P2E2E2P), errors)
assert conv_factor <= min(1, case.conv_factor * (1 + RTOL_P2E2E2P)), \
(conv_factor, case.conv_factor * (1 + RTOL_P2E2E2P))
assert conv_factor <= min(1, case_conv_factor * (1 + RTOL_P2E2E2P)), \
(conv_factor, case_conv_factor * (1 + RTOL_P2E2E2P))
# }}}
# {{{ test_cse_matvec
def test_cse_matvec():
from sumpy.expansion import CSEMatVecOperator
input_coeffs = [
[(0, 2)],
[],
[(1, 1)],
[(1, 9)],
]
output_coeffs = [
[],
[(0, 3)],
[],
[(2, 7), (1, 5)],
]
op = CSEMatVecOperator(input_coeffs, output_coeffs, shape=(4, 2))
m = np.array([[2, 0], [6, 0], [0, 1], [30, 16]])
rng = np.random.default_rng(42)
vec = rng.random(2)
expected_result = m @ vec
actual_result = op.matvec(vec)
assert np.allclose(expected_result, actual_result)
vec = rng.random(4)
expected_result = m.T @ vec
actual_result = op.transpose_matvec(vec)
assert np.allclose(expected_result, actual_result)
# }}}
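# A minimal numpy sketch (hypothetical helper, not the sumpy implementation) of
# the CSE matvec scheme tested above: row i of the result combines raw input
# entries (input_coeffs[i]) with previously computed rows (output_coeffs[i]),
# so shared subexpressions are evaluated only once.
#
# import numpy as np
#
# def cse_matvec(input_coeffs, output_coeffs, vec):
#     out = []
#     for in_terms, out_terms in zip(input_coeffs, output_coeffs):
#         row = sum(c * vec[j] for j, c in in_terms)
#         row += sum(c * out[j] for j, c in out_terms)
#         out.append(row)
#     return np.array(out, dtype=float)
#
# # reproduces m @ vec for the matrix m above
# vec = np.array([1.0, 2.0])
# assert np.allclose(
#     cse_matvec([[(0, 2)], [], [(1, 1)], [(1, 9)]],
#                [[], [(0, 3)], [], [(2, 7), (1, 5)]], vec),
#     np.array([[2, 0], [6, 0], [0, 1], [30, 16]]) @ vec)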
# {{{ test_diff_op_stokes
def test_diff_op_stokes():
from sumpy.symbolic import Function, symbols
diff_op = make_identity_diff_op(3, 4)
u = diff_op[:3]
p = diff_op[3]
pde = concat(laplacian(u) - gradient(p), divergence(u))
actual_output = pde.to_sym()
x, y, z = syms = symbols("x0, x1, x2")
funcs = symbols("f0, f1, f2, f3", cls=Function)
u, v, w, p = (f(*syms) for f in funcs)
eq1 = u.diff(x, x) + u.diff(y, y) + u.diff(z, z) - p.diff(x)
eq2 = v.diff(x, x) + v.diff(y, y) + v.diff(z, z) - p.diff(y)
eq3 = w.diff(x, x) + w.diff(y, y) + w.diff(z, z) - p.diff(z)
eq4 = u.diff(x) + v.diff(y) + w.diff(z)
expected_output = [eq1, eq2, eq3, eq4]
assert expected_output == actual_output
# }}}
# {{{ test_as_scalar_pde_stokes
def test_as_scalar_pde_stokes():
diff_op = make_identity_diff_op(3, 4)
u = diff_op[:3]
p = diff_op[3]
pde = concat(laplacian(u) - gradient(p), divergence(u))
# the velocity components in Stokes flow should satisfy the biharmonic equation
for i in range(3):
logger.info("pde\n%s", as_scalar_pde(pde, i))
logger.info("\n%s", laplacian(laplacian(u[i])))
assert as_scalar_pde(pde, i) == laplacian(laplacian(u[0]))
# pressure should satisfy Laplace
assert as_scalar_pde(pde, 3) == laplacian(u[0])
# }}}
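# A sympy-only sketch (independent of sumpy) of the reasoning behind the
# pressure assertion above: taking the divergence of the momentum equation
# lap(u) - grad(p) = 0 and using div(u) = 0 leaves lap(p) = 0.
#
# import sympy as sp
#
# x, y, z = syms3 = sp.symbols("x y z")
# u = [sp.Function(f"u{i}")(*syms3) for i in range(3)]
# p = sp.Function("p")(*syms3)
#
# def lap(f):
#     return sum(sp.diff(f, s, 2) for s in syms3)
#
# momentum = [lap(u[i]) - sp.diff(p, s) for i, s in enumerate(syms3)]
# div_u = sum(sp.diff(u[i], s) for i, s in enumerate(syms3))
# div_momentum = sum(sp.diff(momentum[i], s) for i, s in enumerate(syms3))
# # div(momentum) = lap(div u) - lap(p), so div u = 0 forces lap(p) = 0
# assert sp.expand(div_momentum - (lap(div_u) - lap(p))) == 0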
# {{{ test_as_scalar_pde_maxwell
def test_as_scalar_pde_maxwell():
from sumpy.symbolic import symbols
op = make_identity_diff_op(3, 6, time_dependent=True)
E = op[:3] # noqa: N806
B = op[3:] # noqa: N806
mu, epsilon = symbols("mu, epsilon")
t = (0, 0, 0, 1)
pde = concat(curl(E) + diff(B, t), curl(B) - mu*epsilon*diff(E, t),
divergence(E), divergence(B))
as_scalar_pde(pde, 3)
for i in range(6):
assert as_scalar_pde(pde, i) == \
laplacian(op[0]) - mu*epsilon*diff(diff(op[0], t), t)
# }}}
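# A sympy-only sketch (independent of sumpy) of the identity behind the Maxwell
# assertion above: curl(curl F) = grad(div F) - lap(F), so with div E = 0 and
# curl B = mu*epsilon*dE/dt every field component satisfies the scalar wave
# equation lap(E) = mu*epsilon*d2E/dt2.
#
# import sympy as sp
#
# x, y, z = syms3 = sp.symbols("x y z")
# F = [sp.Function(f"F{i}")(*syms3) for i in range(3)]
#
# def curl3(v):
#     return [sp.diff(v[2], y) - sp.diff(v[1], z),
#             sp.diff(v[0], z) - sp.diff(v[2], x),
#             sp.diff(v[1], x) - sp.diff(v[0], y)]
#
# def lap(f):
#     return sum(sp.diff(f, s, 2) for s in syms3)
#
# div_f = sum(sp.diff(F[i], s) for i, s in enumerate(syms3))
# cc = curl3(curl3(F))
# for i, s in enumerate(syms3):
#     assert sp.expand(cc[i] - (sp.diff(div_f, s) - lap(F[i]))) == 0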
# {{{ test_as_scalar_pde_elasticity
def test_as_scalar_pde_elasticity():
# Ref: https://doi.org/10.1006/jcph.1996.0102
diff_op = make_identity_diff_op(2, 5)
sigma_x = diff_op[0]
sigma_y = diff_op[1]
tau = diff_op[2]
u = diff_op[3]
v = diff_op[4]
# Use numeric values as the expressions grow exponentially large otherwise
from sumpy.symbolic import symbols
lam, mu = symbols("lam, mu")
x = (1, 0)
y = (0, 1)
exprs = [
diff(sigma_x, x) + diff(tau, y),
diff(tau, x) + diff(sigma_y, y),
sigma_x - (lam + 2*mu)*diff(u, x) - lam*diff(v, y),
sigma_y - (lam + 2*mu)*diff(v, y) - lam*diff(u, x),
tau - mu*(diff(u, y) + diff(v, x)),
]
pde = concat(*exprs)
assert pde.order == 1
for i in range(5):
scalar_pde = as_scalar_pde(pde, i)
assert scalar_pde == laplacian(laplacian(diff_op[0]))
assert scalar_pde.order == 4
# }}}
# {{{ test_elasticity_new
def test_elasticity_new():
from pickle import dumps, loads
stokes_knl = StokesletKernel(3, 0, 1, "mu1", 0.5)
stokes_knl2 = ElasticityKernel(3, 0, 1, "mu1", 0.5)
elasticity_knl = ElasticityKernel(3, 0, 1, "mu1", "nu")
elasticity_helper_knl = LineOfCompressionKernel(3, 0, "mu1", "nu")
assert isinstance(stokes_knl2, StokesletKernel)
assert stokes_knl == stokes_knl2
assert loads(dumps(stokes_knl)) == stokes_knl
for knl in [elasticity_knl, elasticity_helper_knl]:
assert not isinstance(knl, StokesletKernel)
assert loads(dumps(knl)) == knl
# }}}
# {{{ test_weird_kernel
w = make_identity_diff_op(2)
pdes = [
diff(w, (1, 1)) + diff(w, (2, 0)),
diff(w, (1, 1)) + diff(w, (0, 2)),
]
@pytest.mark.parametrize("pde", pdes)
def test_weird_kernel(pde):
class MyKernel(ExpressionKernel):
def __init__(self):
super().__init__(dim=2, expression=1, global_scaling_const=1,
is_complex_valued=False)
def get_pde_as_diff_op(self):
return pde
from functools import reduce
from operator import mul
from sumpy.expansion import LinearPDEConformingVolumeTaylorExpansion
knl = MyKernel()
order = 10
expn = LinearPDEConformingVolumeTaylorExpansion(kernel=knl,
order=order, use_rscale=False)
coeffs = expn.get_coefficient_identifiers()
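# map(max, *coeffs) zips the retained multi-indices and takes the per-axis
# maximum; reduce(mul, ...) multiplies these maxima, giving the size of the
# coefficient box that is compared against the expected FFT size below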
fft_size = reduce(mul, map(max, *coeffs), 1)
assert fft_size == order
# }}}
# {{{ test_get_storage_index
class StorageIndexTestKernel(ExpressionKernel):
def __init__(self, dim, max_mi):
super().__init__(dim=dim, expression=1, global_scaling_const=1,
is_complex_valued=False)
self._max_mi = max_mi
def get_pde_as_diff_op(self):
w = make_identity_diff_op(self.dim)
pde = diff(w, tuple(self._max_mi))
return pde
@pytest.mark.parametrize("order", [6])
@pytest.mark.parametrize("knl", [
LaplaceKernel(2),
LaplaceKernel(3),
StorageIndexTestKernel(2, (3, 0)),
StorageIndexTestKernel(2, (0, 3)),
StorageIndexTestKernel(3, (3, 0, 0)),
StorageIndexTestKernel(3, (0, 3, 0)),
StorageIndexTestKernel(3, (0, 0, 3)),
BiharmonicKernel(2),
BiharmonicKernel(3),
])
@pytest.mark.parametrize("compressed", (True, False))
def test_get_storage_index(order, knl, compressed):
dim = knl.dim
if compressed:
wrangler = LinearPDEBasedExpansionTermsWrangler(order, dim, knl=knl)
else:
wrangler = FullExpansionTermsWrangler(order, dim)
for i, mi in enumerate(wrangler.get_coefficient_identifiers()):
assert i == wrangler.get_storage_index(mi)
# }}}
# You can test individual routines by typing
# $ python test_misc.py 'test_p2p(cl.create_some_context)'
# $ python test_misc.py 'test_pde_check_kernels(_acf,
# KernelInfo(HelmholtzKernel(2), k=5), order=5)'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import division, absolute_import, print_function
from __future__ import annotations
__copyright__ = "Copyright (C) 2017 Matt Wala"
@@ -22,31 +23,39 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import numpy as np
import logging
import sys
import pyopencl as cl
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl as pytest_generate_tests)
import numpy as np
import pytest
from arraycontext import pytest_generate_tests_for_array_contexts
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
from sumpy.expansion.local import LineTaylorLocalExpansion, VolumeTaylorLocalExpansion
import logging
logger = logging.getLogger(__name__)
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
try:
import faulthandler
except ImportError:
pass
else:
faulthandler.enable()
# {{{ test_direct_qbx_vs_eigval
def test_direct(ctx_getter):
# This evaluates a single layer potential on a circle.
logging.basicConfig(level=logging.INFO)
@pytest.mark.parametrize("expn_class", [
LineTaylorLocalExpansion,
VolumeTaylorLocalExpansion,
])
def test_direct_qbx_vs_eigval(actx_factory, expn_class, visualize=False):
"""This evaluates a single layer potential on a circle using a known
eigenvalue/eigenvector combination.
"""
if visualize:
logging.basicConfig(level=logging.INFO)
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
actx = actx_factory()
from sumpy.kernel import LaplaceKernel
lknl = LaplaceKernel(2)
@@ -54,9 +63,11 @@ def test_direct(ctx_getter):
order = 12
from sumpy.qbx import LayerPotential
from sumpy.expansion.local import LineTaylorLocalExpansion
lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
lpot = LayerPotential(actx.context,
expansion=expn_class(lknl, order),
target_kernels=(lknl,),
source_kernels=(lknl,))
mode_nr = 25
@@ -76,34 +87,118 @@ def test_direct_qbx_vs_eigval(actx_factory, expn_class, visualize=False):
h = 2 * np.pi / n
targets = unit_circle
sources = unit_circle
targets = actx.from_numpy(unit_circle)
sources = actx.from_numpy(unit_circle)
radius = 7 * h
centers = unit_circle * (1 - radius)
expansion_radii = np.ones(n) * radius
centers = actx.from_numpy((1 - radius) * unit_circle)
expansion_radii = actx.from_numpy(radius * np.ones(n))
strengths = (actx.from_numpy(sigma * h),)
strengths = (sigma * h,)
evt, (result_qbx,) = lpot(queue, targets, sources, centers, strengths,
_evt, (result_qbx,) = lpot(
actx.queue,
targets, sources, centers, strengths,
expansion_radii=expansion_radii)
result_qbx = actx.to_numpy(result_qbx)
eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx)))
print(eocrec)
logger.info("eoc:\n%s", eocrec)
slack = 1.5
assert eocrec.order_estimate() > order - slack
# }}}
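# A numpy-only sketch (independent of sumpy and QBX) of the eigenfunction fact
# the test above relies on: the Laplace single layer on the unit circle maps
# cos(n*theta) to cos(n*theta)/(2n). Evaluating at targets strictly inside the
# circle avoids the kernel singularity, where the potential of cos(n*theta) is
# r**n * cos(n*theta)/(2n) and the trapezoidal rule converges spectrally.
#
# import numpy as np
#
# n_mode, n_pts, r = 4, 400, 0.5
# t = np.linspace(0, 2*np.pi, n_pts, endpoint=False)
# src = np.exp(1j*t)                    # unit-circle sources
# tgt = r * np.exp(1j*t)                # targets inside the circle
# sigma = np.cos(n_mode * t)
# h = 2*np.pi / n_pts
# # G(x, y) = -log|x - y| / (2*pi) for the 2D Laplace kernel
# pot = (-np.log(np.abs(tgt[:, None] - src[None, :])) / (2*np.pi)) @ (sigma * h)
# ref = r**n_mode / (2*n_mode) * np.cos(n_mode * t)
# assert np.max(np.abs(pot - ref)) < 1e-10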
# {{{ test_direct_qbx_vs_eigval_with_tgt_deriv
@pytest.mark.parametrize("expn_class", [
LineTaylorLocalExpansion,
VolumeTaylorLocalExpansion,
])
def test_direct_qbx_vs_eigval_with_tgt_deriv(
actx_factory, expn_class, visualize=False):
"""This evaluates a single layer potential on a circle using a known
eigenvalue/eigenvector combination.
"""
if visualize:
logging.basicConfig(level=logging.INFO)
actx = actx_factory()
from sumpy.kernel import AxisTargetDerivative, LaplaceKernel
lknl = LaplaceKernel(2)
order = 8
from sumpy.qbx import LayerPotential
lpot_dx = LayerPotential(actx.context, expansion=expn_class(lknl, order),
target_kernels=(AxisTargetDerivative(0, lknl),), source_kernels=(lknl,))
lpot_dy = LayerPotential(actx.context, expansion=expn_class(lknl, order),
target_kernels=(AxisTargetDerivative(1, lknl),), source_kernels=(lknl,))
mode_nr = 15
from pytools.convergence import EOCRecorder
eocrec = EOCRecorder()
for n in [200, 300, 400]:
t = np.linspace(0, 2 * np.pi, n, endpoint=False)
unit_circle = np.exp(1j * t)
unit_circle = np.array([unit_circle.real, unit_circle.imag])
sigma = np.cos(mode_nr * t)
# eigval = 1/(2*mode_nr)
eigval = 0.5
result_ref = eigval * sigma
h = 2 * np.pi / n
targets = actx.from_numpy(unit_circle)
sources = actx.from_numpy(unit_circle)
radius = 7 * h
centers = actx.from_numpy((1 - radius) * unit_circle)
expansion_radii = actx.from_numpy(radius * np.ones(n))
strengths = (actx.from_numpy(sigma * h),)
_evt, (result_qbx_dx,) = lpot_dx(
actx.queue,
targets, sources, centers, strengths,
expansion_radii=expansion_radii)
_evt, (result_qbx_dy,) = lpot_dy(
actx.queue,
targets, sources, centers, strengths,
expansion_radii=expansion_radii)
result_qbx_dx = actx.to_numpy(result_qbx_dx)
result_qbx_dy = actx.to_numpy(result_qbx_dy)
normals = unit_circle
result_qbx = normals[0] * result_qbx_dx + normals[1] * result_qbx_dy
eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx)))
if expn_class is not LineTaylorLocalExpansion:
logger.info("eoc:\n%s", eocrec)
slack = 1.5
assert eocrec.order_estimate() > order - slack
# }}}
# You can test individual routines by typing
# $ python test_kernels.py 'test_p2p(cl.create_some_context)'
# $ python test_qbx.py 'test_direct_qbx_vs_eigval(_acf, LineTaylorLocalExpansion)'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])
pytest.main([__file__])
# vim: fdm=marker
from __future__ import annotations
__copyright__ = "Copyright (C) 2020 Isuru Fernando"
__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import logging
import sys
import numpy as np
import pytest
from arraycontext import pytest_generate_tests_for_array_contexts
import sumpy.symbolic as sym
from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401
from sumpy.tools import (
fft,
fft_toeplitz_upper_triangular,
loopy_fft,
matvec_toeplitz_upper_triangular,
)
logger = logging.getLogger(__name__)
pytest_generate_tests = pytest_generate_tests_for_array_contexts([
PytestPyOpenCLArrayContextFactory,
])
# {{{ test_matvec_fft
def test_matvec_fft():
k = 5
rng = np.random.default_rng(42)
v = rng.random(k)
x = rng.random(k)
fft = fft_toeplitz_upper_triangular(v, x)
matvec = matvec_toeplitz_upper_triangular(v, x)
for i in range(k):
assert abs(fft[i] - matvec[i]) < 1e-14
# }}}
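# A numpy-only sketch (not the sumpy implementation) of the idea being tested:
# multiplying by an upper-triangular Toeplitz matrix U[i, j] = v[j - i] for
# j >= i is a linear convolution of v with the reversed vector, so zero-padded
# FFTs compute the product in O(k log k).
#
# import numpy as np
#
# def toeplitz_ut_matvec(v, x):
#     # direct O(k^2) reference
#     k = len(v)
#     return np.array([sum(v[j - i] * x[j] for j in range(i, k))
#                      for i in range(k)])
#
# def toeplitz_ut_matvec_fft(v, x):
#     # same product via one zero-padded FFT convolution
#     k = len(v)
#     n = 2 * k
#     conv = np.fft.ifft(np.fft.fft(v, n) * np.fft.fft(x[::-1], n)).real
#     return conv[:k][::-1]
#
# rng = np.random.default_rng(42)
# v, x = rng.random(5), rng.random(5)
# assert np.allclose(toeplitz_ut_matvec(v, x), toeplitz_ut_matvec_fft(v, x))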
# {{{ test_matvec_fft_small_floats
def test_matvec_fft_small_floats():
k = 5
v = sym.make_sym_vector("v", k)
x = sym.make_sym_vector("x", k)
fft = fft_toeplitz_upper_triangular(v, x)
for expr in fft:
for f in expr.atoms(sym.Float):
if f == 0:
continue
assert abs(f) > 1e-10
# }}}
# {{{ test_fft
@pytest.mark.parametrize("size", [1, 2, 7, 10, 30, 210])
def test_fft(actx_factory, size):
actx = actx_factory()
inp = np.arange(size, dtype=np.complex64)
inp_dev = actx.from_numpy(inp)
out = fft(inp)
fft_func = loopy_fft(inp.shape, inverse=False, complex_dtype=inp.dtype.type)
_evt, (out_dev,) = fft_func(actx.queue, y=inp_dev)
assert np.allclose(actx.to_numpy(out_dev), out)
# }}}
# You can test individual routines by typing
# $ python test_tools.py 'test_fft(_acf, 30)'
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
pytest.main([__file__])
# vim: fdm=marker