From daa4118947d5d4a1b892280720d48dde611288e2 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <idf2@illinois.edu>
Date: Fri, 13 May 2022 17:41:12 -0500
Subject: [PATCH] Use a separate class for M2L translation and change expansion
 default to FFT (#113)

* Use a separate class for M2L translation

* Fix docs and caching

* M2L Translation Factory

* vim markers

* Fix tests

* Fix toys

* Fix test_m2l_toeplitz

* Fix more tests

* Use a better rscale to get the test passing

* Use pytential dev branch

* Try 2r/order instead of r/order
---
 .github/workflows/ci.yml |   4 +-
 doc/expansion.rst        |   5 +
 sumpy/e2e.py             |  79 ++--
 sumpy/expansion/local.py | 581 +++--------------------------
 sumpy/expansion/m2l.py   | 770 +++++++++++++++++++++++++++++++++++++++
 sumpy/fmm.py             |  40 +-
 sumpy/toys.py            |   8 +
 test/test_fmm.py         |  14 +-
 test/test_kernels.py     |  10 +-
 test/test_misc.py        |   6 +-
 10 files changed, 912 insertions(+), 605 deletions(-)
 create mode 100644 sumpy/expansion/m2l.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 12c8d4ce..c369d1fc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -88,8 +88,8 @@ jobs:
             run: |
                 curl -L -O https://tiker.net/ci-support-v0
                 . ./ci-support-v0
-                if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rscale" ]]; then
-                   DOWNSTREAM_PROJECT=https://github.com/isuruf/pytential.git@level_to_rscale
+                if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "m2l" ]]; then
+                   DOWNSTREAM_PROJECT=https://github.com/isuruf/pytential.git@m2l_translation
                 fi
                 test_downstream "$DOWNSTREAM_PROJECT"
 
diff --git a/doc/expansion.rst b/doc/expansion.rst
index ffb1e122..5d72d735 100644
--- a/doc/expansion.rst
+++ b/doc/expansion.rst
@@ -18,6 +18,11 @@ Multipole Expansions
 
 .. automodule:: sumpy.expansion.multipole
 
+Multipole to Local Translations
+-------------------------------
+
+.. automodule:: sumpy.expansion.m2l
+
 Estimating Expansion Orders
 ---------------------------
 
diff --git a/sumpy/e2e.py b/sumpy/e2e.py
index 6f742f72..b35f86cd 100644
--- a/sumpy/e2e.py
+++ b/sumpy/e2e.py
@@ -306,12 +306,13 @@ class M2LUsingTranslationClassesDependentData(E2EFromCSR):
         src_rscale = sym.Symbol("src_rscale")
         tgt_rscale = sym.Symbol("tgt_rscale")
 
+        m2l_translation = self.tgt_expansion.m2l_translation
         m2l_translation_classes_dependent_ndata = (
-                self.tgt_expansion.m2l_translation_classes_dependent_ndata(
-                        self.src_expansion))
+            m2l_translation.translation_classes_dependent_ndata(self.tgt_expansion,
+                self.src_expansion))
         m2l_translation_classes_dependent_data = \
-                    [sym.Symbol(f"data{i}")
-                for i in range(m2l_translation_classes_dependent_ndata)]
+                [sym.Symbol(f"data{i}")
+            for i in range(m2l_translation_classes_dependent_ndata)]
 
         ncoeff_src = len(self.src_expansion)
 
@@ -346,13 +347,14 @@ class M2LUsingTranslationClassesDependentData(E2EFromCSR):
         except NotImplementedError:
             pass
 
-        ndata = self.tgt_expansion.m2l_translation_classes_dependent_ndata(
-                        self.src_expansion)
-        if self.tgt_expansion.use_preprocessing_for_m2l:
-            ncoeff_src = self.tgt_expansion.m2l_preprocess_multipole_nexprs(
-                    self.src_expansion)
-            ncoeff_tgt = self.tgt_expansion.m2l_postprocess_local_nexprs(
-                    self.src_expansion)
+        m2l_translation = self.tgt_expansion.m2l_translation
+        ndata = m2l_translation.translation_classes_dependent_ndata(
+            self.tgt_expansion, self.src_expansion)
+        if m2l_translation.use_preprocessing:
+            ncoeff_src = m2l_translation.preprocess_multipole_nexprs(
+                self.tgt_expansion, self.src_expansion)
+            ncoeff_tgt = m2l_translation.postprocess_local_nexprs(
+                self.tgt_expansion, self.src_expansion)
         else:
             ncoeff_src = len(self.src_expansion)
             ncoeff_tgt = len(self.tgt_expansion)
@@ -380,15 +382,16 @@ class M2LUsingTranslationClassesDependentData(E2EFromCSR):
                         )
 
     def get_kernel(self, result_dtype):
+        m2l_translation = self.tgt_expansion.m2l_translation
         m2l_translation_classes_dependent_ndata = \
-                self.tgt_expansion.m2l_translation_classes_dependent_ndata(
-                        self.src_expansion)
-
-        if self.tgt_expansion.use_preprocessing_for_m2l:
-            ncoeff_src = self.tgt_expansion.m2l_preprocess_multipole_nexprs(
-                    self.src_expansion)
-            ncoeff_tgt = self.tgt_expansion.m2l_postprocess_local_nexprs(
-                    self.src_expansion)
+            m2l_translation.translation_classes_dependent_ndata(
+                self.tgt_expansion, self.src_expansion)
+
+        if m2l_translation.use_preprocessing:
+            ncoeff_src = m2l_translation.preprocess_multipole_nexprs(
+                self.tgt_expansion, self.src_expansion)
+            ncoeff_tgt = m2l_translation.postprocess_local_nexprs(
+                self.tgt_expansion, self.src_expansion)
         else:
             ncoeff_src = len(self.src_expansion)
             ncoeff_tgt = len(self.tgt_expansion)
@@ -535,17 +538,18 @@ class M2LGenerateTranslationClassesDependentData(E2EBase):
         dvec = make_sym_vector("d", self.dim)
 
         src_rscale = sym.Symbol("src_rscale")
-        tgt_rscale = sym.Symbol("tgt_rscale")
+
+        m2l_translation = self.tgt_expansion.m2l_translation
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
         tgt_coeff_names = [
-                sac.assign_unique(
-                    f"m2l_translation_classes_dependent_expr{i}", coeff_i)
-                for i, coeff_i in enumerate(
-                    self.tgt_expansion.m2l_translation_classes_dependent_data(
-                        self.src_expansion, src_rscale,
-                        dvec, tgt_rscale, sac))]
+            sac.assign_unique(
+                f"m2l_translation_classes_dependent_expr{i}", coeff_i)
+            for i, coeff_i in enumerate(
+                m2l_translation.translation_classes_dependent_data(
+                    self.tgt_expansion, self.src_expansion, src_rscale,
+                    dvec, sac=sac))]
 
         sac.run_global_cse()
 
@@ -560,8 +564,8 @@ class M2LGenerateTranslationClassesDependentData(E2EBase):
 
     def get_kernel(self, result_dtype):
         m2l_translation_classes_dependent_ndata = \
-            self.tgt_expansion.m2l_translation_classes_dependent_ndata(
-                    self.src_expansion)
+            self.tgt_expansion.m2l_translation.translation_classes_dependent_ndata(
+                self.tgt_expansion, self.src_expansion)
         from sumpy.tools import gather_loopy_arguments
         loopy_knl = lp.make_kernel(
                 [
@@ -665,8 +669,8 @@ class M2LPreprocessMultipole(E2EBase):
         preprocessed_src_coeff_names = [
                 sac.assign_unique(f"preprocessed_src_coeff{i}", coeff_i)
                 for i, coeff_i in enumerate(
-                    self.tgt_expansion.m2l_preprocess_multipole_exprs(
-                        self.src_expansion, src_coeff_exprs,
+                    self.tgt_expansion.m2l_translation.preprocess_multipole_exprs(
+                        self.tgt_expansion, self.src_expansion, src_coeff_exprs,
                         sac=sac, src_rscale=src_rscale))]
 
         sac.run_global_cse()
@@ -683,7 +687,8 @@ class M2LPreprocessMultipole(E2EBase):
     def get_kernel(self, result_dtype):
         nsrc_coeffs = len(self.src_expansion)
         npreprocessed_src_coeffs = \
-            self.tgt_expansion.m2l_preprocess_multipole_nexprs(self.src_expansion)
+            self.tgt_expansion.m2l_translation.preprocess_multipole_nexprs(
+                self.tgt_expansion, self.src_expansion)
         from sumpy.tools import gather_loopy_arguments
         loopy_knl = lp.make_kernel(
                 [
@@ -752,8 +757,10 @@ class M2LPostprocessLocal(E2EBase):
     default_name = "m2l_postprocess_local"
 
     def get_loopy_insns(self, result_dtype):
+        m2l_translation = self.tgt_expansion.m2l_translation
         ncoeffs_before_postprocessing = \
-            self.tgt_expansion.m2l_postprocess_local_nexprs(self.tgt_expansion)
+            m2l_translation.postprocess_local_nexprs(self.tgt_expansion,
+                                                     self.src_expansion)
 
         tgt_coeff_exprs_before_postprocessing = [
             sym.Symbol(f"tgt_coeff_before_postprocessing{i}")
@@ -765,8 +772,9 @@ class M2LPostprocessLocal(E2EBase):
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
 
-        tgt_coeff_exprs = self.tgt_expansion.m2l_postprocess_local_exprs(
-            self.tgt_expansion, tgt_coeff_exprs_before_postprocessing,
+        tgt_coeff_exprs = m2l_translation.postprocess_local_exprs(
+            self.tgt_expansion, self.src_expansion,
+            tgt_coeff_exprs_before_postprocessing,
             sac=sac, src_rscale=src_rscale, tgt_rscale=tgt_rscale)
 
         if result_dtype in (np.float32, np.float64):
@@ -792,7 +800,8 @@ class M2LPostprocessLocal(E2EBase):
     def get_kernel(self, result_dtype):
         ntgt_coeffs = len(self.tgt_expansion)
         ntgt_coeffs_before_postprocessing = \
-            self.tgt_expansion.m2l_postprocess_local_nexprs(self.tgt_expansion)
+            self.tgt_expansion.m2l_translation.postprocess_local_nexprs(
+                self.tgt_expansion, self.src_expansion)
         from sumpy.tools import gather_loopy_arguments
         loopy_knl = lp.make_kernel(
                 [
diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py
index 72d91c01..d170aba9 100644
--- a/sumpy/expansion/local.py
+++ b/sumpy/expansion/local.py
@@ -21,10 +21,7 @@ THE SOFTWARE.
 """
 
 import math
-from typing import Tuple, Any
 
-import pymbolic
-import loopy as lp
 from pytools import single_valued
 
 import sumpy.symbolic as sym
@@ -32,9 +29,7 @@ from sumpy.expansion import (
         ExpansionBase,
         VolumeTaylorExpansion,
         LinearPDEConformingVolumeTaylorExpansion)
-from sumpy.tools import (
-        add_to_sac, fft,
-        mi_increment_axis, matvec_toeplitz_upper_triangular)
+from sumpy.tools import add_to_sac, mi_increment_axis
 
 import logging
 logger = logging.getLogger(__name__)
@@ -56,38 +51,23 @@ class LocalExpansionBase(ExpansionBase):
     .. attribute:: kernel
     .. attribute:: order
     .. attribute:: use_rscale
-    .. attribute:: use_fft_for_m2l
-    .. attribute:: use_preprocessing_for_m2l
-
-    .. automethod:: m2l_translation_classes_dependent_data
-    .. automethod:: m2l_translation_classes_dependent_ndata
-    .. automethod:: m2l_preprocess_multipole_exprs
-    .. automethod:: m2l_preprocess_multipole_nexprs
-    .. automethod:: m2l_postprocess_local_exprs
-    .. automethod:: m2l_postprocess_local_nexprs
+
     .. automethod:: translate_from
     """
-    init_arg_names = ("kernel", "order", "use_rscale", "use_fft_for_m2l",
-            "use_preprocessing_for_m2l")
+    init_arg_names = ("kernel", "order", "use_rscale", "m2l_translation")
 
     def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+            m2l_translation=None):
         super().__init__(kernel, order, use_rscale)
-        self.use_fft_for_m2l = use_fft_for_m2l
-        if use_preprocessing_for_m2l is None:
-            self.use_preprocessing_for_m2l = use_fft_for_m2l
-        else:
-            self.use_preprocessing_for_m2l = use_preprocessing_for_m2l
+        self.m2l_translation = m2l_translation
 
     def with_kernel(self, kernel):
         return type(self)(kernel, self.order, self.use_rscale,
-                use_fft_for_m2l=self.use_fft_for_m2l,
-                use_preprocessing_for_m2l=self.use_preprocessing_for_m2l)
+            self.m2l_translation)
 
     def update_persistent_hash(self, key_hash, key_builder):
         super().update_persistent_hash(key_hash, key_builder)
-        key_builder.rec(key_hash, self.use_fft_for_m2l)
-        key_builder.rec(key_hash, self.use_preprocessing_for_m2l)
+        key_builder.rec(key_hash, self.m2l_translation)
 
     def __eq__(self, other):
         return (
@@ -95,83 +75,9 @@ class LocalExpansionBase(ExpansionBase):
             and self.kernel == other.kernel
             and self.order == other.order
             and self.use_rscale == other.use_rscale
-            and self.use_fft_for_m2l == other.use_fft_for_m2l
-            and self.use_preprocessing_for_m2l == other.use_preprocessing_for_m2l
+            and self.m2l_translation == other.m2l_translation
         )
 
-    def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale,
-            dvec, tgt_rscale, sac) -> Tuple[Any]:
-        """Return an iterable of expressions that needs to be precomputed
-        for multipole-to-local translations that depend only on the
-        distance between the multipole center and the local center which
-        is given as *dvec*.
-
-        Since there are only a finite number of different values for the
-        distance between per level, these can be precomputed for the tree.
-        In :mod:`boxtree`, these distances are referred to as translation
-        classes.
-        """
-        return tuple()
-
-    def m2l_translation_classes_dependent_ndata(self, src_expansion):
-        """Return the number of expressions returned by
-        :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data`.
-        This method exists because calculating the number of expressions using
-        the above method might be costly and
-        :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data`
-        cannot be memoized due to it having side effects through the argument
-        *sac*.
-        """
-        return 0
-
-    def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac,
-            src_rscale):
-        """Return the preprocessed multipole expansion for an optimized M2L.
-        Preprocessing happens once per source box before M2L translation is done.
-
-        When FFT is turned on, the input expressions are transformed into Fourier
-        space. These expressions are used in a separate :mod:`loopy` kernel
-        to avoid having to transform for each target and source box pair.
-        When FFT is turned off, the expressions are equal to the multipole
-        expansion coefficients with zeros added
-        to make the M2L computation a circulant matvec.
-        """
-        raise NotImplementedError
-
-    def m2l_preprocess_multipole_nexprs(self, src_expansion):
-        """Return the number of expressions returned by
-        :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_preprocess_multipole_exprs`.
-        This method exists because calculating the number of expressions using
-        the above method might be costly and it cannot be memoized due to it having
-        side effects through the argument *sac*.
-        """
-        # For all use-cases we have right now, this is equal to the number of
-        # translation classes dependent exprs. Use that as a default.
-        return self.m2l_translation_classes_dependent_ndata(src_expansion)
-
-    def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale,
-            tgt_rscale, sac):
-        """Return postprocessed local expansion for an optimized M2L.
-        Postprocessing happens once per target box just after the M2L translation
-        is done and before storing the expansion coefficients for the local
-        expansion.
-
-        When FFT is turned on, the output expressions are transformed from Fourier
-        space back to the original space.
-        """
-        raise NotImplementedError
-
-    def m2l_postprocess_local_nexprs(self, src_expansion):
-        """Return the number of expressions given as input to
-        :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_postprocess_local_exprs`.
-        This method exists because calculating the number of expressions using
-        the above method might be costly and it cannot be memoized due to it
-        having side effects through the argument *sac*.
-        """
-        # For all use-cases we have right now, this is equal to the number of
-        # translation classes dependent exprs. Use that as a default.
-        return self.m2l_translation_classes_dependent_ndata(src_expansion)
-
     def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
             dvec, tgt_rscale, sac=None, m2l_translation_classes_dependent_data=None):
         """Translate from a multipole or local expansion to a local expansion
@@ -188,7 +94,7 @@ class LocalExpansionBase(ExpansionBase):
                 to collect common subexpressions or None.
         :arg m2l_translation_classes_dependent_data: An iterable of symbolic
                 expressions representing the expressions returned by
-                :func:`~sumpy.expansion.local.LocalExpansionBase.m2l_translation_classes_dependent_data`.
+                :func:`~sumpy.expansion.m2l.M2LTranslationBase.translation_classes_dependent_data`.
         """
         raise NotImplementedError
 
@@ -254,6 +160,15 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
     Coefficients represent derivative values of the kernel.
     """
 
+    def __init__(self, kernel, order, use_rscale=None,
+            m2l_translation=None):
+        if not m2l_translation:
+            from sumpy.expansion.m2l import DefaultM2LTranslationClassFactory
+            factory = DefaultM2LTranslationClassFactory()
+            m2l_translation = factory.get_m2l_translation_class(kernel,
+                self.__class__)()
+        super().__init__(kernel, order, use_rscale, m2l_translation)
+
     def coefficients_from_source_vec(self, kernels, avec, bvec, rscale, weights,
             sac=None):
         """Form an expansion with a linear combination of kernels and weights.
@@ -319,195 +234,6 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
 
         return kernel.postprocess_at_target(result, bvec)
 
-    def m2l_translation_classes_dependent_ndata(self, src_expansion):
-        """Returns number of expressions in M2L global precomputation step.
-        """
-        mis_with_dummy_rows, _, _ = \
-            self._m2l_translation_classes_dependent_data_mis(src_expansion)
-
-        return len(mis_with_dummy_rows)
-
-    def _m2l_translation_classes_dependent_data_mis(self, src_expansion):
-        """We would like to compute the M2L by way of a circulant matrix below.
-        To get the matrix representing the M2L into circulant form, a certain
-        numbering of rows and columns (as identified by multi-indices) is
-        required. This routine returns that numbering.
-
-        .. note::
-
-            The set of multi-indices returned may be a superset of the
-            coefficients used by the expansion. On the input end, those
-            coefficients are taken as zero. On output, they are simply
-            dropped from the computed result.
-
-        This method returns the multi-indices representing the rows
-        of the circulant matrix, the multi-indices representing the rows
-        of the M2L translation matrix and the maximum multi-index of the
-        latter.
-        """
-        from pytools import generate_nonnegative_integer_tuples_below as gnitb
-        from sumpy.tools import add_mi
-
-        # max_mi is the multi-index which is the sum of the
-        # element-wise maximum of source multi-indices and the
-        # element-wise maximum of target multi-indices.
-        max_mi = [0]*self.dim
-        for i in range(self.dim):
-            max_mi[i] = max(mi[i] for mi in
-                              src_expansion.get_coefficient_identifiers())
-            max_mi[i] += max(mi[i] for mi in
-                              self.get_coefficient_identifiers())
-
-        # These are the multi-indices representing the rows
-        # in the circulant matrix.  Note that to get the circulant
-        # matrix structure some multi-indices that are not in the
-        # M2L translation matrix are added.
-        # This corresponds to adding O(p^(d-1))
-        # additional rows and columns in the case of some PDEs
-        # like Laplace and O(p^d) in other cases.
-        circulant_matrix_mis = list(gnitb([m + 1 for m in max_mi]))
-
-        # These are the multi-indices representing the rows
-        # in the M2L translation matrix without the additional
-        # multi-indices in the circulant matrix
-        needed_vector_terms = set()
-        # For eg: 2D full Taylor Laplace, we only need kernel derivatives
-        # (n1+n2, m1+m2), n1+m1<=p, n2+m2<=p
-        for tgt_deriv in self.get_coefficient_identifiers():
-            for src_deriv in src_expansion.get_coefficient_identifiers():
-                needed = add_mi(src_deriv, tgt_deriv)
-                if needed not in needed_vector_terms:
-                    needed_vector_terms.add(needed)
-
-        return circulant_matrix_mis, tuple(needed_vector_terms), max_mi
-
-    def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale,
-            dvec, tgt_rscale, sac):
-
-        # We know the general form of the multipole expansion is:
-        #
-        #  coeff0 * diff(kernel(src - c1), mi0) +
-        #    coeff1 * diff(kernel(src - c1), mi1) + ...
-        #
-        # To get the local expansion coefficients, we take derivatives of
-        # the multipole expansion. For eg: the coefficient w.r.t mir is
-        #
-        #  coeff0 * diff(kernel(c2 - c1), mi0 + mir) +
-        #    coeff1 * diff(kernel(c2 - c1), mi1 + mir) + ...
-        #
-        # The derivatives above depends only on `c2 - c1` and can be precomputed
-        # globally as there are only a finite number of values for `c2 - c1` for
-        # m2l.
-
-        if not self.use_rscale:
-            src_rscale = 1
-
-        circulant_matrix_mis, needed_vector_terms, max_mi = \
-            self._m2l_translation_classes_dependent_data_mis(src_expansion)
-
-        circulant_matrix_ident_to_index = {ident: i for i, ident in
-                                enumerate(circulant_matrix_mis)}
-
-        # Create a expansion terms wrangler for derivatives up to order
-        # (tgt order)+(src order) including a corresponding reduction matrix
-        # For eg: 2D full Taylor Laplace, this is (n, m),
-        # n+m<=2*p, n<=2*p, m<=2*p
-        srcplusderiv_terms_wrangler = \
-            src_expansion.expansion_terms_wrangler.copy(
-                    order=self.order + src_expansion.order, max_mi=tuple(max_mi))
-        srcplusderiv_full_coeff_ids = \
-            srcplusderiv_terms_wrangler.get_full_coefficient_identifiers()
-        srcplusderiv_ident_to_index = {ident: i for i, ident in
-                            enumerate(srcplusderiv_full_coeff_ids)}
-
-        # The vector has the kernel derivatives and depends only on the distance
-        # between the two centers
-        taker = src_expansion.kernel.get_derivative_taker(dvec, src_rscale, sac)
-        vector_stored = []
-        # Calculate the kernel derivatives for the compressed set
-        for term in \
-                srcplusderiv_terms_wrangler.get_coefficient_identifiers():
-            kernel_deriv = taker.diff(term)
-            vector_stored.append(kernel_deriv)
-        # Calculate the kernel derivatives for the full set
-        vector_full = \
-            srcplusderiv_terms_wrangler.get_full_kernel_derivatives_from_stored(
-                        vector_stored, src_rscale)
-
-        for term in srcplusderiv_full_coeff_ids:
-            assert term in needed_vector_terms
-
-        vector = [0]*len(needed_vector_terms)
-        for i, term in enumerate(needed_vector_terms):
-            vector[i] = add_to_sac(sac,
-                        vector_full[srcplusderiv_ident_to_index[term]])
-
-        # Add zero values needed to make the translation matrix circulant
-        derivatives_full = [0]*len(circulant_matrix_mis)
-        for expr, mi in zip(vector, needed_vector_terms):
-            derivatives_full[circulant_matrix_ident_to_index[mi]] = expr
-
-        if self.use_fft_for_m2l:
-            # Note that the matrix we have now is a mirror image of a
-            # circulant matrix. We reverse the first column to get the
-            # first column for the circulant matrix and then finally
-            # use the FFT for convolution represented by the circulant
-            # matrix.
-            return fft(list(reversed(derivatives_full)), sac=sac)
-
-        return derivatives_full
-
-    def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac,
-            src_rscale):
-        circulant_matrix_mis, needed_vector_terms, max_mi = \
-                self._m2l_translation_classes_dependent_data_mis(src_expansion)
-        circulant_matrix_ident_to_index = {ident: i for i, ident in
-                            enumerate(circulant_matrix_mis)}
-
-        # Calculate the input vector for the circulant matrix
-        input_vector = [0] * len(circulant_matrix_mis)
-        for coeff, term in zip(
-                src_coeff_exprs,
-                src_expansion.get_coefficient_identifiers()):
-            input_vector[circulant_matrix_ident_to_index[term]] = \
-                    add_to_sac(sac, coeff)
-
-        if self.use_fft_for_m2l:
-            return fft(input_vector, sac=sac)
-        else:
-            return input_vector
-
-    def m2l_preprocess_multipole_nexprs(self, src_expansion):
-        circulant_matrix_mis, _, _ = \
-            self._m2l_translation_classes_dependent_data_mis(src_expansion)
-        return len(circulant_matrix_mis)
-
-    def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale,
-            tgt_rscale, sac):
-        circulant_matrix_mis, needed_vector_terms, max_mi = \
-                self._m2l_translation_classes_dependent_data_mis(src_expansion)
-        circulant_matrix_ident_to_index = {ident: i for i, ident in
-                            enumerate(circulant_matrix_mis)}
-
-        if self.use_fft_for_m2l:
-            n = len(circulant_matrix_mis)
-            m2l_result = fft(m2l_result, inverse=True, sac=sac)
-            # since we reversed the M2L matrix, we reverse the result
-            # to get the correct result
-            m2l_result = list(reversed(m2l_result[:n]))
-
-        # Filter out the dummy rows and scale them for target
-        rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale)
-        result = [
-            m2l_result[circulant_matrix_ident_to_index[term]]
-            * rscale_ratio**sum(term)
-            for term in self.get_coefficient_identifiers()]
-
-        return result
-
-    def m2l_postprocess_local_nexprs(self, src_expansion):
-        return self.m2l_translation_classes_dependent_ndata(src_expansion)
-
     def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
             dvec, tgt_rscale, sac=None, _fast_version=True,
             m2l_translation_classes_dependent_data=None):
@@ -527,32 +253,9 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
         # {{{ M2L
 
         if isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase):
-            circulant_matrix_mis, needed_vector_terms, max_mi = \
-                self._m2l_translation_classes_dependent_data_mis(src_expansion)
-
-            if not m2l_translation_classes_dependent_data:
-                derivatives = self.m2l_translation_classes_dependent_data(
-                        src_expansion, src_rscale, dvec, tgt_rscale, sac)
-            else:
-                derivatives = m2l_translation_classes_dependent_data
-
-            if self.use_fft_for_m2l:
-                assert len(src_coeff_exprs) == len(derivatives)
-                result = [a*b for a, b in zip(derivatives, src_coeff_exprs)]
-            else:
-                if not self.use_preprocessing_for_m2l:
-                    src_coeff_exprs = self.m2l_preprocess_multipole_exprs(
-                        src_expansion, src_coeff_exprs, sac, src_rscale)
-
-                # Returns a big symbolic sum of matrix entries
-                # (FIXME? Though this is just the correctness-checking
-                # fallback for the FFT anyhow)
-                result = matvec_toeplitz_upper_triangular(src_coeff_exprs,
-                    derivatives)
-
-                if not self.use_preprocessing_for_m2l:
-                    result = self.m2l_postprocess_local_exprs(src_expansion,
-                        result, src_rscale, tgt_rscale, sac)
+            result = self.m2l_translation.translate(self, src_expansion,
+                src_coeff_exprs, src_rscale, dvec, tgt_rscale, sac,
+                m2l_translation_classes_dependent_data)
 
             logger.info("building translation operator: done")
             return result
@@ -687,7 +390,6 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
                     (taker.diff(mi).xreplace(replace_dict) * rscale_ratio**sum(mi))
                     for mi in self.get_coefficient_identifiers()]
         # }}}
-        # }}}
         logger.info("building translation operator: done")
         return result
 
@@ -695,44 +397,8 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
         from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansionBase
 
         if isinstance(src_expansion, VolumeTaylorMultipoleExpansionBase):
-            if self.use_preprocessing_for_m2l:
-                ncoeff_src = self.m2l_preprocess_multipole_nexprs(src_expansion)
-                ncoeff_tgt = self.m2l_postprocess_local_nexprs(src_expansion)
-                icoeff_src = pymbolic.var("icoeff_src")
-                icoeff_tgt = pymbolic.var("icoeff_tgt")
-                domains = [f"{{[icoeff_tgt]: 0<=icoeff_tgt<{ncoeff_tgt} }}"]
-
-                coeff = pymbolic.var("coeff")
-                src_coeffs = pymbolic.var("src_coeffs")
-                m2l_translation_classes_dependent_data = pymbolic.var("data")
-
-                if self.use_fft_for_m2l:
-                    expr = src_coeffs[icoeff_tgt] \
-                            * m2l_translation_classes_dependent_data[icoeff_tgt]
-                else:
-                    toeplitz_first_row = src_coeffs[icoeff_src-icoeff_tgt]
-                    vector = m2l_translation_classes_dependent_data[icoeff_src]
-                    expr = toeplitz_first_row * vector
-                    domains.append(
-                        f"{{[icoeff_src]: icoeff_tgt<=icoeff_src<{ncoeff_src} }}")
-
-                expr = src_coeffs[icoeff_tgt] \
-                    * m2l_translation_classes_dependent_data[icoeff_tgt]
-
-                insns = [
-                    lp.Assignment(
-                        assignee=coeff[icoeff_tgt],
-                        expression=coeff[icoeff_tgt] + expr),
-                ]
-                return lp.make_function(domains, insns,
-                        kernel_data=[
-                            lp.GlobalArg("coeff, src_coeffs, data",
-                                shape=lp.auto),
-                            lp.ValueArg("src_rscale, tgt_rscale"),
-                            ...],
-                        name="e2e",
-                        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
-                        )
+            return self.m2l_translation.loopy_translate(self, src_expansion)
+
         raise NotImplementedError(
             f"A direct loopy kernel for translation from "
             f"{src_expansion} to {self} is not implemented.")
@@ -742,10 +408,9 @@ class VolumeTaylorLocalExpansion(
         VolumeTaylorExpansion,
         VolumeTaylorLocalExpansionBase):
 
-    def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+    def __init__(self, kernel, order, use_rscale=None, m2l_translation=None):
         VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale,
-                use_fft_for_m2l, use_preprocessing_for_m2l=None)
+            m2l_translation=m2l_translation)
         VolumeTaylorExpansion.__init__(self, kernel, order, use_rscale)
 
 
@@ -753,10 +418,9 @@ class LinearPDEConformingVolumeTaylorLocalExpansion(
         LinearPDEConformingVolumeTaylorExpansion,
         VolumeTaylorLocalExpansionBase):
 
-    def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+    def __init__(self, kernel, order, use_rscale=None, m2l_translation=None):
         VolumeTaylorLocalExpansionBase.__init__(self, kernel, order, use_rscale,
-                use_fft_for_m2l, use_preprocessing_for_m2l)
+            m2l_translation=m2l_translation)
         LinearPDEConformingVolumeTaylorExpansion.__init__(
                 self, kernel, order, use_rscale)
 
@@ -800,21 +464,13 @@ class BiharmonicConformingVolumeTaylorLocalExpansion(
 
 class _FourierBesselLocalExpansion(LocalExpansionBase):
     def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
-
-        if use_fft_for_m2l:
-            # FIXME: expansion with FFT is correct symbolically and can be verified
-            # with sympy. However there are numerical issues that we have to deal
-            # with. Greengard and Rokhlin 1988 attributes this to numerical
-            # instability but gives rscale as a possible solution. Sumpy's rscale
-            # choice is slightly different from Greengard and Rokhlin and that
-            # might be the reason for this numerical issue.
-            raise ValueError("Bessel based expansions with FFT is not fully "
-                             "supported yet.")
-
-        super().__init__(kernel, order, use_rscale,
-                use_fft_for_m2l=use_fft_for_m2l,
-                use_preprocessing_for_m2l=use_preprocessing_for_m2l)
+            m2l_translation=None):
+        if not m2l_translation:
+            from sumpy.expansion.m2l import DefaultM2LTranslationClassFactory
+            factory = DefaultM2LTranslationClassFactory()
+            m2l_translation = factory.get_m2l_translation_class(kernel,
+                self.__class__)()
+        super().__init__(kernel, order, use_rscale, m2l_translation)
 
     def get_storage_index(self, k):
         return self.order+k
@@ -856,94 +512,6 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
                        * sym.exp(sym.I * c * -target_angle_rel_center), bvec)
                 for c in self.get_coefficient_identifiers())
 
-    def m2l_translation_classes_dependent_ndata(self, src_expansion):
-        nexpr = 2 * self.order + 2 * src_expansion.order + 1
-        return nexpr
-
-    def m2l_translation_classes_dependent_data(self, src_expansion, src_rscale,
-            dvec, tgt_rscale, sac):
-
-        from sumpy.symbolic import sym_real_norm_2, Hankel1
-        from sumpy.tools import fft
-
-        dvec_len = sym_real_norm_2(dvec)
-        new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
-        arg_scale = self.get_bessel_arg_scaling()
-        # [-(src_order+tgt_order), ..., 0, ..., (src_order + tgt_order)]
-        m2l_translation_classes_dependent_data = \
-                [0] * (2*self.order + 2 * src_expansion.order + 1)
-
-        # The M2L is a mirror image of a Toeplitz matvec with Hankel function
-        # evaluations. https://dlmf.nist.gov/10.23.F1
-        # This loop computes the first row and the last column vector sufficient
-        # to specify the matrix entries.
-        for j in self.get_coefficient_identifiers():
-            idx_j = self.get_storage_index(j)
-            for m in src_expansion.get_coefficient_identifiers():
-                idx_m = src_expansion.get_storage_index(m)
-                m2l_translation_classes_dependent_data[idx_j + idx_m] = (
-                    Hankel1(m + j, arg_scale * dvec_len, 0)
-                    * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center))
-
-        if self.use_fft_for_m2l:
-            order = src_expansion.order
-            # For this expansion, we have a mirror image of a Toeplitz matrix.
-            # First, we have to take the mirror image of the M2L matrix.
-            #
-            # After that the Toeplitz matrix has to be embedded in a circulant
-            # matrix. In this cicrcular matrix the first part of the first
-            # column is the first column of the Toeplitz matrix which is
-            # the last column of the M2L matrix. The second part is the
-            # reverse of the first row of the Toeplitz matrix which
-            # is the reverse of the first row of the M2L matrix.
-            first_row_m2l, last_column_m2l = \
-                m2l_translation_classes_dependent_data[:2*order], \
-                    m2l_translation_classes_dependent_data[2*order:]
-            first_column_toeplitz = last_column_m2l
-            first_row_toeplitz = list(reversed(first_row_m2l))
-
-            first_column_circulant = list(first_column_toeplitz) + \
-                    list(reversed(first_row_toeplitz))
-            return fft(first_column_circulant, sac)
-        else:
-            return m2l_translation_classes_dependent_data
-
-    def m2l_preprocess_multipole_exprs(self, src_expansion, src_coeff_exprs, sac,
-            src_rscale):
-
-        from sumpy.tools import fft
-        src_coeff_exprs = list(src_coeff_exprs)
-        for m in src_expansion.get_coefficient_identifiers():
-            src_coeff_exprs[src_expansion.get_storage_index(m)] *= src_rscale**abs(m)
-
-        if self.use_fft_for_m2l:
-            src_coeff_exprs = list(reversed(src_coeff_exprs))
-            src_coeff_exprs += [0] * (len(src_coeff_exprs) - 1)
-            return fft(src_coeff_exprs, sac=sac)
-        else:
-            return src_coeff_exprs
-
-    def m2l_preprocess_multipole_nexprs(self, src_expansion):
-        return 2*src_expansion.order + 1
-
-    def m2l_postprocess_local_exprs(self, src_expansion, m2l_result, src_rscale,
-            tgt_rscale, sac):
-
-        if self.use_fft_for_m2l:
-            m2l_result = fft(m2l_result, inverse=True, sac=sac)
-            m2l_result = m2l_result[:2*self.order+1]
-
-        # Filter out the dummy rows and scale them for target
-        result = []
-        for j in self.get_coefficient_identifiers():
-            result.append(m2l_result[self.get_storage_index(j)]
-                    * tgt_rscale**(abs(j)) * sym.Integer(-1)**j)
-
-        return result
-
-    def m2l_postprocess_local_nexprs(self, src_expansion):
-        return 2*self.order + 1
-
     def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
             dvec, tgt_rscale, sac=None, m2l_translation_classes_dependent_data=None):
         from sumpy.symbolic import sym_real_norm_2, BesselJ
@@ -970,34 +538,9 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
             return translated_coeffs
 
         if isinstance(src_expansion, self.mpole_expn_class):
-            if m2l_translation_classes_dependent_data is None:
-                derivatives = self.m2l_translation_classes_dependent_data(
-                    src_expansion, src_rscale, dvec, tgt_rscale, sac=sac)
-            else:
-                derivatives = m2l_translation_classes_dependent_data
-
-            if self.use_fft_for_m2l:
-                assert m2l_translation_classes_dependent_data is not None
-                assert len(derivatives) == len(src_coeff_exprs)
-                translated_coeffs = [a * b for a, b in zip(derivatives,
-                    src_coeff_exprs)]
-            else:
-                if not self.use_preprocessing_for_m2l:
-                    src_coeff_exprs = self.m2l_preprocess_multipole_exprs(
-                        src_expansion, src_coeff_exprs, sac, src_rscale)
-
-                translated_coeffs = [
-                    sum(derivatives[m + j + self.order + src_expansion.order]
-                            * src_coeff_exprs[src_expansion.get_storage_index(m)]
-                        for m in src_expansion.get_coefficient_identifiers())
-                    for j in self.get_coefficient_identifiers()]
-
-                if not self.use_preprocessing_for_m2l:
-                    translated_coeffs = self.m2l_postprocess_local_exprs(
-                        src_expansion, translated_coeffs, src_rscale, tgt_rscale,
-                        sac)
-
-            return translated_coeffs
+            return self.m2l_translation.translate(self, src_expansion,
+                src_coeff_exprs, src_rscale, dvec, tgt_rscale, sac,
+                m2l_translation_classes_dependent_data)
 
         raise RuntimeError(
             "do not know how to translate "
@@ -1005,57 +548,20 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
 
     def loopy_translate_from(self, src_expansion):
         if isinstance(src_expansion, self.mpole_expn_class):
-            if self.use_preprocessing_for_m2l:
-                ncoeff_src = self.m2l_preprocess_multipole_nexprs(src_expansion)
-                ncoeff_tgt = self.m2l_postprocess_local_nexprs(src_expansion)
-
-                icoeff_src = pymbolic.var("icoeff_src")
-                icoeff_tgt = pymbolic.var("icoeff_tgt")
-                domains = [f"{{[icoeff_tgt]: 0<=icoeff_tgt<{ncoeff_tgt} }}"]
+            return self.m2l_translation.loopy_translate(self, src_expansion)
 
-                coeff = pymbolic.var("coeff")
-                src_coeffs = pymbolic.var("src_coeffs")
-                m2l_translation_classes_dependent_data = pymbolic.var("data")
-
-                if self.use_fft_for_m2l:
-                    expr = src_coeffs[icoeff_tgt] \
-                            * m2l_translation_classes_dependent_data[icoeff_tgt]
-                else:
-                    expr = src_coeffs[icoeff_src] \
-                           * m2l_translation_classes_dependent_data[
-                                   icoeff_tgt + icoeff_src]
-                    domains.append(
-                            f"{{[icoeff_src]: 0<=icoeff_src<{ncoeff_src} }}")
-
-                insns = [
-                    lp.Assignment(
-                        assignee=coeff[icoeff_tgt],
-                        expression=coeff[icoeff_tgt] + expr),
-                ]
-                return lp.make_function(domains, insns,
-                        kernel_data=[
-                            lp.GlobalArg("coeff, src_coeffs, data",
-                                shape=lp.auto),
-                            lp.ValueArg("src_rscale, tgt_rscale"),
-                            ...],
-                        name="e2e",
-                        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
-                        )
         raise NotImplementedError(
             f"A direct loopy kernel for translation from "
             f"{src_expansion} to {self} is not implemented.")
 
 
 class H2DLocalExpansion(_FourierBesselLocalExpansion):
-    def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+    def __init__(self, kernel, order, use_rscale=None, m2l_translation=None):
         from sumpy.kernel import HelmholtzKernel
         assert (isinstance(kernel.get_base_kernel(), HelmholtzKernel)
                 and kernel.dim == 2)
 
-        super().__init__(kernel, order, use_rscale,
-                use_fft_for_m2l=use_fft_for_m2l,
-                use_preprocessing_for_m2l=use_preprocessing_for_m2l)
+        super().__init__(kernel, order, use_rscale, m2l_translation=m2l_translation)
 
         from sumpy.expansion.multipole import H2DMultipoleExpansion
         self.mpole_expn_class = H2DMultipoleExpansion
@@ -1065,15 +571,12 @@ class H2DLocalExpansion(_FourierBesselLocalExpansion):
 
 
 class Y2DLocalExpansion(_FourierBesselLocalExpansion):
-    def __init__(self, kernel, order, use_rscale=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+    def __init__(self, kernel, order, use_rscale=None, m2l_translation=None):
         from sumpy.kernel import YukawaKernel
         assert (isinstance(kernel.get_base_kernel(), YukawaKernel)
                 and kernel.dim == 2)
 
-        super().__init__(kernel, order, use_rscale,
-                use_fft_for_m2l=use_fft_for_m2l,
-                use_preprocessing_for_m2l=use_preprocessing_for_m2l)
+        super().__init__(kernel, order, use_rscale, m2l_translation=m2l_translation)
 
         from sumpy.expansion.multipole import Y2DMultipoleExpansion
         self.mpole_expn_class = Y2DMultipoleExpansion
diff --git a/sumpy/expansion/m2l.py b/sumpy/expansion/m2l.py
new file mode 100644
index 00000000..8e61f125
--- /dev/null
+++ b/sumpy/expansion/m2l.py
@@ -0,0 +1,770 @@
+__copyright__ = "Copyright (C) 2022 Isuru Fernando"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from typing import Tuple, Any
+
+import pymbolic
+import loopy as lp
+import sumpy.symbolic as sym
+from sumpy.tools import (
+        add_to_sac, fft,
+        matvec_toeplitz_upper_triangular)
+
+import logging
+logger = logging.getLogger(__name__)
+
+__doc__ = """
+.. autoclass:: M2LTranslationClassFactoryBase
+.. autoclass:: NonFFTM2LTranslationClassFactory
+.. autoclass:: FFTM2LTranslationClassFactory
+.. autoclass:: DefaultM2LTranslationClassFactory
+
+.. autoclass:: M2LTranslationBase
+.. autoclass:: VolumeTaylorM2LTranslation
+.. autoclass:: VolumeTaylorM2LWithFFT
+.. autoclass:: FourierBesselM2LTranslation
+"""
+
+
+# {{{ M2L translation factory
+
+class M2LTranslationClassFactoryBase:
+    """An interface
+    .. automethod:: get_m2l_translation_class
+    """
+
+    def get_m2l_translation_class(self, base_kernel, local_expansion_class):
+        """Returns a subclass of :class:`M2LTranslationBase` suitable for
+        *base_kernel* and *local_expansion_class*.
+        """
+        raise NotImplementedError()
+
+
+class NonFFTM2LTranslationClassFactory(M2LTranslationClassFactoryBase):
+    """An implementation of :class:`M2LTranslationClassFactoryBase` that uses
+    non FFT M2L translation class.
+    """
+
+    def get_m2l_translation_class(self, base_kernel, local_expansion_class):
+        """Returns a subclass of :class:`M2LTranslationBase` suitable for
+        *base_kernel* and *local_expansion_class*.
+        """
+        from sumpy.expansion.local import (VolumeTaylorLocalExpansionBase,
+            _FourierBesselLocalExpansion)
+        if issubclass(local_expansion_class, VolumeTaylorLocalExpansionBase):
+            return VolumeTaylorM2LTranslation
+        elif issubclass(local_expansion_class, _FourierBesselLocalExpansion):
+            return FourierBesselM2LTranslation
+        else:
+            raise RuntimeError(
+                f"Unknown local_expansion_class: {local_expansion_class}")
+
+
+class FFTM2LTranslationClassFactory(M2LTranslationClassFactoryBase):
+    """An implementation of :class:`M2LTranslationClassFactoryBase` that uses
+    FFT M2L translation class.
+    """
+
+    def get_m2l_translation_class(self, base_kernel, local_expansion_class):
+        """Returns a subclass of :class:`M2LTranslationBase` suitable for
+        *base_kernel* and *local_expansion_class*.
+        """
+        from sumpy.expansion.local import (VolumeTaylorLocalExpansionBase,
+            _FourierBesselLocalExpansion)
+        if issubclass(local_expansion_class, VolumeTaylorLocalExpansionBase):
+            return VolumeTaylorM2LWithFFT
+        elif issubclass(local_expansion_class, _FourierBesselLocalExpansion):
+            return FourierBesselM2LWithFFT
+        else:
+            raise RuntimeError(
+                f"Unknown local_expansion_class: {local_expansion_class}")
+
+
+class DefaultM2LTranslationClassFactory(M2LTranslationClassFactoryBase):
+    """An implementation of :class:`M2LTranslationClassFactoryBase` that gives the
+    'best known' translation type for each kernel and local expansion class"""
+    def get_m2l_translation_class(self, base_kernel, local_expansion_class):
+        from sumpy.expansion.local import (VolumeTaylorLocalExpansionBase,
+            _FourierBesselLocalExpansion)
+        from sumpy.kernel import LaplaceKernel
+        if issubclass(local_expansion_class, VolumeTaylorLocalExpansionBase):
+            if isinstance(base_kernel, LaplaceKernel):
+                return VolumeTaylorM2LWithFFT
+            else:
+                return VolumeTaylorM2LTranslation
+        elif issubclass(local_expansion_class, _FourierBesselLocalExpansion):
+            return FourierBesselM2LTranslation
+        else:
+            raise RuntimeError(
+                f"Unknown local_expansion_class: {local_expansion_class}")
+
+
+# }}}
+
+# {{{ M2LTranslationBase
+
+class M2LTranslationBase:
+    """Base class for Multipole to Local Translation
+
+    .. automethod:: translate
+    .. automethod:: loopy_translate
+    .. automethod:: translation_classes_dependent_data
+    .. automethod:: translation_classes_dependent_ndata
+    .. automethod:: preprocess_multipole_exprs
+    .. automethod:: preprocess_multipole_nexprs
+    .. automethod:: postprocess_local_exprs
+    .. automethod:: postprocess_local_nexprs
+    .. autoattribute:: use_fft
+    .. autoattribute:: use_preprocessing
+    """
+
+    use_fft = False
+    use_preprocessing = False
+
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+        raise NotImplementedError
+
+    def loopy_translate(self, tgt_expansion, src_expansion):
+        raise NotImplementedError(
+            f"A direct loopy kernel for translation from "
+            f"{src_expansion} to {tgt_expansion} using {self} is not implemented.")
+
+    def translation_classes_dependent_data(self, tgt_expansion, src_expansion,
+            src_rscale, dvec, sac) -> Tuple[Any]:
+        """Return an iterable of expressions that needs to be precomputed
+        for multipole-to-local translations that depend only on the
+        distance between the multipole center and the local center which
+        is given as *dvec*.
+
+        Since there are only a finite number of different values for the
+        distance between per level, these can be precomputed for the tree.
+        In :mod:`boxtree`, these distances are referred to as translation
+        classes.
+        """
+        return tuple()
+
+    def translation_classes_dependent_ndata(self, tgt_expansion, src_expansion):
+        """Return the number of expressions returned by
+        :func:`~sumpy.expansion.m2l.M2LTranslationBase.translation_classes_dependent_data`.
+        This method exists because calculating the number of expressions using
+        the above method might be costly and
+        :func:`~sumpy.expansion.m2l.M2LTranslationBase.translation_classes_dependent_data`
+        cannot be memoized due to it having side effects through the argument
+        *sac*.
+        """
+        return 0
+
+    def preprocess_multipole_exprs(self, tgt_expansion, src_expansion,
+            src_coeff_exprs, sac, src_rscale):
+        """Return the preprocessed multipole expansion for an optimized M2L.
+        Preprocessing happens once per source box before M2L translation is done.
+
+        When FFT is turned on, the input expressions are transformed into Fourier
+        space. These expressions are used in a separate :mod:`loopy` kernel
+        to avoid having to transform for each target and source box pair.
+        When FFT is turned off, the expressions are equal to the multipole
+        expansion coefficients with zeros added
+        to make the M2L computation a circulant matvec.
+        """
+        raise NotImplementedError
+
+    def preprocess_multipole_nexprs(self, tgt_expansion, src_expansion):
+        """Return the number of expressions returned by
+        :func:`~sumpy.expansion.m2l.M2LTranslationBase.preprocess_multipole_exprs`.
+        This method exists because calculating the number of expressions using
+        the above method might be costly and it cannot be memoized due to it having
+        side effects through the argument *sac*.
+        """
+        # For all use-cases we have right now, this is equal to the number of
+        # translation classes dependent exprs. Use that as a default.
+        return self.translation_classes_dependent_ndata(tgt_expansion,
+            src_expansion)
+
+    def postprocess_local_exprs(self, tgt_expansion, src_expansion, m2l_result,
+            src_rscale, tgt_rscale, sac):
+        """Return postprocessed local expansion for an optimized M2L.
+        Postprocessing happens once per target box just after the M2L translation
+        is done and before storing the expansion coefficients for the local
+        expansion.
+
+        When FFT is turned on, the output expressions are transformed from Fourier
+        space back to the original space.
+        """
+        raise NotImplementedError
+
+    def postprocess_local_nexprs(self, tgt_expansion, src_expansion):
+        """Return the number of expressions given as input to
+        :func:`~sumpy.expansion.m2l.M2LTranslationBase.postprocess_local_exprs`.
+        This method exists because calculating the number of expressions using
+        the above method might be costly and it cannot be memoized due to it
+        having side effects through the argument *sac*.
+        """
+        # For all use-cases we have right now, this is equal to the number of
+        # translation classes dependent exprs. Use that as a default.
+        return self.translation_classes_dependent_ndata(tgt_expansion,
+                                                            src_expansion)
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_hash.update(type(self).__name__.encode("utf8"))
+
+
+# }}} M2LTranslationBase
+
+# {{{ VolumeTaylorM2LTranslation
+
+class VolumeTaylorM2LTranslation(M2LTranslationBase):
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        if translation_classes_dependent_data:
+            derivatives = translation_classes_dependent_data
+        else:
+            derivatives = self.translation_classes_dependent_data(
+                tgt_expansion, src_expansion, src_rscale, dvec, sac=sac)
+
+        src_coeff_exprs = self.preprocess_multipole_exprs(
+            tgt_expansion, src_expansion, src_coeff_exprs, sac, src_rscale)
+
+        # Returns a big symbolic sum of matrix entries
+        # (FIXME? Though this is just the correctness-checking
+        # fallback for the FFT anyhow)
+        result = matvec_toeplitz_upper_triangular(src_coeff_exprs,
+            derivatives)
+
+        result = self.postprocess_local_exprs(tgt_expansion, src_expansion,
+            result, src_rscale, tgt_rscale, sac)
+
+        return result
+
+    def translation_classes_dependent_ndata(self, tgt_expansion, src_expansion):
+        """Returns number of expressions in M2L global precomputation step.
+        """
+        mis_with_dummy_rows, _, _ = \
+            self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                             src_expansion)
+
+        return len(mis_with_dummy_rows)
+
+    def _translation_classes_dependent_data_mis(self, tgt_expansion,
+                                                    src_expansion):
+        """We would like to compute the M2L by way of a circulant matrix below.
+        To get the matrix representing the M2L into circulant form, a certain
+        numbering of rows and columns (as identified by multi-indices) is
+        required. This routine returns that numbering.
+
+        .. note::
+
+            The set of multi-indices returned may be a superset of the
+            coefficients used by the expansion. On the input end, those
+            coefficients are taken as zero. On output, they are simply
+            dropped from the computed result.
+
+        This method returns the multi-indices representing the rows
+        of the circulant matrix, the multi-indices representing the rows
+        of the M2L translation matrix and the maximum multi-index of the
+        latter.
+        """
+        from pytools import generate_nonnegative_integer_tuples_below as gnitb
+        from sumpy.tools import add_mi
+
+        dim = tgt_expansion.dim
+        # max_mi is the multi-index which is the sum of the
+        # element-wise maximum of source multi-indices and the
+        # element-wise maximum of target multi-indices.
+        max_mi = [0]*dim
+        for i in range(dim):
+            max_mi[i] = max(mi[i] for mi in
+                              src_expansion.get_coefficient_identifiers())
+            max_mi[i] += max(mi[i] for mi in
+                              tgt_expansion.get_coefficient_identifiers())
+
+        # These are the multi-indices representing the rows
+        # in the circulant matrix.  Note that to get the circulant
+        # matrix structure some multi-indices that are not in the
+        # M2L translation matrix are added.
+        # This corresponds to adding O(p^(d-1))
+        # additional rows and columns in the case of some PDEs
+        # like Laplace and O(p^d) in other cases.
+        circulant_matrix_mis = list(gnitb([m + 1 for m in max_mi]))
+
+        # These are the multi-indices representing the rows
+        # in the M2L translation matrix without the additional
+        # multi-indices in the circulant matrix
+        needed_vector_terms = set()
+        # For eg: 2D full Taylor Laplace, we only need kernel derivatives
+        # (n1+n2, m1+m2), n1+m1<=p, n2+m2<=p
+        for tgt_deriv in tgt_expansion.get_coefficient_identifiers():
+            for src_deriv in src_expansion.get_coefficient_identifiers():
+                needed = add_mi(src_deriv, tgt_deriv)
+                if needed not in needed_vector_terms:
+                    needed_vector_terms.add(needed)
+
+        return circulant_matrix_mis, tuple(needed_vector_terms), max_mi
+
+    def translation_classes_dependent_data(self, tgt_expansion, src_expansion,
+            src_rscale, dvec, sac):
+
+        # We know the general form of the multipole expansion is:
+        #
+        #  coeff0 * diff(kernel(src - c1), mi0) +
+        #    coeff1 * diff(kernel(src - c1), mi1) + ...
+        #
+        # To get the local expansion coefficients, we take derivatives of
+        # the multipole expansion. For eg: the coefficient w.r.t mir is
+        #
+        #  coeff0 * diff(kernel(c2 - c1), mi0 + mir) +
+        #    coeff1 * diff(kernel(c2 - c1), mi1 + mir) + ...
+        #
+        # The derivatives above depends only on `c2 - c1` and can be precomputed
+        # globally as there are only a finite number of values for `c2 - c1` for
+        # m2l.
+
+        if not tgt_expansion.use_rscale:
+            src_rscale = 1
+
+        circulant_matrix_mis, needed_vector_terms, max_mi = \
+            self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                             src_expansion)
+
+        circulant_matrix_ident_to_index = {ident: i for i, ident in
+                                enumerate(circulant_matrix_mis)}
+
+        # Create a expansion terms wrangler for derivatives up to order
+        # (tgt order)+(src order) including a corresponding reduction matrix
+        # For eg: 2D full Taylor Laplace, this is (n, m),
+        # n+m<=2*p, n<=2*p, m<=2*p
+        srcplusderiv_terms_wrangler = \
+            src_expansion.expansion_terms_wrangler.copy(
+                    order=tgt_expansion.order + src_expansion.order,
+                    max_mi=tuple(max_mi))
+        srcplusderiv_full_coeff_ids = \
+            srcplusderiv_terms_wrangler.get_full_coefficient_identifiers()
+        srcplusderiv_ident_to_index = {ident: i for i, ident in
+                            enumerate(srcplusderiv_full_coeff_ids)}
+
+        # The vector has the kernel derivatives and depends only on the distance
+        # between the two centers
+        taker = src_expansion.kernel.get_derivative_taker(dvec, src_rscale, sac)
+        vector_stored = []
+        # Calculate the kernel derivatives for the compressed set
+        for term in \
+                srcplusderiv_terms_wrangler.get_coefficient_identifiers():
+            kernel_deriv = taker.diff(term)
+            vector_stored.append(kernel_deriv)
+        # Calculate the kernel derivatives for the full set
+        vector_full = \
+            srcplusderiv_terms_wrangler.get_full_kernel_derivatives_from_stored(
+                        vector_stored, src_rscale)
+
+        for term in srcplusderiv_full_coeff_ids:
+            assert term in needed_vector_terms
+
+        vector = [0]*len(needed_vector_terms)
+        for i, term in enumerate(needed_vector_terms):
+            vector[i] = add_to_sac(sac,
+                        vector_full[srcplusderiv_ident_to_index[term]])
+
+        # Add zero values needed to make the translation matrix circulant
+        derivatives_full = [0]*len(circulant_matrix_mis)
+        for expr, mi in zip(vector, needed_vector_terms):
+            derivatives_full[circulant_matrix_ident_to_index[mi]] = expr
+
+        return derivatives_full
+
+    def preprocess_multipole_exprs(self, tgt_expansion, src_expansion,
+            src_coeff_exprs, sac, src_rscale):
+        circulant_matrix_mis, needed_vector_terms, max_mi = \
+                self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                                 src_expansion)
+        circulant_matrix_ident_to_index = {ident: i for i, ident in
+                            enumerate(circulant_matrix_mis)}
+
+        # Calculate the input vector for the circulant matrix
+        input_vector = [0] * len(circulant_matrix_mis)
+        for coeff, term in zip(
+                src_coeff_exprs,
+                src_expansion.get_coefficient_identifiers()):
+            input_vector[circulant_matrix_ident_to_index[term]] = \
+                    add_to_sac(sac, coeff)
+
+        return input_vector
+
+    def preprocess_multipole_nexprs(self, tgt_expansion, src_expansion):
+        circulant_matrix_mis, _, _ = \
+            self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                             src_expansion)
+        return len(circulant_matrix_mis)
+
+    def postprocess_local_exprs(self, tgt_expansion, src_expansion, m2l_result,
+            src_rscale, tgt_rscale, sac):
+        circulant_matrix_mis, needed_vector_terms, max_mi = \
+                self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                                 src_expansion)
+        circulant_matrix_ident_to_index = {ident: i for i, ident in
+                            enumerate(circulant_matrix_mis)}
+
+        # Filter out the dummy rows and scale them for target
+        rscale_ratio = add_to_sac(sac, tgt_rscale/src_rscale)
+        result = [
+            m2l_result[circulant_matrix_ident_to_index[term]]
+            * rscale_ratio**sum(term)
+            for term in tgt_expansion.get_coefficient_identifiers()]
+
+        return result
+
+    def postprocess_local_nexprs(self, tgt_expansion, src_expansion):
+        return self.translation_classes_dependent_ndata(
+            tgt_expansion, src_expansion)
+
+
+# }}} VolumeTaylorM2LTranslation
+
+# {{{ VolumeTaylorM2LWithPreprocessedMultipoles
+
+class VolumeTaylorM2LWithPreprocessedMultipoles(VolumeTaylorM2LTranslation):
+    use_preprocessing = True
+
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        assert translation_classes_dependent_data
+        derivatives = translation_classes_dependent_data
+        # Returns a big symbolic sum of matrix entries
+        # (FIXME? Though this is just the correctness-checking
+        # fallback for the FFT anyhow)
+        result = matvec_toeplitz_upper_triangular(src_coeff_exprs,
+            derivatives)
+        return result
+
+    def loopy_translate(self, tgt_expansion, src_expansion):
+        ncoeff_src = self.preprocess_multipole_nexprs(tgt_expansion,
+                                                          src_expansion)
+        ncoeff_tgt = self.postprocess_local_nexprs(tgt_expansion, src_expansion)
+        icoeff_src = pymbolic.var("icoeff_src")
+        icoeff_tgt = pymbolic.var("icoeff_tgt")
+        domains = [f"{{[icoeff_tgt]: 0<=icoeff_tgt<{ncoeff_tgt} }}"]
+
+        coeff = pymbolic.var("coeff")
+        src_coeffs = pymbolic.var("src_coeffs")
+        translation_classes_dependent_data = pymbolic.var("data")
+
+        if self.use_fft:
+            expr = src_coeffs[icoeff_tgt] \
+                    * translation_classes_dependent_data[icoeff_tgt]
+        else:
+            toeplitz_first_row = src_coeffs[icoeff_src-icoeff_tgt]
+            vector = translation_classes_dependent_data[icoeff_src]
+            expr = toeplitz_first_row * vector
+            domains.append(
+                f"{{[icoeff_src]: icoeff_tgt<=icoeff_src<{ncoeff_src} }}")
+
+        expr = src_coeffs[icoeff_tgt] \
+            * translation_classes_dependent_data[icoeff_tgt]
+
+        insns = [
+            lp.Assignment(
+                assignee=coeff[icoeff_tgt],
+                expression=coeff[icoeff_tgt] + expr),
+        ]
+        return lp.make_function(domains, insns,
+                kernel_data=[
+                    lp.GlobalArg("coeff, src_coeffs, data",
+                        shape=lp.auto),
+                    lp.ValueArg("src_rscale, tgt_rscale"),
+                    ...],
+                name="e2e",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )
+
+
+# }}} VolumeTaylorM2LWithPreprocessedMultipoles
+
+# {{{ VolumeTaylorM2LWithFFT
+
+class VolumeTaylorM2LWithFFT(VolumeTaylorM2LWithPreprocessedMultipoles):
+    use_fft = True
+
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        assert translation_classes_dependent_data
+        derivatives = translation_classes_dependent_data
+        print(src_coeff_exprs, derivatives)
+        assert len(src_coeff_exprs) == len(derivatives)
+        result = [a*b for a, b in zip(derivatives, src_coeff_exprs)]
+        return result
+
+    def translation_classes_dependent_data(self, tgt_expansion, src_expansion,
+            src_rscale, dvec, sac):
+
+        derivatives_full = super().translation_classes_dependent_data(
+            tgt_expansion, src_expansion, src_rscale, dvec, sac)
+        # Note that the matrix we have now is a mirror image of a
+        # circulant matrix. We reverse the first column to get the
+        # first column for the circulant matrix and then finally
+        # use the FFT for convolution represented by the circulant
+        # matrix.
+        return fft(list(reversed(derivatives_full)), sac=sac)
+
+    def preprocess_multipole_exprs(self, tgt_expansion, src_expansion,
+            src_coeff_exprs, sac, src_rscale):
+        input_vector = super().preprocess_multipole_exprs(
+            tgt_expansion, src_expansion, src_coeff_exprs, sac, src_rscale)
+
+        return fft(input_vector, sac=sac)
+
+    def postprocess_local_exprs(self, tgt_expansion, src_expansion, m2l_result,
+            src_rscale, tgt_rscale, sac):
+        circulant_matrix_mis, _, _ = \
+                self._translation_classes_dependent_data_mis(tgt_expansion,
+                                                                 src_expansion)
+        n = len(circulant_matrix_mis)
+        m2l_result = fft(m2l_result, inverse=True, sac=sac)
+        # since we reversed the M2L matrix, we reverse the result
+        # to get the correct result
+        m2l_result = list(reversed(m2l_result[:n]))
+
+        return super().postprocess_local_exprs(tgt_expansion,
+            src_expansion, m2l_result, src_rscale, tgt_rscale, sac)
+
+
+# }}} VolumeTaylorM2LWithFFT
+
+# {{{ FourierBesselM2LTranslation
+
+class FourierBesselM2LTranslation(M2LTranslationBase):
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        if translation_classes_dependent_data is None:
+            derivatives = self.translation_classes_dependent_data(tgt_expansion,
+                src_expansion, src_rscale, dvec, sac=sac)
+        else:
+            derivatives = translation_classes_dependent_data
+
+        src_coeff_exprs = self.preprocess_multipole_exprs(tgt_expansion,
+            src_expansion, src_coeff_exprs, sac, src_rscale)
+
+        translated_coeffs = [
+            sum(derivatives[m + j + tgt_expansion.order + src_expansion.order]
+                    * src_coeff_exprs[src_expansion.get_storage_index(m)]
+                for m in src_expansion.get_coefficient_identifiers())
+            for j in tgt_expansion.get_coefficient_identifiers()]
+
+        translated_coeffs = self.postprocess_local_exprs(tgt_expansion,
+                src_expansion, translated_coeffs, src_rscale, tgt_rscale,
+                sac)
+
+        return translated_coeffs
+
+    def translation_classes_dependent_ndata(self, tgt_expansion, src_expansion):
+        nexpr = 2 * tgt_expansion.order + 2 * src_expansion.order + 1
+        return nexpr
+
+    def translation_classes_dependent_data(self, tgt_expansion, src_expansion,
+            src_rscale, dvec, sac):
+
+        from sumpy.symbolic import sym_real_norm_2, Hankel1
+
+        dvec_len = sym_real_norm_2(dvec)
+        new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
+        arg_scale = tgt_expansion.get_bessel_arg_scaling()
+        # [-(src_order+tgt_order), ..., 0, ..., (src_order + tgt_order)]
+        translation_classes_dependent_data = \
+                [0] * (2*tgt_expansion.order + 2 * src_expansion.order + 1)
+
+        # The M2L is a mirror image of a Toeplitz matvec with Hankel function
+        # evaluations. https://dlmf.nist.gov/10.23.F1
+        # This loop computes the first row and the last column vector sufficient
+        # to specify the matrix entries.
+        for j in tgt_expansion.get_coefficient_identifiers():
+            idx_j = tgt_expansion.get_storage_index(j)
+            for m in src_expansion.get_coefficient_identifiers():
+                idx_m = src_expansion.get_storage_index(m)
+                translation_classes_dependent_data[idx_j + idx_m] = (
+                    Hankel1(m + j, arg_scale * dvec_len, 0)
+                    * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center))
+
+        return translation_classes_dependent_data
+
+    def preprocess_multipole_exprs(self, tgt_expansion, src_expansion,
+            src_coeff_exprs, sac, src_rscale):
+
+        src_coeff_exprs = list(src_coeff_exprs)
+        for m in src_expansion.get_coefficient_identifiers():
+            src_coeff_exprs[src_expansion.get_storage_index(m)] *= src_rscale**abs(m)
+        return src_coeff_exprs
+
+    def preprocess_multipole_nexprs(self, tgt_expansion, src_expansion):
+        return 2*src_expansion.order + 1
+
+    def postprocess_local_exprs(self, tgt_expansion, src_expansion,
+            m2l_result, src_rscale, tgt_rscale, sac):
+
+        # Filter out the dummy rows and scale them for target
+        result = []
+        for j in tgt_expansion.get_coefficient_identifiers():
+            result.append(m2l_result[tgt_expansion.get_storage_index(j)]
+                    * tgt_rscale**(abs(j)) * sym.Integer(-1)**j)
+
+        return result
+
+    def postprocess_local_nexprs(self, tgt_expansion, src_expansion):
+        return 2*tgt_expansion.order + 1
+
+
+# }}} FourierBesselM2LTranslation
+
+# {{{ FourierBesselM2LWithPreprocessedMultipoles
+
+class FourierBesselM2LWithPreprocessedMultipoles(FourierBesselM2LTranslation):
+    use_preprocessing = True
+
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        assert translation_classes_dependent_data
+        derivatives = translation_classes_dependent_data
+
+        translated_coeffs = [
+            sum(derivatives[m + j + tgt_expansion.order + src_expansion.order]
+                    * src_coeff_exprs[src_expansion.get_storage_index(m)]
+                for m in src_expansion.get_coefficient_identifiers())
+            for j in tgt_expansion.get_coefficient_identifiers()]
+
+        return translated_coeffs
+
+    def loopy_translate(self, tgt_expansion, src_expansion):
+        ncoeff_src = self.preprocess_multipole_nexprs(src_expansion)
+        ncoeff_tgt = self.postprocess_local_nexprs(src_expansion)
+
+        icoeff_src = pymbolic.var("icoeff_src")
+        icoeff_tgt = pymbolic.var("icoeff_tgt")
+        domains = [f"{{[icoeff_tgt]: 0<=icoeff_tgt<{ncoeff_tgt} }}"]
+
+        coeff = pymbolic.var("coeff")
+        src_coeffs = pymbolic.var("src_coeffs")
+        translation_classes_dependent_data = pymbolic.var("data")
+
+        if self.use_fft_for_m2l:
+            expr = src_coeffs[icoeff_tgt] \
+                    * translation_classes_dependent_data[icoeff_tgt]
+        else:
+            expr = src_coeffs[icoeff_src] \
+                   * translation_classes_dependent_data[
+                           icoeff_tgt + icoeff_src]
+            domains.append(
+                    f"{{[icoeff_src]: 0<=icoeff_src<{ncoeff_src} }}")
+
+        insns = [
+            lp.Assignment(
+                assignee=coeff[icoeff_tgt],
+                expression=coeff[icoeff_tgt] + expr),
+        ]
+        return lp.make_function(domains, insns,
+                kernel_data=[
+                    lp.GlobalArg("coeff, src_coeffs, data",
+                        shape=lp.auto),
+                    lp.ValueArg("src_rscale, tgt_rscale"),
+                    ...],
+                name="e2e",
+                lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+                )
+
+
+# }}} FourierBesselM2LWithPreprocessedMultipoles
+
+# {{{ FourierBesselM2LWithFFT
+
+class FourierBesselM2LWithFFT(FourierBesselM2LWithPreprocessedMultipoles):
+    use_fft = True
+
+    def __init__(self):
+        # FIXME: expansion with FFT is correct symbolically and can be verified
+        # with sympy. However there are numerical issues that we have to deal
+        # with. Greengard and Rokhlin 1988 attributes this to numerical
+        # instability but gives rscale as a possible solution. Sumpy's rscale
+        # choice is slightly different from Greengard and Rokhlin and that
+        # might be the reason for this numerical issue.
+        raise ValueError("Bessel based expansions with FFT is not fully "
+                         "supported yet.")
+
+    def translate(self, tgt_expansion, src_expansion, src_coeff_exprs, src_rscale,
+            dvec, tgt_rscale, sac=None, translation_classes_dependent_data=None):
+
+        assert translation_classes_dependent_data is not None
+        derivatives = translation_classes_dependent_data
+        assert len(derivatives) == len(src_coeff_exprs)
+        return [a * b for a, b in zip(derivatives, src_coeff_exprs)]
+
+    def loopy_translate(self, tgt_expansion, src_expansion):
+        raise NotImplementedError
+
+    def translation_classes_dependent_data(self, tgt_expansion, src_expansion,
+            src_rscale, dvec, sac):
+
+        from sumpy.tools import fft
+        translation_classes_dependent_data = \
+            super().translation_classes_dependent_data(tgt_expansion,
+                src_expansion, src_rscale, dvec, sac)
+        order = src_expansion.order
+        # For this expansion, we have a mirror image of a Toeplitz matrix.
+        # First, we have to take the mirror image of the M2L matrix.
+        #
+        # After that the Toeplitz matrix has to be embedded in a circulant
+        # matrix. In this cicrcular matrix the first part of the first
+        # column is the first column of the Toeplitz matrix which is
+        # the last column of the M2L matrix. The second part is the
+        # reverse of the first row of the Toeplitz matrix which
+        # is the reverse of the first row of the M2L matrix.
+        first_row_m2l, last_column_m2l = \
+            translation_classes_dependent_data[:2*order], \
+                translation_classes_dependent_data[2*order:]
+        first_column_toeplitz = last_column_m2l
+        first_row_toeplitz = list(reversed(first_row_m2l))
+
+        first_column_circulant = list(first_column_toeplitz) + \
+                list(reversed(first_row_toeplitz))
+        return fft(first_column_circulant, sac)
+
+    def preprocess_multipole_exprs(self, tgt_expansion, src_expansion,
+            src_coeff_exprs, sac, src_rscale):
+
+        from sumpy.tools import fft
+        result = super().preprocess_multipole_exprs(tgt_expansion,
+            src_expansion, src_coeff_exprs, sac, src_rscale)
+
+        result = list(reversed(result))
+        result += [0] * (len(result) - 1)
+        return fft(result, sac=sac)
+
+    def postprocess_local_exprs(self, tgt_expansion, src_expansion,
+            m2l_result, src_rscale, tgt_rscale, sac):
+
+        m2l_result = fft(m2l_result, inverse=True, sac=sac)
+        m2l_result = m2l_result[:2*tgt_expansion.order+1]
+        return super().postprocess_local_exprs(tgt_expansion,
+            src_expansion, m2l_result, src_rscale, tgt_rscale, sac)
+
+# }}} FourierBesselM2LWithFFT
+# vim: fdm=marker
diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index 52cb9a60..519b1200 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -62,8 +62,7 @@ class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
             multipole_expansion_factory,
             local_expansion_factory,
             target_kernels, exclude_self=False, use_rscale=None,
-            strength_usage=None, source_kernels=None,
-            use_fft_for_m2l=False, use_preprocessing_for_m2l=None):
+            strength_usage=None, source_kernels=None):
         """
         :arg multipole_expansion_factory: a callable of a single argument (order)
             that returns a multipole expansion.
@@ -73,10 +72,6 @@ class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
         :arg exclude_self: whether the self contribution should be excluded
         :arg strength_usage: passed unchanged to p2l, p2m and p2p.
         :arg source_kernels: passed unchanged to p2l, p2m and p2p.
-        :arg use_fft_for_m2l: Use an FFT based multipole-to-local expansion.
-        :arg use_preprocessing_for_m2l: do preprocessing of the source multipole
-            expansion and postprocessing of the target local expansion for
-            multipole-to-local expansion.
         """
         self.multipole_expansion_factory = multipole_expansion_factory
         self.local_expansion_factory = local_expansion_factory
@@ -85,11 +80,6 @@ class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
         self.exclude_self = exclude_self
         self.use_rscale = use_rscale
         self.strength_usage = strength_usage
-        self.use_fft_for_m2l = use_fft_for_m2l
-        if use_preprocessing_for_m2l is None:
-            self.use_preprocessing_for_m2l = use_fft_for_m2l
-        else:
-            self.use_preprocessing_for_m2l = use_preprocessing_for_m2l
 
         super().__init__()
 
@@ -106,9 +96,11 @@ class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler):
 
     @memoize_method
     def local_expansion(self, order):
-        return self.local_expansion_factory(order, self.use_rscale,
-                use_fft_for_m2l=self.use_fft_for_m2l,
-                use_preprocessing_for_m2l=self.use_preprocessing_for_m2l)
+        return self.local_expansion_factory(order, self.use_rscale)
+
+    @property
+    def m2l_translation(self):
+        return self.local_expansion(0).m2l_translation
 
     @memoize_method
     def p2m(self, tgt_order):
@@ -274,7 +266,7 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
 
         self.dtype = dtype
 
-        if not self.tree_indep.use_fft_for_m2l:
+        if not self.tree_indep.m2l_translation.use_fft:
             # If not FFT, we don't need complex dtypes
             self.preprocessed_mpole_dtype = dtype
         elif preprocessed_mpole_dtype is not None:
@@ -320,16 +312,19 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
             self.supports_translation_classes = True
 
         self.translation_classes_data = translation_classes_data
-        self.use_fft_for_m2l = self.tree_indep.use_fft_for_m2l
 
     def level_to_rscale(self, level):
         tree = self.tree
         order = self.level_orders[level]
+        r = tree.root_extent * (2**-level)
 
         # See L. Greengard and V. Rokhlin. On the efficient implementation of the
         # fast multipole algorithm. Technical report,
         # YALE UNIV NEW HAVEN CT DEPT OF COMPUTER SCIENCE, 1988.
-        return tree.root_extent * (2**-level) / order
+        # rscale that we use in sumpy is the inverse of the scaling used in the
+        # paper and therefore we should use r / order. However empirically
+        # we have observed that 2r / order is better for numerical stability.
+        return r * 2 / order
 
     # {{{ data vector utilities
 
@@ -358,7 +353,9 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
         def order_to_size(order):
             mpole_expn = self.tree_indep.multipole_expansion(order)
             local_expn = self.tree_indep.local_expansion(order)
-            return local_expn.m2l_translation_classes_dependent_ndata(mpole_expn)
+            m2l_translation = local_expn.m2l_translation
+            return m2l_translation.translation_classes_dependent_ndata(
+                    local_expn, mpole_expn)
 
         return build_csr_level_starts(self.level_orders, order_to_size,
                 level_starts=self.m2l_translation_class_level_start_box_nrs())
@@ -429,7 +426,8 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
         def order_to_size(order):
             mpole_expn = self.tree_indep.multipole_expansion(order)
             local_expn = self.tree_indep.local_expansion(order)
-            res = local_expn.m2l_preprocess_multipole_nexprs(mpole_expn)
+            res = local_expn.m2l_translation.preprocess_multipole_nexprs(
+                local_expn, mpole_expn)
             return res
 
         return build_csr_level_starts(self.level_orders, order_to_size,
@@ -723,7 +721,7 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
         queue = mpole_exps.queue
         local_exps = self.local_expansion_zeros(mpole_exps)
 
-        if self.tree_indep.use_preprocessing_for_m2l:
+        if self.tree_indep.m2l_translation.use_preprocessing:
             preprocessed_mpole_exps = \
                 self.m2l_preproc_mpole_expansion_zeros(mpole_exps)
             for lev in range(self.tree.nlevels):
@@ -803,7 +801,7 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
 
         postprocess_evts = []
 
-        if self.tree_indep.use_preprocessing_for_m2l:
+        if self.tree_indep.m2l_translation.use_preprocessing:
             for lev in range(self.tree.nlevels):
                 order = self.level_orders[lev]
                 postprocess_local_kernel = \
diff --git a/sumpy/toys.py b/sumpy/toys.py
index 3b8876db..ded4a54e 100644
--- a/sumpy/toys.py
+++ b/sumpy/toys.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 from pytools import memoize_method
 from numbers import Number
+from functools import partial
 from sumpy.kernel import TargetTransformationRemover
 
 import numpy as np  # noqa: F401
@@ -103,8 +104,15 @@ class ToyContext:
             mpole_expn_class = \
                     expansion_factory.get_multipole_expansion_class(kernel)
         if local_expn_class is None:
+            from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
             local_expn_class = \
                     expansion_factory.get_local_expansion_class(kernel)
+            m2l_translation_class_factory = NonFFTM2LTranslationClassFactory()
+            m2l_translation_class = \
+                    m2l_translation_class_factory.get_m2l_translation_class(
+                        kernel, local_expn_class)
+            local_expn_class = partial(local_expn_class,
+                    m2l_translation=m2l_translation_class())
 
         self.mpole_expn_class = mpole_expn_class
         self.local_expn_class = local_expn_class
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 8e7bc329..cbf66e6d 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -185,11 +185,21 @@ def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class,
     for order in order_values:
         target_kernels = [knl]
 
+        if use_fft:
+            from sumpy.expansion.m2l import FFTM2LTranslationClassFactory
+            m2l_translation_factory = FFTM2LTranslationClassFactory()
+        else:
+            from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
+            m2l_translation_factory = NonFFTM2LTranslationClassFactory()
+
+        m2l_translation = m2l_translation_factory.get_m2l_translation_class(
+                knl, local_expn_class)()
+
         tree_indep = SumpyTreeIndependentDataForWrangler(
                 ctx,
                 partial(mpole_expn_class, knl),
-                partial(local_expn_class, knl),
-                target_kernels, use_fft_for_m2l=use_fft)
+                partial(local_expn_class, knl, m2l_translation=m2l_translation),
+                target_kernels)
 
         if order_varies_with_level:
             def fmm_level_to_order(kernel, kernel_args, tree, lev):
diff --git a/test/test_kernels.py b/test/test_kernels.py
index 1e0db7ce..f2d89b86 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -36,6 +36,7 @@ from sumpy.expansion.multipole import (
 from sumpy.expansion.local import (
         VolumeTaylorLocalExpansion, H2DLocalExpansion,
         LinearPDEConformingVolumeTaylorLocalExpansion)
+from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
 from sumpy.kernel import (LaplaceKernel, HelmholtzKernel, AxisTargetDerivative,
         DirectionalSourceDerivative, BiharmonicKernel, StokesletKernel)
 import sumpy.symbolic as sym
@@ -560,9 +561,12 @@ def test_translations(ctx_factory, knl, local_expn_class, mpole_expn_class):
 
         return pot
 
+    m2l_factory = NonFFTM2LTranslationClassFactory()
+    m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)()
+
     for order in orders:
         m_expn = mpole_expn_class(knl, order=order)
-        l_expn = local_expn_class(knl, order=order)
+        l_expn = local_expn_class(knl, order=order, m2l_translation=m2l_translation)
 
         from sumpy import P2EFromSingleBox, E2PFromSingleBox, P2P, E2EFromCSR
         p2m = P2EFromSingleBox(ctx, m_expn)
@@ -843,8 +847,10 @@ def test_m2l_toeplitz():
     knl = LaplaceKernel(dim)
     local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion
     mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion
+    m2l_factory = NonFFTM2LTranslationClassFactory()
+    m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)()
 
-    local_expn = local_expn_class(knl, order=5)
+    local_expn = local_expn_class(knl, order=5, m2l_translation=m2l_translation)
     mpole_expn = mpole_expn_class(knl, order=5)
 
     dvec = sym.make_sym_vector("d", dim)
diff --git a/test/test_misc.py b/test/test_misc.py
index 42956bc6..9aeb7f12 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -267,14 +267,12 @@ def test_toy_p2e2e2p(ctx_factory, case):
         raise ValueError(
             f"convergence factor not in valid range: {case_conv_factor}")
 
-    from sumpy.expansion.local import VolumeTaylorLocalExpansion
-    from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
+    from sumpy.expansion import VolumeTaylorExpansionFactory
 
     cl_ctx = ctx_factory()
     ctx = t.ToyContext(cl_ctx,
              LaplaceKernel(dim),
-             VolumeTaylorMultipoleExpansion,
-             VolumeTaylorLocalExpansion)
+             expansion_factory=VolumeTaylorExpansionFactory())
 
     errors = []
 
-- 
GitLab