diff --git a/sumpy/e2e.py b/sumpy/e2e.py
index 0e218024c690b2d8fd1c8c34983a2434fcbf20a9..c66f9c02254e70de6c8f60aec52882a83125e5eb 100644
--- a/sumpy/e2e.py
+++ b/sumpy/e2e.py
@@ -791,8 +791,8 @@ class M2LPostprocessLocal(E2EBase):
                 for itgt_box
                     [itgt_coeff]: tgt_expansions[itgt_box, itgt_coeff] = \
                         m2l_postprocess_inner(
-                            tgt_rscale,
                             src_rscale,
+                            tgt_rscale,
                             [isrc_coeff]: tgt_expansions_before_postprocessing[ \
                             itgt_box, isrc_coeff],
                        )
diff --git a/sumpy/toys.py b/sumpy/toys.py
index a794ff1529691de4f8ab81fc8a5f4aeee730671a..c71f6107eafb8916228dd30451515a493d8d223f 100644
--- a/sumpy/toys.py
+++ b/sumpy/toys.py
@@ -102,7 +102,7 @@ class ToyContext:
             local_expn_class=None,
             expansion_factory=None,
             extra_source_kwargs=None,
-            extra_kernel_kwargs=None):
+            extra_kernel_kwargs=None, m2l_use_fft=None):
         self.cl_context = cl_context
         self.queue = cl.CommandQueue(self.cl_context)
         self.kernel = kernel
@@ -116,15 +116,22 @@ class ToyContext:
             mpole_expn_class = \
                     expansion_factory.get_multipole_expansion_class(kernel)
         if local_expn_class is None:
-            from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
+            from sumpy.expansion.m2l import (NonFFTM2LTranslationClassFactory,
+                FFTM2LTranslationClassFactory)
+            if m2l_use_fft:
+                m2l_translation_class_factory = FFTM2LTranslationClassFactory()
+            else:
+                m2l_translation_class_factory = NonFFTM2LTranslationClassFactory()
             local_expn_class = \
                     expansion_factory.get_local_expansion_class(kernel)
-            m2l_translation_class_factory = NonFFTM2LTranslationClassFactory()
             m2l_translation_class = \
                     m2l_translation_class_factory.get_m2l_translation_class(
                         kernel, local_expn_class)
             local_expn_class = partial(local_expn_class,
                     m2l_translation=m2l_translation_class())
+        elif m2l_use_fft is not None:
+            raise ValueError("local_expn_class and m2l_use_fft are both supplied. "
+                             "Use only one of these arguments")
 
         self.mpole_expn_class = mpole_expn_class
         self.local_expn_class = local_expn_class
@@ -181,10 +188,51 @@ class ToyContext:
                 self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
                 self.mpole_expn_class(self.no_target_deriv_kernel, to_order))
 
+    @memoize_method
+    def use_translation_classes_dependent_data(self):
+        l_expn = self.local_expn_class(self.no_target_deriv_kernel, 2)
+        return l_expn.m2l_translation.use_preprocessing
+
+    @memoize_method
+    def use_fft(self):
+        l_expn = self.local_expn_class(self.no_target_deriv_kernel, 2)
+        return l_expn.m2l_translation.use_fft
+
     @memoize_method
     def get_m2l(self, from_order, to_order):
-        from sumpy import E2EFromCSR
-        return E2EFromCSR(self.cl_context,
+        from sumpy import E2EFromCSR, M2LUsingTranslationClassesDependentData
+        if self.use_translation_classes_dependent_data():
+            m2l_class = M2LUsingTranslationClassesDependentData
+        else:
+            m2l_class = E2EFromCSR
+        return m2l_class(self.cl_context,
+                self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
+                self.local_expn_class(self.no_target_deriv_kernel, to_order))
+
+    @memoize_method
+    def get_m2l_translation_class_dependent_data_kernel(self, from_order, to_order):
+        from sumpy import M2LGenerateTranslationClassesDependentData
+        return M2LGenerateTranslationClassesDependentData(self.cl_context,
+                self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
+                self.local_expn_class(self.no_target_deriv_kernel, to_order))
+
+    @memoize_method
+    def get_m2l_expansion_size(self, from_order, to_order):
+        m_expn = self.mpole_expn_class(self.no_target_deriv_kernel, from_order)
+        l_expn = self.local_expn_class(self.no_target_deriv_kernel, to_order)
+        return l_expn.m2l_translation.preprocess_multipole_nexprs(l_expn, m_expn)
+
+    @memoize_method
+    def get_m2l_preprocess_mpole_kernel(self, from_order, to_order):
+        from sumpy import M2LPreprocessMultipole
+        return M2LPreprocessMultipole(self.cl_context,
+                self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
+                self.local_expn_class(self.no_target_deriv_kernel, to_order))
+
+    @memoize_method
+    def get_m2l_postprocess_local_kernel(self, from_order, to_order):
+        from sumpy import M2LPostprocessLocal
+        return M2LPostprocessLocal(self.cl_context,
                 self.mpole_expn_class(self.no_target_deriv_kernel, from_order),
                 self.local_expn_class(self.no_target_deriv_kernel, to_order))
 
@@ -267,7 +315,8 @@ def _e2p(psource, targets, e2p):
     return pot.get(queue)
 
 
-def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs):
+def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs,
+         extra_kernel_kwargs):
     toy_ctx = psource.toy_ctx
     queue = toy_ctx.queue
 
@@ -290,32 +339,133 @@ def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs):
             ],
             dtype=np.float64).T.copy()
         )
+
     coeffs = cl.array.to_device(queue, np.array([psource.coeffs]))
+    args = {
+        "queue": toy_ctx.queue,
+        "src_expansions": coeffs,
+        "src_base_ibox": 0,
+        "tgt_base_ibox": 0,
+        "ntgt_level_boxes": 2,
+        "target_boxes": target_boxes,
+        "src_box_starts": src_box_starts,
+        "src_box_lists": src_box_lists,
+        "centers": centers,
+        "src_rscale": psource.rscale,
+        "tgt_rscale": to_rscale,
+        **extra_kernel_kwargs,
+        **toy_ctx.extra_kernel_kwargs,
+    }
+
+    evt, (to_coeffs,) = e2e(**args)
 
-    evt, (to_coeffs,) = e2e(
-            toy_ctx.queue,
-            src_expansions=coeffs,
-            src_base_ibox=0,
-            tgt_base_ibox=0,
-            ntgt_level_boxes=2,
+    return expn_class(
+            toy_ctx, to_center, to_rscale, to_order, to_coeffs[1].get(queue),
+            derived_from=psource, **expn_kwargs)
 
-            target_boxes=target_boxes,
 
-            src_box_starts=src_box_starts,
-            src_box_lists=src_box_lists,
-            centers=centers,
+def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs,
+         translation_classes_kwargs):
+    toy_ctx = psource.toy_ctx
+    queue = toy_ctx.queue
 
-            src_rscale=psource.rscale,
-            tgt_rscale=to_rscale,
-            **toy_ctx.extra_kernel_kwargs)
+    coeffs = cl.array.to_device(queue, np.array([psource.coeffs]))
 
-    return expn_class(
-            toy_ctx, to_center, to_rscale, to_order, to_coeffs[1].get(queue),
+    m2l_use_translation_classes_dependent_data = \
+            toy_ctx.use_translation_classes_dependent_data()
+
+    if m2l_use_translation_classes_dependent_data:
+        data_kernel = translation_classes_kwargs["data_kernel"]
+        preprocess_kernel = translation_classes_kwargs["preprocess_kernel"]
+        postprocess_kernel = translation_classes_kwargs["postprocess_kernel"]
+        expn_size = translation_classes_kwargs["m2l_expn_size"]
+
+        # Preprocess the mpole expansion
+        preprocessed_src_expansions = cl.array.zeros(queue, (1, expn_size),
+                dtype=np.complex128)
+        evt, _ = preprocess_kernel(queue,
+                src_expansions=coeffs,
+                preprocessed_src_expansions=preprocessed_src_expansions,
+                src_rscale=np.float64(psource.rscale),
+                **toy_ctx.extra_kernel_kwargs)
+
+        if toy_ctx.use_fft:
+            from sumpy.tools import (run_opencl_fft, get_opencl_fft_app,
+                get_native_event)
+            fft_app = get_opencl_fft_app(queue, (expn_size,),
+                dtype=preprocessed_src_expansions.dtype, inverse=False)
+            ifft_app = get_opencl_fft_app(queue, (expn_size,),
+                dtype=preprocessed_src_expansions.dtype, inverse=True)
+
+            evt, preprocessed_src_expansions = run_opencl_fft(fft_app, queue,
+                    preprocessed_src_expansions, inverse=False, wait_for=[evt])
+
+        # Compute translation classes data
+        m2l_translation_classes_lists = cl.array.to_device(queue,
+                np.array([0], dtype=np.int32))
+        dist = np.array(to_center - psource.center, dtype=np.float64)
+        dim = toy_ctx.kernel.dim
+        m2l_translation_vectors = cl.array.to_device(queue, dist.reshape(dim, 1))
+        m2l_translation_classes_dependent_data = cl.array.zeros(queue,
+                (1, expn_size), dtype=np.complex128)
+
+        evt, _ = data_kernel(
+                queue,
+                src_rscale=np.float64(psource.rscale),
+                ntranslation_classes=1,
+                translation_classes_level_start=0,
+                m2l_translation_vectors=m2l_translation_vectors,
+                m2l_translation_classes_dependent_data=(
+                    m2l_translation_classes_dependent_data),
+                ntranslation_vectors=1,
+                wait_for=[get_native_event(evt)],
+                **toy_ctx.extra_kernel_kwargs)
+
+        if toy_ctx.use_fft:
+            evt, m2l_translation_classes_dependent_data = run_opencl_fft(fft_app,
+                queue, m2l_translation_classes_dependent_data, inverse=False,
+                wait_for=[evt])
+
+        ret = _e2e(psource, to_center, to_rscale, to_order,
+            e2e, expn_class, expn_kwargs,
+            {
+                "src_expansions": preprocessed_src_expansions,
+                "m2l_translation_classes_lists": m2l_translation_classes_lists,
+                "m2l_translation_classes_dependent_data": (
+                    m2l_translation_classes_dependent_data),
+                "translation_classes_level_start": 0,
+            }
+        )
+
+        # Postprocess the local expansion
+        local_before = cl.array.to_device(queue, np.array([ret.coeffs]))
+        to_coeffs = cl.array.zeros(queue, (1, len(data_kernel.tgt_expansion)),
+                                           dtype=coeffs.dtype)
+
+        if toy_ctx.use_fft:
+            evt, local_before = run_opencl_fft(ifft_app, queue,
+                local_before, inverse=True, wait_for=[get_native_event(evt)])
+
+        evt, _ = postprocess_kernel(queue=queue,
+                tgt_expansions_before_postprocessing=local_before,
+                tgt_expansions=to_coeffs,
+                src_rscale=np.float64(psource.rscale),
+                tgt_rscale=np.float64(to_rscale),
+                wait_for=[get_native_event(evt)],
+                **toy_ctx.extra_kernel_kwargs)
+
+        return expn_class(
+            toy_ctx, to_center, to_rscale, to_order, to_coeffs.get(queue)[0],
             derived_from=psource, **expn_kwargs)
+    else:
+        ret = _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class,
+                expn_kwargs, {})
 
-# }}}
+    return ret
 
 
+# }}}
+
 # {{{ potential source classes
 
 class PotentialSource:
@@ -596,7 +746,7 @@ def multipole_expand(psource: PotentialSource,
 
         return _e2e(psource, center, rscale, order,
                 psource.toy_ctx.get_m2m(psource.order, order),
-                MultipoleExpansion, expn_kwargs)
+                MultipoleExpansion, expn_kwargs, {})
 
     else:
         raise TypeError(f"do not know how to expand '{type(psource).__name__}'")
@@ -617,9 +767,28 @@ def local_expand(
         if order is None:
             order = psource.order
 
-        return _e2e(psource, center, rscale, order,
-                psource.toy_ctx.get_m2l(psource.order, order),
-                LocalExpansion, expn_kwargs)
+        toy_ctx = psource.toy_ctx
+        translation_classes_kwargs = {}
+        m2l_use_translation_classes_dependent_data = \
+            toy_ctx.use_translation_classes_dependent_data()
+
+        if m2l_use_translation_classes_dependent_data:
+            data_kernel = toy_ctx.get_m2l_translation_class_dependent_data_kernel(
+                    psource.order, order)
+            preprocess_kernel = toy_ctx.get_m2l_preprocess_mpole_kernel(
+                    psource.order, order)
+            postprocess_kernel = toy_ctx.get_m2l_postprocess_local_kernel(
+                    psource.order, order)
+            translation_classes_kwargs["data_kernel"] = data_kernel
+            translation_classes_kwargs["preprocess_kernel"] = preprocess_kernel
+            translation_classes_kwargs["postprocess_kernel"] = postprocess_kernel
+            translation_classes_kwargs["m2l_expn_size"] = \
+                    toy_ctx.get_m2l_expansion_size(psource.order, order)
+
+        return _m2l(psource, center, rscale, order,
+                toy_ctx.get_m2l(psource.order, order),
+                LocalExpansion, expn_kwargs,
+                translation_classes_kwargs)
 
     elif isinstance(psource, LocalExpansion):
         if order is None:
@@ -627,7 +796,7 @@ def local_expand(
 
         return _e2e(psource, center, rscale, order,
                 psource.toy_ctx.get_l2l(psource.order, order),
-                LocalExpansion, expn_kwargs)
+                LocalExpansion, expn_kwargs, {})
 
     else:
         raise TypeError(f"do not know how to expand '{type(psource).__name__}'")
diff --git a/test/test_kernels.py b/test/test_kernels.py
index d1768fd7c5040615a993f1ebb105b9bb1498c5b3..8f0a98ef21e6308ff5de6252318b6f7c840dc7fe 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -43,7 +43,8 @@ from sumpy.expansion.local import (
     VolumeTaylorLocalExpansion,
     H2DLocalExpansion,
     LinearPDEConformingVolumeTaylorLocalExpansion)
-from sumpy.expansion.m2l import NonFFTM2LTranslationClassFactory
+from sumpy.expansion.m2l import (NonFFTM2LTranslationClassFactory,
+    FFTM2LTranslationClassFactory)
 from sumpy.kernel import (
     LaplaceKernel,
     HelmholtzKernel,
@@ -473,27 +474,35 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative
 
 # {{{ test_translations
 
-@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class", [
-    (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
+@pytest.mark.parametrize("knl, local_expn_class, mpole_expn_class, use_fft", [
+    (LaplaceKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
+        False),
     (LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
-     LinearPDEConformingVolumeTaylorMultipoleExpansion),
-    (LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
+    (LaplaceKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, True),
+    (LaplaceKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
+        False),
     (LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
-     LinearPDEConformingVolumeTaylorMultipoleExpansion),
-    (HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
+    (LaplaceKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, True),
+    (HelmholtzKernel(2), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
+        False),
     (HelmholtzKernel(2), LinearPDEConformingVolumeTaylorLocalExpansion,
-     LinearPDEConformingVolumeTaylorMultipoleExpansion),
-    (HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
+    (HelmholtzKernel(3), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion,
+        False),
     (HelmholtzKernel(3), LinearPDEConformingVolumeTaylorLocalExpansion,
-     LinearPDEConformingVolumeTaylorMultipoleExpansion),
-    (HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion),
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
+    (HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion, False),
     (StokesletKernel(2, 0, 0), VolumeTaylorLocalExpansion,
-     VolumeTaylorMultipoleExpansion),
+     VolumeTaylorMultipoleExpansion, False),
     (StokesletKernel(2, 0, 0), LinearPDEConformingVolumeTaylorLocalExpansion,
-     LinearPDEConformingVolumeTaylorMultipoleExpansion),
+     LinearPDEConformingVolumeTaylorMultipoleExpansion, False),
     ])
 def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class,
-        visualize=False):
+        use_fft, visualize=False):
     if visualize:
         logging.basicConfig(level=logging.INFO)
 
@@ -550,7 +559,10 @@ def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class,
     else:
         orders = [3, 4, 5]
 
-    m2l_factory = NonFFTM2LTranslationClassFactory()
+    if use_fft:
+        m2l_factory = FFTM2LTranslationClassFactory()
+    else:
+        m2l_factory = NonFFTM2LTranslationClassFactory()
     m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)()
 
     toy_ctx = t.ToyContext(
diff --git a/test/test_misc.py b/test/test_misc.py
index 9b72578120e63ee576df99f1e7c849f628f6a62d..98f07da1039c34a740860d5ff5c2045e32660625 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -230,6 +230,7 @@ class P2E2E2PTestCase:
     expansion1: Callable[..., Any]
     expansion2: Callable[..., Any]
     conv_factor: str
+    m2l_use_fft: bool = False
 
     @property
     def dim(self):
@@ -266,6 +267,17 @@ P2E2E2P_TEST_CASES = (
             expansion1=t.multipole_expand,
             expansion2=t.local_expand,
             conv_factor="norm(t-c2)/(norm(c2-c1)-norm(c1-s))"),
+
+        # multipole to local, 3D with FFT
+        P2E2E2PTestCase(
+            source=np.array([-2., 2., 1.]),
+            center1=np.array([-2., 5., 3.]),
+            center2=np.array([0., 0., 0.]),
+            target=np.array([0., 0., -1]),
+            expansion1=t.multipole_expand,
+            expansion2=t.local_expand,
+            m2l_use_fft=True,
+            conv_factor="norm(t-c2)/(norm(c2-c1)-norm(c1-s))"),
 )
 
 # }}}
@@ -302,7 +314,8 @@ def test_toy_p2e2e2p(actx_factory, case):
     actx = actx_factory()
     ctx = t.ToyContext(actx.context,
              LaplaceKernel(dim),
-             expansion_factory=VolumeTaylorExpansionFactory())
+             expansion_factory=VolumeTaylorExpansionFactory(),
+             m2l_use_fft=case.m2l_use_fft)
 
     errors = []