diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 2d6cbf409b73b83023e75731738033cc219cddca..068fea2393875f7b58904f088f7d58e8939e09b3 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -165,39 +165,77 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
         src_mi_to_index = dict((mi, i) for i, mi in enumerate(
             src_expansion.get_coefficient_identifiers()))
 
+        tgt_mi_to_index = dict((mi, i) for i, mi in enumerate(
+            self.get_full_coefficient_identifiers()))
+
+        src_coeff_exprs = list(src_coeff_exprs)
         for i, mi in enumerate(src_expansion.get_coefficient_identifiers()):
-            src_coeff_exprs[i] *= mi_factorial(mi)
+            src_coeff_exprs[i] *= mi_factorial(mi) * \
+                sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(mi)
 
         result = [0] * len(self.get_full_coefficient_identifiers())
-        from pytools import generate_nonnegative_integer_tuples_below as gnitb
-
-        for i, tgt_mi in enumerate(
-                self.get_full_coefficient_identifiers()):
 
-            tgt_mi_plus_one = tuple(mi_i + 1 for mi_i in tgt_mi)
+        # This algorithm uses the observation that M2M coefficients
+        # have the following form in 2D
+        #
+        # $B_{m, n} = \sum_{i\le m, j\le n} A_{i, j}
+        #             d_x^i d_y^j \binom{m}{i} \binom{n}{j}$
+        # and can be rewritten as follows.
+        #
+        # Let $T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}$.
+        #
+        # Then, $B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}$.
+        #
+        # $T_{m, n}$ are $p^2$ number of temporary variables that are
+        # reused for different M2M coefficients and costs $p$ per variable.
+        # Total cost for calculating $T_{m, n}$ is $p^3$ and similar
+        # for $B_{m, n}$
+
+        # In other words, we're better off computing the translation
+        # one dimension at a time. This is realized here by using
+        # the output from one dimension as the input to the next.
+        # per_dim_coeffs_to_translate serves as the array of input
+        # coefficients for each dimension's translation.
+
+        dim_coeffs_to_translate = src_coeff_exprs
+
+        mi_to_index = src_mi_to_index
+        for d in range(self.dim):
+            result = [0] * len(self.get_full_coefficient_identifiers())
+            for i, tgt_mi in enumerate(
+                    self.get_full_coefficient_identifiers()):
+
+                src_mis_per_dim = []
+                for mi_i in range(tgt_mi[d]+1):
+                    new_mi = list(tgt_mi)
+                    new_mi[d] = mi_i
+                    src_mis_per_dim.append(tuple(new_mi))
+
+                for src_mi in src_mis_per_dim:
+                    try:
+                        src_index = mi_to_index[src_mi]
+                    except KeyError:
+                        # Omitted coefficients: not life-threatening
+                        continue
 
-            for src_mi in gnitb(tgt_mi_plus_one):
-                try:
-                    src_index = src_mi_to_index[src_mi]
-                except KeyError:
-                    # Omitted coefficients: not life-threatening
-                    continue
+                    contrib = dim_coeffs_to_translate[src_index]
 
-                contrib = src_coeff_exprs[src_index]
+                    for idim in range(self.dim):
+                        n = tgt_mi[idim]
+                        k = src_mi[idim]
+                        assert n >= k
+                        from sympy import binomial
+                        contrib *= (binomial(n, k)
+                                * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
 
-                for idim in range(self.dim):
-                    n = tgt_mi[idim]
-                    k = src_mi[idim]
-                    assert n >= k
-                    from sympy import binomial
-                    contrib *= (binomial(n, k)
-                            * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
+                    result[i] += contrib
 
-                result[i] += (
-                        contrib
-                        * sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(src_mi))
+                # Defer division by target factorial until the very end
+                if d == self.dim-1:
+                    result[i] /= mi_factorial(tgt_mi)
 
-            result[i] /= mi_factorial(tgt_mi)
+            dim_coeffs_to_translate = result[:]
+            mi_to_index = tgt_mi_to_index
 
         logger.info("building translation operator: done")
         return (