From 9b917db20a82514dcad85de648a5912947d8b5f6 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Tue, 3 Dec 2019 19:16:00 -0600
Subject: [PATCH 1/4] Asymptotically better algorithm for M2M

On master, M2M is $`O(p^2d)`$ for order p and dimension d, but CSE
reduces this down to $`O(p^{2d-1})`$ sometimes.
For eg: in Helmholtz 2D, full taylor is $`O(p^{2d-1})`$ and
HelmholtzConformingTaylor is $`O(p^{2d-1.5})`$.

This commit produces expressions in $`O(p^{2d-1})`$ consistently
regardless of how good CSE is and $`O(p^{2d-2})`$ for compressed.

The new algorithm uses the observation that M2M coefficients
have the form in 2D,

$`B_{m, n} = \sum_{i\le m, j\le n} A_{i, j} d_x^i d_y^j \binom{m}{i} \binom{n}{j}`$

and can be rewritten as follows,

Let $`T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}`$.

Then, $`B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}`$
and $`T_{m, n}`$ are $`p^2`$ number of temporary variables that are
reused for different M2M coefficients and costs $`p`$ per variable.
Total cost for calculating $`T_{m, n}`$ is $`p^3`$ and similar for $`B_{m, n}`$
---
 sumpy/expansion/multipole.py | 69 ++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 2d6cbf40..aa797b73 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -169,35 +169,58 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
             src_coeff_exprs[i] *= mi_factorial(mi)
 
         result = [0] * len(self.get_full_coefficient_identifiers())
-        from pytools import generate_nonnegative_integer_tuples_below as gnitb
 
-        for i, tgt_mi in enumerate(
-                self.get_full_coefficient_identifiers()):
-
-            tgt_mi_plus_one = tuple(mi_i + 1 for mi_i in tgt_mi)
+        r"""
+        This algorithm uses the observation that M2M coefficients
+        have the following form in 2D,
+
+        $`B_{m, n} = \sum_{i\le m, j\le n} A_{i, j} d_x^i d_y^j \binom{m}{i} \binom{n}{j}`$
+        and can be rewritten as follows.
+
+        Let $`T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}`$.
+
+        Then, $`B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}`$.
+
+        $`T_{m, n}`$ are $`p^2`$ number of temporary variables that are
+        reused for different M2M coefficients and costs $`p`$ per variable.
+        Total cost for calculating $`T_{m, n}`$ is $`p^3`$ and similar for $`B_{m, n}`$
+        """
+        for d in range(self.dim):
+            result = [0] * len(self.get_full_coefficient_identifiers())
+            for i, tgt_mi in enumerate(
+                    self.get_full_coefficient_identifiers()):
+
+                src_mis_per_dim = []
+                for mi_i in range(tgt_mi[d]+1):
+                    new_mi = list(tgt_mi)
+                    new_mi[d] = mi_i
+                    src_mis_per_dim.append(tuple(new_mi))
+
+                for src_mi in src_mis_per_dim:
+                    try:
+                        src_index = src_mi_to_index[src_mi]
+                    except KeyError:
+                        # Omitted coefficients: not life-threatening
+                        continue
 
-            for src_mi in gnitb(tgt_mi_plus_one):
-                try:
-                    src_index = src_mi_to_index[src_mi]
-                except KeyError:
-                    # Omitted coefficients: not life-threatening
-                    continue
+                    contrib = src_coeff_exprs[src_index]
 
-                contrib = src_coeff_exprs[src_index]
+                    for idim in range(self.dim):
+                        n = tgt_mi[idim]
+                        k = src_mi[idim]
+                        assert n >= k
+                        from sympy import binomial
+                        contrib *= (binomial(n, k)
+                                * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
 
-                for idim in range(self.dim):
-                    n = tgt_mi[idim]
-                    k = src_mi[idim]
-                    assert n >= k
-                    from sympy import binomial
-                    contrib *= (binomial(n, k)
-                            * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
+                    result[i] += (
+                            contrib
+                            * sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(src_mi))
 
-                result[i] += (
-                        contrib
-                        * sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(src_mi))
+                if d == self.dim-1:
+                    result[i] /= mi_factorial(tgt_mi)
 
-            result[i] /= mi_factorial(tgt_mi)
+            src_coeff_exprs = result[:]
 
         logger.info("building translation operator: done")
         return (
-- 
GitLab


From a125a889d3435f88b2d6e3dc32d1af171dfe58b5 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Tue, 3 Dec 2019 22:52:09 -0600
Subject: [PATCH 2/4] Fix conforming expansions

---
 sumpy/expansion/multipole.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index aa797b73..eb6dc0fe 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -165,6 +165,9 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
         src_mi_to_index = dict((mi, i) for i, mi in enumerate(
             src_expansion.get_coefficient_identifiers()))
 
+        tgt_mi_to_index = dict((mi, i) for i, mi in enumerate(
+            self.get_full_coefficient_identifiers()))
+
         for i, mi in enumerate(src_expansion.get_coefficient_identifiers()):
             src_coeff_exprs[i] *= mi_factorial(mi)
 
@@ -185,6 +188,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
         reused for different M2M coefficients and costs $`p`$ per variable.
         Total cost for calculating $`T_{m, n}`$ is $`p^3`$ and similar for $`B_{m, n}`$
         """
+        mi_to_index = src_mi_to_index
         for d in range(self.dim):
             result = [0] * len(self.get_full_coefficient_identifiers())
             for i, tgt_mi in enumerate(
@@ -198,7 +202,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
 
                 for src_mi in src_mis_per_dim:
                     try:
-                        src_index = src_mi_to_index[src_mi]
+                        src_index = mi_to_index[src_mi]
                     except KeyError:
                         # Omitted coefficients: not life-threatening
                         continue
@@ -221,6 +225,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                     result[i] /= mi_factorial(tgt_mi)
 
             src_coeff_exprs = result[:]
+            mi_to_index = tgt_mi_to_index
 
         logger.info("building translation operator: done")
         return (
-- 
GitLab


From af8b7f398ba8dffc1d0ce5a03f4b5a3842dc0919 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Tue, 3 Dec 2019 23:54:08 -0600
Subject: [PATCH 3/4] Fix rscale

---
 sumpy/expansion/multipole.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index eb6dc0fe..5e34d7da 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -168,8 +168,10 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
         tgt_mi_to_index = dict((mi, i) for i, mi in enumerate(
             self.get_full_coefficient_identifiers()))
 
+        src_coeff_exprs = list(src_coeff_exprs)
         for i, mi in enumerate(src_expansion.get_coefficient_identifiers()):
-            src_coeff_exprs[i] *= mi_factorial(mi)
+            src_coeff_exprs[i] *= mi_factorial(mi) * \
+                sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(mi)
 
         result = [0] * len(self.get_full_coefficient_identifiers())
 
@@ -177,16 +179,18 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
         This algorithm uses the observation that M2M coefficients
         have the following form in 2D,
 
-        $`B_{m, n} = \sum_{i\le m, j\le n} A_{i, j} d_x^i d_y^j \binom{m}{i} \binom{n}{j}`$
+        $B_{m, n} = \sum_{i\le m, j\le n} A_{i, j}
+                    d_x^i d_y^j \binom{m}{i} \binom{n}{j}$
         and can be rewritten as follows.
 
-        Let $`T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}`$.
+        Let $T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}$.
 
-        Then, $`B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}`$.
+        Then, $B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}$.
 
-        $`T_{m, n}`$ are $`p^2`$ number of temporary variables that are
-        reused for different M2M coefficients and costs $`p`$ per variable.
-        Total cost for calculating $`T_{m, n}`$ is $`p^3`$ and similar for $`B_{m, n}`$
+        $T_{m, n}$ are $p^2$ number of temporary variables that are
+        reused for different M2M coefficients and costs $p$ per variable.
+        Total cost for calculating $T_{m, n}$ is $p^3$ and similar
+        for $B_{m, n}$
         """
         mi_to_index = src_mi_to_index
         for d in range(self.dim):
@@ -217,9 +221,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                         contrib *= (binomial(n, k)
                                 * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
 
-                    result[i] += (
-                            contrib
-                            * sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(src_mi))
+                    result[i] += contrib
 
                 if d == self.dim-1:
                     result[i] /= mi_factorial(tgt_mi)
-- 
GitLab


From b8246dca326ae04941138f0b7f05afb1a9f325dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Wed, 4 Dec 2019 17:46:55 +0100
Subject: [PATCH 4/4] Some readability improvements for the new M2M algorithm

---
 sumpy/expansion/multipole.py | 44 +++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 5e34d7da..068fea23 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -175,23 +175,30 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
 
         result = [0] * len(self.get_full_coefficient_identifiers())
 
-        r"""
-        This algorithm uses the observation that M2M coefficients
-        have the following form in 2D,
+        # This algorithm uses the observation that M2M coefficients
+        # have the following form in 2D
+        #
+        # $B_{m, n} = \sum_{i\le m, j\le n} A_{i, j}
+        #             d_x^i d_y^j \binom{m}{i} \binom{n}{j}$
+        # and can be rewritten as follows.
+        #
+        # Let $T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}$.
+        #
+        # Then, $B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}$.
+        #
+        # $T_{m, n}$ are $p^2$ number of temporary variables that are
+        # reused for different M2M coefficients and costs $p$ per variable.
+        # Total cost for calculating $T_{m, n}$ is $p^3$ and similar
+        # for $B_{m, n}$
+
+        # In other words, we're better off computing the translation
+        # one dimension at a time. This is realized here by using
+        # the output from one dimension as the input to the next.
+        # per_dim_coeffs_to_translate serves as the array of input
+        # coefficients for each dimension's translation.
+
+        dim_coeffs_to_translate = src_coeff_exprs
 
-        $B_{m, n} = \sum_{i\le m, j\le n} A_{i, j}
-                    d_x^i d_y^j \binom{m}{i} \binom{n}{j}$
-        and can be rewritten as follows.
-
-        Let $T_{m, n} = \sum_{i\le m} A_{i, n} d_x^i \binom{m}{i}$.
-
-        Then, $B_{m, n} = \sum_{j\le n} T_{m, j} d_y^j \binom{n}{j}$.
-
-        $T_{m, n}$ are $p^2$ number of temporary variables that are
-        reused for different M2M coefficients and costs $p$ per variable.
-        Total cost for calculating $T_{m, n}$ is $p^3$ and similar
-        for $B_{m, n}$
-        """
         mi_to_index = src_mi_to_index
         for d in range(self.dim):
             result = [0] * len(self.get_full_coefficient_identifiers())
@@ -211,7 +218,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                         # Omitted coefficients: not life-threatening
                         continue
 
-                    contrib = src_coeff_exprs[src_index]
+                    contrib = dim_coeffs_to_translate[src_index]
 
                     for idim in range(self.dim):
                         n = tgt_mi[idim]
@@ -223,10 +230,11 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
 
                     result[i] += contrib
 
+                # Defer division by target factorial until the very end
                 if d == self.dim-1:
                     result[i] /= mi_factorial(tgt_mi)
 
-            src_coeff_exprs = result[:]
+            dim_coeffs_to_translate = result[:]
             mi_to_index = tgt_mi_to_index
 
         logger.info("building translation operator: done")
-- 
GitLab