From 0abf04e52a8f980544c8d14f89e5a2f1409134d1 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 26 Apr 2018 20:22:05 -0500
Subject: [PATCH 1/6] Add tsqbx module.

---
 pytential/qbx/target_specific.pyx | 138 ++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 pytential/qbx/target_specific.pyx

diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx
new file mode 100644
index 00000000..184ec4d5
--- /dev/null
+++ b/pytential/qbx/target_specific.pyx
@@ -0,0 +1,138 @@
+#!python
+#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
+
+import numpy as np
+import cython
+import cython.parallel
+
+from libc.math cimport sqrt
+cimport openmp
+
+
+cdef double legendre(double x, int n, double[] coeffs) nogil:
+    """Evaluate the Legendre series of order n at x.
+
+    Taken from SciPy.
+    """
+    cdef:
+        double c0, c1, tmp
+        int nd, i
+        
+    if n == 0:
+        c0 = coeffs[0]
+        c1 = 0
+    elif n == 1:
+        c0 = coeffs[0]
+        c1 = coeffs[1]
+    else:
+        nd = n + 1
+        c0 = coeffs[n - 1]
+        c1 = coeffs[n]
+        
+        for i in range(3, n + 2):
+            tmp = c0
+            nd = nd - 1
+            c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd
+            c1 = tmp + (c1*x*(2*nd - 1))/n
+    return c0 + c1*x
+
+
+cdef double dist(double[3] a, double[3] b) nogil:
+    return sqrt(
+            (a[0] - b[0]) * (a[0] - b[0]) +
+            (a[1] - b[1]) * (a[1] - b[1]) +
+            (a[2] - b[2]) * (a[2] - b[2]))
+
+
+cdef double tsqbx_from_source(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        int order,
+        double[] tmp) nogil:
+    cdef:
+        int i
+        double r, sc_d, tc_d
+        double cos_angle
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
+    r = sc_d / tc_d
+    tmp[0] = 1 / tc_d
+
+    for i in range(1, order + 1):
+        tmp[i] = tmp[i - 1] * r
+
+    cos_angle = ((
+            (target[0] - center[0]) * (source[0] - center[0]) +
+            (target[1] - center[1]) * (source[1] - center[1]) +
+            (target[2] - center[2]) * (source[2] - center[2]))
+            / (tc_d * sc_d))
+
+    return legendre(cos_angle, order, tmp)
+
+
+def form_target_specific_qbx_contributions(
+        double[:,:] sources,
+        double[:,:] targets,
+        double[:,:] global_qbx_centers,
+        int[:] target_to_center,
+        int order,
+        int[:] qbx_center_to_target_box,
+        int[:] source_box_starts, int[:] source_box_lists,
+        int[:] box_source_starts, int[:] box_source_counts_nonchild,
+        double[:] source_weights,
+        double[:] pot):
+
+    cdef:
+        int itgt, icenter
+        int tgt_box, src_ibox
+        int isrc_box, isrc_box_start, isrc_box_end
+        int isrc, isrc_start, isrc_end
+        int i, tid
+        double result	
+        double[:,:] source, center, target, tmp
+
+    # Yucky thread-local hack
+    maxthreads = openmp.omp_get_max_threads()
+
+    source = np.zeros((maxthreads, 3))
+    target = np.zeros((maxthreads, 3))
+    center = np.zeros((maxthreads, 3))
+    tmp = np.zeros((maxthreads, 256))
+
+    # TODO: Check if order > 256
+
+    for itgt in cython.parallel.prange(0, targets.shape[1], nogil=True,
+                                       schedule="dynamic", chunksize=10):
+        icenter = target_to_center[itgt]
+        if icenter == -1:
+            continue
+
+        result = 0
+
+        tgt_box = qbx_center_to_target_box[icenter]
+
+        tid = cython.parallel.threadid()
+
+        for i in range(3):
+            target[tid, i] = targets[itgt, i]
+            center[tid, i] = global_qbx_centers[icenter, i]
+
+        isrc_box_start = source_box_starts[tgt_box]
+        isrc_box_end = source_box_starts[tgt_box + 1]
+
+        for isrc_box in range(isrc_box_start, isrc_box_end):
+            src_ibox = source_box_lists[isrc_box]
+            isrc_start = box_source_starts[src_ibox]
+            isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
+
+            for isrc in range(isrc_start, isrc_end):
+                for i in range(3):
+                    source[tid, i] = sources[i, isrc]
+                    
+                result = result + source_weights[isrc] * (
+                    tsqbx_from_source(&source[tid, 0], &center[tid, 0],
+                                      &target[tid, 0], order, &tmp[tid, 0]))
+
+        pot[itgt] = pot[itgt] + result
-- 
GitLab


From fe82084331cbbc92b27c60d80584ace981da11d7 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 26 Apr 2018 20:27:49 -0500
Subject: [PATCH 2/6] Add extension to setup.py

---
 setup.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/setup.py b/setup.py
index 84438494..89dc02d4 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,8 @@
 
 import os
 from setuptools import setup, find_packages
+from setuptools.extension import Extension
+from Cython.Build import cythonize
 
 
 # {{{ capture git revision at install time
@@ -54,6 +56,16 @@ write_git_revision("pytential")
 # }}}
 
 
+ext_modules = [
+        Extension(
+            "pytential.qbx.target_specific",
+            ["pytential/qbx/target_specific.pyx"],
+            extra_compile_args=['-fopenmp'],
+            extra_link_args=['-fopenmp']
+        )
+]
+
+
 version_dict = {}
 init_filename = "pytential/version.py"
 os.environ["AKPYTHON_EXEC_FROM_WITHIN_WITHIN_SETUP_PY"] = "1"
@@ -91,6 +103,8 @@ setup(name="pytential",
 
       packages=find_packages(),
 
+      ext_modules = cythonize(ext_modules),
+
       install_requires=[
           "pytest>=2.3",
           # FIXME leave out for now
-- 
GitLab


From ad70bb9d714d0fa4a0077c45c3dc5ca503b840bb Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 26 Apr 2018 22:41:05 -0500
Subject: [PATCH 3/6] TSQBX working for SLP

---
 pytential/qbx/__init__.py         |  8 ++-
 pytential/qbx/fmm.py              | 20 ++++++-
 pytential/qbx/fmmlib.py           | 53 ++++++++++++++++-
 pytential/qbx/target_specific.pyx | 99 +++++++++++++++++--------------
 setup.py                          |  2 +-
 5 files changed, 128 insertions(+), 54 deletions(-)

diff --git a/pytential/qbx/__init__.py b/pytential/qbx/__init__.py
index 4e03ceca..81900a9b 100644
--- a/pytential/qbx/__init__.py
+++ b/pytential/qbx/__init__.py
@@ -83,6 +83,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             _from_sep_smaller_crit=None,
             _from_sep_smaller_min_nsources_cumul=None,
             _tree_kind="adaptive",
+            _use_tsqbx_list1=False,
             geometry_data_inspector=None,
             fmm_backend="sumpy",
             target_stick_out_factor=_not_provided):
@@ -179,6 +180,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
         self._from_sep_smaller_min_nsources_cumul = \
                 _from_sep_smaller_min_nsources_cumul
         self._tree_kind = _tree_kind
+        self._use_tsqbx_list1 = _use_tsqbx_list1
         self.geometry_data_inspector = geometry_data_inspector
 
         # /!\ *All* parameters set here must also be set by copy() below,
@@ -198,6 +200,7 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
             _expansions_in_tree_have_extent=_not_provided,
             _expansion_stick_out_factor=_not_provided,
             _tree_kind=None,
+            _use_tsqbx_list1=_not_provided,
             geometry_data_inspector=None,
 
             debug=_not_provided,
@@ -277,6 +280,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                 _from_sep_smaller_min_nsources_cumul=(
                     self._from_sep_smaller_min_nsources_cumul),
                 _tree_kind=_tree_kind or self._tree_kind,
+                _use_tsqbx_list1=_use_tsqbx_list1 if _use_tsqbx_list1 is not _not_provided
+                    else self._use_tsqbx_list1,
                 geometry_data_inspector=(
                     geometry_data_inspector or self.geometry_data_inspector),
                 fmm_backend=self.fmm_backend,
@@ -694,7 +699,8 @@ class QBXLayerPotentialSource(LayerPotentialSourceBase):
                         self.qbx_order,
                         self.fmm_level_to_order,
                         source_extra_kwargs=source_extra_kwargs,
-                        kernel_extra_kwargs=kernel_extra_kwargs)
+                        kernel_extra_kwargs=kernel_extra_kwargs,
+                        _use_target_specific_list1=self._use_tsqbx_list1)
 
         from pytential.qbx.geometry import target_state
         if (geo_data.user_target_to_center().with_queue(queue)
diff --git a/pytential/qbx/fmm.py b/pytential/qbx/fmm.py
index d3622e59..9c39c9b4 100644
--- a/pytential/qbx/fmm.py
+++ b/pytential/qbx/fmm.py
@@ -90,12 +90,14 @@ class QBXSumpyExpansionWranglerCodeContainer(SumpyExpansionWranglerCodeContainer
     def get_wrangler(self, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs={},
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            _use_target_specific_list1=False):
         return QBXExpansionWrangler(self, queue, geo_data,
                 dtype,
                 qbx_order, fmm_level_to_order,
                 source_extra_kwargs,
-                kernel_extra_kwargs)
+                kernel_extra_kwargs,
+                _use_target_specific_list1)
 
 
 class QBXExpansionWrangler(SumpyExpansionWrangler):
@@ -119,7 +121,11 @@ QBXFMMGeometryData.non_qbx_box_target_lists`),
 
     def __init__(self, code_container, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
-            source_extra_kwargs, kernel_extra_kwargs):
+            source_extra_kwargs, kernel_extra_kwargs,
+            _use_target_specific_list1=False):
+        if _use_target_specific_list1:
+            raise NotImplementedError("Cannot use TSQBX with sumpy yet")
+
         SumpyExpansionWrangler.__init__(self,
                 code_container, queue, geo_data.tree(),
                 dtype, fmm_level_to_order, source_extra_kwargs, kernel_extra_kwargs)
@@ -358,6 +364,11 @@ QBXFMMGeometryData.non_qbx_box_target_lists`),
 
         return pot
 
+    @log_process(logger)
+    def eval_target_specific_global_qbx_locals(self, src_weights):
+        # Not implemented
+        pass
+
     # }}}
 
 # }}}
@@ -490,6 +501,9 @@ def drive_fmm(expansion_wrangler, src_weights):
     qbx_potentials = wrangler.eval_qbx_expansions(
             qbx_expansions)
 
+    qbx_potentials = qbx_potentials + \
+            wrangler.eval_target_specific_global_qbx_locals(src_weights)
+
     # }}}
 
     # {{{ reorder potentials
diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py
index 2b6cbf88..f19d23e6 100644
--- a/pytential/qbx/fmmlib.py
+++ b/pytential/qbx/fmmlib.py
@@ -28,6 +28,7 @@ import pyopencl as cl  # noqa
 import pyopencl.array  # noqa: F401
 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
 from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+import pytential.qbx.target_specific as target_specific
 
 
 from pytools import log_process
@@ -54,12 +55,14 @@ class QBXFMMLibExpansionWranglerCodeContainer(object):
     def get_wrangler(self, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs={},
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            _use_tsqbx_list1=False):
 
         return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype,
                 qbx_order, fmm_level_to_order,
                 source_extra_kwargs,
-                kernel_extra_kwargs)
+                kernel_extra_kwargs,
+                _use_tsqbx_list1)
 
 # }}}
 
@@ -128,10 +131,12 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
     def __init__(self, code, queue, geo_data, dtype,
             qbx_order, fmm_level_to_order,
             source_extra_kwargs,
-            kernel_extra_kwargs):
+            kernel_extra_kwargs,
+            _use_target_specific_list1=False):
 
         self.code = code
         self.queue = queue
+        self._use_target_specific_list1 = _use_target_specific_list1
 
         # FMMLib is CPU-only. This wrapper gets the geometry out of
         # OpenCL-land.
@@ -275,6 +280,13 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
                 raise ValueError("element '%s' of outputs array not "
                         "understood" % out)
 
+    @memoize_method
+    def _get_single_centers_array(self):
+        return np.array([
+            self.geo_data.centers()[idim]
+            for idim in range(self.dim)
+            ], order="F")
+
     # }}}
 
     # {{{ override target lists to only hit non-QBX targets
@@ -302,6 +314,9 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
     @log_process(logger)
     def form_global_qbx_locals(self, src_weights):
+        if self._use_target_specific_list1:
+            return self.qbx_local_expansion_zeros()
+
         geo_data = self.geo_data
         trav = geo_data.traversal()
 
@@ -557,6 +572,38 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
 
         return output
 
+    @log_process(logger)
+    def eval_target_specific_global_qbx_locals(self, src_weights):
+        if not self._use_target_specific_list1:
+            return self.full_output_zeros()
+
+        pot = self.full_output_zeros()
+        geo_data = self.geo_data
+        trav = geo_data.traversal()
+
+        ctt = geo_data.center_to_tree_targets()
+
+        # TODO: assert this is the Laplace single or double layer kernel
+
+        for output in pot:
+            target_specific.eval_target_specific_global_qbx_locals(
+                    order=self.qbx_order,
+                    sources=self._get_single_sources_array(),
+                    targets=geo_data.all_targets(),
+                    centers=self._get_single_centers_array(),
+                    global_qbx_centers=geo_data.global_qbx_centers(),
+                    qbx_center_to_target_box=geo_data.qbx_center_to_target_box(),
+                    center_to_target_starts=ctt.starts,
+                    center_to_target_lists=ctt.lists,
+                    source_box_starts=trav.neighbor_source_boxes_starts,
+                    source_box_lists=trav.neighbor_source_boxes_lists,
+                    box_source_starts=self.tree.box_source_starts,
+                    box_source_counts_nonchild=self.tree.box_source_counts_nonchild,
+                    src_weights=src_weights,
+                    pot=output)
+
+        return pot
+
     def finalize_potentials(self, potential):
         potential = super(QBXFMMLibExpansionWrangler, self).finalize_potentials(
                 potential)
diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx
index 184ec4d5..c668703a 100644
--- a/pytential/qbx/target_specific.pyx
+++ b/pytential/qbx/target_specific.pyx
@@ -6,6 +6,8 @@ import cython
 import cython.parallel
 
 from libc.math cimport sqrt
+from libc.stdio cimport printf
+
 cimport openmp
 
 
@@ -17,7 +19,7 @@ cdef double legendre(double x, int n, double[] coeffs) nogil:
     cdef:
         double c0, c1, tmp
         int nd, i
-        
+
     if n == 0:
         c0 = coeffs[0]
         c1 = 0
@@ -28,12 +30,12 @@ cdef double legendre(double x, int n, double[] coeffs) nogil:
         nd = n + 1
         c0 = coeffs[n - 1]
         c1 = coeffs[n]
-        
+
         for i in range(3, n + 2):
             tmp = c0
             nd = nd - 1
             c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd
-            c1 = tmp + (c1*x*(2*nd - 1))/n
+            c1 = tmp + (c1*x*(2*nd - 1))/nd
     return c0 + c1*x
 
 
@@ -57,8 +59,8 @@ cdef double tsqbx_from_source(
 
     tc_d = dist(target, center)
     sc_d = dist(source, center)
-    r = sc_d / tc_d
-    tmp[0] = 1 / tc_d
+    r = tc_d / sc_d
+    tmp[0] = 1 / sc_d
 
     for i in range(1, order + 1):
         tmp[i] = tmp[i - 1] * r
@@ -72,67 +74,72 @@ cdef double tsqbx_from_source(
     return legendre(cos_angle, order, tmp)
 
 
-def form_target_specific_qbx_contributions(
+def eval_target_specific_global_qbx_locals(
+        int order,
         double[:,:] sources,
         double[:,:] targets,
-        double[:,:] global_qbx_centers,
-        int[:] target_to_center,
-        int order,
+        double[:,:] centers,
+        int[:] global_qbx_centers,
         int[:] qbx_center_to_target_box,
+        int[:] center_to_target_starts, int[:] center_to_target_lists,
         int[:] source_box_starts, int[:] source_box_lists,
         int[:] box_source_starts, int[:] box_source_counts_nonchild,
-        double[:] source_weights,
-        double[:] pot):
+        double[:] src_weights,
+        double complex[:] pot):
 
     cdef:
-        int itgt, icenter
+        int tgt, ictr, ctr
+        int itgt, itgt_start, itgt_end
         int tgt_box, src_ibox
         int isrc_box, isrc_box_start, isrc_box_end
         int isrc, isrc_start, isrc_end
         int i, tid
-        double result	
+        double result
         double[:,:] source, center, target, tmp
 
     # Yucky thread-local hack
     maxthreads = openmp.omp_get_max_threads()
 
-    source = np.zeros((maxthreads, 3))
-    target = np.zeros((maxthreads, 3))
-    center = np.zeros((maxthreads, 3))
-    tmp = np.zeros((maxthreads, 256))
+    source = np.zeros((1 + maxthreads, 3))
+    target = np.zeros((1 + maxthreads, 3))
+    center = np.zeros((1 + maxthreads, 3))
+    tmp = np.zeros((1 + maxthreads, 256))
 
     # TODO: Check if order > 256
 
-    for itgt in cython.parallel.prange(0, targets.shape[1], nogil=True,
-                                       schedule="dynamic", chunksize=10):
-        icenter = target_to_center[itgt]
-        if icenter == -1:
-            continue
+    for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0],
+                                       nogil=True, schedule="dynamic",
+                                       chunksize=10):
+        ctr = global_qbx_centers[ictr]
+        itgt_start = center_to_target_starts[ctr]
+        itgt_end = center_to_target_starts[ctr + 1]
+        tgt_box = qbx_center_to_target_box[ctr]
+        tid = cython.parallel.threadid()
 
-        result = 0
+        for i in range(3):
+            center[tid, i] = centers[i, ctr]
 
-        tgt_box = qbx_center_to_target_box[icenter]
+        for itgt in range(itgt_start, itgt_end):
+            result = 0
+            tgt = center_to_target_lists[itgt]
 
-        tid = cython.parallel.threadid()
+            for i in range(3):
+                target[tid, i] = targets[i, tgt]
 
-        for i in range(3):
-            target[tid, i] = targets[itgt, i]
-            center[tid, i] = global_qbx_centers[icenter, i]
-
-        isrc_box_start = source_box_starts[tgt_box]
-        isrc_box_end = source_box_starts[tgt_box + 1]
-
-        for isrc_box in range(isrc_box_start, isrc_box_end):
-            src_ibox = source_box_lists[isrc_box]
-            isrc_start = box_source_starts[src_ibox]
-            isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
-
-            for isrc in range(isrc_start, isrc_end):
-                for i in range(3):
-                    source[tid, i] = sources[i, isrc]
-                    
-                result = result + source_weights[isrc] * (
-                    tsqbx_from_source(&source[tid, 0], &center[tid, 0],
-                                      &target[tid, 0], order, &tmp[tid, 0]))
-
-        pot[itgt] = pot[itgt] + result
+            isrc_box_start = source_box_starts[tgt_box]
+            isrc_box_end = source_box_starts[tgt_box + 1]
+
+            for isrc_box in range(isrc_box_start, isrc_box_end):
+                src_ibox = source_box_lists[isrc_box]
+                isrc_start = box_source_starts[src_ibox]
+                isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
+
+                for isrc in range(isrc_start, isrc_end):
+                    for i in range(3):
+                        source[tid, i] = sources[i, isrc]
+
+                    result = result + src_weights[isrc] * (
+                        tsqbx_from_source(&source[tid, 0], &center[tid, 0],
+                                          &target[tid, 0], order, &tmp[tid, 0]))
+
+            pot[tgt] = pot[tgt] + result
diff --git a/setup.py b/setup.py
index 89dc02d4..a39b21bb 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ setup(name="pytential",
 
       packages=find_packages(),
 
-      ext_modules = cythonize(ext_modules),
+      ext_modules = cythonize(ext_modules)
 
       install_requires=[
           "pytest>=2.3",
-- 
GitLab


From ff09893c559d340e12cc28e8627c8754eaa78d4b Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 26 Apr 2018 23:32:41 -0500
Subject: [PATCH 4/6] Checkpoint

---
 pytential/qbx/fmmlib.py           |   4 +-
 pytential/qbx/target_specific.pyx | 114 ++++++++++++++++++++++--------
 setup.py                          |   2 +-
 3 files changed, 88 insertions(+), 32 deletions(-)

diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py
index f19d23e6..ae323a24 100644
--- a/pytential/qbx/fmmlib.py
+++ b/pytential/qbx/fmmlib.py
@@ -56,13 +56,13 @@ class QBXFMMLibExpansionWranglerCodeContainer(object):
             qbx_order, fmm_level_to_order,
             source_extra_kwargs={},
             kernel_extra_kwargs=None,
-            _use_tsqbx_list1=False):
+            _use_target_specific_list1=False):
 
         return QBXFMMLibExpansionWrangler(self, queue, geo_data, dtype,
                 qbx_order, fmm_level_to_order,
                 source_extra_kwargs,
                 kernel_extra_kwargs,
-                _use_tsqbx_list1)
+                _use_target_specific_list1)
 
 # }}}
 
diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx
index c668703a..c349bb56 100644
--- a/pytential/qbx/target_specific.pyx
+++ b/pytential/qbx/target_specific.pyx
@@ -11,32 +11,47 @@ from libc.stdio cimport printf
 cimport openmp
 
 
-cdef double legendre(double x, int n, double[] coeffs) nogil:
-    """Evaluate the Legendre series of order n at x.
+cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil:
+    """Compute the values of the Legendre polynomial up to order n at x.
+    Optionally, if derivs is non-NULL, compute the values of the derivative too.
 
-    Taken from SciPy.
+    Borrowed from fmmlib.
     """
     cdef:
-        double c0, c1, tmp
-        int nd, i
+        double pj, derj, pjm2, pjm1, derjm2, derjm1
+        int j
+
+    pjm2 = 1
+    pjm1 = x
+
+    vals[0] = 1
+    if derivs != NULL:
+        derivs[0] = 0
+        derjm2 = 0
+        derjm1 = 1
 
     if n == 0:
-        c0 = coeffs[0]
-        c1 = 0
-    elif n == 1:
-        c0 = coeffs[0]
-        c1 = coeffs[1]
-    else:
-        nd = n + 1
-        c0 = coeffs[n - 1]
-        c1 = coeffs[n]
-
-        for i in range(3, n + 2):
-            tmp = c0
-            nd = nd - 1
-            c0 = coeffs[1+n-i] - (c1*(nd - 1))/nd
-            c1 = tmp + (c1*x*(2*nd - 1))/nd
-    return c0 + c1*x
+        return
+
+    vals[1] = x
+    if derivs != NULL:
+        derivs[1] = 1
+
+    if n == 1:
+        return
+
+    for j in range(2, n + 1):
+        pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j
+        vals[j] = pj
+        pjm2 = pjm1
+        pjm1 = pj
+
+        if derivs != NULL:
+            derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2
+            derj = derj / j
+            derivs[j] = derj
+            derjm2 = derjm1
+            derjm1 = derj
 
 
 cdef double dist(double[3] a, double[3] b) nogil:
@@ -46,24 +61,56 @@ cdef double dist(double[3] a, double[3] b) nogil:
             (a[2] - b[2]) * (a[2] - b[2]))
 
 
-cdef double tsqbx_from_source(
+"""
+cdef void tsqbx_grad_from_source(
         double[3] source,
         double[3] center,
         double[3] target,
+        double[3] grad,    
         int order,
         double[] tmp) nogil:
     cdef:
         int i
-        double r, sc_d, tc_d
-        double cos_angle
+        double result, r, sc_d, tc_d, cos_angle, alpha
+        double *derivs
+
+    derivs = &tmp[order + 1]
 
     tc_d = dist(target, center)
     sc_d = dist(source, center)
-    r = tc_d / sc_d
-    tmp[0] = 1 / sc_d
 
-    for i in range(1, order + 1):
-        tmp[i] = tmp[i - 1] * r
+    alpha = (
+            (target[0] - center[0]) * (source[0] - center[0]) +
+            (target[1] - center[1]) * (source[1] - center[1]) +
+            (target[2] - center[2]) * (source[2] - center[2])
+
+    cos_angle = alpha / (tc_d * sc_d)
+
+    legvals(cos_angle, order, tmp, derivs)
+
+    result = 0
+    r = 1 / sc_d
+
+    for i in range(0, order + 1):
+        result = result + tmp[i] * r
+        r = r * (tc_d / sc_d)
+
+    return result
+"""
+
+
+cdef double tsqbx_from_source(
+        double[3] source,
+        double[3] center,
+        double[3] target,
+        int order,
+        double[] tmp) nogil:
+    cdef:
+        int i
+        double result, r, sc_d, tc_d, cos_angle
+
+    tc_d = dist(target, center)
+    sc_d = dist(source, center)
 
     cos_angle = ((
             (target[0] - center[0]) * (source[0] - center[0]) +
@@ -71,7 +118,16 @@ cdef double tsqbx_from_source(
             (target[2] - center[2]) * (source[2] - center[2]))
             / (tc_d * sc_d))
 
-    return legendre(cos_angle, order, tmp)
+    legvals(cos_angle, order, tmp, NULL)
+
+    result = 0
+    r = 1 / sc_d
+
+    for i in range(0, order + 1):
+        result = result + tmp[i] * r
+        r = r * (tc_d / sc_d)
+
+    return result
 
 
 def eval_target_specific_global_qbx_locals(
diff --git a/setup.py b/setup.py
index a39b21bb..89dc02d4 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ setup(name="pytential",
 
       packages=find_packages(),
 
-      ext_modules = cythonize(ext_modules)
+      ext_modules = cythonize(ext_modules),
 
       install_requires=[
           "pytest>=2.3",
-- 
GitLab


From c62e9a3b81e7add42e6cdf829564488b8661ec13 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 27 Apr 2018 01:40:51 -0500
Subject: [PATCH 5/6] Get DLP working.

---
 pytential/qbx/fmmlib.py           |   3 +-
 pytential/qbx/target_specific.pyx |  92 ++++++++++++++--------
 test/test_target_specific_qbx.py  | 123 ++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+), 33 deletions(-)
 create mode 100644 test/test_target_specific_qbx.py

diff --git a/pytential/qbx/fmmlib.py b/pytential/qbx/fmmlib.py
index ae323a24..43fcbb13 100644
--- a/pytential/qbx/fmmlib.py
+++ b/pytential/qbx/fmmlib.py
@@ -599,7 +599,8 @@ class QBXFMMLibExpansionWrangler(FMMLibExpansionWrangler):
                     source_box_lists=trav.neighbor_source_boxes_lists,
                     box_source_starts=self.tree.box_source_starts,
                     box_source_counts_nonchild=self.tree.box_source_counts_nonchild,
-                    src_weights=src_weights,
+                    dipstr=src_weights,
+                    dipvec=self.dipole_vec,
                     pot=output)
 
         return pot
diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx
index c349bb56..cbb5d73b 100644
--- a/pytential/qbx/target_specific.pyx
+++ b/pytential/qbx/target_specific.pyx
@@ -43,8 +43,6 @@ cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil:
     for j in range(2, n + 1):
         pj = ( (2*j-1)*x*pjm1-(j-1)*pjm2 ) / j
         vals[j] = pj
-        pjm2 = pjm1
-        pjm1 = pj
 
         if derivs != NULL:
             derj = (2*j-1)*(pjm1+x*derjm1)-(j-1)*derjm2
@@ -53,6 +51,9 @@ cdef void legvals(double x, int n, double[] vals, double[] derivs) nogil:
             derjm2 = derjm1
             derjm1 = derj
 
+        pjm2 = pjm1
+        pjm1 = pj
+
 
 cdef double dist(double[3] a, double[3] b) nogil:
     return sqrt(
@@ -61,20 +62,24 @@ cdef double dist(double[3] a, double[3] b) nogil:
             (a[2] - b[2]) * (a[2] - b[2]))
 
 
-"""
 cdef void tsqbx_grad_from_source(
         double[3] source,
         double[3] center,
         double[3] target,
-        double[3] grad,    
-        int order,
-        double[] tmp) nogil:
+        double[3] grad,
+        int order) nogil:
     cdef:
-        int i
-        double result, r, sc_d, tc_d, cos_angle, alpha
-        double *derivs
-
-    derivs = &tmp[order + 1]
+        int i, j
+        double result, sc_d, tc_d, cos_angle, alpha, R
+        double[128] tmp
+        double[128] derivs
+        double[3] cms
+        double[3] tmc
+
+    for j in range(3):
+        cms[j] = center[j] - source[j]
+        tmc[j] = target[j] - center[j]
+        grad[j] = 0
 
     tc_d = dist(target, center)
     sc_d = dist(source, center)
@@ -82,32 +87,37 @@ cdef void tsqbx_grad_from_source(
     alpha = (
             (target[0] - center[0]) * (source[0] - center[0]) +
             (target[1] - center[1]) * (source[1] - center[1]) +
-            (target[2] - center[2]) * (source[2] - center[2])
+            (target[2] - center[2]) * (source[2] - center[2]))
 
     cos_angle = alpha / (tc_d * sc_d)
 
     legvals(cos_angle, order, tmp, derivs)
 
-    result = 0
-    r = 1 / sc_d
+    R = 1 / sc_d
 
     for i in range(0, order + 1):
-        result = result + tmp[i] * r
-        r = r * (tc_d / sc_d)
+        # Invariant: R = (t_cd ** i / sc_d ** (i + 1))
+        for j in range(3):
+            grad[j] += (i + 1) * cms[j] / (sc_d ** 2) * R * tmp[i]
+        for j in range(3):
+            # Siegel and Tornberg has a sign flip here :(
+            grad[j] += (
+                    tmc[j] / (tc_d * sc_d) +
+                    alpha * cms[j] / (tc_d * sc_d ** 3)) * R * derivs[i]
+        R *= (tc_d / sc_d)
 
-    return result
-"""
+    return
 
 
 cdef double tsqbx_from_source(
         double[3] source,
         double[3] center,
         double[3] target,
-        int order,
-        double[] tmp) nogil:
+        int order) nogil:
     cdef:
         int i
         double result, r, sc_d, tc_d, cos_angle
+        double tmp[128]
 
     tc_d = dist(target, center)
     sc_d = dist(source, center)
@@ -124,8 +134,8 @@ cdef double tsqbx_from_source(
     r = 1 / sc_d
 
     for i in range(0, order + 1):
-        result = result + tmp[i] * r
-        r = r * (tc_d / sc_d)
+        result +=  tmp[i] * r
+        r *= (tc_d / sc_d)
 
     return result
 
@@ -140,7 +150,8 @@ def eval_target_specific_global_qbx_locals(
         int[:] center_to_target_starts, int[:] center_to_target_lists,
         int[:] source_box_starts, int[:] source_box_lists,
         int[:] box_source_starts, int[:] box_source_counts_nonchild,
-        double[:] src_weights,
+        double[:] dipstr,
+        double[:,:] dipvec,
         double complex[:] pot):
 
     cdef:
@@ -151,15 +162,22 @@ def eval_target_specific_global_qbx_locals(
         int isrc, isrc_start, isrc_end
         int i, tid
         double result
-        double[:,:] source, center, target, tmp
+        double[:,:] source, center, target, grad
+        int slp, dlp
+
+    slp = (dipstr is not None) and (dipvec is None)
+    dlp = (dipstr is not None) and (dipvec is not None)
+
+    if not (slp or dlp):
+        raise ValueError("should specify exactly one of src_weights or dipvec")
 
-    # Yucky thread-local hack
+    # Hack to obtain thread-local storage
     maxthreads = openmp.omp_get_max_threads()
 
-    source = np.zeros((1 + maxthreads, 3))
-    target = np.zeros((1 + maxthreads, 3))
-    center = np.zeros((1 + maxthreads, 3))
-    tmp = np.zeros((1 + maxthreads, 256))
+    source = np.zeros((maxthreads, 3))
+    target = np.zeros((maxthreads, 3))
+    center = np.zeros((maxthreads, 3))
+    grad = np.zeros((maxthreads, 3))
 
     # TODO: Check if order > 256
 
@@ -194,8 +212,18 @@ def eval_target_specific_global_qbx_locals(
                     for i in range(3):
                         source[tid, i] = sources[i, isrc]
 
-                    result = result + src_weights[isrc] * (
-                        tsqbx_from_source(&source[tid, 0], &center[tid, 0],
-                                          &target[tid, 0], order, &tmp[tid, 0]))
+                    if slp:
+                        # Don't replace with +=, since that makes Cython think
+                        # it is a reduction.
+                        result = result + dipstr[isrc] * (
+                                tsqbx_from_source(&source[tid, 0], &center[tid, 0],
+                                                  &target[tid, 0], order))
+                    elif dlp:
+                        tsqbx_grad_from_source(&source[tid, 0], &center[tid, 0],
+                                               &target[tid, 0], &grad[tid, 0], order)
+                        result = result + dipstr[isrc] * (
+                                grad[tid, 0] * dipvec[0, isrc] +
+                                grad[tid, 1] * dipvec[1, isrc] +
+                                grad[tid, 2] * dipvec[2, isrc])
 
             pot[tgt] = pot[tgt] + result
diff --git a/test/test_target_specific_qbx.py b/test/test_target_specific_qbx.py
new file mode 100644
index 00000000..60a801d4
--- /dev/null
+++ b/test/test_target_specific_qbx.py
@@ -0,0 +1,123 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2013-2017 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pytest
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+from functools import partial
+from meshmode.mesh.generation import (  # noqa
+        ellipse, cloverleaf, starfish, drop, n_gon, qbx_peanut, WobblyCircle,
+        NArmedStarfish,
+        make_curve_mesh)
+# from sumpy.visualization import FieldPlotter
+from pytential import bind, sym, norm
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import matplotlib.pyplot as pt
+except ImportError:
+    pass
+
+
+@pytest.mark.parametrize("op", ["S", "D"])
+def test_target_specific_qbx(ctx_getter, op):
+    logging.basicConfig(level=logging.INFO)
+
+    cl_ctx = ctx_getter()
+    queue = cl.CommandQueue(cl_ctx)
+
+    target_order = 4
+
+    from meshmode.mesh.generation import generate_icosphere
+    mesh = generate_icosphere(1, target_order)
+
+    from meshmode.discretization import Discretization
+    from meshmode.discretization.poly_element import \
+        InterpolatoryQuadratureSimplexGroupFactory
+    from pytential.qbx import QBXLayerPotentialSource
+    pre_density_discr = Discretization(
+        cl_ctx, mesh,
+        InterpolatoryQuadratureSimplexGroupFactory(target_order))
+
+    qbx, _ = QBXLayerPotentialSource(
+            pre_density_discr, 4*target_order,
+            qbx_order=5,
+            fmm_order=10,
+            fmm_backend="fmmlib",
+            _expansions_in_tree_have_extent=True,
+            _expansion_stick_out_factor=0.9,
+            ).with_refinement()
+
+    density_discr = qbx.density_discr
+
+    nodes_host = density_discr.nodes().get(queue)
+    center = np.array([3, 1, 2])
+    diff = nodes_host - center[:, np.newaxis]
+    
+    dist_squared = np.sum(diff**2, axis=0)
+    dist = np.sqrt(dist_squared)
+    u = 1/dist
+    
+    u_dev = cl.array.to_device(queue, u)
+
+    kernel = LaplaceKernel(3)
+    u_sym = sym.var("u")
+    
+    if op == "S":
+        op = sym.S
+    elif op == "D":
+        op = sym.D
+    expr = op(kernel, u_sym, qbx_forced_limit=-1)
+
+    bound_op = bind(qbx, expr)
+    slp_ref = bound_op(queue, u=u_dev)
+
+    qbx = qbx.copy(_use_tsqbx_list1=True)
+    bound_op = bind(qbx, expr)
+    slp_tsqbx = bound_op(queue, u=u_dev)
+
+    assert (np.max(np.abs(slp_ref.get() - slp_tsqbx.get()))) < 1e-13
+
+
+# You can test individual routines by typing
+# $ python test_layer_pot_identity.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker
-- 
GitLab


From 29356b4154d5be017a24c899e8ee341259d9d1db Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 27 Apr 2018 18:53:18 -0500
Subject: [PATCH 6/6] [ci skip] Performance tweaking.

---
 pytential/qbx/target_specific.pyx | 19 +++++++++++--------
 setup.py                          |  4 ++--
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/pytential/qbx/target_specific.pyx b/pytential/qbx/target_specific.pyx
index cbb5d73b..7cde3a57 100644
--- a/pytential/qbx/target_specific.pyx
+++ b/pytential/qbx/target_specific.pyx
@@ -98,12 +98,12 @@ cdef void tsqbx_grad_from_source(
     for i in range(0, order + 1):
         # Invariant: R = (t_cd ** i / sc_d ** (i + 1))
         for j in range(3):
-            grad[j] += (i + 1) * cms[j] / (sc_d ** 2) * R * tmp[i]
+            grad[j] += (i + 1) * cms[j] / (sc_d * sc_d) * R * tmp[i]
         for j in range(3):
             # Siegel and Tornberg has a sign flip here :(
             grad[j] += (
                     tmc[j] / (tc_d * sc_d) +
-                    alpha * cms[j] / (tc_d * sc_d ** 3)) * R * derivs[i]
+                    alpha * cms[j] / (tc_d * sc_d * sc_d * sc_d)) * R * derivs[i]
         R *= (tc_d / sc_d)
 
     return
@@ -168,22 +168,25 @@ def eval_target_specific_global_qbx_locals(
     slp = (dipstr is not None) and (dipvec is None)
     dlp = (dipstr is not None) and (dipvec is not None)
 
+    print("Hi from Cython")
+
     if not (slp or dlp):
         raise ValueError("should specify exactly one of src_weights or dipvec")
 
     # Hack to obtain thread-local storage
     maxthreads = openmp.omp_get_max_threads()
 
-    source = np.zeros((maxthreads, 3))
-    target = np.zeros((maxthreads, 3))
-    center = np.zeros((maxthreads, 3))
-    grad = np.zeros((maxthreads, 3))
+    # Prevent false sharing by over-allocating the buffers
+    source = np.zeros((maxthreads, 65))
+    target = np.zeros((maxthreads, 65))
+    center = np.zeros((maxthreads, 65))
+    grad = np.zeros((maxthreads, 65))
 
     # TODO: Check if order > 256
 
     for ictr in cython.parallel.prange(0, global_qbx_centers.shape[0],
-                                       nogil=True, schedule="dynamic",
-                                       chunksize=10):
+                                       nogil=True, schedule="static",
+                                       chunksize=128):
         ctr = global_qbx_centers[ictr]
         itgt_start = center_to_target_starts[ctr]
         itgt_end = center_to_target_starts[ctr + 1]
diff --git a/setup.py b/setup.py
index 89dc02d4..59414845 100644
--- a/setup.py
+++ b/setup.py
@@ -60,8 +60,8 @@ ext_modules = [
         Extension(
             "pytential.qbx.target_specific",
             ["pytential/qbx/target_specific.pyx"],
-            extra_compile_args=['-fopenmp'],
-            extra_link_args=['-fopenmp']
+            extra_compile_args=["-fopenmp"],
+            extra_link_args=["-fopenmp"]
         )
 ]
 
-- 
GitLab