From 73e4d00114abb722ecc94149edbab6b86c464a7f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 11 Jul 2013 00:12:41 -0400
Subject: [PATCH] Add, test Taylor M2P, P2M

---
 MEMO                         |   8 ++
 sumpy/e2p.py                 | 166 ++++++++++++++++++++++++++++
 sumpy/expansion/__init__.py  |  10 +-
 sumpy/expansion/local.py     |  16 +--
 sumpy/expansion/multipole.py |  34 ++++--
 sumpy/kernel.py              |   7 +-
 sumpy/p2e.py                 | 146 +++++++++++++++++++++++++
 sumpy/p2p.py                 |   8 +-
 sumpy/qbx.py                 |   2 +-
 sumpy/tools.py               |   2 +-
 test/test_kernels.py         | 205 +++++++++++++++++++----------------
 11 files changed, 481 insertions(+), 123 deletions(-)
 create mode 100644 sumpy/e2p.py
 create mode 100644 sumpy/p2e.py

diff --git a/MEMO b/MEMO
index 6177f9fe..13852b25 100644
--- a/MEMO
+++ b/MEMO
@@ -1 +1,9 @@
 - AoS/SoA flexibility?
+
+Conventions:
+
+a = center - src
+b = tgt - center
+d = tgt - src
+
+Kernel scaling is done strictly on output to point values
diff --git a/sumpy/e2p.py b/sumpy/e2p.py
new file mode 100644
index 00000000..9d8ab4a4
--- /dev/null
+++ b/sumpy/e2p.py
@@ -0,0 +1,166 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import loopy as lp
+import sympy as sp
+from pytools import memoize_method
+
+
+class E2P(object):
+    def __init__(self, ctx, expansion, kernels,
+            options=[], name="e2p", device=None):
+        """
+        :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase`
+        :arg strength_usage: A list of integers indicating which expression
+          uses which source strength indicator. This implicitly specifies the
+          number of strength arrays that need to be passed.
+          Default: all kernels use the same strength.
+        """
+
+        if device is None:
+            device = ctx.devices[0]
+
+        self.ctx = ctx
+        self.expansion = expansion
+        self.kernels = kernels
+        self.options = options
+        self.name = name
+        self.device = device
+
+        self.dim = expansion.dim
+
+        from sumpy.kernel import TargetDerivativeRemover
+        tdr = TargetDerivativeRemover()
+        for knl in kernels:
+            assert tdr(knl) == expansion.kernel
+
+    @memoize_method
+    def get_kernel(self):
+        from sumpy.symbolic import make_sym_vector
+        bvec = make_sym_vector("b", self.dim)
+
+        from sumpy.assignment_collection import SymbolicAssignmentCollection
+        sac = SymbolicAssignmentCollection()
+
+        coeff_exprs = [sp.Symbol("coeff%d" % i)
+                for i in xrange(len(self.expansion.get_coefficient_identifiers()))]
+        value = self.expansion.evaluate(coeff_exprs, bvec)
+        result_names = [
+            sac.assign_unique("result_%d_p" % i,
+                knl.postprocess_at_target(value, bvec))
+            for i, knl in enumerate(self.kernels)
+            ]
+
+        sac.run_global_cse()
+
+        from sumpy.symbolic import kill_trivial_assignments
+        assignments = kill_trivial_assignments([
+                (name, expr)
+                for name, expr in sac.assignments.iteritems()],
+                retain_names=result_names)
+
+
+        from sumpy.codegen import to_loopy_insns
+        loopy_insns = to_loopy_insns(assignments,
+                vector_names=set(["b"]),
+                pymbolic_expr_maps=[self.expansion.transform_to_code],
+                complex_dtype=np.complex128  # FIXME
+                )
+
+        from pymbolic.sympy_interface import SympyToPymbolicMapper
+        sympy_conv = SympyToPymbolicMapper()
+        loopy_insns.append(
+                lp.ExpressionInstruction(id=None,
+                    assignee="kernel_scaling",
+                    expression=sympy_conv(self.expansion.kernel.get_scaling()),
+                    temp_var_type=lp.auto))
+
+        arguments = (
+                [
+                    lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")),
+                    lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
+                        None, shape=None),
+                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
+                    lp.GlobalArg("expansions", None,
+                        shape=(len(coeff_exprs), "nboxes")),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.ValueArg("ntargets", np.int32),
+                    "..."
+                ] + [
+                    lp.GlobalArg("result_%d" % i, None, shape="ntargets")
+                    for i in range(len(result_names))
+                ] + self.expansion.get_args()
+                )
+
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
+                    "{[itgt,idim]: itgt_start<=itgt<itgt_end and 0<=idim<dim}",
+                    ],
+                loopy_insns
+                + [
+                    "<> tgt_ibox = target_boxes[itgt_box]",
+                    "<> itgt_start = box_target_starts[tgt_ibox]",
+                    "<> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]",
+                    "<> center[idim] = centers[idim, tgt_ibox] {id=fetch_center}",
+                    "<> b[idim] = targets[idim, itgt] - center[idim]",
+                    "<> coeff${COEFFIDX} = expansions[tgt_ibox, ${COEFFIDX}]",
+                    "result_${RESULTIDX}[itgt] = kernel_scaling * result_${RESULTIDX}_p"
+                ],
+                arguments,
+                name=self.name, assumptions="ntgt_boxes>=1",
+                preambles=self.expansion.get_preambles(),
+                defines=dict(
+                    dim=self.dim,
+                    COEFFIDX=[str(i) for i in xrange(len(coeff_exprs))],
+                    RESULTIDX=[str(i) for i in xrange(len(result_names))],
+                    )
+                )
+
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center",
+                tags={"idim": "unr"})
+        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        #knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg expansions:
+        :arg target_boxes:
+        :arg box_target_starts:
+        :arg box_target_counts_nonchild:
+        :arg centers:
+        :arg targets:
+        """
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, **kwargs)
diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py
index e52ff61d..3d4ae792 100644
--- a/sumpy/expansion/__init__.py
+++ b/sumpy/expansion/__init__.py
@@ -29,6 +29,9 @@ from pytools import memoize_method
 
 class ExpansionBase(object):
     def __init__(self, kernel, order):
+        from sumpy.kernel import TargetDerivativeRemover
+        kernel = TargetDerivativeRemover()(kernel)
+
         self.kernel = kernel
         self.order = order
 
@@ -45,6 +48,9 @@ class ExpansionBase(object):
     def prepare_loopy_kernel(self, loopy_knl):
         return self.kernel.prepare_loopy_kernel(loopy_knl)
 
+    def transform_to_code(self, expr):
+        return self.kernel.transform_to_code(expr)
+
     def get_scaling(self):
         return self.kernel.get_scaling()
 
@@ -80,15 +86,13 @@ class VolumeTaylorExpansionBase(object):
         return self._storage_loc_dict()[k]
 
     @memoize_method
-    def get_coefficient_indices(self):
+    def get_coefficient_identifiers(self):
         from pytools import (
                 generate_nonnegative_integer_tuples_summing_to_at_most
                 as gnitstam)
 
         return sorted(gnitstam(self.order, self.kernel.dim), key=sum)
 
-        return range(self.order+1)
-
 # }}}
 
 
diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py
index 73859973..83f51465 100644
--- a/sumpy/expansion/local.py
+++ b/sumpy/expansion/local.py
@@ -37,7 +37,7 @@ class LineTaylorLocalExpansion(LocalExpansionBase):
     def get_storage_index(self, k):
         return k
 
-    def get_coefficient_indices(self):
+    def get_coefficient_identifiers(self):
         return range(self.order+1)
 
     def coefficients_from_source(self, avec, bvec):
@@ -56,13 +56,13 @@ class LineTaylorLocalExpansion(LocalExpansionBase):
                         avec),
                     bvec)
                 .subs("tau", 0)
-                for i in self.get_coefficient_indices()]
+                for i in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
         from pytools import factorial
         return sum(
                 coeffs[self.get_storage_index(i)] / factorial(i)
-                for i in self.get_coefficient_indices())
+                for i in self.get_coefficient_identifiers())
 
 # }}}
 
@@ -75,7 +75,7 @@ class VolumeTaylorLocalExpansion(LocalExpansionBase, VolumeTaylorExpansionBase):
         ppkernel = self.kernel.postprocess_at_source(
                 self.kernel.get_expression(avec), avec)
         return [mi_derivative(ppkernel, avec, mi)
-                for mi in self.get_coefficient_indices()]
+                for mi in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
         from sumpy.tools import mi_power, mi_factorial
@@ -83,7 +83,7 @@ class VolumeTaylorLocalExpansion(LocalExpansionBase, VolumeTaylorExpansionBase):
                 coeff
                 * self.kernel.postprocess_at_target(mi_power(bvec, mi), bvec)
                 / mi_factorial(mi)
-                for coeff, mi in zip(coeffs, self.get_coefficient_indices()))
+                for coeff, mi in zip(coeffs, self.get_coefficient_identifiers()))
 
 # }}}
 
@@ -101,7 +101,7 @@ class H2DLocalExpansion(LocalExpansionBase):
     def get_storage_index(self, k):
         return self.order+k
 
-    def get_coefficient_indices(self):
+    def get_coefficient_identifiers(self):
         return range(-self.order, self.order+1)
 
     def coefficients_from_source(self, avec, bvec):
@@ -116,7 +116,7 @@ class H2DLocalExpansion(LocalExpansionBase):
                 self.kernel.postprocess_at_source(
                     hankel_1(i, sp.Symbol("k")*u)*e_i_csangle**i,
                     avec)
-                    for i in self.get_coefficient_indices()]
+                    for i in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
         bessel_j = sp.Function("bessel_j")
@@ -132,7 +132,7 @@ class H2DLocalExpansion(LocalExpansionBase):
                     * self.kernel.postprocess_at_target(
                         bessel_j(i, sp.Symbol("k")*v)
                         * e_i_ctangle**i, bvec)
-                for i in self.get_coefficient_indices())
+                for i in self.get_coefficient_identifiers())
 
 # }}}
 
diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 21af04ce..84a19e9a 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -36,19 +36,35 @@ class MultipoleExpansionBase(ExpansionBase):
 class VolumeTaylorMultipoleExpansion(
         MultipoleExpansionBase, VolumeTaylorExpansionBase):
     def coefficients_from_source(self, avec, bvec):
-        from sumpy.tools import mi_derivative
-        ppkernel = self.kernel.postprocess_at_source(
-                self.kernel.get_expression(avec), avec)
-        return [mi_derivative(ppkernel, avec, mi)
-                for mi in self.get_coefficient_indices()]
+        from sumpy.kernel import DirectionalSourceDerivative
+        kernel = self.kernel
 
-    def evaluate(self, coeffs, bvec):
         from sumpy.tools import mi_power, mi_factorial
+
+        if isinstance(kernel, DirectionalSourceDerivative):
+            if kernel.get_base_kernel() is not kernel.inner_kernel:
+                raise NotImplementedError("more than one source derivative "
+                        "not supported at present")
+
+            from sumpy.symbolic import make_sym_vector
+            dir_vec = make_sym_vector(kernel.dir_vec_name, kernel.dim)
+
+            result = [0]
+            1/0
+        else:
+            return [
+                    mi_power(avec, mi) / mi_factorial(mi)
+                    for mi in self.get_coefficient_identifiers()]
+
+    def evaluate(self, coeffs, bvec):
+        ppkernel = self.kernel.postprocess_at_target(
+                self.kernel.get_expression(bvec), bvec)
+
+        from sumpy.tools import mi_derivative
         return sum(
                 coeff
-                * self.kernel.postprocess_at_target(mi_power(bvec, mi), bvec)
-                / mi_factorial(mi)
-                for coeff, mi in zip(coeffs, self.get_coefficient_indices()))
+                * mi_derivative(ppkernel, bvec, mi)
+                for coeff, mi in zip(coeffs, self.get_coefficient_identifiers()))
 
 # }}}
 
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index fbc43d33..1fb00031 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -95,7 +95,7 @@ class Kernel(object):
         return expr
 
     def get_expression(self, dist_vec):
-        """Return a :mod:`pymbolic` expression for the kernel.
+        """Return a :mod:`sympy` expression for the kernel.
 
         :arg dist_vec: target - source
 
@@ -502,6 +502,11 @@ class AxisTargetDerivativeRemover(KernelIdentityMapper):
         return self.rec(kernel.inner_kernel)
 
 
+class TargetDerivativeRemover(AxisTargetDerivativeRemover):
+    def map_directional_target_derivative(self, kernel):
+        return self.rec(kernel.inner_kernel)
+
+
 class DerivativeCounter(KernelCombineMapper):
     def combine(self, values):
         return max(values)
diff --git a/sumpy/p2e.py b/sumpy/p2e.py
new file mode 100644
index 00000000..547cb4ce
--- /dev/null
+++ b/sumpy/p2e.py
@@ -0,0 +1,146 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import loopy as lp
+from pytools import memoize_method
+
+
+class P2E(object):
+    def __init__(self, ctx, expansion, coeff_dtype=None,
+            options=[], name="p2e", device=None):
+        """
+        :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase`
+        :arg strength_usage: A list of integers indicating which expression
+          uses which source strength indicator. This implicitly specifies the
+          number of strength arrays that need to be passed.
+          Default: all kernels use the same strength.
+        """
+
+        if device is None:
+            device = ctx.devices[0]
+
+        self.ctx = ctx
+        self.expansion = expansion
+        self.coeff_dtype = coeff_dtype
+        self.options = options
+        self.name = name
+        self.device = device
+
+        self.dim = expansion.dim
+
+    @memoize_method
+    def get_kernel(self):
+        from sumpy.symbolic import make_sym_vector
+        avec = make_sym_vector("a", self.dim)
+
+        from sumpy.assignment_collection import SymbolicAssignmentCollection
+        sac = SymbolicAssignmentCollection()
+
+        coeff_names = [
+                sac.assign_unique("coeff%d" % i, coeff_i)
+                for i, coeff_i in enumerate(
+                    self.expansion.coefficients_from_source(avec, None))]
+
+        sac.run_global_cse()
+
+        from sumpy.symbolic import kill_trivial_assignments
+        assignments = kill_trivial_assignments([
+                (name, expr)
+                for name, expr in sac.assignments.iteritems()],
+                retain_names=coeff_names)
+
+        from sumpy.codegen import to_loopy_insns
+        loopy_insns = to_loopy_insns(assignments,
+                vector_names=set(["a"]),
+                pymbolic_expr_maps=[self.expansion.transform_to_code],
+                complex_dtype=np.complex128  # FIXME
+                )
+
+        arguments = (
+                [
+                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources")),
+                    lp.GlobalArg("strengths", None, shape="nsources"),
+                    lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
+                        None, shape=None),
+                    lp.GlobalArg("centers", None, shape="dim, nboxes"),
+                    lp.GlobalArg("expansions", None,
+                        shape=("nboxes", len(coeff_names))),
+                    lp.ValueArg("nboxes", np.int32),
+                    lp.ValueArg("nsources", np.int32),
+                    "..."
+                ] + self.expansion.get_args())
+
+        loopy_knl = lp.make_kernel(self.device,
+                [
+                    "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
+                    "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
+                    ],
+                loopy_insns
+                + [
+                    "<> src_ibox = source_boxes[isrc_box]",
+                    "<> isrc_start = box_source_starts[src_ibox]",
+                    "<> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]",
+                    "<> center[idim] = centers[idim, src_ibox] {id=fetch_center}",
+                    "<> a[idim] = center[idim] - sources[idim, isrc]",
+                    "<> strength = strengths[isrc]",
+                    "expansions[src_ibox, ${COEFFIDX}] = "
+                            "sum(isrc, strength*coeff${COEFFIDX})"
+                            "{id_prefix=write_expn}"
+                ],
+                arguments,
+                name=self.name, assumptions="nsrc_boxes>=1",
+                preambles=self.expansion.get_preambles(),
+                defines=dict(
+                    dim=self.dim,
+                    COEFFIDX=[str(i) for i in xrange(len(coeff_names))]
+                    ),
+                silenced_warnings="write_race(write_expn*)")
+
+        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
+        loopy_knl = lp.duplicate_inames(loopy_knl, "idim", "fetch_center",
+                tags={"idim": "unr"})
+
+        return loopy_knl
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        knl = self.get_kernel()
+        knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg expansions:
+        :arg source_boxes:
+        :arg box_source_starts:
+        :arg box_source_counts_nonchild:
+        :arg centers:
+        :arg sources:
+        :arg strengths:
+        """
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, **kwargs)
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index e9433255..129591e5 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -142,14 +142,10 @@ class P2P(KernelComputation):
         knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
         return knl
 
-    @memoize_method
-    def get_compiled_kernel(self):
-        return lp.CompiledKernel(self.context, self.get_optimized_kernel())
-
     def __call__(self, queue, targets, sources, src_strengths, **kwargs):
-        cknl = self.get_compiled_kernel()
+        knl = self.get_optimized_kernel()
 
         for i, sstr in enumerate(src_strengths):
             kwargs["strength_%d" % i] = sstr
 
-        return cknl(queue, src=sources, tgt=targets, **kwargs)
+        return knl(queue, src=sources, tgt=targets, **kwargs)
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 4cfd3406..e8e03b0a 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -54,7 +54,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec):
                     sac.assign_unique("expn%dcoeff%s" % (
                         expansion_nr, stringify_expn_index(i)),
                         coefficients[expansion.get_storage_index(i)]))
-            for i in expansion.get_coefficient_indices()]
+            for i in expansion.get_coefficient_identifiers()]
 
     return sac.assign_unique("expn%d_result" % expansion_nr,
             expansion.evaluate(assigned_coeffs, bvec))
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 074f2c47..ce82681e 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -86,7 +86,7 @@ def vector_to_device(queue, vec):
 
 # {{{  KernelComputation
 
-class KernelComputation:
+class KernelComputation(object):
     """Common input processing for kernel computations."""
 
     def __init__(self, ctx, kernels, strength_usage,
diff --git a/test/test_kernels.py b/test/test_kernels.py
index b48ef665..604366c0 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -1,19 +1,43 @@
 from __future__ import division
 
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
 import numpy as np
 import numpy.linalg as la
 import sys
 import pytools.test
 
+import pytest
 import pyopencl.array as cl_array
 import pyopencl as cl
-from pyopencl.tools import pytest_generate_tests_for_pyopencl \
-        as pytest_generate_tests
-
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
+import logging
+logger = logging.getLogger(__name__)
 
 
-@pytools.test.mark_test.opencl
+@pytest.mark.opencl
 def test_p2p(ctx_getter):
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
@@ -21,141 +45,140 @@ def test_p2p(ctx_getter):
     dimensions = 3
     n = 5000
 
-    from sumpy.kernel import LaplaceKernel, TargetDerivative
+    from sumpy.kernel import LaplaceKernel, AxisTargetDerivative
     from sumpy.p2p import P2P
     lknl = LaplaceKernel(dimensions)
-    knl = P2P(ctx, [lknl, TargetDerivative(0, lknl)], exclude_self=False)
+    knl = P2P(ctx,
+            [lknl, AxisTargetDerivative(0, lknl)],
+            exclude_self=False)
 
     targets = np.random.rand(dimensions, n)
     sources = np.random.rand(dimensions, n)
 
-    #from sumpy.tools import vector_to_device
-    #targets_dev = vector_to_device(queue, targets)
-    #sources_dev = vector_to_device(queue, sources)
-    targets_dev = cl_array.to_device(queue, targets.T.copy())
-    sources_dev = cl_array.to_device(queue, sources.T.copy())
-    strengths_dev = cl_array.empty(queue, n, dtype=np.float64)
-    strengths_dev.fill(1)
+    strengths = np.ones(n, dtype=np.float64)
 
-    evt, (potential_dev, x_derivative) = knl(
-            queue, targets_dev, sources_dev, [strengths_dev])
+    evt, (potential, x_derivative) = knl(
+            queue, targets, sources, [strengths],
+            out_host=True)
 
-    potential = potential_dev.get()
-    potential_host = np.empty_like(potential)
-    strengths = strengths_dev.get()
+    potential_ref = np.empty_like(potential)
 
     targets = targets.T
     sources = sources.T
     for itarg in xrange(n):
-        potential_host[itarg] = np.sum(
+        potential_ref[itarg] = np.sum(
                 strengths
                 /
                 np.sum((targets[itarg] - sources)**2, axis=-1)**0.5)
 
-    potential_host *= 1/(4*np.pi)
-    rel_err = la.norm(potential - potential_host)/la.norm(potential_host)
+    potential_ref *= 1/(4*np.pi)
+
+    rel_err = la.norm(potential - potential_ref)/la.norm(potential_ref)
     print rel_err
     assert rel_err < 1e-3
 
 
-
-
 @pytools.test.mark_test.opencl
-def test_p2m2p(ctx_getter):
+@pytest.mark.parametrize("order", [2, 3, 4, 5])
+def test_p2m2p(ctx_getter, order):
+    logging.basicConfig(level=logging.INFO)
+
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    res = 100
-
-    dimensions = 3
-    sources = np.random.rand(dimensions, 5).astype(np.float32)
-    #targets = np.random.rand(dimensions, 10).astype(np.float32)
-    targets = np.mgrid[-2:2:res*1j, -2:2:res*1j, 2:2:1j].reshape(3, -1).astype(np.float32)
-    centers = np.array([[0.5]]*dimensions).astype(np.float32)
+    np.random.seed(17)
 
-    from sumpy.tools import vector_to_device
-    targets_dev = vector_to_device(queue, targets)
-    sources_dev = vector_to_device(queue, sources)
-    centers_dev = vector_to_device(queue, centers)
-    strengths_dev = cl_array.empty(queue, sources.shape[1], dtype=np.float32)
-    strengths_dev.fill(1)
+    res = 100
+    nsources = 500
 
-    cell_idx_to_particle_offset = np.array([0], dtype=np.uint32)
-    cell_idx_to_particle_cnt_src = np.array([sources.shape[1]], dtype=np.uint32)
-    cell_idx_to_particle_cnt_tgt = np.array([targets.shape[1]], dtype=np.uint32)
+    dim = 2
 
-    from pyopencl.array import to_device
-    cell_idx_to_particle_offset_dev = to_device(queue, cell_idx_to_particle_offset)
-    cell_idx_to_particle_cnt_src_dev = to_device(queue, cell_idx_to_particle_cnt_src)
-    cell_idx_to_particle_cnt_tgt_dev = to_device(queue, cell_idx_to_particle_cnt_tgt)
+    from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
+    from sumpy.kernel import LaplaceKernel, AxisTargetDerivative
+    knl = LaplaceKernel(dim)
+    out_kernels = [
+            knl, AxisTargetDerivative(0, knl)
+            ]
+    texp = VolumeTaylorMultipoleExpansion(knl, order=order)
 
-    from sumpy.symbolic import make_coulomb_kernel_in
-    from sumpy.expansion import TaylorMultipoleExpansion
-    texp = TaylorMultipoleExpansion(
-            make_coulomb_kernel_in("b", dimensions),
-            order=3, dimensions=dimensions)
+    sources = np.random.rand(dim, nsources).astype(np.float64)
+    centers = np.array([[0.5]]*dim, dtype=np.float64)
+    strengths = np.ones(nsources, dtype=np.float64)
 
-    coeff_dtype = np.float32
+    from sumpy.p2e import P2E
+    p2m = P2E(ctx, texp, out_kernels)
 
-    # {{{ apply P2M
+    from sumpy.e2p import E2P
+    m2p = E2P(ctx, texp, out_kernels)
 
-    from sumpy.p2m import P2MKernel
-    p2m = P2MKernel(ctx, texp)
-    mpole_coeff = p2m(cell_idx_to_particle_offset_dev,
-            cell_idx_to_particle_cnt_src_dev,
-            sources_dev, strengths_dev, centers_dev, coeff_dtype)
+    from sumpy.p2p import P2P
+    p2p = P2P(ctx, out_kernels, exclude_self=False)
 
-    # }}}
+    from pytools.convergence import EOCRecorder
+    eoc_rec = EOCRecorder()
 
-    # {{{ apply M2P
+    for dist in [3, 5, 7]:
+        targets = np.mgrid[dist:dist + 1:res*1j, 0:1:res*1j] \
+                .reshape(dim, -1).astype(np.float64)
 
-    output_maps = [
-            lambda expr: expr,
-            lambda expr: sp.diff(expr, sp.Symbol("t0"))
-            ]
+        source_boxes = np.array([0], dtype=np.int32)
+        box_source_starts = np.array([0], dtype=np.int32)
+        box_source_counts_nonchild = np.array([nsources], dtype=np.int32)
 
-    m2p_ilist_starts = np.array([0, 1], dtype=np.uint32)
-    m2p_ilist_mpole_offsets = np.array([0], dtype=np.uint32)
-
-    m2p_ilist_starts_dev = to_device(queue, m2p_ilist_starts)
-    m2p_ilist_mpole_offsets_dev = to_device(queue, m2p_ilist_mpole_offsets)
+        # {{{ apply P2M
 
-    from sumpy.m2p import M2PKernel
-    m2p = M2PKernel(ctx, texp, output_maps=output_maps)
-    potential_dev, x_derivative = m2p(targets_dev, m2p_ilist_starts_dev, m2p_ilist_mpole_offsets_dev, mpole_coeff, 
-            cell_idx_to_particle_offset_dev,
-            cell_idx_to_particle_cnt_tgt_dev)
+        evt, (mpoles,) = p2m(queue,
+                source_boxes=source_boxes,
+                box_source_starts=box_source_starts,
+                box_source_counts_nonchild=box_source_counts_nonchild,
+                centers=centers,
+                sources=sources,
+                strengths=strengths,
+                #iflags="print_hl_wrapper",
+                out_host=True)
 
-    # }}}
+        # }}}
 
-    # {{{ compute (direct) reference solution
+        # {{{ apply M2P
 
-    from sumpy.p2p import P2PKernel
-    from sumpy.symbolic import make_coulomb_kernel_ts
-    coulomb_knl = make_coulomb_kernel_ts(dimensions)
+        ntargets = targets.shape[-1]
 
-    knl = P2PKernel(ctx, dimensions, 
-            exprs=[f(coulomb_knl) for f in output_maps], exclude_self=False)
+        box_target_starts = np.array([0], dtype=np.int32)
+        box_target_counts_nonchild = np.array([ntargets], dtype=np.int32)
 
-    potential_dev_direct, x_derivative_dir = knl(targets_dev, sources_dev, strengths_dev)
+        evt, (pot, grad_x) = m2p(
+                queue,
+                expansions=mpoles,
+                target_boxes=source_boxes,
+                box_target_starts=box_target_starts,
+                box_target_counts_nonchild=box_target_counts_nonchild,
+                centers=centers,
+                targets=targets,
+                #iflags="print_hl",
+                out_host=True,
+                )
 
-    if 0:
-        import matplotlib.pyplot as pt
-        pt.imshow((potential_dev-potential_dev_direct).get().reshape(res, res))
-        pt.colorbar()
-        pt.show()
+        # }}}
 
-    assert la.norm((potential_dev-potential_dev_direct).get())/res**2 < 1e-3
+        # {{{ compute (direct) reference solution
 
-    # }}}
+        evt, (pot_direct, grad_x_direct) = p2p(
+                queue,
+                targets, sources, (strengths,),
+                out_host=True)
 
+        err = la.norm((pot - pot_direct)/res**2)
 
+        # }}}
 
+        eoc_rec.add_data_point(1/dist, err)
 
+    print eoc_rec
+    assert eoc_rec.order_estimate() > order + 0.5
 
 
 @pytools.test.mark_test.opencl
-def test_p2m2m2p(ctx_getter):
+def no_test_p2m2m2p(ctx_getter):
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
@@ -259,16 +282,10 @@ def test_p2m2m2p(ctx_getter):
     # }}}
 
 
-
-
 # You can test individual routines by typing
 # $ python test_kernels.py 'test_p2p(cl.create_some_context)'
 
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the tests.
-    import pyopencl as cl
-
-    import sys
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-- 
GitLab