diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97c8ee2404035df17dc5dade007d6593a117b3a1..d44dd00060d68ca69050f0a9f804c3b49e5d4c65 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,17 +1,3 @@
-Python 3.5 Titan X:
-  script:
-  - py_version=2.7
-  - export PYOPENCL_TEST=nvi:titan
-  - EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python3.5
-  - nvidia-titan-x
-  except:
-  - tags
-  allow_failure: True
-
 # Sumpy thus far is poorly parallelized (no workgroup-level parallelism), and
 # the Kepler has a tendency to hang as a result.
 #
@@ -27,6 +13,9 @@ Python 3.5 Titan X:
 #   - nvidia-k40
 #   except:
 #   - tags
+#  artifacts:
+#    reports:
+#      junit: test/pytest.xml
 
 Python 2.7 POCL:
   script:
@@ -40,34 +29,44 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
 
-Python 3.5 POCL:
+Python 3 POCL:
   script:
-  - export PY_EXE=python3.5
+  - export PY_EXE=python3
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
-  - python3.5
+  - python3
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
 
-Python 3.6 POCL:
+Python 3 Titan X:
   script:
-  - export PY_EXE=python3.6
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako"
+  - py_version=3
+  - export PYOPENCL_TEST=nvi:titan
+  - EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
-  - python3.6
-  - pocl
+  - python3
+  - nvidia-titan-x
   except:
   - tags
+  allow_failure: True
+  artifacts:
+    reports:
+      junit: test/pytest.xml
 
-Python 3.5 Conda:
+Python 3.6 Conda:
   script:
   # Disable caching to ensure SymEngine code generation is exercised.
   - export SUMPY_NO_CACHE=1
@@ -80,6 +79,9 @@ Python 3.5 Conda:
   - linux
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
 
 Documentation:
   script:
@@ -87,7 +89,7 @@ Documentation:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh
   - ". ./build-docs.sh"
   tags:
-  - python3.5
+  - python3
   only:
   - master
 
@@ -96,7 +98,7 @@ Flake8:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
   - ". ./prepare-and-run-flake8.sh sumpy test"
   tags:
-  - python3.5
+  - python3
   except:
   - tags
 
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index b2b8b1b47c08a0c809a666abce034ade305e1278..a696dc8de72fdf5d00e90ea9a326efdc3c2d019b 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -10,7 +10,7 @@ dependencies:
 - pocl
 - islpy
 - pyopencl
-- python=3.5
+- python=3.6
 - symengine=0.3.0
 - python-symengine=0.3.0
 # things not in here: loopy boxtree pymbolic pyfmmlib
diff --git a/sumpy/__init__.py b/sumpy/__init__.py
index 8bece3a2dccde8dc6bf6a2b0d41eef20dfb3d1f2..fd0fc5154e04882ef45a0d6f8e282362e6f39f27 100644
--- a/sumpy/__init__.py
+++ b/sumpy/__init__.py
@@ -61,8 +61,7 @@ CACHING_ENABLED = True
 
 CACHING_ENABLED = (
     "SUMPY_NO_CACHE" not in os.environ
-    and
-    "CG_NO_CACHE" not in os.environ)
+    and "CG_NO_CACHE" not in os.environ)
 
 
 def set_caching_enabled(flag):
diff --git a/sumpy/assignment_collection.py b/sumpy/assignment_collection.py
index a12a17fd84b7158628c120a4626464fd288940bb..96d078a0f55b5d34e5a08231e28b7f5d9e94f089 100644
--- a/sumpy/assignment_collection.py
+++ b/sumpy/assignment_collection.py
@@ -51,7 +51,7 @@ class _SymbolGenerator(object):
     def _normalize(self, base):
         # Strip off any _N suffix, to avoid generating conflicting names.
         import re
-        base = re.split("_\d+$", base)[0]
+        base = re.split(r"_\d+$", base)[0]
         return base if base != "" else "expr"
 
     def __call__(self, base="expr"):
diff --git a/sumpy/codegen.py b/sumpy/codegen.py
index 948512dfe884c35e0f421c4a4314796e38b0ab7b..bef7fc6bf93a171a96626f9db88c312c23167bf2 100644
--- a/sumpy/codegen.py
+++ b/sumpy/codegen.py
@@ -581,6 +581,28 @@ class BigIntegerKiller(CSECachingMapperMixin, IdentityMapper):
 # }}}
 
 
+# {{{ convert 123000000j to 123000000 * 1j
+
+class ComplexRewriter(CSECachingMapperMixin, IdentityMapper):
+
+    def __init__(self, float_type=np.float32):
+        IdentityMapper.__init__(self)
+        self.float_type = float_type
+
+    def map_constant(self, expr):
+        """Convert complex values not within complex64 to a product for loopy
+        """
+        if not isinstance(expr, complex):
+            return IdentityMapper.map_constant(self, expr)
+
+        if complex(self.float_type(expr.imag)) == expr.imag:
+            return IdentityMapper.map_constant(self, expr)
+
+        return expr.real + prim.Product((expr.imag, 1j))
+
+    map_common_subexpression_uncached = IdentityMapper.map_common_subexpression
+
+
 # {{{ vector component rewriter
 
 INDEXED_VAR_RE = re.compile("^([a-zA-Z_]+)([0-9]+)$")
@@ -684,6 +706,7 @@ def to_loopy_insns(assignments, vector_names=set(), pymbolic_expr_maps=[],
     ssg = SumSignGrouper()
     fck = FractionKiller()
     bik = BigIntegerKiller()
+    cmr = ComplexRewriter()
 
     def convert_expr(name, expr):
         logger.debug("generate expression for: %s" % name)
@@ -694,6 +717,7 @@ def to_loopy_insns(assignments, vector_names=set(), pymbolic_expr_maps=[],
         expr = fck(expr)
         expr = ssg(expr)
         expr = bik(expr)
+        expr = cmr(expr)
         #expr = cse_tag(expr)
         for m in pymbolic_expr_maps:
             expr = m(expr)
@@ -705,7 +729,7 @@ def to_loopy_insns(assignments, vector_names=set(), pymbolic_expr_maps=[],
         result = [
                 lp.Assignment(id=None,
                     assignee=name, expression=convert_expr(name, expr),
-                    temp_var_type=lp.auto)
+                    temp_var_type=lp.Optional(None))
                 for name, expr in assignments]
 
     logger.info("loopy instruction generation: done")
diff --git a/sumpy/e2p.py b/sumpy/e2p.py
index d0d05cbf8c45e8d9f1c4552f784950bdfcab4bfb..7b0072ad55708ba205ceed3448914ad0d8eda281 100644
--- a/sumpy/e2p.py
+++ b/sumpy/e2p.py
@@ -119,7 +119,7 @@ class E2PBase(KernelCacheWrapper):
                     assignee="kernel_scaling",
                     expression=sympy_conv(
                         self.expansion.kernel.get_global_scaling_const()),
-                    temp_var_type=lp.auto)]
+                    temp_var_type=lp.Optional(None))]
 
     def get_cache_key(self):
         return (type(self).__name__, self.expansion, tuple(self.kernels))
diff --git a/sumpy/expansion/level_to_order.py b/sumpy/expansion/level_to_order.py
index 44d86ee236a8db894f76320516552c6b69588c69..8f012e55827f7676e2429f76502f5146c085387c 100644
--- a/sumpy/expansion/level_to_order.py
+++ b/sumpy/expansion/level_to_order.py
@@ -131,7 +131,7 @@ class SimpleExpansionOrderFinder(object):
 
         laplace_order = int(np.ceil(
                 (np.log(self.tol) - np.log(self.err_const_laplace))
-                /
+                /  # noqa: W504
                 np.log(
                     np.sqrt(tree.dimensions)/3
                     ) - 1))
diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py
index f080b7460029ce2f5325e0ed58f3d97611aa791c..cfb660e7f0a733bf33fd89b7e8e479c4f6f92647 100644
--- a/sumpy/expansion/local.py
+++ b/sumpy/expansion/local.py
@@ -133,7 +133,7 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
         bvec = bvec * rscale**-1
         result = sum(
                 coeff
-                * mi_power(bvec, mi)
+                * mi_power(bvec, mi, evaluate=False)
                 / mi_factorial(mi)
                 for coeff, mi in zip(
                         evaluated_coeffs, self.get_full_coefficient_identifiers()))
diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 07f2eeb6929c47b0437e954819ebc449fc7c0c27..6f31164bce4f42ac55577729ee7c315f65152e3e 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -88,7 +88,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
             for i, mi in enumerate(coeff_identifiers):
                 result[i] /= (mi_factorial(mi) * rscale ** sum(mi))
         else:
-            avec = avec * rscale**-1
+            avec = [sym.UnevaluatedExpr(a * rscale**-1) for a in avec]
 
             result = [
                     mi_power(avec, mi) / mi_factorial(mi)
diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index bd1c51d9b8050529c27174fdf068ffafbb766351..a233175686f2c395fe7783235ddeb5e351b09ff0 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -177,7 +177,8 @@ class SumpyTimingFuture(object):
                     stacklevel=3)
             return TimingResult(wall_elapsed=None)
 
-        pyopencl.wait_for_events(self.events)
+        if self.events:
+            pyopencl.wait_for_events(self.events)
 
         result = 0
         for event in self.events:
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index 12255a93f6d686c003f8f99f16f2f43db22ba037..99222f684340fe1f36d888355a24cbffe29573c0 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -149,8 +149,7 @@ class Kernel(object):
     def __setstate__(self, state):
         # Can't use trivial pickling: hash_value cache must stay unset
         assert len(self.init_arg_names) == len(state)
-        for name, value in zip(self.init_arg_names, state):
-            setattr(self, name, value)
+        self.__init__(*state)
 
     # }}}
 
@@ -601,7 +600,7 @@ class StokesletKernel(ExpressionKernel):
             r = pymbolic_real_norm_2(d)
             expr = (
                 -var("log")(r)*(1 if icomp == jcomp else 0)
-                +
+                +  # noqa: W504
                 d[icomp]*d[jcomp]/r**2
                 )
             scaling = -1/(4*var("pi")*mu)
@@ -611,7 +610,7 @@ class StokesletKernel(ExpressionKernel):
             r = pymbolic_real_norm_2(d)
             expr = (
                 (1/r)*(1 if icomp == jcomp else 0)
-                +
+                +  # noqa: W504
                 d[icomp]*d[jcomp]/r**3
                 )
             scaling = -1/(8*var("pi")*mu)
@@ -798,6 +797,9 @@ class AxisTargetDerivative(DerivativeBase):
         expr = self.inner_kernel.postprocess_at_target(expr, bvec)
         return expr.diff(bvec[self.axis])
 
+    def replace_inner_kernel(self, new_inner_kernel):
+        return type(self)(self.axis, new_inner_kernel)
+
     mapper_method = "map_axis_target_derivative"
 
 
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index a0e89f242f958bbaea411090582e885d942b2a1d..e3b457dd52e885bf66462e475b3b58edb78bd098 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -125,7 +125,7 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
 
         return [lp.Assignment(id=None,
                     assignee="pair_result_%d" % i, expression=expr,
-                    temp_var_type=lp.auto)
+                    temp_var_type=lp.Optional(None))
                 for i, expr in enumerate(exprs)]
 
     def get_default_src_tgt_arguments(self):
@@ -136,10 +136,10 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
                 lp.GlobalArg("targets", None,
                    shape=(self.dim, "ntargets")),
                 lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None)] +
-                ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))]
-                    if self.exclude_self else []) +
-                gather_loopy_source_arguments(self.kernels))
+                lp.ValueArg("ntargets", None)]
+                + ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))]
+                    if self.exclude_self else [])
+                + gather_loopy_source_arguments(self.kernels))
 
     def get_kernel(self):
         raise NotImplementedError
@@ -171,8 +171,8 @@ class P2P(P2PBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [
+            self.get_default_src_tgt_arguments()
+            + [
                 lp.GlobalArg("strength", None,
                     shape="nstrengths, nsources", dim_tags="sep,C"),
                 lp.GlobalArg("result", None,
@@ -241,8 +241,8 @@ class P2PMatrixGenerator(P2PBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [lp.GlobalArg("result_%d" % i, dtype,
+            self.get_default_src_tgt_arguments()
+            + [lp.GlobalArg("result_%d" % i, dtype,
                 shape="ntargets,nsources")
              for i, dtype in enumerate(self.value_dtypes)])
 
@@ -307,26 +307,30 @@ class P2PMatrixBlockGenerator(P2PBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [
+            self.get_default_src_tgt_arguments()
+            + [
                 lp.GlobalArg("srcindices", None, shape="nresult"),
                 lp.GlobalArg("tgtindices", None, shape="nresult"),
                 lp.ValueArg("nresult", None)
-            ] +
-            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+            ]
+            + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
              for i, dtype in enumerate(self.value_dtypes)])
 
         loopy_knl = lp.make_kernel(
             "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
+            # NOTE: itgt, isrc need to always be defined in case a statement
+            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
+            # places like get_kernel_exprs)
             + ["""
                 for imat
-                    <> d[idim] = targets[idim, tgtindices[imat]] - \
-                                 sources[idim, srcindices[imat]]
+                    <> itgt = tgtindices[imat]
+                    <> isrc = srcindices[imat]
+
+                    <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
             """]
             + ["""
-                    <> is_self = (srcindices[imat] ==
-                                  target_to_source[tgtindices[imat]])
+                    <> is_self = (isrc == target_to_source[itgt])
                 """ if self.exclude_self else ""]
             + loopy_insns + kernel_exprs
             + ["""
@@ -411,8 +415,8 @@ class P2PFromCSR(P2PBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [
+            self.get_default_src_tgt_arguments()
+            + [
                 lp.GlobalArg("box_target_starts",
                     None, shape=None),
                 lp.GlobalArg("box_target_counts_nonchild",
diff --git a/sumpy/point_calculus.py b/sumpy/point_calculus.py
index 9321f7d9a0745e1f5ee1768e7f394fd6bfb00e2a..f8b4f5bdf38ec7cd9043047f07e5f2c2ce31b69c 100644
--- a/sumpy/point_calculus.py
+++ b/sumpy/point_calculus.py
@@ -43,6 +43,8 @@ class CalculusPatch(object):
 
         shape: ``(dim, npoints_total)``
 
+    .. automethod:: weights
+    .. automethod:: basis
     .. automethod:: diff
     .. automethod:: dx
     .. automethod:: dy
@@ -55,6 +57,8 @@ class CalculusPatch(object):
     .. autoattribute:: y
     .. autoattribute:: z
     .. automethod:: norm
+    .. automethod:: plot_nodes
+    .. automethod:: plot
     """
     def __init__(self, center, h=1e-1, order=4, nodes="chebyshev"):
         self.center = center
@@ -62,15 +66,26 @@ class CalculusPatch(object):
         npoints = order + 1
         if nodes == "equispaced":
             points_1d = np.linspace(-h/2, h/2, npoints)
+            weights_1d = None
 
         elif nodes == "chebyshev":
             a = np.arange(npoints, dtype=np.float64)
             points_1d = (h/2)*np.cos((2*(a+1)-1)/(2*npoints)*np.pi)
+            weights_1d = None
+
+        elif nodes == "legendre":
+            from scipy.special import legendre
+            points_1d, weights_1d, _ = legendre(npoints).weights.T
+            points_1d = points_1d * (h/2)
+            weights_1d = weights_1d * (h/2)
 
         else:
             raise ValueError("invalid node set: %s" % nodes)
 
+        self.h = h
+        self.npoints = npoints
         self._points_1d = points_1d
+        self._weights_1d = weights_1d
 
         self.dim = dim = len(self.center)
         self.center = center
@@ -100,6 +115,44 @@ class CalculusPatch(object):
         # The zeroth coefficient--all others involve x=0.
         return self._vandermonde_1d()[0]
 
+    def basis(self):
+        """
+        :returns: a :class:`list` containing functions that realize
+            a high-order interpolation basis on the :attr:`points`.
+        """
+
+        from pytools import indices_in_shape
+        from scipy.special import eval_chebyt
+
+        def eval_basis(ind, x):
+            result = 1
+            for i in range(self.dim):
+                coord = (x[i] - self.center[i])/(self.h/2)
+                result *= eval_chebyt(ind[i], coord)
+            return result
+
+        from functools import partial
+        return [
+                partial(eval_basis, ind)
+                for ind in indices_in_shape((self.npoints,)*self.dim)]
+
+    @memoize_method
+    def weights(self):
+        """
+        :returns: a vector of high-order quadrature weights on the :attr:`points`
+        """
+
+        if self._weights_1d is None:
+            raise NotImplementedError("weights not available for these nodes")
+
+        result = np.ones_like(self._points_shaped[0])
+        for i in range(self.dim):
+            result = result * self._weights_1d[
+                    (slice(None),)
+                    + i*(np.newaxis,)]
+
+        return result.reshape(-1)
+
     @memoize_method
     def _diff_mat_1d(self, nderivs):
         npoints = len(self._points_1d)
@@ -217,6 +270,21 @@ class CalculusPatch(object):
         else:
             raise ValueError("unsupported norm")
 
+    def plot_nodes(self):
+        import matplotlib.pyplot as plt
+        plt.gca().set_aspect("equal")
+        plt.plot(
+            self._points_shaped[0].reshape(-1),
+            self._points_shaped[1].reshape(-1),
+            "o")
+
+    def plot(self, f):
+        f = f.reshape(*self._pshape)
+
+        import matplotlib.pyplot as plt
+        plt.gca().set_aspect("equal")
+        plt.contourf(self._points_1d, self._points_1d, f)
+
 
 def frequency_domain_maxwell(cpatch, e, h, k):
     mu = 1
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index a0814e01c412727a908ec72758e62014b3278708..9708764c01d0448d7c7e8314efb461989c838acd 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -143,7 +143,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 
         return [lp.Assignment(id=None,
                     assignee="pair_result_%d" % i, expression=expr,
-                    temp_var_type=lp.auto)
+                    temp_var_type=lp.Optional(None))
                 for i, expr in enumerate(exprs)]
 
     def get_default_src_tgt_arguments(self):
@@ -158,8 +158,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 lp.GlobalArg("expansion_radii",
                     None, shape="ntargets"),
                 lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None)] +
-                gather_loopy_source_arguments(self.kernels))
+                lp.ValueArg("ntargets", None)]
+                + gather_loopy_source_arguments(self.kernels))
 
     def get_kernel(self):
         raise NotImplementedError
@@ -201,11 +201,11 @@ class LayerPotential(LayerPotentialBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [lp.GlobalArg("strength_%d" % i,
+            self.get_default_src_tgt_arguments()
+            + [lp.GlobalArg("strength_%d" % i,
                 None, shape="nsources", order="C")
-            for i in range(self.strength_count)] +
-            [lp.GlobalArg("result_%d" % i,
+            for i in range(self.strength_count)]
+            + [lp.GlobalArg("result_%d" % i,
                 None, shape="ntargets", order="C")
             for i in range(len(self.kernels))])
 
@@ -275,8 +275,8 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [lp.GlobalArg("result_%d" % i,
+            self.get_default_src_tgt_arguments()
+            + [lp.GlobalArg("result_%d" % i,
                 dtype, shape="ntargets, nsources", order="C")
              for i, dtype in enumerate(self.value_dtypes)])
 
@@ -339,29 +339,30 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         kernel_exprs = self.get_kernel_exprs(result_names)
         arguments = (
-            self.get_default_src_tgt_arguments() +
-            [
+            self.get_default_src_tgt_arguments()
+            + [
                 lp.GlobalArg("srcindices", None, shape="nresult"),
                 lp.GlobalArg("tgtindices", None, shape="nresult"),
                 lp.ValueArg("nresult", None)
-            ] +
-            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+            ]
+            + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
              for i, dtype in enumerate(self.value_dtypes)])
 
         loopy_knl = lp.make_kernel([
             "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
             ],
             self.get_kernel_scaling_assignments()
+            # NOTE: itgt, isrc need to always be defined in case a statement
+            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
+            # places like get_kernel_exprs)
             + ["""
                 for imat
                     <> itgt = tgtindices[imat]
                     <> isrc = srcindices[imat]
 
-                    <> a[idim] = center[idim, tgtindices[imat]] - \
-                                 src[idim, srcindices[imat]] {dup=idim}
-                    <> b[idim] = tgt[idim, tgtindices[imat]] - \
-                                 center[idim, tgtindices[imat]] {dup=idim}
-                    <> rscale = expansion_radii[tgtindices[imat]]
+                    <> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}
+                    <> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}
+                    <> rscale = expansion_radii[itgt]
             """]
             + loopy_insns + kernel_exprs
             + ["""
diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py
index c7ed61e60c937cd18020b1ced0bd9629c5a0aa30..7a86958aebd7eea6c8054b11e61413314886ebe9 100644
--- a/sumpy/symbolic.py
+++ b/sumpy/symbolic.py
@@ -111,6 +111,14 @@ if not have_unevaluated_expr:
         return x
 
 
+if USE_SYMENGINE:
+    def unevaluated_pow(a, b):
+        return sym.Pow(a, b)
+else:
+    def unevaluated_pow(a, b):
+        return sym.Pow(a, b, evaluate=False)
+
+
 # {{{ debugging of sympy CSE via Maxima
 
 class _DerivativeKiller(IdentityMapperBase):
diff --git a/sumpy/tools.py b/sumpy/tools.py
index c13a04b6a73f858650e4235c9cb753d6b863c119..7216c8344700974984838103b5ef92b140f5465b 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -56,10 +56,15 @@ def mi_factorial(mi):
     return result
 
 
-def mi_power(vector, mi):
+def mi_power(vector, mi, evaluate=True):
     result = 1
     for mi_i, vec_i in zip(mi, vector):
-        result *= vec_i**mi_i
+        if mi_i == 1:
+            result *= vec_i
+        elif evaluate:
+            result *= vec_i**mi_i
+        else:
+            result *= sym.unevaluated_pow(vec_i, mi_i)
     return result
 
 
@@ -248,7 +253,7 @@ class KernelComputation(object):
                 lp.Assignment(id=None,
                     assignee="knl_%d_scaling" % i,
                     expression=sympy_conv(kernel.get_global_scaling_const()),
-                    temp_var_type=dtype)
+                    temp_var_type=lp.Optional(dtype))
                 for i, (kernel, dtype) in enumerate(
                     zip(self.kernels, self.value_dtypes))]
 
diff --git a/sumpy/toys.py b/sumpy/toys.py
index 5177d869e99c62ebd5035037ae00a532411d00ff..9bc68ae5e480f043ec4fbc4636839ee0a69d2b45 100644
--- a/sumpy/toys.py
+++ b/sumpy/toys.py
@@ -564,8 +564,7 @@ def combine_inner_outer(psource_inner, psource_outer, radius, center=None):
     ball_one = OneOnBallPotential(psource_inner.toy_ctx, center, radius)
     return (
             psource_inner * ball_one
-            +
-            psource_outer * (1 - ball_one))
+            + psource_outer * (1 - ball_one))
 
 
 def combine_halfspace(psource_pos, psource_neg, axis, center=None):
@@ -575,8 +574,7 @@ def combine_halfspace(psource_pos, psource_neg, axis, center=None):
     halfspace_one = HalfspaceOnePotential(psource_pos.toy_ctx, center, axis)
     return (
         psource_pos * halfspace_one
-        +
-        psource_neg * (1-halfspace_one))
+        + psource_neg * (1-halfspace_one))
 
 
 def combine_halfspace_and_outer(psource_pos, psource_neg, psource_outer,
diff --git a/sumpy/visualization.py b/sumpy/visualization.py
index 3a4dc8df66512fdf1da8e5e3c40fa1794e0bcfad..bf5f016b4c089be7723a2ff92c91530c998603c2 100644
--- a/sumpy/visualization.py
+++ b/sumpy/visualization.py
@@ -64,35 +64,35 @@ def separate_by_real_and_imag(data, real_only):
 
 
 def make_field_plotter_from_bbox(bbox, h, extend_factor=0):
-        """
-        :arg bbox: a tuple (low, high) of points represented as 1D numpy arrays
-            indicating the low and high ends of the extent of a bounding box.
-        :arg h: Either a number or a sequence of numbers indicating the desired
-            (approximate) grid spacing in all or each of the dimensions. If a
-            sequence, the length must match the number of dimensions.
-        :arg extend_factor: A floating point number indicating by what percentage
-            the plot area should be grown compared to *bbox*.
-        """
-        low, high = bbox
-
-        extent = (high-low) * (1 + extend_factor)
-        center = 0.5*(high+low)
-
-        dimensions = len(center)
-        from numbers import Number
-        if isinstance(h, Number):
-            h = (h,)*dimensions
-        else:
-            if len(h) != dimensions:
-                raise ValueError("length of 'h' must match number of dimensions")
+    """
+    :arg bbox: a tuple (low, high) of points represented as 1D numpy arrays
+        indicating the low and high ends of the extent of a bounding box.
+    :arg h: Either a number or a sequence of numbers indicating the desired
+        (approximate) grid spacing in all or each of the dimensions. If a
+        sequence, the length must match the number of dimensions.
+    :arg extend_factor: A floating point number indicating by what percentage
+        the plot area should be grown compared to *bbox*.
+    """
+    low, high = bbox
+
+    extent = (high-low) * (1 + extend_factor)
+    center = 0.5*(high+low)
+
+    dimensions = len(center)
+    from numbers import Number
+    if isinstance(h, Number):
+        h = (h,)*dimensions
+    else:
+        if len(h) != dimensions:
+            raise ValueError("length of 'h' must match number of dimensions")
 
-        from math import ceil
+    from math import ceil
 
-        npoints = tuple(
-                int(ceil(extent[i] / h[i]))
-                for i in range(dimensions))
+    npoints = tuple(
+            int(ceil(extent[i] / h[i]))
+            for i in range(dimensions))
 
-        return FieldPlotter(center, extent, npoints)
+    return FieldPlotter(center, extent, npoints)
 
 
 class FieldPlotter(object):
diff --git a/test/test_cse.py b/test/test_cse.py
index d6a2c0f05cb1c4512fa3a8b26b6a2654a7ed1ea3..703e8c9f93ca2069c273ead893a00ed0ba3dc697 100644
--- a/test/test_cse.py
+++ b/test/test_cse.py
@@ -134,8 +134,8 @@ def test_cse_not_possible():
     assert substs == []
     assert reduced == [x + y]
     # issue 6329
-    eq = (meijerg((1, 2), (y, 4), (5,), [], x) +
-          meijerg((1, 3), (y, 4), (5,), [], x))
+    eq = (meijerg((1, 2), (y, 4), (5,), [], x)
+          + meijerg((1, 3), (y, 4), (5,), [], x))
     assert cse(eq) == ([], [eq])
 
 
@@ -251,9 +251,9 @@ def test_issue_4499():
     from sympy import Tuple, S
     B = Function('B')  # noqa
     G = Function('G')  # noqa
-    t = Tuple(*
-        (a, a + S(1)/2, 2*a, b, 2*a - b + 1, (sqrt(z)/2)**(-2*a + 1)*B(2*a -
-        b, sqrt(z))*B(b - 1, sqrt(z))*G(b)*G(2*a - b + 1),
+    t = Tuple(
+        *(a, a + S(1)/2, 2*a, b, 2*a - b + 1, (sqrt(z)/2)**(-2*a + 1)*B(2*a
+        - b, sqrt(z))*B(b - 1, sqrt(z))*G(b)*G(2*a - b + 1),
         sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b,
         sqrt(z))*G(b)*G(2*a - b + 1), sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b - 1,
         sqrt(z))*B(2*a - b + 1, sqrt(z))*G(b)*G(2*a - b + 1),
diff --git a/test/test_kernels.py b/test/test_kernels.py
index a1fa8230c3d4a2fd9194641e40067a1d5f51abaf..fa42a5ee4f764939ffb92a6830da9324c67c0c03 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -347,7 +347,8 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
     (HelmholtzKernel(2), HelmholtzConformingVolumeTaylorLocalExpansion,
      HelmholtzConformingVolumeTaylorMultipoleExpansion),
     (HelmholtzKernel(2), H2DLocalExpansion, H2DMultipoleExpansion),
-    (StokesletKernel(2, 0, 0), VolumeTaylorLocalExpansion, VolumeTaylorMultipoleExpansion),
+    (StokesletKernel(2, 0, 0), VolumeTaylorLocalExpansion,
+     VolumeTaylorMultipoleExpansion),
     (StokesletKernel(2, 0, 0), StokesConformingVolumeTaylorLocalExpansion,
      StokesConformingVolumeTaylorMultipoleExpansion),
     ])
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 2f3aabc3082b99707ab86b5461390e866350fdc3..2c07b40a5c753e6c362be61263f0bf388784fc4a 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -132,7 +132,7 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 
     for n in [200, 300, 400]:
         targets, sources, centers, expansion_radii, sigma = \
-                _build_geometry(queue, n, mode_nr)
+                _build_geometry(queue, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
@@ -180,7 +180,8 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 
 @pytest.mark.parametrize("exclude_self", [True, False])
 @pytest.mark.parametrize("factor", [1.0, 0.6])
-def test_p2p_direct(ctx_getter, exclude_self, factor):
+@pytest.mark.parametrize('lpot_id', [1, 2])
+def test_p2p_direct(ctx_getter, exclude_self, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
@@ -190,8 +191,14 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
     nblks = 10
     mode_nr = 25
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(ndim)
+    from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
+    if lpot_id == 1:
+        lknl = LaplaceKernel(ndim)
+    elif lpot_id == 2:
+        lknl = LaplaceKernel(ndim)
+        lknl = DirectionalSourceDerivative(lknl, dir_vec_name="dsource_vec")
+    else:
+        raise ValueError("unknown lpot_id")
 
     from sumpy.p2p import P2P
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
@@ -217,6 +224,9 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
         if exclude_self:
             extra_kwargs["target_to_source"] = \
                 cl.array.arange(queue, 0, n, dtype=np.int)
+        if lpot_id == 2:
+            extra_kwargs["dsource_vec"] = \
+                    vector_to_device(queue, np.ones((ndim, n)))
 
         _, (result_lpot,) = lpot(queue,
                 targets=targets,
@@ -241,8 +251,8 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
 
         index_set = index_set.get(queue)
         for i in range(index_set.nblocks):
-            assert la.norm(index_set.block_take(blk, i) -
-                           index_set.take(mat, i)) < eps
+            assert la.norm(index_set.block_take(blk, i)
+                           - index_set.take(mat, i)) < eps
 
 
 # You can test individual routines by typing
diff --git a/test/test_misc.py b/test/test_misc.py
index 7b35b2ddc3743d0eb9b7e80daf38c617ea73940d..33e8535a8cbce064263958962421d9b43e77ee6c 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -32,6 +32,8 @@ import pyopencl as cl  # noqa: F401
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
+from pytools import Record
+
 from sumpy.kernel import (LaplaceKernel, HelmholtzKernel,
         BiharmonicKernel, YukawaKernel)
 
@@ -135,6 +137,122 @@ def test_order_finder(knl):
     print(orders)
 
 
+# {{{ expansion toys p2e2e2p test cases
+
+def approx_convergence_factor(orders, errors):
+    poly = np.polyfit(orders, np.log(errors), deg=1)
+    return np.exp(poly[0])
+
+
+class P2E2E2PTestCase(Record):
+
+    @property
+    def dim(self):
+        return len(self.source)
+
+    @staticmethod
+    def eval(expr, source, center1, center2, target):
+        from pymbolic import parse, evaluate
+        context = {
+                "s": source,
+                "c1": center1,
+                "c2": center2,
+                "t": target,
+                "norm": la.norm}
+
+        return evaluate(parse(expr), context)
+
+    def __init__(self,
+            source, center1, center2, target, expansion1, expansion2, conv_factor):
+
+        if isinstance(conv_factor, str):
+            conv_factor = self.eval(conv_factor, source, center1, center2, target)
+
+        Record.__init__(self,
+                source=source,
+                center1=center1,
+                center2=center2,
+                target=target,
+                expansion1=expansion1,
+                expansion2=expansion2,
+                conv_factor=conv_factor)
+
+
+P2E2E2P_TEST_CASES = (
+        # local to local, 3D
+        P2E2E2PTestCase(
+            source=np.array([3., 4., 5.]),
+            center1=np.array([1., 0., 0.]),
+            center2=np.array([1., 3., 0.]),
+            target=np.array([1., 1., 1.]),
+            expansion1=t.local_expand,
+            expansion2=t.local_expand,
+            conv_factor="norm(t-c1)/norm(s-c1)"),
+
+        # multipole to multipole, 3D
+        P2E2E2PTestCase(
+            source=np.array([1., 1., 1.]),
+            center1=np.array([1., 0., 0.]),
+            center2=np.array([1., 0., 3.]),
+            target=np.array([3., 4., 5.]),
+            expansion1=t.multipole_expand,
+            expansion2=t.multipole_expand,
+            conv_factor="norm(s-c2)/norm(t-c2)"),
+
+        # multipole to local, 3D
+        P2E2E2PTestCase(
+            source=np.array([-2., 2., 1.]),
+            center1=np.array([-2., 5., 3.]),
+            center2=np.array([0., 0., 0.]),
+            target=np.array([0., 0., -1]),
+            expansion1=t.multipole_expand,
+            expansion2=t.local_expand,
+            conv_factor="norm(t-c2)/(norm(c2-c1)-norm(c1-s))"),
+)
+
+# }}}
+
+
+ORDERS_P2E2E2P = (3, 4, 5)
+RTOL_P2E2E2P = 1e-2
+
+
+@pytest.mark.parametrize("case", P2E2E2P_TEST_CASES)
+def test_toy_p2e2e2p(ctx_getter, case):
+    dim = case.dim
+
+    src = case.source.reshape(dim, -1)
+    tgt = case.target.reshape(dim, -1)
+
+    if not 0 <= case.conv_factor <= 1:
+        raise ValueError(
+                "convergence factor not in valid range: %e" % case.conv_factor)
+
+    from sumpy.expansion.local import VolumeTaylorLocalExpansion
+    from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
+
+    cl_ctx = ctx_getter()
+    ctx = t.ToyContext(cl_ctx,
+             LaplaceKernel(dim),
+             VolumeTaylorMultipoleExpansion,
+             VolumeTaylorLocalExpansion)
+
+    errors = []
+
+    src_pot = t.PointSources(ctx, src, weights=np.array([1.]))
+    pot_actual = src_pot.eval(tgt).item()
+
+    for order in ORDERS_P2E2E2P:
+        expn = case.expansion1(src_pot, case.center1, order=order)
+        expn2 = case.expansion2(expn, case.center2, order=order)
+        pot_p2e2e2p = expn2.eval(tgt).item()
+        errors.append(np.abs(pot_actual - pot_p2e2e2p))
+
+    conv_factor = approx_convergence_factor(1 + np.array(ORDERS_P2E2E2P), errors)
+    assert conv_factor <= min(1, case.conv_factor * (1 + RTOL_P2E2E2P)), \
+        (conv_factor, case.conv_factor * (1 + RTOL_P2E2E2P))
+
+
 # You can test individual routines by typing
 # $ python test_misc.py 'test_p2p(cl.create_some_context)'