diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index a0e89f242f958bbaea411090582e885d942b2a1d..d8fdadf45bbd7bedd9d0da6e3353e479cd107e3b 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -319,14 +319,18 @@ class P2PMatrixBlockGenerator(P2PBase):
         loopy_knl = lp.make_kernel(
             "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
             self.get_kernel_scaling_assignments()
+            # NOTE: itgt, isrc need to always be defined in case a statement
+            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
+            # places like get_kernel_exprs)
             + ["""
                 for imat
-                    <> d[idim] = targets[idim, tgtindices[imat]] - \
-                                 sources[idim, srcindices[imat]]
+                    <> itgt = tgtindices[imat]
+                    <> isrc = srcindices[imat]
+
+                    <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
             """]
             + ["""
-                    <> is_self = (srcindices[imat] ==
-                                  target_to_source[tgtindices[imat]])
+                    <> is_self = (isrc == target_to_source[itgt])
                 """ if self.exclude_self else ""]
             + loopy_insns + kernel_exprs
             + ["""
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index a0814e01c412727a908ec72758e62014b3278708..79de1544a8355940303f234ba121d31e84a136cb 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -352,16 +352,17 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
             "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
             ],
             self.get_kernel_scaling_assignments()
+            # NOTE: itgt, isrc need to always be defined in case a statement
+            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
+            # places like get_kernel_exprs)
             + ["""
                 for imat
                     <> itgt = tgtindices[imat]
                     <> isrc = srcindices[imat]
 
-                    <> a[idim] = center[idim, tgtindices[imat]] - \
-                                 src[idim, srcindices[imat]] {dup=idim}
-                    <> b[idim] = tgt[idim, tgtindices[imat]] - \
-                                 center[idim, tgtindices[imat]] {dup=idim}
-                    <> rscale = expansion_radii[tgtindices[imat]]
+                    <> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}
+                    <> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}
+                    <> rscale = expansion_radii[itgt]
             """]
             + loopy_insns + kernel_exprs
             + ["""
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 2f3aabc3082b99707ab86b5461390e866350fdc3..01ebaa0eb290fcc1cc392c866c8b2e8994dd420c 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -132,7 +132,7 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 
     for n in [200, 300, 400]:
         targets, sources, centers, expansion_radii, sigma = \
-                _build_geometry(queue, n, mode_nr)
+                _build_geometry(queue, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
@@ -180,7 +180,8 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 
 @pytest.mark.parametrize("exclude_self", [True, False])
 @pytest.mark.parametrize("factor", [1.0, 0.6])
-def test_p2p_direct(ctx_getter, exclude_self, factor):
+@pytest.mark.parametrize('lpot_id', [1, 2])
+def test_p2p_direct(ctx_getter, exclude_self, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
@@ -190,8 +191,14 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
     nblks = 10
     mode_nr = 25
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(ndim)
+    from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
+    if lpot_id == 1:
+        lknl = LaplaceKernel(ndim)
+    elif lpot_id == 2:
+        lknl = LaplaceKernel(ndim)
+        lknl = DirectionalSourceDerivative(lknl, dir_vec_name="dsource_vec")
+    else:
+        raise ValueError("unknow lpot_id")
 
     from sumpy.p2p import P2P
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
@@ -217,6 +224,9 @@ def test_p2p_direct(ctx_getter, exclude_self, factor):
         if exclude_self:
             extra_kwargs["target_to_source"] = \
                 cl.array.arange(queue, 0, n, dtype=np.int)
+        if lpot_id == 2:
+            extra_kwargs["dsource_vec"] = \
+                    vector_to_device(queue, np.ones((ndim, n)))
 
         _, (result_lpot,) = lpot(queue,
                 targets=targets,