diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index 3d313f258b421a9f51e2905e66661c1d4b831df1..17b03403ceab36a59b11b742b0bf7ff82ee04e9b 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -55,16 +55,20 @@ class SumpyExpansionWranglerCodeContainer(object):
 
     def __init__(self, cl_context,
             multipole_expansion_factory,
-            local_expansion_factory, out_kernels):
+            local_expansion_factory,
+            out_kernels, exclude_self=False):
         """
         :arg multipole_expansion_factory: a callable of a single argument (order)
             that returns a multipole expansion.
         :arg local_expansion_factory: a callable of a single argument (order)
             that returns a local expansion.
+        :arg out_kernels: a list of output kernels
+        :arg exclude_self: whether the self contribution should be excluded
         """
         self.multipole_expansion_factory = multipole_expansion_factory
         self.local_expansion_factory = local_expansion_factory
         self.out_kernels = out_kernels
+        self.exclude_self = exclude_self
 
         self.cl_context = cl_context
 
@@ -118,14 +122,15 @@ class SumpyExpansionWranglerCodeContainer(object):
 
     @memoize_method
     def p2p(self):
-        # FIXME figure out what to do about exclude_self
-        return P2PFromCSR(self.cl_context, self.out_kernels, exclude_self=False)
+        return P2PFromCSR(self.cl_context, self.out_kernels,
+                          exclude_self=self.exclude_self)
 
     def get_wrangler(self, queue, tree, dtype, fmm_level_to_order,
             source_extra_kwargs={},
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            self_extra_kwargs=None):
         return SumpyExpansionWrangler(self, queue, tree, dtype, fmm_level_to_order,
-                source_extra_kwargs, kernel_extra_kwargs)
+                source_extra_kwargs, kernel_extra_kwargs, self_extra_kwargs)
 
 # }}}
 
@@ -155,11 +160,18 @@ class SumpyExpansionWrangler(object):
 
         Keyword arguments to be passed to interactions that involve
         expansions, but not source particles.
+
+    .. attribute:: self_extra_kwargs
+
+        Keyword arguments to be passed for handling
+        self interactions (source and target particles are the same),
+        provided special handling is needed
     """
 
-    def __init__(self, code_container,  queue, tree, dtype, fmm_level_to_order,
+    def __init__(self, code_container, queue, tree, dtype, fmm_level_to_order,
             source_extra_kwargs,
-            kernel_extra_kwargs=None):
+            kernel_extra_kwargs=None,
+            self_extra_kwargs=None):
         self.code = code_container
         self.queue = queue
         self.tree = tree
@@ -174,8 +186,12 @@ class SumpyExpansionWrangler(object):
         if kernel_extra_kwargs is None:
             kernel_extra_kwargs = {}
 
+        if self_extra_kwargs is None:
+            self_extra_kwargs = {}
+
         self.source_extra_kwargs = source_extra_kwargs
         self.kernel_extra_kwargs = kernel_extra_kwargs
+        self.self_extra_kwargs = self_extra_kwargs
 
         self.extra_kwargs = source_extra_kwargs.copy()
         self.extra_kwargs.update(self.kernel_extra_kwargs)
@@ -373,6 +389,7 @@ class SumpyExpansionWrangler(object):
         pot = self.potential_zeros()
 
         kwargs = self.extra_kwargs.copy()
+        kwargs.update(self.self_extra_kwargs)
         kwargs.update(self.box_source_list_kwargs())
         kwargs.update(self.box_target_list_kwargs())
 
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 3071a3b88c9b1fc45c7d4ce3470535f084aba0a9..b0f15abb67f100c95dc8ab4b269441134e59e9cb 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -49,7 +49,7 @@ Particle-to-particle
 # {{{ p2p base class
 
 class P2PBase(KernelComputation, KernelCacheWrapper):
-    def __init__(self, ctx, kernels,  exclude_self, strength_usage=None,
+    def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
             value_dtypes=None,
             options=[], name=None, device=None):
         """
@@ -117,12 +117,8 @@ class P2P(P2PBase):
                 for i, name in enumerate(result_names)]
 
         if self.exclude_self:
-            from pymbolic.primitives import If, ComparisonOperator, Variable
-            exprs = [
-                    If(
-                        ComparisonOperator(Variable("isrc"), "!=", Variable("itgt")),
-                        expr, 0)
-                    for expr in exprs]
+            from pymbolic.primitives import If, Variable
+            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
 
         from sumpy.tools import gather_loopy_source_arguments
         loopy_knl = lp.make_kernel(
@@ -133,8 +129,10 @@ class P2P(P2PBase):
                 for itgt
                     for isrc
                         """] + loopy_insns + ["""
-                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc] \
-                        """]+[
+                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc]
+                        """] + ["""
+                        <> is_self = (source_to_target[isrc] == itgt)
+                        """ if self.exclude_self else ""] + [
                         lp.Assignment(id=None,
                             assignee="pair_result_%d" % i, expression=expr,
                             temp_var_type=lp.auto)
@@ -159,7 +157,10 @@ class P2P(P2PBase):
                     lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                     lp.GlobalArg("result", None,
                         shape="nresults,ntargets", dim_tags="sep,C")
-                ] + gather_loopy_source_arguments(self.kernels),
+                ] + (
+                    [lp.GlobalArg("source_to_target", np.int32, shape=("nsources",))]
+                    if self.exclude_self else []
+                ) + gather_loopy_source_arguments(self.kernels),
                 name=self.name,
                 assumptions="nsources>=1 and ntargets>=1")
 
@@ -209,8 +210,6 @@ class P2P(P2PBase):
 class P2PFromCSR(P2PBase):
     default_name = "p2p_from_csr"
 
-    # FIXME: exclude_self ...?
-
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
 
@@ -220,6 +219,10 @@ class P2PFromCSR(P2PBase):
                 * var("strength").index((self.strength_usage[i], var("isrc")))
                 for i, name in enumerate(result_names)]
 
+        if self.exclude_self:
+            from pymbolic.primitives import If, Variable
+            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
+
         from sumpy.tools import gather_loopy_source_arguments
         loopy_knl = lp.make_kernel(
             [
@@ -251,7 +254,9 @@ class P2PFromCSR(P2PBase):
                                 <> d[idim] = \
                                         targets[idim,itgt] - sources[idim,isrc] \
                                         {dup=idim}
-                                """
+                                """] + ["""
+                                <> is_self = (source_to_target[isrc] == itgt)
+                                """ if self.exclude_self else ""] + [
                                 ] + loopy_insns + [
                                 lp.Assignment(id=None,
                                     assignee="pair_result_%d" % i, expression=expr,
@@ -286,7 +291,10 @@ class P2PFromCSR(P2PBase):
                 lp.ValueArg("nsources", np.int32),
                 lp.ValueArg("ntargets", np.int32),
                 "...",
-            ] + gather_loopy_source_arguments(self.kernels),
+            ] + (
+                [lp.GlobalArg("source_to_target", np.int32, shape=("nsources",))]
+                if self.exclude_self else []
+            ) + gather_loopy_source_arguments(self.kernels),
             name=self.name, assumptions="ntgt_boxes>=1")
 
         loopy_knl = lp.fix_parameters(
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 2e2bfea4803b6a5f5e17a6d05379620adff09855..acfcb2fa50c04311d2831d350b14064d80086c19 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -221,6 +221,76 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
     pconv_verifier()
 
 
+def test_sumpy_fmm_exclude_self(ctx_getter):
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_getter()
+    queue = cl.CommandQueue(ctx)
+
+    nsources = 500
+    dtype = np.float64
+
+    from boxtree.tools import (
+            make_normal_particle_array as p_normal)
+
+    knl = LaplaceKernel(2)
+    local_expn_class = VolumeTaylorLocalExpansion
+    mpole_expn_class = VolumeTaylorMultipoleExpansion
+    order = 10
+
+    sources = p_normal(queue, nsources, knl.dim, dtype, seed=15)
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(ctx)
+
+    tree, _ = tb(queue, sources,
+            max_particles_in_box=30, debug=True)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(ctx)
+    trav, _ = tbuild(queue, tree, debug=True)
+
+    from pyopencl.clrandom import PhiloxGenerator
+    rng = PhiloxGenerator(ctx)
+    weights = rng.uniform(queue, nsources, dtype=np.float64)
+
+    source_to_target = np.arange(tree.nsources, dtype=np.int32)
+    self_extra_kwargs = {"source_to_target": source_to_target}
+
+    out_kernels = [knl]
+
+    from functools import partial
+
+    from sumpy.fmm import SumpyExpansionWranglerCodeContainer
+    wcc = SumpyExpansionWranglerCodeContainer(
+            ctx,
+            partial(mpole_expn_class, knl),
+            partial(local_expn_class, knl),
+            out_kernels,
+            exclude_self=True)
+
+    wrangler = wcc.get_wrangler(queue, tree, dtype,
+            fmm_level_to_order=lambda lev: order,
+            self_extra_kwargs=self_extra_kwargs)
+
+    from boxtree.fmm import drive_fmm
+
+    pot, = drive_fmm(trav, wrangler, weights)
+
+    from sumpy import P2P
+    p2p = P2P(ctx, out_kernels, exclude_self=True)
+    evt, (ref_pot,) = p2p(queue, sources, sources, (weights,),
+            **self_extra_kwargs)
+
+    pot = pot.get()
+    ref_pot = ref_pot.get()
+
+    rel_err = la.norm(pot - ref_pot) / la.norm(ref_pot)
+    logger.info("order %d -> relative l2 error: %g" % (order, rel_err))
+
+    assert np.isclose(rel_err, 0, atol=1e-7)
+
+
 # You can test individual routines by typing
 # $ python test_fmm.py 'test_sumpy_fmm(cl.create_some_context)'
 
diff --git a/test/test_kernels.py b/test/test_kernels.py
index 628e2893f3c67a18296badc0fe477dca67fcd4f6..b7b51e7ddc0646b6c50bc97b15dbfd2517700d2b 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -57,7 +57,8 @@ else:
     faulthandler.enable()
 
 
-def test_p2p(ctx_getter):
+@pytest.mark.parametrize("exclude_self", (True, False))
+def test_p2p(ctx_getter, exclude_self):
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
@@ -68,26 +69,36 @@ def test_p2p(ctx_getter):
     lknl = LaplaceKernel(dimensions)
     knl = P2P(ctx,
             [lknl, AxisTargetDerivative(0, lknl)],
-            exclude_self=False)
+            exclude_self=exclude_self)
 
     targets = np.random.rand(dimensions, n)
-    sources = np.random.rand(dimensions, n)
+    sources = targets if exclude_self else np.random.rand(dimensions, n)
 
     strengths = np.ones(n, dtype=np.float64)
 
+    extra_kwargs = {}
+
+    if exclude_self:
+        extra_kwargs["source_to_target"] = np.arange(n, dtype=np.int32)
+
     evt, (potential, x_derivative) = knl(
             queue, targets, sources, [strengths],
-            out_host=True)
+            out_host=True, **extra_kwargs)
 
     potential_ref = np.empty_like(potential)
 
     targets = targets.T
     sources = sources.T
     for itarg in range(n):
-        potential_ref[itarg] = np.sum(
-                strengths
-                /
-                np.sum((targets[itarg] - sources)**2, axis=-1)**0.5)
+
+        with np.errstate(divide="ignore"):
+            invdists = np.sum((targets[itarg] - sources) ** 2, axis=-1) ** -0.5
+
+        if exclude_self:
+            assert np.isinf(invdists[itarg])
+            invdists[itarg] = 0
+
+        potential_ref[itarg] = np.sum(strengths * invdists)
 
     potential_ref *= 1/(4*np.pi)