diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index d8586cc228e47d6aa9f12d7c7c436726ad28bc2e..a35505ea418b0cb0d2d58cde1cabb53c521d3602 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -40,10 +40,10 @@ __doc__ = """
 Particle-to-particle
 --------------------
 
-.. autoclass:: P2PComputationBase
-.. autoclass:: SingleSrcTgtListP2PBase
+.. autoclass:: P2PBase
 .. autoclass:: P2P
 .. autoclass:: P2PMatrixGenerator
+.. autoclass:: P2PMatrixBlockGenerator
 .. autoclass:: P2PFromCSR
 
 """
@@ -54,7 +54,7 @@ Particle-to-particle
 
 # {{{ p2p base class
 
-class P2PComputationBase(KernelComputation, KernelCacheWrapper):
+class P2PBase(KernelComputation, KernelCacheWrapper):
     def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
             value_dtypes=None,
             options=[], name=None, device=None):
@@ -104,17 +104,27 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper):
 
         return loopy_insns, result_names
 
-    def get_cache_key(self):
-        return (type(self).__name__, tuple(self.kernels), self.exclude_self,
-                tuple(self.strength_usage), tuple(self.value_dtypes))
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return var("strength").index((self.strength_usage[kernel_idx], isrc))
 
-# }}}
+    def get_exprs(self, results):
+        from pymbolic import var
 
+        isrc_sym = var("isrc")
+        exprs = [var(name) * self.get_strength_or_not(isrc_sym, i)
+                 for i, name in enumerate(results)]
 
-# {{{ P2P with list of sources and list of targets
+        if self.exclude_self:
+            from pymbolic.primitives import If, Variable
+            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
+
+        return [lp.Assignment(id=None,
+                              assignee="pair_result_%d" % i, expression=expr,
+                              temp_var_type=lp.auto)
+                for i, expr in enumerate(exprs)]
 
-class SingleSrcTgtListP2PBase(P2PComputationBase):
-    def get_src_tgt_arguments(self):
+    def get_default_src_tgt_arguments(self):
+        from sumpy.tools import gather_loopy_source_arguments
         return [
                 lp.GlobalArg("sources", None,
                     shape=(self.dim, "nsources")),
@@ -122,75 +132,13 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
                    shape=(self.dim, "ntargets")),
                 lp.ValueArg("nsources", np.int32),
                 lp.ValueArg("ntargets", np.int32)
-                ]
-
-    def get_domains(self):
-        return [
-                "{[itgt]: 0 <= itgt < ntargets}",
-                "{[isrc]: 0 <= isrc < nsources}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return ["for itgt, isrc"]
-
-    def get_loop_end(self):
-        return ["end"]
-
-    def get_assumptions(self):
-        return "nsources>=1 and ntargets>=1"
+                ] + ([
+                lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
+                if self.exclude_self else []) + (
+                gather_loopy_source_arguments(self.kernels))
 
     def get_kernel(self):
-        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
-
-        isrc_sym = var("isrc")
-        exprs = [
-                var(name)
-                * self.get_strength_or_not(isrc_sym, i)
-                for i, name in enumerate(result_names)]
-
-        if self.exclude_self:
-            from pymbolic.primitives import If, Variable
-            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
-
-        from sumpy.tools import gather_loopy_source_arguments
-        arguments = (
-                self.get_src_tgt_arguments()
-                + self.get_input_and_output_arguments()
-                + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
-                    if self.exclude_self else [])
-                + gather_loopy_source_arguments(self.kernels))
-
-        loopy_knl = lp.make_kernel(
-                self.get_domains(),
-                self.get_kernel_scaling_assignments()
-                + self.get_loop_begin()
-                + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
-                + ["<> is_self = (isrc == target_to_source[itgt])"
-                    if self.exclude_self else ""]
-                + loopy_insns
-                + [
-                    lp.Assignment(id=None,
-                        assignee="pair_result_%d" % i, expression=expr,
-                        temp_var_type=lp.auto)
-                    for i, expr in enumerate(exprs)
-                ]
-                + self.get_loop_end()
-                + self.get_result_store_instructions(),
-                arguments,
-                assumptions=self.get_assumptions(),
-                name=self.name,
-                fixed_parameters=dict(
-                    dim=self.dim,
-                    nstrengths=self.strength_count,
-                    nresults=len(self.kernels)))
-
-        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-
-        for knl in self.kernels:
-            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
-
-        return loopy_knl
+        raise NotImplementedError
 
     def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
         # FIXME
@@ -204,32 +152,60 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
         knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
         return knl
 
+    def get_cache_key(self):
+        return (type(self).__name__, tuple(self.kernels), self.exclude_self,
+                tuple(self.strength_usage), tuple(self.value_dtypes))
 
 # }}}
 
 
 # {{{ P2P point-interaction calculation
 
-class P2P(SingleSrcTgtListP2PBase):
+class P2P(P2PBase):
     default_name = "p2p_apply"
 
-    def get_strength_or_not(self, isrc, kernel_idx):
-        return var("strength").index((self.strength_usage[kernel_idx], isrc))
-
-    def get_input_and_output_arguments(self):
-        return [
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        exprs = self.get_exprs(result_names)
+        arguments = (
+                self.get_default_src_tgt_arguments() + [
                 lp.GlobalArg("strength", None,
-                    shape="nstrengths,nsources", dim_tags="sep,C"),
+                    shape="nstrengths, nsources", dim_tags="sep,C"),
                 lp.GlobalArg("result", None,
-                    shape="nresults,ntargets", dim_tags="sep,C")
-                ]
+                    shape="nresults, ntargets", dim_tags="sep,C")])
 
-    def get_result_store_instructions(self):
-        return ["""
-                result[{i}, itgt] = knl_{i}_scaling \
-                    * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}}
-                """.format(i=iknl)
-                for iknl in range(len(self.kernels))]
+        loopy_knl = lp.make_kernel([
+            "{[itgt]: 0 <= itgt < ntargets}",
+            "{[isrc]: 0 <= isrc < nsources}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+            + ["<> is_self = (isrc == target_to_source[itgt])"
+                if self.exclude_self else ""]
+            + loopy_insns
+            + exprs
+            + ["""
+                result[{i}, itgt] = knl_{i}_scaling * \
+                    simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}}
+               """.format(i=iknl)
+               for iknl in range(len(self.kernels))]
+            + ["end"],
+            arguments,
+            assumptions="nsources>=1 and ntargets>=1",
+            name=self.name,
+            fixed_parameters=dict(
+                dim=self.dim,
+                nstrengths=self.strength_count,
+                nresults=len(self.kernels)))
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, strength, **kwargs):
         from pytools.obj_array import is_obj_array
@@ -247,26 +223,49 @@ class P2P(SingleSrcTgtListP2PBase):
 
 # {{{ P2P matrix writer
 
-class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
+class P2PMatrixGenerator(P2PBase):
     default_name = "p2p_matrix"
 
     def get_strength_or_not(self, isrc, kernel_idx):
         return 1
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
-
-    def get_result_store_instructions(self):
-        return [
-                """
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        exprs = self.get_exprs(result_names)
+        arguments = (
+                self.get_default_src_tgt_arguments() +
+                [lp.GlobalArg("result_%d" % i, dtype,
+                    shape="ntargets,nsources")
+                 for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel([
+            "{[itgt]: 0 <= itgt < ntargets}",
+            "{[isrc]: 0 <= isrc < nsources}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+            + ["<> is_self = (isrc == target_to_source[itgt])"
+                if self.exclude_self else ""]
+            + loopy_insns + exprs
+            + ["""
                 result_{i}[itgt, isrc] = \
                     knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}}
                 """.format(i=iknl)
-                for iknl in range(len(self.kernels))
-                ]
+                for iknl in range(len(self.kernels))]
+            + ["end"],
+            arguments,
+            assumptions="nsources>=1 and ntargets>=1",
+            name=self.name,
+            fixed_parameters=dict(dim=self.dim))
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, **kwargs):
         from pytools.obj_array import is_obj_array
@@ -281,74 +280,73 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
 # }}}
 
 
-# {{{
+# {{{ P2P matrix block writer
 
-class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
+class P2PMatrixBlockGenerator(P2PBase):
     default_name = "p2p_block"
 
-    def get_src_tgt_arguments(self):
-        return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \
-            + [
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        exprs = self.get_exprs(result_names)
+        arguments = (
+                self.get_default_src_tgt_arguments() + [
                 lp.GlobalArg("srcindices", None, shape="nsrcindices"),
                 lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
                 lp.GlobalArg("srcranges", None, shape="nranges + 1"),
                 lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
                 lp.ValueArg("nsrcindices", np.int32),
                 lp.ValueArg("ntgtindices", np.int32),
-                lp.ValueArg("nranges", None)
-            ]
-
-    def get_domains(self):
-        return [
-                "{[irange]: 0 <= irange < nranges}",
-                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return [
-                """
+                lp.ValueArg("nranges", None)] +
+                [lp.GlobalArg("result_%d" % i, dtype,
+                    shape="ntgtindices, nsrcindices")
+                 for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel([
+            "{[irange]: 0 <= irange < nranges}",
+            "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}",
+            "{[idim]: 0 <= idim < dim}"
+            ],
+            self.get_kernel_scaling_assignments()
+            + ["""
                 for irange
                     <> tgtstart = tgtranges[irange]
                     <> tgtend = tgtranges[irange + 1]
-                    <> tgt_length = tgtend - tgtstart
+                    <> ntgtblock = tgtend - tgtstart
                     <> srcstart = srcranges[irange]
                     <> srcend = srcranges[irange + 1]
-                    <> src_length = srcend - srcstart
-                    for j, k
-                        <> itgt = tgtindices[tgtstart + j]
-                        <> isrc = srcindices[srcstart + k]
-                """
-                ]
-
-    def get_loop_end(self):
-        return [
-                """
+                    <> nsrcblock = srcend - srcstart
+
+                    for itgt, isrc
+                        <> d[idim] = targets[idim, tgtindices[tgtstart + itgt]] - \
+                                     sources[idim, srcindices[srcstart + isrc]]
+            """] + ["""
+                        <> is_self = (srcindices[srcstart + isrc] ==
+                                      target_to_source[tgtindices[tgtstart + itgt]])
+            """ if self.exclude_self else ""]
+            + loopy_insns + exprs
+            + ["""
+                    result_{i}[tgtstart + itgt, srcstart + isrc] = \
+                        knl_{i}_scaling * pair_result_{i} {{inames=irange}}
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))]
+            + ["""
                     end
                 end
-                """
-                ]
-
-    def get_strength_or_not(self, isrc, kernel_idx):
-        return 1
+            """],
+            arguments,
+            assumptions="nranges>=1",
+            name=self.name,
+            fixed_parameters=dict(dim=self.dim))
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
 
-    def get_result_store_instructions(self):
-        return [
-                """
-                result_{i}[tgtstart + j, srcstart + k] = \
-                        knl_{i}_scaling * pair_result_{i} {{inames=irange}}
-                """.format(i=iknl)
-                for iknl in range(len(self.kernels))
-                ]
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
-    def get_assumptions(self):
-        return "nranges>=1"
+        return loopy_knl
 
     def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
         # FIXME
@@ -380,39 +378,42 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
 
 # {{{ P2P from CSR-like interaction list
 
-class P2PFromCSR(P2PComputationBase):
+class P2PFromCSR(P2PBase):
     default_name = "p2p_from_csr"
 
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
-
-        from pymbolic import var
-        exprs = [
-                var(name)
-                * var("strength").index((self.strength_usage[i], var("isrc")))
-                for i, name in enumerate(result_names)]
-
-        if self.exclude_self:
-            from pymbolic.primitives import If, Variable
-            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
+        exprs = self.get_exprs(result_names)
+        arguments = (
+                self.get_default_src_tgt_arguments() + [
+                lp.GlobalArg("box_target_starts", None, shape=None),
+                lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
+                lp.GlobalArg("box_source_starts", None, shape=None),
+                lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
+                lp.GlobalArg("source_box_starts", None, shape=None),
+                lp.GlobalArg("source_box_lists", None, shape=None),
+                lp.GlobalArg("strength", None,
+                    shape="nstrengths, nsources", dim_tags="sep,C"),
+                lp.GlobalArg("result", None,
+                    shape="nkernels, ntargets", dim_tags="sep,C"),
+                "..."
+                ])
 
         from sumpy.tools import gather_loopy_source_arguments
-        loopy_knl = lp.make_kernel(
-            [
-                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
-                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
-                "{[itgt,isrc,idim]: \
-                        itgt_start<=itgt<itgt_end and \
-                        isrc_start<=isrc<isrc_end and \
-                        0<=idim<dim }",
-                ],
+        loopy_knl = lp.make_kernel([
+            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
+            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
+            "{[itgt, isrc, idim]: \
+                itgt_start <= itgt < itgt_end and \
+                isrc_start <= isrc < isrc_end and \
+                0 <= idim < dim}",
+            ],
             self.get_kernel_scaling_assignments()
-            + [
-                """
+            + ["""
                 for itgt_box
                     <> tgt_ibox = target_boxes[itgt_box]
                     <> itgt_start = box_target_starts[tgt_ibox]
-                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]
+                    <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]
 
                     <> isrc_box_start = source_box_starts[itgt_box]
                     <> isrc_box_end = source_box_starts[itgt_box+1]
@@ -420,62 +421,38 @@ class P2PFromCSR(P2PComputationBase):
                     for isrc_box
                         <> src_ibox = source_box_lists[isrc_box]
                         <> isrc_start = box_source_starts[src_ibox]
-                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
+                        <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
 
                         for itgt
-                            for isrc
-                                <> d[idim] = \
-                                        targets[idim,itgt] - sources[idim,isrc] \
-                                        {dup=idim}
-                                """] + ["""
-                                <> is_self = (isrc == target_to_source[itgt])
-                                """ if self.exclude_self else ""] + [
-                                ] + loopy_insns + [
-                                lp.Assignment(id=None,
-                                    assignee="pair_result_%d" % i, expression=expr,
-                                    temp_var_type=lp.auto)
-                                for i, expr in enumerate(exprs)
-                                ] + [
-                                """
-                            end
-                            """] + ["""
-                            result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
-                                knl_KNLIDX_scaling \
-                                * simul_reduce(sum, isrc, pair_result_KNLIDX)
-                            """.replace("KNLIDX", str(iknl))
-                            for iknl in range(len(exprs))] + ["""
+                        for isrc
+                            <> d[idim] = \
+                                targets[idim, itgt] - sources[idim, isrc] {dup=idim}
+            """] + ["""
+                            <> is_self = (isrc == target_to_source[itgt])
+                    """ if self.exclude_self else ""]
+            + loopy_insns + exprs
+            + ["""
+                        end
+                        result[{i}, itgt] = result[{i}, itgt] + \
+                            knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i})
+                """.format(i=iknl)
+                for iknl in range(len(exprs))]
+            + ["""
                         end
                     end
                 end
-                """],
-            [
-                lp.GlobalArg("box_target_starts,box_target_counts_nonchild,"
-                    "box_source_starts,box_source_counts_nonchild,",
-                    None, shape=None),
-                lp.GlobalArg("source_box_starts, source_box_lists,",
-                    None, shape=None),
-                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
-                lp.GlobalArg("result", None,
-                    shape="nkernels,ntargets", dim_tags="sep,c"),
-                lp.GlobalArg("targets", None,
-                    shape="dim,ntargets", dim_tags="sep,c"),
-                lp.GlobalArg("sources", None,
-                    shape="dim,nsources", dim_tags="sep,c"),
-                lp.ValueArg("nsources", np.int32),
-                lp.ValueArg("ntargets", np.int32),
-                "...",
-            ] + (
-                [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
-                if self.exclude_self else []
-            ) + gather_loopy_source_arguments(self.kernels),
-            name=self.name, assumptions="ntgt_boxes>=1",
+            """],
+            arguments,
+            assumptions="ntgt_boxes>=1",
+            name=self.name,
             fixed_parameters=dict(
                 dim=self.dim,
                 nstrengths=self.strength_count,
                 nkernels=len(self.kernels)))
 
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")
+        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
+        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
 
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
@@ -485,12 +462,14 @@ class P2PFromCSR(P2PComputationBase):
     def get_optimized_kernel(self):
         # FIXME
         knl = self.get_kernel()
+
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
         else:
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
+
         return knl
 
     def __call__(self, queue, **kwargs):