diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index 7403a1dbc4b76addef2632af866c86e7a47d297d..231013b9cbb1e7c7b011d8e891ce54203c320590 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -539,6 +539,14 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
 
         return obj_array_vectorize(reorder, potentials)
 
+    def get_max_nsources_in_one_box(self, queue):
+        return int(pyopencl.array.max(self.tree.box_source_counts_nonchild,
+                queue).get())
+
+    def get_max_ntargets_in_one_box(self, queue):
+        return int(pyopencl.array.max(self.tree.box_target_counts_nonchild,
+                queue).get())
+
     # }}}
 
     # {{{ source/target dispatch
@@ -674,7 +682,8 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface):
                 source_box_lists=source_box_lists,
                 strength=src_weight_vecs,
                 result=pot,
-
+                max_npoints_in_one_box=max(self.get_max_nsources_in_one_box(queue),
+                    self.get_max_ntargets_in_one_box(queue)),
                 **kwargs)
         events.append(evt)
 
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 94b9362a19c8643c4874a1140c01ac33a19c9218..b7146a8b60cab64e5f990d2c9678e4669c4f237d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -438,7 +438,7 @@ class P2PMatrixSubsetGenerator(P2PBase):
 class P2PFromCSR(P2PBase):
     default_name = "p2p_from_csr"
 
-    def get_kernel(self):
+    def get_kernel(self, max_npoints_in_one_box):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
         arguments = (
             self.get_default_src_tgt_arguments()
@@ -459,17 +459,24 @@ class P2PFromCSR(P2PBase):
                     shape="nstrengths, nsources", dim_tags="sep,C"),
                 lp.GlobalArg("result", None,
                     shape="noutputs, ntargets", dim_tags="sep,C"),
+                lp.TemporaryVariable("local_isrc_strength",
+                    shape="nstrengths, max_npoints_in_one_box",
+                    address_space=lp.AddressSpace.LOCAL),
+                lp.TemporaryVariable("local_isrc",
+                    shape=(self.dim, max_npoints_in_one_box),
+                    address_space=lp.AddressSpace.LOCAL),
+                lp.TemporaryVariable("tgt_center", shape=(self.dim,)),
                 "..."
             ])
 
         loopy_knl = lp.make_kernel([
             "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
+            "{[ipoint]: 0 <= ipoint < max_npoints_in_one_box}",
+            "{[iknl]: 0 <= iknl < noutputs}",
             "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
-            "{[itgt, isrc, idim]: \
-                itgt_start <= itgt < itgt_end and \
-                isrc_start <= isrc < isrc_end and \
-                0 <= idim < dim}",
-            ],
+            "{[idim]: 0 <= idim < dim}",
+            "{[istrength]: 0 <= istrength < nstrengths}",
+            "{[isrc]: isrc_start <= isrc < isrc_end}"],
             self.get_kernel_scaling_assignments()
             + ["""
                 for itgt_box
@@ -480,30 +487,51 @@ class P2PFromCSR(P2PBase):
                 <> isrc_box_start = source_box_starts[itgt_box]
                 <> isrc_box_end = source_box_starts[itgt_box+1]
 
-                for isrc_box
+                for ipoint
+                  <> itgt = ipoint + itgt_start
+                  <> cond_itgt = itgt < itgt_end
+                  <> acc[iknl] = 0 {id=init_acc, dup=iknl}
+                  if cond_itgt
+                    tgt_center[idim] = targets[idim, itgt] {id=load_tgt, dup=idim}
+                  end
+                  for isrc_box
                     <> src_ibox = source_box_lists[isrc_box]
                     <> isrc_start = box_source_starts[src_ibox]
                     <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
-
-                    for itgt
-                    for isrc
-                        <> d[idim] = \
-                            targets[idim, itgt] - sources[idim, isrc]
+                    <> cond_isrc = ipoint < isrc_end - isrc_start
+                    if cond_isrc
+                      local_isrc[idim, ipoint] = sources[idim,
+                          ipoint + isrc_start]  {id=load_src, dup=idim}
+                      local_isrc_strength[istrength, ipoint] = strength[istrength,
+                          ipoint + isrc_start]  {id=load_charge}
+                    end
+                    if cond_itgt
+                      for isrc
+                        <> d[idim] = (tgt_center[idim] - local_isrc[idim,
+                          isrc - isrc_start]) {dep=load_src:load_tgt}
             """] + ["""
                         <> is_self = (isrc == target_to_source[itgt])
                     """ if self.exclude_self else ""]
-            + [f"<> strength_{i} = strength[{i}, isrc]" for
+            + [f"<> strength_{i} = local_isrc_strength[{i}, isrc - isrc_start]" for
                 i in set(self.strength_usage)]
             + loopy_insns
-            + ["    end"]
             + [f"""
-                    result[{iknl}, itgt] = result[{iknl}, itgt] + \
-                        knl_{iknl}_scaling * \
-                        simul_reduce(sum, isrc, pair_result_{iknl}) \
-                        {{id_prefix=write_csr}}
+                        acc[{iknl}] = acc[{iknl}] + \
+                          pair_result_{iknl} \
+                          {{id=update_acc, dep=init_acc}}
                 """ for iknl in range(len(self.target_kernels))]
             + ["""
+                      end
                     end
+                  end
+               """]
+            + [f"""
+                  if cond_itgt
+                    result[{iknl}, itgt] = knl_{iknl}_scaling * acc[{iknl}] \
+                            {{dep=update_acc}}
+                  end
+                """ for iknl in range(len(self.target_kernels))]
+            + ["""
                 end
                 end
             """],
@@ -514,6 +542,7 @@ class P2PFromCSR(P2PBase):
             fixed_parameters=dict(
                 dim=self.dim,
                 nstrengths=self.strength_count,
+                max_npoints_in_one_box=max_npoints_in_one_box,
                 noutputs=len(self.target_kernels)),
             lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
@@ -523,22 +552,24 @@ class P2PFromCSR(P2PBase):
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
         loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
         loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
-
         for knl in self.target_kernels + self.source_kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
         return loopy_knl
 
-    def get_optimized_kernel(self):
+    def get_optimized_kernel(self, max_npoints_in_one_box):
         # FIXME
-        knl = self.get_kernel()
+        knl = self.get_kernel(max_npoints_in_one_box)
 
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
         else:
-            knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
+            knl = lp.tag_inames(knl, {"itgt_box": "g.0"})
+        knl = lp.split_iname(knl, "ipoint", max_npoints_in_one_box,
+                inner_tag="l.0")
+        knl = lp.add_inames_for_unused_hw_axes(knl)
 
         knl = self._allow_redundant_execution_of_knl_scaling(knl)
         knl = lp.set_options(knl,
@@ -547,7 +578,9 @@ class P2PFromCSR(P2PBase):
         return knl
 
     def __call__(self, queue, **kwargs):
-        knl = self.get_cached_optimized_kernel()
+        max_npoints_in_one_box = kwargs.pop("max_npoints_in_one_box")
+        knl = self.get_cached_optimized_kernel(
+                max_npoints_in_one_box=max_npoints_in_one_box)
 
         return knl(queue, **kwargs)