diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index b76e78900f74c32920d7f1c582784eaef1db35ee..e7b47d1e68a464dba2cabc3e396ca0a9d09d389e 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -201,8 +201,7 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        # FIXME: how to split when using blocks?
-        # knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
+        knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
         return knl
 
 
@@ -300,9 +299,8 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
             ]
 
     def get_domains(self):
-        # FIXME: this doesn't work when separating j and k
         return [
-                "{[i]: 0 <= i < nranges - 1}",
+                "{[irange]: 0 <= irange < nranges - 1}",
                 "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
                 "{[idim]: 0 <= idim < dim}"
                 ]
@@ -310,12 +308,12 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
     def get_loop_begin(self):
         return [
                 """
-                for i
-                    <> tgtstart = tgtranges[i]
-                    <> tgtend = tgtranges[i + 1]
+                for irange
+                    <> tgtstart = tgtranges[irange]
+                    <> tgtend = tgtranges[irange + 1]
                     <> tgt_length = tgtend - tgtstart
-                    <> srcstart = srcranges[i]
-                    <> srcend = srcranges[i + 1]
+                    <> srcstart = srcranges[irange]
+                    <> srcend = srcranges[irange + 1]
                     <> src_length = srcend - srcstart
                     for j, k
                         <> itgt = tgtindices[tgtstart + j]
@@ -341,11 +339,10 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
                 ]
 
     def get_result_store_instructions(self):
-        # FIXME: doesn't work without inames=i. check how the loops are nested!
         return [
                 """
                 result_{i}[tgtstart + j, srcstart + k] = \
-                        knl_{i}_scaling * pair_result_{i} {{inames=i}}
+                        knl_{i}_scaling * pair_result_{i} {{inames=irange}}
                 """.format(i=iknl)
                 for iknl in range(len(self.kernels))
                 ]
@@ -353,14 +350,37 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
     def get_assumptions(self):
         return "nranges>=2"
 
+    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
+        """Return the kernel from :meth:`get_kernel` with array-axis tags
+        and a split of the ``irange`` iname applied.
+
+        :arg targets_is_obj_array: if true, the axes of ``targets`` are
+            tagged ``"sep,C"``.
+        :arg sources_is_obj_array: if true, the axes of ``sources`` are
+            tagged ``"sep,C"``.
+        """
+        # FIXME
+        knl = self.get_kernel()
+
+        if sources_is_obj_array:
+            knl = lp.tag_array_axes(knl, "sources", "sep,C")
+        if targets_is_obj_array:
+            knl = lp.tag_array_axes(knl, "targets", "sep,C")
+
+        # NOTE(review): SingleSrcTgtListP2PBase splits "itgt", but in this
+        # kernel "itgt" is a temporary assigned inside the "for j, k" loop;
+        # presumably that is why this override splits "irange" instead.
+        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        return knl
+
     def __call__(self, queue, targets, sources, tgtindices, srcindices,
             tgtranges, srcranges, **kwargs):
         from pytools.obj_array import is_obj_array
-        knl = self.get_optimized_kernel(
+        knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
                     is_obj_array(targets) or isinstance(targets, (tuple, list))),
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
         return knl(queue, targets=targets, sources=sources,
                 tgtindices=tgtindices, srcindices=srcindices,
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 64ea5578c9a24638b9becb0c4dfd907cae49023c..52db31f0145b16a6fdc88482fbc0fba541659e23 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -212,7 +212,6 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
     def get_optimized_kernel(self):
         # FIXME specialize/tune for GPU/CPU
         loopy_knl = self.get_kernel()
-        return loopy_knl
 
         # FIXME: how to tune for blocks?
         import pyopencl as cl
@@ -335,7 +334,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
     def get_domains(self):
         # FIXME: this doesn't work when separating j and k
         return [
-                "{[i]: 0 <= i < nranges - 1}",
+                "{[irange]: 0 <= irange < nranges - 1}",
                 "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
                 "{[idim]: 0 <= idim < dim}"
                 ]
@@ -343,12 +342,12 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
     def get_loop_begin(self):
         return [
                 """
-                for i
-                    <> tgtstart = tgtranges[i]
-                    <> tgtend = tgtranges[i + 1]
+                for irange
+                    <> tgtstart = tgtranges[irange]
+                    <> tgtend = tgtranges[irange + 1]
                     <> tgt_length = tgtend - tgtstart
-                    <> srcstart = srcranges[i]
-                    <> srcend = srcranges[i + 1]
+                    <> srcstart = srcranges[irange]
+                    <> srcend = srcranges[irange + 1]
                     <> src_length = srcend - srcstart
                     for j, k
                         <> itgt = tgtindices[tgtstart + j]
@@ -377,7 +376,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         return [
                 """
                 result_KNLIDX[tgtstart + j, srcstart + k] = \
-                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=i}
+                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=irange}
                 """.replace("KNLIDX", str(iknl))
                 for iknl in range(len(self.expansions))
                 ]
@@ -385,6 +384,14 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
     def get_assumptions(self):
         return "nranges>=2"
 
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME: presumably provisional tuning -- only "irange" is split; confirm.
+        loopy_knl = self.get_kernel()
+
+        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+        return loopy_knl
+
     def __call__(self, queue, targets, sources, centers, expansion_radii,
             tgtindices, srcindices, tgtranges, srcranges, **kwargs):
         knl = self.get_optimized_kernel()