diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 8c37b0e06ea93f5cf07c561b05b188671f6de36d..d8586cc228e47d6aa9f12d7c7c436726ad28bc2e 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -120,10 +120,26 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
                     shape=(self.dim, "nsources")),
                 lp.GlobalArg("targets", None,
                    shape=(self.dim, "ntargets")),
-                lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None)
+                lp.ValueArg("nsources", np.int32),
+                lp.ValueArg("ntargets", np.int32)
                 ]
 
+    def get_domains(self):
+        return [
+                "{[itgt]: 0 <= itgt < ntargets}",
+                "{[isrc]: 0 <= isrc < nsources}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return ["for itgt, isrc"]
+
+    def get_loop_end(self):
+        return ["end"]
+
+    def get_assumptions(self):
+        return "nsources>=1 and ntargets>=1"
+
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
 
@@ -146,25 +162,24 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
                 + gather_loopy_source_arguments(self.kernels))
 
         loopy_knl = lp.make_kernel(
-                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
-                        and 0<=idim<dim}",
+                self.get_domains(),
                 self.get_kernel_scaling_assignments()
-                + ["for itgt, isrc"]
-                + loopy_insns
+                + self.get_loop_begin()
                 + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
                 + ["<> is_self = (isrc == target_to_source[itgt])"
                     if self.exclude_self else ""]
+                + loopy_insns
                 + [
                     lp.Assignment(id=None,
                         assignee="pair_result_%d" % i, expression=expr,
                         temp_var_type=lp.auto)
                     for i, expr in enumerate(exprs)
                 ]
-                + ["end"]
+                + self.get_loop_end()
                 + self.get_result_store_instructions(),
                 arguments,
+                assumptions=self.get_assumptions(),
                 name=self.name,
-                assumptions="nsources>=1 and ntargets>=1",
                 fixed_parameters=dict(
                     dim=self.dim,
                     nstrengths=self.strength_count,
@@ -266,6 +281,103 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
 # }}}
 
 
+# {{{
+
+class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
+    default_name = "p2p_block"
+
+    def get_src_tgt_arguments(self):
+        return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \
+            + [
+                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                lp.ValueArg("nsrcindices", np.int32),
+                lp.ValueArg("ntgtindices", np.int32),
+                lp.ValueArg("nranges", None)
+            ]
+
+    def get_domains(self):
+        return [
+                "{[irange]: 0 <= irange < nranges}",
+                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return [
+                """
+                for irange
+                    <> tgtstart = tgtranges[irange]
+                    <> tgtend = tgtranges[irange + 1]
+                    <> tgt_length = tgtend - tgtstart
+                    <> srcstart = srcranges[irange]
+                    <> srcend = srcranges[irange + 1]
+                    <> src_length = srcend - srcstart
+                    for j, k
+                        <> itgt = tgtindices[tgtstart + j]
+                        <> isrc = srcindices[srcstart + k]
+                """
+                ]
+
+    def get_loop_end(self):
+        return [
+                """
+                    end
+                end
+                """
+                ]
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
+                for i, dtype in enumerate(self.value_dtypes)
+                ]
+
+    def get_result_store_instructions(self):
+        return [
+                """
+                result_{i}[tgtstart + j, srcstart + k] = \
+                        knl_{i}_scaling * pair_result_{i} {{inames=irange}}
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))
+                ]
+
+    def get_assumptions(self):
+        return "nranges>=1"
+
+    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
+        # FIXME
+        knl = self.get_kernel()
+
+        if sources_is_obj_array:
+            knl = lp.tag_array_axes(knl, "sources", "sep,C")
+        if targets_is_obj_array:
+            knl = lp.tag_array_axes(knl, "targets", "sep,C")
+
+        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        return knl
+
+    def __call__(self, queue, targets, sources, tgtindices, srcindices,
+            tgtranges, srcranges, **kwargs):
+        from pytools.obj_array import is_obj_array
+        knl = self.get_cached_optimized_kernel(
+                targets_is_obj_array=(
+                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
+                sources_is_obj_array=(
+                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+
+        return knl(queue, targets=targets, sources=sources,
+                tgtindices=tgtindices, srcindices=srcindices,
+                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+
+# }}}
+
+
 # {{{ P2P from CSR-like interaction list
 
 class P2PFromCSR(P2PComputationBase):
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index cde20ef235ee2cd7f5b7dfdf57e30be2d4d8c20c..0f617e62990c4aad8f187b03d22e2ac9c72b46fd 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -1,6 +1,9 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2018 Alexandru Fikl
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -116,10 +119,26 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 lp.GlobalArg("center", None,
                     shape=(self.dim, "ntargets"), order="C"),
                 lp.GlobalArg("expansion_radii", None, shape="ntargets"),
-                lp.ValueArg("nsources", None),
-                lp.ValueArg("ntargets", None),
+                lp.ValueArg("nsources", np.int32),
+                lp.ValueArg("ntargets", np.int32),
+                ]
+
+    def get_domains(self):
+        return [
+                "{[itgt]: 0 <= itgt < ntargets}",
+                "{[isrc]: 0 <= isrc < nsources}",
+                "{[idim]: 0 <= idim < dim}"
                 ]
 
+    def get_loop_begin(self):
+        return ["for itgt, isrc"]
+
+    def get_loop_end(self):
+        return ["end"]
+
+    def get_assumptions(self):
+        return "nsources>=1 and ntargets>=1"
+
     @memoize_method
     def get_kernel(self):
         from sumpy.symbolic import make_sym_vector
@@ -162,10 +181,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 + gather_loopy_source_arguments(self.kernels))
 
         loopy_knl = lp.make_kernel(
-                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
-                "and 0<=idim<dim}",
+                self.get_domains(),
                 self.get_kernel_scaling_assignments()
-                + ["for itgt, isrc"]
+                + self.get_loop_begin()
                 + [self.get_compute_a_and_b_vecs()]
                 + loopy_insns
                 + [
@@ -174,11 +192,11 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                         temp_var_type=lp.auto)
                     for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
                 ]
-                + ["end"]
+                + self.get_loop_end()
                 + self.get_result_store_instructions(),
                 arguments,
                 name=self.name,
-                assumptions="nsources>=1 and ntargets>=1",
+                assumptions=self.get_assumptions(),
                 default_offset=lp.auto,
                 silenced_warnings="write_race(write_lpot*)",
                 fixed_parameters=dict(dim=self.dim),
@@ -198,6 +216,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
         # FIXME specialize/tune for GPU/CPU
         loopy_knl = self.get_kernel()
 
+        # FIXME: how to tune for blocks?
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
@@ -297,6 +316,96 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 
 # }}}
 
+
+# {{{
+
+class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
+    default_name = "qbx_block"
+
+    def get_src_tgt_arguments(self):
+        return LayerPotentialBase.get_src_tgt_arguments(self) \
+            + [
+                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
+                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
+                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
+                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
+                lp.ValueArg("nsrcindices", np.int32),
+                lp.ValueArg("ntgtindices", np.int32),
+                lp.ValueArg("nranges", None)
+            ]
+
+    def get_domains(self):
+        # FIXME: this doesn't work when separating j and k
+        return [
+                "{[irange]: 0 <= irange < nranges}",
+                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
+                "{[idim]: 0 <= idim < dim}"
+                ]
+
+    def get_loop_begin(self):
+        return [
+                """
+                for irange
+                    <> tgtstart = tgtranges[irange]
+                    <> tgtend = tgtranges[irange + 1]
+                    <> tgt_length = tgtend - tgtstart
+                    <> srcstart = srcranges[irange]
+                    <> srcend = srcranges[irange + 1]
+                    <> src_length = srcend - srcstart
+                    for j, k
+                        <> itgt = tgtindices[tgtstart + j]
+                        <> isrc = srcindices[srcstart + k]
+                """
+                ]
+
+    def get_loop_end(self):
+        return [
+                """
+                    end
+                end
+                """
+                ]
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
+                for i, dtype in enumerate(self.value_dtypes)
+                ]
+
+    def get_result_store_instructions(self):
+        return [
+                """
+                result_KNLIDX[tgtstart + j, srcstart + k] = \
+                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=irange}
+                """.replace("KNLIDX", str(iknl))
+                for iknl in range(len(self.expansions))
+                ]
+
+    def get_assumptions(self):
+        return "nranges>=1"
+
+    @memoize_method
+    def get_optimized_kernel(self):
+        # FIXME
+        loopy_knl = self.get_kernel()
+
+        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+        return loopy_knl
+
+    def __call__(self, queue, targets, sources, centers, expansion_radii,
+            tgtindices, srcindices, tgtranges, srcranges, **kwargs):
+        knl = self.get_optimized_kernel()
+
+        return knl(queue, src=sources, tgt=targets, center=centers,
+                expansion_radii=expansion_radii,
+                tgtindices=tgtindices, srcindices=srcindices,
+                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+
+# }}}
+
 # }}}
 
 
diff --git a/sumpy/version.py b/sumpy/version.py
index 98f05a2db719067b04cb7b36457b24c84b4ddddf..7edeb71ff7b0f7325e82ac21ed43b7c3c5cebc15 100644
--- a/sumpy/version.py
+++ b/sumpy/version.py
@@ -29,4 +29,4 @@ VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 # branch name, so as to avoid conflicts with the master branch. Make sure
 # to reset this to the next number up with "master" before merging into
 # master.
-KERNEL_VERSION = ("master", 28)
+KERNEL_VERSION = ("master", 29)
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index c7785d042b1ea2a6ee189d682722a679efad3021..4e2b3ad7c449ba9acb37955cd167d94fe3c80d83 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -73,15 +73,19 @@ def test_qbx_direct(ctx_getter):
     from sumpy.kernel import LaplaceKernel
     lknl = LaplaceKernel(2)
 
+    nblks = 10
     order = 12
     mode_nr = 25
 
     from sumpy.qbx import LayerPotential
     from sumpy.qbx import LayerPotentialMatrixGenerator
+    from sumpy.qbx import LayerPotentialMatrixBlockGenerator
     from sumpy.expansion.local import LineTaylorLocalExpansion
+    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
     mat_gen = LayerPotentialMatrixGenerator(ctx,
             [LineTaylorLocalExpansion(lknl, order)])
-    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
+    blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
+            [LineTaylorLocalExpansion(lknl, order)])
 
     for n in [200, 300, 400]:
         targets, sources, centers, sigma, expansion_radii = \
@@ -90,16 +94,37 @@ def test_qbx_direct(ctx_getter):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
+        tgtindices = np.arange(0, n)
+        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
+        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
+                              tgtindices.shape[0] // nblks)
+        srcindices = np.arange(0, n)
+        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
+        srcranges = np.arange(0, srcindices.shape[0] + 1,
+                              srcindices.shape[0] // nblks)
+        assert tgtranges.shape == srcranges.shape
+
         _, (mat,) = mat_gen(queue, targets, sources, centers,
-                expansion_radii=expansion_radii)
+                expansion_radii)
         result_mat = mat.dot(strengths[0])
 
         _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
-                expansion_radii=expansion_radii)
+                expansion_radii)
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
+                            tgtindices, srcindices, tgtranges, srcranges)
+
+        for i in range(srcranges.shape[0] - 1):
+            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
+            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
+            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+
+            eps = 1.0e-10 * la.norm(mat[block])
+            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+
 
 @pytest.mark.parametrize("exclude_self", [True, False])
 def test_p2p_direct(ctx_getter, exclude_self):
@@ -112,12 +137,14 @@ def test_p2p_direct(ctx_getter, exclude_self):
     from sumpy.kernel import LaplaceKernel
     lknl = LaplaceKernel(2)
 
+    nblks = 10
     mode_nr = 25
 
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixGenerator
-    mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+    from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+    mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+    blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
 
     for n in [200, 300, 400]:
         targets, sources, _, sigma, _ = \
@@ -126,6 +153,16 @@ def test_p2p_direct(ctx_getter, exclude_self):
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
+        tgtindices = np.arange(0, n)
+        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
+        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
+                              tgtindices.shape[0] // nblks)
+        srcindices = np.arange(0, n)
+        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
+        srcranges = np.arange(0, srcindices.shape[0] + 1,
+                              srcindices.shape[0] // nblks)
+        assert tgtranges.shape == srcranges.shape
+
         extra_kwargs = {}
         if exclude_self:
             extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
@@ -139,6 +176,18 @@ def test_p2p_direct(ctx_getter, exclude_self):
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
+        _, (blk,) = blk_gen(queue, targets, sources,
+                            tgtindices, srcindices, tgtranges, srcranges,
+                            **extra_kwargs)
+
+        for i in range(srcranges.shape[0] - 1):
+            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
+            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
+            block = np.ix_(tgtindices[itgt], srcindices[isrc])
+
+            eps = 1.0e-10 * la.norm(mat[block])
+            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+
 
 # You can test individual routines by typing
 # $ python test_kernels.py 'test_p2p(cl.create_some_context)'