diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 4b91b85eb8dc663693868f53e5b53eb8c9447690..96ae097c8a41d02abd3ddeb81d68f9751520ec03 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -27,6 +27,7 @@ from six.moves import range
 
 import numpy as np
 import loopy as lp
+from pymbolic import var
 
 from sumpy.tools import KernelComputation, KernelCacheWrapper
 
@@ -48,7 +49,7 @@ Particle-to-particle
 
 # {{{ p2p base class
 
-class P2PBase(KernelComputation, KernelCacheWrapper):
+class P2PComputationBase(KernelComputation, KernelCacheWrapper):
     def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
             value_dtypes=None,
             options=[], name=None, device=None):
@@ -107,16 +108,24 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
 
 # {{{ P2P with list of sources and list of targets
 
-class P2P(P2PBase):
-    default_name = "p2p"
+class P2PBase(P2PComputationBase):
+    def get_src_tgt_arguments(self):
+        return [
+                lp.GlobalArg("sources", None,
+                    shape=(self.dim, "nsources")),
+                lp.GlobalArg("targets", None,
+                   shape=(self.dim, "ntargets")),
+                lp.ValueArg("nsources", None),
+                lp.ValueArg("ntargets", None)
+                ]
 
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
 
-        from pymbolic import var
+        isrc_sym = var("isrc")
         exprs = [
                 var(name)
-                * var("strength").index((self.strength_usage[i], var("isrc")))
+                * self.get_strength_or_not(isrc_sym, i)
                 for i, name in enumerate(result_names)]
 
         if self.exclude_self:
@@ -124,46 +133,31 @@ class P2P(P2PBase):
             exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
 
         from sumpy.tools import gather_loopy_source_arguments
+        arguments = (
+                self.get_src_tgt_arguments()
+                + self.get_input_and_output_arguments()
+                + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
+                    if self.exclude_self else [])
+                + gather_loopy_source_arguments(self.kernels))
+
         loopy_knl = lp.make_kernel(
                 "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
                         and 0<=idim<dim}",
                 self.get_kernel_scaling_assignments()
-                + ["""
-                for itgt
-                    for isrc
-                        """] + loopy_insns + ["""
-                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc]
-                        """] + ["""
-                        <> is_self = (isrc == target_to_source[itgt])
-                        """ if self.exclude_self else ""] + [
-                        lp.Assignment(id=None,
-                            assignee="pair_result_%d" % i, expression=expr,
-                            temp_var_type=lp.auto)
-                        for i, expr in enumerate(exprs)
-                        ] + ["""
-                    end
-                    """] + ["""
-                    result[KNLIDX, itgt] = knl_KNLIDX_scaling \
-                            * simul_reduce(sum, isrc, pair_result_KNLIDX)
-                    """.replace("KNLIDX", str(iknl))
-                    for iknl in range(len(exprs))] + [
-                    ] + ["""
-                end
-                """],
-                [
-                    lp.GlobalArg("sources", None,
-                        shape=(self.dim, "nsources")),
-                    lp.GlobalArg("targets", None,
-                        shape=(self.dim, "ntargets")),
-                    lp.ValueArg("nsources", None),
-                    lp.ValueArg("ntargets", None),
-                    lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
-                    lp.GlobalArg("result", None,
-                        shape="nresults,ntargets", dim_tags="sep,C")
-                ] + (
-                    [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
-                    if self.exclude_self else []
-                ) + gather_loopy_source_arguments(self.kernels),
+                + ["for itgt, isrc"]
+                + loopy_insns
+                + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
+                + ["<> is_self = (isrc == target_to_source[itgt])"
+                    if self.exclude_self else ""]
+                + [
+                    lp.Assignment(id=None,
+                        assignee="pair_result_%d" % i, expression=expr,
+                        temp_var_type=lp.auto)
+                    for i, expr in enumerate(exprs)
+                ]
+                + ["end"]
+                + self.get_result_store_instructions(),
+                arguments,
                 name=self.name,
                 assumptions="nsources>=1 and ntargets>=1",
                 fixed_parameters=dict(
@@ -176,8 +170,6 @@ class P2P(P2PBase):
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
 
-        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")
-
         return loopy_knl
 
     def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
@@ -192,9 +184,35 @@ class P2P(P2PBase):
         knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
         return knl
 
+
+# }}}
+
+# {{{
+
+class P2P(P2PBase):
+    default_name = "p2p_apply"
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return var("strength").index((self.strength_usage[kernel_idx], isrc))
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("strength", None,
+                    shape="nstrengths,nsources", dim_tags="sep,C"),
+                lp.GlobalArg("result", None,
+                    shape="nresults,ntargets", dim_tags="sep,C")
+                ]
+
+    def get_result_store_instructions(self):
+        return ["""
+                result[{i}, itgt] = knl_{i}_scaling \
+                    * simul_reduce(sum, isrc, pair_result_{i})
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))]
+
     def __call__(self, queue, targets, sources, strength, **kwargs):
         from pytools.obj_array import is_obj_array
-        knl = self.get_cached_optimized_kernel(
+        knl = self.get_optimized_kernel(
                 targets_is_obj_array=(
                     is_obj_array(targets) or isinstance(targets, (tuple, list))),
                 sources_is_obj_array=(
@@ -205,10 +223,44 @@ class P2P(P2PBase):
 
 # }}}
 
+# {{{ P2P Matrix Writer
+
+class P2PMatrixGenerator(P2PBase):
+    default_name = "p2p_matrix"
+
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return 1
+
+    def get_input_and_output_arguments(self):
+        return [
+                lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
+                for i, dtype in enumerate(self.value_dtypes)
+                ]
+
+    def get_result_store_instructions(self):
+        return [
+                """
+                result_{i}[itgt, isrc] = \
+                    knl_{i}_scaling * pair_result_{i}
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))
+                ]
+
+    def __call__(self, queue, targets, sources, **kwargs):
+        from pytools.obj_array import is_obj_array
+        knl = self.get_optimized_kernel(
+                targets_is_obj_array=(
+                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
+                sources_is_obj_array=(
+                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+
+        return knl(queue, sources=sources, targets=targets, **kwargs)
+
+# }}}
 
 # {{{ P2P from CSR-like interaction list
 
-class P2PFromCSR(P2PBase):
+class P2PFromCSR(P2PComputationBase):
     default_name = "p2p_from_csr"
 
     def get_kernel(self):
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
new file mode 100644
index 0000000000000000000000000000000000000000..161ec6f14620ed2c386ae4650d785cf6c5e7b801
--- /dev/null
+++ b/test/test_matrixgen.py
@@ -0,0 +1,171 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2017 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import sys
+
+import pytest
+import pyopencl as cl
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+
+def create_arguments(n, mode):
+    # parametrize circle
+    t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
+    unit_circle = np.exp(1j * t)
+    unit_circle = np.array([unit_circle.real, unit_circle.imag])
+
+    # create density
+    sigma = np.cos(mode * t)
+
+    # create sources and targets
+    h = 2.0 * np.pi / n
+    targets = unit_circle
+    sources = unit_circle
+
+    radius = 7.0 * h
+    centers = unit_circle * (1.0 - radius)
+    expansion_radii = radius * np.ones(n)
+
+    return targets, sources, centers, sigma, expansion_radii
+
+
+def test_qbx_direct(ctx_getter):
+    # This evaluates a single layer potential on a circle.
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_getter()
+    queue = cl.CommandQueue(ctx)
+
+    from sumpy.kernel import LaplaceKernel
+    lknl = LaplaceKernel(2)
+
+    order = 12
+    mode_nr = 25
+
+    from sumpy.qbx import LayerPotential
+    from sumpy.qbx import LayerPotentialMatrixGenerator
+    from sumpy.expansion.local import LineTaylorLocalExpansion
+    mat_gen = LayerPotentialMatrixGenerator(ctx,
+            [LineTaylorLocalExpansion(lknl, order)])
+    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
+
+    from pytools.convergence import EOCRecorder
+    eocrec = EOCRecorder()
+
+    for n in [200, 300, 400]:
+        targets, sources, centers, sigma, expansion_radii = \
+                create_arguments(n, mode_nr)
+
+        eigval = 1.0 / (2.0 * mode_nr)
+        result_ref = eigval * sigma
+
+        h = 2 * np.pi / n
+        strengths = (sigma * h,)
+
+        _, (mat,) = mat_gen(queue, targets, sources, centers,
+                expansion_radii=expansion_radii)
+        result_mat = mat @ strengths[0]
+
+        _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
+                expansion_radii=expansion_radii)
+
+        assert np.max(np.abs(result_mat - result_lpot)) < 1.0e-14
+        eocrec.add_data_point(h, np.max(np.abs(result_ref - result_lpot)))
+
+    print(eocrec)
+
+    slack = 1.5
+    assert eocrec.order_estimate() > order - slack
+
+
+@pytest.mark.parametrize("exclude_self", [True, False])
+def test_p2p_direct(ctx_getter, exclude_self):
+    # This evaluates a single layer potential on a circle.
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_getter()
+    queue = cl.CommandQueue(ctx)
+
+    from sumpy.kernel import LaplaceKernel
+    lknl = LaplaceKernel(2)
+
+    mode_nr = 25
+
+    from sumpy.p2p import P2P
+    from sumpy.p2p import P2PMatrixGenerator
+    mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+    lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+
+    from pytools.convergence import EOCRecorder
+    eocrec = EOCRecorder()
+
+    for n in [200, 300, 400]:
+        targets, sources, _, sigma, _ = create_arguments(n, mode_nr)
+
+        eigval = 1.0 / (2.0 * mode_nr)
+        result_ref = eigval * sigma
+
+        h = 2 * np.pi / n
+        strengths = (sigma * h,)
+
+        extra_kwargs = {}
+        if exclude_self:
+            extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
+
+        _, (mat,) = mat_gen(queue, targets, sources, **extra_kwargs)
+        result_mat = mat @ strengths[0]
+
+        _, (result_lpot,) = lpot(queue, targets, sources, strengths,
+                                 **extra_kwargs)
+
+        assert np.max(np.abs(result_mat - result_lpot)) < 1.0e-14
+        eocrec.add_data_point(h, np.max(np.abs(result_ref - result_lpot)))
+
+    print(eocrec)
+
+    assert eocrec.order_estimate() > 0.8
+
+# You can test individual routines by typing
+# $ python test_kernels.py 'test_p2p(cl.create_some_context)'
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: fdm=marker