diff --git a/.gitignore b/.gitignore
index 71f03d3ad65ed14d6e37cfb471ba601adf355b4b..e0287a537d33cf2e5d567c9ee2d2463955238248 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,5 +16,7 @@ doc/_build
 .cache
 .DS_Store
 .ipynb_checkpoints
+.pytest_cache
 
 sumpy/_git_rev.py
+.asv
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 946ff3b118a645e8d78c8489789480c60e8c0d94..97c8ee2404035df17dc5dade007d6593a117b3a1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,7 +2,7 @@ Python 3.5 Titan X:
   script:
   - py_version=2.7
   - export PYOPENCL_TEST=nvi:titan
-  - EXTRA_INSTALL="numpy mako"
+  - EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -32,7 +32,7 @@ Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -45,7 +45,7 @@ Python 3.5 POCL:
   script:
   - export PY_EXE=python3.5
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -58,7 +58,7 @@ Python 3.6 POCL:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -83,7 +83,7 @@ Python 3.5 Conda:
 
 Documentation:
   script:
-  - EXTRA_INSTALL="numpy mako"
+  - EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh
   - ". ./build-docs.sh"
   tags:
@@ -99,3 +99,17 @@ Flake8:
   - python3.5
   except:
   - tags
+
+Benchmarks:
+  script:
+  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - PROJECT=sumpy
+  - PYOPENCL_TEST=portable
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-benchmark-py-project.sh
+  - ". ./build-and-benchmark-py-project.sh"
+  tags:
+  - linux
+  - benchmark
+  except:
+  - tags
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f6e76d1584fbdfd2075d965b492507a742c92a0
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,35 @@
+sumpy: n-body kernels and translation operators
+===============================================
+
+.. image:: https://gitlab.tiker.net/inducer/sumpy/badges/master/pipeline.svg
+   :target: https://gitlab.tiker.net/inducer/sumpy/commits/master
+.. image:: https://badge.fury.io/py/sumpy.png
+   :target: http://pypi.python.org/pypi/sumpy
+
+Sumpy is mainly a 'scaffolding' package for Fast Multipole and quadrature
+methods. If you're building one of those and need code generation for the
+required multipole and local expansions, come right on in. Together with
+boxtree, sumpy provides a complete, symbolically kernel-independent FMM
+implementation.
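+
+As a small, illustrative sketch (these classes are exercised in sumpy's
+benchmarks and tests), setting up a multipole expansion of a Laplace kernel
+looks like:
+
+.. code-block:: python
+
+    from sumpy.kernel import LaplaceKernel
+    from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
+
+    knl = LaplaceKernel(2)  # the 2D Laplace kernel
+    mpole_expn = VolumeTaylorMultipoleExpansion(knl, order=5)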
+
+Sumpy relies on
+
+* `numpy <http://pypi.python.org/pypi/numpy>`_ for arrays
+* `boxtree <http://pypi.python.org/pypi/boxtree>`_ for FMM tree building
+* `loopy <http://pypi.python.org/pypi/loo.py>`_ for fast array operations
+* `pytest <http://pypi.python.org/pypi/pytest>`_ for automated testing
+
+and, indirectly,
+
+* `PyOpenCL <http://pypi.python.org/pypi/pyopencl>`_ as computational infrastructure
+
+PyOpenCL is likely the only package you'll have to install
+by hand; all the others will be installed automatically.
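+
+For example (one option among several), PyOpenCL and a CPU OpenCL driver can
+be installed from conda-forge::
+
+    conda install -c conda-forge pyopencl pocl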
+
+Resources:
+
+* `documentation <http://documen.tician.de/sumpy>`_
+* `source code via git <http://github.com/inducer/sumpy>`_
+
+If you can see inside the UIUC firewall, you may browse
+`benchmark results <http://koelsch.d.tiker.net/benchmarks/asv/sumpy/>`_.
\ No newline at end of file
diff --git a/asv.conf.json b/asv.conf.json
new file mode 100644
index 0000000000000000000000000000000000000000..9632c4fb70310c528b43e3d055a96d96875e629f
--- /dev/null
+++ b/asv.conf.json
@@ -0,0 +1,160 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "sumpy",
+
+    // The project's homepage
+    "project_url": "https://documen.tician.de/sumpy",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": ".",
+
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    // "branches": ["master"], // for git
+    // "branches": ["default"],    // for mercurial
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    // "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "http://gitlab.tiker.net/inducer/sumpy/commits/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["2.7", "3.6"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    "conda_channels": ["conda-forge", "defaults"],
+
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is not to be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    // "matrix": {
+    //     "numpy": ["1.6", "1.7"],
+    //     "six": ["", null],        // test with and without six installed
+    //     "pip+emcee": [""],   // emcee is only available for install with pip.
+    // },
+    "matrix": {
+        "numpy" : [""],
+        "sympy" : ["1.0"],
+        "pyopencl" : [""],
+        "islpy" : [""],
+        "pocl" : [""],
+        "pip+git+https://github.com/inducer/pymbolic#egg=pymbolic": [""],
+        "pip+git+https://gitlab.tiker.net/inducer/boxtree#egg=boxtree": [""],
+        "pip+git+https://github.com/inducer/loopy#egg=loopy": [""],
+    },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache wheels of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // number of builds to keep, per environment.
+    // "wheel_cache_size": 0
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // }
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // }
+}
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/benchmarks/bench_translations.py b/benchmarks/bench_translations.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d6cfdd882be3c8bc91f0078e9c1f82cfc1c841c
--- /dev/null
+++ b/benchmarks/bench_translations.py
@@ -0,0 +1,129 @@
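+"""Benchmarks for sumpy's multipole-to-local translations. These are meant
+to be run through airspeed velocity (asv), e.g. via ``asv run``; the
+``track_*`` methods below report symbolic operation counts (unit "ops")
+rather than wall times.
+"""
+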
+import logging
+
+import numpy as np
+import six
+
+import pymbolic.mapper.flop_counter
+
+import sumpy.symbolic as sym
+from sumpy.expansion.multipole import (
+        VolumeTaylorMultipoleExpansion, H2DMultipoleExpansion,
+        LaplaceConformingVolumeTaylorMultipoleExpansion,
+        HelmholtzConformingVolumeTaylorMultipoleExpansion)
+from sumpy.expansion.local import (
+        VolumeTaylorLocalExpansion, H2DLocalExpansion,
+        LaplaceConformingVolumeTaylorLocalExpansion,
+        HelmholtzConformingVolumeTaylorLocalExpansion)
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+from sumpy.assignment_collection import SymbolicAssignmentCollection
+from sumpy.codegen import to_loopy_insns
+
+logger = logging.getLogger(__name__)
+
+
+class Param:
+    def __init__(self, dim, order):
+        self.dim = dim
+        self.order = order
+
+    def __repr__(self):
+        return "{}D_order_{}".format(self.dim, self.order)
+
+
+class TranslationBenchmarkSuite:
+
+    params = [
+        Param(2, 10),
+        Param(2, 15),
+        Param(2, 20),
+        Param(3, 5),
+        Param(3, 10),
+    ]
+
+    param_names = ['order']
+
+    def setup(self, param):
+        logging.basicConfig(level=logging.INFO)
+        np.random.seed(17)
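+        # asv skips benchmarks whose setup raises NotImplementedError.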
+        if self.__class__ == TranslationBenchmarkSuite:
+            raise NotImplementedError
+        mpole_expn_class = self.mpole_expn_class
+        if param.order == 3 and H2DMultipoleExpansion == mpole_expn_class:
+            raise NotImplementedError
+
+    def track_m2l_op_count(self, param):
+        knl = self.knl(param.dim)
+        m_expn = self.mpole_expn_class(knl, order=param.order)
+        l_expn = self.local_expn_class(knl, order=param.order)
+
+        src_coeff_exprs = [sym.Symbol("src_coeff%d" % i)
+                for i in range(len(m_expn))]
+        dvec = sym.make_sym_vector("d", knl.dim)
+        src_rscale = sym.Symbol("src_rscale")
+        tgt_rscale = sym.Symbol("tgt_rscale")
+        result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale,
+                                       dvec, tgt_rscale)
+        sac = SymbolicAssignmentCollection()
+        for i, expr in enumerate(result):
+            sac.assign_unique("coeff%d" % i, expr)
+        sac.run_global_cse()
+        insns = to_loopy_insns(six.iteritems(sac.assignments))
+        counter = pymbolic.mapper.flop_counter.CSEAwareFlopCounter()
+
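+        # +1 per instruction: count the assignment itself in addition to
+        # the flops in its right-hand side (a rough convention).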
+        return sum([counter.rec(insn.expression)+1 for insn in insns])
+
+    track_m2l_op_count.unit = "ops"
+    track_m2l_op_count.timeout = 200.0
+
+
+class LaplaceVolumeTaylorTranslation(TranslationBenchmarkSuite):
+    knl = LaplaceKernel
+    local_expn_class = VolumeTaylorLocalExpansion
+    mpole_expn_class = VolumeTaylorMultipoleExpansion
+    params = [
+        Param(2, 10),
+        Param(3, 5),
+    ]
+
+
+class LaplaceConformingVolumeTaylorTranslation(TranslationBenchmarkSuite):
+    knl = LaplaceKernel
+    local_expn_class = LaplaceConformingVolumeTaylorLocalExpansion
+    mpole_expn_class = LaplaceConformingVolumeTaylorMultipoleExpansion
+
+
+class HelmholtzVolumeTaylorTranslation(TranslationBenchmarkSuite):
+    knl = HelmholtzKernel
+    local_expn_class = VolumeTaylorLocalExpansion
+    mpole_expn_class = VolumeTaylorMultipoleExpansion
+    params = [
+        Param(2, 10),
+        Param(3, 5),
+    ]
+
+
+class HelmholtzConformingVolumeTaylorTranslation(TranslationBenchmarkSuite):
+    knl = HelmholtzKernel
+    local_expn_class = HelmholtzConformingVolumeTaylorLocalExpansion
+    mpole_expn_class = HelmholtzConformingVolumeTaylorMultipoleExpansion
+
+
+class Helmholtz2DTranslation(TranslationBenchmarkSuite):
+    knl = HelmholtzKernel
+    local_expn_class = H2DLocalExpansion
+    mpole_expn_class = H2DMultipoleExpansion
+    params = [
+        Param(2, 10),
+        Param(2, 15),
+        Param(2, 20),
+    ]
diff --git a/contrib/translations/PDE-reduction-symbolic.ipynb b/contrib/translations/PDE-reduction-symbolic.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..bffcdf9e1e24beb09f0da9fbf50bf6d0ecdf1903
--- /dev/null
+++ b/contrib/translations/PDE-reduction-symbolic.ipynb
@@ -0,0 +1,133 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyopencl as cl\n",
+    "import sumpy.toys as t\n",
+    "import numpy as np\n",
+    "import numpy.linalg as la\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sumpy.visualization import FieldPlotter\n",
+    "from pytools import add_tuples\n",
+    "\n",
+    "from sumpy.expansion.local import VolumeTaylorLocalExpansion\n",
+    "from sumpy.expansion.multipole import (VolumeTaylorMultipoleExpansion,\n",
+    "    LaplaceConformingVolumeTaylorMultipoleExpansion,\n",
+    "    HelmholtzConformingVolumeTaylorMultipoleExpansion)\n",
+    "                                           \n",
+    "from sumpy.kernel import (YukawaKernel, HelmholtzKernel, LaplaceKernel)\n",
+    "\n",
+    "from sumpy.symbolic import make_sym_vector\n",
+    "\n",
+    "order = 2\n",
+    "dim = 2\n",
+    "\n",
+    "if 0:\n",
+    "    knl = LaplaceKernel(dim)\n",
+    "    extra_kernel_kwargs = {}\n",
+    "    mpole_expn_reduced_class = LaplaceConformingVolumeTaylorMultipoleExpansion\n",
+    "    \n",
+    "else:\n",
+    "    helm_k = 1.2\n",
+    "    knl = HelmholtzKernel(dim)\n",
+    "    extra_kernel_kwargs={\"k\": helm_k}\n",
+    "    mpole_expn_reduced_class = HelmholtzConformingVolumeTaylorMultipoleExpansion\n",
+    "\n",
+    "mpole_expn_reduced = mpole_expn_reduced_class(knl, order)\n",
+    "mpole_expn = VolumeTaylorMultipoleExpansion(knl, order)\n",
+    "local_expn = VolumeTaylorLocalExpansion(knl, order)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pytools import factorial\n",
+    "\n",
+    "def mi_factorial(n):\n",
+    "    return np.prod([factorial(n1) for n1 in n])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reduced_wrangler = mpole_expn_reduced.derivative_wrangler\n",
+    "full_wrangler = mpole_expn.derivative_wrangler\n",
+    "\n",
+    "reduced_derivatives = list(make_sym_vector(\"deriv\", len(reduced_wrangler.stored_identifiers)))\n",
+    "full_derivatives = reduced_wrangler.get_full_kernel_derivatives_from_stored(reduced_derivatives, 1)\n",
+    "\n",
+    "print(reduced_derivatives)\n",
+    "print(full_derivatives)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "full_coeffs = list(make_sym_vector(\"coeff\", len(reduced_wrangler.get_full_coefficient_identifiers())))\n",
+    "\n",
+    "reduced_coeffs = reduced_wrangler.get_stored_mpole_coefficients_from_full(full_mpole_coefficients=full_coeffs, rscale=1)\n",
+    "\n",
+    "print(full_coeffs)\n",
+    "print(reduced_coeffs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dvec = make_sym_vector(\"d\", dim)\n",
+    "translated_reduce_coeffs = mpole_expn_reduced.translate_from(mpole_expn_reduced, reduced_coeffs, 1, dvec, 1)\n",
+    "translated_full_coeffs = mpole_expn.translate_from(mpole_expn, full_coeffs, 1, dvec, 1)\n",
+    "translated_full_coeffs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_reduced = sum(a*b for a, b in zip(translated_reduce_coeffs, reduced_derivatives))\n",
+    "eval_full = sum(a*b for a, b in zip(translated_full_coeffs, full_derivatives))\n",
+    "\n",
+    "(eval_full-eval_reduced).simplify()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
index 1e86e61017a9f70631b25ba46ad808e16f0b715e..8e48bc1c8b0da2b4d81682839c688c0ac69637d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy
-sympy==1.0
+sympy==1.1.1
 git+https://github.com/inducer/pymbolic
 git+https://github.com/inducer/islpy
 git+https://github.com/inducer/pyopencl
diff --git a/setup.py b/setup.py
index 8d0850540362ddfc3ecedf453f2ccc53bba21a1c..9457bbdc269697f61153e755a930efc9fccae41d 100644
--- a/setup.py
+++ b/setup.py
@@ -94,7 +94,7 @@ setup(name="sumpy",
       install_requires=[
           "pytools>=2018.2",
           "loo.py>=2017.2",
-          "boxtree>=2013.1",
+          "boxtree>=2018.1",
           "pytest>=2.3",
           "six",
 
diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index ed2a1e4bd35b07c4bb1f601c6b850044fdf5539a..bd1c51d9b8050529c27174fdf068ffafbb766351 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -55,6 +55,10 @@ class SumpyExpansionWranglerCodeContainer(object):
     necessarily must have a :class:`pyopencl.CommandQueue`, but this queue
     is allowed to be more ephemeral than the code, the code's lifetime
     is decoupled by storing it in this object.
+
+    Timing results returned by this wrangler contain the value *wall_elapsed*,
+    which measures elapsed wall time. Collecting timing data requires a
+    command queue with profiling enabled.
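+
+    For example (an illustrative sketch, not the only way), a queue with
+    profiling enabled may be created like this:
+
+    .. code-block:: python
+
+        import pyopencl as cl
+
+        ctx = cl.create_some_context()
+        queue = cl.CommandQueue(ctx,
+                properties=cl.command_queue_properties.PROFILING_ENABLE)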
     """
 
     def __init__(self, cl_context,
@@ -145,6 +149,53 @@ class SumpyExpansionWranglerCodeContainer(object):
 # }}}
 
 
+# {{{ timing future
+
+_SECONDS_PER_NANOSECOND = 1e-9
+
+
+class UnableToCollectTimingData(UserWarning):
+    pass
+
+
+class SumpyTimingFuture(object):
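+    """A future-like object that collects timing data from a list of
+    :class:`pyopencl.Event` instances: :meth:`result` waits for the events
+    and sums their profiled execution times into a
+    :class:`boxtree.fmm.TimingResult`, while :meth:`done` polls their
+    execution status without blocking.
+    """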
+
+    def __init__(self, queue, events):
+        self.queue = queue
+        self.events = events
+
+    @memoize_method
+    def result(self):
+        from boxtree.fmm import TimingResult
+
+        if not (self.queue.properties
+                & cl.command_queue_properties.PROFILING_ENABLE):
+            from warnings import warn
+            warn(
+                    "Profiling was not enabled in the command queue. "
+                    "Timing data will not be collected.",
+                    category=UnableToCollectTimingData,
+                    stacklevel=3)
+            return TimingResult(wall_elapsed=None)
+
+        cl.wait_for_events(self.events)
+
+        result = 0
+        for event in self.events:
+            result += (
+                    (event.profile.end - event.profile.start)
+                    * _SECONDS_PER_NANOSECOND)
+
+        return TimingResult(wall_elapsed=result)
+
+    def done(self):
+        return all(
+                event.get_info(cl.event_info.COMMAND_EXECUTION_STATUS)
+                == cl.command_execution_status.COMPLETE
+                for event in self.events)
+
+# }}}
+
+
 # {{{ expansion wrangler
 
 class SumpyExpansionWrangler(object):
@@ -175,6 +226,7 @@ class SumpyExpansionWrangler(object):
         self.code = code_container
         self.queue = queue
         self.tree = tree
+        self.issued_timing_data_warning = False
 
         self.dtype = dtype
 
@@ -305,6 +357,8 @@ class SumpyExpansionWrangler(object):
         kwargs = self.extra_kwargs.copy()
         kwargs.update(self.box_source_list_kwargs())
 
+        events = []
+
         for lev in range(self.tree.nlevels):
             p2m = self.code.p2m(self.level_orders[lev])
             start, stop = level_start_source_box_nrs[lev:lev+2]
@@ -325,10 +379,11 @@ class SumpyExpansionWrangler(object):
                     rscale=level_to_rscale(self.tree, lev),
 
                     **kwargs)
+            events.append(evt)
 
             assert mpoles_res is mpoles_view
 
-        return mpoles
+        return (mpoles, SumpyTimingFuture(self.queue, events))
 
     def coarsen_multipoles(self,
             level_start_source_parent_box_nrs,
@@ -336,7 +391,7 @@ class SumpyExpansionWrangler(object):
             mpoles):
         tree = self.tree
 
-        evt = None
+        events = []
 
         # nlevels-1 is the last valid level index
         # nlevels-2 is the last valid level that could have children
@@ -378,12 +433,14 @@ class SumpyExpansionWrangler(object):
                     tgt_rscale=level_to_rscale(self.tree, target_level),
 
                     **self.kernel_extra_kwargs)
+            events.append(evt)
+
             assert mpoles_res is target_mpoles_view
 
-        if evt is not None:
-            mpoles.add_event(evt)
+        if events:
+            mpoles.add_event(events[-1])
 
-        return mpoles
+        return (mpoles, SumpyTimingFuture(self.queue, events))
 
     def eval_direct(self, target_boxes, source_box_starts,
             source_box_lists, src_weights):
@@ -394,6 +451,8 @@ class SumpyExpansionWrangler(object):
         kwargs.update(self.box_source_list_kwargs())
         kwargs.update(self.box_target_list_kwargs())
 
+        events = []
+
         evt, pot_res = self.code.p2p()(self.queue,
                 target_boxes=target_boxes,
                 source_box_starts=source_box_starts,
@@ -402,12 +461,13 @@ class SumpyExpansionWrangler(object):
                 result=pot,
 
                 **kwargs)
+        events.append(evt)
 
         for pot_i, pot_res_i in zip(pot, pot_res):
             assert pot_i is pot_res_i
             pot_i.add_event(evt)
 
-        return pot
+        return (pot, SumpyTimingFuture(self.queue, events))
 
     def multipole_to_local(self,
             level_start_target_box_nrs,
@@ -415,6 +475,8 @@ class SumpyExpansionWrangler(object):
             mpole_exps):
         local_exps = self.local_expansion_zeros()
 
+        events = []
+
         for lev in range(self.tree.nlevels):
             start, stop = level_start_target_box_nrs[lev:lev+2]
             if start == stop:
@@ -445,8 +507,9 @@ class SumpyExpansionWrangler(object):
                     tgt_rscale=level_to_rscale(self.tree, lev),
 
                     **self.kernel_extra_kwargs)
+            events.append(evt)
 
-        return local_exps
+        return (local_exps, SumpyTimingFuture(self.queue, events))
 
     def eval_multipoles(self,
             target_boxes_by_source_level, source_boxes_by_level, mpole_exps):
@@ -455,9 +518,10 @@ class SumpyExpansionWrangler(object):
         kwargs = self.kernel_extra_kwargs.copy()
         kwargs.update(self.box_target_list_kwargs())
 
+        events = []
+
         wait_for = mpole_exps.events
 
-        has_evt = False
         for isrc_level, ssn in enumerate(source_boxes_by_level):
             if len(target_boxes_by_source_level[isrc_level]) == 0:
                 continue
@@ -484,19 +548,18 @@ class SumpyExpansionWrangler(object):
                     wait_for=wait_for,
 
                     **kwargs)
+            events.append(evt)
 
-            has_evt = True
             wait_for = [evt]
 
             for pot_i, pot_res_i in zip(pot, pot_res):
                 assert pot_i is pot_res_i
 
-        if has_evt:
+        if events:
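+            # The kernel launches are chained via wait_for, so the last
+            # event's completion implies completion of all of them.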
             for pot_i in pot:
-                # Intentionally only adding the last event.
-                pot_i.add_event(evt)
+                pot_i.add_event(events[-1])
 
-        return pot
+        return (pot, SumpyTimingFuture(self.queue, events))
 
     def form_locals(self,
             level_start_target_or_target_parent_box_nrs,
@@ -506,6 +569,8 @@ class SumpyExpansionWrangler(object):
         kwargs = self.extra_kwargs.copy()
         kwargs.update(self.box_source_list_kwargs())
 
+        events = []
+
         for lev in range(self.tree.nlevels):
             start, stop = \
                     level_start_target_or_target_parent_box_nrs[lev:lev+2]
@@ -531,15 +596,19 @@ class SumpyExpansionWrangler(object):
                     rscale=level_to_rscale(self.tree, lev),
 
                     **kwargs)
+            events.append(evt)
 
             assert result is target_local_exps_view
 
-        return local_exps
+        return (local_exps, SumpyTimingFuture(self.queue, events))
 
     def refine_locals(self,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             local_exps):
+
+        events = []
+
         for target_lev in range(1, self.tree.nlevels):
             start, stop = level_start_target_or_target_parent_box_nrs[
                     target_lev:target_lev+2]
@@ -570,12 +639,13 @@ class SumpyExpansionWrangler(object):
                     tgt_rscale=level_to_rscale(self.tree, target_lev),
 
                     **self.kernel_extra_kwargs)
+            events.append(evt)
 
             assert local_exps_res is target_local_exps_view
 
         local_exps.add_event(evt)
 
-        return local_exps
+        return (local_exps, SumpyTimingFuture(self.queue, events))
 
     def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
         pot = self.output_zeros()
@@ -583,6 +653,8 @@ class SumpyExpansionWrangler(object):
         kwargs = self.kernel_extra_kwargs.copy()
         kwargs.update(self.box_target_list_kwargs())
 
+        events = []
+
         for lev in range(self.tree.nlevels):
             start, stop = level_start_target_box_nrs[lev:lev+2]
             if start == stop:
@@ -606,11 +678,12 @@ class SumpyExpansionWrangler(object):
                     rscale=level_to_rscale(self.tree, lev),
 
                     **kwargs)
+            events.append(evt)
 
             for pot_i, pot_res_i in zip(pot, pot_res):
                 assert pot_i is pot_res_i
 
-        return pot
+        return (pot, SumpyTimingFuture(self.queue, events))
 
     def finalize_potentials(self, potentials):
         return potentials
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index 40e82dc72af28830e548e42d1d8e1d6d9e77c9c1..12255a93f6d686c003f8f99f16f2f43db22ba037 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -269,12 +269,12 @@ class Kernel(object):
         return expr
 
     def get_global_scaling_const(self):
-        """Return a global scaling constant of the kernel.
-         Typically, this ensures that the kernel is scaled so that
-         :math:`\mathcal L(G)(x)=C\delta(x)` with a constant of 1, where
-         :math:`\mathcal L` is the PDE operator associated with the kernel.
-         Not to be confused with *rscale*, which keeps expansion
-         coefficients benignly scaled.
+        r"""Return a global scaling constant of the kernel.
+        Typically, this ensures that the kernel is scaled so that
+        :math:`\mathcal L(G)(x)=C\delta(x)` with constant :math:`C = 1`, where
+        :math:`\mathcal L` is the PDE operator associated with the kernel.
+        Not to be confused with *rscale*, which keeps expansion
+        coefficients benignly scaled.
         """
         raise NotImplementedError
 
@@ -302,7 +302,7 @@ class ExpressionKernel(Kernel):
 
     def __init__(self, dim, expression, global_scaling_const,
             is_complex_valued):
-        """
+        r"""
         :arg expression: A :mod:`pymbolic` expression depending on
             variables *d_1* through *d_N* where *N* equals *dim*.
             (These variables match what is returned from
diff --git a/sumpy/p2e.py b/sumpy/p2e.py
index e9037a3f55396d57103a06970de3a1418adf00f3..daa7d93dc5b393d358424031500a4915e2c2dd9f 100644
--- a/sumpy/p2e.py
+++ b/sumpy/p2e.py
@@ -27,6 +27,7 @@ from six.moves import range
 
 import numpy as np
 import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 
 from sumpy.tools import KernelCacheWrapper
 
@@ -159,7 +160,8 @@ class P2EFromSingleBox(P2EBase):
                 assumptions="nsrc_boxes>=1",
                 silenced_warnings="write_race(write_expn*)",
                 default_offset=lp.auto,
-                fixed_parameters=dict(dim=self.dim))
+                fixed_parameters=dict(dim=self.dim),
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
         loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
@@ -265,7 +267,8 @@ class P2EFromCSR(P2EBase):
                 assumptions="ntgt_boxes>=1",
                 silenced_warnings="write_race(write_expn*)",
                 default_offset=lp.auto,
-                fixed_parameters=dict(dim=self.dim))
+                fixed_parameters=dict(dim=self.dim),
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
         loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index d8586cc228e47d6aa9f12d7c7c436726ad28bc2e..a0e89f242f958bbaea411090582e885d942b2a1d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -30,6 +30,7 @@ from six.moves import range
 
 import numpy as np
 import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
 from sumpy.tools import KernelComputation, KernelCacheWrapper
@@ -40,10 +41,10 @@ __doc__ = """
 Particle-to-particle
 --------------------
 
-.. autoclass:: P2PComputationBase
-.. autoclass:: SingleSrcTgtListP2PBase
+.. autoclass:: P2PBase
 .. autoclass:: P2P
 .. autoclass:: P2PMatrixGenerator
+.. autoclass:: P2PMatrixBlockGenerator
 .. autoclass:: P2PFromCSR
 
 """
@@ -54,7 +55,7 @@ Particle-to-particle
 
 # {{{ p2p base class
 
-class P2PComputationBase(KernelComputation, KernelCacheWrapper):
+class P2PBase(KernelComputation, KernelCacheWrapper):
     def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
             value_dtypes=None,
             options=[], name=None, device=None):
@@ -74,6 +75,10 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper):
         from pytools import single_valued
         self.dim = single_valued(knl.dim for knl in self.kernels)
 
+    def get_cache_key(self):
+        return (type(self).__name__, tuple(self.kernels), self.exclude_self,
+                tuple(self.strength_usage), tuple(self.value_dtypes))
+
     def get_loopy_insns_and_result_names(self):
         from sumpy.symbolic import make_sym_vector
         dvec = make_sym_vector("d", self.dim)
@@ -104,93 +109,40 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper):
 
         return loopy_insns, result_names
 
-    def get_cache_key(self):
-        return (type(self).__name__, tuple(self.kernels), self.exclude_self,
-                tuple(self.strength_usage), tuple(self.value_dtypes))
-
-# }}}
-
-
-# {{{ P2P with list of sources and list of targets
-
-class SingleSrcTgtListP2PBase(P2PComputationBase):
-    def get_src_tgt_arguments(self):
-        return [
-                lp.GlobalArg("sources", None,
-                    shape=(self.dim, "nsources")),
-                lp.GlobalArg("targets", None,
-                   shape=(self.dim, "ntargets")),
-                lp.ValueArg("nsources", np.int32),
-                lp.ValueArg("ntargets", np.int32)
-                ]
-
-    def get_domains(self):
-        return [
-                "{[itgt]: 0 <= itgt < ntargets}",
-                "{[isrc]: 0 <= isrc < nsources}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return ["for itgt, isrc"]
-
-    def get_loop_end(self):
-        return ["end"]
-
-    def get_assumptions(self):
-        return "nsources>=1 and ntargets>=1"
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return var("strength").index((self.strength_usage[kernel_idx], isrc))
 
-    def get_kernel(self):
-        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+    def get_kernel_exprs(self, result_names):
 
         isrc_sym = var("isrc")
-        exprs = [
-                var(name)
-                * self.get_strength_or_not(isrc_sym, i)
-                for i, name in enumerate(result_names)]
+        exprs = [var(name) * self.get_strength_or_not(isrc_sym, i)
+                 for i, name in enumerate(result_names)]
 
         if self.exclude_self:
             from pymbolic.primitives import If, Variable
             exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
 
-        from sumpy.tools import gather_loopy_source_arguments
-        arguments = (
-                self.get_src_tgt_arguments()
-                + self.get_input_and_output_arguments()
-                + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
-                    if self.exclude_self else [])
-                + gather_loopy_source_arguments(self.kernels))
-
-        loopy_knl = lp.make_kernel(
-                self.get_domains(),
-                self.get_kernel_scaling_assignments()
-                + self.get_loop_begin()
-                + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
-                + ["<> is_self = (isrc == target_to_source[itgt])"
-                    if self.exclude_self else ""]
-                + loopy_insns
-                + [
-                    lp.Assignment(id=None,
-                        assignee="pair_result_%d" % i, expression=expr,
-                        temp_var_type=lp.auto)
-                    for i, expr in enumerate(exprs)
-                ]
-                + self.get_loop_end()
-                + self.get_result_store_instructions(),
-                arguments,
-                assumptions=self.get_assumptions(),
-                name=self.name,
-                fixed_parameters=dict(
-                    dim=self.dim,
-                    nstrengths=self.strength_count,
-                    nresults=len(self.kernels)))
+        return [lp.Assignment(id=None,
+                    assignee="pair_result_%d" % i, expression=expr,
+                    temp_var_type=lp.auto)
+                for i, expr in enumerate(exprs)]
 
-        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-
-        for knl in self.kernels:
-            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+    def get_default_src_tgt_arguments(self):
+        from sumpy.tools import gather_loopy_source_arguments
+        return ([
+                lp.GlobalArg("sources", None,
+                    shape=(self.dim, "nsources")),
+                lp.GlobalArg("targets", None,
+                   shape=(self.dim, "ntargets")),
+                lp.ValueArg("nsources", None),
+                lp.ValueArg("ntargets", None)] +
+                ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))]
+                    if self.exclude_self else []) +
+                gather_loopy_source_arguments(self.kernels))
 
-        return loopy_knl
+    def get_kernel(self):
+        raise NotImplementedError
 
     def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
         # FIXME
@@ -210,26 +162,56 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
 
 # {{{ P2P point-interaction calculation
 
-class P2P(SingleSrcTgtListP2PBase):
-    default_name = "p2p_apply"
+class P2P(P2PBase):
+    """Direct applier for P2P interactions."""
 
-    def get_strength_or_not(self, isrc, kernel_idx):
-        return var("strength").index((self.strength_usage[kernel_idx], isrc))
+    default_name = "p2p_apply"
 
-    def get_input_and_output_arguments(self):
-        return [
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [
                 lp.GlobalArg("strength", None,
-                    shape="nstrengths,nsources", dim_tags="sep,C"),
+                    shape="nstrengths, nsources", dim_tags="sep,C"),
                 lp.GlobalArg("result", None,
-                    shape="nresults,ntargets", dim_tags="sep,C")
-                ]
+                    shape="nresults, ntargets", dim_tags="sep,C")
+            ])
+
+        loopy_knl = lp.make_kernel(["""
+            {[itgt, isrc, idim]: \
+                0 <= itgt < ntargets and \
+                0 <= isrc < nsources and \
+                0 <= idim < dim}
+            """],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+            + ["<> is_self = (isrc == target_to_source[itgt])"
+                if self.exclude_self else ""]
+            + loopy_insns + kernel_exprs
+            + ["""
+                result[{i}, itgt] = knl_{i}_scaling * \
+                    simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}}
+               """.format(i=iknl)
+               for iknl in range(len(self.kernels))]
+            + ["end"],
+            arguments,
+            assumptions="nsources>=1 and ntargets>=1",
+            name=self.name,
+            fixed_parameters=dict(
+                dim=self.dim,
+                nstrengths=self.strength_count,
+                nresults=len(self.kernels)),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-    def get_result_store_instructions(self):
-        return ["""
-                result[{i}, itgt] = knl_{i}_scaling \
-                    * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}}
-                """.format(i=iknl)
-                for iknl in range(len(self.kernels))]
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, strength, **kwargs):
         from pytools.obj_array import is_obj_array
@@ -247,26 +229,53 @@ class P2P(SingleSrcTgtListP2PBase):
 
 # {{{ P2P matrix writer
 
-class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
+class P2PMatrixGenerator(P2PBase):
+    """Generator for P2P interaction matrix entries."""
+
     default_name = "p2p_matrix"
 
     def get_strength_or_not(self, isrc, kernel_idx):
         return 1
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
-
-    def get_result_store_instructions(self):
-        return [
-                """
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [lp.GlobalArg("result_%d" % i, dtype,
+                shape="ntargets,nsources")
+             for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel(["""
+            {[itgt, isrc, idim]: \
+                0 <= itgt < ntargets and \
+                0 <= isrc < nsources and \
+                0 <= idim < dim}
+            """],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"]
+            + ["<> is_self = (isrc == target_to_source[itgt])"
+                if self.exclude_self else ""]
+            + loopy_insns + kernel_exprs
+            + ["""
                 result_{i}[itgt, isrc] = \
                     knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}}
                 """.format(i=iknl)
-                for iknl in range(len(self.kernels))
-                ]
+                for iknl in range(len(self.kernels))]
+            + ["end"],
+            arguments,
+            assumptions="nsources>=1 and ntargets>=1",
+            name=self.name,
+            fixed_parameters=dict(dim=self.dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, **kwargs):
         from pytools.obj_array import is_obj_array
@@ -281,74 +290,67 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
 # }}}
 
 
-# {{{
+# {{{ P2P matrix block writer
 
-class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
-    default_name = "p2p_block"
+class P2PMatrixBlockGenerator(P2PBase):
+    """Generator for a subset of P2P interaction matrix entries.
 
-    def get_src_tgt_arguments(self):
-        return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \
-            + [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", np.int32),
-                lp.ValueArg("ntgtindices", np.int32),
-                lp.ValueArg("nranges", None)
-            ]
-
-    def get_domains(self):
-        return [
-                "{[irange]: 0 <= irange < nranges}",
-                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return [
-                """
-                for irange
-                    <> tgtstart = tgtranges[irange]
-                    <> tgtend = tgtranges[irange + 1]
-                    <> tgt_length = tgtend - tgtstart
-                    <> srcstart = srcranges[irange]
-                    <> srcend = srcranges[irange + 1]
-                    <> src_length = srcend - srcstart
-                    for j, k
-                        <> itgt = tgtindices[tgtstart + j]
-                        <> isrc = srcindices[srcstart + k]
-                """
-                ]
-
-    def get_loop_end(self):
-        return [
-                """
-                    end
-                end
-                """
-                ]
+    .. automethod:: __call__
+    """
+
+    default_name = "p2p_block"
 
     def get_strength_or_not(self, isrc, kernel_idx):
         return 1
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
-
-    def get_result_store_instructions(self):
-        return [
-                """
-                result_{i}[tgtstart + j, srcstart + k] = \
-                        knl_{i}_scaling * pair_result_{i} {{inames=irange}}
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
+            ] +
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+             for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel(
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
+            self.get_kernel_scaling_assignments()
+            + ["""
+                for imat
+                    <> d[idim] = targets[idim, tgtindices[imat]] - \
+                                 sources[idim, srcindices[imat]]
+            """]
+            + ["""
+                    <> is_self = (srcindices[imat] ==
+                                  target_to_source[tgtindices[imat]])
+                """ if self.exclude_self else ""]
+            + loopy_insns + kernel_exprs
+            + ["""
+                    result_{i}[imat] = \
+                        knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_p2p}}
                 """.format(i=iknl)
-                for iknl in range(len(self.kernels))
-                ]
+                for iknl in range(len(self.kernels))]
+            + ["end"],
+            arguments,
+            assumptions="nresult>=1",
+            silenced_warnings="write_race(write_p2p*)",
+            name=self.name,
+            fixed_parameters=dict(dim=self.dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
 
-    def get_assumptions(self):
-        return "nranges>=1"
+        for knl in self.kernels:
+            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
         # FIXME
@@ -359,11 +361,31 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
         if targets_is_obj_array:
             knl = lp.tag_array_axes(knl, "targets", "sep,C")
 
-        knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0")
+        knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
         return knl
 
-    def __call__(self, queue, targets, sources, tgtindices, srcindices,
-            tgtranges, srcranges, **kwargs):
+    def __call__(self, queue, targets, sources, index_set, **kwargs):
+        """Construct a set of blocks of the full P2P interaction matrix.
+
+        The blocks are returned as one-dimensional arrays, for performance
+        and storage reasons. If the two-dimensional form is desired, it can
+        be obtained using the information in the ``index_set`` for a block
+        :math:`i` in the following way:
+
+        .. code-block:: python
+
+            blkranges = index_set.linear_ranges()
+            blkshape = index_set.block_shape(i)
+
+            block2d = result[blkranges[i]:blkranges[i + 1]].reshape(*blkshape)
+
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
+            to define the blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+            target-source pairs described by ``index_set``.
+        """
         from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
                 targets_is_obj_array=(
@@ -371,111 +393,104 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
                 sources_is_obj_array=(
                     is_obj_array(sources) or isinstance(sources, (tuple, list))))
 
-        return knl(queue, targets=targets, sources=sources,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+        return knl(queue,
+                   targets=targets,
+                   sources=sources,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
 
 # {{{ P2P from CSR-like interaction list
 
-class P2PFromCSR(P2PComputationBase):
+class P2PFromCSR(P2PBase):
     default_name = "p2p_from_csr"
 
     def get_kernel(self):
         loopy_insns, result_names = self.get_loopy_insns_and_result_names()
-
-        from pymbolic import var
-        exprs = [
-                var(name)
-                * var("strength").index((self.strength_usage[i], var("isrc")))
-                for i, name in enumerate(result_names)]
-
-        if self.exclude_self:
-            from pymbolic.primitives import If, Variable
-            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]
-
-        from sumpy.tools import gather_loopy_source_arguments
-        loopy_knl = lp.make_kernel(
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
             [
-                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
-                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
-                "{[itgt,isrc,idim]: \
-                        itgt_start<=itgt<itgt_end and \
-                        isrc_start<=isrc<isrc_end and \
-                        0<=idim<dim }",
-                ],
+                lp.GlobalArg("box_target_starts",
+                    None, shape=None),
+                lp.GlobalArg("box_target_counts_nonchild",
+                    None, shape=None),
+                lp.GlobalArg("box_source_starts",
+                    None, shape=None),
+                lp.GlobalArg("box_source_counts_nonchild",
+                    None, shape=None),
+                lp.GlobalArg("source_box_starts",
+                    None, shape=None),
+                lp.GlobalArg("source_box_lists",
+                    None, shape=None),
+                lp.GlobalArg("strength", None,
+                    shape="nstrengths, nsources", dim_tags="sep,C"),
+                lp.GlobalArg("result", None,
+                    shape="nkernels, ntargets", dim_tags="sep,C"),
+                "..."
+            ])
+
+        loopy_knl = lp.make_kernel([
+            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
+            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
+            "{[itgt, isrc, idim]: \
+                itgt_start <= itgt < itgt_end and \
+                isrc_start <= isrc < isrc_end and \
+                0 <= idim < dim}",
+            ],
             self.get_kernel_scaling_assignments()
-            + [
-                """
+            + ["""
                 for itgt_box
-                    <> tgt_ibox = target_boxes[itgt_box]
-                    <> itgt_start = box_target_starts[tgt_ibox]
-                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]
-
-                    <> isrc_box_start = source_box_starts[itgt_box]
-                    <> isrc_box_end = source_box_starts[itgt_box+1]
-
-                    for isrc_box
-                        <> src_ibox = source_box_lists[isrc_box]
-                        <> isrc_start = box_source_starts[src_ibox]
-                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
-
-                        for itgt
-                            for isrc
-                                <> d[idim] = \
-                                        targets[idim,itgt] - sources[idim,isrc] \
-                                        {dup=idim}
-                                """] + ["""
-                                <> is_self = (isrc == target_to_source[itgt])
-                                """ if self.exclude_self else ""] + [
-                                ] + loopy_insns + [
-                                lp.Assignment(id=None,
-                                    assignee="pair_result_%d" % i, expression=expr,
-                                    temp_var_type=lp.auto)
-                                for i, expr in enumerate(exprs)
-                                ] + [
-                                """
-                            end
-                            """] + ["""
-                            result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
-                                knl_KNLIDX_scaling \
-                                * simul_reduce(sum, isrc, pair_result_KNLIDX)
-                            """.replace("KNLIDX", str(iknl))
-                            for iknl in range(len(exprs))] + ["""
-                        end
+                <> tgt_ibox = target_boxes[itgt_box]
+                <> itgt_start = box_target_starts[tgt_ibox]
+                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]
+
+                <> isrc_box_start = source_box_starts[itgt_box]
+                <> isrc_box_end = source_box_starts[itgt_box+1]
+
+                for isrc_box
+                    <> src_ibox = source_box_lists[isrc_box]
+                    <> isrc_start = box_source_starts[src_ibox]
+                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]
+
+                    for itgt
+                    for isrc
+                        <> d[idim] = \
+                            targets[idim, itgt] - sources[idim, isrc] {dup=idim}
+            """] + ["""
+                        <> is_self = (isrc == target_to_source[itgt])
+                    """ if self.exclude_self else ""]
+            + loopy_insns + kernel_exprs
+            + ["    end"]
+            + ["""
+                    result[{i}, itgt] = result[{i}, itgt] + \
+                        knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
+                        {{id_prefix=write_csr}}
+                """.format(i=iknl)
+                for iknl in range(len(self.kernels))]
+            + ["""
                     end
                 end
-                """],
-            [
-                lp.GlobalArg("box_target_starts,box_target_counts_nonchild,"
-                    "box_source_starts,box_source_counts_nonchild,",
-                    None, shape=None),
-                lp.GlobalArg("source_box_starts, source_box_lists,",
-                    None, shape=None),
-                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
-                lp.GlobalArg("result", None,
-                    shape="nkernels,ntargets", dim_tags="sep,c"),
-                lp.GlobalArg("targets", None,
-                    shape="dim,ntargets", dim_tags="sep,c"),
-                lp.GlobalArg("sources", None,
-                    shape="dim,nsources", dim_tags="sep,c"),
-                lp.ValueArg("nsources", np.int32),
-                lp.ValueArg("ntargets", np.int32),
-                "...",
-            ] + (
-                [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))]
-                if self.exclude_self else []
-            ) + gather_loopy_source_arguments(self.kernels),
-            name=self.name, assumptions="ntgt_boxes>=1",
+                end
+            """],
+            arguments,
+            assumptions="ntgt_boxes>=1",
+            name=self.name,
+            silenced_warnings="write_race(write_csr*)",
             fixed_parameters=dict(
                 dim=self.dim,
                 nstrengths=self.strength_count,
-                nkernels=len(self.kernels)))
+                nkernels=len(self.kernels)),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
 
         loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
-        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")
+        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
+        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")
 
         for knl in self.kernels:
             loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
@@ -485,12 +500,14 @@ class P2PFromCSR(P2PComputationBase):
     def get_optimized_kernel(self):
         # FIXME
         knl = self.get_kernel()
+
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
         else:
             knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0")
+
         return knl
 
     def __call__(self, queue, **kwargs):
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 0f617e62990c4aad8f187b03d22e2ac9c72b46fd..a0814e01c412727a908ec72758e62014b3278708 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 
 
 import six
-from six.moves import range, zip
+from six.moves import range
 import numpy as np
 import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
@@ -49,6 +49,7 @@ QBX for Layer Potentials
 .. autoclass:: LayerPotentialBase
 .. autoclass:: LayerPotential
 .. autoclass:: LayerPotentialMatrixGenerator
+.. autoclass:: LayerPotentialMatrixBlockGenerator
 
 """
 
@@ -87,7 +88,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec):
 class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
     def __init__(self, ctx, expansions, strength_usage=None,
             value_dtypes=None,
-            options=[], name="layerpot", device=None):
+            options=[], name=None, device=None):
         KernelComputation.__init__(self, ctx, expansions, strength_usage,
                 value_dtypes,
                 name, options, device)
@@ -103,46 +104,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
     def expansions(self):
         return self.kernels
 
-    def get_compute_a_and_b_vecs(self):
-        return """
-            <> a[idim] = center[idim,itgt] - src[idim,isrc] {dup=idim}
-            <> b[idim] = tgt[idim,itgt] - center[idim,itgt] {dup=idim}
-            <> rscale = expansion_radii[itgt]
-            """
-
-    def get_src_tgt_arguments(self):
-        return [
-                lp.GlobalArg("src", None,
-                    shape=(self.dim, "nsources"), order="C"),
-                lp.GlobalArg("tgt", None,
-                    shape=(self.dim, "ntargets"), order="C"),
-                lp.GlobalArg("center", None,
-                    shape=(self.dim, "ntargets"), order="C"),
-                lp.GlobalArg("expansion_radii", None, shape="ntargets"),
-                lp.ValueArg("nsources", np.int32),
-                lp.ValueArg("ntargets", np.int32),
-                ]
-
-    def get_domains(self):
-        return [
-                "{[itgt]: 0 <= itgt < ntargets}",
-                "{[isrc]: 0 <= isrc < nsources}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return ["for itgt, isrc"]
-
-    def get_loop_end(self):
-        return ["end"]
-
-    def get_assumptions(self):
-        return "nsources>=1 and ntargets>=1"
-
-    @memoize_method
-    def get_kernel(self):
+    def get_loopy_insns_and_result_names(self):
         from sumpy.symbolic import make_sym_vector
-
         avec = make_sym_vector("a", self.dim)
         bvec = make_sym_vector("b", self.dim)
 
@@ -152,7 +115,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
         logger.info("compute expansion expressions: start")
 
         result_names = [expand(i, sac, expn, avec, bvec)
-                for i, expn in enumerate(self.expansions)]
+                        for i, expn in enumerate(self.expansions)]
 
         logger.info("compute expansion expressions: done")
 
@@ -168,55 +131,43 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 complex_dtype=np.complex128  # FIXME
                 )
 
-        isrc_sym = var("isrc")
-        exprs = [
-                var(name)
-                * self.get_strength_or_not(isrc_sym, i)
-                for i, name in enumerate(result_names)]
+        return loopy_insns, result_names
 
-        from sumpy.tools import gather_loopy_source_arguments
-        arguments = (
-                self.get_src_tgt_arguments()
-                + self.get_input_and_output_arguments()
-                + gather_loopy_source_arguments(self.kernels))
-
-        loopy_knl = lp.make_kernel(
-                self.get_domains(),
-                self.get_kernel_scaling_assignments()
-                + self.get_loop_begin()
-                + [self.get_compute_a_and_b_vecs()]
-                + loopy_insns
-                + [
-                    lp.Assignment(id=None,
-                        assignee="pair_result_%d" % i, expression=expr,
-                        temp_var_type=lp.auto)
-                    for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
-                ]
-                + self.get_loop_end()
-                + self.get_result_store_instructions(),
-                arguments,
-                name=self.name,
-                assumptions=self.get_assumptions(),
-                default_offset=lp.auto,
-                silenced_warnings="write_race(write_lpot*)",
-                fixed_parameters=dict(dim=self.dim),
-                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+    def get_strength_or_not(self, isrc, kernel_idx):
+        return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc)
 
-        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+    def get_kernel_exprs(self, result_names):
+        isrc_sym = var("isrc")
+        exprs = [var(name) * self.get_strength_or_not(isrc_sym, i)
+                 for i, name in enumerate(result_names)]
 
-        for expn in self.expansions:
-            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+        return [lp.Assignment(id=None,
+                    assignee="pair_result_%d" % i, expression=expr,
+                    temp_var_type=lp.auto)
+                for i, expr in enumerate(exprs)]
 
-        loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")
+    def get_default_src_tgt_arguments(self):
+        from sumpy.tools import gather_loopy_source_arguments
+        return ([
+                lp.GlobalArg("src", None,
+                    shape=(self.dim, "nsources"), order="C"),
+                lp.GlobalArg("tgt", None,
+                    shape=(self.dim, "ntargets"), order="C"),
+                lp.GlobalArg("center", None,
+                    shape=(self.dim, "ntargets"), dim_tags="sep,C"),
+                lp.GlobalArg("expansion_radii",
+                    None, shape="ntargets"),
+                lp.ValueArg("nsources", None),
+                lp.ValueArg("ntargets", None)] +
+                gather_loopy_source_arguments(self.kernels))
 
-        return loopy_knl
+    def get_kernel(self):
+        raise NotImplementedError
 
-    @memoize_method
     def get_optimized_kernel(self):
         # FIXME specialize/tune for GPU/CPU
         loopy_knl = self.get_kernel()
 
-        # FIXME: how to tune for blocks?
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
@@ -238,32 +189,58 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 # {{{ direct applier
 
 class LayerPotential(LayerPotentialBase):
-    """Direct applier for the layer potential."""
+    """Direct applier for the layer potential.
 
-    default_name = "qbx_apply"
+    .. automethod:: __call__
+    """
 
-    def get_strength_or_not(self, isrc, kernel_idx):
-        return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc)
+    default_name = "qbx_apply"
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C")
-                for i in range(self.strength_count)
-                ]+[
-                lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C")
-                for i in range(len(self.kernels))
-                ]
-
-    def get_result_store_instructions(self):
-        return [
-                """
-                result_{i}[itgt] = \
-                        knl_{i}_scaling*simul_reduce(
-                            sum, isrc, pair_result_{i}) \
-                                    {{id_prefix=write_lpot,inames=itgt}}
+    @memoize_method
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [lp.GlobalArg("strength_%d" % i,
+                None, shape="nsources", order="C")
+            for i in range(self.strength_count)] +
+            [lp.GlobalArg("result_%d" % i,
+                None, shape="ntargets", order="C")
+            for i in range(len(self.kernels))])
+
+        loopy_knl = lp.make_kernel(["""
+            {[itgt, isrc, idim]: \
+                0 <= itgt < ntargets and \
+                0 <= isrc < nsources and \
+                0 <= idim < dim}
+            """],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"]
+            + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"]
+            + ["<> rscale = expansion_radii[itgt]"]
+            + loopy_insns + kernel_exprs
+            + ["""
+                result_{i}[itgt] = knl_{i}_scaling * \
+                    simul_reduce(sum, isrc, pair_result_{i}) \
+                        {{id_prefix=write_lpot,inames=itgt}}
                 """.format(i=iknl)
-                for iknl in range(len(self.expansions))
-                ]
+                for iknl in range(len(self.expansions))]
+            + ["end"],
+            arguments,
+            name=self.name,
+            assumptions="ntargets>=1 and nsources>=1",
+            default_offset=lp.auto,
+            silenced_warnings="write_race(write_lpot*)",
+            fixed_parameters=dict(dim=self.dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        for expn in self.expansions:
+            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, strengths, expansion_radii,
             **kwargs):
@@ -293,23 +270,50 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
     def get_strength_or_not(self, isrc, kernel_idx):
         return 1
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
-
-    def get_result_store_instructions(self):
-        return [
-                """
-                result_KNLIDX[itgt, isrc] = \
-                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=isrc:itgt}
-                """.replace("KNLIDX", str(iknl))
-                for iknl in range(len(self.expansions))
-                ]
+    @memoize_method
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [lp.GlobalArg("result_%d" % i,
+                dtype, shape="ntargets, nsources", order="C")
+             for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel(["""
+            {[itgt, isrc, idim]: \
+                0 <= itgt < ntargets and \
+                0 <= isrc < nsources and \
+                0 <= idim < dim}
+            """],
+            self.get_kernel_scaling_assignments()
+            + ["for itgt, isrc"]
+            + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"]
+            + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"]
+            + ["<> rscale = expansion_radii[itgt]"]
+            + loopy_insns + kernel_exprs
+            + ["""
+                result_{i}[itgt, isrc] = \
+                    knl_{i}_scaling * pair_result_{i} \
+                        {{inames=isrc:itgt}}
+                """.format(i=iknl)
+                for iknl in range(len(self.expansions))]
+            + ["end"],
+            arguments,
+            name=self.name,
+            assumptions="ntargets>=1 and nsources>=1",
+            default_offset=lp.auto,
+            fixed_parameters=dict(dim=self.dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        for expn in self.expansions:
+            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs):
-        knl = self.get_optimized_kernel()
+        knl = self.get_cached_optimized_kernel()
 
         return knl(queue, src=sources, tgt=targets, center=centers,
                 expansion_radii=expansion_radii, **kwargs)
@@ -320,89 +324,97 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
 # {{{
 
 class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
-    default_name = "qbx_block"
+    """Generator for a subset of the layer potential matrix entries.
 
-    def get_src_tgt_arguments(self):
-        return LayerPotentialBase.get_src_tgt_arguments(self) \
-            + [
-                lp.GlobalArg("srcindices", None, shape="nsrcindices"),
-                lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
-                lp.GlobalArg("srcranges", None, shape="nranges + 1"),
-                lp.GlobalArg("tgtranges", None, shape="nranges + 1"),
-                lp.ValueArg("nsrcindices", np.int32),
-                lp.ValueArg("ntgtindices", np.int32),
-                lp.ValueArg("nranges", None)
-            ]
-
-    def get_domains(self):
-        # FIXME: this doesn't work when separating j and k
-        return [
-                "{[irange]: 0 <= irange < nranges}",
-                "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
-                "{[idim]: 0 <= idim < dim}"
-                ]
-
-    def get_loop_begin(self):
-        return [
-                """
-                for irange
-                    <> tgtstart = tgtranges[irange]
-                    <> tgtend = tgtranges[irange + 1]
-                    <> tgt_length = tgtend - tgtstart
-                    <> srcstart = srcranges[irange]
-                    <> srcend = srcranges[irange + 1]
-                    <> src_length = srcend - srcstart
-                    for j, k
-                        <> itgt = tgtindices[tgtstart + j]
-                        <> isrc = srcindices[srcstart + k]
-                """
-                ]
-
-    def get_loop_end(self):
-        return [
-                """
-                    end
-                end
-                """
-                ]
+    .. automethod:: __call__
+    """
+
+    default_name = "qbx_block"
 
     def get_strength_or_not(self, isrc, kernel_idx):
         return 1
 
-    def get_input_and_output_arguments(self):
-        return [
-                lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
-                for i, dtype in enumerate(self.value_dtypes)
-                ]
+    @memoize_method
+    def get_kernel(self):
+        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
+        kernel_exprs = self.get_kernel_exprs(result_names)
+        arguments = (
+            self.get_default_src_tgt_arguments() +
+            [
+                lp.GlobalArg("srcindices", None, shape="nresult"),
+                lp.GlobalArg("tgtindices", None, shape="nresult"),
+                lp.ValueArg("nresult", None)
+            ] +
+            [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
+             for i, dtype in enumerate(self.value_dtypes)])
+
+        loopy_knl = lp.make_kernel([
+            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}"
+            ],
+            self.get_kernel_scaling_assignments()
+            + ["""
+                for imat
+                    <> itgt = tgtindices[imat]
+                    <> isrc = srcindices[imat]
+
+                    <> a[idim] = center[idim, tgtindices[imat]] - \
+                                 src[idim, srcindices[imat]] {dup=idim}
+                    <> b[idim] = tgt[idim, tgtindices[imat]] - \
+                                 center[idim, tgtindices[imat]] {dup=idim}
+                    <> rscale = expansion_radii[tgtindices[imat]]
+            """]
+            + loopy_insns + kernel_exprs
+            + ["""
+                    result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \
+                            {{id_prefix=write_lpot}}
+                """.format(i=iknl)
+                for iknl in range(len(self.expansions))]
+            + ["end"],
+            arguments,
+            name=self.name,
+            assumptions="nresult>=1",
+            default_offset=lp.auto,
+            silenced_warnings="write_race(write_lpot*)",
+            fixed_parameters=dict(dim=self.dim),
+            lang_version=MOST_RECENT_LANGUAGE_VERSION)
 
-    def get_result_store_instructions(self):
-        return [
-                """
-                result_KNLIDX[tgtstart + j, srcstart + k] = \
-                        knl_KNLIDX_scaling*pair_result_KNLIDX  {inames=irange}
-                """.replace("KNLIDX", str(iknl))
-                for iknl in range(len(self.expansions))
-                ]
+        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
+        loopy_knl = lp.add_dtypes(loopy_knl,
+            dict(nsources=np.int32, ntargets=np.int32))
 
-    def get_assumptions(self):
-        return "nranges>=1"
+        for expn in self.expansions:
+            loopy_knl = expn.prepare_loopy_kernel(loopy_knl)
+
+        return loopy_knl
 
-    @memoize_method
     def get_optimized_kernel(self):
-        # FIXME
         loopy_knl = self.get_kernel()
 
-        loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+        loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0")
         return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, expansion_radii,
-            tgtindices, srcindices, tgtranges, srcranges, **kwargs):
-        knl = self.get_optimized_kernel()
+                 index_set, **kwargs):
+        """
+        :arg targets: target point coordinates.
+        :arg sources: source point coordinates.
+        :arg centers: QBX target expansion centers.
+        :arg expansion_radii: radii for each expansion center.
+        :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used
+            to define the blocks.
+        :return: a tuple of one-dimensional arrays of kernel evaluations at
+            target-source pairs described by ``index_set``.
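+
+        A sketch of a typical invocation, where ``blk_gen`` is an
+        instance of this class:
+
+        .. code-block:: python
+
+            _, (blk,) = blk_gen(queue, targets, sources, centers,
+                                expansion_radii, index_set)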
+        """
 
-        return knl(queue, src=sources, tgt=targets, center=centers,
-                expansion_radii=expansion_radii,
-                tgtindices=tgtindices, srcindices=srcindices,
-                tgtranges=tgtranges, srcranges=srcranges, **kwargs)
+        knl = self.get_cached_optimized_kernel()
+
+        return knl(queue,
+                   src=sources,
+                   tgt=targets,
+                   center=centers,
+                   expansion_radii=expansion_radii,
+                   tgtindices=index_set.linear_row_indices,
+                   srcindices=index_set.linear_col_indices, **kwargs)
 
 # }}}
 
diff --git a/sumpy/tools.py b/sumpy/tools.py
index dd98325bef11bc122c34cf9a34114e5ff19673f0..c5438320517642aad4ef7acb590511e42290d3f1 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -1,6 +1,9 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2018 Alexandru Fikl
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,10 +27,16 @@ THE SOFTWARE.
 
 import six
 from six.moves import range, zip
-from pytools import memoize_method
+from pytools import memoize_method, memoize_in
 import numpy as np
 import sumpy.symbolic as sym
 
+import pyopencl as cl
+import pyopencl.array  # noqa
+
+import loopy as lp
+from loopy.version import MOST_RECENT_LANGUAGE_VERSION
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -245,6 +254,253 @@ class KernelComputation(object):
 # }}}
 
 
+# {{{ block index ranges
+
+
+def _to_host(x, queue=None):
+    if isinstance(x, cl.array.Array):
+        queue = queue or x.queue
+        return x.get(queue)
+    return x
+
+
+class BlockIndexRanges(object):
+    """Convenience class for working with blocks of a global array.
+
+    .. attribute:: indices
+
+        An array of (not necessarily contiguous or increasing) integers
+        denoting indices into a global array. The individual blocks are
+        delimited using :attr:`ranges`.
+
+    .. attribute:: ranges
+
+        An array of nondecreasing integers used to index into
+        :attr:`indices`. Block :math:`i` can be retrieved as
+        ``indices[ranges[i]:ranges[i + 1]]``.
+
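+        For illustration, a small host-side example (with made-up data):
+
+        .. code-block:: python
+
+            indices = np.array([3, 1, 4, 0, 2])
+            ranges = np.array([0, 2, 5])
+            # block 0 is indices[0:2] -> array([3, 1])
+            # block 1 is indices[2:5] -> array([4, 0, 2])
+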
+    .. automethod:: block_shape
+    .. automethod:: get
+    .. automethod:: take
+    """
+
+    def __init__(self, cl_context, indices, ranges):
+        self.cl_context = cl_context
+        self.indices = indices
+        self.ranges = ranges
+
+    @property
+    @memoize_method
+    def _ranges(self):
+        with cl.CommandQueue(self.cl_context) as queue:
+            return _to_host(self.ranges, queue=queue)
+
+    @property
+    def nblocks(self):
+        return self.ranges.shape[0] - 1
+
+    def block_shape(self, i):
+        return (self._ranges[i + 1] - self._ranges[i],)
+
+    def block_indices(self, i):
+        return self.indices[self._ranges[i]:self._ranges[i + 1]]
+
+    def get(self, queue=None):
+        return BlockIndexRanges(self.cl_context,
+                                _to_host(self.indices, queue=queue),
+                                _to_host(self.ranges, queue=queue))
+
+    def take(self, x, i):
+        """Return the subset of a global array `x` that is defined by
+        the :attr:`indices` in block :math:`i`.
+        """
+
+        return x[self.block_indices(i)]
+
+
+class MatrixBlockIndexRanges(object):
+    """Keep track of different ways to index into matrix blocks.
+
+    .. attribute:: row
+
+        A :class:`BlockIndexRanges` encapsulating row block indices.
+
+    .. attribute:: col
+
+        A :class:`BlockIndexRanges` encapsulating column block indices.
+
+    .. automethod:: block_shape
+    .. automethod:: block_take
+    .. automethod:: get
+    .. automethod:: take
+
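+    A sketch of typical usage, assuming ``trow`` and ``tcol`` are
+    :class:`BlockIndexRanges` instances with the same number of blocks:
+
+    .. code-block:: python
+
+        index_set = MatrixBlockIndexRanges(ctx, trow, tcol)
+        irow, icol = index_set.block_indices(i)
+        # block i of a host-side matrix mat is mat[np.ix_(irow, icol)]
+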
+    """
+
+    def __init__(self, cl_context, row, col):
+        self.cl_context = cl_context
+        self.row = row
+        self.col = col
+        assert self.row.nblocks == self.col.nblocks
+
+        self.blkranges = np.cumsum([0] + [
+            self.row.block_shape(i)[0] * self.col.block_shape(i)[0]
+            for i in range(self.row.nblocks)])
+
+        if isinstance(self.row.indices, cl.array.Array):
+            with cl.CommandQueue(self.cl_context) as queue:
+                self.blkranges = \
+                    cl.array.to_device(queue, self.blkranges).with_queue(None)
+
+    @property
+    def nblocks(self):
+        return self.row.nblocks
+
+    def block_shape(self, i):
+        return self.row.block_shape(i) + self.col.block_shape(i)
+
+    def block_indices(self, i):
+        return (self.row.block_indices(i),
+                self.col.block_indices(i))
+
+    @property
+    def linear_row_indices(self):
+        r, _ = self._linear_indices()
+        return r
+
+    @property
+    def linear_col_indices(self):
+        _, c = self._linear_indices()
+        return c
+
+    @property
+    def linear_ranges(self):
+        return self.blkranges
+
+    def get(self, queue=None):
+        """Transfer data to the host. Only the initial given data is
+        transfered, not the arrays returned by :meth:`linear_row_indices` and
+        friends.
+
+        :return: a copy of `self` in which all data lives on the host, i.e.
+        all :class:`pyopencl.array.Array` instances are replaces by
+        :class:`numpy.ndarray` instances.
+        """
+        return MatrixBlockIndexRanges(self.cl_context,
+                row=self.row.get(queue=queue),
+                col=self.col.get(queue=queue))
+
+    def take(self, x, i):
+        """Retrieve a block from a global matrix.
+
+        :arg x: a 2D :class:`numpy.ndarray`.
+        :arg i: block index.
+        :return: requested block from the matrix.
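+
+        Device arrays are not supported, so a host-side sketch is:
+
+        .. code-block:: python
+
+            host_index = index_set.get(queue)
+            blk = host_index.take(mat, i)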
+        """
+
+        if isinstance(self.row.indices, cl.array.Array) or \
+                isinstance(self.col.indices, cl.array.Array):
+            raise ValueError("CL `Array`s are not supported."
+                    "Use MatrixBlockIndexRanges.get() and then view into matrices.")
+
+        irow, icol = self.block_indices(i)
+        return x[np.ix_(irow, icol)]
+
+    def block_take(self, x, i):
+        """Retrieve a block from a linear representation of the matrix blocks.
+        The linear representation is expected to be consistent with the one
+        produced by the code below:
+
+        .. code-block:: python
+
+            i = index.linear_row_indices()
+            j = index.linear_col_indices()
+            linear_blks = global_mat[i, j]
+
+            for k in range(index.nblocks):
+                assert np.allclose(index.block_take(linear_blks, k),
+                                   index.take(global_mat, k))
+
+        :arg x: a 1D :class:`numpy.ndarray`.
+        :arg i: block index.
+        :return: requested block, reshaped into a 2D array.
+        """
+
+        iblk = np.s_[self.blkranges[i]:self.blkranges[i + 1]]
+        return x[iblk].reshape(*self.block_shape(i))
+
+    @memoize_method
+    def _linear_indices(self):
+        """
+        :return: a tuple of ``(rowindices, colindices)`` that can be
+            used to provide linear indexing into a set of matrix blocks. These
+            index arrays are just the concatenated Cartesian products of all
+            the block arrays described by :attr:`row` and :attr:`col`.
+
+            They can be used to index directly into a matrix as follows:
+
+            .. code-block:: python
+
+                mat[rowindices[blkranges[i]:blkranges[i + 1]],
+                    colindices[blkranges[i]:blkranges[i + 1]]]
+
+            The same block can be obtained more easily using
+
+            .. code-block:: python
+
+                index.take(mat, i).reshape(-1)
+        """
+
+        @memoize_in(self, "block_index_knl")
+        def _build_index():
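+            # Each block irange contributes an ntgtblock x nsrcblock patch
+            # of (row, col) index pairs, stored in row-major order starting
+            # at blkranges[irange].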
+            loopy_knl = lp.make_kernel([
+                "{[irange]: 0 <= irange < nranges}",
+                "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}"
+                ],
+                """
+                for irange
+                    <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange]
+                    <> nsrcblock = srcranges[irange + 1] - srcranges[irange]
+
+                    for itgt, isrc
+                        <> imat = blkranges[irange] + (nsrcblock * itgt + isrc)
+
+                        rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \
+                            {id_prefix=write_index}
+                        colindices[imat] = srcindices[srcranges[irange] + isrc] \
+                            {id_prefix=write_index}
+                    end
+                end
+                """,
+                [
+                    lp.GlobalArg("blkranges", None, shape="nranges + 1"),
+                    lp.GlobalArg("rowindices", None, shape="nresults"),
+                    lp.GlobalArg("colindices", None, shape="nresults"),
+                    lp.ValueArg("nresults", None),
+                    "..."
+                ],
+                name="block_index_knl",
+                default_offset=lp.auto,
+                assumptions="nranges>=1",
+                silenced_warnings="write_race(write_index*)",
+                lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0")
+
+            return loopy_knl
+
+        with cl.CommandQueue(self.cl_context) as queue:
+            _, (rowindices, colindices) = _build_index()(queue,
+                tgtindices=self.row.indices,
+                srcindices=self.col.indices,
+                tgtranges=self.row.ranges,
+                srcranges=self.col.ranges,
+                blkranges=self.blkranges,
+                nresults=_to_host(self.blkranges[-1], queue=queue))
+            return (rowindices.with_queue(None),
+                    colindices.with_queue(None))
+
+# }}}
+
+
 # {{{ OrderedSet
 
 # Source: http://code.activestate.com/recipes/576694-orderedset/
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 0331db6c696043b2efb8fe27286a2d48cbd16b59..71e3f044432b4bd612337bdd55d56e421540bcc4 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -234,6 +234,62 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
     pconv_verifier()
 
 
+def test_sumpy_fmm_timing_data_collection(ctx_getter):
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_getter()
+    queue = cl.CommandQueue(
+            ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    nsources = 500
+    dtype = np.float64
+
+    from boxtree.tools import (
+            make_normal_particle_array as p_normal)
+
+    knl = LaplaceKernel(2)
+    local_expn_class = VolumeTaylorLocalExpansion
+    mpole_expn_class = VolumeTaylorMultipoleExpansion
+    order = 1
+
+    sources = p_normal(queue, nsources, knl.dim, dtype, seed=15)
+
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(ctx)
+
+    tree, _ = tb(queue, sources,
+            max_particles_in_box=30, debug=True)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tbuild = FMMTraversalBuilder(ctx)
+    trav, _ = tbuild(queue, tree, debug=True)
+
+    from pyopencl.clrandom import PhiloxGenerator
+    rng = PhiloxGenerator(ctx)
+    weights = rng.uniform(queue, nsources, dtype=np.float64)
+
+    out_kernels = [knl]
+
+    from functools import partial
+
+    from sumpy.fmm import SumpyExpansionWranglerCodeContainer
+    wcc = SumpyExpansionWranglerCodeContainer(
+            ctx,
+            partial(mpole_expn_class, knl),
+            partial(local_expn_class, knl),
+            out_kernels)
+
+    wrangler = wcc.get_wrangler(queue, tree, dtype,
+            fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order)
+    from boxtree.fmm import drive_fmm
+
+    timing_data = {}
+    pot, = drive_fmm(trav, wrangler, weights, timing_data=timing_data)
+    print(timing_data)
+    assert timing_data
+
+
 def test_sumpy_fmm_exclude_self(ctx_getter):
     logging.basicConfig(level=logging.INFO)
 
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 4e2b3ad7c449ba9acb37955cd167d94fe3c80d83..2f3aabc3082b99707ab86b5461390e866350fdc3 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -26,11 +26,17 @@ import sys
 import numpy as np
 import numpy.linalg as la
 
-import pytest
 import pyopencl as cl
+import pyopencl.array  # noqa
+
+from sumpy.tools import vector_to_device
+from sumpy.tools import MatrixBlockIndexRanges
+
+import pytest
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -42,7 +48,7 @@ else:
     faulthandler.enable()
 
 
-def create_arguments(n, mode, target_radius=1.0):
+def _build_geometry(queue, n, mode, target_radius=1.0):
     # parametrize circle
     t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False)
     unit_circle = np.exp(1j * t)
@@ -60,133 +66,183 @@ def create_arguments(n, mode, target_radius=1.0):
     centers = unit_circle * (1.0 - radius)
     expansion_radii = radius * np.ones(n)
 
-    return targets, sources, centers, sigma, expansion_radii
+    return (cl.array.to_device(queue, targets),
+            cl.array.to_device(queue, sources),
+            vector_to_device(queue, centers),
+            cl.array.to_device(queue, expansion_radii),
+            cl.array.to_device(queue, sigma))
+
+
+def _build_block_index(queue, nnodes, nblks, factor):
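+    # Split [0, nnodes) into nblks equal ranges and, if factor < 1, keep
+    # only a random `factor`-fraction of the indices inside each range.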
+    indices = np.arange(0, nnodes)
+    ranges = np.arange(0, nnodes + 1, nnodes // nblks)
+
+    if abs(factor - 1.0) < 1.0e-14:
+        ranges_ = ranges
+        indices_ = indices
+    else:
+        indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object)
+        for i in range(ranges.shape[0] - 1):
+            iidx = indices[np.s_[ranges[i]:ranges[i + 1]]]
+            indices_[i] = np.sort(np.random.choice(iidx,
+                size=int(factor * len(iidx)), replace=False))
+
+        ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_])
+        indices_ = np.hstack(indices_)
+
+    from sumpy.tools import BlockIndexRanges
+    return BlockIndexRanges(queue.context,
+                            cl.array.to_device(queue, indices_).with_queue(None),
+                            cl.array.to_device(queue, ranges_).with_queue(None))
 
 
-def test_qbx_direct(ctx_getter):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+@pytest.mark.parametrize("lpot_id", [1, 2])
+def test_qbx_direct(ctx_getter, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(2)
-
+    ndim = 2
     nblks = 10
     order = 12
     mode_nr = 25
 
+    from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative
+    if lpot_id == 1:
+        knl = LaplaceKernel(ndim)
+    elif lpot_id == 2:
+        knl = LaplaceKernel(ndim)
+        knl = DirectionalSourceDerivative(knl, dir_vec_name="dsource_vec")
+    else:
+        raise ValueError("unknow lpot_id")
+
+    from sumpy.expansion.local import LineTaylorLocalExpansion
+    lknl = LineTaylorLocalExpansion(knl, order)
+
     from sumpy.qbx import LayerPotential
+    lpot = LayerPotential(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixGenerator
+    mat_gen = LayerPotentialMatrixGenerator(ctx, [lknl])
+
     from sumpy.qbx import LayerPotentialMatrixBlockGenerator
-    from sumpy.expansion.local import LineTaylorLocalExpansion
-    lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
-    mat_gen = LayerPotentialMatrixGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
-    blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
-            [LineTaylorLocalExpansion(lknl, order)])
+    blk_gen = LayerPotentialMatrixBlockGenerator(ctx, [lknl])
 
     for n in [200, 300, 400]:
-        targets, sources, centers, sigma, expansion_radii = \
-                create_arguments(n, mode_nr)
+        targets, sources, centers, expansion_radii, sigma = \
+                _build_geometry(queue, n, mode_nr)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
-        assert tgtranges.shape == srcranges.shape
-
-        _, (mat,) = mat_gen(queue, targets, sources, centers,
-                expansion_radii)
-        result_mat = mat.dot(strengths[0])
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
 
-        _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
-                expansion_radii)
+        extra_kwargs = {}
+        if lpot_id == 2:
+            extra_kwargs["dsource_vec"] = \
+                    vector_to_device(queue, np.ones((ndim, n)))
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                strengths=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                centers=centers,
+                expansion_radii=expansion_radii,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
+
+        rowindices = index_set.linear_row_indices.get(queue)
+        colindices = index_set.linear_col_indices.get(queue)
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
-
-        _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
-                            tgtindices, srcindices, tgtranges, srcranges)
-
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
-
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        assert la.norm(blk - mat[rowindices, colindices]) < eps
 
 
 @pytest.mark.parametrize("exclude_self", [True, False])
-def test_p2p_direct(ctx_getter, exclude_self):
-    # This evaluates a single layer potential on a circle.
+@pytest.mark.parametrize("factor", [1.0, 0.6])
+def test_p2p_direct(ctx_getter, exclude_self, factor):
     logging.basicConfig(level=logging.INFO)
 
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)
 
-    from sumpy.kernel import LaplaceKernel
-    lknl = LaplaceKernel(2)
-
+    ndim = 2
     nblks = 10
     mode_nr = 25
 
+    from sumpy.kernel import LaplaceKernel
+    lknl = LaplaceKernel(ndim)
+
     from sumpy.p2p import P2P
-    from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
     lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixGenerator
     mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
+
+    from sumpy.p2p import P2PMatrixBlockGenerator
     blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
 
     for n in [200, 300, 400]:
-        targets, sources, _, sigma, _ = \
-            create_arguments(n, mode_nr, target_radius=1.2)
+        targets, sources, _, _, sigma = \
+            _build_geometry(queue, n, mode_nr, target_radius=1.2)
 
         h = 2 * np.pi / n
         strengths = (sigma * h,)
 
-        tgtindices = np.arange(0, n)
-        tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
-        tgtranges = np.arange(0, tgtindices.shape[0] + 1,
-                              tgtindices.shape[0] // nblks)
-        srcindices = np.arange(0, n)
-        srcindices = np.random.choice(srcindices, size=int(0.8 * n))
-        srcranges = np.arange(0, srcindices.shape[0] + 1,
-                              srcindices.shape[0] // nblks)
-        assert tgtranges.shape == srcranges.shape
+        tgtindices = _build_block_index(queue, n, nblks, factor)
+        srcindices = _build_block_index(queue, n, nblks, factor)
+        index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices)
 
         extra_kwargs = {}
         if exclude_self:
-            extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
-
-        _, (mat,) = mat_gen(queue, targets, sources, **extra_kwargs)
-        result_mat = mat.dot(strengths[0])
-
-        _, (result_lpot,) = lpot(queue, targets, sources, strengths,
-                                 **extra_kwargs)
+            extra_kwargs["target_to_source"] = \
+                cl.array.arange(queue, 0, n, dtype=np.int32)
+
+        _, (result_lpot,) = lpot(queue,
+                targets=targets,
+                sources=sources,
+                strength=strengths, **extra_kwargs)
+        result_lpot = result_lpot.get()
+
+        _, (mat,) = mat_gen(queue,
+                targets=targets,
+                sources=sources, **extra_kwargs)
+        mat = mat.get()
+        result_mat = mat.dot(strengths[0].get())
+
+        _, (blk,) = blk_gen(queue,
+                targets=targets,
+                sources=sources,
+                index_set=index_set, **extra_kwargs)
+        blk = blk.get()
 
         eps = 1.0e-10 * la.norm(result_lpot)
         assert la.norm(result_mat - result_lpot) < eps
 
-        _, (blk,) = blk_gen(queue, targets, sources,
-                            tgtindices, srcindices, tgtranges, srcranges,
-                            **extra_kwargs)
-
-        for i in range(srcranges.shape[0] - 1):
-            itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
-            isrc = np.s_[srcranges[i]:srcranges[i + 1]]
-            block = np.ix_(tgtindices[itgt], srcindices[isrc])
-
-            eps = 1.0e-10 * la.norm(mat[block])
-            assert la.norm(blk[itgt, isrc] - mat[block]) < eps
+        index_set = index_set.get(queue)
+        for i in range(index_set.nblocks):
+            assert la.norm(index_set.block_take(blk, i) -
+                           index_set.take(mat, i)) < eps
 
 
 # You can test individual routines by typing