diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 25651f66571c7be459c4074a7890df940c50ce08..0e932ef2f55adcecd1eee76b51151fae037c9988 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -126,6 +126,7 @@ jobs:
                 python-version: '3.x'
         -   name: "Main Script"
             run: |
+                PROJECT=loopy
                 CONDA_ENVIRONMENT=.test-conda-env-py3.yml
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
                 . ci-support.sh
diff --git a/.gitignore b/.gitignore
index 7be271c37ca0d6d1d67185ce4fbf202bbc488240..e7ea1299a074d23ec23ad85c88c1289b4f7d2df1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,5 @@ loopy/_git_rev.py
 virtualenv-[0-9]*[0-9]
 
 *.so
+
+.asv
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 07a0492393048c64d93df70f800b7da4e3d1861f..2ea1707ff43eb9e1d17760a4ac86bec1a886ae1d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,9 @@
+stages:
+  - test
+  - deploy
+
 Python 3 POCL:
+  stage: test
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=portable:pthread
@@ -16,6 +21,7 @@ Python 3 POCL:
       junit: test/pytest.xml
 
 Python 3 POCL without arg check:
+  stage: test
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=portable:pthread
@@ -34,6 +40,7 @@ Python 3 POCL without arg check:
       junit: test/pytest.xml
 
 Python 3 Intel:
+  stage: test
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=intel
@@ -54,6 +61,7 @@ Python 3 Intel:
 
 
 Python 3 POCL Twice With Cache:
+  stage: test
   script: |
     export PY_EXE=python3
     export PYOPENCL_TEST=portable:pthread
@@ -87,6 +95,7 @@ Python 3 POCL Twice With Cache:
 #   - tags
 
 Python 3 POCL Examples:
+  stage: test
   script: |
     export PY_EXE=python3
     export PYOPENCL_TEST=portable:pthread
@@ -113,6 +122,7 @@ Python 3 POCL Examples:
   - tags
 
 Pylint:
+  stage: test
   script:
   # Needed to avoid name shadowing issues when running from source directory.
   - PROJECT_INSTALL_FLAGS="--editable"
@@ -126,7 +136,9 @@ Pylint:
   - tags
 
 Documentation:
+  stage: deploy
   script:
+  - PROJECT=loopy
   - EXTRA_INSTALL="pybind11 numpy"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-docs.sh
   - ". ./build-docs.sh"
@@ -134,6 +146,7 @@ Documentation:
   - python3
 
 Flake8:
+  stage: test
   script:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
   - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
@@ -141,3 +154,19 @@ Flake8:
   - python3
   except:
   - tags
+
+Benchmarks:
+  stage: test
+  script:
+  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml  # plain shell variable, not exported
+  - PROJECT=loopy  # plain shell variable, not exported
+  - PYOPENCL_TEST=portable:pthread  # not exported here, unlike in the test jobs
+  - export LOOPY_NO_CACHE=1
+  - export ASV_FACTOR=1.5
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh
+  - ". ./build-and-benchmark-py-project.sh"  # NOTE(review): fetched with curl -k (no TLS verification), then sourced
+  tags:
+  - linux
+  - benchmark
+  except:
+  - tags
diff --git a/asv.conf.json b/asv.conf.json
new file mode 100644
index 0000000000000000000000000000000000000000..99c2ea2b5941721a045d8aa7a0586d7d5f9e1eb6
--- /dev/null
+++ b/asv.conf.json
@@ -0,0 +1,159 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "loopy",
+
+    // The project's homepage
+    "project_url": "https://documen.tician.de/loopy",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": ".",
+
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    "branches": ["main"],    // for git
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    // "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "http://gitlab.tiker.net/inducer/loopy/commits/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["2.7", "3.6"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    "conda_channels": ["conda-forge", "defaults"],
+
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is not to be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can prefix
+    // the package name with 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    // "matrix": {
+    //     "numpy": ["1.6", "1.7"],
+    //     "six": ["", null],        // test with and without six installed
+    //     "pip+emcee": [""],   // emcee is only available for install with pip.
+    // },
+    "matrix": {
+        "numpy" : [""],
+        "pyopencl" : [""],
+        "islpy" : [""],
+        "pocl" : [""],
+        "pip+git+https://github.com/inducer/pymbolic#egg=pymbolic": [""],
+        "pip+git+https://github.com/inducer/boxtree#egg=boxtree": [""],
+        "pip+git+https://github.com/inducer/loopy#egg=loopy": [""],
+        "pip+git+https://github.com/inducer/sumpy#egg=sumpy": [""],
+    },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache wheels of the recent builds in each
+    // environment, making them faster to install next time.  This is the
+    // number of builds to keep, per environment.
+    // "wheel_cache_size": 0
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // }
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // }
+}
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d37a6bb8a8e06d51651c7d36b47e66c624ccf26f
--- /dev/null
+++ b/benchmarks/run_sumpy_kernels.py
@@ -0,0 +1,125 @@
+import loopy as lp
+import numpy as np
+import pyopencl as cl
+import logging
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+from pyopencl.tools import (  # noqa
+    pytest_generate_tests_for_pyopencl as pytest_generate_tests,
+)
+
+
+def _sumpy_kernel_init(param):  # build the sumpy translation object for ``param``
+    name, dim, order = param.name, param.dim, param.order
+    # TODO: add other kernels; only the "m2l" translation is supported so far.
+    assert name == "m2l"
+    from sumpy.expansion.multipole import (
+        LaplaceConformingVolumeTaylorMultipoleExpansion,
+    )
+    from sumpy.expansion.local import LaplaceConformingVolumeTaylorLocalExpansion
+    from sumpy.kernel import LaplaceKernel
+    from sumpy import E2EFromCSR
+
+    ctx = cl.create_some_context()
+    np.random.seed(17)
+    # Laplace kernel in ``dim`` dimensions; both expansions use the same ``order``.
+    knl = LaplaceKernel(dim)
+    local_expn_class = LaplaceConformingVolumeTaylorLocalExpansion
+    mpole_expn_class = LaplaceConformingVolumeTaylorMultipoleExpansion
+    m_expn = mpole_expn_class(knl, order=order)
+    l_expn = local_expn_class(knl, order=order)
+    # Multipole-to-local translation between the two expansions built above.
+    m2l = E2EFromCSR(ctx, m_expn, l_expn)
+    m2l.get_translation_loopy_insns()  # result unused; presumably primes a cache
+    m2l.ctx = None  # NOTE(review): presumably cleared so the object pickles
+    m2l.device = None
+    return m2l
+
+
+def _sumpy_kernel_make(expn, param):  # returns the loopy kernel with dtypes added
+    assert param.name == "m2l"  # only "m2l" is supported (see _sumpy_kernel_init)
+    loopy_knl = expn.get_optimized_kernel()
+    loopy_knl = lp.add_and_infer_dtypes(
+        loopy_knl,
+        dict(  # dtype for each listed kernel argument: int32 or float64
+            tgt_ibox=np.int32,
+            centers=np.float64,
+            tgt_center=np.float64,
+            target_boxes=np.int32,
+            src_ibox=np.int32,
+            src_expansions=np.float64,
+            tgt_rscale=np.float64,
+            src_rscale=np.float64,
+            src_box_starts=np.int32,
+            src_box_lists=np.int32,
+        ),
+    )
+    return loopy_knl
+
+
+@dataclass(frozen=True)  # frozen + default eq => hashable; used as a dict key
+class Param:
+    name: str  # kernel name; the helpers in this file only accept "m2l"
+    dim: int  # passed to LaplaceKernel
+    order: int  # passed as ``order`` to both expansion classes
+
+
+def cached_data(params):  # -> {param: {"setup", "instantiated", "scheduled"}}
+    data = {}
+    np.random.seed(17)
+    logging.basicConfig(level=logging.INFO)
+    for param in params:
+        data[param] = {}
+        expn = _sumpy_kernel_init(param)
+        data[param]["setup"] = expn  # consumed by time_instantiate
+        knl = _sumpy_kernel_make(expn, param)
+        knl = lp.preprocess_kernel(knl)
+        data[param]["instantiated"] = knl  # consumed by time_schedule
+        scheduled = lp.get_one_scheduled_kernel(knl)
+        data[param]["scheduled"] = scheduled  # consumed by time_generate_code
+    return data
+
+
+class SumpyBenchmarkSuite:
+    # Times loopy preprocessing, scheduling and code generation on sumpy kernels.
+    params = [
+        Param("m2l", dim=3, order=6),
+        Param("m2l", dim=3, order=12),
+    ]
+
+    param_names = ["test_name"]  # one name for the single parameter axis above
+
+    version = 1
+
+    def setup_cache(self):  # the returned dict arrives as ``data`` in each benchmark
+        return cached_data(self.params)
+
+    def time_instantiate(self, data, param):  # dtype inference + preprocessing
+        knl = _sumpy_kernel_make(data[param]["setup"], param)
+        lp.preprocess_kernel(knl)
+
+    def time_schedule(self, data, param):  # scheduling of the preprocessed kernel
+        lp.get_one_scheduled_kernel(data[param]["instantiated"])
+
+    def time_generate_code(self, data, param):  # code generation from the schedule
+        lp.generate_code_v2(data[param]["scheduled"])
+
+    time_instantiate.timeout = 600.0
+    time_schedule.timeout = 600.0
+    time_generate_code.timeout = 600.0
+
+    # No warmup is needed, so the warmup time is set to zero.
+    time_instantiate.warmup_time = 0
+    time_schedule.warmup_time = 0
+    time_generate_code.warmup_time = 0
+
+    # Scheduling and code generation are expensive: fix ``number`` at 1 for them.
+    time_schedule.number = 1
+    time_generate_code.number = 1
+
+    # Run memory benchmarks as well; they alias the timing functions above.
+    peakmem_instantiate = time_instantiate
+    peakmem_schedule = time_schedule
+    peakmem_generate_code = time_generate_code