diff --git a/.conda-ci-build-configure.sh b/.conda-ci-build-configure.sh new file mode 100644 index 0000000000000000000000000000000000000000..80a2fb778073ec5d3277308090f6273030d331f8 --- /dev/null +++ b/.conda-ci-build-configure.sh @@ -0,0 +1 @@ +python ./configure.py --cl-inc-dir="$CONDA_PREFIX/include" --cl-lib-dir="$CONDA_PREFIX/lib" diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..dcbc21d86f9e4b17ea7e8803d538c4c0f0b6276a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,32 @@ +# https://editorconfig.org/ +# https://github.com/editorconfig/editorconfig-vim +# https://github.com/editorconfig/editorconfig-emacs + +root = true + +[*] +indent_style = space +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.py] +indent_size = 4 + +[*.rst] +indent_size = 4 + +[*.cpp] +indent_size = 2 + +[*.hpp] +indent_size = 2 + +# There may be one in doc/ +[Makefile] +indent_style = tab + +# https://github.com/microsoft/vscode/issues/1679 +[*.md] +trim_trailing_whitespace = false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..96fb4a49bf500e4e70494a6c5839385222c4c835 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,188 @@ +name: CI +on: + push: + branches: + - master + tags: + - v* + pull_request: + schedule: + - cron: '17 3 * * 0' + +jobs: + flake8: + name: Flake8 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + . 
./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test + + pylint: + name: Pylint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + echo "- matplotlib" >> $CONDA_ENVIRONMENT + echo "- pyopengl" >> $CONDA_ENVIRONMENT + echo "- ipython" >> $CONDA_ENVIRONMENT + echo "-------------------------------------------" + cat $CONDA_ENVIRONMENT + echo "-------------------------------------------" + USE_CONDA_BUILD=1 + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh + + # Pylint won't find the Cython bits without this + PROJECT_INSTALL_FLAGS="--editable" + + . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py + + pytest: + name: Pytest Linux + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + ./configure.py --cl-use-shipped-ext + . ./build-and-test-py-project-within-miniconda.sh + + pytest_mac: + name: Pytest Mac + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + export CC=gcc + CONDA_ENVIRONMENT=.test-conda-env.yml + grep -v ocl-icd .test-conda-env-py3.yml > $CONDA_ENVIRONMENT + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + ./configure.py --cxxflags= --ldflags= --cl-libname=OpenCL + . ./build-and-test-py-project-within-miniconda.sh + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
ci-support.sh + ./configure.py --cl-use-shipped-ext + build_py_project_in_conda_env + build_docs + + examples: + name: Examples + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ci-support.sh + EXTRA_INSTALL="pillow cgen mako imageio" + build_py_project_in_conda_env + (cd examples; rm -f gl_*) + run_examples --no-require-main + + wheels: + name: Build and upload wheels + runs-on: ubuntu-latest + strategy: + matrix: + DOCKER_IMAGE: + - quay.io/pypa/manylinux2014_x86_64 + # Disable i686 builds for now: no binary wheels for cryptography, + # source build fails, e.g. https://github.com/inducer/pyopencl/pull/421/checks?check_run_id=1781071632 + # - quay.io/pypa/manylinux2014_i686 + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + DOCKER_IMAGE: ${{ matrix.DOCKER_IMAGE }} + + run: | + pwd + ls -la + + # Only perform upload for tag builds, otherwise unset TWINE_USERNAME to prevent + if ! [[ $GITHUB_REF == refs/tags* ]]; then + echo "Not a tag build, GITHUB_REF is '$GITHUB_REF'. 
Unsetting TWINE_USERNAME" + unset TWINE_USERNAME + fi + + if [[ $DOCKER_IMAGE == *i686* ]]; then + PRE_CMD=linux32 + else + PRE_CMD="" + fi + + docker run --rm -v `pwd`:/io -e TWINE_USERNAME -e TWINE_PASSWORD $DOCKER_IMAGE $PRE_CMD /io/scripts/build-wheels.sh + ls wheelhouse/ + + downstream_tests: + strategy: + matrix: + downstream_project: [loopy, boxtree, meshmode] + name: Tests for downstream project ${{ matrix.downstream_project }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + env: + DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }} + run: | + # {{{ configure pyopencl so it finds its headers + + TEST_ENV_ROOT="$(pwd)/$DOWNSTREAM_PROJECT/.miniforge3/envs/testing" + ./configure.py --cl-inc-dir="$TEST_ENV_ROOT/include" --cl-lib-dir="$TEST_ENV_ROOT/lib" + git add -f siteconf.py + + git config --global user.email "inform@tiker.net" + git config --global user.name "Github CI runner" + git commit -a -m "Fake commit to add aksetup.py" + + # }}} + + git clone "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" + + cd "$DOWNSTREAM_PROJECT" + echo "*** $DOWNSTREAM_PROJECT version: $(git rev-parse --short HEAD)" + + sed -i "/egg=pyopencl/ c git+file://$(readlink -f ..)#egg=pyopencl" requirements.txt + + export CONDA_ENVIRONMENT=.test-conda-env-py3.yml + sed -i 's/pyopencl/ocl-icd/' "$CONDA_ENVIRONMENT" + + # Avoid slow or complicated tests in downstream projects + export PYTEST_ADDOPTS="-k 'not (slowtest or octave or mpi)'" + + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
./ci-support.sh + + build_py_project_in_conda_env + test_py_project + +# vim: sw=4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0550a676f98f96ff4876857ed895f781bfbb986e..fa83e362b87c93eb1692d1db82cccc48462e62e7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,21 +1,3 @@ -"Python 2.7 AMD CPU": - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=amd:pu - - export EXTRA_INSTALL="pybind11 numpy mako" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - allow_failure: true - tags: - - python2.7 - - amd-cl-cpu - - opengl - except: - - tags - artifacts: - reports: - junit: test/pytest.xml - Python 3 Intel CPU: script: - export PY_EXE=python3 @@ -33,7 +15,7 @@ Python 3 Intel CPU: reports: junit: test/pytest.xml -Python 3 Titan X: +Python 3 Nvidia Titan X: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -49,7 +31,7 @@ Python 3 Titan X: reports: junit: test/pytest.xml -Python 3 Titan V: +Python 3 Nvidia Titan V: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -65,7 +47,7 @@ Python 3 Titan V: reports: junit: test/pytest.xml -Python 3 K40: +Python 3 Nvidia K40: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:k40 @@ -82,6 +64,7 @@ Python 3 K40: junit: test/pytest.xml Python 3 AMD GPU: + allow_failure: true script: - export PY_EXE=python3 - export PYOPENCL_TEST=amd:gfx803 @@ -101,15 +84,15 @@ Python 3 AMD GPU: reports: junit: test/pytest.xml -Python 2.7 POCL: +Python 3 POCL: script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". 
./build-and-test-py-project.sh" tags: - - python2.7 + - python3 - pocl except: - tags @@ -117,12 +100,13 @@ Python 2.7 POCL: reports: junit: test/pytest.xml -Python 3 POCL: +Python 3 POCL CL 1.1: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - echo "CL_PRETEND_VERSION = '1.1'" > siteconf.py - ". ./build-and-test-py-project.sh" tags: - python3 @@ -133,33 +117,34 @@ Python 3 POCL: reports: junit: test/pytest.xml -Python 3 POCL CL 1.1: +Python 3 POCL K40: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:k40 - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - echo "CL_PRETEND_VERSION = '1.1'" > siteconf.py - ". ./build-and-test-py-project.sh" tags: - python3 - pocl + - nvidia-k40 except: - tags artifacts: reports: junit: test/pytest.xml -Python 3 POCL: +Python 3 POCL Titan V: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:titan - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". 
./build-and-test-py-project.sh" tags: - python3 - pocl + - nvidia-titan-v except: - tags artifacts: @@ -169,7 +154,7 @@ Python 3 POCL: Python 3 POCL (+GL and special functions): script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako scipy pyfmmlib" - echo "CL_ENABLE_GL = True" > siteconf.py - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh @@ -183,30 +168,14 @@ Python 3 POCL (+GL and special functions): reports: junit: test/pytest.xml -Python 2.7 Apple: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=app:cpu - - export EXTRA_INSTALL="pybind11 numpy mako" - - export PKG_CONFIG_PATH=/usr/local/opt/libffi/lib/pkgconfig - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - apple - except: - - tags - artifacts: - reports: - junit: test/pytest.xml - Python 3 Conda Apple: - script: - - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - - export CC=gcc - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - - "./configure.py --cxxflags= --ldflags= --cl-libname=OpenCL" - - ". ./build-and-test-py-project-within-miniconda.sh" + script: | + CONDA_ENVIRONMENT=.test-conda-env.yml + grep -v ocl-icd .test-conda-env-py3.yml > $CONDA_ENVIRONMENT + export CC=gcc + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + ./configure.py --cxxflags= --ldflags= --cl-libname=OpenCL + . 
./build-and-test-py-project-within-miniconda.sh tags: - apple except: @@ -215,13 +184,15 @@ Python 3 Conda Apple: reports: junit: test/pytest.xml -PyPy POCL: +PyPy3 POCL: script: - - export PY_EXE=pypy - - export PYOPENCL_TEST=portable + - export PY_EXE=pypy3 + - export PYOPENCL_TEST=portable:pthread - # https://github.com/pybind/pybind11/pull/1494 - - export EXTRA_INSTALL="git+https://github.com/inducer/pybind11 numpy mako" + # On pypy, this seems to install old versions from the package index + # independently of whether newer ones are already present. + - rm -f pyproject.toml + - export EXTRA_INSTALL="pybind11 numpy mako" - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh @@ -245,12 +216,26 @@ Pylint: # is only one copy of everything. - PROJECT_INSTALL_FLAGS="--editable" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - - ". ./prepare-and-run-pylint.sh pyopencl test/test_*.py" + - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 except: - tags +Examples: + script: | + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ci-support.sh + EXTRA_INSTALL="pillow cgen mako imageio" + build_py_project_in_venv + (cd examples; rm -f gl_*) + run_examples --no-require-main + except: + - tags + tags: + - python3 + - pocl + Documentation: script: - EXTRA_INSTALL="pybind11 numpy mako" @@ -258,13 +243,11 @@ Documentation: - ". ./build-docs.sh" tags: - linux - only: - - master Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh pyopencl test" + - . 
./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test tags: - python3 except: diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index c3680c93a813507fa085855e2bf64ce9ebbba60b..f57daef50f0123216d0aff556237f7f5813d00a1 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -1,12 +1,13 @@ name: test-conda-env channels: - conda-forge -- defaults +- nodefaults dependencies: - python=3 - git -- conda-forge::numpy +- numpy +- conda-forge/label/ocl-icd-dev::ocl-icd=3.0.0.dev0 - pocl - mako - pybind11 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c02ba146db1441958a5cac91da3ecf863bbd1feb..0000000000000000000000000000000000000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -notifications: - email: false -matrix: - include: - - sudo: required - services: - - docker - env: - - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 - - sudo: required - services: - - docker - env: - - DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 - - PRE_CMD=linux32 -install: - - docker pull $DOCKER_IMAGE -script: - - pwd - - ls -la - - if [[ "${TRAVIS_TAG}" == "" ]]; then unset TWINE_USERNAME; fi - - docker run --rm -v `pwd`:/io -e TWINE_USERNAME -e TWINE_PASSWORD $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh - - ls wheelhouse/ diff --git a/MANIFEST.in b/MANIFEST.in index 89c7cb4d98214c2d9dd9b4eb5d21a63a00e433d5..1ad9dfc3a2898ddf2a35b1c7fb193720f30030ba 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,7 +3,6 @@ recursive-include pyopencl/cl *.h *.cl include src/*.h include src/*.hpp include src/*.cpp -include *.h include test/*.py include test/*.h include test/*.spv @@ -14,15 +13,13 @@ include doc/*.rst include doc/Makefile include doc/*.py include doc/conf.py -include doc/_static/*.css -include doc/_templates/*.html -include *.py.in include configure.py include Makefile.in include aksetup_helper.py include README_SETUP.txt include README.rst include LICENSE +include pytest.ini recursive-include contrib *.vim *.py README diff --git a/README.rst 
b/README.rst index 3b6e12016c2b28245765be54070c27c92bc698fc..e2906fbbf4abc0654ab48b99ebe4e1b408625011 100644 --- a/README.rst +++ b/README.rst @@ -4,15 +4,13 @@ PyOpenCL: Pythonic Access to OpenCL, with Arrays and Algorithms .. image:: https://gitlab.tiker.net/inducer/pyopencl/badges/master/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/pyopencl/commits/master -.. image:: https://dev.azure.com/ak-spam/inducer/_apis/build/status/inducer.pyopencl?branchName=master - :alt: Azure Build Status - :target: https://dev.azure.com/ak-spam/inducer/_build/latest?definitionId=5&branchName=master +.. image:: https://github.com/inducer/pyopencl/workflows/CI/badge.svg?branch=master&event=push + :alt: Github Build Status + :target: https://github.com/inducer/pyopencl/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush .. image:: https://badge.fury.io/py/pyopencl.png :alt: Python Package Index Release Page :target: https://pypi.org/project/pyopencl/ -(Also: `Travis CI `_ to build binary wheels for releases, see `#264 `_) - PyOpenCL lets you access GPUs and other massively parallel compute devices from Python. It tries to offer computing goodness in the spirit of its sister project `PyCUDA `_: @@ -24,7 +22,7 @@ spirit of its sister project `PyCUDA crash-free code. * Completeness. PyOpenCL puts the full power of OpenCL's API at - your disposal, if you wish. Every obscure `get_info()` query and + your disposal, if you wish. Every obscure `get_info()` query and all CL calls are accessible. * Automatic Error Checking. All CL errors are automatically @@ -36,11 +34,11 @@ spirit of its sister project `PyCUDA * Helpful and complete `Documentation `__ as well as a `Wiki `_. -* Liberal license. PyOpenCL is open-source under the +* Liberal license. PyOpenCL is open-source under the `MIT license `_ and free for commercial, academic, and private use. -* Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's +* Broad support. 
PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's CL implementations. Simple 4-step `install instructions `_ diff --git a/aksetup_helper.py b/aksetup_helper.py index f90d085284f00e6a6cd2a44e6b9bd63a857485f3..8fd58f2b9bae4a0141cd5d3e0ba6fd9144160df4 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -1,8 +1,15 @@ -import setuptools # noqa -from setuptools import Extension import sys -from setuptools.command.build_ext import ( # noqa: N812 - build_ext as BaseBuildExtCommand) +try: + from setuptools import Extension + from setuptools.command.build_ext import ( # noqa: N812 + build_ext as BaseBuildExtCommand) + +except ImportError: + class Extension: + pass + + class BaseBuildExtCommand: + pass def count_down_delay(delay): @@ -162,10 +169,12 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): from distutils import sysconfig cvars = sysconfig.get_config_vars() - cflags = cvars.get('OPT') + + bad_prefixes = ["-g", "-O", "-Wstrict-prototypes", "-DNDEBUG"] + + cflags = cvars.get("OPT") if cflags: - cflags = remove_prefixes(cflags.split(), - ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']) + cflags = remove_prefixes(cflags.split(), bad_prefixes) if debug: cflags.append("-g") else: @@ -175,19 +184,25 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): cflags.append("-O%s" % what_opt) cflags.append("-DNDEBUG") - cvars['OPT'] = str.join(' ', cflags) - if "BASECFLAGS" in cvars: - cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars["OPT"] - else: - assert "CFLAGS" in cvars + cvars["OPT"] = str.join(" ", cflags) + + cflags = cvars.get("CONFIGURE_CFLAGS") + if cflags: + cflags = remove_prefixes(cflags.split(), bad_prefixes) + cvars["CONFIGURE_CFLAGS"] = str.join(" ", cflags) + + if "BASECFLAGS" in cvars: + cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars.get("OPT", "") + else: + assert "CFLAGS" in cvars if fast_link: for varname in ["LDSHARED", "BLDSHARED"]: ldsharedflags = cvars.get(varname) if ldsharedflags: ldsharedflags = 
remove_prefixes(ldsharedflags.split(), - ['-Wl,-O']) - cvars[varname] = str.join(' ', ldsharedflags) + ["-Wl,-O"]) + cvars[varname] = str.join(" ", ldsharedflags) # }}} @@ -621,7 +636,7 @@ def set_up_shipped_boost_if_requested(project_name, conf, source_path=None, "BOOST_MULTI_INDEX_DISABLE_SERIALIZATION": 1, "BOOST_PYTHON_SOURCE": 1, - "boost": '%sboost' % project_name, + "boost": "%sboost" % project_name, } if boost_chrono is False: @@ -708,7 +723,9 @@ def substitute(substitutions, fname): string_var_re = re.compile(r"\$str\{([A-Za-z_0-9]+)\}") fname_in = fname+".in" - lines = open(fname_in, "r").readlines() + with open(fname_in, "r") as inf: + lines = inf.readlines() + new_lines = [] for line in lines: made_change = True @@ -738,7 +755,8 @@ def substitute(substitutions, fname): new_lines.insert(1, "# DO NOT EDIT THIS FILE -- " "it was generated by configure.py\n") new_lines.insert(2, "# %s\n" % (" ".join(sys.argv))) - open(fname, "w").write("".join(new_lines)) + with open(fname, "w") as outf: + outf.write("".join(new_lines)) from os import stat, chmod infile_stat_res = stat(fname_in) @@ -774,7 +792,7 @@ def _run_git_command(cmd): if stdout: return stdout.decode("utf-8"), git_error else: - return '', "(subprocess call to git did not succeed)" + return "", "(subprocess call to git did not succeed)" def check_git_submodules(): @@ -793,12 +811,12 @@ def check_git_submodules(): pkg_warnings = [] lines = stdout.split("\n") - for l in lines: - if not l.strip(): + for ln in lines: + if not ln.strip(): continue - status = l[0] - sha, package = l[1:].split(" ", 1) + status = ln[0] + sha, package = ln[1:].split(" ", 1) if package == "bpl-subset" or ( package.startswith("boost") and package.endswith("subset")): @@ -818,39 +836,39 @@ def check_git_submodules(): % package) if pkg_warnings: - print(DASH_SEPARATOR) - print("git submodules are not up-to-date or in odd state") - print(DASH_SEPARATOR) - print("If this makes no sense, you probably want to say") - print("") - 
print(" $ git submodule update --init") - print("") - print("to fetch what you are presently missing and " - "move on with your life.") - print("If you got this from a distributed package on the " - "net, that package is") - print("broken and should be fixed. Please inform whoever " - "gave you this package.") - print("") - print("These issues were found:") - for w in pkg_warnings: - print(" %s" % w) - print("") - print("I will try to initialize the submodules for you " - "after a short wait.") - print(DASH_SEPARATOR) - print("Hit Ctrl-C now if you'd like to think about the situation.") - print(DASH_SEPARATOR) + print(DASH_SEPARATOR) + print("git submodules are not up-to-date or in odd state") + print(DASH_SEPARATOR) + print("If this makes no sense, you probably want to say") + print("") + print(" $ git submodule update --init") + print("") + print("to fetch what you are presently missing and " + "move on with your life.") + print("If you got this from a distributed package on the " + "net, that package is") + print("broken and should be fixed. 
Please inform whoever " + "gave you this package.") + print("") + print("These issues were found:") + for w in pkg_warnings: + print(" %s" % w) + print("") + print("I will try to initialize the submodules for you " + "after a short wait.") + print(DASH_SEPARATOR) + print("Hit Ctrl-C now if you'd like to think about the situation.") + print(DASH_SEPARATOR) - from os.path import exists - if not exists(".dirty-git-ok"): - count_down_delay(delay=10) - stdout, git_error = _run_git_command( - ["submodule", "update", "--init"]) - if git_error is None: - print(DASH_SEPARATOR) - print("git submodules initialized successfully") - print(DASH_SEPARATOR) + from os.path import exists + if not exists(".dirty-git-ok"): + count_down_delay(delay=10) + stdout, git_error = _run_git_command( + ["submodule", "update", "--init"]) + if git_error is None: + print(DASH_SEPARATOR) + print("git submodules initialized successfully") + print(DASH_SEPARATOR) # }}} @@ -902,9 +920,12 @@ def has_flag(compiler, flagname): the specified compiler. """ import tempfile - with tempfile.NamedTemporaryFile('w', suffix='.cpp', delete=False) as f: - f.write('int main (int argc, char **argv) { return 0; }') + with tempfile.NamedTemporaryFile("w", suffix=".cpp", delete=False) as f: + f.write("int main (int argc, char **argv) { return 0; }") fname = f.name + + import setuptools + try: compiler.compile([fname], extra_postargs=[flagname]) except setuptools.distutils.errors.CompileError: @@ -917,41 +938,60 @@ def cpp_flag(compiler): The c++14 is prefered over c++11 (when it is available). 
""" - if has_flag(compiler, '-std=gnu++14'): - return '-std=gnu++14' - elif has_flag(compiler, '-std=c++14'): - return '-std=c++14' - elif has_flag(compiler, '-std=c++11'): - return '-std=c++11' + if has_flag(compiler, "-std=gnu++14"): + return "-std=gnu++14" + elif has_flag(compiler, "-std=c++14"): + return "-std=c++14" + elif has_flag(compiler, "-std=c++11"): + return "-std=c++11" else: - raise RuntimeError('Unsupported compiler -- at least C++11 support ' - 'is needed!') + raise RuntimeError("Unsupported compiler -- at least C++11 support " + "is needed!") class PybindBuildExtCommand(NumpyBuildExtCommand): """A custom build extension for adding compiler-specific options.""" c_opts = { - 'msvc': ['/EHsc'], - 'unix': [], + "msvc": ["/EHsc"], + "unix": [], } - if sys.platform == 'darwin': - c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] - def build_extensions(self): ct = self.compiler.compiler_type opts = self.c_opts.get(ct, []) - if ct in ['unix', 'mingw32']: + cxx_opts = [] + + if ct in ["unix", "mingw32"]: opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) - if has_flag(self.compiler, '-fvisibility=hidden'): - opts.append('-fvisibility=hidden') - elif ct == 'msvc': + cxx_opts.append(cpp_flag(self.compiler)) + if has_flag(self.compiler, "-fvisibility=hidden"): + opts.append("-fvisibility=hidden") + if sys.platform == "darwin": + if has_flag(self.compiler, "-stdlib=libc++"): + opts.append("-stdlib=libc++") + if has_flag(self.compiler, "-mmacosx-version-min=10.7"): + opts.append("-mmacosx-version-min=10.7") + elif ct == "msvc": opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) for ext in self.extensions: ext.extra_compile_args = ext.extra_compile_args + opts - NumpyBuildExtCommand.build_extensions(self) + prev__compile = self.compiler._compile + + # -std=... 
used on C files causes an error on Apple LLVM + # https://gitlab.tiker.net/inducer/pymetis/-/jobs/102421 + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if ext == ".cpp": + cc_args = cc_args + cxx_opts + + return prev__compile(obj, src, ext, cc_args, extra_postargs, pp_opts) + + self.compiler._compile = _compile + + try: + NumpyBuildExtCommand.build_extensions(self) + finally: + self.compiler._compile = prev__compile # }}} diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index b1bcbc9f9dd1c379939fc26be69cbca9654b11b3..0000000000000000000000000000000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,66 +0,0 @@ -jobs: -- - job: 'Python2' - pool: - vmImage: 'ubuntu-16.04' - - steps: - - - script: | - set -e - sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml - CONDA_ENVIRONMENT=.test-conda-env-py2.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - - displayName: 'Pytest Conda' - - - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: 'test/pytest.xml' - -- - job: 'Python3' - pool: - vmImage: 'ubuntu-16.04' - - steps: - - - script: | - set -e - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - - displayName: 'Pytest Conda' - - - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: 'test/pytest.xml' - -- - job: 'Flake8' - pool: - vmImage: 'ubuntu-16.04' - strategy: - matrix: - Python36: - python.version: '3.6' - - steps: - - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - - - - script: | - set -e - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . 
./prepare-and-run-flake8.sh pyopencl test - - displayName: 'Flake8' diff --git a/configure.py b/configure.py index 63edd4b997e7a83f6d182359cbafe5078b126a60..85c9841d9f995805d8b64bb8aae976b6ecc43ba3 100755 --- a/configure.py +++ b/configure.py @@ -1,6 +1,4 @@ -#! /usr/bin/env python - -from __future__ import absolute_import +#! /usr/bin/env python3 from aksetup_helper import configure_frontend configure_frontend() diff --git a/contrib/fortran-to-opencl/setup.cfg b/contrib/fortran-to-opencl/setup.cfg index d3f13a0e64b79c00a957cb1369e335e0b8a00d76..6e9077bdac0dda5dbeface44a1612eb57519c695 100644 --- a/contrib/fortran-to-opencl/setup.cfg +++ b/contrib/fortran-to-opencl/setup.cfg @@ -1,3 +1,8 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,N815 max-line-length=85 + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ + diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py index 66f6e1dbfa80c2647313177730153ee75e80f4d0..39611307c3ef903515c3555068ef6909e1f4e12e 100644 --- a/contrib/fortran-to-opencl/translate.py +++ b/contrib/fortran-to-opencl/translate.py @@ -1,9 +1,3 @@ -from __future__ import division, with_statement -from __future__ import absolute_import -from __future__ import print_function -import six -from six.moves import range - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -33,6 +27,7 @@ from pymbolic.parser import Parser as ExpressionParserBase from pymbolic.mapper import CombineMapper import pymbolic.primitives as p from pymbolic.mapper.c_code import CCodeMapper as CCodeMapperBase +from sys import intern from warnings import warn @@ -344,7 +339,7 @@ class ComplexCCodeMapper(CCodeMapperBase): def map_sum(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) - is_complex = tgt_dtype.kind == 'c' + is_complex = tgt_dtype.kind == "c" if not is_complex: return CCodeMapperBase.map_sum(self, expr, 
enclosing_prec) @@ -352,9 +347,9 @@ class ComplexCCodeMapper(CCodeMapperBase): tgt_name = complex_type_name(tgt_dtype) reals = [child for child in expr.children - if 'c' != self.infer_type(child).kind] + if "c" != self.infer_type(child).kind] complexes = [child for child in expr.children - if 'c' == self.infer_type(child).kind] + if "c" == self.infer_type(child).kind] from pymbolic.mapper.stringifier import PREC_SUM, PREC_NONE real_sum = self.join_rec(" + ", reals, PREC_SUM) @@ -366,12 +361,12 @@ class ComplexCCodeMapper(CCodeMapperBase): complex_sum = self.rec(complexes[0], myprec) for child in complexes[1:]: - complex_sum = "%s_add(%s, %s)" % ( + complex_sum = "{}_add({}, {})".format( tgt_name, complex_sum, self.rec(child, PREC_NONE)) if real_sum: - result = "%s_add(%s_fromreal(%s), %s)" % ( + result = "{}_add({}_fromreal({}), {})".format( tgt_name, tgt_name, real_sum, complex_sum) else: result = complex_sum @@ -380,7 +375,7 @@ class ComplexCCodeMapper(CCodeMapperBase): def map_product(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) - is_complex = 'c' == tgt_dtype.kind + is_complex = "c" == tgt_dtype.kind if not is_complex: return CCodeMapperBase.map_product(self, expr, enclosing_prec) @@ -388,9 +383,9 @@ class ComplexCCodeMapper(CCodeMapperBase): tgt_name = complex_type_name(tgt_dtype) reals = [child for child in expr.children - if 'c' != self.infer_type(child).kind] + if "c" != self.infer_type(child).kind] complexes = [child for child in expr.children - if 'c' == self.infer_type(child).kind] + if "c" == self.infer_type(child).kind] from pymbolic.mapper.stringifier import PREC_PRODUCT, PREC_NONE real_prd = self.join_rec("*", reals, PREC_PRODUCT) @@ -402,12 +397,12 @@ class ComplexCCodeMapper(CCodeMapperBase): complex_prd = self.rec(complexes[0], myprec) for child in complexes[1:]: - complex_prd = "%s_mul(%s, %s)" % ( + complex_prd = "{}_mul({}, {})".format( tgt_name, complex_prd, self.rec(child, PREC_NONE)) if real_prd: - result = "%s_rmul(%s, 
%s)" % (tgt_name, real_prd, complex_prd) + result = f"{tgt_name}_rmul({real_prd}, {complex_prd})" else: result = complex_prd @@ -415,32 +410,32 @@ class ComplexCCodeMapper(CCodeMapperBase): def map_quotient(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - n_complex = 'c' == self.infer_type(expr.numerator).kind - d_complex = 'c' == self.infer_type(expr.denominator).kind + n_complex = "c" == self.infer_type(expr.numerator).kind + d_complex = "c" == self.infer_type(expr.denominator).kind tgt_dtype = self.infer_type(expr) if not (n_complex or d_complex): return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) elif n_complex and not d_complex: - return "%s_divider(%s, %s)" % ( + return "{}_divider({}, {})".format( complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) elif not n_complex and d_complex: - return "%s_rdivide(%s, %s)" % ( + return "{}_rdivide({}, {})".format( complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) else: - return "%s_divide(%s, %s)" % ( + return "{}_divide({}, {})".format( complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) def map_remainder(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) - if 'c' == tgt_dtype.kind: + if "c" == tgt_dtype.kind: raise RuntimeError("complex remainder not defined") return CCodeMapperBase.map_remainder(self, expr, enclosing_prec) @@ -449,23 +444,23 @@ class ComplexCCodeMapper(CCodeMapperBase): from pymbolic.mapper.stringifier import PREC_NONE tgt_dtype = self.infer_type(expr) - if 'c' == tgt_dtype.kind: + if "c" == tgt_dtype.kind: if expr.exponent in [2, 3, 4]: value = expr.base for i in range(expr.exponent-1): value = value * expr.base return self.rec(value, enclosing_prec) else: - b_complex = 'c' == self.infer_type(expr.base).kind - e_complex = 'c' == self.infer_type(expr.exponent).kind + b_complex = "c" 
== self.infer_type(expr.base).kind + e_complex = "c" == self.infer_type(expr.exponent).kind if b_complex and not e_complex: - return "%s_powr(%s, %s)" % ( + return "{}_powr({}, {})".format( complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) else: - return "%s_pow(%s, %s)" % ( + return "{}_pow({}, {})".format( complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) @@ -485,7 +480,7 @@ class CCodeMapper(ComplexCCodeMapper): def map_subscript(self, expr, enclosing_prec): idx_dtype = self.infer_type(expr.index) - if not 'i' == idx_dtype.kind or 'u' == idx_dtype.kind: + if not "i" == idx_dtype.kind or "u" == idx_dtype.kind: ind_prefix = "(int) " else: ind_prefix = "" @@ -509,10 +504,10 @@ class CCodeMapper(ComplexCCodeMapper): arg_dtypes = [self.infer_type(par) for par in expr.parameters] name = expr.function.name - if 'f' == tgt_dtype.kind and name == "abs": + if "f" == tgt_dtype.kind and name == "abs": name = "fabs" - elif 'c' == tgt_dtype.kind: + elif "c" == tgt_dtype.kind: if name in ["conjg", "dconjg"]: name = "conj" @@ -522,7 +517,7 @@ class CCodeMapper(ComplexCCodeMapper): if name == "dble": name = "real" - name = "%s_%s" % ( + name = "{}_{}".format( complex_type_name(tgt_dtype), name) @@ -532,11 +527,11 @@ class CCodeMapper(ComplexCCodeMapper): if name == "aimag": name = "imag" - name = "%s_%s" % ( + name = "{}_{}".format( complex_type_name(arg_dtype), name) - elif 'c' == tgt_dtype.kind and name == "abs": + elif "c" == tgt_dtype.kind and name == "abs": arg_dtype, = arg_dtypes name = "%s_abs" % ( @@ -568,7 +563,7 @@ class CCodeMapper(ComplexCCodeMapper): from pymbolic.mapper.stringifier import PREC_NONE if expr.dtype.kind == "c": r, i = expr.value - return "%s_new(%s, %s)" % ( + return "{}_new({}, {})".format( complex_type_name(expr.dtype), self.rec(r, PREC_NONE), self.rec(i, PREC_NONE)) @@ -581,7 +576,7 @@ class CCodeMapper(ComplexCCodeMapper): # }}} -class Scope(object): 
+class Scope: def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name @@ -608,8 +603,8 @@ class Scope(object): def known_names(self): return (self.used_names - | set(six.iterkeys(self.dim_map)) - | set(six.iterkeys(self.type_map))) + | set(self.dim_map.keys()) + | set(self.type_map.keys())) def is_known(self, name): return (name in self.used_names @@ -643,12 +638,12 @@ class Scope(object): def translate_var_name(self, name): shape = self.dim_map.get(name) if name in self.data and shape is not None: - return "%s_%s" % (self.subprogram_name, name) + return f"{self.subprogram_name}_{name}" else: return name -class FTreeWalkerBase(object): +class FTreeWalkerBase: def __init__(self): self.scope_stack = [] @@ -675,7 +670,7 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - "(\((?P[-+*0-9:a-zA-Z,]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z,]+)\))?$") def parse_dimension_specs(self, dim_decls): def parse_bounds(bounds_str): @@ -716,7 +711,7 @@ class ArgumentAnalayzer(FTreeWalkerBase): FTreeWalkerBase.__init__(self) # map (func, arg_nr) to - # 'w' for 'needs pointer' + # "w" for "needs pointer" # [] for no obstacle to de-pointerification known # [(func_name, arg_nr), ...] 
# depends on how this arg is used @@ -949,12 +944,12 @@ class F2CLTranslator(FTreeWalkerBase): if shape is not None: dim_stmt = cgen.Statement( - "dimension \"fortran\" %s[%s]" % ( + 'dimension \"fortran\" {}[{}]'.format( scope.translate_var_name(name), ", ".join(gen_shape(s) for s in shape) )) - # cannot omit 'dimension' decl even for rank-1 args: + # cannot omit "dimension" decl even for rank-1 args: result.append(dim_stmt) if name in scope.data: @@ -975,7 +970,7 @@ class F2CLTranslator(FTreeWalkerBase): cgen.Initializer( CLConstant( cgen.ArrayOf(self.get_declarator( - "%s_%s" % (scope.subprogram_name, name)))), + f"{scope.subprogram_name}_{name}"))), "{ %s }" % ",\n".join(self.gen_expr(x) for x in data) )) else: @@ -1092,7 +1087,7 @@ class F2CLTranslator(FTreeWalkerBase): ("integer", ""): np.int32, ("integer", "4"): np.int32, - ("complex", "8"): np.int64, + ("integer", "8"): np.int64, } def dtype_from_stmt(self, stmt): @@ -1186,11 +1181,11 @@ class F2CLTranslator(FTreeWalkerBase): rhs_dtype = infer_type(rhs) # check for silent truncation of complex - if lhs_dtype.kind != 'c' and rhs_dtype.kind == 'c': + if lhs_dtype.kind != "c" and rhs_dtype.kind == "c": from pymbolic import var rhs = var("real")(rhs) # check for silent widening of real - if lhs_dtype.kind == 'c' and rhs_dtype.kind != 'c': + if lhs_dtype.kind == "c" and rhs_dtype.kind != "c": from pymbolic import var rhs = var("fromreal")(rhs) @@ -1231,11 +1226,11 @@ class F2CLTranslator(FTreeWalkerBase): cast = self.force_casts.get( (node.designator, i)) if cast is not None: - result = "(%s) (%s)" % (cast, result) + result = f"({cast}) ({result})" return result - return cgen.Statement("%s(%s)" % ( + return cgen.Statement("{}({})".format( node.designator, ", ".join(transform_arg(i, arg_str) for i, arg_str in enumerate(node.items)))) @@ -1328,9 +1323,9 @@ class F2CLTranslator(FTreeWalkerBase): comp_op = "<=" return cgen.For( - "%s = %s" % (loop_var, self.gen_expr(start)), - "%s %s %s" % (loop_var, comp_op, 
self.gen_expr(stop)), - "%s += %s" % (loop_var, self.gen_expr(step)), + "{} = {}".format(loop_var, self.gen_expr(start)), + "{} {} {}".format(loop_var, comp_op, self.gen_expr(stop)), + "{} += {}".format(loop_var, self.gen_expr(step)), cgen.block_if_necessary(body)) else: @@ -1418,9 +1413,9 @@ if __name__ == "__main__": import logging console = logging.StreamHandler() console.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + formatter = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s") console.setFormatter(formatter) - logging.getLogger('fparser').addHandler(console) + logging.getLogger("fparser").addHandler(console) from cgen.opencl import CLConstant diff --git a/doc/algorithm.rst b/doc/algorithm.rst index cbaf1e9305c61d0e8928bbd46da2bdb6fd2aef83..2ff63e07ef24e46ef8e1e3c63ae290d5663a8f3e 100644 --- a/doc/algorithm.rst +++ b/doc/algorithm.rst @@ -19,7 +19,7 @@ evaluate multi-stage expressions on one or several operands in a single pass. .. method:: __call__(*args, wait_for=None) Invoke the generated scalar kernel. The arguments may either be scalars or - :class:`GPUArray` instances. + :class:`pyopencl.array.Array` instances. |std-enqueue-blurb| @@ -110,7 +110,7 @@ Prefix Sums ("scan") or include statements. A prefix sum is a running sum of an array, as provided by -e.g. :mod:`numpy.cumsum`:: +e.g. :func:`numpy.cumsum`:: >>> import numpy as np >>> a = [1,1,1,1,1,2,2,2,2,2] @@ -169,7 +169,8 @@ in PyOpenCL: * Segmented scans * Access to the previous item in *input_expr* (e.g. for comparisons) - See the `implementation `_ of :func:`unique` for an example. + See the `implementation `_ + of :func:`pyopencl.algorithm.unique` for an example. 
Making Custom Scan Kernels ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/conf.py b/doc/conf.py index 7c3707fd55061faa25503d18422c545192a55aed..d5258330f57098baf3c9df136cd8eed4d7560b84 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,50 +1,29 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -# PyOpenCL documentation build configuration file, created by -# sphinx-quickstart on Fri Jun 13 00:51:19 2008. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# The contents of this file are pickled, so don't put values in the namespace -# that aren't pickleable (module imports are okay, they're removed automatically). -# -# All configuration values have a default value; values that are commented out -# serve to show the default value. - -#import sys, os - -# If your extensions are in another directory, add it here. If the directory -# is relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -#sys.path.append(os.path.abspath('some/directory')) - # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.intersphinx', - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - ] + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx_copybutton", +] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] -exclude_patterns = ['subst.rst'] +exclude_patterns = ["subst.rst"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General substitutions. 
-project = 'PyOpenCL' -copyright = '2009, Andreas Kloeckner' +project = "PyOpenCL" +copyright = "2009, Andreas Kloeckner" # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. @@ -53,95 +32,82 @@ copyright = '2009, Andreas Kloeckner' ver_dic = {} with open("../pyopencl/version.py") as ver_file: ver_src = ver_file.read() -exec(compile(ver_src, "../pyopencl/version.py", 'exec'), ver_dic) +exec(compile(ver_src, "../pyopencl/version.py", "exec"), ver_dic) version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -today_fmt = '%B %d, %Y' +today_fmt = "%B %d, %Y" # List of documents that shouldn't be included in the build. -#unused_docs = [] +# unused_docs = [] # List of directories, relative to source directories, that shouldn't be searched # for source files. -#exclude_dirs = [] +# exclude_dirs = [] # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. 
-pygments_style = 'sphinx' +pygments_style = "sphinx" # Options for HTML output # ----------------------- -html_theme = "alabaster" +html_theme = "furo" html_theme_options = { - "extra_nav_links": { - "🚀 Github": "https://github.com/inducer/pyopencl", - "💾 Download Releases": "https://pypi.python.org/pypi/pyopencl", - } } -html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', - 'searchbox.html', - ] -} - # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. -#html_style = 'default.css' +# html_style = 'default.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # The name of an image file (within the static path) to place at the top of # the sidebar. -#html_logo = None +# html_logo = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = "%b %d, %Y" # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. 
-#html_use_modindex = True +# html_use_modindex = True # If true, the reST sources are included in the HTML build as _sources/. html_copy_source = False @@ -149,52 +115,20 @@ html_copy_source = False # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' +# html_file_suffix = '' # Output file base name for HTML help builder. -htmlhelp_basename = 'PyOpenClDoc' - +htmlhelp_basename = "PyOpenClDoc" -# Options for LaTeX output -# ------------------------ - -# The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, document class [howto/manual]). -latex_documents = [ - ('index', 'pyopencl.tex', 'PyOpenCL Documentation', - 'Andreas Kloeckner', 'manual'), - ] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -#latex_preamble = '' - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. 
-#latex_use_modindex = True intersphinx_mapping = { - 'http://docs.python.org/dev': None, - 'http://docs.scipy.org/doc/numpy/': None, - 'http://docs.makotemplates.org/en/latest/': None, - } + "https://docs.python.org/dev": None, + "https://numpy.org/doc/stable/": None, + "https://docs.makotemplates.org/en/latest/": None, +} -autoclass_content = "both" +autoclass_content = "class" +autodoc_typehints = "description" diff --git a/doc/index.rst b/doc/index.rst index c86d04ae10533a1d3aaf6b528063932d973b5222..28a910ab08d1a5fbcbeff9cfc260ec346ddc2bb4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -88,7 +88,7 @@ Software that works with or enhances PyOpenCL :class:`pyopencl.array.Array` instances. * Robbert Harms and Alard Roebroeck's `MOT - `_ offers a variety of GPU-enabled non-linear optimization algorithms + `_ offers a variety of GPU-enabled non-linear optimization algorithms and MCMC sampling routines for parallel optimization and sampling of multiple problems. If you know of a piece of software you feel that should be on this list, please @@ -113,6 +113,8 @@ Contents howto tools misc + 🚀 Github + 💾 Download Releases Note that this guide does not explain OpenCL programming and technology. 
Please refer to the official `Khronos OpenCL documentation `_ diff --git a/doc/make_constants.py b/doc/make_constants.py index 9ab78ad070ec6d0cc419458335a75ed44f9c9a16..c84925443b9158b267f062d6c241083c53e4b18c 100644 --- a/doc/make_constants.py +++ b/doc/make_constants.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import, print_function - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -32,7 +30,10 @@ cl_11 = ("CL_1.1", "0.92") cl_12 = ("CL_1.2", "2011.2") cl_12_2015 = ("CL_1.2", "2015.2") cl_20 = ("CL_2.0", "2015.2") +cl_21_late = ("CL_2.1", "2020.3") cl_21 = ("CL_2.1", "2016.2") +cl_22 = ("CL_2.1", "2020.3") +cl_30 = ("CL_3.0", "2020.3") amd_devattr = ("cl_amd_device_attribute_query", "2013.2") qcom_hp_devattr = ("cl_qcom_ext_host_ptr", "2016.2") intel_me_devattr = ("cl_intel_advanced_motion_estimation", "2016.2") @@ -60,6 +61,7 @@ def get_extra_lines(tup): yield " .. versionadded:: %s" % pyopencl_ver yield "" + const_ext_lookup = { cl.status_code: { "PLATFORM_NOT_FOUND_KHR": ("cl_khr_icd", "2011.1"), @@ -83,6 +85,9 @@ const_ext_lookup = { "INVALID_PIPE_SIZE": cl_20, "INVALID_DEVICE_QUEUE": cl_20, + "INVALID_SPEC_ID": cl_22, + "MAX_SIZE_RESTRICTION_EXCEEDED": cl_22, + }, cl.device_info: { @@ -106,7 +111,8 @@ const_ext_lookup = { "INTEGRATED_MEMORY_NV": nv_devattr, "ATTRIBUTE_ASYNC_ENGINE_COUNT_NV": nv_devattr, "PCI_BUS_ID_NV": nv_devattr, - "PCI_BUS_SLOT_NV": nv_devattr, + "PCI_SLOT_ID_NV": nv_devattr, + "PCI_DOMAIN_ID_NV": nv_devattr, "DOUBLE_FP_CONFIG": ("cl_khr_fp64", "2011.1"), @@ -186,6 +192,25 @@ const_ext_lookup = { "IL_VERSION": cl_21, "MAX_NUM_SUB_GROUPS": cl_21, "SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS": cl_21, + + "NUMERIC_VERSION": cl_30, + "EXTENSIONS_WITH_VERSION": cl_30, + "ILS_WITH_VERSION": cl_30, + "BUILT_IN_KERNELS_WITH_VERSION": cl_30, + "ATOMIC_MEMORY_CAPABILITIES": cl_30, + "ATOMIC_FENCE_CAPABILITIES": cl_30, + "NON_UNIFORM_WORK_GROUP_SUPPORT": cl_30, + "OPENCL_C_ALL_VERSIONS": cl_30, + 
"PREFERRED_WORK_GROUP_SIZE_MULTIPLE": cl_30, + "WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT": cl_30, + "GENERIC_ADDRESS_SPACE_SUPPORT": cl_30, + "OPENCL_C_FEATURES": cl_30, + "DEVICE_ENQUEUE_CAPABILITIES": cl_30, + "PIPE_SUPPORT": cl_30, + }, + + cl.device_topology_type_amd: { + "PCIE": amd_devattr, }, cl.mem_object_type: { @@ -227,6 +252,11 @@ const_ext_lookup = { "INTEROP_USER_SYNC": cl_12, }, + cl.channel_type: { + "UNORM_INT24": ("CL_1.2", "2020.3"), + "UNORM_INT_101010_2": ("CL_2.1", "2020.3"), + }, + cl.channel_order: { "Rx": cl_11, "RGx": cl_11, @@ -245,14 +275,26 @@ const_ext_lookup = { "GLOBAL_WORK_SIZE": cl_12, }, + cl.kernel_sub_group_info: { + "MAX_SUB_GROUP_SIZE_FOR_NDRANGE": cl_21_late, + "SUB_GROUP_COUNT_FOR_NDRANGE": cl_21_late, + "LOCAL_SIZE_FOR_SUB_GROUP_COUNT": cl_21_late, + "MAX_NUM_SUB_GROUPS": cl_21_late, + "COMPILE_NUM_SUB_GROUPS": cl_21_late, + }, + cl.addressing_mode: { "MIRRORED_REPEAT": cl_11, }, cl.sampler_info: { - "MIP_FILTER_MODE": cl_20, - "LOD_MIN": cl_20, - "LOD_MAX": cl_20, + "MIP_FILTER_MODE": ("(deprecated)", "2015.2"), + "LOD_MIN": ("(deprecated)", "2015.2"), + "LOD_MAX": ("(deprecated)", "2015.2"), + "MIP_FILTER_MODE_KHR": ("cl_khr_mipmap_image", "2020.3"), + "LOD_MIN_KHR": ("cl_khr_mipmap_image", "2020.3"), + "LOD_MAX_KHR": ("cl_khr_mipmap_image", "2020.3"), + "PROPERTIES": cl_30, }, cl.event_info: { @@ -273,6 +315,17 @@ const_ext_lookup = { "NUM_SAMPLES": cl_12, }, + cl.pipe_info: { + "PACKET_SIZE": ("CL_2.0", "2020.3"), + "MAX_PACKETS": ("CL_2.0", "2020.3"), + "PROPERTIES": cl_30, + }, + + cl.pipe_properties: { + "PACKET_SIZE": ("CL_2.0", "2020.3"), + "MAX_PACKETS": ("CL_2.0", "2020.3"), + }, + cl.map_flags: { "WRITE_INVALIDATE_REGION": cl_12, }, @@ -280,6 +333,9 @@ const_ext_lookup = { cl.program_info: { "NUM_KERNELS": cl_12, "KERNEL_NAMES": cl_12, + "PROGRAM_IL": cl_21_late, + "SCOPE_GLOBAL_CTORS_PRESENT": cl_22, + "SCOPE_GLOBAL_DTORS_PRESENT": cl_22, }, cl.program_build_info: { @@ -345,6 +401,7 @@ const_ext_lookup = { 
"SVM_MEMFILL": cl_20, "SVM_MAP": cl_20, "SVM_UNMAP": cl_20, + "SVM_MIGRATE_MEM": cl_30, }, cl.command_queue_info: { @@ -398,6 +455,21 @@ const_ext_lookup = { "NEXT_PARITIONNABLE": cl_12, }, + cl.device_atomic_capabilities: { + "ORDER_RELAXED": cl_30, + "ORDER_ACQ_REL": cl_30, + "ORDER_SEQ_CST": cl_30, + "SCOPE_WORK_ITEM": cl_30, + "SCOPE_WORK_GROUP": cl_30, + "SCOPE_DEVICE": cl_30, + "SCOPE_ALL_DEVICES": cl_30, + }, + + cl.device_device_enqueue_capabilities: { + "SUPPORTED": cl_30, + "REPLACEABLE_DEFAULT": cl_30, + }, + cl.profiling_info: { "COMPLETE": cl_20, }, @@ -407,6 +479,18 @@ const_ext_lookup = { "CONTENT_UNDEFINED": cl_12, }, + cl.version_bits: { + "MAJOR_BITS": cl_30, + "MINOR_BITS": cl_30, + "PATCH_BITS": cl_30, + "MAJOR_MASK": cl_30, + "MINOR_MASK": cl_30, + "PATCH_MASK": cl_30, + }, + + cl.khronos_vendor_id: { + "CODEPLAY": cl_30, + }, } try: gl_ci = cl.gl_context_info @@ -435,8 +519,8 @@ def doc_class(cls): print() if cls in cls_ext_lookup: - for l in get_extra_lines(cls_ext_lookup[cls]): - print(l) + for ln in get_extra_lines(cls_ext_lookup[cls]): + print(ln) cls_const_ext = const_ext_lookup.get(cls, {}) for name in sorted(dir(cls)): @@ -444,8 +528,8 @@ def doc_class(cls): print(" .. attribute :: %s" % name) if name in cls_const_ext: - for l in get_extra_lines(cls_const_ext[name]): - print(" "+l) + for ln in get_extra_lines(cls_const_ext[name]): + print(" "+ln) print(" .. method :: to_string(value)") print() diff --git a/doc/misc.rst b/doc/misc.rst index a2b2c9993a6bcf19392c26a754482919d13ce6d4..17609a730b8a4b934fae63237b357a8c37acc76a 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -11,8 +11,10 @@ package manager. On Linux or OS X, the following set of instructions should work: -#. Install a version of `miniconda `_ - that fits your system. Both Python 2 and Python 3 work. +#. Install a version of + `miniforge `_ + or `miniconda `_ + that fits your system. 
You can install these pieces of software in your user account and do not need root/administrator privileges. @@ -49,6 +51,20 @@ the `CPU OpenCL driver from Intel `_. @@ -78,12 +94,20 @@ qualified path names of the shared library providing the OpenCL driver. Note that you should replace ``ENVIRONMENTNAME`` with the name of your environment, shown between parentheses on your command line prompt. + This path (for the currently-active conda environment) can be obtained from the + environment variable ``CONDA_PREFIX``, i.e., + + :file:`$CONDA_PREFIX/etc/OpenCL/vendors` (once the Conda environment is activated). On Linux, if you have other OpenCL drivers installed (such as for your GPU), those will be in :file:`/etc/OpenCL/vendors`. You can make them work with PyOpenCL from Conda Forge -by placing a symbolic link to :file:`/etc/OpenCL/vendors` in the directory described -above. The version of ocl-icd installed with PyOpenCL from Conda Forge in Linux will -automatically recurse and find system-wide ICDs *if* that link is present. +by using the command:: + + conda install ocl-icd-system + +will make sure these system-wide ICDs are also visible in your conda environment. +As an alternative, one may manually copy ICD files from :file:`/etc/OpenCL/vendors` into, +e.g., :file:`$CONDA_PREFIX/etc/OpenCL/vendors`. If you are looking for more information, see `ocl-icd `_ and its documentation. Ocl-icd is the @@ -120,6 +144,12 @@ To install pyopencl with oclgrind, an OpenCL debugger do, #. ``pip install pyopencl[oclgrind]`` +.. note:: + + Avoid mixing components installed from Conda Forge and PyPI. + For example, installing PyOpenCL from pip followed by OCL-ICD from Conda Forge can + redirect the ICD loader, removing access to system-wide ICDs. + Installing from source ---------------------- @@ -230,14 +260,23 @@ other software to be turned into the corresponding :mod:`pyopencl` objects. User-visible Changes ==================== -Version 2018.2 +Version 2020.3 -------------- - .. 
note:: This version is currently under development. You can get snapshots from PyOpenCL's `git repository `_ +Version 2020.2 +-------------- + +- Drop Python 2 support. +- Add ``allow_empty_ndrange`` to kernel enqueue. +- Bug fixes. + +Version 2018.2 +-------------- + * Use pybind11. * Many bug fixes. * Support arrays with offsets in scan kernels. @@ -354,8 +393,8 @@ Version 2013.1 * Deprecated :func:`pyopencl.tools.register_dtype` in favor of :func:`pyopencl.tools.get_or_register_dtype`. * Clean up the :class:`pyopencl.array.Array` constructor interface. -* Deprecate :class:`pyopencl.array.DefaultAllocator`. -* Deprecate :class:`pyopencl.tools.CLAllocator`. +* Deprecate ``pyopencl.array.DefaultAllocator``. +* Deprecate ``pyopencl.tools.CLAllocator`` * Introduce :class:`pyopencl.tools.DeferredAllocator`, :class:`pyopencl.tools.ImmediateAllocator`. * Allow arrays whose beginning does not coincide with the beginning of their :attr:`pyopencl.array.Array.data` :class:`pyopencl.Buffer`. @@ -389,7 +428,7 @@ Version 2013.1 may take a very long time to execute. This is because :mod:`numpy` first builds an object array of (compute-device) scalars (!) before it decides that that's probably not such a bright idea and finally calls - :meth:`pyopencl.array.Array.__rmul__`. + ``pyopencl.array.Array.__rmul__``. Note that only left arithmetic operations of :class:`pyopencl.array.Array` by :mod:`numpy` scalars are affected. Python's number types (:class:`float` etc.) @@ -416,7 +455,7 @@ Version 2012.1 Version 2011.2 -------------- -* Add :func:`pyopencl.enqueue_migrate_mem_object`. +* Add :func:`pyopencl.enqueue_migrate_mem_objects`. * Add :func:`pyopencl.image_from_array`. * IMPORTANT BUGFIX: Kernel caching was broken for all the 2011.1.x releases, with severe consequences on the execution time of :class:`pyopencl.array.Array` @@ -424,7 +463,7 @@ Version 2011.2 Henrik Andresen at a `PyOpenCL workshop at DTU `_ first noticed the strange timings. 
* All comparable PyOpenCL objects are now also hashable. -* Add :func:`pyopencl.tools.context_dependent_memoize` to the documented +* Add ``pyopencl.tools.context_dependent_memoize`` to the documented functionality. * Base :mod:`pyopencl.clrandom` on `RANLUXCL `_, add functionality. @@ -432,13 +471,13 @@ Version 2011.2 * Add :mod:`pyopencl.characterize`. * Ensure compatibility with OS X Lion. * Add :func:`pyopencl.tools.register_dtype` to enable scan/reduction on struct types. -* :func:`pyopencl.enqueue_migrate_mem_object` was renamed - :func:`pyopencl.enqueue_migrate_mem_object_ext`. - :func:`pyopencl.enqueue_migrate_mem_object` now refers to the OpenCL 1.2 function +* :func:``pyopencl.enqueue_migrate_mem_objects`` was renamed + ``pyopencl.enqueue_migrate_mem_objects_ext``. + :func:`pyopencl.enqueue_migrate_mem_objects` now refers to the OpenCL 1.2 function of this name, if available. -* :func:`pyopencl.create_sub_devices` was renamed - :func:`pyopencl.create_sub_devices_ext`. - :func:`pyopencl.create_sub_devices` now refers to the OpenCL 1.2 function +* :meth:`pyopencl.Device.create_sub_devices` was renamed + ``pyopencl.Device.create_sub_devices_ext``. + :meth:`pyopencl.Device.create_sub_devices` now refers to the OpenCL 1.2 function of this name, if available. * Alpha support for OpenCL 1.2. @@ -458,14 +497,14 @@ Version 2011.1 * All *is_blocking* parameters now default to *True* to avoid crashy-by-default behavior. (suggested by Jan Meinke) In particular, this change affects - :func:`pyopencl.enqueue_read_buffer`, - :func:`pyopencl.enqueue_write_buffer`, - :func:`pyopencl.enqueue_read_buffer_rect`, - :func:`pyopencl.enqueue_write_buffer_rect`, - :func:`pyopencl.enqueue_read_image`, - :func:`pyopencl.enqueue_write_image`, - :func:`pyopencl.enqueue_map_buffer`, - :func:`pyopencl.enqueue_map_image`. 
+ ``pyopencl.enqueue_read_buffer``, + ``pyopencl.enqueue_write_buffer``, + ``pyopencl.enqueue_read_buffer_rect``, + ``pyopencl.enqueue_write_buffer_rect``, + ``pyopencl.enqueue_read_image``, + ``pyopencl.enqueue_write_image``, + ``pyopencl.enqueue_map_buffer``, + ``pyopencl.enqueue_map_image``. * Add :mod:`pyopencl.reduction`. * Add :ref:`reductions`. * Add :mod:`pyopencl.scan`. @@ -507,7 +546,7 @@ Version 0.91.5 * Add :attr:`pyopencl.ImageFormat.channel_count`, :attr:`pyopencl.ImageFormat.dtype_size`, :attr:`pyopencl.ImageFormat.itemsize`. -* Add missing :func:`pyopencl.enqueue_copy_buffer`. +* Add missing ``pyopencl.enqueue_copy_buffer``. * Add :func:`pyopencl.create_some_context`. * Add :func:`pyopencl.enqueue_barrier`, which was previously missing. @@ -531,7 +570,7 @@ Version 0.91.2 * :meth:`pyopencl.Program.build` now captures build logs and adds them to the exception text. -* Deprecate :func:`pyopencl.create_context_from_type` in favor of second +* Deprecate ``pyopencl.create_context_from_type`` in favor of second form of :class:`pyopencl.Context` constructor * Introduce :class:`pyopencl.LocalMemory`. * Document kernel invocation and :meth:`pyopencl.Kernel.set_arg`. @@ -542,7 +581,7 @@ Version 0.91.1 * Fixed a number of bugs, notably involving :class:`pyopencl.Sampler`. * :class:`pyopencl.Device`, :class:`pyopencl.Platform`, :class:`pyopencl.Context` now have nicer string representations. -* Add :attr:`Image.shape`. (suggested by David Garcia) +* Add :attr:`pyopencl.Image.shape`. (suggested by David Garcia) Version 0.91 ------------ @@ -553,26 +592,26 @@ Version 0.91 * Add :meth:`pyopencl.ImageFormat.__repr__`. * Add :meth:`pyopencl.addressing_mode.to_string` and colleagues. 
* The `pitch` arguments to - :func:`pyopencl.create_image_2d`, - :func:`pyopencl.create_image_3d`, - :func:`pyopencl.enqueue_read_image`, and - :func:`pyopencl.enqueue_write_image` + ``pyopencl.create_image_2d``, + ``pyopencl.create_image_3d``, + ``pyopencl.enqueue_read_image``, and + ``pyopencl.enqueue_write_image`` are now defaulted to zero. The argument order of `enqueue_{read,write}_image` has changed for this reason. * Deprecate - :func:`pyopencl.create_image_2d`, - :func:`pyopencl.create_image_3d` + ``pyopencl.create_image_2d``, + ``pyopencl.create_image_3d`` in favor of the :class:`pyopencl.Image` constructor. * Deprecate - :func:`pyopencl.create_program_with_source`, - :func:`pyopencl.create_program_with_binary` + ``pyopencl.create_program_with_source``, + ``pyopencl.create_program_with_binary`` in favor of the :class:`pyopencl.Program` constructor. * Deprecate - :func:`pyopencl.create_buffer`, - :func:`pyopencl.create_host_buffer` + ``pyopencl.create_buffer``, + ``pyopencl.create_host_buffer`` in favor of the :class:`pyopencl.Buffer` constructor. -* :meth:`pyopencl.MemoryObject.get_image_info` now actually exists. -* Add :attr:`pyopencl.MemoryObject.image.info`. +* :meth:`pyopencl.Image.get_image_info` now actually exists. +* Add :attr:`pyopencl.Image.info`. * Fix API tracing. * Add constructor arguments to :class:`pyopencl.ImageFormat`. (suggested by David Garcia) @@ -671,3 +710,121 @@ Andreas Klöckner's work on :mod:`pyopencl` was supported in part by AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The views and opinions expressed herein do not necessarily reflect those of the funding agencies. + +Documentation Cross-References +============================== + +Numpy +----- +.. currentmodule:: numpy + +.. class:: int8 + + See :class:`numpy.generic`. + +.. class:: int32 + + See :class:`numpy.generic`. + +.. class:: float64 + + See :class:`numpy.generic`. + +OpenCL Specification +-------------------- +.. 
c:type:: cl_platform_id + + See the `CL specification `__. + +.. c:type:: cl_device_id + + See the `CL specification `__. + +.. c:type:: cl_context + + See the `CL specification `__. + +.. c:type:: cl_command_queue + + See the `CL specification `__. + +.. c:type:: cl_mem + + See the `CL specification `__. + +.. c:type:: cl_program + + See the `CL specification `__. + +.. c:type:: cl_kernel + + See the `CL specification `__. + +.. c:type:: cl_sampler + + See the `CL specification `__. + +.. c:type:: cl_event + + See the `CL specification `__. + +.. c:function:: void clCreateCommandQueueWithProperties() + + See the `CL specification `__. + +.. c:function:: void clCreateSamplerWithProperties() + + See the `CL specification `__. + +.. c:function:: void clCreatePipe() + + See the `CL specification `__. + +Internal Types +-------------- + +.. currentmodule:: pyopencl._cl + +.. class:: Platform + + See :class:`pyopencl.Platform`. + +.. class:: Device + + See :class:`pyopencl.Device`. + +.. class:: CommandQueue + + See :class:`pyopencl.CommandQueue`. + +.. class:: Context + + See :class:`pyopencl.Context`. + +.. class:: Event + + See :class:`pyopencl.Event`. + +.. class:: SVMAllocation + + See :class:`pyopencl.SVMAllocation`. + +.. class:: MemoryMap + + See :class:`pyopencl.MemoryMap`. + +.. class:: Sampler + + See :class:`pyopencl.Sampler`. + +.. class:: Program + + See :class:`pyopencl.Program`. + +.. class:: _Program + + See :class:`pyopencl.Program`. + +.. class:: Kernel + + See :class:`pyopencl.Kernel`. diff --git a/doc/runtime_const.rst b/doc/runtime_const.rst index 3001cc1143ef19a2e5caac182f2a4359b2d1f572..864a641c7ce303e3d46ecfecbf8c6d443113e364 100644 --- a/doc/runtime_const.rst +++ b/doc/runtime_const.rst @@ -1,4 +1,28 @@ OpenCL Runtime: Constants ========================= +.. currentmodule:: pyopencl + .. include:: constants.inc + +.. class:: NameVersion + Describes the version of a specific feature. + + .. note:: + + Only available with OpenCL 3.0 or newer. 
+ + .. versionadded:: 2020.3 + + .. method:: __init__(version, name) + .. attribute:: version + .. attribute:: name + +.. class:: DeviceTopologyAmd + .. method:: __init__(bus, device, function) + .. attribute:: type + .. attribute:: bus + .. attribute:: device + .. attribute:: function + +.. vim: shiftwidth=4 diff --git a/doc/runtime_gl.rst b/doc/runtime_gl.rst index a391c173c8f7eb46aef2d5b1a8d8d32217615868..ecc3891d88e701ad6e56272b5518df032bec1bab 100644 --- a/doc/runtime_gl.rst +++ b/doc/runtime_gl.rst @@ -51,7 +51,7 @@ with GL support. See :func:`have_gl`. .. method:: get_gl_texture_info(param) - See :class:`gl_texture_info` for values of *param*. Only available when PyOpenCL is compiled with GL support. See :func:`have_gl`. + See ``gl_texture_info`` for values of *param*. Only available when PyOpenCL is compiled with GL support. See :func:`have_gl`. .. function:: enqueue_acquire_gl_objects(queue, mem_objects, wait_for=None) diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst index f92f13cd67f91329073d948f3f15e487b724d3e0..fc121554842af4c6565d88fe1305082fd76cf7f5 100644 --- a/doc/runtime_memory.rst +++ b/doc/runtime_memory.rst @@ -191,7 +191,7 @@ Image See :class:`mem_flags` for possible values of *flags* and :class:`mem_object_type` for possible values of *image_type*. -.. class:: Image(context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None): +.. class:: Image(context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None) See :class:`mem_flags` for values of *flags*. *shape* is a 2- or 3-tuple. *format* is an instance of :class:`ImageFormat`. @@ -206,7 +206,7 @@ Image .. 
note:: - If you want to load images from :mod:`numpy.ndarray` instances or read images + If you want to load images from :class:`numpy.ndarray` instances or read images back into them, be aware that OpenCL images expect the *x* dimension to vary fastest, whereas in the default (C) order of :mod:`numpy` arrays, the last index varies fastest. If your array is arranged in the wrong order in memory, @@ -344,7 +344,6 @@ Samplers .. class:: Sampler - .. method:: __init__(context, normalized_coords, addressing_mode, filter_mode) *normalized_coords* is a :class:`bool` indicating whether @@ -353,7 +352,7 @@ Samplers See :class:`addressing_mode` and :class:`filter_mode` for possible argument values. - .. method:: __init__(context, properties) + Also supports an alternate signature ``(context, properties)``. :arg properties: a sequence of keys and values from :class:`sampler_properties` as accepted @@ -361,9 +360,11 @@ Samplers spec for details). The trailing *0* is added automatically and does not need to be included. - Requires OpenCL 2 or newer. + This signature Requires OpenCL 2 or newer. + + .. versionchanged:: 2018.2 - .. versionadded:: 2018.2 + The properties-based signature was added. .. attribute:: info @@ -380,3 +381,23 @@ Samplers |comparable| +Pipes +----- + +.. class:: Pipe(context, flags, packet_size, max_packets, properties) + + See :class:`mem_flags` for values of *flags*. + + :arg properties: a sequence + of keys and values from :class:`pipe_properties` as accepted + by :c:func:`clCreatePipe`. The trailing *0* is added automatically + and does not need to be included. + + This function Requires OpenCL 2 or newer. + + .. versionadded:: 2020.3 + + .. method:: get_pipe_info(param) + + See :class:`pipe_info` for values of *param*. 
+ diff --git a/doc/runtime_platform.rst b/doc/runtime_platform.rst index 6ee8fb661a842e60696019d43aef91be65981e77..d6e2ecb859edf03f4f1a769b117e2644df1d04f0 100644 --- a/doc/runtime_platform.rst +++ b/doc/runtime_platform.rst @@ -45,6 +45,8 @@ Device .. class:: Device + Two instances of this class may be compared using *=="* and *"!="*. + .. attribute:: info Lower case versions of the :class:`device_info` constants @@ -58,6 +60,18 @@ Device .. automethod:: from_int_ptr .. autoattribute:: int_ptr + .. attribute :: hashable_model_and_version_identifier + + An unspecified data type that can be used to (as precisely as possible, + given identifying information available in OpenCL) identify a given + model and software stack version of a compute device. Note that this + identifier does not differentiate between different instances of the + same device installed in a single host. + + The returned data type is hashable. + + .. versionadded:: 2020.1 + .. method:: create_sub_devices(properties) *properties* is an array of one (or more) of the forms:: @@ -76,7 +90,19 @@ Device .. versionadded:: 2011.2 - Two instances of this class may be compared using *=="* and *"!="*. + .. method:: device_and_host_timer + + :returns: a tuple ``(device_timestamp, host_timestamp)``. + + Only available with CL 2.0. + + .. versionadded:: 2020.3 + + .. method:: host_timer + + Only available with CL 2.0. + + .. versionadded:: 2020.3 Context ------- @@ -123,12 +149,12 @@ Context .. 
note:: For - :attr:`context_properties.CL_GL_CONTEXT_KHR`, - :attr:`context_properties.CL_EGL_DISPLAY_KHR`, - :attr:`context_properties.CL_GLX_DISPLAY_KHR`, - :attr:`context_properties.CL_WGL_HDC_KHR`, and - :attr:`context_properties.CL_CGL_SHAREGROUP_KHR` - :attr:`context_properties.CL_CGL_SHAREGROUP_APPLE` + ``context_properties.CL_GL_CONTEXT_KHR``, + ``context_properties.CL_EGL_DISPLAY_KHR``, + ``context_properties.CL_GLX_DISPLAY_KHR``, + ``context_properties.CL_WGL_HDC_KHR``, and + ``context_properties.CL_CGL_SHAREGROUP_KHR`` + ``context_properties.CL_CGL_SHAREGROUP_APPLE`` the value in the key-value pair is a PyOpenGL context or display instance. @@ -148,6 +174,8 @@ Context .. automethod:: from_int_ptr .. autoattribute:: int_ptr + .. method:: set_default_device_command_queue(dev, queue) + |comparable| .. function:: create_some_context(interactive=True, answers=None, cache_dir=None) diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst index e95468782e37f3ec9fc1ab18556dfda371dc3adf..8cb2077ccddb785595490dd3ff9244abdfd431d9 100644 --- a/doc/runtime_program.rst +++ b/doc/runtime_program.rst @@ -8,6 +8,21 @@ OpenCL Runtime: Programs and Kernels Program ------- +.. envvar:: PYOPENCL_NO_CACHE + + By setting the environment variable :envvar:`PYOPENCL_NO_CACHE` to any + non-empty value, this caching is suppressed. + + .. versionadded:: 2013.1 + +.. envvar:: PYOPENCL_BUILD_OPTIONS + + Any options found in the environment variable + :envvar:`PYOPENCL_BUILD_OPTIONS` will be appended to *options* + in :meth:`Program.build`. + + .. versionadded:: 2013.1 + .. class:: Program(context, src) Program(context, devices, binaries) @@ -47,17 +62,13 @@ Program If passed *cache_dir* is None and context was created with None cache_dir: built binaries will be cached in an on-disk cache called :file:`pyopencl-compiler-cache-vN-uidNAME-pyVERSION` in the directory - returned by :func:`tempfile.gettempdir`. 
By setting the environment - variable :envvar:`PYOPENCL_NO_CACHE` to any non-empty value, this - caching is suppressed. Any options found in the environment variable - :envvar:`PYOPENCL_BUILD_OPTIONS` will be appended to *options*. + returned by :func:`tempfile.gettempdir`. + + See also :envvar:`PYOPENCL_NO_CACHE`, :envvar:`PYOPENCL_BUILD_OPTIONS`. .. versionchanged:: 2011.1 - *options* may now also be a :class:`list` of :class:`str`. - .. versionchanged:: 2013.1 - Added :envvar:`PYOPENCL_NO_CACHE`. - Added :envvar:`PYOPENCL_BUILD_OPTIONS`. + *options* may now also be a :class:`list` of :class:`str`. .. method:: compile(self, options=[], devices=None, headers=[]) @@ -99,6 +110,12 @@ Program Returns a list of all :class:`Kernel` objects in the :class:`Program`. + .. method:: set_specialization_constant(spec_id, buffer) + + Only available with CL 2.2 and newer. + + .. versionadded:: 2020.3 + .. automethod:: from_int_ptr .. autoattribute:: int_ptr @@ -133,6 +150,12 @@ Kernel may be used as attributes on instances of this class to directly query info attributes. + .. method:: clone() + + Only available with CL 2.1. + + .. versionadded:: 2020.3 + .. method:: get_info(param) See :class:`kernel_info` for values of *param*. @@ -147,6 +170,17 @@ Kernel Only available in OpenCL 1.2 and newer. + .. method:: get_sub_group_info(self, device, param, input_value=None) + + When the OpenCL spec requests *input_value* to be of type ``size_t``, + these may be passed directly as a number. When it requests + *input_value* to be of type ``size_t *``, a tuple of integers + may be passed. + + Only available in OpenCL 2.1 and newer. + + .. versionadded:: 2020.3 + .. method:: set_arg(self, index, arg) *arg* may be @@ -205,19 +239,17 @@ Kernel prg.kernel(queue, n_globals, None, args) - .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) + .. 
method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using :meth:`set_args` to set each argument in turn. See the documentation for :meth:`set_arg` to see what argument types are allowed. - |std-enqueue-blurb| - *None* may be passed for local_size. + |glsize| - If *g_times_l* is specified, the global size will be multiplied by the - local size. (which makes the behavior more like Nvidia CUDA) In this case, - *global_size* and *local_size* also do not have to have the same number - of dimensions. + |empty-nd-range| + + |std-enqueue-blurb| .. note:: @@ -233,6 +265,7 @@ Kernel `_. .. versionchanged:: 0.92 + *local_size* was promoted to third positional argument from being a keyword argument. The old keyword argument usage will continue to be accepted with a warning throughout the 0.92 release cycle. @@ -244,8 +277,13 @@ Kernel it from working. .. versionchanged:: 2011.1 + Added the *g_times_l* keyword arg. + .. versionchanged:: 2020.2 + + Added the *allow_empty_ndrange* keyword argument. + .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) This method supports the exact same interface as :meth:`__call__`, but @@ -283,19 +321,18 @@ Kernel The size of local buffer in bytes to be provided. -.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False) +.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) - |std-enqueue-blurb| + |glsize| + + |empty-nd-range| - If *g_times_l* is specified, the global size will be multiplied by the - local size. 
(which makes the behavior more like Nvidia CUDA) In this case, - *global_size* and *local_size* also do not have to have the same number - of dimensions. + |std-enqueue-blurb| .. versionchanged:: 2011.1 - Added the *g_times_l* keyword arg. + Added the *g_times_l* keyword arg. -.. function:: enqueue_task(queue, kernel, wait_for=None) + .. versionchanged:: 2020.2 - |std-enqueue-blurb| + Added the *allow_empty_ndrange* keyword argument. diff --git a/doc/runtime_queue.rst b/doc/runtime_queue.rst index c0b42897d151cd95c1289665e4f5a00d801fc078..f120c61edaa3b1b876e1f482a55bc6a2be49369c 100644 --- a/doc/runtime_queue.rst +++ b/doc/runtime_queue.rst @@ -78,14 +78,9 @@ Event may be used as attributes on instances of this class to directly query info attributes. - .. attribute:: profile.info + .. attribute:: profile - Lower case versions of the :class:`profiling_info` constants - may be used as attributes on the attribute `profile` of this - class to directly query profiling info. - - For example, you may use *evt.profile.end* instead of - *evt.get_profiling_info(pyopencl.profiling_info.END)*. + An instance of :class:`ProfilingInfoGetter`. .. method:: get_info(param) @@ -114,6 +109,17 @@ Event |comparable| +.. class:: ProfilingInfoGetter + + .. attribute:: info + + Lower case versions of the :class:`profiling_info` constants + may be used as attributes on the attribute `profile` of this + class to directly query profiling info. + + For example, you may use *evt.profile.end* instead of + *evt.get_profiling_info(pyopencl.profiling_info.END)*. + Event Subclasses ---------------- diff --git a/doc/subst.rst b/doc/subst.rst index 4210ab24ce99a871aa4cfe318d3eb07049d5a98a..b9603526f4ec761b098aee342e424b912657b9cf 100644 --- a/doc/subst.rst +++ b/doc/subst.rst @@ -12,4 +12,25 @@ .. |std-enqueue-blurb| replace:: Returns a new :class:`pyopencl.Event`. |explain-waitfor| .. |copy-depr| replace:: **Note:** This function is deprecated as of PyOpenCL 2011.1. 
- Use :func:`enqueue_copy` instead. + Use :func:`~pyopencl.enqueue_copy` instead. + +.. |glsize| replace:: *global_size* and *local_size* are tuples of identical length, with + between one and three entries. *global_size* specifies the overall size + of the computational grid: one work item will be launched for every + integer point in the grid. *local_size* specifies the workgroup size, + which must evenly divide the *global_size* in a dimension-by-dimension + manner. *None* may be passed for local_size, in which case the + implementation will use an implementation-defined workgroup size. + If *g_times_l* is *True*, the global size will be multiplied by the + local size. (which makes the behavior more like Nvidia CUDA) In this case, + *global_size* and *local_size* also do not have to have the same number + of entries. + +.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating + how an empty NDRange is to be treated, where "empty" means that one or more + entries of *global_size* or *local_size* are zero. OpenCL itself does not + allow enqueueing kernels over empty NDRanges. Setting this flag to *True* + enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``) + to obtain the synchronization effects that would have resulted from + the kernel enqueue. + Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer. diff --git a/doc/tools.rst b/doc/tools.rst index 243535142b21ef219391f7e9a552b846b58ec60c..7fdde084ee6be97fc0fb05309927a02ccd8f4107 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -15,7 +15,7 @@ fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. 
-Then, instead of fully returning the memory to the system and incurring the +Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. The pool reacts appropriately to out-of-memory conditions as long as all memory allocations @@ -36,6 +36,15 @@ not complicated:: memory is returned to the pool. This supports the same interface as :class:`pyopencl.Buffer`. +.. class:: AllocatorInterface + + An interface implemented by various memory allocation functions + in :mod:`pyopencl`. + + .. method:: __call__(size) + + Allocate and return a :class:`pyopencl.Buffer` of the given *size*. + .. class:: DeferredAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE) *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds @@ -46,14 +55,22 @@ not complicated:: bound to contexts, not devices, and memory availability depends on which device the buffer is used with.) - .. versionchanged:: - In version 2013.1, :class:`CLAllocator` was deprecated and replaced + Implements :class:`AllocatorInterface`. + + .. versionchanged :: 2013.1 + + ``CLAllocator`` was deprecated and replaced by :class:`DeferredAllocator`. .. method:: __call__(size) Allocate a :class:`pyopencl.Buffer` of the given *size*. + .. versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + .. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE) *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds @@ -62,12 +79,19 @@ not complicated:: allocated memory is actually available. If no memory is available, an out-of-memory error is reported at allocation time. + Implements :class:`AllocatorInterface`. + .. versionadded:: 2013.1 .. method:: __call__(size) Allocate a :class:`pyopencl.Buffer` of the given *size*. + .. 
versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + .. class:: MemoryPool(allocator[, leading_bits_in_bin_id]) A memory pool for OpenCL device memory. *allocator* must be an instance of @@ -76,6 +100,8 @@ not complicated:: by the allocator immediately, and not in the OpenCL-typical deferred manner. + + Implements :class:`AllocatorInterface`. + .. note:: The current implementation of the memory pool will retain allocated @@ -108,13 +134,27 @@ not complicated:: The number of blocks in active use that have been allocated through this pool. + + .. attribute:: managed_bytes + + "Managed" memory is "active" and "held" memory. + + .. versionadded: 2021.1.2 + + .. attribute:: active_bytes + + "Active" bytes are bytes under the control of the application. + This may be smaller than the actual allocated size reflected + in :attr:`managed_bytes`. + + .. versionadded: 2021.1.2 + .. method:: allocate(size) Return a :class:`PooledBuffer` of the given *size*. .. method:: __call__(size) - Synonym for :meth:`allocate` to match :class:`CLAllocator` interface. + Synonym for :meth:`allocate` to match the :class:`AllocatorInterface`. .. versionadded: 2011.2 diff --git a/doc/types.rst b/doc/types.rst index ccc96fdd355737bc90e6ec1112b876ba30f1e7cb..dbd9794947c9188c008742e0d93b3324abcdf66b 100644 --- a/doc/types.rst +++ b/doc/types.rst @@ -1,7 +1,7 @@ OpenCL Type Mapping =================== -.. module:: pyopencl.types +.. module:: pyopencl.cltypes .. _type-mappings: @@ -21,6 +21,11 @@ see that a cl_long is 64 bit unsigned integer. Use the module as follows: >>> cl_long = cl.cltypes.long(1235) # maps to numpy.int64 >>> floats = np.empty((128,), dtype=cl.cltypes.float) # array of numpy.float32 +.. note:: + + The OpenCL type ``bool`` does not have a corresponding :mod:`numpy` type defined here, + because OpenCL does not specify the in-memory representation (or even the storage + size) for this type. 
Vector Types ------------ diff --git a/examples/demo-struct-reduce.py b/examples/demo-struct-reduce.py index 2b0d9803f1fdd32e85a2da7fe245297a8ac5cf95..c0c26e34743687a281c534c05d9c8cb74c6587ec 100644 --- a/examples/demo-struct-reduce.py +++ b/examples/demo-struct-reduce.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import numpy as np import pyopencl as cl diff --git a/examples/demo.py b/examples/demo.py index 59f7b3d45399777c36c49b0b1690953dbe3387cc..a4a503336e126cf5a392fc32f040166c6d92b939 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,7 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function import numpy as np import pyopencl as cl @@ -25,7 +23,8 @@ __kernel void sum( """).build() res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) -prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a_np.shape, None, a_g, b_g, res_g) res_np = np.empty_like(a_np) cl.enqueue_copy(queue, res_np, res_g) diff --git a/examples/demo_array.py b/examples/demo_array.py index c645b372632b8792d302658bbfa6c263b051491e..74bb7cfc6fead21ff0a0bb29266e82541586aa9e 100644 --- a/examples/demo_array.py +++ b/examples/demo_array.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import pyopencl as cl import pyopencl.array as cl_array import numpy @@ -24,6 +22,7 @@ prg = cl.Program(ctx, """ } """).build() -prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get())) diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index a8a3a007c094cf9b1ca7d3fc66142b7817a8b83d..3521089210a6dcde9d957443cc909049f39b61c8 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -1,7 +1,4 @@ -from __future__ 
import absolute_import -from __future__ import print_function #!/usr/bin/env python -# -*- coding: utf-8 -*- import numpy as np import pyopencl as cl diff --git a/examples/demo_elementwise_complex.py b/examples/demo_elementwise_complex.py index 9e04e2dd5a4f09c4235e860de1aa32dfc41a714f..4fe98ec9d0f0d514c84180e2775d84c7f808b152 100644 --- a/examples/demo_elementwise_complex.py +++ b/examples/demo_elementwise_complex.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import pyopencl as cl import pyopencl.array as cl_array import numpy diff --git a/examples/demo_mandelbrot.py b/examples/demo_mandelbrot.py index 802dfb215802c70e86bdc7534d401b4efe2f173b..1c04da6124d3cbb276e990369c381879b6df307b 100644 --- a/examples/demo_mandelbrot.py +++ b/examples/demo_mandelbrot.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function # I found this example for PyCuda here: # http://wiki.tiker.net/PyCuda/Examples/Mandelbrot # @@ -24,7 +22,8 @@ import time import numpy as np import pyopencl as cl -from six.moves import range + +from PIL import Image # You can choose a calculation routine below (calc_fractal), uncomment # one of the three lines to test the three variations @@ -45,7 +44,9 @@ def calc_fractal_opencl(q, maxiter): q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes) - prg = cl.Program(ctx, """ + prg = cl.Program( + ctx, + """ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable __kernel void mandelbrot(__global float2 *q, __global ushort *output, ushort const maxiter) @@ -65,10 +66,12 @@ def calc_fractal_opencl(q, maxiter): output[gid] = curiter; } } - """).build() + """, + ).build() - prg.mandelbrot(queue, output.shape, None, q_opencl, - output_opencl, np.uint16(maxiter)) + prg.mandelbrot( + queue, output.shape, None, q_opencl, output_opencl, np.uint16(maxiter) + ) cl.enqueue_copy(queue, output, 
output_opencl).wait() @@ -80,10 +83,15 @@ def calc_fractal_serial(q, maxiter): # note that, unlike the other two implementations, # the number of iterations per point is NOT constant z = np.zeros(q.shape, complex) - output = np.resize(np.array(0,), q.shape) + output = np.resize( + np.array( + 0, + ), + q.shape, + ) for i in range(len(q)): for iter in range(maxiter): - z[i] = z[i]*z[i] + q[i] + z[i] = z[i] * z[i] + q[i] if abs(z[i]) > 2.0: output[i] = iter break @@ -93,71 +101,81 @@ def calc_fractal_serial(q, maxiter): def calc_fractal_numpy(q, maxiter): # calculate z using numpy, this is the original # routine from vegaseat's URL - output = np.resize(np.array(0,), q.shape) + output = np.resize( + np.array( + 0, + ), + q.shape, + ) z = np.zeros(q.shape, np.complex64) for it in range(maxiter): - z = z*z + q + z = z * z + q done = np.greater(abs(z), 2.0) - q = np.where(done, 0+0j, q) - z = np.where(done, 0+0j, z) + q = np.where(done, 0 + 0j, q) + z = np.where(done, 0 + 0j, z) output = np.where(done, it, output) return output + # choose your calculation routine here by uncommenting one of the options calc_fractal = calc_fractal_opencl # calc_fractal = calc_fractal_serial # calc_fractal = calc_fractal_numpy -if __name__ == '__main__': + +class Mandelbrot: + def draw(self, x1, x2, y1, y2, maxiter=30): + # draw the Mandelbrot set, from numpy example + xx = np.arange(x1, x2, (x2 - x1) / w) + yy = np.arange(y2, y1, (y1 - y2) / h) * 1j + q = np.ravel(xx + yy[:, np.newaxis]).astype(np.complex64) + + start_main = time.time() + output = calc_fractal(q, maxiter) + end_main = time.time() + + secs = end_main - start_main + print("Main took", secs) + + self.mandel = (output.reshape((h, w)) / float(output.max()) * 255.0).astype( + np.uint8 + ) + + def create_image(self): + """ " + create the image from the draw() string + """ + # you can experiment with these x and y ranges + self.draw(-2.13, 0.77, -1.3, 1.3) + self.im = Image.fromarray(self.mandel) + self.im.putpalette([i for rgb 
in ((j, 0, 0) for j in range(255)) + for i in rgb]) + + def create_label(self): + # put the image on a label widget + self.image = ImageTk.PhotoImage(self.im) + self.label = tk.Label(self.root, image=self.image) + self.label.pack() + + def run_tk(self): + self.root = tk.Tk() + self.root.title("Mandelbrot Set") + self.create_image() + self.create_label() + # start event loop + self.root.mainloop() + + +if __name__ == "__main__": + test = Mandelbrot() try: - import six.moves.tkinter as tk - except ImportError: - # Python 3 import tkinter as tk - from PIL import Image, ImageTk - - class Mandelbrot(object): - def __init__(self): - # create window - self.root = tk.Tk() - self.root.title("Mandelbrot Set") - self.create_image() - self.create_label() - # start event loop - self.root.mainloop() - - def draw(self, x1, x2, y1, y2, maxiter=30): - # draw the Mandelbrot set, from numpy example - xx = np.arange(x1, x2, (x2-x1)/w) - yy = np.arange(y2, y1, (y1-y2)/h) * 1j - q = np.ravel(xx+yy[:, np.newaxis]).astype(np.complex64) - - start_main = time.time() - output = calc_fractal(q, maxiter) - end_main = time.time() - - secs = end_main - start_main - print("Main took", secs) - - self.mandel = (output.reshape((h, w)) / - float(output.max()) * 255.).astype(np.uint8) - - def create_image(self): - """" - create the image from the draw() string - """ - # you can experiment with these x and y ranges - self.draw(-2.13, 0.77, -1.3, 1.3) - self.im = Image.fromarray(self.mandel) - self.im.putpalette([i for rgb in ((j, 0, 0) for j in range(255)) - for i in rgb]) - - def create_label(self): - # put the image on a label widget - self.image = ImageTk.PhotoImage(self.im) - self.label = tk.Label(self.root, image=self.image) - self.label.pack() - - # test the class - test = Mandelbrot() + except ModuleNotFoundError: + test.create_image() + else: + from PIL import ImageTk + try: + test.run_tk() + except tk.TclError: + test.create_image() diff --git a/examples/demo_meta_codepy.py 
b/examples/demo_meta_codepy.py index 7ab9958f490bb17b5a55b18c2e9649909ac8c703..2ba293c5dfc3783f449b8bc6e0b060a90a4d0c3e 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -1,8 +1,6 @@ -from __future__ import absolute_import import pyopencl as cl import numpy import numpy.linalg as la -from six.moves import range local_size = 256 thread_strides = 32 @@ -21,10 +19,10 @@ a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) -from codepy.cgen import FunctionBody, \ +from cgen import FunctionBody, \ FunctionDeclaration, Typedef, POD, Value, \ Pointer, Module, Block, Initializer, Assign, Const -from codepy.cgen.opencl import CLKernel, CLGlobal, \ +from cgen.opencl import CLKernel, CLGlobal, \ CLRequiredWorkGroupSize mod = Module([ @@ -35,14 +33,14 @@ mod = Module([ arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name)))) for name in ["tgt", "op1", "op2"]]))), Block([ - Initializer(POD(numpy.int32, "idx"), + Initializer(POD(numpy.int32, "idx"), "get_local_id(0) + %d * get_group_id(0)" % (local_size*thread_strides)) ]+[ Assign( "tgt[idx+%d]" % (o*local_size), "op1[idx+%d] + op2[idx+%d]" % ( - o*local_size, + o*local_size, o*local_size)) for o in range(thread_strides)]))]) @@ -52,7 +50,7 @@ knl(queue, (local_size*macroblock_count,), (local_size,), c_buf, a_buf, b_buf) c = numpy.empty_like(a) -cl.enqueue_read_buffer(queue, c_buf, c).wait() +cl.enqueue_copy(queue, c, c_buf).wait() assert la.norm(c-(a+b)) == 0 diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py index 76b5f65bf88ba938273b640831a998f93cd94812..a39e954221b94cd44876e3af42a3d0feca129849 100644 --- a/examples/demo_meta_template.py +++ b/examples/demo_meta_template.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pyopencl as cl import numpy import numpy.linalg as la @@ -24,8 +23,8 @@ from mako.template import Template tpl = 
Template(""" __kernel void add( - __global ${ type_name } *tgt, - __global const ${ type_name } *op1, + __global ${ type_name } *tgt, + __global const ${ type_name } *op1, __global const ${ type_name } *op2) { int idx = get_local_id(0) @@ -34,13 +33,13 @@ tpl = Template(""" % for i in range(thread_strides): <% offset = i*local_size %> - tgt[idx + ${ offset }] = - op1[idx + ${ offset }] + tgt[idx + ${ offset }] = + op1[idx + ${ offset }] + op2[idx + ${ offset } ]; % endfor }""") -rendered_tpl = tpl.render(type_name="float", +rendered_tpl = tpl.render(type_name="float", local_size=local_size, thread_strides=thread_strides) knl = cl.Program(ctx, str(rendered_tpl)).build().add @@ -49,6 +48,6 @@ knl(queue, (local_size*macroblock_count,), (local_size,), c_buf, a_buf, b_buf) c = numpy.empty_like(a) -cl.enqueue_read_buffer(queue, c_buf, c).wait() +cl.enqueue_copy(queue, c, c_buf).wait() assert la.norm(c-(a+b)) == 0 diff --git a/examples/download-examples-from-wiki.py b/examples/download-examples-from-wiki.py deleted file mode 100755 index 0f8ea87527baeb492f2a264c476ac45f4ccff02d..0000000000000000000000000000000000000000 --- a/examples/download-examples-from-wiki.py +++ /dev/null @@ -1,58 +0,0 @@ -#! 
/usr/bin/env python - -from __future__ import absolute_import, print_function - -import six.moves.xmlrpc_client -destwiki = six.moves.xmlrpc_client.ServerProxy("http://wiki.tiker.net?action=xmlrpc2") - -import os -try: - os.mkdir("wiki-examples") -except OSError: - pass - -print("downloading wiki examples to wiki-examples/...") -print("fetching page list...") -all_pages = destwiki.getAllPages() - - -from os.path import exists - -for page in all_pages: - if not page.startswith("PyOpenCL/Examples/"): - continue - - print(page) - try: - content = destwiki.getPage(page) - - import re - match = re.search(r"\{\{\{\#\!python(.*)\}\}\}", content, re.DOTALL) - code = match.group(1) - - match = re.search("([^/]+)$", page) - fname = match.group(1) - - outfname = os.path.join("wiki-examples", fname+".py") - if exists(outfname): - print("%s exists, refusing to overwrite." % outfname) - else: - outf = open(outfname, "w") - outf.write(code) - outf.close() - - for att_name in destwiki.listAttachments(page): - content = destwiki.getAttachment(page, att_name) - - outfname = os.path.join("wiki-examples", att_name) - if exists(outfname): - print("%s exists, refusing to overwrite." 
% outfname) - else: - outf = open(outfname, "w") - outf.write(str(content)) - outf.close() - - except Exception as e: - print("Error when processing %s: %s" % (page, e)) - from traceback import print_exc - print_exc() diff --git a/examples/dump-performance.py b/examples/dump-performance.py index 00df1d1bad6e62fc284eb7fa7ce18731255fabc4..f582cd99fcae98df7325717b4e1541dbf873bbcb 100644 --- a/examples/dump-performance.py +++ b/examples/dump-performance.py @@ -1,7 +1,5 @@ -from __future__ import division, absolute_import, print_function import pyopencl as cl import pyopencl.characterize.performance as perf -from six.moves import range def main(): @@ -9,7 +7,7 @@ def main(): prof_overhead, latency = perf.get_profiling_overhead(ctx) print("command latency: %g s" % latency) - print("profiling overhead: %g s -> %.1f %%" % ( + print("profiling overhead: {:g} s -> {:.1f} %".format( prof_overhead, 100*prof_overhead/latency)) queue = cl.CommandQueue( ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) diff --git a/examples/dump-properties.py b/examples/dump-properties.py index e64f66fa25c9d0e47af70f9409b9ddd2b5aa424d..07d9159827c315605286d46a4f7de494b7d7489e 100644 --- a/examples/dump-properties.py +++ b/examples/dump-properties.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import pyopencl as cl from optparse import OptionParser @@ -21,13 +19,13 @@ def print_info(obj, info_cls): if (info_cls == cl.device_info and info_name == "PARTITION_TYPES_EXT" and isinstance(info_value, list)): - print("%s: %s" % (info_name, [ + print("{}: {}".format(info_name, [ cl.device_partition_property_ext.to_string(v, "") for v in info_value])) else: try: - print("%s: %s" % (info_name, info_value)) + print(f"{info_name}: {info_value}") except: print("%s: " % info_name) @@ -72,13 +70,13 @@ for platform in cl.get_platforms(): return result formats = ", ".join( - "%s-%s" % ( + "{}-{}".format( cl.channel_order.to_string(iform.channel_order, 
""), str_chd_type(iform.channel_data_type)) for iform in formats) - print("%s %s FORMATS: %s\n" % ( + print("{} {} FORMATS: {}\n".format( cl.mem_object_type.to_string(itype), cl.mem_flags.to_string(mf), formats)) diff --git a/examples/gl_interop_demo.py b/examples/gl_interop_demo.py index da5ba3b0d6c84216a4a2273134c7cacef3a26b1f..99524cb30b3662b09aa4599d14d6df259ff6f340 100644 --- a/examples/gl_interop_demo.py +++ b/examples/gl_interop_demo.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import from OpenGL.GL import * from OpenGL.GLUT import * from OpenGL.raw.GL.VERSION.GL_1_5 import glBufferData as rawGlBufferData diff --git a/examples/gl_particle_animation.py b/examples/gl_particle_animation.py index dd2f05c24686cc9cd777923b45de8963ed1f58b3..c8ac9c20a461c4e307497a4430f4a832dead4f84 100644 --- a/examples/gl_particle_animation.py +++ b/examples/gl_particle_animation.py @@ -1,26 +1,27 @@ -from __future__ import absolute_import # Visualization of particles with gravity # Source: http://enja.org/2010/08/27/adventures-in-opencl-part-2-particles-with-opengl/ -import pyopencl as cl # OpenCL - GPU computing interface +import pyopencl as cl # OpenCL - GPU computing interface + mf = cl.mem_flags from pyopencl.tools import get_gl_sharing_context_properties -from OpenGL.GL import * # OpenGL - GPU rendering interface -from OpenGL.GLU import * # OpenGL tools (mipmaps, NURBS, perspective projection, shapes) -from OpenGL.GLUT import * # OpenGL tool to make a visualization window -from OpenGL.arrays import vbo -import numpy # Number tools -import sys # System tools (path, modules, maxint) +from OpenGL.GL import * # OpenGL - GPU rendering interface +from OpenGL.GLU import * # OpenGL tools (mipmaps, NURBS, perspective projection, shapes) +from OpenGL.GLUT import * # OpenGL tool to make a visualization window +from OpenGL.arrays import vbo +import numpy # Number tools +import sys # System tools (path, modules, maxint) width = 800 height = 600 num_particles = 100000 -time_step = 
.005 +time_step = 0.005 mouse_down = False -mouse_old = {'x': 0., 'y': 0.} -rotate = {'x': 0., 'y': 0., 'z': 0.} -translate = {'x': 0., 'y': 0., 'z': 0.} -initial_translate = {'x': 0., 'y': 0., 'z': -2.5} +mouse_old = {"x": 0.0, "y": 0.0} +rotate = {"x": 0.0, "y": 0.0, "z": 0.0} +translate = {"x": 0.0, "y": 0.0, "z": 0.0} +initial_translate = {"x": 0.0, "y": 0.0, "z": -2.5} + def glut_window(): glutInit(sys.argv) @@ -38,60 +39,79 @@ def glut_window(): glViewport(0, 0, width, height) glMatrixMode(GL_PROJECTION) glLoadIdentity() - gluPerspective(60., width / float(height), .1, 1000.) + gluPerspective(60.0, width / float(height), 0.1, 1000.0) + + return window - return(window) def initial_buffers(num_particles): np_position = numpy.ndarray((num_particles, 4), dtype=numpy.float32) np_color = numpy.ndarray((num_particles, 4), dtype=numpy.float32) np_velocity = numpy.ndarray((num_particles, 4), dtype=numpy.float32) - np_position[:,0] = numpy.sin(numpy.arange(0., num_particles) * 2.001 * numpy.pi / num_particles) - np_position[:,0] *= numpy.random.random_sample((num_particles,)) / 3. + .2 - np_position[:,1] = numpy.cos(numpy.arange(0., num_particles) * 2.001 * numpy.pi / num_particles) - np_position[:,1] *= numpy.random.random_sample((num_particles,)) / 3. + .2 - np_position[:,2] = 0. - np_position[:,3] = 1. - - np_color[:,:] = [1.,1.,1.,1.] # White particles - - np_velocity[:,0] = np_position[:,0] * 2. - np_velocity[:,1] = np_position[:,1] * 2. - np_velocity[:,2] = 3. 
- np_velocity[:,3] = numpy.random.random_sample((num_particles, )) - - gl_position = vbo.VBO(data=np_position, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) + np_position[:, 0] = numpy.sin( + numpy.arange(0.0, num_particles) * 2.001 * numpy.pi / num_particles + ) + np_position[:, 0] *= numpy.random.random_sample((num_particles,)) / 3.0 + 0.2 + np_position[:, 1] = numpy.cos( + numpy.arange(0.0, num_particles) * 2.001 * numpy.pi / num_particles + ) + np_position[:, 1] *= numpy.random.random_sample((num_particles,)) / 3.0 + 0.2 + np_position[:, 2] = 0.0 + np_position[:, 3] = 1.0 + + np_color[:, :] = [1.0, 1.0, 1.0, 1.0] # White particles + + np_velocity[:, 0] = np_position[:, 0] * 2.0 + np_velocity[:, 1] = np_position[:, 1] * 2.0 + np_velocity[:, 2] = 3.0 + np_velocity[:, 3] = numpy.random.random_sample((num_particles,)) + + gl_position = vbo.VBO( + data=np_position, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER + ) gl_position.bind() gl_color = vbo.VBO(data=np_color, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) gl_color.bind() return (np_position, np_velocity, gl_position, gl_color) + def on_timer(t): glutTimerFunc(t, on_timer, t) glutPostRedisplay() + def on_key(*args): - if args[0] == '\033' or args[0] == 'q': + if args[0] == "\033" or args[0] == "q": sys.exit() + def on_click(button, state, x, y): - mouse_old['x'] = x - mouse_old['y'] = y + mouse_old["x"] = x + mouse_old["y"] = y + def on_mouse_move(x, y): - rotate['x'] += (y - mouse_old['y']) * .2 - rotate['y'] += (x - mouse_old['x']) * .2 + rotate["x"] += (y - mouse_old["y"]) * 0.2 + rotate["y"] += (x - mouse_old["x"]) * 0.2 + + mouse_old["x"] = x + mouse_old["y"] = y - mouse_old['x'] = x - mouse_old['y'] = y def on_display(): - """Render the particles""" + """Render the particles""" # Update or particle positions by calling the OpenCL kernel cl.enqueue_acquire_gl_objects(queue, [cl_gl_position, cl_gl_color]) - kernelargs = (cl_gl_position, cl_gl_color, cl_velocity, cl_start_position, cl_start_velocity, 
numpy.float32(time_step)) + kernelargs = ( + cl_gl_position, + cl_gl_color, + cl_velocity, + cl_start_position, + cl_start_velocity, + numpy.float32(time_step), + ) program.particle_fountain(queue, (num_particles,), None, *(kernelargs)) cl.enqueue_release_gl_objects(queue, [cl_gl_position, cl_gl_color]) queue.finish() @@ -102,11 +122,11 @@ def on_display(): glLoadIdentity() # Handle mouse transformations - glTranslatef(initial_translate['x'], initial_translate['y'], initial_translate['z']) - glRotatef(rotate['x'], 1, 0, 0) - glRotatef(rotate['y'], 0, 1, 0) #we switched around the axis so make this rotate_z - glTranslatef(translate['x'], translate['y'], translate['z']) - + glTranslatef(initial_translate["x"], initial_translate["y"], initial_translate["z"]) + glRotatef(rotate["x"], 1, 0, 0) + glRotatef(rotate["y"], 0, 1, 0) # we switched around the axis so make this rotate_z + glTranslatef(translate["x"], translate["y"], translate["z"]) + # Render the particles glEnable(GL_POINT_SMOOTH) glPointSize(2) @@ -131,17 +151,25 @@ def on_display(): glutSwapBuffers() + window = glut_window() (np_position, np_velocity, gl_position, gl_color) = initial_buffers(num_particles) platform = cl.get_platforms()[0] -context = cl.Context(properties=[(cl.context_properties.PLATFORM, platform)] + get_gl_sharing_context_properties()) +context = cl.Context( + properties=[(cl.context_properties.PLATFORM, platform)] + + get_gl_sharing_context_properties() +) queue = cl.CommandQueue(context) cl_velocity = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_velocity) -cl_start_position = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_position) -cl_start_velocity = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_velocity) +cl_start_position = cl.Buffer( + context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_position +) +cl_start_velocity = cl.Buffer( + context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_velocity +) cl_gl_position = cl.GLBuffer(context, mf.READ_WRITE, 
int(gl_position)) cl_gl_color = cl.GLBuffer(context, mf.READ_WRITE, int(gl_color)) diff --git a/examples/median-filter.py b/examples/median-filter.py index 010e2851d7ee9567732ac45b4a46d08d2d8fb212..7f787500ccf82a77b5961413f86e16dbf3cfe8a9 100644 --- a/examples/median-filter.py +++ b/examples/median-filter.py @@ -1,25 +1,14 @@ import pyopencl as cl import numpy as np -from scipy.misc import imread, imsave +from imageio import imread, imsave #Read in image -img = imread('noisyImage.jpg', flatten=True).astype(np.float32) +img = imread('noisyImage.jpg').astype(np.float32) +print(img.shape) +img = np.mean(img, axis=2) +print(img.shape) -# Get platforms, both CPU and GPU -plat = cl.get_platforms() -CPU = plat[0].get_devices() -try: - GPU = plat[1].get_devices() -except IndexError: - GPU = "none" - -#Create context for GPU/CPU -if GPU!= "none": - ctx = cl.Context(GPU) -else: - ctx = cl.Context(CPU) - -# Create queue for each kernel execution +ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags @@ -97,4 +86,4 @@ result = np.empty_like(img) cl.enqueue_copy(queue, result, result_g) # Show the blurred image -imsave('medianFilter-OpenCL.jpg',result) \ No newline at end of file +imsave('medianFilter-OpenCL.jpg', result) diff --git a/examples/narray.py b/examples/narray.py index 78b9bb9205b326b207730d411524fad93fd2c142..924c0d69cd89754574b68939c403c92822c5aa07 100644 --- a/examples/narray.py +++ b/examples/narray.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function # example by Roger Pau Monn'e import pyopencl as cl import numpy as np @@ -31,7 +29,7 @@ except: raise prg.demo(queue, (500,), None, demo_buf) -cl.enqueue_read_buffer(queue, demo_buf, demo_r).wait() +cl.enqueue_copy(queue, demo_r, demo_buf).wait() for res in demo_r: print(res) diff --git a/examples/noisyImage.jpg b/examples/noisyImage.jpg new file mode 100644 index 0000000000000000000000000000000000000000..64db427319e4f2e4ce20d76f44cec3cca51a9697 
Binary files /dev/null and b/examples/noisyImage.jpg differ diff --git a/examples/transpose.py b/examples/transpose.py index 99f68a28e4bc97e889248e01e7d145172587cf3f..6b06a98802eda2e26f4ad3ffb86cd7c761abd87c 100644 --- a/examples/transpose.py +++ b/examples/transpose.py @@ -1,13 +1,9 @@ # Transposition of a matrix # originally for PyCUDA by Hendrik Riedmann -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function import pyopencl as cl import numpy import numpy.linalg as la -from six.moves import range @@ -106,7 +102,7 @@ def transpose_using_cl(ctx, queue, cpu_src, cls): w, h = cpu_src.shape result = numpy.empty((h, w), dtype=cpu_src.dtype) - cl.enqueue_read_buffer(queue, a_t_buf, result).wait() + cl.enqueue_copy(queue, result, a_t_buf).wait() a_buf.release() a_t_buf.release() @@ -148,7 +144,7 @@ def benchmark_transpose(): for dev in ctx.devices: assert dev.local_mem_size > 0 - queue = cl.CommandQueue(ctx, + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) sizes = [int(((2**i) // 32) * 32) @@ -190,27 +186,27 @@ def benchmark_transpose(): a_buf.release() a_t_buf.release() - from matplotlib.pyplot import clf, plot, title, xlabel, ylabel, \ - savefig, legend, grid - for i in range(len(methods)): - clf() - for j in range(i+1): - method = methods[j] - name = method.__name__.replace("Transpose", "") - plot(sizes, numpy.array(mem_bandwidths[method])/1e9, "o-", label=name) + try: + from matplotlib.pyplot import clf, plot, title, xlabel, ylabel, \ + savefig, legend, grid + except ModuleNotFoundError: + pass + else: + for i in range(len(methods)): + clf() + for j in range(i+1): + method = methods[j] + name = method.__name__.replace("Transpose", "") + plot(sizes, numpy.array(mem_bandwidths[method])/1e9, "o-", label=name) - xlabel("Matrix width/height $N$") - ylabel("Memory Bandwidth [GB/s]") - legend(loc="best") - grid() + xlabel("Matrix width/height $N$") + ylabel("Memory Bandwidth 
[GB/s]") + legend(loc="best") + grid() - savefig("transpose-benchmark-%d.pdf" % i) + savefig("transpose-benchmark-%d.pdf" % i) - - - - -#check_transpose() +check_transpose() benchmark_transpose() diff --git a/examples/print-binary.py b/experiments/print-binary.py similarity index 72% rename from examples/print-binary.py rename to experiments/print-binary.py index c7ea523947f522f9165399a52f483842d21d8744..d45c1d0fe67989eda42342d8b0dee4c90bfcc616 100755 --- a/examples/print-binary.py +++ b/experiments/print-binary.py @@ -1,12 +1,11 @@ #! /usr/bin/env python -from __future__ import division import pyopencl as cl import sys ctx = cl.create_some_context() -with open(sys.argv[1], "r") as inf: +with open(sys.argv[1]) as inf: src = inf.read() prg = cl.Program(ctx, src).build() diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 59be64d1d8db45576bed722d2afe3cfd38eb78df..1d304fd2447425785b99b17adaccf14e832bc3a7 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2009-15 Andreas Kloeckner" __license__ = """ @@ -24,11 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six -from six.moves import input, intern +from sys import intern from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa +# must import, otherwise dtype registry will not be fully populated +import pyopencl.cltypes # noqa: F401 + import logging logger = logging.getLogger(__name__) @@ -50,16 +48,16 @@ import numpy as np import sys -_PYPY = '__pypy__' in sys.builtin_module_names -_CPY2 = not _PYPY and sys.version_info < (3,) +_PYPY = "__pypy__" in sys.builtin_module_names -from pyopencl._cl import ( # noqa +from pyopencl._cl import ( # noqa: F401 get_cl_header_version, program_kind, status_code, platform_info, device_type, device_info, + device_topology_type_amd, device_fp_config, device_mem_cache_type, device_local_mem_type, @@ -81,9 +79,12 @@ from pyopencl._cl import ( # noqa mem_object_type, mem_info, image_info, + pipe_info, + pipe_properties, addressing_mode, filter_mode, sampler_info, + sampler_properties, map_flags, program_info, program_build_info, @@ -95,6 +96,7 @@ from pyopencl._cl import ( # noqa kernel_arg_access_qualifier, kernel_arg_type_qualifier, kernel_work_group_info, + kernel_sub_group_info, event_info, command_type, @@ -103,6 +105,11 @@ from pyopencl._cl import ( # noqa mem_migration_flags, device_partition_property, device_affinity_domain, + device_atomic_capabilities, + device_device_enqueue_capabilities, + + version_bits, + khronos_vendor_id, Error, MemoryError, LogicError, RuntimeError, @@ -149,9 +156,17 @@ from pyopencl._cl import ( # noqa Image, Sampler, - DeviceTopologyAmd, + + # This class is available unconditionally, even though CL only + # has it on CL2.0 and newer. 
+ Pipe, ) +try: + from pyopencl._cl import DeviceTopologyAmd # noqa: F401 +except ImportError: + pass + if not _PYPY: # FIXME: Add back to default set when pypy support catches up from pyopencl._cl import ( # noqa @@ -161,7 +176,7 @@ if not _PYPY: if get_cl_header_version() >= (1, 1): from pyopencl._cl import ( # noqa - UserEvent, + UserEvent, ) if get_cl_header_version() >= (1, 2): from pyopencl._cl import ( # noqa @@ -217,6 +232,24 @@ CONSTANT_CLASSES = tuple( if _inspect.isclass(getattr(_cl, name)) and name[0].islower() and name not in ["zip", "map", "range"]) +BITFIELD_CONSTANT_CLASSES = ( + _cl.device_type, + _cl.device_fp_config, + _cl.device_exec_capabilities, + _cl.command_queue_properties, + _cl.mem_flags, + _cl.map_flags, + _cl.kernel_arg_type_qualifier, + _cl.device_affinity_domain, + _cl.mem_migration_flags, + _cl.device_svm_capabilities, + _cl.queue_properties, + _cl.svm_mem_flags, + _cl.device_atomic_capabilities, + _cl.device_device_enqueue_capabilities, + _cl.version_bits, + ) + # {{{ diagnostics @@ -240,27 +273,24 @@ def compiler_output(text): # {{{ find pyopencl shipped source code def _find_pyopencl_include_path(): - from pkg_resources import Requirement, resource_filename, DistributionNotFound + from os.path import join, abspath, dirname, exists try: + # Try to find the include path in the same directory as this file + include_path = join(abspath(dirname(__file__)), "cl") + if not exists(include_path): + raise OSError("unable to find pyopencl include path") + except Exception: # Try to find the resource with pkg_resources (the recommended - # setuptools approach) + # setuptools approach). This is very slow. + from pkg_resources import Requirement, resource_filename include_path = resource_filename( Requirement.parse("pyopencl"), "pyopencl/cl") - except DistributionNotFound: - # If pkg_resources can't find it (e.g. 
if the module is part of a - # frozen application), try to find the include path in the same - # directory as this file - from os.path import join, abspath, dirname, exists - - include_path = join(abspath(dirname(__file__)), "cl") - # If that doesn't exist, just re-raise the exception caught from - # resource_filename. if not exists(include_path): - raise + raise OSError("unable to find pyopencl include path") # Quote the path if it contains a space and is not quoted already. # See https://github.com/inducer/pyopencl/issues/250 for discussion. - if ' ' in include_path and not include_path.startswith('"'): + if " " in include_path and not include_path.startswith('"'): return '"' + include_path + '"' else: return include_path @@ -271,16 +301,11 @@ def _find_pyopencl_include_path(): # {{{ build option munging def _split_options_if_necessary(options): - if isinstance(options, six.string_types): + if isinstance(options, str): import shlex - if six.PY2: - # shlex.split takes bytes (py2 str) on py2 - if isinstance(options, six.text_type): - options = options.encode("utf-8") - else: - # shlex.split takes unicode (py3 str) on py3 - if isinstance(options, six.binary_type): - options = options.decode("utf-8") + # shlex.split takes unicode (py3 str) on py3 + if isinstance(options, bytes): + options = options.decode("utf-8") options = shlex.split(options) @@ -317,7 +342,7 @@ def _find_include_path(options): def _options_to_bytestring(options): def encode_if_necessary(s): - if isinstance(s, six.text_type): + if isinstance(s, str): return s.encode("utf-8") else: return s @@ -361,11 +386,12 @@ def enable_debugging(platform_or_context): % platform.name) -class Program(object): +class Program: def __init__(self, arg1, arg2=None, arg3=None): if arg2 is None: # 1-argument form: program self._prg = arg1 + self._context = self._prg.get_info(program_info.CONTEXT) elif arg3 is None: # 2-argument form: context, source @@ -379,7 +405,7 @@ class Program(object): return import sys - if 
isinstance(source, six.text_type) and sys.version_info < (3,): + if isinstance(source, str) and sys.version_info < (3,): from warnings import warn warn("Received OpenCL source code in Unicode, " "should be ASCII string. Attempting conversion.", @@ -407,7 +433,6 @@ class Program(object): stacklevel=3) self._prg = _cl._Program(self._context, self._source) - del self._context return self._prg def get_info(self, arg): @@ -476,7 +501,7 @@ class Program(object): self._context, options) if cache_dir is None: - cache_dir = getattr(self._context, 'cache_dir', None) + cache_dir = getattr(self._context, "cache_dir", None) import os build_descr = None @@ -608,7 +633,7 @@ def _add_functionality(): # {{{ Platform def platform_repr(self): - return "" % (self.name, self.int_ptr) + return f"" Platform.__repr__ = platform_repr Platform._get_cl_version = generic_get_cl_version @@ -618,16 +643,25 @@ def _add_functionality(): # {{{ Device def device_repr(self): - return "" % ( + return "".format( self.name.strip(), self.platform.name.strip(), self.int_ptr) + def device_hashable_model_and_version_identifier(self): + return ("v1", self.vendor, self.vendor_id, self.name, self.version) + def device_persistent_unique_id(self): - return (self.vendor, self.vendor_id, self.name, self.version) + from warnings import warn + warn("Device.persistent_unique_id is deprecated. 
" + "Use Device.hashable_model_and_version_identifier instead.", + DeprecationWarning, stacklevel=2) + return device_hashable_model_and_version_identifier(self) Device.__repr__ = device_repr # undocumented for now: Device._get_cl_version = generic_get_cl_version + Device.hashable_model_and_version_identifier = property( + device_hashable_model_and_version_identifier) Device.persistent_unique_id = property(device_persistent_unique_id) # }}} @@ -648,7 +682,7 @@ def _add_functionality(): context_old_init(self, devices, properties, dev_type) def context_repr(self): - return "" % (self.int_ptr, + return "".format(self.int_ptr, ", ".join(repr(dev) for dev in self.devices)) def context_get_cl_version(self): @@ -669,7 +703,7 @@ def _add_functionality(): self.finish() def command_queue_get_cl_version(self): - return self.context._get_cl_version() + return self.device._get_cl_version() CommandQueue.__enter__ = command_queue_enter CommandQueue.__exit__ = command_queue_exit @@ -697,7 +731,7 @@ def _add_functionality(): self._build(options=options_bytes, devices=devices) except Error as e: msg = str(e) + "\n\n" + (75*"="+"\n").join( - "Build on %s:\n\n%s" % (dev, log) + f"Build on {dev}:\n\n{log}" for dev, log in self._get_build_logs()) code = e.code routine = e.routine @@ -714,7 +748,7 @@ def _add_functionality(): raise err message = (75*"="+"\n").join( - "Build on %s succeeded, but said:\n\n%s" % (dev, log) + f"Build on {dev} succeeded, but said:\n\n{log}" for dev, log in self._get_build_logs() if log is not None and log.strip()) @@ -785,8 +819,8 @@ def _add_functionality(): self._wg_info_cache = {} return self - def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes): - self._scalar_arg_dtypes = tuple(scalar_arg_dtypes) + def kernel_set_arg_types(self, arg_types): + arg_types = tuple(arg_types) # {{{ arg counting bug handling @@ -813,21 +847,31 @@ def _add_functionality(): # }}} from pyopencl.invoker import generate_enqueue_and_set_args - self._enqueue, self._set_args = 
generate_enqueue_and_set_args( - self.function_name, - len(scalar_arg_dtypes), self.num_args, - self._scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + enqueue, my_set_args = \ + generate_enqueue_and_set_args( + self.function_name, + len(arg_types), self.num_args, + arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug) + + # Make ourselves a kernel-specific class, so that we're able to override + # __call__. Inspired by https://stackoverflow.com/a/38541437 + class KernelWithCustomEnqueue(type(self)): + __call__ = enqueue + set_args = my_set_args + + self.__class__ = KernelWithCustomEnqueue def kernel_get_work_group_info(self, param, device): + cache_key = (param, device.int_ptr) try: - return self._wg_info_cache[param, device] + return self._wg_info_cache[cache_key] except KeyError: pass result = kernel_old_get_work_group_info(self, param, device) - self._wg_info_cache[param, device] = result + self._wg_info_cache[cache_key] = result return result def kernel_set_args(self, *args, **kwargs): @@ -837,6 +881,9 @@ def _add_functionality(): def kernel_call(self, queue, global_size, local_size, *args, **kwargs): # __call__ can't be overridden directly, so we need this # trampoline hack. + + # Note: This is only used for the generic __call__, before + # kernel_set_scalar_arg_dtypes is called. 
return self._enqueue(self, queue, global_size, local_size, *args, **kwargs) def kernel_capture_call(self, filename, queue, global_size, local_size, @@ -856,7 +903,11 @@ def _add_functionality(): Kernel.__init__ = kernel_init Kernel._setup = kernel__setup Kernel.get_work_group_info = kernel_get_work_group_info - Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes + + # FIXME: Possibly deprecate this version + Kernel.set_scalar_arg_dtypes = kernel_set_arg_types + Kernel.set_arg_types = kernel_set_arg_types + Kernel.set_args = kernel_set_args Kernel.__call__ = kernel_call Kernel.capture_call = kernel_capture_call @@ -867,7 +918,7 @@ def _add_functionality(): # {{{ ImageFormat def image_format_repr(self): - return "ImageFormat(%s, %s)" % ( + return "ImageFormat({}, {})".format( channel_order.to_string(self.channel_order, ""), channel_type.to_string(self.channel_data_type, @@ -979,7 +1030,7 @@ def _add_functionality(): class _ImageInfoGetter: def __init__(self, event): from warnings import warn - warn("Image.image.attr is deprecated. " + warn("Image.image.attr is deprecated and will go away in 2021. " "Use Image.attr directly, instead.") self.event = event @@ -1022,7 +1073,7 @@ def _add_functionality(): val.code(), "") routine = val.routine() if routine: - result = "%s failed: %s" % (routine, result) + result = f"{routine} failed: {result}" what = val.what() if what: if result: @@ -1094,10 +1145,8 @@ def _add_functionality(): """ svmallocation_old_init(self, ctx, size, alignment, flags) - read_write = ( - flags & mem_flags.WRITE_ONLY != 0 - or flags & mem_flags.READ_WRITE != 0) - + # mem_flags.READ_ONLY applies to kernels, not the host + read_write = True _interface["data"] = ( int(self._ptr_as_int()), not read_write) @@ -1111,8 +1160,9 @@ def _add_functionality(): # {{{ SVM if get_cl_header_version() >= (2, 0): - SVM.__doc__ = """Tags an object exhibiting the Python buffer interface (such as a - :class:`numpy.ndarray`) as referring to shared virtual memory. 
+ SVM.__doc__ = """Tags an object exhibiting the Python buffer interface + (such as a :class:`numpy.ndarray`) as referring to shared virtual + memory. Depending on the features of the OpenCL implementation, the following types of objects may be passed to/wrapped in this type: @@ -1167,7 +1217,7 @@ def _add_functionality(): This object merely serves as a 'tag' that changes the behavior of functions to which it is passed. It has no special management relationship to the memory it tags. For example, it is permissible - to grab a :mod:`numpy.array` out of :attr:`SVM.mem` of one + to grab a :class:`numpy.ndarray` out of :attr:`SVM.mem` of one :class:`SVM` instance and use the array to construct another. Neither of the tags need to be kept alive. @@ -1263,15 +1313,28 @@ def _add_functionality(): _cl.MemoryObjectHolder: (MemoryObjectHolder.get_info, _cl.mem_info, []), Image: (_cl.Image.get_image_info, _cl.image_info, []), + Pipe: (_cl.Pipe.get_pipe_info, _cl.pipe_info, []), Program: (Program.get_info, _cl.program_info, []), Kernel: (Kernel.get_info, _cl.kernel_info, []), _cl.Sampler: (Sampler.get_info, _cl.sampler_info, []), } def to_string(cls, value, default_format=None): - for name in dir(cls): - if (not name.startswith("_") and getattr(cls, name) == value): - return name + if cls._is_bitfield: + names = [] + for name in dir(cls): + attr = getattr(cls, name) + if not isinstance(attr, int): + continue + if attr == value or attr & value: + names.append(name) + if names: + return " | ".join(names) + else: + for name in dir(cls): + if (not name.startswith("_") + and getattr(cls, name) == value): + return name if default_format is None: raise ValueError("a name for value %d was not found in %s" @@ -1280,6 +1343,7 @@ def _add_functionality(): return default_format % value for cls in CONSTANT_CLASSES: + cls._is_bitfield = cls in BITFIELD_CONSTANT_CLASSES cls.to_string = classmethod(to_string) # {{{ get_info attributes ------------------------------------------------- @@ 
-1304,8 +1368,8 @@ def _add_functionality(): return property(result) for cls, (info_method, info_class, cacheable_attrs) \ - in six.iteritems(cls_to_info_cls): - for info_name, info_value in six.iteritems(info_class.__dict__): + in cls_to_info_cls.items(): + for info_name, info_value in info_class.__dict__.items(): if info_name == "to_string" or info_name.startswith("_"): continue @@ -1374,7 +1438,7 @@ def create_some_context(interactive=None, answers=None): if answers: return str(answers.pop(0)) elif not interactive: - return '' + return "" else: user_input = input(prompt) user_inputs.append(user_input) @@ -1472,7 +1536,7 @@ _csc = create_some_context # {{{ SVMMap -class SVMMap(object): +class SVMMap: """ .. attribute:: event @@ -1529,10 +1593,17 @@ def enqueue_copy(queue, dest, src, **kwargs): :arg wait_for: (optional, default empty) :arg is_blocking: Wait for completion. Defaults to *True*. (Available on any copy involving host memory) - :return: A :class:`NannyEvent` if the transfer involved a host-side buffer, otherwise an :class:`Event`. + .. note:: + + Be aware that the deletion of the :class:`NannyEvent` that is + returned by the function if the transfer involved a host-side buffer + will block until the transfer is complete, so be sure to keep a + reference to this :class:`Event` until the + transfer has completed. + .. note:: Two types of 'buffer' occur in the arguments to this function, @@ -1770,9 +1841,9 @@ def image_from_array(ctx, ary, num_channels=None, mode="r", norm_int=False): dtype = ary.dtype if num_channels is None: - from pyopencl.array import vec try: - dtype, num_channels = vec.type_to_scalar_and_count[dtype] + dtype, num_channels = \ + pyopencl.cltypes.vec_type_to_scalar_and_count[dtype] except KeyError: # It must be a scalar type then. 
num_channels = 1 @@ -1925,11 +1996,7 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): for dim in shape: s *= dim except TypeError: - import sys - if sys.version_info >= (3,): - admissible_types = (int, np.integer) - else: - admissible_types = (np.integer,) + six.integer_types + admissible_types = (int, np.integer) if not isinstance(shape, admissible_types): raise TypeError("shape must either be iterable or " diff --git a/pyopencl/_buffers.py b/pyopencl/_buffers.py deleted file mode 100644 index bbf81a2fe3bb631dd9d28f13b86caa56a4fb84bc..0000000000000000000000000000000000000000 --- a/pyopencl/_buffers.py +++ /dev/null @@ -1,123 +0,0 @@ -#! /usr/bin/env python -# Shamelessly stolen from pyopengl-ctypes on 2015-06-21. -# -# Original file here: -# http://bazaar.launchpad.net/~mcfletch/pyopengl/trunk/view/head:/OpenGL/arrays/_buffers.py - -"""Python 3.x buffer-handling (currently just for bytes/bytearray types) -""" - -import ctypes -import sys - -if sys.version_info[:2] < (2, 6): - raise ImportError('Buffer interface only usable on Python 2.6+') - -PyBUF_SIMPLE = 0 -PyBUF_WRITABLE = PyBUF_WRITEABLE = 0x0001 -PyBUF_ND = 0x0008 -PyBUF_STRIDES = (0x0010 | PyBUF_ND) -PyBUF_CONTIG = (PyBUF_ND | PyBUF_WRITABLE) -PyBUF_CONTIG_RO = (PyBUF_ND) -PyBUF_C_CONTIGUOUS = (0x0020 | PyBUF_STRIDES) -PyBUF_F_CONTIGUOUS = (0x0040 | PyBUF_STRIDES) -PyBUF_ANY_CONTIGUOUS = (0x0080 | PyBUF_STRIDES) -PyBUF_FORMAT = 0x0004 - -# Python 2.6 doesn't define this... 
-c_ssize_t = getattr(ctypes, 'c_ssize_t', ctypes.c_ulong) - -_fields_ = [ - ('buf', ctypes.c_void_p), - ('obj', ctypes.c_void_p), - ('len', c_ssize_t), - ('itemsize', c_ssize_t), - - ('readonly', ctypes.c_int), - ('ndim', ctypes.c_int), - ('format', ctypes.c_char_p), - ('shape', ctypes.POINTER(c_ssize_t)), - ('strides', ctypes.POINTER(c_ssize_t)), - ('suboffsets', ctypes.POINTER(c_ssize_t)), -] - - -if sys.version_info[:2] <= (2, 6) or sys.version_info[:2] >= (3, 3): - # Original structure was eventually restored in 3.3, so just - # 2.7 through 3.2 uses the "enhanced" structure below - _fields_.extend([ - ('internal', ctypes.c_void_p), - ]) - -else: - # Sigh, this structure seems to have changed with Python 3.x... - _fields_.extend([ - ('smalltable', ctypes.c_size_t*2), - ('internal', ctypes.c_void_p), - ]) - - -class Py_buffer(ctypes.Structure): # noqa - @classmethod - def from_object(cls, obj, flags): - """Create a new Py_buffer referencing ram of object""" - if not CheckBuffer(obj): - raise TypeError( - "%s type does not support Buffer Protocol" % (obj.__class__,)) - buf = cls() - - # deallocation of the buf causes glibc abort :( - result = GetBuffer(obj, buf, flags) - - if result != 0: - raise ValueError("Unable to retrieve Buffer from %s" % (obj,)) - if not buf.buf: - raise ValueError("Null pointer result from %s" % (obj,)) - return buf - - _fields_ = _fields_ - - @property - def dims(self): - return self.shape[:self.ndim] - - def __len__(self): - return self.shape[0] - - @property - def dim_strides(self): - if self.strides: - return self.strides[:self.ndim] - return None - - def __enter__(self): - return self - - def __exit__(self, exc_type=None, exc_value=None, traceback=None): - if self.obj: - ReleaseBuffer(self) - - def __del__(self): - if self.obj: - ReleaseBuffer(self) - - -try: - CheckBuffer = ctypes.pythonapi.PyObject_CheckBuffer - CheckBuffer.argtypes = [ctypes.py_object] - CheckBuffer.restype = ctypes.c_int -except AttributeError: - # Python 2.6 
doesn't appear to have CheckBuffer support... - def CheckBuffer(x): # noqa - return True - -IncRef = ctypes.pythonapi.Py_IncRef -IncRef.argtypes = [ctypes.py_object] - -GetBuffer = ctypes.pythonapi.PyObject_GetBuffer -GetBuffer.argtypes = [ctypes.py_object, ctypes.POINTER(Py_buffer), ctypes.c_int] -GetBuffer.restype = ctypes.c_int - -ReleaseBuffer = ctypes.pythonapi.PyBuffer_Release -ReleaseBuffer.argtypes = [ctypes.POINTER(Py_buffer)] -ReleaseBuffer.restype = None diff --git a/pyopencl/_mymako.py b/pyopencl/_mymako.py index 78061f31e6baf7e300e0caa95ce6a175f31e9823..5d5e92f81b2307d6104c2213af8a8bf8da6fd0ad 100644 --- a/pyopencl/_mymako.py +++ b/pyopencl/_mymako.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import try: import mako.template # noqa except ImportError: diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index dfa5d4bbfd0721915e3979c9cd027d62ecab63f1..446eb9c318d8d241d0ef6fa07fb80eeea8fb57cd 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -1,12 +1,10 @@ -"""Scan primitive.""" +"""Algorithms built on scans.""" -from __future__ import division -from __future__ import absolute_import -from six.moves import range -from six.moves import zip -__copyright__ = """Copyright 2011-2012 Andreas Kloeckner \ - Copyright 2017 Hao Gao""" +__copyright__ = """ +Copyright 2011-2012 Andreas Kloeckner +Copyright 2017 Hao Gao +""" __license__ = """ Permission is hereby granted, free of charge, to any person @@ -31,6 +29,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ + import numpy as np import pyopencl as cl import pyopencl.array # noqa @@ -40,35 +39,41 @@ from pytools import memoize, memoize_method, Record from mako.template import Template -# {{{ copy_if +# {{{ "extra args" handling utility -_copy_if_template = ScanTemplate( - arguments="item_t *ary, item_t *out, scan_t *count", - input_expr="(%(predicate)s) ? 
1 : 0", - scan_expr="a+b", neutral="0", - output_statement=""" - if (prev_item != item) out[item-1] = ary[i]; - if (i+1 == N) *count = item; - """, - template_processor="printf") - - -def extract_extra_args_types_values(extra_args): +def _extract_extra_args_types_values(extra_args): from pyopencl.tools import VectorArg, ScalarArg extra_args_types = [] extra_args_values = [] + extra_wait_for = [] for name, val in extra_args: if isinstance(val, cl.array.Array): extra_args_types.append(VectorArg(val.dtype, name, with_offset=False)) extra_args_values.append(val) + extra_wait_for.extend(val.events) elif isinstance(val, np.generic): extra_args_types.append(ScalarArg(val.dtype, name)) extra_args_values.append(val) else: raise RuntimeError("argument '%d' not understood" % name) - return tuple(extra_args_types), extra_args_values + return tuple(extra_args_types), extra_args_values, extra_wait_for + +# }}} + + +# {{{ copy_if + +_copy_if_template = ScanTemplate( + arguments="item_t *ary, item_t *out, scan_t *count", + input_expr="(%(predicate)s) ? 
1 : 0", + scan_expr="a+b", neutral="0", + output_statement=""" + if (prev_item != item) out[item-1] = ary[i]; + if (i+1 == N) *count = item; + """, + template_processor="printf") def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None): @@ -94,7 +99,12 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non else: scan_dtype = np.int32 - extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args) + if wait_for is None: + wait_for = [] + + extra_args_types, extra_args_values, extra_wait_for = \ + _extract_extra_args_types_values(extra_args) + wait_for = wait_for + extra_wait_for knl = _copy_if_template.build(ary.context, type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)), @@ -104,9 +114,8 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) - # **dict is a Py2.5 workaround evt = knl(ary, out, count, *extra_args_values, - **dict(queue=queue, wait_for=wait_for)) + queue=queue, wait_for=wait_for) return out, count, evt @@ -176,7 +185,12 @@ def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N else: scan_dtype = np.uint32 - extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args) + if wait_for is None: + wait_for = [] + + extra_args_types, extra_args_values, extra_wait_for = \ + _extract_extra_args_types_values(extra_args) + wait_for = wait_for + extra_wait_for knl = _partition_template.build( ary.context, @@ -189,9 +203,8 @@ def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) - # **dict is a Py2.5 workaround evt = knl(ary, out_true, out_false, count, *extra_args_values, - **dict(queue=queue, wait_for=wait_for)) + queue=queue, wait_for=wait_for) return out_true, out_false, count, evt @@ -244,7 
+257,12 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="", else: scan_dtype = np.uint32 - extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args) + if wait_for is None: + wait_for = [] + + extra_args_types, extra_args_values, extra_wait_for = \ + _extract_extra_args_types_values(extra_args) + wait_for = wait_for + extra_wait_for knl = _unique_template.build( ary.context, @@ -256,9 +274,8 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="", count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) - # **dict is a Py2.5 workaround evt = knl(ary, out, count, *extra_args_values, - **dict(queue=queue, wait_for=wait_for)) + queue=queue, wait_for=wait_for) return out, count, evt @@ -274,13 +291,13 @@ def to_bin(n): digs.append(str(n % 2)) n >>= 1 - return ''.join(digs[::-1]) + return "".join(digs[::-1]) -def _padded_bin(i, l): +def _padded_bin(i, nbits): s = to_bin(i) - while len(s) < l: - s = '0' + s + while len(s) < nbits: + s = "0" + s return s @@ -291,7 +308,7 @@ def _make_sort_scan_type(device, bits, index_dtype): fields = [] for mnr in range(2**bits): - fields.append(('c%s' % _padded_bin(mnr, bits), index_dtype)) + fields.append(("c%s" % _padded_bin(mnr, bits), index_dtype)) dtype = np.dtype(fields) @@ -402,11 +419,11 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL// from pyopencl.scan import GenericScanKernel -class RadixSort(object): +class RadixSort: """Provides a general `radix sort `_ on the compute device. - .. seealso:: :class:`pyopencl.algorithm.BitonicSort` + .. seealso:: :class:`pyopencl.bitonic_sort.BitonicSort` .. versionadded:: 2013.1 """ @@ -459,7 +476,7 @@ class RadixSort(object): boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0" - return ("((mnr < %s) ? %s : %s)" % ( + return ("((mnr < {}) ? 
{} : {})".format( int(boundary_mnr, 2), get_count_branch(known_bits+"0"), get_count_branch(known_bits+"1"))) @@ -541,7 +558,7 @@ class RadixSort(object): scan_args = args + sorted_args + [base_bit] last_evt = self.scan_kernel(*scan_args, - **dict(queue=queue, wait_for=wait_for)) + queue=queue, wait_for=wait_for) wait_for = [last_evt] # substitute sorted @@ -875,11 +892,7 @@ class ListOfListsBuilder: __global ${index_t} *compressed_indices, __global ${index_t} *num_non_empty_list """ - from sys import version_info - if version_info > (3, 0): - arguments = Template(arguments) - else: - arguments = Template(arguments, disable_unicode=True) + arguments = Template(arguments) from pyopencl.scan import GenericScanKernel return GenericScanKernel( @@ -898,7 +911,6 @@ class ListOfListsBuilder: devices=self.devices) def do_not_vectorize(self): - from pytools import any return (self.complex_kernel and any(dev.type & cl.device_type.CPU for dev in self.context.devices)) @@ -918,7 +930,7 @@ class ListOfListsBuilder: continue name = "plb_loc_%s_count" % name - user_list_args.append(OtherArg("%s *%s" % ( + user_list_args.append(OtherArg("{} *{}".format( index_ctype, name), name)) kernel_name = self.name_prefix+"_count" @@ -989,10 +1001,10 @@ class ListOfListsBuilder: VectorArg(index_dtype, "%s_compressed_indices" % name)) index_name = "plb_%s_index" % name - user_list_args.append(OtherArg("%s *%s" % ( + user_list_args.append(OtherArg("{} *{}".format( index_ctype, index_name), index_name)) - kernel_list_arg_values += "%s, &%s, " % (list_name, index_name) + kernel_list_arg_values += f"{list_name}, &{index_name}, " kernel_name = self.name_prefix+"_write" @@ -1107,6 +1119,9 @@ class ListOfListsBuilder: if wait_for is None: wait_for = [] + else: + # We'll be modifying it below. 
+ wait_for = list(wait_for) count_kernel = self.get_count_kernel(index_dtype) write_kernel = self.get_write_kernel(index_dtype) @@ -1133,6 +1148,7 @@ class ListOfListsBuilder: data_args.append(arg_val.base_data) if arg_descr.with_offset: data_args.append(arg_val.offset) + wait_for.extend(arg_val.events) else: data_args.append(arg_val) @@ -1172,12 +1188,12 @@ class ListOfListsBuilder: gsize = (4*queue.device.max_compute_units,) lsize = (1,) else: - from pyopencl.array import splay - gsize, lsize = splay(queue, n_objects) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + data_args + (n_objects,)), - **dict(wait_for=wait_for)) + wait_for=wait_for) compress_events = {} for name, dtype in self.list_names_and_dtypes: @@ -1283,7 +1299,7 @@ class ListOfListsBuilder: evt = write_kernel(queue, gsize, lsize, *(tuple(write_list_args) + data_args + (n_objects,)), - **dict(wait_for=scan_events)) + wait_for=scan_events) return result, evt @@ -1309,7 +1325,7 @@ def _make_cl_int_literal(value, dtype): return result -class KeyValueSorter(object): +class KeyValueSorter: """Given arrays *values* and *keys* of equal length and a number *nkeys* of keys, returns a tuple `(starts, lists)`, as follows: *values* and *keys* are sorted diff --git a/pyopencl/array.py b/pyopencl/array.py index ad0e469e0c4cdb392ce936a5fdf1b18412c16949..874ae92c47adf72ebcf678705745feebd8700ce6 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -2,7 +2,6 @@ # pylint:disable=unexpected-keyword-arg # for @elwise_kernel_runner -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -29,13 +28,11 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six -from six.moves import range, reduce +from functools import reduce import numpy as np import pyopencl.elementwise as elementwise import pyopencl as cl -from pytools import memoize_method from pyopencl.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, @@ -47,25 +44,64 @@ from pyopencl.characterize import has_double_support from pyopencl import cltypes +_COMMON_DTYPE_CACHE = {} + + def _get_common_dtype(obj1, obj2, queue): - return _get_common_dtype_base(obj1, obj2, - has_double_support(queue.device)) + dsupport = has_double_support(queue.device) + cache_key = None + o1_dtype = obj1.dtype + try: + cache_key = (o1_dtype, obj2.dtype, dsupport) + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + except AttributeError: + # obj2 doesn't have a dtype + try: + tobj2 = type(obj2) + cache_key = (o1_dtype, tobj2, dsupport) + + # Integers are weird, sized, and signed. Don't pretend that 'int' + # is enough information to decide what should happen. + if tobj2 != int: + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + + result = _get_common_dtype_base(obj1, obj2, dsupport) + + # we succeeded in constructing the cache key + if cache_key is not None: + _COMMON_DTYPE_CACHE[cache_key] = result + + return result -# Work around PyPy not currently supporting the object dtype. -# (Yes, it doesn't even support checking!) 
-# (as of May 27, 2014 on PyPy 2.3) -try: - np.dtype(object) +def _get_truedivide_dtype(obj1, obj2, queue): + # the dtype of the division result obj1 / obj2 - def _dtype_is_object(t): - return t == object -except Exception: - def _dtype_is_object(t): - return False + allow_double = has_double_support(queue.device) + x1 = obj1 if np.isscalar(obj1) else np.ones(1, obj1.dtype) + x2 = obj2 if np.isscalar(obj2) else np.ones(1, obj2.dtype) -class VecLookupWarner(object): + result = (x1/x2).dtype + + if not allow_double: + if result == np.float64: + result = np.dtype(np.float32) + elif result == np.complex128: + result = np.dtype(np.complex64) + + return result + + +class InconsistentOpenCLQueueWarning(UserWarning): + pass + + +class VecLookupWarner: def __getattr__(self, name): from warnings import warn warn("pyopencl.array.vec is deprecated. " @@ -82,19 +118,18 @@ class VecLookupWarner(object): vec = VecLookupWarner() -# {{{ helper functionality +# {{{ helper functionality -def splay(queue, n, kernel_specific_max_wg_size=None): - dev = queue.device - max_work_items = _builtin_min(128, dev.max_work_group_size) +def _splay(device, n, kernel_specific_max_wg_size=None): + max_work_items = _builtin_min(128, device.max_work_group_size) if kernel_specific_max_wg_size is not None: - from six.moves.builtins import min + from builtins import min max_work_items = min(max_work_items, kernel_specific_max_wg_size) min_work_items = _builtin_min(32, max_work_items) - max_groups = dev.max_compute_units * 4 * 8 + max_groups = device.max_compute_units * 4 * 8 # 4 to overfill the device # 8 is an Nvidia constant--that's how many # groups fit onto one compute device @@ -118,6 +153,10 @@ def splay(queue, n, kernel_specific_max_wg_size=None): return (group_count*work_items_per_group,), (work_items_per_group,) +# deliberately undocumented for now +ARRAY_KERNEL_EXEC_HOOK = None + + def elwise_kernel_runner(kernel_getter): """Take a kernel getter of the same signature as the kernel and return a 
function that invokes that kernel. @@ -127,38 +166,28 @@ def elwise_kernel_runner(kernel_getter): def kernel_runner(*args, **kwargs): repr_ary = args[0] - queue = kwargs.pop("queue", None) or repr_ary.queue - wait_for = kwargs.pop("wait_for", None) + queue = kwargs.pop("queue", None) + implicit_queue = queue is None + if implicit_queue: + queue = repr_ary.queue - # wait_for must be a copy, because we modify it in-place below - if wait_for is None: - wait_for = [] - else: - wait_for = list(wait_for) + wait_for = kwargs.pop("wait_for", None) knl = kernel_getter(*args, **kwargs) - gs, ls = repr_ary.get_sizes(queue, + gs, ls = repr_ary._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) assert isinstance(repr_ary, Array) + args = args + (repr_ary.size,) - actual_args = [] - for arg in args: - if isinstance(arg, Array): - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - actual_args.append(arg.base_data) - actual_args.append(arg.offset) - wait_for.extend(arg.events) - else: - actual_args.append(arg) - actual_args.append(repr_ary.size) - - return knl(queue, gs, ls, *actual_args, **dict(wait_for=wait_for)) + if ARRAY_KERNEL_EXEC_HOOK is not None: + return ARRAY_KERNEL_EXEC_HOOK( # pylint: disable=not-callable + knl, queue, gs, ls, *args, wait_for=wait_for) + else: + return knl(queue, gs, ls, *args, wait_for=wait_for) try: from functools import update_wrapper @@ -177,15 +206,6 @@ class DefaultAllocator(cl.tools.DeferredAllocator): DeprecationWarning, 2) cl.tools.DeferredAllocator.__init__(self, *args, **kwargs) - -def _make_strides(itemsize, shape, order): - if order in "fF": - return _f_contiguous_strides(itemsize, shape) - elif order in "cC": - return _c_contiguous_strides(itemsize, shape) - else: - raise ValueError("invalid order: %s" % order) - # }}} @@ -206,18 +226,21 @@ class _copy_queue: # noqa pass -class Array(object): +_ARRAY_GET_SIZES_CACHE = {} 
+ + +class Array: """A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly as in :mod:`numpy`. Arithmetic methods in :class:`Array` support the broadcasting of scalars. (e.g. `array+5`) - *cq* must be a :class:`pyopencl.CommandQueue` or a :class:`pyopencl.Context`. + *cq* must be a :class:`~pyopencl.CommandQueue` or a :class:`~pyopencl.Context`. If it is a queue, *cq* specifies the queue in which the array carries out its computations by default. If a default queue (and thereby overloaded operators and many other niceties) are not desired, pass a - :class:`Context`. + :class:`~pyopencl.Context`. *allocator* may be `None` or a callable that, upon being called with an argument of the number of bytes to be allocated, returns an @@ -341,6 +364,7 @@ class Array(object): .. autoattribute :: real .. autoattribute :: imag .. automethod :: conj + .. automethod :: conjugate .. automethod :: __getitem__ .. automethod :: __setitem__ @@ -391,7 +415,7 @@ class Array(object): __array_priority__ = 100 def __init__(self, cq, shape, dtype, order="C", allocator=None, - data=None, offset=0, strides=None, events=None): + data=None, offset=0, strides=None, events=None, _flags=None): # {{{ backward compatibility if isinstance(cq, cl.CommandQueue): @@ -435,24 +459,37 @@ class Array(object): size = 1 for dim in shape: size *= dim + if dim < 0: + raise ValueError("negative dimensions are not allowed") + except TypeError: - import sys - if sys.version_info >= (3,): - admissible_types = (int, np.integer) - else: - admissible_types = (np.integer,) + six.integer_types + admissible_types = (int, np.integer) if not isinstance(shape, admissible_types): raise TypeError("shape must either be iterable or " "castable to an integer") size = shape + if shape < 0: + raise ValueError("negative dimensions are not allowed") shape = (shape,) if isinstance(size, np.integer): size = size.item() if strides is None: - 
strides = _make_strides(dtype.itemsize, shape, order) + if order in "cC": + # inlined from compyte.array.c_contiguous_strides + if shape: + strides = [dtype.itemsize] + for s in shape[:0:-1]: + strides.append(strides[-1]*s) + strides = tuple(strides[::-1]) + else: + strides = () + elif order in "fF": + strides = _f_contiguous_strides(dtype.itemsize, shape) + else: + raise ValueError("invalid order: %s" % order) else: # FIXME: We should possibly perform some plausibility @@ -462,9 +499,8 @@ class Array(object): # }}} - if _dtype_is_object(dtype): - raise TypeError("object arrays on the compute device are not allowed") - + assert dtype != object, \ + "object arrays on the compute device are not allowed" assert isinstance(shape, tuple) assert isinstance(strides, tuple) @@ -483,28 +519,28 @@ class Array(object): self.allocator = allocator if data is None: - if alloc_nbytes <= 0: - if alloc_nbytes == 0: - # Work around CL not allowing zero-sized buffers. - alloc_nbytes = 1 - - else: - raise ValueError("cannot allocate CL buffer with " - "negative size") + if alloc_nbytes < 0: + raise ValueError("cannot allocate CL buffer with " + "negative size") - if allocator is None: - if context is None and queue is not None: - context = queue.context + elif alloc_nbytes == 0: + self.base_data = None - self.base_data = cl.Buffer( - context, cl.mem_flags.READ_WRITE, alloc_nbytes) else: - self.base_data = self.allocator(alloc_nbytes) + if allocator is None: + if context is None and queue is not None: + context = queue.context + + self.base_data = cl.Buffer( + context, cl.mem_flags.READ_WRITE, alloc_nbytes) + else: + self.base_data = self.allocator(alloc_nbytes) else: self.base_data = data self.offset = offset self.context = context + self._flags = _flags @property def ndim(self): @@ -518,9 +554,11 @@ class Array(object): return self.base_data @property - @memoize_method def flags(self): - return _ArrayFlags(self) + f = self._flags + if f is None: + self._flags = f = _ArrayFlags(self) 
+ return f def _new_with_changes(self, data, offset, shape=None, dtype=None, strides=None, queue=_copy_queue, allocator=None): @@ -570,12 +608,17 @@ class Array(object): return self._new_with_changes(self.base_data, self.offset, queue=queue) - #@memoize_method FIXME: reenable - def get_sizes(self, queue, kernel_specific_max_wg_size=None): + def _get_sizes(self, queue, kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") - return splay(queue, self.size, - kernel_specific_max_wg_size=kernel_specific_max_wg_size) + cache_key = (queue.device.int_ptr, self.size, kernel_specific_max_wg_size) + try: + return _ARRAY_GET_SIZES_CACHE[cache_key] + except KeyError: + sizes = _splay(queue.device, self.size, + kernel_specific_max_wg_size=kernel_specific_max_wg_size) + _ARRAY_GET_SIZES_CACHE[cache_key] = sizes + return sizes def set(self, ary, queue=None, async_=None, **kwargs): """Transfer the contents the :class:`numpy.ndarray` object *ary* @@ -688,7 +731,7 @@ class Array(object): def get(self, queue=None, ary=None, async_=None, **kwargs): """Transfer the contents of *self* into *ary* or a newly allocated - :mod:`numpy.ndarray`. If *ary* is given, it must have the same + :class:`numpy.ndarray`. If *ary* is given, it must have the same shape and dtype. .. versionchanged:: 2019.1.2 @@ -736,7 +779,7 @@ class Array(object): def copy(self, queue=_copy_queue): """ - :arg queue: The :class:`CommandQueue` for the returned array. + :arg queue: The :class:`~pyopencl.CommandQueue` for the returned array. .. versionchanged:: 2017.1.2 Updates the queue of the returned array. 
@@ -764,13 +807,29 @@ class Array(object): return result def __str__(self): + if self.queue is None: + return (f"") + return str(self.get()) def __repr__(self): - return repr(self.get()) + if self.queue is None: + return (f"") + + result = repr(self.get()) + if result[:5] == "array": + result = "cl.Array" + result[5:] + else: + from warnings import warn + warn("numpy.ndarray.__repr__ was expected to return a string starting " + f"with 'array'. It didn't: '{result[:10]:r}'") + return result def safely_stringify_for_pudb(self): - return "cl.Array %s %s" % (self.dtype, self.shape) + return f"cl.Array {self.dtype} {self.shape}" def __hash__(self): raise TypeError("pyopencl arrays are not hashable.") @@ -905,19 +964,21 @@ class Array(object): def _new_like_me(self, dtype=None, queue=None): strides = None + flags = None if dtype is None: dtype = self.dtype if dtype == self.dtype: strides = self.strides + flags = self.flags queue = queue or self.queue if queue is not None: return self.__class__(queue, self.shape, dtype, - allocator=self.allocator, strides=strides) + allocator=self.allocator, strides=strides, _flags=flags) else: return self.__class__(self.context, self.shape, dtype, - strides=strides, allocator=self.allocator) + strides=strides, allocator=self.allocator, _flags=flags) @staticmethod @elwise_kernel_runner @@ -992,7 +1053,7 @@ class Array(object): result.add_event( self._axpbyz(result, self.dtype.type(1), self, - other.dtype.type(-1), other)) + result.dtype.type(-1), other)) return result else: @@ -1015,7 +1076,7 @@ class Array(object): # other must be a scalar result = self._new_like_me(common_dtype) result.add_event( - self._axpbz(result, self.dtype.type(-1), self, + self._axpbz(result, result.dtype.type(-1), self, common_dtype.type(other))) return result @@ -1083,20 +1144,20 @@ class Array(object): def __div__(self, other): """Divides an array by an array or a scalar, i.e. ``self / other``. 
""" + common_dtype = _get_truedivide_dtype(self, other, self.queue) if isinstance(other, Array): - result = self._new_like_me( - _get_common_dtype(self, other, self.queue)) + result = self._new_like_me(common_dtype) result.add_event(self._div(result, self, other)) else: if other == 1: return self.copy() else: # create a new array for the result - common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) result.add_event( self._axpbz(result, - common_dtype.type(1/other), self, self.dtype.type(0))) + np.true_divide(common_dtype.type(1), other), + self, self.dtype.type(0))) return result @@ -1105,14 +1166,13 @@ class Array(object): def __rdiv__(self, other): """Divides an array by a scalar or an array, i.e. ``other / self``. """ + common_dtype = _get_truedivide_dtype(self, other, self.queue) if isinstance(other, Array): - result = self._new_like_me( - _get_common_dtype(self, other, self.queue)) + result = self._new_like_me(common_dtype) result.add_event(other._div(result, self)) else: # create a new array for the result - common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) result.add_event( self._rdiv_scalar(result, self, common_dtype.type(other))) @@ -1121,6 +1181,26 @@ class Array(object): __rtruediv__ = __rdiv__ + def __itruediv__(self, other): + # raise an error if the result cannot be cast to self + common_dtype = _get_truedivide_dtype(self, other, self.queue) + if not np.can_cast(common_dtype, self.dtype.type): + raise TypeError("Cannot cast {!r} to {!r}" + .format(self.dtype, common_dtype)) + + if isinstance(other, Array): + self.add_event( + self._div(self, self, other)) + else: + if other == 1: + return self + else: + self.add_event( + self._axpbz(self, common_dtype.type(np.true_divide(1, other)), + self, self.dtype.type(0))) + + return self + def __and__(self, other): common_dtype = _get_common_dtype(self, other, self.queue) @@ -1223,10 +1303,18 @@ class Array(object): 
def _zero_fill(self, queue=None, wait_for=None): queue = queue or self.queue - if ( - queue._get_cl_version() >= (1, 2) - and cl.get_cl_header_version() >= (1, 2)): + if not self.size: + return + + cl_version_gtr_1_2 = ( + queue._get_cl_version() >= (1, 2) + and cl.get_cl_header_version() >= (1, 2) + ) + on_nvidia = queue.device.vendor.startswith("NVIDIA") + # circumvent bug with large buffers on NVIDIA + # https://github.com/inducer/pyopencl/issues/395 + if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31): self.add_event( cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0), self.offset, self.nbytes, wait_for=wait_for)) @@ -1467,6 +1555,8 @@ class Array(object): else: return self + conjugate = conj + # }}} # {{{ event management @@ -1538,6 +1628,15 @@ class Array(object): if size != self.size: raise ValueError("total size of new array must be unchanged") + if self.size == 0: + return self._new_with_changes( + data=None, offset=0, shape=shape, + strides=( + _f_contiguous_strides(self.dtype.itemsize, shape) + if order == "F" else + _c_contiguous_strides(self.dtype.itemsize, shape) + )) + # {{{ determine reshaped strides # copied and translated from @@ -1941,7 +2040,7 @@ def as_strided(ary, shape=None, strides=None): data=ary.data, strides=strides) -class _same_as_transfer(object): # noqa +class _same_as_transfer: # noqa pass @@ -1950,7 +2049,7 @@ def to_device(queue, ary, allocator=None, async_=None, """Return a :class:`Array` that is an exact copy of the :class:`numpy.ndarray` instance *ary*. - :arg array_queue: The :class:`CommandQueue` which will + :arg array_queue: The :class:`~pyopencl.CommandQueue` which will be stored in the resulting array. Useful to make sure there is no implicit queue associated with the array by passing *None*. 
@@ -1985,7 +2084,7 @@ def to_device(queue, ary, allocator=None, async_=None, # }}} - if _dtype_is_object(ary.dtype): + if ary.dtype == object: raise RuntimeError("to_device does not work on object arrays.") if array_queue is _same_as_transfer: @@ -2099,7 +2198,7 @@ def arange(queue, *args, **kwargs): raise ValueError("too many arguments") admissible_names = ["start", "stop", "step", "dtype", "allocator"] - for k, v in six.iteritems(kwargs): + for k, v in kwargs.items(): if k in admissible_names: if getattr(inf, k) is None: setattr(inf, k, v) @@ -2201,7 +2300,7 @@ def multi_take(arrays, indices, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = indices.get_sizes(queue, + gs, ls = indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -2279,24 +2378,18 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = src_indices.get_sizes(queue, + gs, ls = src_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (dest_indices.events + src_indices.events + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, - *([o.data for o in out[chunk_slice]] - + [dest_indices.base_data, - dest_indices.offset, - src_indices.base_data, - src_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) + *([o for o in out[chunk_slice]] + + [dest_indices, src_indices] + + [i for i in arrays[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: @@ -2362,28 +2455,21 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if start_i + chunk_size > 
vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = dest_indices.get_sizes(queue, + gs, ls = dest_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (wait_for + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( - list(flatten( - (o.base_data, o.offset) - for o in out[chunk_slice])) - + [dest_indices.base_data, dest_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) - + [use_fill_cla.base_data, use_fill_cla.offset] - + [array_lengths_cla.base_data, array_lengths_cla.offset] - + [dest_indices.size]), - **dict(wait_for=wait_for_this)) + [o for o in out[chunk_slice]] + + [dest_indices] + + [i for i in arrays[chunk_slice]] + + [use_fill_cla, array_lengths_cla, dest_indices.size]), + wait_for=wait_for_this) for o in out[chunk_slice]: o.add_event(evt) diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py index 4c13cbaa871bd88556c87618f4a42b6d619abd68..29fff563a05ebd4393d28068ecedef68f917945a 100644 --- a/pyopencl/bitonic_sort.py +++ b/pyopencl/bitonic_sort.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import, print_function - __copyright__ = """ Copyright (c) 2011, Eric Bainville Copyright (c) 2015, Ilya Efimoff @@ -50,7 +48,7 @@ def _is_power_of_2(n): return n == 0 or 2**bitlog2(n) == n -class BitonicSort(object): +class BitonicSort: """Sort an array (or one axis of one) using a sorting network. 
Will only work if the axis of the array to be sorted has a length @@ -64,14 +62,14 @@ class BitonicSort(object): """ kernels_srcs = { - 'B2': _tmpl.ParallelBitonic_B2, - 'B4': _tmpl.ParallelBitonic_B4, - 'B8': _tmpl.ParallelBitonic_B8, - 'B16': _tmpl.ParallelBitonic_B16, - 'C4': _tmpl.ParallelBitonic_C4, - 'BL': _tmpl.ParallelBitonic_Local, - 'BLO': _tmpl.ParallelBitonic_Local_Optim, - 'PML': _tmpl.ParallelMerge_Local + "B2": _tmpl.ParallelBitonic_B2, + "B4": _tmpl.ParallelBitonic_B4, + "B8": _tmpl.ParallelBitonic_B8, + "B16": _tmpl.ParallelBitonic_B16, + "C4": _tmpl.ParallelBitonic_C4, + "BL": _tmpl.ParallelBitonic_Local, + "BLO": _tmpl.ParallelBitonic_Local_Optim, + "PML": _tmpl.ParallelMerge_Local } def __init__(self, context): @@ -162,7 +160,7 @@ class BitonicSort(object): key_ctype = dtype_to_ctype(key_dtype) if idx_dtype is None: - idx_ctype = 'uint' # Dummy + idx_ctype = "uint" # Dummy else: idx_ctype = dtype_to_ctype(idx_dtype) @@ -206,7 +204,7 @@ class BitonicSort(object): length = wg >> 1 prg = self.get_program( - 'BLO', argsort, (1, 1, key_ctype, idx_ctype, ds, ns)) + "BLO", argsort, (1, 1, key_ctype, idx_ctype, ds, ns)) run_queue.append((prg.run, size, (wg,), True)) while length < ds: @@ -215,16 +213,16 @@ class BitonicSort(object): ninc = 0 direction = length << 1 if allowb16 and inc >= 8 and ninc == 0: - letter = 'B16' + letter = "B16" ninc = 4 elif allowb8 and inc >= 4 and ninc == 0: - letter = 'B8' + letter = "B8" ninc = 3 elif allowb4 and inc >= 2 and ninc == 0: - letter = 'B4' + letter = "B4" ninc = 2 elif inc >= 0: - letter = 'B2' + letter = "B2" ninc = 1 nthreads = size >> ninc diff --git a/pyopencl/cache.py b/pyopencl/cache.py index b6d21f280d6295d60cc0d88f229e2dde9ccb040c..ecab64a131a6222f519f979f6f1b319098c01663 100644 --- a/pyopencl/cache.py +++ b/pyopencl/cache.py @@ -1,6 +1,5 @@ """PyOpenCL compiler cache.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" @@ -24,8 +23,6 @@ OUT OF OR 
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import zip import pyopencl._cl as _cl import re import sys @@ -36,13 +33,8 @@ import logging logger = logging.getLogger(__name__) -try: - import hashlib - new_hash = hashlib.md5 -except ImportError: - # for Python << 2.5 - import md5 - new_hash = md5.new +import hashlib +new_hash = hashlib.md5 def _erase_dir(dir): @@ -54,7 +46,7 @@ def _erase_dir(dir): def update_checksum(checksum, obj): - if isinstance(obj, six.text_type): + if isinstance(obj, str): checksum.update(obj.encode("utf8")) else: checksum.update(obj) @@ -62,7 +54,7 @@ def update_checksum(checksum, obj): # {{{ cleanup -class CleanupBase(object): +class CleanupBase: pass @@ -173,8 +165,8 @@ def get_dependencies(src, include_path): if included_file_name not in result: try: - src_file = open(included_file_name, "rt") - except IOError: + src_file = open(included_file_name) + except OSError: continue try: @@ -203,7 +195,7 @@ def get_dependencies(src, include_path): _inner(src) - result = list((name,) + vals for name, vals in six.iteritems(result)) + result = list((name,) + vals for name, vals in result.items()) result.sort() return result @@ -276,11 +268,11 @@ def retrieve_from_cache(cache_dir, cache_key): # {{{ load info file try: - from six.moves.cPickle import load + from pickle import load try: info_file = open(info_path, "rb") - except IOError: + except OSError: raise _InvalidInfoFile() try: @@ -335,8 +327,8 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, if cache_dir is None: import appdirs cache_dir = join(appdirs.user_cache_dir("pyopencl", "pyopencl"), - "pyopencl-compiler-cache-v2-py%s" % ( - ".".join(str(i) for i in sys.version_info),)) + "pyopencl-compiler-cache-v2-py{}".format( + ".".join(str(i) for i in sys.version_info))) # {{{ ensure cache directory exists @@ -374,7 +366,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, logs.append(log) 
message = (75*"="+"\n").join( - "Build on %s succeeded, but said:\n\n%s" % (dev, log) + f"Build on {dev} succeeded, but said:\n\n{log}" for dev, log in zip(devices, logs) if log is not None and log.strip()) @@ -453,7 +445,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, outf.write(binary) outf.close() - from six.moves.cPickle import dump + from pickle import dump info_file = open(info_path, "wb") dump(_SourceInfo( dependencies=get_dependencies(src, include_path), @@ -474,28 +466,36 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, def create_built_program_from_source_cached(ctx, src, options_bytes, devices=None, cache_dir=None, include_path=None): try: + was_cached = False + already_built = False if cache_dir is not False: prg, already_built, was_cached = \ _create_built_program_from_source_cached( ctx, src, options_bytes, devices, cache_dir, include_path=include_path) + if was_cached and not already_built: + prg.build(options_bytes, devices) + already_built = True else: prg = _cl._Program(ctx, src) - was_cached = False - already_built = False except Exception as e: from pyopencl import Error - if (isinstance(e, Error) - and e.code == _cl.status_code.BUILD_PROGRAM_FAILURE): # noqa pylint:disable=no-member - # no need to try again + build_program_failure = (isinstance(e, Error) + and e.code == _cl.status_code.BUILD_PROGRAM_FAILURE) # noqa pylint:disable=no-member + + # Mac error on intel CPU driver: can't build from cached version. + # If we get a build_program_failure from the cached version then + # build from source instead, otherwise report the failure. 
+ if build_program_failure and not was_cached: raise - from warnings import warn - from traceback import format_exc - warn("PyOpenCL compiler caching failed with an exception:\n" - "[begin exception]\n%s[end exception]" - % format_exc()) + if not build_program_failure: + from warnings import warn + from traceback import format_exc + warn("PyOpenCL compiler caching failed with an exception:\n" + "[begin exception]\n%s[end exception]" + % format_exc()) prg = _cl._Program(ctx, src) was_cached = False diff --git a/pyopencl/capture_call.py b/pyopencl/capture_call.py index 09d483a57ad6d87387e519d8a72fea79142f3244..867365319f39f4e4a629aa6446b8a607c2d16b93 100644 --- a/pyopencl/capture_call.py +++ b/pyopencl/capture_call.py @@ -1,7 +1,3 @@ -from __future__ import with_statement, division -from __future__ import absolute_import -from six.moves import zip - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -51,8 +47,8 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs cg("") cg('CODE = r"""//CL//') - for l in source.split("\n"): - cg(l) + for line in source.split("\n"): + cg(line) cg('"""') # {{{ invocation @@ -83,13 +79,13 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs elif isinstance(arg, (int, float)): kernel_args.append(repr(arg)) elif isinstance(arg, np.integer): - kernel_args.append("np.%s(%s)" % ( + kernel_args.append("np.{}({})".format( arg.dtype.type.__name__, repr(int(arg)))) elif isinstance(arg, np.floating): - kernel_args.append("np.%s(%s)" % ( + kernel_args.append("np.{}({})".format( arg.dtype.type.__name__, repr(float(arg)))) elif isinstance(arg, np.complexfloating): - kernel_args.append("np.%s(%s)" % ( + kernel_args.append("np.{}({})".format( arg.dtype.type.__name__, repr(complex(arg)))) else: try: @@ -133,7 +129,7 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs % ", ".join( strify_dtype(dt) for dt in kernel._scalar_arg_dtypes)) - 
cg("knl(queue, %s, %s," % (repr(g_size), repr(l_size))) + cg("knl(queue, {}, {},".format(repr(g_size), repr(l_size))) cg(" %s)" % ", ".join(kernel_args)) cg("") cg("queue.finish()") @@ -163,7 +159,7 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs # {{{ file trailer cg("") - cg("if __name__ == \"__main__\":") + cg('if __name__ == "__main__":') with Indentation(cg): cg("main()") cg("") diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py index eae523be2f045bcadafb28166001cc6beeaf445f..19e4c95b51d9e6b5b9cc849930c4bc34de6ecce3 100644 --- a/pyopencl/characterize/__init__.py +++ b/pyopencl/characterize/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -24,8 +22,6 @@ THE SOFTWARE. import pyopencl as cl from pytools import memoize -import six -from six.moves import range, zip class CLCharacterizationWarning(UserWarning): @@ -237,13 +233,13 @@ def why_not_local_access_conflict_free(dev, itemsize, bank = (addr // gran) % bank_count bank_accesses.setdefault(bank, []).append( - "w.item %s -> %s" % (work_item_id, idx[::-1])) + "w.item {} -> {}".format(work_item_id, idx[::-1])) conflict_multiplicity = max( - len(acc) for acc in six.itervalues(bank_accesses)) + len(acc) for acc in bank_accesses.values()) if conflict_multiplicity > 1: - for bank, acc in six.iteritems(bank_accesses): + for bank, acc in bank_accesses.items(): if len(acc) == conflict_multiplicity: conflicts.append( (conflict_multiplicity, @@ -284,13 +280,15 @@ def get_simd_group_size(dev, type_size): except Exception: pass - lc_vendor = dev.platform.vendor.lower() - if "nvidia" in lc_vendor: + lc_plat_vendor = dev.platform.vendor.lower() + lc_dev_vendor = dev.vendor.lower() + if "nvidia" in lc_plat_vendor or "nvidia" in lc_dev_vendor: return 32 - if ("advanced micro" in lc_vendor or "ati" in lc_vendor): + if ("advanced micro" in lc_plat_vendor or 
"ati" in lc_plat_vendor + or "advanced micro" in lc_dev_vendor or "ati" in lc_dev_vendor): if dev.type & cl.device_type.GPU: - # Tomasz Rybak says, in response to reduction mishbehaving on the AMD + # Tomasz Rybak says, in response to reduction misbehaving on the AMD # 'Loveland' APU: # # Like in CUDA reduction bug (related to Fermi) it again seems diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py index f0c769077fe4a2d2959b5f39e2f46588c0eca3cc..f629240438ddd92404a1a8f29fa100761347c95b 100644 --- a/pyopencl/characterize/performance.py +++ b/pyopencl/characterize/performance.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range import pyopencl as cl import numpy as np @@ -100,7 +97,7 @@ def _get_time(queue, f, timer_factory=None, desired_duration=0.1, # {{{ transfer measurements -class HostDeviceTransferBase(object): +class HostDeviceTransferBase: def __init__(self, queue, block_size): self.queue = queue self.host_buf = np.empty(block_size, dtype=np.uint8) @@ -117,7 +114,7 @@ class DeviceToHostTransfer(HostDeviceTransferBase): return cl.enqueue_copy(self. 
queue, self.host_buf, self.dev_buf) -class DeviceToDeviceTransfer(object): +class DeviceToDeviceTransfer: def __init__(self, queue, block_size): self.queue = queue mf = cl.mem_flags diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py index 2ae8bfbfa22fd1842134c8db96be03a2e7fb9a44..58c20ce5f48fa25379dd5d84bce95709537c6d61 100644 --- a/pyopencl/clmath.py +++ b/pyopencl/clmath.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - # pylint:disable=unexpected-keyword-arg # for @elwise_kernel_runner __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -35,7 +33,7 @@ def _make_unary_array_func(name): def knl_runner(result, arg): if arg.dtype.kind == "c": from pyopencl.elementwise import complex_dtype_to_name - fname = "%s_%s" % (complex_dtype_to_name(arg.dtype), name) + fname = "{}_{}".format(complex_dtype_to_name(arg.dtype), name) else: fname = name diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index 96acce1f40c15cd5d87cc71a5761e328d950146c..dd6c1276cffe2effb53255e1c2d8bc02ec24a3ac 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2009-16 Andreas Kloeckner" __license__ = """ @@ -26,7 +23,7 @@ THE SOFTWARE. # {{{ documentation -__doc__ = u""" +__doc__ = """ PyOpenCL now includes and uses some of the `Random123 random number generators `_ by D.E. Shaw Research. In addition to being usable through the convenience functions above, @@ -72,7 +69,7 @@ import numpy as np # {{{ RanluxGenerator (deprecated) -class RanluxGenerator(object): +class RanluxGenerator: """ .. warning:: @@ -117,6 +114,10 @@ class RanluxGenerator(object): Added default value for `num_work_items`. 
""" + from warnings import warn + warn("Ranlux random number generation is deprecated and will go away " + "in 2022.", DeprecationWarning, stacklevel=2) + if luxury is None: luxury = 4 @@ -333,10 +334,13 @@ class RanluxGenerator(object): queue = ary.queue knl, size_multiplier = self.get_gen_kernel(ary.dtype, "uniform") - return knl(queue, + evt = knl(queue, (self.num_work_items,), None, self.state.data, ary.data, ary.size*size_multiplier, - b-a, a) + b-a, a, wait_for=ary.events) + ary.add_event(evt) + self.state.add_event(evt) + return ary def uniform(self, *args, **kwargs): """Make a new empty array, apply :meth:`fill_uniform` to it. @@ -345,9 +349,7 @@ class RanluxGenerator(object): b = kwargs.pop("b", 1) result = cl_array.empty(*args, **kwargs) - - result.add_event( - self.fill_uniform(result, queue=result.queue, a=a, b=b)) + self.fill_uniform(result, queue=result.queue, a=a, b=b) return result def fill_normal(self, ary, mu=0, sigma=1, queue=None): @@ -363,9 +365,13 @@ class RanluxGenerator(object): queue = ary.queue knl, size_multiplier = self.get_gen_kernel(ary.dtype, "normal") - return knl(queue, + evt = knl(queue, (self.num_work_items,), self.wg_size, - self.state.data, ary.data, ary.size*size_multiplier, sigma, mu) + self.state.data, ary.data, ary.size*size_multiplier, sigma, mu, + wait_for=ary.events) + ary.add_event(evt) + self.state.add_event(evt) + return evt def normal(self, *args, **kwargs): """Make a new empty array, apply :meth:`fill_normal` to it. 
@@ -374,29 +380,27 @@ class RanluxGenerator(object): sigma = kwargs.pop("sigma", 1) result = cl_array.empty(*args, **kwargs) - - result.add_event( - self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma)) + self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma) return result @memoize_method def get_sync_kernel(self): src = """//CL// - %(defines)s + {defines} #include kernel void sync( global ranluxcl_state_t *ranluxcltab) - { + {{ ranluxcl_state_t ranluxclstate; ranluxcl_download_seed(&ranluxclstate, ranluxcltab); ranluxcl_synchronize(&ranluxclstate); ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); - } - """ % { - "defines": self.generate_settings_defines(), - } + }} + """.format( + defines=self.generate_settings_defines(), + ) prg = cl.Program(self.context, src).build() return prg.sync @@ -414,7 +418,7 @@ class RanluxGenerator(object): # {{{ Random123 generators -class Random123GeneratorBase(object): +class Random123GeneratorBase: """ .. versionadded:: 2016.2 @@ -535,9 +539,9 @@ class Random123GeneratorBase(object): "unsupported RNG distribution/data type combination '%s/%s'" % rng_key) - kernel_name = "rng_gen_%s_%s" % (self.generator_name, distribution) + kernel_name = f"rng_gen_{self.generator_name}_{distribution}" src = """//CL// - #include <%(header_name)s> + #include <{header_name}> #ifndef M_PI #ifdef M_PI_F @@ -547,29 +551,29 @@ class Random123GeneratorBase(object): #endif #endif - typedef %(output_t)s output_t; - typedef %(output_t)s4 output_vec_t; - typedef %(gen_name)s_ctr_t ctr_t; - typedef %(gen_name)s_key_t key_t; + typedef {output_t} output_t; + typedef {output_t}4 output_vec_t; + typedef {gen_name}_ctr_t ctr_t; + typedef {gen_name}_key_t key_t; uint4 gen_bits(key_t *key, ctr_t *ctr) - { - union { + {{ + union {{ ctr_t ctr_el; uint4 vec_el; - } u; + }} u; - u.ctr_el = %(gen_name)s(*ctr, *key); + u.ctr_el = {gen_name}(*ctr, *key); if (++ctr->v[0] == 0) if (++ctr->v[1] == 0) ++ctr->v[2]; return u.vec_el; - } + }} - #if 
%(include_box_muller)s + #if {include_box_muller} output_vec_t box_muller(output_vec_t x) - { + {{ #define BOX_MULLER(I, COMPA, COMPB) \ output_t r##I = sqrt(-2*log(x.COMPA)); \ output_t c##I; \ @@ -578,14 +582,14 @@ class Random123GeneratorBase(object): BOX_MULLER(0, x, y); BOX_MULLER(1, z, w); return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1); - } + }} #endif - #define GET_RANDOM_NUM(gen) %(rng_expr)s + #define GET_RANDOM_NUM(gen) {rng_expr} - kernel void %(kernel_name)s( + kernel void {kernel_name}( int k1, - #if %(key_length)s > 2 + #if {key_length} > 2 int k2, int k3, #endif int c0, int c1, int c2, int c3, @@ -593,23 +597,23 @@ class Random123GeneratorBase(object): long out_size, output_t scale, output_t shift) - { - #if %(key_length)s == 2 - key_t k = {{get_global_id(0), k1}}; + {{ + #if {key_length} == 2 + key_t k = {{{{get_global_id(0), k1}}}}; #else - key_t k = {{get_global_id(0), k1, k2, k3}}; + key_t k = {{{{get_global_id(0), k1, k2, k3}}}}; #endif - ctr_t c = {{c0, c1, c2, c3}}; + ctr_t c = {{{{c0, c1, c2, c3}}}}; // output bulk unsigned long idx = get_global_id(0)*4; while (idx + 4 < out_size) - { + {{ output_vec_t ran = GET_RANDOM_NUM(gen_bits(&k, &c)); vstore4(ran, 0, &output[idx]); idx += 4*get_global_size(0); - } + }} // output tail output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c)); @@ -621,16 +625,16 @@ class Random123GeneratorBase(object): output[idx+2] = tail_ran.z; if (idx+3 < out_size) output[idx+3] = tail_ran.w; - } - """ % { - "kernel_name": kernel_name, - "gen_name": self.generator_name, - "header_name": self.header_name, - "output_t": c_type, - "key_length": self.key_length, - "include_box_muller": int(distribution == "normal"), - "rng_expr": rng_expr - } + }} + """.format( + kernel_name=kernel_name, + gen_name=self.generator_name, + header_name=self.header_name, + output_t=c_type, + key_length=self.key_length, + include_box_muller=int(distribution == "normal"), + rng_expr=rng_expr + ) prg = cl.Program(self.context, src).build() 
knl = getattr(prg, kernel_name) @@ -658,10 +662,11 @@ class Random123GeneratorBase(object): scale, shift] n = ary.size - from pyopencl.array import splay - gsize, lsize = splay(queue, ary.size) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, ary.size) evt = knl(queue, gsize, lsize, *args) + ary.add_event(evt) self.counter[0] += n * counter_multiplier c1_incr, self.counter[0] = divmod(self.counter[0], self.counter_max) @@ -683,9 +688,7 @@ class Random123GeneratorBase(object): b = kwargs.pop("b", 1) result = cl_array.empty(*args, **kwargs) - - result.add_event( - self.fill_uniform(result, queue=result.queue, a=a, b=b)) + self.fill_uniform(result, queue=result.queue, a=a, b=b) return result def fill_normal(self, ary, mu=0, sigma=1, queue=None): @@ -702,9 +705,7 @@ class Random123GeneratorBase(object): sigma = kwargs.pop("sigma", 1) result = cl_array.empty(*args, **kwargs) - - result.add_event( - self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma)) + self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma) return result @@ -763,8 +764,7 @@ def rand(queue, shape, dtype, luxury=None, a=0, b=1): from pyopencl.array import Array gen = _get_generator(queue.context) result = Array(queue, shape, dtype) - result.add_event( - gen.fill_uniform(result, a=a, b=b)) + gen.fill_uniform(result, a=a, b=b) return result diff --git a/pyopencl/cltypes.py b/pyopencl/cltypes.py index d1ba79f3f8e3905bdee8f119dca3e57a8dda6509..fed1834ca087fed17791171536e70e0446c580c6 100644 --- a/pyopencl/cltypes.py +++ b/pyopencl/cltypes.py @@ -1,5 +1,3 @@ -# encoding: utf8 - __copyright__ = "Copyright (C) 2016 Jonathan Mackenzie" __license__ = """ @@ -24,7 +22,7 @@ import numpy as np from pyopencl.tools import get_or_register_dtype import warnings -if __file__.endswith('array.py'): +if __file__.endswith("array.py"): warnings.warn("pyopencl.array.vec is deprecated. 
Please use pyopencl.cltypes") """ @@ -48,8 +46,8 @@ double = np.float64 def _create_vector_types(): _mapping = [(k, globals()[k]) for k in - ['char', 'uchar', 'short', 'ushort', 'int', - 'uint', 'long', 'ulong', 'float', 'double']] + ["char", "uchar", "short", "ushort", "int", + "uint", "long", "ulong", "float", "double"]] def set_global(key, val): globals()[key] = val diff --git a/pyopencl/compyte b/pyopencl/compyte index 49e670e0ab7bbc822032196b3478522c04168d6f..7533db88124045924a47d7392eaf9a078670fc4d 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit 49e670e0ab7bbc822032196b3478522c04168d6f +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index cbd8d74666d3b7eca797672ca4f80224dd3f150c..dae42b7e70883c18f88f39461f925ae124926d82 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -1,9 +1,5 @@ """Elementwise functionality.""" -from __future__ import division -from __future__ import absolute_import -from six.moves import range -from six.moves import zip __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -83,29 +79,29 @@ def get_elwise_program(context, arguments, operation, stacklevel=3) source = ("""//CL// - %(preamble)s + {preamble} #define PYOPENCL_ELWISE_CONTINUE continue - __kernel void %(name)s(%(arguments)s) - { + __kernel void {name}({arguments}) + {{ int lid = get_local_id(0); int gsize = get_global_size(0); int work_group_start = get_local_size(0)*get_group_id(0); long i; - %(loop_prep)s; - %(body)s - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - "body": body % dict(operation=operation), - }) + {loop_prep}; + {body} + {after_loop}; + }} + """.format( + arguments=", ".join(arg.declarator() for arg in arguments), + name=name, + preamble=preamble, + loop_prep=loop_prep, + 
after_loop=after_loop, + body=body % dict(operation=operation), + )) from pyopencl import Program return Program(context, source).build(options) @@ -136,7 +132,7 @@ def get_elwise_kernel_and_types(context, arguments, operation, #define PYOPENCL_DEFINE_CDOUBLE """) have_double_pragma = True - if arg.dtype.kind == 'c': + if arg.dtype.kind == "c": if not have_complex_include: includes.append("#include \n") have_complex_include = True @@ -160,10 +156,10 @@ def get_elwise_kernel_and_types(context, arguments, operation, name=name, options=options, preamble=preamble, use_range=use_range, loop_prep=loop_prep, **kwargs) - from pyopencl.tools import get_arg_list_scalar_arg_dtypes + from pyopencl.tools import get_arg_list_arg_types kernel = getattr(prg, name) - kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) + kernel.set_scalar_arg_dtypes(get_arg_list_arg_types(parsed_args)) return kernel, parsed_args @@ -250,28 +246,29 @@ class ElementwiseKernel: use_range = range_ is not None or slice_ is not None kernel, arg_descrs = self.get_kernel(use_range) + queue = kwargs.pop("queue", None) + wait_for = kwargs.pop("wait_for", None) + + if wait_for is None: + wait_for = [] + else: + # We'll be modifying it below. 
+ wait_for = list(wait_for) + # {{{ assemble arg array invocation_args = [] for arg, arg_descr in zip(args, arg_descrs): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("ElementwiseKernel cannot " - "deal with non-contiguous arrays") - if repr_vec is None: repr_vec = arg - invocation_args.append(arg.base_data) - if arg_descr.with_offset: - invocation_args.append(arg.offset) + invocation_args.append(arg) else: invocation_args.append(arg) # }}} - queue = kwargs.pop("queue", None) - wait_for = kwargs.pop("wait_for", None) if kwargs: raise TypeError("unknown keyword arguments: '%s'" % ", ".join(kwargs)) @@ -303,13 +300,13 @@ class ElementwiseKernel: invocation_args.append(step) - from pyopencl.array import splay - gs, ls = splay(queue, + from pyopencl.array import _splay + gs, ls = _splay(queue.device, abs(range_.stop - start)//step, max_wg_size) else: invocation_args.append(repr_vec.size) - gs, ls = repr_vec.get_sizes(queue, max_wg_size) + gs, ls = repr_vec._get_sizes(queue, max_wg_size) if capture_as is not None: kernel.set_args(*invocation_args) @@ -317,9 +314,7 @@ class ElementwiseKernel: capture_as, queue, gs, ls, *invocation_args, wait_for=wait_for) - kernel.set_args(*invocation_args) - return cl.enqueue_nd_range_kernel(queue, kernel, - gs, ls, wait_for=wait_for) + return kernel(queue, gs, ls, *invocation_args, wait_for=wait_for) # }}} @@ -465,20 +460,20 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): def get_copy_kernel(context, dtype_dest, dtype_src): src = "src[i]" if dtype_dest.kind == "c" != dtype_src.kind: - src = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), src) + src = "{}_fromreal({})".format(complex_dtype_to_name(dtype_dest), src) if dtype_dest.kind == "c" and dtype_src != dtype_dest: - src = "%s_cast(%s)" % (complex_dtype_to_name(dtype_dest), src), + src = "{}_cast({})".format(complex_dtype_to_name(dtype_dest), src), if dtype_dest != dtype_src and ( dtype_dest.kind == "V" or dtype_src.kind == 
"V"): raise TypeError("copying between non-identical struct types") return get_elwise_kernel(context, - "%(tp_dest)s *dest, %(tp_src)s *src" % { - "tp_dest": dtype_to_ctype(dtype_dest), - "tp_src": dtype_to_ctype(dtype_src), - }, + "{tp_dest} *dest, {tp_src} *src".format( + tp_dest=dtype_to_ctype(dtype_dest), + tp_src=dtype_to_ctype(dtype_src), + ), "dest[i] = %s" % src, preamble=dtype_to_c_struct(context.devices[0], dtype_dest), name="copy") @@ -499,40 +494,40 @@ def real_dtype(dtype): @context_dependent_memoize def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): - ax = "a*x[i]" - by = "b*y[i]" + result_t = dtype_to_ctype(dtype_z) x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - if x_is_complex: - ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x) - - if y_is_complex: - by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y) + if dtype_z.kind == "c": + # a and b will always be complex here. + z_ct = complex_dtype_to_name(dtype_z) - if x_is_complex and not y_is_complex: - by = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_x), by) + if x_is_complex: + ax = f"{z_ct}_mul(a, {z_ct}_cast(x[i]))" + else: + ax = f"{z_ct}_mulr(a, x[i])" - if not x_is_complex and y_is_complex: - ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax) + if y_is_complex: + by = f"{z_ct}_mul(b, {z_ct}_cast(y[i]))" + else: + by = f"{z_ct}_mulr(b, y[i])" - if x_is_complex or y_is_complex: - result = ( - "{root}_add({root}_cast({ax}), {root}_cast({by}))" - .format( - ax=ax, - by=by, - root=complex_dtype_to_name(dtype_z))) + result = f"{z_ct}_add({ax}, {by})" else: - result = "%s + %s" % (ax, by) + # real-only + + ax = f"a*(({result_t}) x[i])" + by = f"b*(({result_t}) y[i])" + + result = f"{ax} + {by}" return get_elwise_kernel(context, - "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, + "{tp_z} *z, {tp_z} a, {tp_x} *x, {tp_z} 
b, {tp_y} *y".format( + tp_x=dtype_to_ctype(dtype_x), + tp_y=dtype_to_ctype(dtype_y), + tp_z=dtype_to_ctype(dtype_z), + ), "z[i] = %s" % result, name="axpbyz") @@ -551,33 +546,33 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): x = "x[i]" if dtype_x != dtype_z: - x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) + x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x) if a_is_complex: if dtype_a != dtype_z: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) + a = "{}_cast({})".format(complex_dtype_to_name(dtype_z), a) - ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + ax = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_z), a, x) else: - ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + ax = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_z), a, x) elif a_is_complex: a = "a" x = "x[i]" if dtype_a != dtype_z: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) - ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + a = "{}_cast({})".format(complex_dtype_to_name(dtype_z), a) + ax = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_z), a, x) b = "b" if z_is_complex and not b_is_complex: - b = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), b) + b = "{}_fromreal({})".format(complex_dtype_to_name(dtype_z), b) if z_is_complex and not (a_is_complex or x_is_complex): - ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), ax) + ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_z), ax) if z_is_complex: - ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax) - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b) + ax = "{}_cast({})".format(complex_dtype_to_name(dtype_z), ax) + b = "{}_cast({})".format(complex_dtype_to_name(dtype_z), b) if a_is_complex or x_is_complex or b_is_complex: expr = "{root}_add({ax}, {b})".format( @@ -585,15 +580,15 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): b=b, root=complex_dtype_to_name(dtype_z)) 
else: - expr = "%s + %s" % (ax, b) + expr = f"{ax} + {b}" return get_elwise_kernel(context, - "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % { - "tp_a": dtype_to_ctype(dtype_a), - "tp_x": dtype_to_ctype(dtype_x), - "tp_b": dtype_to_ctype(dtype_b), - "tp_z": dtype_to_ctype(dtype_z), - }, + "{tp_z} *z, {tp_a} a, {tp_x} *x,{tp_b} b".format( + tp_a=dtype_to_ctype(dtype_a), + tp_x=dtype_to_ctype(dtype_x), + tp_b=dtype_to_ctype(dtype_b), + tp_z=dtype_to_ctype(dtype_z), + ), "z[i] = " + expr, name="axpb") @@ -607,25 +602,25 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): y = "y[i]" if x_is_complex and dtype_x != dtype_z: - x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) + x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: - y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) + y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: - xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xy = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif x_is_complex and not y_is_complex: - xy = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xy = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: - xy = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xy = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_z), x, y) else: - xy = "%s * %s" % (x, y) + xy = f"{x} * {y}" return get_elwise_kernel(context, - "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, + "{tp_z} *z, {tp_x} *x, {tp_y} *y".format( + tp_x=dtype_to_ctype(dtype_x), + tp_y=dtype_to_ctype(dtype_y), + tp_z=dtype_to_ctype(dtype_z), + ), "z[i] = %s" % xy, name="multiply") @@ -641,28 +636,28 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): if z_is_complex and dtype_x != dtype_y: if 
x_is_complex and dtype_x != dtype_z: - x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) + x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: - y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) + y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: - xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xoy = "{}_divide({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: - xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xoy = "{}_rdivide({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif x_is_complex and not y_is_complex: - xoy = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + xoy = "{}_divider({}, {})".format(complex_dtype_to_name(dtype_z), x, y) else: - xoy = "%s / %s" % (x, y) + xoy = f"{x} / {y}" if z_is_complex: - xoy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xoy) + xoy = "{}_cast({})".format(complex_dtype_to_name(dtype_z), xoy) return get_elwise_kernel(context, - "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, + "{tp_z} *z, {tp_x} *x, {tp_y} *y".format( + tp_x=dtype_to_ctype(dtype_x), + tp_y=dtype_to_ctype(dtype_y), + tp_z=dtype_to_ctype(dtype_z), + ), "z[i] = %s" % xoy, name="divide") @@ -679,25 +674,25 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z): if z_is_complex and dtype_x != dtype_y: if x_is_complex and dtype_x != dtype_z: - x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) + x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: - y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) + y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: - yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) + yox = 
"{}_divide({}, {})".format(complex_dtype_to_name(dtype_z), y, x) elif not y_is_complex and x_is_complex: - yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) + yox = "{}_rdivide({}, {})".format(complex_dtype_to_name(dtype_z), y, x) elif y_is_complex and not x_is_complex: - yox = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) + yox = "{}_divider({}, {})".format(complex_dtype_to_name(dtype_z), y, x) else: - yox = "%s / %s" % (y, x) + yox = f"{y} / {x}" return get_elwise_kernel(context, - "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, + "{tp_z} *z, {tp_x} *x, {tp_y} y".format( + tp_x=dtype_to_ctype(dtype_x), + tp_y=dtype_to_ctype(dtype_y), + tp_z=dtype_to_ctype(dtype_z), + ), "z[i] = %s" % yox, name="divide_r") @@ -705,9 +700,9 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z): @context_dependent_memoize def get_fill_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s *z, %(tp)s a" % { - "tp": dtype_to_ctype(dtype), - }, + "{tp} *z, {tp} a".format( + tp=dtype_to_ctype(dtype), + ), "z[i] = a", preamble=dtype_to_c_struct(context.devices[0], dtype), name="fill") @@ -716,9 +711,9 @@ def get_fill_kernel(context, dtype): @context_dependent_memoize def get_reverse_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s *z, %(tp)s *y" % { - "tp": dtype_to_ctype(dtype), - }, + "{tp} *z, {tp} *y".format( + tp=dtype_to_ctype(dtype), + ), "z[i] = y[n-1-i]", name="reverse") @@ -764,23 +759,23 @@ def get_pow_kernel(context, dtype_x, dtype_y, dtype_z, if z_is_complex and dtype_x != dtype_y: if x_is_complex and dtype_x != dtype_z: - x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) + x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: - y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) + y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), 
y) elif dtype_x != dtype_y: if dtype_x != dtype_z: - x = "(%s) (%s)" % (dtype_to_ctype(dtype_z), x) + x = "({}) ({})".format(dtype_to_ctype(dtype_z), x) if dtype_y != dtype_z: - y = "(%s) (%s)" % (dtype_to_ctype(dtype_z), y) + y = "({}) ({})".format(dtype_to_ctype(dtype_z), y) if x_is_complex and y_is_complex: - result = "%s_pow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + result = "{}_pow({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif x_is_complex and not y_is_complex: - result = "%s_powr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + result = "{}_powr({}, {})".format(complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: - result = "%s_rpow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + result = "{}_rpow({}, {})".format(complex_dtype_to_name(dtype_z), x, y) else: - result = "pow(%s, %s)" % (x, y) + result = f"pow({x}, {y})" return get_elwise_kernel(context, ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) % { @@ -876,7 +871,7 @@ def get_binary_func_kernel(context, func_name, x_dtype, y_dtype, out_dtype, def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype, out_dtype, preamble="", name=None): if (np.array(0, x_dtype) * np.array(0, y_dtype)).itemsize > 4: - arg_type = 'double' + arg_type = "double" preamble = """ #if __OPENCL_C_VERSION__ < 120 #pragma OPENCL EXTENSION cl_khr_fp64: enable @@ -884,13 +879,13 @@ def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype, #define PYOPENCL_DEFINE_CDOUBLE """ + preamble else: - arg_type = 'float' + arg_type = "float" return get_elwise_kernel(context, [ VectorArg(out_dtype, "z", with_offset=True), VectorArg(x_dtype, "x", with_offset=True), VectorArg(y_dtype, "y", with_offset=True), ], - "z[i] = %s((%s)x[i], (%s)y[i])" % (func_name, arg_type, arg_type), + f"z[i] = {func_name}(({arg_type})x[i], ({arg_type})y[i])", name="%s_kernel" % func_name if name is None else name, preamble=preamble) @@ -898,7 +893,7 @@ def get_float_binary_func_kernel(context, 
func_name, x_dtype, y_dtype, @context_dependent_memoize def get_fmod_kernel(context, out_dtype=np.float32, arg_dtype=np.float32, mod_dtype=np.float32): - return get_float_binary_func_kernel(context, 'fmod', arg_dtype, + return get_float_binary_func_kernel(context, "fmod", arg_dtype, mod_dtype, out_dtype) @@ -936,7 +931,7 @@ def get_frexp_kernel(context, sign_dtype=np.float32, exp_dtype=np.float32, def get_ldexp_kernel(context, out_dtype=np.float32, sig_dtype=np.float32, expt_dtype=np.float32): return get_binary_func_kernel( - context, '_PYOCL_LDEXP', sig_dtype, expt_dtype, out_dtype, + context, "_PYOCL_LDEXP", sig_dtype, expt_dtype, out_dtype, preamble="#define _PYOCL_LDEXP(x, y) ldexp(x, (int)(y))", name="ldexp_kernel") diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index e29db97cd4b6d01d11fd02508fc8d981ea07ff5f..8c17699b0d7e3cb91924d84acb67185ca3267e82 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = """ Copyright (C) 2017 Andreas Kloeckner """ @@ -24,96 +22,48 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import sys import numpy as np from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict -from pyopencl.tools import _NumpyTypesKeyBuilder - -_PYPY = '__pypy__' in sys.builtin_module_names -_CPY2 = not _PYPY and sys.version_info < (3,) -_CPY26 = _CPY2 and sys.version_info < (2, 7) +from pytools.py_codegen import Indentation, PythonCodeGenerator +from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg # {{{ arg packing helpers _size_t_char = ({ - 8: 'Q', - 4: 'L', - 2: 'H', - 1: 'B', + 8: "Q", + 4: "L", + 2: "H", + 1: "B", })[_cl._sizeof_size_t()] _type_char_map = { - 'n': _size_t_char.lower(), - 'N': _size_t_char + "n": _size_t_char.lower(), + "N": _size_t_char } del _size_t_char # }}} -# {{{ individual arg handling - -def generate_buffer_arg_setter(gen, arg_idx, buf_var): - from pytools.py_codegen import Indentation - - if _CPY2 or _PYPY: - # https://github.com/numpy/numpy/issues/5381 - gen("if isinstance({buf_var}, np.generic):".format(buf_var=buf_var)) - with Indentation(gen): - if _PYPY: - gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var)) - else: - gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var)) - - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_bytes_arg_setter(gen, arg_idx, buf_var): - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_generic_arg_handler(gen, arg_idx, arg_var): - from pytools.py_codegen import Indentation - - gen(""" - if {arg_var} is None: - self._set_arg_null({arg_idx}) - elif isinstance({arg_var}, _KERNEL_ARG_CLASSES): - self.set_arg({arg_idx}, {arg_var}) - """ - .format(arg_idx=arg_idx, arg_var=arg_var)) - - gen("else:") - with Indentation(gen): - generate_buffer_arg_setter(gen, arg_idx, arg_var) - -# }}} - - # {{{ generic arg handling body def generate_generic_arg_handling_body(num_args): - from 
pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() if num_args == 0: gen("pass") + else: + gen_indices_and_args = [] + for i in range(num_args): + gen_indices_and_args.append(i) + gen_indices_and_args.append(f"arg{i}") - for i in range(num_args): - gen("# process argument {arg_idx}".format(arg_idx=i)) - gen("") - gen("current_arg = {arg_idx}".format(arg_idx=i)) - generate_generic_arg_handler(gen, i, "arg%d" % i) - gen("") + gen(f"self._set_arg_multi(" + f"({', '.join(str(i) for i in gen_indices_and_args)},), " + ")") return gen @@ -122,9 +72,13 @@ def generate_generic_arg_handling_body(num_args): # {{{ specific arg handling body +BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"] + + def generate_specific_arg_handling_body(function_name, - num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + num_cl_args, arg_types, + work_around_arg_count_bug, warn_about_arg_count_bug, + in_enqueue): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -132,28 +86,75 @@ def generate_specific_arg_handling_body(function_name, fp_arg_count = 0 cl_arg_idx = 0 - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() - if not scalar_arg_dtypes: + if not arg_types: gen("pass") - for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): - gen("# process argument {arg_idx}".format(arg_idx=arg_idx)) + gen_indices_and_args = [] + buf_indices_and_args = [] + buf_pack_indices_and_args = [] + + def add_buf_arg(arg_idx, typechar, expr_str): + if typechar in BUF_PACK_TYPECHARS: + buf_pack_indices_and_args.append(arg_idx) + buf_pack_indices_and_args.append(repr(typechar.encode())) + buf_pack_indices_and_args.append(expr_str) + else: + buf_indices_and_args.append(arg_idx) + buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") + + if in_enqueue and arg_types is not None and \ + any(isinstance(arg_type, VectorArg) for arg_type in arg_types): + # 
We're about to modify wait_for, make sure it's a copy. + gen(""" + if wait_for is None: + wait_for = [] + else: + wait_for = list(wait_for) + """) gen("") - gen("current_arg = {arg_idx}".format(arg_idx=arg_idx)) + + for arg_idx, arg_type in enumerate(arg_types): arg_var = "arg%d" % arg_idx - if arg_dtype is None: - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + if arg_type is None: + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(arg_var) cl_arg_idx += 1 gen("") continue - arg_dtype = np.dtype(arg_dtype) + elif isinstance(arg_type, VectorArg): + gen(f"if not {arg_var}.flags.forc:") + with Indentation(gen): + gen("raise RuntimeError('only contiguous arrays may '") + gen(" 'be used as arguments to this operation')") + gen("") + + if in_enqueue: + gen(f"assert {arg_var}.queue is None or {arg_var}.queue == queue, " + "'queues for all arrays must match the queue supplied " + "to enqueue'") + + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(f"{arg_var}.base_data") + cl_arg_idx += 1 + + if arg_type.with_offset: + add_buf_arg(cl_arg_idx, np.dtype(np.int64).char, f"{arg_var}.offset") + cl_arg_idx += 1 + + if in_enqueue: + gen(f"wait_for.extend({arg_var}.events)") + + continue + + arg_dtype = np.dtype(arg_type) if arg_dtype.char == "V": - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -162,7 +163,7 @@ def generate_specific_arg_handling_body(function_name, "some (but not all) of the target devices mishandle " "struct kernel arguments (hence the workaround is " "disabled".format( - knl_name=function_name, stacklevel=2)) + knl_name=function_name), stacklevel=2) if arg_dtype == np.complex64: arg_char = "f" @@ -174,16 +175,9 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - gen( - 
"buf = pack('{arg_char}', {arg_var}.real)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.real") cl_arg_idx += 1 - gen("current_arg = current_arg + 1000") - gen( - "buf = pack('{arg_char}', {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.imag") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -195,41 +189,35 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - gen( - "buf = pack('{arg_char}{arg_char}', " - "{arg_var}.real, {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append( + f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 fp_arg_count += 2 - elif arg_dtype.char in "IL" and _CPY26: - # Prevent SystemError: ../Objects/longobject.c:336: bad - # argument to internal function - - gen( - "buf = pack('{arg_char}', long({arg_var}))" - .format(arg_char=arg_dtype.char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") - cl_arg_idx += 1 - else: if arg_dtype.kind == "f": fp_arg_count += 1 arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - gen( - "buf = pack('{arg_char}', {arg_var})" - .format( - arg_char=arg_char, - arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, arg_var) cl_arg_idx += 1 gen("") + for arg_kind, args_and_indices, entry_length in [ + ("", gen_indices_and_args, 2), + ("_buf", buf_indices_and_args, 2), + ("_buf_pack", buf_pack_indices_and_args, 3), + ]: + assert len(args_and_indices) % entry_length == 0 + if args_and_indices: + gen(f"self._set_arg{arg_kind}_multi(" + f"({', '.join(str(i) for i in 
args_and_indices)},), " + ")") + if cl_arg_idx != num_cl_args: raise TypeError( "length of argument list (%d) and " @@ -241,77 +229,29 @@ def generate_specific_arg_handling_body(function_name, # }}} -# {{{ error handler - -def wrap_in_error_handler(body, arg_names): - from pytools.py_codegen import PythonCodeGenerator, Indentation - - err_gen = PythonCodeGenerator() - - def gen_error_handler(): - err_gen(""" - if current_arg is not None: - args = [{args}] - advice = "" - from pyopencl.array import Array - if isinstance(args[current_arg], Array): - advice = " (perhaps you meant to pass 'array.data' " \ - "instead of the array itself?)" - - raise _cl.LogicError( - "when processing argument #%d (1-based): %s%s" - % (current_arg+1, str(e), advice)) - else: - raise - """ - .format(args=", ".join(arg_names))) - err_gen("") - - err_gen("try:") - with Indentation(err_gen): - err_gen.extend(body) - err_gen("except TypeError as e:") - with Indentation(err_gen): - gen_error_handler() - err_gen("except _cl.LogicError as e:") - with Indentation(err_gen): - gen_error_handler() - - return err_gen - -# }}} - - -def add_local_imports(gen): - gen("import numpy as np") - gen("import pyopencl._cl as _cl") - gen("from pyopencl import _KERNEL_ARG_CLASSES") - gen("") - - def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): - from pytools.py_codegen import PythonCodeGenerator, Indentation - arg_names = ["arg%d" % i for i in range(num_passed_args)] - if scalar_arg_dtypes is None: - body = generate_generic_arg_handling_body(num_passed_args) - else: - body = generate_specific_arg_handling_body( - function_name, num_cl_args, scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) - - err_handler = wrap_in_error_handler(body, arg_names) + def gen_arg_setting(in_enqueue): + if arg_types is None: + return 
generate_generic_arg_handling_body(num_passed_args) + else: + return generate_specific_arg_handling_body( + function_name, num_cl_args, arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug, + in_enqueue=in_enqueue) gen = PythonCodeGenerator() gen("from struct import pack") gen("from pyopencl import status_code") + gen("import numpy as np") + gen("import pyopencl._cl as _cl") gen("") # {{{ generate _enqueue @@ -322,16 +262,19 @@ def _generate_enqueue_and_set_args_module(function_name, ", ".join( ["self", "queue", "global_size", "local_size"] + arg_names - + ["global_offset=None", "g_times_l=None", + + ["global_offset=None", + "g_times_l=None", + "allow_empty_ndrange=False", "wait_for=None"]))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=True)) + # Using positional args here because pybind is slow with keyword args gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l) + global_offset, wait_for, g_times_l, + allow_empty_ndrange) """) # }}} @@ -343,8 +286,7 @@ def _generate_enqueue_and_set_args_module(function_name, % (", ".join(["self"] + arg_names))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=False)) # }}} @@ -352,17 +294,17 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v7", + "pyopencl-invoker-cache-v39", key_builder=_NumpyTypesKeyBuilder()) def generate_enqueue_and_set_args(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): cache_key = (function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug) from_cache = False diff --git a/pyopencl/ipython_ext.py 
b/pyopencl/ipython_ext.py index ce80fc07a8b774996f0154f34fec24d2d2b98e1f..619ac5908b2bc1925ad302146b6b116e638b532e 100644 --- a/pyopencl/ipython_ext.py +++ b/pyopencl/ipython_ext.py @@ -1,15 +1,11 @@ -from __future__ import division -from __future__ import absolute_import - from IPython.core.magic import (magics_class, Magics, cell_magic, line_magic) import pyopencl as cl import sys -import six def _try_to_utf8(text): - if isinstance(text, six.text_type): + if isinstance(text, str): return text.encode("utf8") return text @@ -48,16 +44,16 @@ class PyOpenCLMagics(Magics): def cl_kernel(self, line, cell): kernel = cell - opts, args = self.parse_options(line, 'o:') - build_options = opts.get('o', '') + opts, args = self.parse_options(line, "o:") + build_options = opts.get("o", "") self._run_kernel(kernel, build_options) def _load_kernel_and_options(self, line): - opts, args = self.parse_options(line, 'o:f:') + opts, args = self.parse_options(line, "o:f:") - build_options = opts.get('o') - kernel = self.shell.find_user_code(opts.get('f') or args) + build_options = opts.get("o") + kernel = self.shell.find_user_code(opts.get("f") or args) return kernel, build_options @@ -72,9 +68,9 @@ class PyOpenCLMagics(Magics): header = "%%cl_kernel" if build_options: - header = '%s -o "%s"' % (header, build_options) + header = f'{header} -o "{build_options}"' - content = "%s\n\n%s" % (header, kernel) + content = f"{header}\n\n{kernel}" self.shell.set_next_input(content) diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 7c017419359bdd5b6baacf419a08980890cdadbe..7d6c6482c4dc16620fd9f255938ac86a1f349833 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -1,8 +1,5 @@ """Computation of reductions on vectors.""" -from __future__ import division -from __future__ import absolute_import -from six.moves import zip __copyright__ = "Copyright (C) 2010 Andreas Kloeckner" @@ -159,7 +156,6 @@ def _get_reduction_source( # }}} from mako.template import Template - from 
pytools import all from pyopencl.characterize import has_double_support arguments = ", ".join(arg.declarator() for arg in parsed_args) @@ -304,6 +300,12 @@ class ReductionKernel: return_event = kwargs.pop("return_event", False) out = kwargs.pop("out", None) + if wait_for is None: + wait_for = [] + else: + # We'll be modifying it below. + wait_for = list(wait_for) + range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) @@ -327,6 +329,7 @@ class ReductionKernel: invocation_args.append(arg.base_data) if arg_tp.with_offset: invocation_args.append(arg.offset) + wait_for.extend(arg.events) else: invocation_args.append(arg) @@ -382,10 +385,16 @@ class ReductionKernel: else: allocator = repr_vec.allocator - if sz <= stage_inf.group_size*SMALL_SEQ_COUNT*MAX_GROUP_COUNT: + if sz == 0: + result = empty(use_queue, (), self.dtype_out, allocator=allocator) + group_count = 1 + seq_count = 0 + + elif sz <= stage_inf.group_size*SMALL_SEQ_COUNT*MAX_GROUP_COUNT: total_group_size = SMALL_SEQ_COUNT*stage_inf.group_size group_count = (sz + total_group_size - 1) // total_group_size seq_count = SMALL_SEQ_COUNT + else: group_count = MAX_GROUP_COUNT macrogroup_size = group_count*stage_inf.group_size @@ -410,9 +419,11 @@ class ReductionKernel: (stage_inf.group_size,), *([result.base_data, result.offset] + invocation_args + size_args), - **dict(wait_for=wait_for)) + wait_for=wait_for) wait_for = [last_evt] + result.add_event(last_evt) + if group_count == 1: if return_event: return result, last_evt @@ -528,22 +539,22 @@ def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first, b = "b[%s]" % index_expr if a_is_complex and (dtype_a != dtype_out): - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) + a = "{}_cast({})".format(complex_dtype_to_name(dtype_out), a) if b_is_complex and (dtype_b != dtype_out): - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) + b = "{}_cast({})".format(complex_dtype_to_name(dtype_out), b) if a_is_complex and conjugate_first and 
a_is_complex: - a = "%s_conj(%s)" % ( + a = "{}_conj({})".format( complex_dtype_to_name(dtype_out), a) if a_is_complex and not b_is_complex: - map_expr = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + map_expr = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_out), a, b) elif not a_is_complex and b_is_complex: - map_expr = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + map_expr = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_out), a, b) elif a_is_complex and b_is_complex: - map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + map_expr = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_out), a, b) else: - map_expr = "%s*%s" % (a, b) + map_expr = f"{a}*{b}" return map_expr, dtype_out, dtype_b @@ -625,10 +636,10 @@ def get_minmax_kernel(ctx, what, dtype): return ReductionKernel(ctx, dtype, neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - arguments="const %(tp)s *in" % { - "tp": dtype_to_ctype(dtype), - }, preamble="#define MY_INFINITY (1./0)") + reduce_expr=f"{reduce_expr}", + arguments="const {tp} *in".format( + tp=dtype_to_ctype(dtype), + ), preamble="#define MY_INFINITY (1./0)") @context_dependent_memoize @@ -642,7 +653,7 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): return ReductionKernel(ctx, dtype, neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, + reduce_expr=f"{reduce_expr}", map_expr="in[lookup_tbl[i]]", arguments=( "const %(tp_lut)s *lookup_tbl, " diff --git a/pyopencl/scan.py b/pyopencl/scan.py index 71460f25beaadff9258aa79d3f6d34b58e362546..0106207cb1b91eaf9289f4f03b47dfe5b015b669 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -1,6 +1,5 @@ """Scan primitive.""" -from __future__ import division, absolute_import __copyright__ = """ Copyright 2011-2012 Andreas Kloeckner @@ -21,12 +20,8 @@ See the License for the specific language governing permissions 
and limitations under the License. Derived from code within the Thrust project, https://github.com/thrust/thrust/ - """ -import six -from six.moves import range, zip - import numpy as np import pyopencl as cl @@ -940,7 +935,7 @@ class ScanPerformanceWarning(UserWarning): pass -class _GenericScanKernelBase(object): +class _GenericScanKernelBase: # {{{ constructor, argument processing def __init__(self, ctx, dtype, @@ -1084,7 +1079,6 @@ class _GenericScanKernelBase(object): # {{{ set up shared code dict - from pytools import all from pyopencl.characterize import has_double_support self.code_variables = dict( @@ -1263,7 +1257,6 @@ class GenericScanKernel(_GenericScanKernelBase): solutions.append((wg_size*k_group_size, k_group_size, wg_size)) if is_gpu: - from pytools import any for wg_size_floor in [256, 192, 128]: have_sol_above_floor = any(wg_size >= wg_size_floor for _, _, wg_size in solutions) @@ -1469,6 +1462,11 @@ class GenericScanKernel(_GenericScanKernelBase): n = kwargs.get("size") wait_for = kwargs.get("wait_for") + if wait_for is None: + wait_for = [] + else: + wait_for = list(wait_for) + if len(args) != len(self.parsed_args): raise TypeError("expected %d arguments, got %d" % (len(self.parsed_args), len(args))) @@ -1491,6 +1489,7 @@ class GenericScanKernel(_GenericScanKernelBase): data_args.append(arg_val.base_data) if arg_descr.with_offset: data_args.append(arg_val.offset) + wait_for.extend(arg_val.events) else: data_args.append(arg_val) @@ -1541,7 +1540,7 @@ class GenericScanKernel(_GenericScanKernelBase): l1_evt = l1_info.kernel( queue, (num_intervals,), (l1_info.wg_size,), - *scan1_args, **dict(g_times_l=True, wait_for=wait_for)) + *scan1_args, g_times_l=True, wait_for=wait_for) # }}} @@ -1561,7 +1560,7 @@ class GenericScanKernel(_GenericScanKernelBase): l2_evt = l2_info.kernel( queue, (1,), (l1_info.wg_size,), - *scan2_args, **dict(g_times_l=True, wait_for=[l1_evt])) + *scan2_args, g_times_l=True, wait_for=[l1_evt]) # }}} @@ -1577,7 +1576,7 @@ class 
GenericScanKernel(_GenericScanKernelBase): return self.final_update_info.kernel( queue, (num_intervals,), (self.final_update_info.update_wg_size,), - *upd_args, **dict(g_times_l=True, wait_for=[l2_evt])) + *upd_args, g_times_l=True, wait_for=[l2_evt]) # }}} @@ -1679,6 +1678,12 @@ class GenericDebugScanKernel(_GenericScanKernelBase): n = kwargs.get("size") wait_for = kwargs.get("wait_for") + if wait_for is None: + wait_for = [] + else: + # We'll be modifying it below. + wait_for = list(wait_for) + if len(args) != len(self.parsed_args): raise TypeError("expected %d arguments, got %d" % (len(self.parsed_args), len(args))) @@ -1701,13 +1706,14 @@ class GenericDebugScanKernel(_GenericScanKernelBase): data_args.append(arg_val.base_data) if arg_descr.with_offset: data_args.append(arg_val.offset) + wait_for.extend(arg_val.events) else: data_args.append(arg_val) # }}} return self.kernel(queue, (1,), (1,), - *(data_args + [n]), **dict(wait_for=wait_for)) + *(data_args + [n]), wait_for=wait_for) # }}} @@ -1721,7 +1727,7 @@ class _LegacyScanKernelBase(GenericScanKernel): scan_ctype = dtype_to_ctype(dtype) GenericScanKernel.__init__(self, ctx, dtype, - arguments="__global %s *input_ary, __global %s *output_ary" % ( + arguments="__global {} *input_ary, __global {} *output_ary".format( scan_ctype, scan_ctype), input_expr="input_ary[i]", scan_expr=scan_expr, @@ -1740,7 +1746,7 @@ class _LegacyScanKernelBase(GenericScanKernel): if output_ary is None: output_ary = input_ary - if isinstance(output_ary, (str, six.text_type)) and output_ary == "new": + if isinstance(output_ary, (str, str)) and output_ary == "new": output_ary = cl.array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 2a2d0f9d061d6f74c38dc07a4341fbf8081d463d..b16de3f64cd033cc0f4b3d2ce19b65a9e169d0ed 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -1,6 +1,5 @@ """Various helpful bits and pieces without much of a common 
theme.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2010 Andreas Kloeckner" @@ -28,12 +27,12 @@ OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import zip, intern +from sys import intern + +# Do not add a pyopencl import here: This will add an import cycle. import numpy as np from decorator import decorator -import pyopencl as cl from pytools import memoize, memoize_method from pyopencl._cl import bitlog2 # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase @@ -171,12 +170,39 @@ atexit.register(clear_first_arg_caches) # }}} +# {{{ pytest fixtures + +class _ContextFactory: + def __init__(self, device): + self.device = device + + def __call__(self): + # Get rid of leftovers from past tests. + # CL implementations are surprisingly limited in how many + # simultaneous contexts they allow... + clear_first_arg_caches() + + from gc import collect + collect() + + import pyopencl as cl + return cl.Context([self.device]) + + def __str__(self): + # Don't show address, so that parallel test collection works + return ("" % + (self.device.name.strip(), + self.device.platform.name.strip())) + + def get_test_platforms_and_devices(plat_dev_string=None): """Parse a string of the form 'PYOPENCL_TEST=0:0,1;intel:i5'. 
:return: list of tuples (platform, [device, device, ...]) """ + import pyopencl as cl + if plat_dev_string is None: import os plat_dev_string = os.environ.get("PYOPENCL_TEST", None) @@ -191,7 +217,7 @@ def get_test_platforms_and_devices(plat_dev_string=None): found = False for obj in objs: - if identifier.lower() in (obj.name + ' ' + obj.vendor).lower(): + if identifier.lower() in (obj.name + " " + obj.vendor).lower(): return obj if not found: raise RuntimeError("object '%s' not found" % identifier) @@ -226,34 +252,17 @@ def get_test_platforms_and_devices(plat_dev_string=None): for platform in cl.get_platforms()] -def pytest_generate_tests_for_pyopencl(metafunc): - class ContextFactory: - def __init__(self, device): - self.device = device - - def __call__(self): - # Get rid of leftovers from past tests. - # CL implementations are surprisingly limited in how many - # simultaneous contexts they allow... - - clear_first_arg_caches() +def get_pyopencl_fixture_arg_names(metafunc, extra_arg_names=None): + if extra_arg_names is None: + extra_arg_names = [] - from gc import collect - collect() - - return cl.Context([self.device]) - - def __str__(self): - # Don't show address, so that parallel test collection works - return ("" % - (self.device.name.strip(), - self.device.platform.name.strip())) - - test_plat_and_dev = get_test_platforms_and_devices() + supported_arg_names = [ + "platform", "device", + "ctx_factory", "ctx_getter", + ] + extra_arg_names arg_names = [] - - for arg in ("platform", "device", "ctx_factory", "ctx_getter"): + for arg in supported_arg_names: if arg not in metafunc.fixturenames: continue @@ -265,29 +274,52 @@ def pytest_generate_tests_for_pyopencl(metafunc): arg_names.append(arg) + return arg_names + + +def get_pyopencl_fixture_arg_values(): + import pyopencl as cl + arg_values = [] + for platform, devices in get_test_platforms_and_devices(): + for device in devices: + arg_dict = { + "platform": platform, + "device": device, + "ctx_factory": 
_ContextFactory(device), + "ctx_getter": _ContextFactory(device) + } + arg_values.append(arg_dict) - for platform, plat_devs in test_plat_and_dev: - if arg_names == ["platform"]: - arg_values.append((platform,)) - continue + def idfn(val): + if isinstance(val, cl.Platform): + # Don't show address, so that parallel test collection works + return f"" + else: + return str(val) + + return arg_values, idfn - arg_dict = {"platform": platform} - for device in plat_devs: - arg_dict["device"] = device - arg_dict["ctx_factory"] = ContextFactory(device) - arg_dict["ctx_getter"] = ContextFactory(device) +def pytest_generate_tests_for_pyopencl(metafunc): + arg_names = get_pyopencl_fixture_arg_names(metafunc) + if not arg_names: + return + + arg_values, ids = get_pyopencl_fixture_arg_values() + arg_values = [ + tuple(arg_dict[name] for name in arg_names) + for arg_dict in arg_values + ] - arg_values.append(tuple(arg_dict[name] for name in arg_names)) + metafunc.parametrize(arg_names, arg_values, ids=ids) - if arg_names: - metafunc.parametrize(arg_names, arg_values, ids=str) +# }}} # {{{ C argument lists -class Argument(object): +class Argument: pass @@ -297,31 +329,49 @@ class DtypedArgument(Argument): self.name = name def __repr__(self): - return "%s(%r, %s)" % ( + return "{}({!r}, {})".format( self.__class__.__name__, self.name, self.dtype) + def __eq__(self, other): + return (type(self) == type(other) + and self.dtype == other.dtype + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.dtype) + ^ hash(self.name)) + class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): - DtypedArgument.__init__(self, dtype, name) + super().__init__(dtype, name) self.with_offset = with_offset def declarator(self): if self.with_offset: # Two underscores -> less likelihood of a name clash. 
- return "__global %s *%s__base, long %s__offset" % ( + return "__global {} *{}__base, long {}__offset".format( dtype_to_ctype(self.dtype), self.name, self.name) else: - result = "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) + result = "__global {} *{}".format(dtype_to_ctype(self.dtype), self.name) return result + def __eq__(self, other): + return (super().__eq__(other) + and self.with_offset == other.with_offset) + + def __hash__(self): + return super().__hash__() ^ hash(self.with_offset) + class ScalarArg(DtypedArgument): def declarator(self): - return "%s %s" % (dtype_to_ctype(self.dtype), self.name) + return "{} {}".format(dtype_to_ctype(self.dtype), self.name) class OtherArg(Argument): @@ -332,6 +382,17 @@ class OtherArg(Argument): def declarator(self): return self.decl + def __eq__(self, other): + return (type(self) == type(other) + and self.decl == other.decl + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.decl) + ^ hash(self.name)) + def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: @@ -370,6 +431,20 @@ def parse_arg_list(arguments, with_offset=False): return [parse_single_arg(arg) for arg in arguments] +def get_arg_list_arg_types(arg_types): + result = [] + + for arg_type in arg_types: + if isinstance(arg_type, ScalarArg): + result.append(arg_type.dtype) + elif isinstance(arg_type, VectorArg): + result.append(arg_type) + else: + raise RuntimeError("arg type not understood: %s" % type(arg_type)) + + return tuple(result) + + def get_arg_list_scalar_arg_dtypes(arg_types): result = [] @@ -404,6 +479,8 @@ def get_arg_offset_adjuster_code(arg_types): def get_gl_sharing_context_properties(): + import pyopencl as cl + ctx_props = cl.context_properties from OpenGL import platform as gl_platform @@ -463,7 +540,11 @@ class _CDeclList: if dtype in pyopencl.cltypes.vec_type_to_scalar_and_count: return - for name, field_data in sorted(six.iteritems(dtype.fields)): + if 
hasattr(dtype, "subdtype") and dtype.subdtype is not None: + self.add_dtype(dtype.subdtype[0]) + return + + for name, field_data in sorted(dtype.fields.items()): field_dtype, offset = field_data[:2] self.add_dtype(field_dtype) @@ -541,15 +622,33 @@ def match_dtype_to_c_struct(device, name, dtype, context=None): function, not the original one. """ - fields = sorted(six.iteritems(dtype.fields), + import pyopencl as cl + + fields = sorted(dtype.fields.items(), key=lambda name_dtype_offset: name_dtype_offset[1][1]) c_fields = [] for field_name, dtype_and_offset in fields: field_dtype, offset = dtype_and_offset[:2] - c_fields.append(" %s %s;" % (dtype_to_ctype(field_dtype), field_name)) + if hasattr(field_dtype, "subdtype") and field_dtype.subdtype is not None: + array_dtype = field_dtype.subdtype[0] + if hasattr(array_dtype, "subdtype") and array_dtype.subdtype is not None: + raise NotImplementedError("nested array dtypes are not supported") + array_dims = field_dtype.subdtype[1] + dims_str = "" + try: + for dim in array_dims: + dims_str += "[%d]" % dim + except TypeError: + dims_str = "[%d]" % array_dims + c_fields.append(" {} {}{};".format( + dtype_to_ctype(array_dtype), field_name, dims_str) + ) + else: + c_fields.append( + " {} {};".format(dtype_to_ctype(field_dtype), field_name)) - c_decl = "typedef struct {\n%s\n} %s;\n\n" % ( + c_decl = "typedef struct {{\n{}\n}} {};\n\n".format( "\n".join(c_fields), name) @@ -601,7 +700,6 @@ def match_dtype_to_c_struct(device, name, dtype, context=None): size = int(size_and_offsets[0]) - from pytools import any offsets = size_and_offsets[1:] if any(ofs >= size for ofs in offsets): # offsets not plausible @@ -626,12 +724,12 @@ def match_dtype_to_c_struct(device, name, dtype, context=None): try: dtype_arg_dict = { - 'names': [field_name + "names": [field_name for field_name, (field_dtype, offset) in fields], - 'formats': [field_dtype + "formats": [field_dtype for field_name, (field_dtype, offset) in fields], - 'offsets': [int(x) 
for x in offsets], - 'itemsize': int(size_and_offsets[0]), + "offsets": [int(x) for x in offsets], + "itemsize": int(size_and_offsets[0]), } dtype = np.dtype(dtype_arg_dict) if dtype.itemsize != size_and_offsets[0]: @@ -647,8 +745,8 @@ def match_dtype_to_c_struct(device, name, dtype, context=None): for offset, (field_name, (field_dtype, _)) in zip(offsets, fields): if offset > total_size: padding_count += 1 - yield ('__pycl_padding%d' % padding_count, - 'V%d' % offset - total_size) + yield ("__pycl_padding%d" % padding_count, + "V%d" % offset - total_size) yield field_name, field_dtype total_size = field_dtype.itemsize + offset dtype = np.dtype(list(calc_field_type())) @@ -674,7 +772,7 @@ def dtype_to_c_struct(device, dtype): def dtypes_match(): result = len(dtype.fields) == len(matched_dtype.fields) - for name, val in six.iteritems(dtype.fields): + for name, val in dtype.fields.items(): result = result and matched_dtype.fields[name] == val return result @@ -745,7 +843,7 @@ class _ScalarArgPlaceholder(_ArgumentPlaceholder): target_class = ScalarArg -class _TemplateRenderer(object): +class _TemplateRenderer: def __init__(self, template, type_aliases, var_values, context=None, options=[]): self.template = template @@ -769,6 +867,7 @@ class _TemplateRenderer(object): return str(result) def get_rendered_kernel(self, txt, kernel_name): + import pyopencl as cl prg = cl.Program(self.context, self(txt)).build(self.options) kernel_name_prefix = self.var_dict.get("kernel_name_prefix") @@ -851,18 +950,18 @@ class _TemplateRenderer(object): if arguments is not None: cdl.visit_arguments(arguments) - for _, tv in sorted(six.iteritems(self.type_aliases)): + for _, tv in sorted(self.type_aliases.items()): cdl.add_dtype(tv) type_alias_decls = [ - "typedef %s %s;" % (dtype_to_ctype(val), name) - for name, val in sorted(six.iteritems(self.type_aliases)) + "typedef {} {};".format(dtype_to_ctype(val), name) + for name, val in sorted(self.type_aliases.items()) ] return 
cdl.get_declarations() + "\n" + "\n".join(type_alias_decls) -class KernelTemplateBase(object): +class KernelTemplateBase: def __init__(self, template_processor=None): self.template_processor = template_processor @@ -905,7 +1004,7 @@ class KernelTemplateBase(object): def build(self, context, *args, **kwargs): """Provide caching for an :meth:`build_inner`.""" - cache_key = (context, args, tuple(sorted(six.iteritems(kwargs)))) + cache_key = (context, args, tuple(sorted(kwargs.items()))) try: return self.build_cache[cache_key] except KeyError: @@ -960,7 +1059,7 @@ def array_module(a): def is_spirv(s): spirv_magic = b"\x07\x23\x02\x03" return ( - isinstance(s, six.binary_type) + isinstance(s, bytes) and ( s[:4] == spirv_magic or s[:4] == spirv_magic[::-1])) @@ -969,6 +1068,11 @@ def is_spirv(s): # {{{ numpy key types builder class _NumpyTypesKeyBuilder(KeyBuilderBase): + def update_for_VectorArg(self, key_hash, key): # noqa: N802 + self.rec(key_hash, key.dtype) + self.update_for_str(key_hash, key.name) + self.rec(key_hash, key.with_offset) + def update_for_type(self, key_hash, key): if issubclass(key, np.generic): self.update_for_str(key_hash, key.__name__) diff --git a/pyopencl/version.py b/pyopencl/version.py index c28843608e8afb5b21c249ae4de15c848f20adfb..8b36d4f9e9fce9d4f7b324fbb69d561f854d7a36 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2019, 1, 1) +VERSION = (2021, 1, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..f2a2f6894081711b89214e24c18a5104f99db607 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers= + bitonic: tests involving bitonic sort diff --git a/travis/build-wheels.sh b/scripts/build-wheels.sh similarity index 61% rename from travis/build-wheels.sh rename to scripts/build-wheels.sh index 
b004fa3df5f9879464966c46852e782446e30193..c2735b9197403aac571e844248d9d097af6da637 100755 --- a/travis/build-wheels.sh +++ b/scripts/build-wheels.sh @@ -4,11 +4,41 @@ set -e -x mkdir -p /deps cd /deps -yum install -y git yum +function start_spinner { + if [ -n "$SPINNER_PID" ]; then + return + fi + + >&2 echo "Building libraries..." + # Start a process that runs as a keep-alive + # to avoid travis quitting if there is no output + (while true; do + sleep 60 + >&2 echo "Still building..." + done) & + SPINNER_PID=$! + disown +} + +function stop_spinner { + if [ ! -n "$SPINNER_PID" ]; then + return + fi + + kill $SPINNER_PID + unset SPINNER_PID + + >&2 echo "Building libraries finished." +} + +#start_spinner + +curl https://tiker.net/tmp/.tmux.conf +yum install -y git yum openssl-devel curl -L -O http://cache.ruby-lang.org/pub/ruby/2.1/ruby-2.1.2.tar.gz tar -xf ruby-2.1.2.tar.gz cd ruby-2.1.2 -./configure +./configure --disable-install-doc --disable-install-rdoc make -j4 make install cd .. @@ -33,10 +63,12 @@ for PYBIN in /opt/python/*/bin; do NUMPY_VERSION="1.11.3" elif [[ "${PYBIN}" == *cp37* ]]; then NUMPY_VERSION="1.14.5" - elif [[ "${PYBIN}" == *cp35* ]]; then - NUMPY_VERSION="1.9.3" + elif [[ "${PYBIN}" == *cp38* ]]; then + NUMPY_VERSION="1.17.3" + elif [[ "${PYBIN}" == *cp39* ]]; then + NUMPY_VERSION="1.19.5" else - NUMPY_VERSION="1.8.2" + continue fi # Build with the oldest numpy available to be compatible with newer ones "${PYBIN}/pip" install "numpy==${NUMPY_VERSION}" pybind11 mako @@ -50,25 +82,27 @@ done # Bundle license files -/opt/python/cp37-cp37m/bin/pip install delocate -/opt/python/cp37-cp37m/bin/python /io/travis/fix-wheel.py /deps/ocl-icd/COPYING +/opt/python/cp39-cp39/bin/pip install delocate +/opt/python/cp39-cp39/bin/python /io/scripts/fix-wheel.py /deps/ocl-icd/COPYING if [[ "${TWINE_USERNAME}" == "" ]]; then echo "TWINE_USERNAME not set. 
Skipping uploading wheels" exit 0 fi -/opt/python/cp37-cp37m/bin/pip install twine +/opt/python/cp39-cp39/bin/pip install twine for WHEEL in /io/wheelhouse/pyopencl*.whl; do # dev - # /opt/python/cp37-cp37m/bin/twine upload \ + # /opt/python/cp39-cp39/bin/twine upload \ # --skip-existing \ # --repository-url https://test.pypi.org/legacy/ \ # -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" \ # "${WHEEL}" # prod - /opt/python/cp37-cp37m/bin/twine upload \ + /opt/python/cp39-cp39/bin/twine upload \ --skip-existing \ -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" \ "${WHEEL}" done + +#stop_spinner diff --git a/travis/fix-wheel.py b/scripts/fix-wheel.py similarity index 100% rename from travis/fix-wheel.py rename to scripts/fix-wheel.py diff --git a/setup.cfg b/setup.cfg index 2bc760d67cfc68d91478948399e51cf470abfe07..845fb8c484af1df89a79136b4bd54f97d89de4c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,3 +2,8 @@ ignore = E126,E127,E128,E123,E226,E241,E242,E265,W503,E402 max-line-length=85 exclude=pyopencl/compyte/ndarray,pyopencl/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ + diff --git a/setup.py b/setup.py index 1b71cb57655d47924c14795428a3b489caef6ccd..fa50aeed7409812c2d8751ece2f1ccb404eba7db 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function __copyright__ = """ Copyright (C) 2009-15 Andreas Kloeckner @@ -44,10 +42,10 @@ def get_config_schema(): "-fvisibility=hidden" ] - if 'darwin' in sys.platform: + if "darwin" in sys.platform: import platform osx_ver, _, _ = platform.mac_ver() - osx_ver = '.'.join(osx_ver.split('.')[:2]) + osx_ver = ".".join(osx_ver.split(".")[:2]) sysroot_paths = [ "/Applications/Xcode.app/Contents/Developer/Platforms/" @@ -57,14 +55,12 @@ def get_config_schema(): default_libs = [] default_cxxflags = default_cxxflags + [ - '-stdlib=libc++', '-mmacosx-version-min=10.7', - '-arch', 'i386', '-arch', 'x86_64' 
- ] + "-stdlib=libc++", "-mmacosx-version-min=10.7"] from os.path import isdir for srp in sysroot_paths: if isdir(srp): - default_cxxflags.extend(['-isysroot', srp]) + default_cxxflags.extend(["-isysroot", srp]) break default_ldflags = default_cxxflags[:] + ["-Wl,-framework,OpenCL"] @@ -151,7 +147,7 @@ def main(): finally: version_file.close() - exec(compile(version_file_contents, "pyopencl/version.py", 'exec'), ver_dic) + exec(compile(version_file_contents, "pyopencl/version.py", "exec"), ver_dic) try: import mako # noqa @@ -194,29 +190,25 @@ def main(): # metadata version=ver_dic["VERSION_TEXT"], description="Python wrapper for OpenCL", - long_description=open("README.rst", "rt").read(), + long_description=open("README.rst").read(), author="Andreas Kloeckner", author_email="inform@tiker.net", license="MIT", url="http://mathema.tician.de/software/pyopencl", classifiers=[ - 'Environment :: Console', - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: C++', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', + "Environment :: Console", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: C++", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", 
+ "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Physics", ], # build info @@ -234,33 +226,32 @@ def main(): ], include_dirs=INCLUDE_DIRS + [ get_pybind_include(), - get_pybind_include(user=True) ], library_dirs=conf["CL_LIB_DIR"], libraries=conf["CL_LIBNAME"], define_macros=list(conf["EXTRA_DEFINES"].items()), extra_compile_args=conf["CXXFLAGS"], extra_link_args=conf["LDFLAGS"], - language='c++', + language="c++", ), ], setup_requires=[ - "pybind11", + "pybind11>=2.5.0", "numpy", ], + python_requires="~=3.6", install_requires=[ "numpy", "pytools>=2017.6", "decorator>=3.2.0", "appdirs>=1.4.0", - "six>=1.9.0", # "Mako>=0.3.6", ], extras_require={ - 'pocl': ["pocl_binary_distribution>=1.2"], - 'oclgrind': ["oclgrind_binary_distribution>=18.3"], + "pocl": ["pocl_binary_distribution>=1.2"], + "oclgrind": ["oclgrind_binary_distribution>=18.3"], }, include_package_data=True, package_data={ @@ -272,11 +263,11 @@ def main(): ] }, - cmdclass={'build_ext': PybindBuildExtCommand}, + cmdclass={"build_ext": PybindBuildExtCommand}, zip_safe=False) -if __name__ == '__main__': +if __name__ == "__main__": main() # vim: foldmethod=marker diff --git a/src/clinfo_ext.h b/src/clinfo_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..fc87fd95c554db1c6c8958b78b3c3229dd380030 --- /dev/null +++ b/src/clinfo_ext.h @@ -0,0 +1,134 @@ +/* Include OpenCL header, and define OpenCL extensions, since what is and is not + * available in the official headers is very system-dependent */ + +#ifndef _EXT_H +#define _EXT_H + +#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H)) +#include +#else +#include +#endif + +/* These two defines were introduced in the 1.2 headers + * on 2012-11-30, so earlier versions don't have them + * (e.g. 
Debian wheezy) + */ + +#ifndef CL_DEVICE_IMAGE_PITCH_ALIGNMENT +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#endif + +/* + * Extensions + */ + +/* cl_khr_icd */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + + +/* cl_khr_fp64 */ +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 + +/* cl_khr_fp16 */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* cl_khr_terminate_context */ +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F + +/* cl_nv_device_attribute_query */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 +#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV 0x4007 +#define CL_DEVICE_PCI_BUS_ID_NV 0x4008 +#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 +#define CL_DEVICE_PCI_DOMAIN_ID_NV 0x400A + +/* cl_ext_atomic_counters_{32,64} */ +#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032 + +/* cl_amd_device_attribute_query */ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 +#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 +#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 +#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 +#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 +#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 +#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A +#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B +#define 
CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 +#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 +#define CL_DEVICE_PCIE_ID_AMD 0x4034 + +#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD +#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1 + +typedef union +{ + struct { cl_uint type; cl_uint data[5]; } raw; + struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie; +} cl_device_topology_amd; +#endif + +/* cl_amd_offline_devices */ +#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F + +/* cl_ext_device_fission */ +#define cl_ext_device_fission 1 + +typedef cl_ulong cl_device_partition_property_ext; + +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 +#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 /* cl_intel_device_partition_by_names */ +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_intel_advanced_motion_estimation */ +#define CL_DEVICE_ME_VERSION_INTEL 0x407E + +/* cl_qcom_ext_host_ptr */ +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 + +/* cl_khr_spir */ +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 + +/* cl_altera_device_temperature */ +#define CL_DEVICE_CORE_TEMPERATURE_ALTERA 0x40F3 + +/* cl_intel_simultaneous_sharing */ +#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 
0x4104 +#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 + +#endif diff --git a/src/mempool.hpp b/src/mempool.hpp index 3491c69db8f6372aa0132f046d7714494081d41e..23e5758a9ea54117273810f74129e9d97be08c4b 100644 --- a/src/mempool.hpp +++ b/src/mempool.hpp @@ -83,10 +83,18 @@ namespace PYGPU_PACKAGE // A held block is one that's been released by the application, but that // we are keeping around to dish out again. - unsigned m_held_blocks; + size_type m_held_blocks; // An active block is one that is in use by the application. - unsigned m_active_blocks; + size_type m_active_blocks; + + // "Managed" memory is "active" and "held" memory. + size_type m_managed_bytes; + + // "Active" bytes are bytes under the control of the application. + // This may be smaller than the actual allocated size reflected + // in m_managed_bytes. + size_type m_active_bytes; bool m_stop_holding; int m_trace; @@ -96,7 +104,9 @@ namespace PYGPU_PACKAGE public: memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4) : m_allocator(alloc.copy()), - m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), + m_held_blocks(0), m_active_blocks(0), + m_managed_bytes(0), m_active_bytes(0), + m_stop_holding(false), m_trace(false), m_leading_bits_in_bin_id(leading_bits_in_bin_id) { if (m_allocator->is_deferred()) @@ -210,7 +220,7 @@ namespace PYGPU_PACKAGE if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -229,7 +239,7 @@ namespace PYGPU_PACKAGE while (try_to_free_memory()) { - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -251,6 +261,7 @@ namespace PYGPU_PACKAGE void free(pointer_type p, size_type size) { --m_active_blocks; + m_active_bytes 
-= size; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) @@ -264,7 +275,10 @@ namespace PYGPU_PACKAGE << " entries" << std::endl; } else + { m_allocator->free(p); + m_managed_bytes -= alloc_size(bin_nr); + } } void free_held() @@ -276,6 +290,7 @@ namespace PYGPU_PACKAGE while (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -291,12 +306,18 @@ namespace PYGPU_PACKAGE free_held(); } - unsigned active_blocks() + size_type active_blocks() const { return m_active_blocks; } - unsigned held_blocks() + size_type held_blocks() const { return m_held_blocks; } + size_type managed_bytes() const + { return m_managed_bytes; } + + size_type active_bytes() const + { return m_active_bytes; } + bool try_to_free_memory() { // free largest stuff first @@ -307,6 +328,7 @@ namespace PYGPU_PACKAGE if (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -319,10 +341,12 @@ namespace PYGPU_PACKAGE } private: - pointer_type get_from_allocator(size_type alloc_sz) + pointer_type get_from_allocator(size_type alloc_sz, size_type size) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; + m_managed_bytes += alloc_sz; + m_active_bytes += size; return result; } @@ -334,6 +358,7 @@ namespace PYGPU_PACKAGE dec_held_blocks(); ++m_active_blocks; + m_active_bytes += size; return result; } diff --git a/src/pyopencl_ext.h b/src/pyopencl_ext.h index a9792d8b07cb5be7a2def7d0b4a50941639442e1..d72449198e74bf712785fd5f84bf2dbf7275ed82 100644 --- a/src/pyopencl_ext.h +++ b/src/pyopencl_ext.h @@ -49,6 +49,10 @@ typedef union #define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 #endif +#ifndef CL_DEVICE_PCI_DOMAIN_ID_NV +#define CL_DEVICE_PCI_DOMAIN_ID_NV 0x400A +#endif + /* }}} */ #endif diff --git a/src/wrap_cl.cpp b/src/wrap_cl.cpp index b9393ed0f4963968d7006d686e72c349eebc20ad..50a482016ebda749ec4031f9d7cb92ab200dfa12 100644 --- 
a/src/wrap_cl.cpp +++ b/src/wrap_cl.cpp @@ -43,11 +43,7 @@ extern void pyopencl_expose_mempool(py::module &m); static bool import_numpy_helper() { -#ifdef PYPY_VERSION - import_array(); -#else - import_array1(false); -#endif + import_array1(false); return true; } diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index ec4854c03e4294d50cc1c4fa78fb1148df5c83b3..03932d828e29bee75c4b17b47f797e05b5b322df 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -30,6 +30,19 @@ // CL 1.2 undecided: // clSetPrintfCallback +// CL 2.0 complete + +// CL 2.1 complete + +// CL 2.2 complete + +// CL 3.0 missing: +// clCreateBufferWithProperties +// clCreateImageWithProperties +// (no wrappers for now: OpenCL 3.0 does not define any optional properties for +// buffers or images, no implementations to test with.) + + // {{{ includes #define CL_USE_DEPRECATED_OPENCL_1_1_APIS @@ -52,7 +65,7 @@ #else // elsewhere ------------------------------------------------------------------ -#define CL_TARGET_OPENCL_VERSION 220 +#define CL_TARGET_OPENCL_VERSION 300 #include #include "pyopencl_ext.h" @@ -73,6 +86,7 @@ #endif +#include #include #include #include @@ -91,7 +105,9 @@ #define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION #else -#if defined(CL_VERSION_2_2) +#if defined(CL_VERSION_3_0) +#define PYOPENCL_CL_VERSION 0x3000 +#elif defined(CL_VERSION_2_2) #define PYOPENCL_CL_VERSION 0x2020 #elif defined(CL_VERSION_2_1) #define PYOPENCL_CL_VERSION 0x2010 @@ -108,14 +124,6 @@ #endif -#if (PY_VERSION_HEX >= 0x03000000) or defined(PYPY_VERSION) -#define PYOPENCL_USE_NEW_BUFFER_INTERFACE -#define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) std::move(s) -#else -#define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) (s) -#endif - - #if defined(_WIN32) // MSVC does not understand variable-length arrays #define PYOPENCL_STACK_CONTAINER(TYPE, NAME, COUNT) std::vector NAME(COUNT) @@ -365,7 +373,7 @@ -#define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \ +#define PYOPENCL_GET_TYPED_INFO(WHAT, FIRST_ARG, 
SECOND_ARG, TYPE) \ { \ TYPE param_value; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ @@ -382,14 +390,15 @@ \ if (py_wait_for.ptr() != Py_None) \ { \ - event_wait_list.resize(len(py_wait_for)); \ for (py::handle evt: py_wait_for) \ - event_wait_list[num_events_in_wait_list++] = \ - evt.cast().data(); \ + { \ + event_wait_list.push_back(evt.cast().data()); \ + ++num_events_in_wait_list; \ + } \ } #define PYOPENCL_WAITLIST_ARGS \ - num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front() + num_events_in_wait_list, (num_events_in_wait_list == 0) ? nullptr : &event_wait_list.front() #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \ try \ @@ -430,6 +439,7 @@ namespace pyopencl { class program; + class command_queue; // {{{ error class error : public std::runtime_error @@ -487,8 +497,8 @@ namespace pyopencl // {{{ buffer interface helper - // -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE + + class py_buffer_wrapper : public noncopyable { private: @@ -529,7 +539,7 @@ namespace pyopencl PyBuffer_Release(&m_buf); } }; -#endif + // }}} @@ -579,6 +589,20 @@ namespace pyopencl #endif PYOPENCL_GET_STR_INFO(Platform, m_platform, param_name); +#if PYOPENCL_CL_VERSION >= 0x2010 + case CL_PLATFORM_HOST_TIMER_RESOLUTION: + PYOPENCL_GET_TYPED_INFO(Platform, m_platform, param_name, cl_ulong); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + case CL_PLATFORM_NUMERIC_VERSION: + PYOPENCL_GET_TYPED_INFO(Platform, m_platform, param_name, cl_version); + case CL_PLATFORM_EXTENSIONS_WITH_VERSION: + { + std::vector result; + PYOPENCL_GET_VEC_INFO(Platform, m_platform, param_name, result); + PYOPENCL_RETURN_VECTOR(cl_name_version, result); + } +#endif default: throw error("Platform.get_info", CL_INVALID_VALUE); } @@ -670,7 +694,7 @@ namespace pyopencl py::object get_info(cl_device_info param_name) const { #define DEV_GET_INT_INF(TYPE) \ - PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE); + PYOPENCL_GET_TYPED_INFO(Device, m_device, param_name, TYPE); switch 
(param_name) { @@ -787,6 +811,10 @@ namespace pyopencl case CL_DEVICE_PCI_SLOT_ID_NV: DEV_GET_INT_INF(cl_uint); #endif +#ifdef CL_DEVICE_PCI_DOMAIN_ID_NV + case CL_DEVICE_PCI_DOMAIN_ID_NV: + DEV_GET_INT_INF(cl_uint); +#endif #ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD case CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD: DEV_GET_INT_INF(cl_bool); #endif @@ -837,15 +865,15 @@ namespace pyopencl // {{{ AMD dev attrs cl_amd_device_attribute_query // // types of AMD dev attrs divined from -// https://www.khronos.org/registry/cl/api/1.2/cl.hpp +// https://github.com/KhronosGroup/OpenCL-CLHPP/blob/3b03738fef487378b188d21cc5f2bae276aa8721/include/CL/opencl.hpp#L1471-L1500 #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong); #endif -/* FIXME #ifdef CL_DEVICE_TOPOLOGY_AMD case CL_DEVICE_TOPOLOGY_AMD: + PYOPENCL_GET_TYPED_INFO( + Device, m_device, param_name, cl_device_topology_amd); #endif -*/ #ifdef CL_DEVICE_BOARD_NAME_AMD case CL_DEVICE_BOARD_NAME_AMD: ; PYOPENCL_GET_STR_INFO(Device, m_device, param_name); @@ -876,6 +904,17 @@ namespace pyopencl #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint); #endif +// FIXME: MISSING: +// +// CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD +// CL_DEVICE_GFXIP_MAJOR_AMD +// CL_DEVICE_GFXIP_MINOR_AMD +// CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD +// CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD +// CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD +// CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD +// CL_DEVICE_PCIE_ID_AMD + // }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT @@ -904,6 +943,35 @@ namespace pyopencl case CL_DEVICE_MAX_NUM_SUB_GROUPS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: DEV_GET_INT_INF(cl_bool); #endif +#if PYOPENCL_CL_VERSION >= 0x3000 + case CL_DEVICE_NUMERIC_VERSION: DEV_GET_INT_INF(cl_version); + case CL_DEVICE_EXTENSIONS_WITH_VERSION: + case CL_DEVICE_ILS_WITH_VERSION: + case CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION: + 
case CL_DEVICE_OPENCL_C_ALL_VERSIONS: + case CL_DEVICE_OPENCL_C_FEATURES: + { + std::vector result; + PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); + PYOPENCL_RETURN_VECTOR(cl_name_version, result); + } + case CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES: DEV_GET_INT_INF(cl_device_atomic_capabilities); + case CL_DEVICE_ATOMIC_FENCE_CAPABILITIES: DEV_GET_INT_INF(cl_device_atomic_capabilities); + case CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT: DEV_GET_INT_INF(cl_bool); + case CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: DEV_GET_INT_INF(size_t); + case CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT: DEV_GET_INT_INF(cl_bool); + case CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT: DEV_GET_INT_INF(cl_bool); + +#ifdef CL_DEVICE_DEVICE_ENQUEUE_SUPPORT + case CL_DEVICE_DEVICE_ENQUEUE_SUPPORT: DEV_GET_INT_INF(cl_bool); +#endif +#ifdef CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES + case CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES: DEV_GET_INT_INF(cl_device_device_enqueue_capabilities); +#endif + + case CL_DEVICE_PIPE_SUPPORT: DEV_GET_INT_INF(cl_bool); +#endif + #ifdef CL_DEVICE_ME_VERSION_INTEL case CL_DEVICE_ME_VERSION_INTEL: DEV_GET_INT_INF(cl_uint); #endif @@ -968,6 +1036,23 @@ namespace pyopencl } #endif +#if PYOPENCL_CL_VERSION >= 0x2010 + py::tuple device_and_host_timer() const + { + cl_ulong device_timestamp, host_timestamp; + PYOPENCL_CALL_GUARDED(clGetDeviceAndHostTimer, + (m_device, &device_timestamp, &host_timestamp)); + return py::make_tuple(device_timestamp, host_timestamp); + } + + cl_ulong host_timer() const + { + cl_ulong host_timestamp; + PYOPENCL_CALL_GUARDED(clGetHostTimer, + (m_device, &host_timestamp)); + return host_timestamp; + } +#endif }; @@ -1038,7 +1123,7 @@ namespace pyopencl switch (param_name) { case CL_CONTEXT_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO( + PYOPENCL_GET_TYPED_INFO( Context, m_context, param_name, cl_uint); case CL_CONTEXT_DEVICES: @@ -1102,7 +1187,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1010 case CL_CONTEXT_NUM_DEVICES: - 
PYOPENCL_GET_INTEGRAL_INFO( + PYOPENCL_GET_TYPED_INFO( Context, m_context, param_name, cl_uint); #endif @@ -1146,11 +1231,15 @@ namespace pyopencl errno = 0; int match_count = sscanf(plat_version.c_str(), "OpenCL %d.%d ", &major_ver, &minor_ver); if (errno || match_count != 2) - throw error("Context._get_hex_version", CL_INVALID_VALUE, + throw error("Context._get_hex_platform_version", CL_INVALID_VALUE, "Platform version string did not have expected format"); return major_ver << 12 | minor_ver << 4; } + +#if PYOPENCL_CL_VERSION >= 0x2010 + void set_default_device_command_queue(device const &dev, command_queue const &queue); +#endif }; @@ -1439,11 +1528,29 @@ namespace pyopencl PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name, cl_device_id, device); case CL_QUEUE_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, + PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name, cl_uint); case CL_QUEUE_PROPERTIES: - PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, + PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name, cl_command_queue_properties); +#if PYOPENCL_CL_VERSION >= 0x2000 + case CL_QUEUE_SIZE: + PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name, + cl_uint); +#endif +#if PYOPENCL_CL_VERSION >= 0x2010 + case CL_QUEUE_DEVICE_DEFAULT: + PYOPENCL_GET_OPAQUE_INFO( + CommandQueue, m_queue, param_name, cl_command_queue, command_queue); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + case CL_QUEUE_PROPERTIES_ARRAY: + { + std::vector result; + PYOPENCL_GET_VEC_INFO(CommandQueue, m_queue, param_name, result); + PYOPENCL_RETURN_VECTOR(cl_queue_properties, result); + } +#endif default: throw error("CommandQueue.get_info", CL_INVALID_VALUE); @@ -1475,6 +1582,39 @@ namespace pyopencl { PYOPENCL_CALL_GUARDED(clFlush, (m_queue)); } void finish() { PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue)); } + + // not exposed to python + int get_hex_device_version() const + { + cl_device_id dev; + + 
PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, + (m_queue, CL_QUEUE_DEVICE, sizeof(dev), &dev, nullptr)); + + std::string dev_version; + { + size_t param_value_size; + PYOPENCL_CALL_GUARDED(clGetDeviceInfo, + (dev, CL_DEVICE_VERSION, 0, 0, ¶m_value_size)); + + std::vector param_value(param_value_size); + PYOPENCL_CALL_GUARDED(clGetDeviceInfo, + (dev, CL_DEVICE_VERSION, param_value_size, + param_value.empty( ) ? nullptr : ¶m_value.front(), ¶m_value_size)); + + dev_version = + param_value.empty( ) ? "" : std::string(¶m_value.front(), param_value_size-1); + } + + int major_ver, minor_ver; + errno = 0; + int match_count = sscanf(dev_version.c_str(), "OpenCL %d.%d ", &major_ver, &minor_ver); + if (errno || match_count != 2) + throw error("CommandQueue._get_hex_device_version", CL_INVALID_VALUE, + "Platform version string did not have expected format"); + + return major_ver << 12 | minor_ver << 4; + } }; // }}} @@ -1518,13 +1658,13 @@ namespace pyopencl PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name, cl_command_queue, command_queue); case CL_EVENT_COMMAND_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_command_type); case CL_EVENT_COMMAND_EXECUTION_STATUS: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_int); case CL_EVENT_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_uint); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_EVENT_CONTEXT: @@ -1548,7 +1688,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x2000 case CL_PROFILING_COMMAND_COMPLETE: #endif - PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(EventProfiling, m_event, param_name, cl_ulong); default: throw error("Event.get_profiling_info", CL_INVALID_VALUE); @@ -1680,7 +1820,6 @@ namespace pyopencl #endif }; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE class 
nanny_event : public event { // In addition to everything an event does, the nanny event holds a reference @@ -1724,48 +1863,6 @@ namespace pyopencl m_ward.reset(); } }; -#else - class nanny_event : public event - { - // In addition to everything an event does, the nanny event holds a reference - // to a Python object and waits for its own completion upon destruction. - - protected: - py::object m_ward; - - public: - - nanny_event(cl_event evt, bool retain, py::object ward) - : event(evt, retain), m_ward(ward) - { } - - nanny_event(nanny_event const &src) - : event(src), m_ward(src.m_ward) - { } - - ~nanny_event() - { - // It appears that Pybind can get very confused if we release the GIL here: - // https://github.com/inducer/pyopencl/issues/296 - wait_during_cleanup_without_releasing_the_gil(); - } - - py::object get_ward() const - { return m_ward; } - - virtual void wait() - { - event::wait(); - m_ward = py::none(); - } - - virtual void wait_during_cleanup_without_releasing_the_gil() - { - event::wait_during_cleanup_without_releasing_the_gil(); - m_ward = py::none(); - } - }; -#endif @@ -1922,11 +2019,7 @@ namespace pyopencl class memory_object : noncopyable, public memory_object_holder { public: -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE typedef std::unique_ptr hostbuf_t; -#else - typedef py::object hostbuf_t; -#endif private: bool m_valid; @@ -1940,12 +2033,12 @@ namespace pyopencl if (retain) PYOPENCL_CALL_GUARDED(clRetainMemObject, (mem)); - m_hostbuf = PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf); + m_hostbuf = std::move(hostbuf); } memory_object(memory_object &src) : m_valid(true), m_mem(src.m_mem), - m_hostbuf(PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(src.m_hostbuf)) + m_hostbuf(std::move(src.m_hostbuf)) { PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem)); } @@ -1973,14 +2066,10 @@ namespace pyopencl py::object hostbuf() { -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE if (m_hostbuf.get()) return py::reinterpret_borrow(m_hostbuf->m_buf.obj); else return py::none(); -#else 
- return m_hostbuf; -#endif } const cl_mem data() const @@ -2087,7 +2176,7 @@ namespace pyopencl { public: buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t()) - : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf)) + : memory_object(mem, retain, std::move(hostbuf)) { } #if PYOPENCL_CL_VERSION >= 0x1010 @@ -2164,7 +2253,6 @@ namespace pyopencl void *buf = 0; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr retained_buf_obj; if (py_hostbuf.ptr() != Py_None) { @@ -2186,46 +2274,15 @@ namespace pyopencl if (size == 0) size = retained_buf_obj->m_buf.len; } -#else - py::object retained_buf_obj; - if (py_hostbuf.ptr() != Py_None) - { - PYOPENCL_BUFFER_SIZE_T len; - if ((flags & CL_MEM_USE_HOST_PTR) - && ((flags & CL_MEM_READ_WRITE) - || (flags & CL_MEM_WRITE_ONLY))) - { - if (PyObject_AsWriteBuffer(py_hostbuf.ptr(), &buf, &len)) - throw py::error_already_set(); - } - else - { - if (PyObject_AsReadBuffer( - py_hostbuf.ptr(), const_cast(&buf), &len)) - throw py::error_already_set(); - } - - if (flags & CL_MEM_USE_HOST_PTR) - retained_buf_obj = py_hostbuf; - - if (size > size_t(len)) - throw pyopencl::error("Buffer", CL_INVALID_VALUE, - "specified size is greater than host buffer size"); - if (size == 0) - size = len; - } -#endif cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf); -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE if (!(flags & CL_MEM_USE_HOST_PTR)) retained_buf_obj.reset(); -#endif try { - return new buffer(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj)); + return new buffer(mem, false, std::move(retained_buf_obj)); } catch (...) 
{ @@ -2254,18 +2311,12 @@ namespace pyopencl void *buf; PYOPENCL_BUFFER_SIZE_T len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE); buf = ward->m_buf.buf; len = ward->m_buf.len; -#else - py::object ward = buffer; - if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -2297,18 +2348,12 @@ namespace pyopencl const void *buf; PYOPENCL_BUFFER_SIZE_T len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS); buf = ward->m_buf.buf; len = ward->m_buf.len; -#else - py::object ward = buffer; - if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -2391,19 +2436,11 @@ namespace pyopencl void *buf; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE); buf = ward->m_buf.buf; -#else - py::object ward = buffer; - - PYOPENCL_BUFFER_SIZE_T len; - if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -2447,18 +2484,11 @@ namespace pyopencl const void *buf; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS); buf = ward->m_buf.buf; -#else - py::object ward = buffer; - PYOPENCL_BUFFER_SIZE_T len; - if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -2536,17 +2566,12 @@ namespace pyopencl const void *pattern_buf; PYOPENCL_BUFFER_SIZE_T pattern_len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(pattern.ptr(), PyBUF_ANY_CONTIGUOUS); 
pattern_buf = ward->m_buf.buf; pattern_len = ward->m_buf.len; -#else - if (PyObject_AsReadBuffer(pattern.ptr(), &pattern_buf, &pattern_len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -2570,7 +2595,7 @@ namespace pyopencl { public: image(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t()) - : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf)) + : memory_object(mem, retain, std::move(hostbuf)) { } py::object get_image_info(cl_image_info param_name) const @@ -2578,7 +2603,7 @@ namespace pyopencl switch (param_name) { case CL_IMAGE_FORMAT: - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_image_format); case CL_IMAGE_ELEMENT_SIZE: case CL_IMAGE_ROW_PITCH: @@ -2589,7 +2614,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_ARRAY_SIZE: #endif - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, size_t); + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, size_t); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_BUFFER: @@ -2608,11 +2633,11 @@ namespace pyopencl case CL_IMAGE_NUM_MIP_LEVELS: case CL_IMAGE_NUM_SAMPLES: - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_uint); + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_uint); #endif default: - throw error("MemoryObject.get_image_info", CL_INVALID_VALUE); + throw error("Image.get_image_info", CL_INVALID_VALUE); } } }; @@ -2725,7 +2750,6 @@ namespace pyopencl void *buf = 0; PYOPENCL_BUFFER_SIZE_T len = 0; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr retained_buf_obj; if (buffer.ptr() != Py_None) { @@ -2742,28 +2766,6 @@ namespace pyopencl buf = retained_buf_obj->m_buf.buf; len = retained_buf_obj->m_buf.len; } -#else - py::object retained_buf_obj; - if (buffer.ptr() != Py_None) - { - if ((flags & CL_MEM_USE_HOST_PTR) - && ((flags & CL_MEM_READ_WRITE) - || (flags & CL_MEM_WRITE_ONLY))) - { - if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, 
&len)) - throw py::error_already_set(); - } - else - { - if (PyObject_AsReadBuffer( - buffer.ptr(), const_cast(&buf), &len)) - throw py::error_already_set(); - } - - if (flags & CL_MEM_USE_HOST_PTR) - retained_buf_obj = buffer; - } -#endif unsigned dims = py::len(shape); cl_int status_code; @@ -2838,14 +2840,12 @@ namespace pyopencl throw pyopencl::error("Image", CL_INVALID_VALUE, "invalid dimension"); -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE if (!(flags & CL_MEM_USE_HOST_PTR)) retained_buf_obj.reset(); -#endif try { - return new image(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj)); + return new image(mem, false, std::move(retained_buf_obj)); } catch (...) { @@ -2871,7 +2871,6 @@ namespace pyopencl void *buf = 0; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr retained_buf_obj; if (buffer.ptr() != Py_None) { @@ -2887,29 +2886,6 @@ namespace pyopencl buf = retained_buf_obj->m_buf.buf; } -#else - py::object retained_buf_obj; - PYOPENCL_BUFFER_SIZE_T len; - if (buffer.ptr() != Py_None) - { - if ((flags & CL_MEM_USE_HOST_PTR) - && ((flags & CL_MEM_READ_WRITE) - || (flags & CL_MEM_WRITE_ONLY))) - { - if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); - } - else - { - if (PyObject_AsReadBuffer( - buffer.ptr(), const_cast(&buf), &len)) - throw py::error_already_set(); - } - - if (flags & CL_MEM_USE_HOST_PTR) - retained_buf_obj = buffer; - } -#endif PYOPENCL_PRINT_CALL_TRACE("clCreateImage"); cl_int status_code; @@ -2917,14 +2893,12 @@ namespace pyopencl if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateImage", status_code); -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE if (!(flags & CL_MEM_USE_HOST_PTR)) retained_buf_obj.reset(); -#endif try { - return new image(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj)); + return new image(mem, false, std::move(retained_buf_obj)); } catch (...) 
{ @@ -2955,18 +2929,11 @@ namespace pyopencl void *buf; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE); buf = ward->m_buf.buf; -#else - py::object ward = buffer; - PYOPENCL_BUFFER_SIZE_T len; - if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; @@ -3001,18 +2968,11 @@ namespace pyopencl const void *buf; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS); buf = ward->m_buf.buf; -#else - py::object ward = buffer; - PYOPENCL_BUFFER_SIZE_T len; - if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -3134,17 +3094,11 @@ namespace pyopencl const void *color_buf; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr ward(new py_buffer_wrapper); ward->get(color.ptr(), PyBUF_ANY_CONTIGUOUS); color_buf = ward->m_buf.buf; -#else - PYOPENCL_BUFFER_SIZE_T color_len; - if (PyObject_AsReadBuffer(color.ptr(), &color_buf, &color_len)) - throw py::error_already_set(); -#endif cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( @@ -3162,6 +3116,84 @@ namespace pyopencl // }}} + // {{{ pipe + + class pipe : public memory_object + { + public: + pipe(cl_mem mem, bool retain) + : memory_object(mem, retain) + { } + +#if PYOPENCL_CL_VERSION < 0x2000 + typedef void* cl_pipe_info; +#endif + + py::object get_pipe_info(cl_pipe_info param_name) const + { +#if PYOPENCL_CL_VERSION >= 0x2000 + switch (param_name) + { + case CL_PIPE_PACKET_SIZE: + case CL_PIPE_MAX_PACKETS: + PYOPENCL_GET_TYPED_INFO(Pipe, data(), param_name, cl_uint); + + default: + throw error("Pipe.get_pipe_info", CL_INVALID_VALUE); + } +#else + throw error("Pipes not available. 
PyOpenCL was not compiled against a CL2+ header.", + CL_INVALID_VALUE); +#endif + } + }; + +#if PYOPENCL_CL_VERSION >= 0x2000 + inline + pipe *create_pipe( + context const &ctx, + cl_mem_flags flags, + cl_uint pipe_packet_size, + cl_uint pipe_max_packets, + py::sequence py_props) + { + PYOPENCL_STACK_CONTAINER(cl_pipe_properties, props, py::len(py_props) + 1); + { + size_t i = 0; + for (auto prop: py_props) + props[i++] = py::cast(prop); + props[i++] = 0; + } + + cl_int status_code; + PYOPENCL_PRINT_CALL_TRACE("clCreatePipe"); + + cl_mem mem = clCreatePipe( + ctx.data(), + flags, + pipe_packet_size, + pipe_max_packets, + PYOPENCL_STACK_CONTAINER_GET_PTR(props), + &status_code); + + if (status_code != CL_SUCCESS) + throw pyopencl::error("Pipe", status_code); + + try + { + return new pipe(mem, false); + } + catch (...) + { + PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); + throw; + } +} +#endif + + // }}} + + // {{{ maps class memory_map { @@ -3370,14 +3402,11 @@ namespace pyopencl private: void *m_ptr; PYOPENCL_BUFFER_SIZE_T m_size; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE - std::unique_ptr ward; -#endif + std::unique_ptr ward; public: svm_arg_wrapper(py::object holder) { -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE ward = std::unique_ptr(new py_buffer_wrapper); #ifdef PYPY_VERSION // FIXME: get a read-only buffer @@ -3389,11 +3418,6 @@ namespace pyopencl #endif m_ptr = ward->m_buf.buf; m_size = ward->m_buf.len; -#else - py::object ward = holder; - if (PyObject_AsWriteBuffer(holder.ptr(), &m_ptr, &m_size)) - throw py::error_already_set(); -#endif } void *ptr() const @@ -3525,18 +3549,12 @@ namespace pyopencl const void *pattern_ptr; PYOPENCL_BUFFER_SIZE_T pattern_len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE std::unique_ptr pattern_ward(new py_buffer_wrapper); pattern_ward->get(py_pattern.ptr(), PyBUF_ANY_CONTIGUOUS); pattern_ptr = pattern_ward->m_buf.buf; pattern_len = pattern_ward->m_buf.len; -#else - py::object pattern_ward = py_pattern; - if 
(PyObject_AsReadBuffer(py_pattern.ptr(), &pattern_ptr, &pattern_len)) - throw py::error_already_set(); -#endif size_t fill_size = dst.size(); if (!byte_count.is_none()) @@ -3767,20 +3785,37 @@ namespace pyopencl switch (param_name) { case CL_SAMPLER_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_uint); case CL_SAMPLER_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name, cl_context, context); case CL_SAMPLER_ADDRESSING_MODE: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_addressing_mode); case CL_SAMPLER_FILTER_MODE: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_filter_mode); case CL_SAMPLER_NORMALIZED_COORDS: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_bool); +#if PYOPENCL_CL_VERSION >= 0x3000 + case CL_SAMPLER_PROPERTIES: + { + std::vector result; + PYOPENCL_GET_VEC_INFO(Sampler, m_sampler, param_name, result); + PYOPENCL_RETURN_VECTOR(cl_sampler_properties, result); + } +#endif + +#ifdef CL_SAMPLER_MIP_FILTER_MODE_KHR + case CL_SAMPLER_MIP_FILTER_MODE_KHR: + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, + cl_filter_mode); + case CL_SAMPLER_LOD_MIN_KHR: + case CL_SAMPLER_LOD_MAX_KHR: + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, float); +#endif default: throw error("Sampler.get_info", CL_INVALID_VALUE); @@ -3832,14 +3867,13 @@ namespace pyopencl switch (param_name) { case CL_PROGRAM_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name, cl_context, context); case CL_PROGRAM_NUM_DEVICES: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, 
param_name, - cl_uint); + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_DEVICES: { std::vector result; @@ -3904,11 +3938,20 @@ namespace pyopencl // }}} #if PYOPENCL_CL_VERSION >= 0x1020 case CL_PROGRAM_NUM_KERNELS: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, size_t); case CL_PROGRAM_KERNEL_NAMES: PYOPENCL_GET_STR_INFO(Program, m_program, param_name); #endif +#if PYOPENCL_CL_VERSION >= 0x2010 + case CL_PROGRAM_IL: + PYOPENCL_GET_STR_INFO(Program, m_program, param_name); +#endif +#if PYOPENCL_CL_VERSION >= 0x2020 + case CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT: + case CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT: + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_bool); +#endif default: throw error("Program.get_info", CL_INVALID_VALUE); @@ -3923,7 +3966,7 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack case CL_PROGRAM_BUILD_STATUS: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_build_status); case CL_PROGRAM_BUILD_OPTIONS: @@ -3932,13 +3975,13 @@ namespace pyopencl PYOPENCL_FIRST_ARG, param_name); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_PROGRAM_BINARY_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_program_binary_type); #endif #if PYOPENCL_CL_VERSION >= 0x2000 case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, size_t); #endif @@ -3996,6 +4039,16 @@ namespace pyopencl 0, 0)); } #endif + +#if PYOPENCL_CL_VERSION >= 0x2020 + void set_specialization_constant(cl_uint spec_id, py::object py_buffer) + { + py_buffer_wrapper bufwrap; + bufwrap.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS); + PYOPENCL_CALL_GUARDED(clSetProgramSpecializationConstant, + (m_program, spec_id, 
bufwrap.m_buf.len, bufwrap.m_buf.buf)); + } +#endif }; @@ -4053,18 +4106,12 @@ namespace pyopencl const void *buf; PYOPENCL_BUFFER_SIZE_T len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE py_buffer_wrapper buf_wrapper; buf_wrapper.get(py::object(py_binaries[i]).ptr(), PyBUF_ANY_CONTIGUOUS); buf = buf_wrapper.m_buf.buf; len = buf_wrapper.m_buf.len; -#else - if (PyObject_AsReadBuffer( - py::object(py_binaries[i]).ptr(), &buf, &len)) - throw py::error_already_set(); -#endif binaries.push_back(reinterpret_cast(buf)); sizes.push_back(len); @@ -4273,6 +4320,28 @@ namespace pyopencl PYOPENCL_EQUALITY_TESTS(kernel); +#if PYOPENCL_CL_VERSION >= 0x2010 + kernel *clone() + { + cl_int status_code; + + PYOPENCL_PRINT_CALL_TRACE("clCloneKernel"); + cl_kernel result = clCloneKernel(m_kernel, &status_code); + if (status_code != CL_SUCCESS) + throw pyopencl::error("clCloneKernel", status_code); + + try + { + return new kernel(result, /* retain */ false); + } + catch (...) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (result)); + throw; + } + } +#endif + void set_arg_null(cl_uint arg_index) { cl_mem m = 0; @@ -4307,12 +4376,47 @@ namespace pyopencl (m_kernel, arg_index, sizeof(cl_command_queue), &q)); } - void set_arg_buf(cl_uint arg_index, py::object py_buffer) + void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj) + { + std::string typechar_str(py::cast(py_typechar)); + if (typechar_str.size() != 1) + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "type char argument must have exactly one character"); + + char typechar = typechar_str[0]; + +#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \ + case TYPECH_VAL: \ + { \ + TYPE val = py::cast(obj); \ + PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \ + break; \ + } + switch (typechar) + { + PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char) + 
PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double) + default: + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "invalid type char"); + } +#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG + } + + void set_arg_buf(cl_uint arg_index, py::handle py_buffer) { const void *buf; PYOPENCL_BUFFER_SIZE_T len; -#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE py_buffer_wrapper buf_wrapper; try @@ -4328,14 +4432,6 @@ namespace pyopencl buf = buf_wrapper.m_buf.buf; len = buf_wrapper.m_buf.len; -#else - if (PyObject_AsReadBuffer(py_buffer.ptr(), &buf, &len)) - { - PyErr_Clear(); - throw error("Kernel.set_arg", CL_INVALID_VALUE, - "invalid kernel argument"); - } -#endif PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, len, buf)); @@ -4349,7 +4445,7 @@ namespace pyopencl } #endif - void set_arg(cl_uint arg_index, py::object arg) + void set_arg(cl_uint arg_index, py::handle arg) { if (arg.ptr() == Py_None) { @@ -4405,7 +4501,7 @@ namespace pyopencl PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name); case CL_KERNEL_NUM_ARGS: case CL_KERNEL_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name, + PYOPENCL_GET_TYPED_INFO(Kernel, m_kernel, param_name, cl_uint); case CL_KERNEL_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name, @@ -4431,7 +4527,7 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack case CL_KERNEL_WORK_GROUP_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: @@ -4446,13 +4542,13 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1010 
case CL_KERNEL_PRIVATE_MEM_SIZE: #endif - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, cl_ulong); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); #endif @@ -4472,26 +4568,174 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack case CL_KERNEL_ARG_ADDRESS_QUALIFIER: - PYOPENCL_GET_INTEGRAL_INFO(KernelArg, + PYOPENCL_GET_TYPED_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_address_qualifier); case CL_KERNEL_ARG_ACCESS_QUALIFIER: - PYOPENCL_GET_INTEGRAL_INFO(KernelArg, + PYOPENCL_GET_TYPED_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_access_qualifier); case CL_KERNEL_ARG_TYPE_NAME: case CL_KERNEL_ARG_NAME: PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name); + + case CL_KERNEL_ARG_TYPE_QUALIFIER: + PYOPENCL_GET_TYPED_INFO(KernelArg, + PYOPENCL_FIRST_ARG, param_name, + cl_kernel_arg_type_qualifier); #undef PYOPENCL_FIRST_ARG default: throw error("Kernel.get_arg_info", CL_INVALID_VALUE); } } #endif + +#if PYOPENCL_CL_VERSION >= 0x2010 + py::object get_sub_group_info( + device const &dev, + cl_kernel_sub_group_info param_name, + py::object py_input_value) + { + switch (param_name) + { + // size_t * -> size_t + case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE: + case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE: + { + std::vector input_value; + COPY_PY_LIST(size_t, input_value); + + size_t param_value; + PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo, + (m_kernel, dev.data(), param_name, + input_value.size()*sizeof(input_value.front()), + input_value.empty() ? 
nullptr : &input_value.front(), + sizeof(param_value), ¶m_value, 0)); + + return py::cast(param_value); + } + + // size_t -> size_t[] + case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT: + { + size_t input_value = py::cast(py_input_value); + + std::vector result; + size_t size; + PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo, + (m_kernel, dev.data(), param_name, + sizeof(input_value), &input_value, + 0, nullptr, &size)); + result.resize(size / sizeof(result.front())); + PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo, + (m_kernel, dev.data(), param_name, + sizeof(input_value), &input_value, + size, result.empty() ? nullptr : &result.front(), 0)); + + PYOPENCL_RETURN_VECTOR(size_t, result); + } + + // () -> size_t + case CL_KERNEL_MAX_NUM_SUB_GROUPS: + case CL_KERNEL_COMPILE_NUM_SUB_GROUPS: + { + size_t param_value; + PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo, + (m_kernel, dev.data(), param_name, + 0, nullptr, + sizeof(param_value), ¶m_value, 0)); + + return py::cast(param_value); + } + + default: + throw error("Kernel.get_sub_group_info", CL_INVALID_VALUE); + } + } +#endif }; +#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \ + catch (error &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + auto mod_cl_ary(py::module::import("pyopencl.array")); \ + auto cls_array(mod_cl_ary.attr("Array")); \ + if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \ + msg.append( \ + " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \ + throw error(err.routine().c_str(), err.code(), msg.c_str()); \ + } \ + catch (std::exception &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + throw std::runtime_error(msg.c_str()); \ + } + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { 
+ cl_uint arg_index; + py::handle arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_value = *it++; + set_arg_func(arg_index, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_descr, arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_descr = *it++; + arg_value = *it++; + set_arg_func(arg_index, arg_descr, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + inline py::list create_kernels_in_program(program &pgm) @@ -4512,69 +4756,100 @@ namespace pyopencl return result; } - +#define MAX_WS_DIM_COUNT 10 inline event *enqueue_nd_range_kernel( command_queue &cq, kernel &knl, - py::object py_global_work_size, - py::object py_local_work_size, - py::object py_global_work_offset, - py::object py_wait_for, - bool g_times_l) + py::handle py_global_work_size, + py::handle py_local_work_size, + py::handle py_global_work_offset, + py::handle py_wait_for, + bool g_times_l, + bool allow_empty_ndrange) { PYOPENCL_PARSE_WAIT_FOR; - cl_uint work_dim = len(py_global_work_size); + std::array global_work_size; + unsigned gws_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, 
global_work_size, gws_size); + cl_uint work_dim = gws_size; - std::vector global_work_size; - COPY_PY_LIST(size_t, global_work_size); + std::array local_work_size; + unsigned lws_size = 0; + size_t *local_work_size_ptr = nullptr; - size_t *local_work_size_ptr = 0; - std::vector local_work_size; if (py_local_work_size.ptr() != Py_None) { + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size); + if (g_times_l) - work_dim = std::max(work_dim, unsigned(len(py_local_work_size))); + work_dim = std::max(work_dim, lws_size); else - if (work_dim != unsigned(len(py_local_work_size))) + if (work_dim != lws_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); - COPY_PY_LIST(size_t, local_work_size); - - while (local_work_size.size() < work_dim) - local_work_size.push_back(1); - while (global_work_size.size() < work_dim) - global_work_size.push_back(1); + while (lws_size < work_dim) + local_work_size[lws_size++] = 1; + while (gws_size < work_dim) + global_work_size[gws_size++] = 1; - local_work_size_ptr = local_work_size.empty( ) ? 
nullptr : &local_work_size.front(); + local_work_size_ptr = &local_work_size.front(); } - if (g_times_l && local_work_size_ptr) + if (g_times_l && lws_size) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; } - size_t *global_work_offset_ptr = 0; - std::vector global_work_offset; + size_t *global_work_offset_ptr = nullptr; + std::array global_work_offset; if (py_global_work_offset.ptr() != Py_None) { - if (work_dim != unsigned(len(py_global_work_offset))) + unsigned gwo_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size); + + if (work_dim != gwo_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); - COPY_PY_LIST(size_t, global_work_offset); - if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_offset[work_axis] *= local_work_size[work_axis]; } - global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); + global_work_offset_ptr = &global_work_offset.front(); + } + + if (allow_empty_ndrange) + { +#if PYOPENCL_CL_VERSION >= 0x1020 + bool is_empty = false; + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (global_work_size[work_axis] == 0) + is_empty = true; + if (local_work_size_ptr) + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (local_work_size_ptr[work_axis] == 0) + is_empty = true; + + if (is_empty) + { + cl_event evt; + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, ( + cq.data(), PYOPENCL_WAITLIST_ARGS, &evt)); + PYOPENCL_RETURN_NEW_EVENT(evt); + } +#else + // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent + // in the case of an out-of-order queue. 
+ throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, + "allow_empty_ndrange requires OpenCL 1.2"); +#endif } PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( { @@ -4584,7 +4859,7 @@ namespace pyopencl knl.data(), work_dim, global_work_offset_ptr, - global_work_size.empty( ) ? nullptr : &global_work_size.front(), + &global_work_size.front(), local_work_size_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -4629,7 +4904,7 @@ namespace pyopencl { public: gl_buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t()) - : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf)) + : memory_object(mem, retain, std::move(hostbuf)) { } }; @@ -4640,7 +4915,7 @@ namespace pyopencl { public: gl_renderbuffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t()) - : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf)) + : memory_object(mem, retain, std::move(hostbuf)) { } }; @@ -4651,7 +4926,7 @@ namespace pyopencl { public: gl_texture(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t()) - : image(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf)) + : image(mem, retain, std::move(hostbuf)) { } py::object get_gl_texture_info(cl_gl_texture_info param_name) @@ -4659,9 +4934,9 @@ namespace pyopencl switch (param_name) { case CL_GL_TEXTURE_TARGET: - PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLenum); + PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLenum); case CL_GL_MIPMAP_LEVEL: - PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLint); + PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLint); default: throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE); @@ -4867,6 +5142,15 @@ namespace pyopencl // {{{ deferred implementation bits +#if PYOPENCL_CL_VERSION >= 0x2010 + inline void context::set_default_device_command_queue(device const &dev, command_queue const &queue) + { + PYOPENCL_CALL_GUARDED(clSetDefaultDeviceCommandQueue, + (m_context, dev.data(), queue.data())); + } +#endif + + inline program 
*error::get_program() const { return new program(m_program, /* retain */ true); @@ -4912,22 +5196,22 @@ namespace pyopencl switch (param_name) { case CL_MEM_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_mem_object_type); case CL_MEM_FLAGS: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_mem_flags); case CL_MEM_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, size_t); case CL_MEM_HOST_PTR: throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE, "Use MemoryObject.get_host_array to get host pointer."); case CL_MEM_MAP_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name, @@ -4948,9 +5232,22 @@ namespace pyopencl return create_mem_object_wrapper(param_value); } case CL_MEM_OFFSET: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, size_t); #endif +#if PYOPENCL_CL_VERSION >= 0x2000 + case CL_MEM_USES_SVM_POINTER: + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, + cl_bool); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + case CL_MEM_PROPERTIES: + { + std::vector result; + PYOPENCL_GET_VEC_INFO(MemObject, data(), param_name, result); + PYOPENCL_RETURN_VECTOR(cl_mem_properties, result); + } +#endif default: throw error("MemoryObjectHolder.get_info", CL_INVALID_VALUE); diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp index 541201e38bde65625ffe527ebd225d466e4025c8..4b0ec771ef4f20dd3edfbc7065152d6492effc39 100644 --- a/src/wrap_cl_part_1.cpp +++ 
b/src/wrap_cl_part_1.cpp @@ -68,6 +68,10 @@ void pyopencl_expose_part_1(py::module &m) .DEF_SIMPLE_METHOD(create_sub_devices) #endif PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_device_id) +#if PYOPENCL_CL_VERSION >= 0x2010 + .DEF_SIMPLE_METHOD(device_and_host_timer) + .DEF_SIMPLE_METHOD(host_timer) +#endif ; } @@ -99,6 +103,9 @@ void pyopencl_expose_part_1(py::module &m) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_context) +#if PYOPENCL_CL_VERSION >= 0x2010 + .DEF_SIMPLE_METHOD(set_default_device_command_queue) +#endif ; } @@ -221,7 +228,7 @@ void pyopencl_expose_part_1(py::module &m) .def_static("from_int_ptr", memory_object_from_int, "(static method) Return a new Python object referencing the C-level " ":c:type:`cl_mem` object at the location pointed to " - "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " + "by *int_ptr_value*. The relevant ``clRetain*`` function " "will be called if *retain* is True." "If the previous owner of the object will *not* release the reference, " "*retain* should be set to *False*, to effectively transfer ownership to " diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 5ca5efcf658bc7a81f2c67c265427486d6c4dfc4..205b31ec452b388fe1b32f3443e63762b33a10c0 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -222,6 +222,36 @@ void pyopencl_expose_part_2(py::module &m) // }}} + // {{{ pipe + + { + typedef pyopencl::pipe cls; + py::class_(m, "Pipe", py::dynamic_attr()) +#if PYOPENCL_CL_VERSION >= 0x2000 + .def( + py::init( + []( + context const &ctx, + cl_mem_flags flags, + cl_uint pipe_packet_size, + cl_uint pipe_max_packets, + py::sequence py_props) + { + return create_pipe(ctx, flags, pipe_packet_size, pipe_max_packets, py_props); + }), + py::arg("context"), + py::arg("flags"), + py::arg("packet_size"), + py::arg("max_packets"), + py::arg("properties") + ) +#endif + .DEF_SIMPLE_METHOD(get_pipe_info) + ; + } + + // }}} + // {{{ memory_map { typedef 
memory_map cls; @@ -400,6 +430,11 @@ void pyopencl_expose_part_2(py::module &m) py::arg("options")="", py::arg("devices")=py::none() ) +#endif +#if PYOPENCL_CL_VERSION >= 0x2020 + .def("set_specialization_constant", &cls::set_specialization_constant, + py::arg("spec_id"), + py::arg("buffer")) #endif .def(py::self == py::self) .def(py::self != py::self) @@ -427,11 +462,36 @@ void pyopencl_expose_part_2(py::module &m) .def(py::init()) .DEF_SIMPLE_METHOD(get_info) .DEF_SIMPLE_METHOD(get_work_group_info) +#if PYOPENCL_CL_VERSION >= 0x2010 + .DEF_SIMPLE_METHOD(clone) +#endif .def("_set_arg_null", &cls::set_arg_null) .def("_set_arg_buf", &cls::set_arg_buf) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif + .def("_set_arg_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_pack_multi", + [](cls &knl, py::tuple indices_chars_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle typechar, py::handle arg) + { knl.set_arg_buf_pack(i, typechar, arg); }, + indices_chars_and_args); + }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) @@ -440,6 +500,13 @@ void pyopencl_expose_part_2(py::module &m) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_kernel) +#if PYOPENCL_CL_VERSION >= 0x2010 + .def("get_sub_group_info", &cls::get_sub_group_info, + py::arg("device"), + py::arg("param"), + py::arg("input_value")=py::none() + ) +#endif ; } @@ -461,7 +528,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("local_work_size"), py::arg("global_work_offset")=py::none(), py::arg("wait_for")=py::none(), - py::arg("g_times_l")=false + py::arg("g_times_l")=false, + 
py::arg("allow_empty_ndrange")=false ); // TODO: clEnqueueNativeKernel @@ -551,41 +619,6 @@ void pyopencl_expose_part_2(py::module &m) #endif // }}} - // {{{ CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD - - { - typedef cl_device_topology_amd cls; - py::class_(m, "DeviceTopologyAmd") - .def(py::init( - [](cl_char bus, cl_char device, cl_char function) - { - cl_device_topology_amd result; - result.pcie.bus = bus; - result.pcie.device = device; - result.pcie.function = function; - return result; - }), - py::arg("bus")=0, - py::arg("device")=0, - py::arg("function")=0) - - .def_property("type", - [](cls &t) { return t.pcie.type; }, - [](cls &t, cl_uint val) { t.pcie.type = val; }) - - .def_property("bus", - [](cls &t) { return t.pcie.bus; }, - [](cls &t, cl_char val) { t.pcie.bus = val; }) - .def_property("device", - [](cls &t) { return t.pcie.device; }, - [](cls &t, cl_char val) { t.pcie.device = val; }) - .def_property("function", - [](cls &t) { return t.pcie.function; }, - [](cls &t, cl_char val) { t.pcie.function = val; }) - ; - } - - // }}} } diff --git a/src/wrap_constants.cpp b/src/wrap_constants.cpp index 258df2781283a4ee3834aa72b68b011402ac93f9..77dbc25fde9af39aaf0103bb326e52190575913d 100644 --- a/src/wrap_constants.cpp +++ b/src/wrap_constants.cpp @@ -40,6 +40,7 @@ namespace class platform_info { }; class device_type { }; class device_info { }; + class device_topology_type_amd { }; class device_fp_config { }; class device_mem_cache_type { }; class device_local_mem_type { }; @@ -58,9 +59,12 @@ namespace class mem_object_type { }; class mem_info { }; class image_info { }; + class pipe_info { }; + class pipe_properties { }; class addressing_mode { }; class filter_mode { }; class sampler_info { }; + class sampler_properties { }; class map_flags { }; class program_info { }; class program_build_info { }; @@ -72,6 +76,7 @@ namespace class kernel_arg_access_qualifier { }; class kernel_arg_type_qualifier { }; class kernel_work_group_info { }; + class kernel_sub_group_info { }; 
class event_info { }; class command_type { }; class command_execution_status { }; @@ -81,6 +86,11 @@ namespace class device_partition_property { }; class device_affinity_domain { }; + class device_atomic_capabilities { }; + class device_device_enqueue_capabilities { }; + + class version_bits { }; + class khronos_vendor_id { }; class gl_object_type { }; class gl_texture_info { }; @@ -233,6 +243,11 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(, INVALID_DEVICE_QUEUE); #endif +#if PYOPENCL_CL_VERSION >= 0x2020 + ADD_ATTR(, INVALID_SPEC_ID); + ADD_ATTR(, MAX_SIZE_RESTRICTION_EXCEEDED); +#endif + #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) ADD_ATTR(, DEVICE_PARTITION_FAILED_EXT); ADD_ATTR(, INVALID_PARTITION_COUNT_EXT); @@ -248,6 +263,13 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(PLATFORM_, VENDOR); #if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001) ADD_ATTR(PLATFORM_, EXTENSIONS); +#endif +#if PYOPENCL_CL_VERSION >= 0x2010 + ADD_ATTR(PLATFORM_, HOST_TIMER_RESOLUTION); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(PLATFORM_, NUMERIC_VERSION); + ADD_ATTR(PLATFORM_, EXTENSIONS_WITH_VERSION); #endif } @@ -356,7 +378,11 @@ void pyopencl_expose_constants(py::module &m) #ifdef CL_DEVICE_PCI_SLOT_ID_NV ADD_ATTR(DEVICE_, PCI_SLOT_ID_NV); #endif +#ifdef CL_DEVICE_PCI_SLOT_ID_NV + ADD_ATTR(DEVICE_, PCI_DOMAIN_ID_NV); +#endif #endif + // {{{ cl_amd_device_attribute_query #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD ADD_ATTR(DEVICE_, PROFILING_TIMER_OFFSET_AMD); @@ -397,7 +423,6 @@ void pyopencl_expose_constants(py::module &m) #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD ADD_ATTR(DEVICE_, LOCAL_MEM_BANKS_AMD); #endif -// }}} #ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD ADD_ATTR(DEVICE_, THREAD_TRACE_SUPPORTED_AMD); #endif @@ -410,6 +435,19 @@ void pyopencl_expose_constants(py::module &m) #ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD ADD_ATTR(DEVICE_, AVAILABLE_ASYNC_QUEUES_AMD); #endif +#ifdef 
CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD + ADD_ATTR(DEVICE_, PREFERRED_WORK_GROUP_SIZE_AMD); +#endif +#ifdef CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD + ADD_ATTR(DEVICE_, MAX_WORK_GROUP_SIZE_AMD); +#endif +#ifdef CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD + ADD_ATTR(DEVICE_, PREFERRED_CONSTANT_BUFFER_SIZE_AMD); +#endif +#ifdef CL_DEVICE_PCIE_ID_AMD + ADD_ATTR(DEVICE_, PCIE_ID_AMD); +#endif +// }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT ADD_ATTR(DEVICE_, MAX_ATOMIC_COUNTERS_EXT); @@ -453,6 +491,27 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(DEVICE_, IL_VERSION); ADD_ATTR(DEVICE_, MAX_NUM_SUB_GROUPS); ADD_ATTR(DEVICE_, SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(DEVICE_, NUMERIC_VERSION); + ADD_ATTR(DEVICE_, EXTENSIONS_WITH_VERSION); + ADD_ATTR(DEVICE_, ILS_WITH_VERSION); + ADD_ATTR(DEVICE_, BUILT_IN_KERNELS_WITH_VERSION); + ADD_ATTR(DEVICE_, ATOMIC_MEMORY_CAPABILITIES); + ADD_ATTR(DEVICE_, ATOMIC_FENCE_CAPABILITIES); + ADD_ATTR(DEVICE_, NON_UNIFORM_WORK_GROUP_SUPPORT); + ADD_ATTR(DEVICE_, OPENCL_C_ALL_VERSIONS); + ADD_ATTR(DEVICE_, PREFERRED_WORK_GROUP_SIZE_MULTIPLE); + ADD_ATTR(DEVICE_, WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT); + ADD_ATTR(DEVICE_, GENERIC_ADDRESS_SPACE_SUPPORT); + ADD_ATTR(DEVICE_, OPENCL_C_FEATURES); +#ifdef CL_DEVICE_DEVICE_ENQUEUE_SUPPORT + // some busted headers shipped by Debian have this + cls.attr("DEVICE_ENQUEUE_CAPABILITIES") = CL_DEVICE_DEVICE_ENQUEUE_SUPPORT; +#else + ADD_ATTR(DEVICE_, DEVICE_ENQUEUE_CAPABILITIES); +#endif + ADD_ATTR(DEVICE_, PIPE_SUPPORT); #endif /* cl_intel_advanced_motion_estimation */ #ifdef CL_DEVICE_ME_VERSION_INTEL @@ -486,6 +545,13 @@ void pyopencl_expose_constants(py::module &m) #endif } + { + py::class_ cls(m, "device_topology_type_amd"); +#ifdef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD + cls.attr("PCIE") = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; +#endif + } + { py::class_ cls(m, "device_fp_config"); ADD_ATTR(FP_, DENORM); @@ -594,6 +660,9 @@ void 
pyopencl_expose_constants(py::module &m) ADD_ATTR(QUEUE_, DEVICE); ADD_ATTR(QUEUE_, REFERENCE_COUNT); ADD_ATTR(QUEUE_, PROPERTIES); +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(QUEUE_, PROPERTIES_ARRAY); +#endif } { @@ -601,6 +670,9 @@ void pyopencl_expose_constants(py::module &m) #if PYOPENCL_CL_VERSION >= 0x2000 ADD_ATTR(QUEUE_, PROPERTIES); ADD_ATTR(QUEUE_, SIZE); +#endif +#if PYOPENCL_CL_VERSION >= 0x2010 + ADD_ATTR(QUEUE_, DEVICE_DEFAULT); #endif } @@ -678,6 +750,12 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR( , UNSIGNED_INT32); ADD_ATTR( , HALF_FLOAT); ADD_ATTR( , FLOAT); +#if PYOPENCL_CL_VERSION >= 0x1020 + ADD_ATTR( , UNORM_INT24); +#endif +#if PYOPENCL_CL_VERSION >= 0x2010 + ADD_ATTR( , UNORM_INT_101010_2); +#endif } { @@ -711,6 +789,9 @@ void pyopencl_expose_constants(py::module &m) #endif #if PYOPENCL_CL_VERSION >= 0x2000 ADD_ATTR(MEM_, USES_SVM_POINTER); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(MEM_, PROPERTIES); #endif } @@ -731,6 +812,24 @@ void pyopencl_expose_constants(py::module &m) #endif } + { + py::class_ cls(m, "pipe_info"); +#if PYOPENCL_CL_VERSION >= 0x2000 + ADD_ATTR(PIPE_, PACKET_SIZE); + ADD_ATTR(PIPE_, MAX_PACKETS); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(PIPE_, PROPERTIES); +#endif + } + + { + py::class_ cls(m, "pipe_properties"); +#if PYOPENCL_CL_VERSION >= 0x2000 + ADD_ATTR(PIPE_, PACKET_SIZE); + ADD_ATTR(PIPE_, MAX_PACKETS); +#endif + } { py::class_ cls(m, "addressing_mode"); ADD_ATTR(ADDRESS_, NONE); @@ -760,6 +859,23 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(SAMPLER_, LOD_MIN); ADD_ATTR(SAMPLER_, LOD_MAX); #endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(SAMPLER_, PROPERTIES); +#endif +// {{{ cl_khr_mipmap_image +#ifdef CL_SAMPLER_MIP_FILTER_MODE_KHR + ADD_ATTR(SAMPLER_, MIP_FILTER_MODE_KHR); + ADD_ATTR(SAMPLER_, LOD_MIN_KHR); + ADD_ATTR(SAMPLER_, LOD_MAX_KHR); +#endif +// }}} + } + + { + py::class_ cls(m, "sampler_properties"); + ADD_ATTR(SAMPLER_, NORMALIZED_COORDS); + 
ADD_ATTR(SAMPLER_, ADDRESSING_MODE); + ADD_ATTR(SAMPLER_, FILTER_MODE); } { @@ -783,6 +899,13 @@ void pyopencl_expose_constants(py::module &m) #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(PROGRAM_, NUM_KERNELS); ADD_ATTR(PROGRAM_, KERNEL_NAMES); +#endif +#if PYOPENCL_CL_VERSION >= 0x2010 + ADD_ATTR(PROGRAM_, IL); +#endif +#if PYOPENCL_CL_VERSION >= 0x2020 + ADD_ATTR(PROGRAM_, SCOPE_GLOBAL_CTORS_PRESENT); + ADD_ATTR(PROGRAM_, SCOPE_GLOBAL_DTORS_PRESENT); #endif } @@ -882,6 +1005,17 @@ void pyopencl_expose_constants(py::module &m) #endif } + { + py::class_ cls(m, "kernel_sub_group_info"); +#if PYOPENCL_CL_VERSION >= 0x2010 + ADD_ATTR(KERNEL_, MAX_SUB_GROUP_SIZE_FOR_NDRANGE); + ADD_ATTR(KERNEL_, SUB_GROUP_COUNT_FOR_NDRANGE); + ADD_ATTR(KERNEL_, LOCAL_SIZE_FOR_SUB_GROUP_COUNT); + ADD_ATTR(KERNEL_, MAX_NUM_SUB_GROUPS); + ADD_ATTR(KERNEL_, COMPILE_NUM_SUB_GROUPS); +#endif + } + { py::class_ cls(m, "event_info"); ADD_ATTR(EVENT_, COMMAND_QUEUE); @@ -930,6 +1064,9 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(COMMAND_, SVM_MEMFILL); ADD_ATTR(COMMAND_, SVM_MAP); ADD_ATTR(COMMAND_, SVM_UNMAP); +#endif +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(COMMAND_, SVM_MIGRATE_MEM); #endif } @@ -993,6 +1130,47 @@ void pyopencl_expose_constants(py::module &m) #endif } + { + py::class_ cls(m, "device_atomic_capabilities"); +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(DEVICE_ATOMIC_, ORDER_RELAXED); + ADD_ATTR(DEVICE_ATOMIC_, ORDER_ACQ_REL); + ADD_ATTR(DEVICE_ATOMIC_, ORDER_SEQ_CST); + ADD_ATTR(DEVICE_ATOMIC_, SCOPE_WORK_ITEM); + ADD_ATTR(DEVICE_ATOMIC_, SCOPE_WORK_GROUP); + ADD_ATTR(DEVICE_ATOMIC_, SCOPE_DEVICE); + ADD_ATTR(DEVICE_ATOMIC_, SCOPE_ALL_DEVICES); +#endif + } + { + py::class_ cls(m, "device_device_enqueue_capabilities"); +#if (PYOPENCL_CL_VERSION >= 0x3000) && defined(CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES) + ADD_ATTR(DEVICE_QUEUE_, SUPPORTED); + ADD_ATTR(DEVICE_QUEUE_, REPLACEABLE_DEFAULT); +#endif + } + + { + py::class_ cls(m, "version_bits"); +#if 
PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(VERSION_, MAJOR_BITS); + ADD_ATTR(VERSION_, MINOR_BITS); + ADD_ATTR(VERSION_, PATCH_BITS); + + ADD_ATTR(VERSION_, MAJOR_MASK); + ADD_ATTR(VERSION_, MINOR_MASK); + ADD_ATTR(VERSION_, PATCH_MASK); +#endif + } + + { + py::class_ cls(m, "khronos_vendor_id"); +#if PYOPENCL_CL_VERSION >= 0x3000 + ADD_ATTR(KHRONOS_VENDOR_ID_, CODEPLAY); +#endif + } + + #ifdef HAVE_GL { py::class_ cls(m, "gl_object_type"); @@ -1010,6 +1188,80 @@ void pyopencl_expose_constants(py::module &m) #endif // }}} + + // {{{ cl_name_version +#if PYOPENCL_CL_VERSION >= 0x3000 + { + typedef cl_name_version cls; + py::class_(m, "NameVersion") + .def(py::init( + [](cl_version version, const char* name) + { + cl_name_version result; + result.version = version; + result.name[0] = '\0'; + // https://stackoverflow.com/a/1258577 + strncat(result.name, name, CL_NAME_VERSION_MAX_NAME_SIZE-1); + return result; + }), + py::arg("version")=0, + py::arg("name")=0) + + .def_property("version", + [](cls &t) { return t.version; }, + [](cls &t, cl_version val) { t.version = val; }) + .def_property("name", + [](cls &t) { return t.name; }, + [](cls &t, const char *name) + { + t.name[0] = '\0'; + // https://stackoverflow.com/a/1258577 + strncat(t.name, name, CL_NAME_VERSION_MAX_NAME_SIZE-1); + }) + ; + } +#endif + // }}} + + // {{{ CL_DEVICE_TOPOLOGY_AMD + +#ifdef CL_DEVICE_TOPOLOGY_AMD + { + typedef cl_device_topology_amd cls; + py::class_(m, "DeviceTopologyAmd") + .def(py::init( + [](cl_char bus, cl_char device, cl_char function) + { + cl_device_topology_amd result; + result.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; + result.pcie.bus = bus; + result.pcie.device = device; + result.pcie.function = function; + return result; + }), + py::arg("bus")=0, + py::arg("device")=0, + py::arg("function")=0) + + .def_property("type", + [](cls &t) { return t.pcie.type; }, + [](cls &t, cl_uint val) { t.pcie.type = val; }) + + .def_property("bus", + [](cls &t) { return t.pcie.bus; }, + [](cls 
&t, cl_char val) { t.pcie.bus = val; }) + .def_property("device", + [](cls &t) { return t.pcie.device; }, + [](cls &t, cl_char val) { t.pcie.device = val; }) + .def_property("function", + [](cls &t) { return t.pcie.function; }, + [](cls &t, cl_char val) { t.pcie.function = val; }) + ; + } +#endif + + // }}} + } diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp index bf6853ea919509b679f799d7937c109bffbba8ad..cabc012d4a683a20807f066757278e22fafc8268 100644 --- a/src/wrap_helpers.hpp +++ b/src/wrap_helpers.hpp @@ -71,6 +71,18 @@ namespace py = pybind11; NAME.push_back(it.cast()); \ } +#define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \ + { \ + COUNTER = 0; \ + for (auto it: py_##NAME) \ + { \ + if (COUNTER == NAME.size()) \ + throw error(FUNC_NAME, \ + CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \ + NAME[COUNTER++] = it.cast(); \ + } \ + } + #define COPY_PY_COORD_TRIPLE(NAME) \ size_t NAME[3] = {0, 0, 0}; \ { \ @@ -173,7 +185,7 @@ namespace py::arg("retain")=true, \ "(static method) Return a new Python object referencing the C-level " \ ":c:type:`" #CL_TYPENAME "` object at the location pointed to " \ - "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " \ + "by *int_ptr_value*. The relevant ``clRetain*`` function " \ "will be called if *retain* is True." 
\ "If the previous owner of the object will *not* release the reference, " \ "*retain* should be set to *False*, to effectively transfer ownership to " \ diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index e29110ec25caf46cbf8d11a4d653772e6068789a..04027b014b3b8c82bc1e9433d4453a82b3e7a052 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -104,6 +104,9 @@ namespace pointer_type allocate(size_type s) { + if (s == 0) + return nullptr; + return pyopencl::create_buffer(m_context->data(), m_flags, s, 0); } }; @@ -137,6 +140,9 @@ namespace pointer_type allocate(size_type s) { + if (s == 0) + return nullptr; + pointer_type ptr = pyopencl::create_buffer( m_context->data(), m_flags, s, 0); @@ -144,15 +150,29 @@ namespace // This looks (and is) expensive. But immediate allocators // have their main use in memory pools, whose basic assumption // is that allocation is too expensive anyway--but they rely - // on exact 'out-of-memory' information. - unsigned zero = 0; - PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, ( - m_queue.data(), - ptr, - /* is blocking */ CL_FALSE, - 0, std::min(s, sizeof(zero)), &zero, - 0, NULL, NULL - )); + // on 'out-of-memory' being reported on allocation. (If it is + // reported in a deferred manner, it has no way to react + // (e.g. by freeing unused memory) because it is not part of + // the call stack.) + if (m_queue.get_hex_device_version() < 0x1020) + { + unsigned zero = 0; + PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, ( + m_queue.data(), + ptr, + /* is blocking */ CL_FALSE, + 0, std::min(s, sizeof(zero)), &zero, + 0, NULL, NULL + )); + } + else + { + PYOPENCL_CALL_GUARDED(clEnqueueMigrateMemObjects, ( + m_queue.data(), + 1, &ptr, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, + 0, NULL, NULL + )); + } // No need to wait for completion here. clWaitForEvents (e.g.) // cannot return mem object allocation failures. 
This implies that @@ -188,6 +208,15 @@ namespace alloc.try_release_blocks(); } + if (!mem) + { + if (size == 0) + return nullptr; + else + throw pyopencl::error("Allocator", CL_INVALID_VALUE, + "allocator succeeded but returned NULL cl_mem"); + } + try { return new pyopencl::buffer(mem, false); @@ -241,6 +270,8 @@ namespace wrapper .def_property_readonly("held_blocks", &cls::held_blocks) .def_property_readonly("active_blocks", &cls::active_blocks) + .def_property_readonly("managed_bytes", &cls::managed_bytes) + .def_property_readonly("active_bytes", &cls::active_bytes) .DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) @@ -275,7 +306,8 @@ void pyopencl_expose_mempool(py::module &m) std::shared_ptr const &>()) .def(py::init< std::shared_ptr const &, - cl_mem_flags>()) + cl_mem_flags>(), + py::arg("queue"), py::arg("mem_flags")) ; } @@ -285,7 +317,8 @@ void pyopencl_expose_mempool(py::module &m) m, "_tools_ImmediateAllocator"); wrapper .def(py::init()) - .def(py::init()) + .def(py::init(), + py::arg("queue"), py::arg("mem_flags")) ; } diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 0360d6a348b1e1e1ab46f6bb4df0785252b2bea4..676aee379272ede7352d26e3bd81ebfc7f5686f9 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -1,6 +1,5 @@ #! /usr/bin/env python -from __future__ import division, with_statement, absolute_import, print_function __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" @@ -24,7 +23,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range, zip +# avoid spurious: pytest.mark.parametrize is not callable +# pylint: disable=not-callable + import numpy as np import numpy.linalg as la import sys @@ -75,7 +76,7 @@ def test_elwise_kernel_with_options(ctx_factory): in_gpu = clrand(queue, (50,), np.float32) - options = ['-D', 'ADD_ONE'] + options = ["-D", "ADD_ONE"] add_one = ElementwiseKernel( context, "float* out, const float *in", @@ -246,15 +247,21 @@ def test_sum(ctx_factory): slice(1000, -3000), slice(1000, None), slice(1000, None, 3), + slice(1000, 1000), ]: sum_a = np.sum(a[slc]) + if sum_a: + ref_divisor = abs(sum_a) + else: + ref_divisor = 1 + if slc.step is None: sum_a_gpu = cl_array.sum(a_gpu[slc]).get() - assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 + assert abs(sum_a_gpu - sum_a) / ref_divisor < 1e-4 sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get() - assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4 + assert abs(sum_a_gpu_2 - sum_a) / ref_divisor < 1e-4 def test_sum_without_data(ctx_factory): @@ -378,7 +385,7 @@ def test_dot(ctx_factory): vdot_ab = np.vdot(a, b) except NotImplementedError: import sys - is_pypy = '__pypy__' in sys.builtin_module_names + is_pypy = "__pypy__" in sys.builtin_module_names if is_pypy: print("PYPY: VDOT UNIMPLEMENTED") continue @@ -500,7 +507,7 @@ def summarize_error(obtained, desired, orig, thresh=1e-5): bad_count += 1 if bad_count < bad_limit: - entries.append("%r (want: %r, got: %r, orig: %r)" % ( + entries.append("{!r} (want: {!r}, got: {!r}, orig: {!r})".format( obtained[i], desired[i], obtained[i], orig[i])) else: if bad_count: @@ -849,7 +856,7 @@ def test_sort(ctx_factory, scan_kernel): numpy_elapsed = numpy_end-dev_end dev_elapsed = dev_end-dev_start - print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % ( + print(" dev: {:.2f} MKeys/s numpy: {:.2f} MKeys/s ratio: {:.2f}x".format( 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)) assert (a_dev_sorted.get() == a_sorted).all() @@ -1070,7 
+1077,7 @@ def test_bitonic_sort(ctx_factory, size, dtype): @pytest.mark.bitonic def test_bitonic_argsort(ctx_factory, size, dtype): import sys - is_pypy = '__pypy__' in sys.builtin_module_names + is_pypy = "__pypy__" in sys.builtin_module_names if not size and is_pypy: # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array @@ -1080,6 +1087,12 @@ def test_bitonic_argsort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("bitonic argsort fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + dev = ctx.devices[0] if (dev.platform.name == "Portable Computing Language" and sys.platform == "darwin"): diff --git a/test/test_array.py b/test/test_array.py index e9fb2ddd1d4ae2aaf16f18a2696666b607970056..deb6ac28f9e1b798dc04a990abe7b5fb5039baed 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1,5 +1,4 @@ #! /usr/bin/env python -from __future__ import division, with_statement, absolute_import, print_function __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -23,11 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +# avoid spurious: pytest.mark.parametrize is not callable +# pylint: disable=not-callable + import numpy as np import numpy.linalg as la import sys -from six.moves import range import pytest import pyopencl as cl @@ -154,7 +155,7 @@ def test_mix_complex(ctx_factory): # served a Python complex that is really a # smaller numpy complex. 
- print("HOST_DTYPE: %s DEV_DTYPE: %s" % ( + print("HOST_DTYPE: {} DEV_DTYPE: {}".format( host_result.dtype, dev_result.dtype)) dev_result = dev_result.astype(host_result.dtype) @@ -204,6 +205,23 @@ def test_vector_fill(ctx_factory): a_gpu = cl_array.zeros(queue, 100, dtype=cltypes.float4) +def test_zeros_large_array(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + dev = queue.device + + size = 2**28 + 1 + if dev.address_bits == 64 and dev.max_mem_alloc_size >= 8 * size: + # this shouldn't hang/cause errors + # see https://github.com/inducer/pyopencl/issues/395 + a_gpu = cl_array.zeros(queue, (size,), dtype="float64") + # run a couple kernels to ensure no propagated runtime errors + a_gpu[...] = 1. + a_gpu = 2 * a_gpu - 3 + else: + pass + + def test_absrealimag(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) @@ -425,12 +443,20 @@ def test_addition_scalar(ctx_factory): assert (7 + a == a_added).all() -def test_substract_array(ctx_factory): +@pytest.mark.parametrize(("dtype_a", "dtype_b"), + [ + (np.float32, np.float32), + (np.float32, np.int32), + (np.int32, np.int32), + (np.int64, np.int32), + (np.int64, np.uint32), + ]) +def test_subtract_array(ctx_factory, dtype_a, dtype_b): """Test the substraction of two arrays.""" #test data - a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(dtype_a) b = np.array([10, 20, 30, 40, 50, - 60, 70, 80, 90, 100]).astype(np.float32) + 60, 70, 80, 90, 100]).astype(dtype_b) context = ctx_factory() queue = cl.CommandQueue(context) @@ -470,14 +496,32 @@ def test_divide_scalar(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - a_gpu = cl_array.to_device(queue, a) + if queue.device.platform.name == "Apple": + pytest.xfail("Apple CL compiler crashes on this.") + + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, 
np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) + + from itertools import product + + for dtype_a, dtype_s in product(dtypes, repeat=2): + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + s = dtype_s(40) + a_gpu = cl_array.to_device(queue, a) - result = (a_gpu / 2).get() - assert (a / 2 == result).all() + b = a / s + b_gpu = a_gpu / s + assert (np.abs(b_gpu.get() - b) < 1e-3).all() + assert b_gpu.dtype is b.dtype - result = (2 / a_gpu).get() - assert (np.abs(2 / a - result) < 1e-5).all() + c = s / a + c_gpu = s / a_gpu + assert (np.abs(c_gpu.get() - c) < 1e-3).all() + assert c_gpu.dtype is c.dtype def test_divide_array(ctx_factory): @@ -486,18 +530,103 @@ def test_divide_array(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - #test data - a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) - b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) + dtypes = (np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) - a_gpu = cl_array.to_device(queue, a) - b_gpu = cl_array.to_device(queue, b) + from itertools import product + + for dtype_a, dtype_b in product(dtypes, repeat=2): + + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b) + + a_gpu = cl_array.to_device(queue, a) + b_gpu = cl_array.to_device(queue, b) + c = a / b + c_gpu = (a_gpu / b_gpu) + assert (np.abs(c_gpu.get() - c) < 1e-3).all() + assert c_gpu.dtype is c.dtype + + d = b / a + d_gpu = (b_gpu / a_gpu) + assert (np.abs(d_gpu.get() - d) < 1e-3).all() + assert d_gpu.dtype is d.dtype + + +def test_divide_inplace_scalar(ctx_factory): + """Test inplace division of arrays and a scalar.""" + + 
context = ctx_factory() + queue = cl.CommandQueue(context) + + if queue.device.platform.name == "Apple": + pytest.xfail("Apple CL compiler crashes on this.") + + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) + + from itertools import product + + for dtype_a, dtype_s in product(dtypes, repeat=2): + + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + s = dtype_s(40) + a_gpu = cl_array.to_device(queue, a) - a_divide = (a_gpu / b_gpu).get() - assert (np.abs(a / b - a_divide) < 1e-3).all() + # ensure the same behavior as inplace numpy.ndarray division + try: + a /= s + except TypeError: + with np.testing.assert_raises(TypeError): + a_gpu /= s + else: + a_gpu /= s + assert (np.abs(a_gpu.get() - a) < 1e-3).all() + assert a_gpu.dtype is a.dtype - a_divide = (b_gpu / a_gpu).get() - assert (np.abs(b / a - a_divide) < 1e-3).all() + +def test_divide_inplace_array(ctx_factory): + """Test inplace division of arrays.""" + + context = ctx_factory() + queue = cl.CommandQueue(context) + + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) + + from itertools import product + + for dtype_a, dtype_b in product(dtypes, repeat=2): + print(dtype_a, dtype_b) + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b) + + a_gpu = cl_array.to_device(queue, a) + b_gpu = cl_array.to_device(queue, b) + + # ensure the same behavior as inplace numpy.ndarray division + try: + a_gpu /= b_gpu + except TypeError: + # pass for now, as numpy casts differently for in-place and out-place + # true_divide + pass + # 
with np.testing.assert_raises(TypeError): + # a /= b + else: + a /= b + assert (np.abs(a_gpu.get() - a) < 1e-3).all() + assert a_gpu.dtype is a.dtype def test_bitwise(ctx_factory): @@ -530,8 +659,8 @@ def test_bitwise(ctx_factory): a = a_dev.get() b = b_dev.get() - s = int((clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64) - .astype(b_dtype).get())) + s = int(clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64) + .astype(b_dtype).get()) import operator as o @@ -586,6 +715,13 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: @@ -638,6 +774,12 @@ def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) + if queue.device.platform.vendor == "The pocl project" \ + and queue.device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if rng_class is RanluxGenerator: gen = rng_class(queue, 5120) else: @@ -712,7 +854,7 @@ def test_nan_arithmetic(ctx_factory): a = np.random.randn(*shape).astype(np.float32) from random import randrange for i in range(size // 10): - a[randrange(0, size)] = float('nan') + a[randrange(0, size)] = float("nan") return a size = 1 << 20 @@ -771,7 +913,7 @@ def test_diff(ctx_factory): a = a_dev.get() err = la.norm( - (cl.array.diff(a_dev).get() - np.diff(a))) + cl.array.diff(a_dev).get() - np.diff(a)) assert err < 1e-4 @@ -1055,7 +1197,7 @@ def test_reshape(ctx_factory): # using -1 as unknown dimension 
assert a_dev.reshape(-1, 32).shape == (4, 32) assert a_dev.reshape((32, -1)).shape == (32, 4) - assert a_dev.reshape(((8, -1, 4))).shape == (8, 4, 4) + assert a_dev.reshape((8, -1, 4)).shape == (8, 4, 4) import pytest with pytest.raises(ValueError): @@ -1221,7 +1363,13 @@ def test_get_async(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - a = np.random.rand(10**6).astype(np.dtype('float32')) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("the async get test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) b = a + a**5 + 1 b_gpu = a_gpu + a_gpu**5 + 1 @@ -1250,7 +1398,7 @@ def test_outoforderqueue_get(ctx_factory): properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = np.random.rand(10**6).astype(np.dtype('float32')) + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) b_gpu = a_gpu + a_gpu**5 + 1 b1 = b_gpu.get() # testing that this waits for events @@ -1265,7 +1413,7 @@ def test_outoforderqueue_copy(ctx_factory): properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = np.random.rand(10**6).astype(np.dtype('float32')) + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) c_gpu = a_gpu**2 - 7 b_gpu = c_gpu.copy() # testing that this waits for and creates events @@ -1283,8 +1431,8 @@ def test_outoforderqueue_indexing(ctx_factory): properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = np.random.rand(10**6).astype(np.dtype('float32')) - i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32')) + a = 
np.random.rand(10**6).astype(np.dtype("float32")) + i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype("int32")) a_gpu = cl_array.to_device(queue, a) i_gpu = cl_array.to_device(queue, i) c_gpu = (a_gpu**2)[i_gpu - 10000] @@ -1307,7 +1455,7 @@ def test_outoforderqueue_reductions(ctx_factory): except Exception: pytest.skip("out-of-order queue not available") # 0/1 values to avoid accumulated rounding error - a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) + a = (np.random.rand(10**6) > 0.5).astype(np.dtype("float32")) a[800000] = 10 # all<5 looks true until near the end a_gpu = cl_array.to_device(queue, a) b1 = cl_array.sum(a_gpu).get() @@ -1316,9 +1464,59 @@ def test_outoforderqueue_reductions(ctx_factory): assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 +def test_negative_dim_rejection(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + with pytest.raises(ValueError): + cl_array.Array(queue, shape=-10, dtype=np.float64) + + with pytest.raises(ValueError): + cl_array.Array(queue, shape=(-10,), dtype=np.float64) + + for left_dim in (-1, 0, 1): + with pytest.raises(ValueError): + cl_array.Array(queue, shape=(left_dim, -1), dtype=np.float64) + + for right_dim in (-1, 0, 1): + with pytest.raises(ValueError): + cl_array.Array(queue, shape=(-1, right_dim), dtype=np.float64) + + +@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2), (0, 5), (5, 0)]) +def test_zero_size_array(ctx_factory, empty_shape): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a = cl_array.zeros(queue, empty_shape, dtype=np.float32) + b = cl_array.zeros(queue, empty_shape, dtype=np.float32) + b.fill(1) + c = a + b + c_host = c.get() + cl_array.to_device(queue, c_host) + + assert c.flags.c_contiguous == c_host.flags.c_contiguous + assert c.flags.f_contiguous == c_host.flags.f_contiguous + + for order in "CF": + c_flat = c.reshape(-1, order=order) + c_host_flat = c_host.reshape(-1, order=order) + assert c_flat.shape == 
c_host_flat.shape + assert c_flat.strides == c_host_flat.strides + assert c_flat.flags.c_contiguous == c_host_flat.flags.c_contiguous + assert c_flat.flags.f_contiguous == c_host_flat.flags.f_contiguous + + +def test_str_without_queue(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a = cl_array.zeros(queue, 10, dtype=np.float32).with_queue(None) + print(str(a)) + print(repr(a)) + + if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the - # tests. if len(sys.argv) > 1: exec(sys.argv[1]) else: diff --git a/test/test_arrays_in_structs.py b/test/test_arrays_in_structs.py new file mode 100644 index 0000000000000000000000000000000000000000..625b6105448080bf361aa02d66950dba28207fe9 --- /dev/null +++ b/test/test_arrays_in_structs.py @@ -0,0 +1,101 @@ +__copyright__ = "Copyright (C) 2020 Sotiris Niarchos" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np + +import pyopencl as cl +import pyopencl.cltypes as cltypes +import pyopencl.tools as cl_tools +from pyopencl import mem_flags +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +def test_struct_with_array_fields(ctx_factory): + # + # typedef struct { + # uint x[2]; + # float y; + # uint z[3][4]; + # } my_struct; + # + cl_ctx = ctx_factory() + device = cl_ctx.devices[0] + queue = cl.CommandQueue(cl_ctx) + + my_struct = np.dtype([ + ("x", cltypes.uint, 2), + ("y", cltypes.int), + ("z", cltypes.uint, (3, 4)) + ]) + my_struct, cdecl = cl_tools.match_dtype_to_c_struct( + device, "my_struct", my_struct + ) + + # a random buffer of 4 structs + my_struct_arr = np.array([ + ([81, 24], -57, [[15, 28, 45, 7], [71, 95, 65, 84], [2, 11, 59, 9]]), + ([5, 20], 47, [[15, 53, 7, 59], [73, 22, 27, 86], [59, 6, 39, 49]]), + ([11, 99], -32, [[73, 83, 4, 65], [19, 21, 22, 27], [1, 55, 6, 64]]), + ([57, 38], -54, [[74, 90, 38, 67], [77, 30, 99, 18], [91, 3, 63, 67]]) + ], dtype=my_struct) + + expected_res = [] + for x in my_struct_arr: + expected_res.append(int(np.sum(x[0]) + x[1] + np.sum(x[2]))) + expected_res = np.array(expected_res, dtype=cltypes.int) + + kernel_src = """%s + // this kernel sums every number contained in each struct + __kernel void array_structs(__global my_struct *structs, __global int *res) { + int i = get_global_id(0); + my_struct s = structs[i]; + res[i] = s.x[0] + s.x[1] + s.y; + for (int r = 0; r < 3; r++) + for (int c = 0; c < 4; c++) + res[i] += s.z[r][c]; + }""" % cdecl + + mem_flags1 = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR + mem_flags2 = mem_flags.WRITE_ONLY + + my_struct_buf = cl.Buffer(cl_ctx, mem_flags1, hostbuf=my_struct_arr) + res_buf = cl.Buffer(cl_ctx, mem_flags2, size=expected_res.nbytes) + + program = cl.Program(cl_ctx, kernel_src).build() + kernel = program.array_structs + kernel(queue, (4,), None, my_struct_buf, res_buf) + + res = 
np.empty_like(expected_res) + cl.enqueue_copy(queue, res, res_buf) + + assert (res == expected_res).all() + + +if __name__ == "__main__": + + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) diff --git a/test/test_clmath.py b/test/test_clmath.py index 9c844016077ffaf31252b7852fd43138eed62fbd..409875f8a1c1ff842982dbf4247637f99a7b6cd5 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -1,5 +1,3 @@ -from __future__ import division, print_function, absolute_import - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range +# avoid spurious: pytest.mark.parametrize is not callable +# avoid spurious: Module 'scipy.special' has no 'jn' member; maybe 'jv' +# pylint: disable=not-callable,no-member + import math import numpy as np @@ -341,7 +342,7 @@ def test_complex_bessel(ctx_factory, ref_src): if ref_src == "pyfmmlib": pyfmmlib = pytest.importorskip("pyfmmlib") - jv_ref = np.zeros(len(z), 'complex') + jv_ref = np.zeros(len(z), "complex") vin = v+1 @@ -453,7 +454,7 @@ def test_outoforderqueue_clmath(ctx_factory): properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = np.random.rand(10**6).astype(np.dtype('float32')) + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) # testing that clmath functions wait for and create events b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) diff --git a/test/test_clrandom.py b/test/test_clrandom.py index b6b2094e2b0de7630f66c0db876452d81226bbc0..1ec53842a036bc659d12fa546425604ac01c4964 100644 --- a/test/test_clrandom.py +++ b/test/test_clrandom.py @@ -1,5 +1,3 @@ -from __future__ import division, print_function, absolute_import - __copyright__ = "Copyright (C) 2018 Matt Wala" __license__ = """ @@ 
-22,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +# avoid spurious: pytest.mark.parametrize is not callable +# pylint: disable=not-callable + import numpy as np import pytest @@ -67,6 +68,13 @@ def test_clrandom_dtypes(ctx_factory, rng_class, dtype): size = 10 with cl.CommandQueue(cl_ctx) as queue: + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is make_ranlux_generator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + rng.uniform(queue, size, dtype) if dtype not in (np.int32, np.int64): diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py index bfbf4f16edd757c3bf8e8bc59fb2d2ed311c0d29..402bc8b9d71e47592f67101ff58dfb90a0c56630 100644 --- a/test/test_enqueue_copy.py +++ b/test/test_enqueue_copy.py @@ -1,5 +1,4 @@ #! /usr/bin/env python -from __future__ import division, with_statement, absolute_import, print_function __copyright__ = "Copyright (C) 2016 Shane J. 
Latham" @@ -49,6 +48,15 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + + if honor_skip and queue.device.platform.name == "Apple": + pytest.xfail("Apple's CL implementation crashes on this.") + ary_in_shp = 256, 128 # Entire array shape from which sub-array copied to device sub_ary_shp = 128, 96 # Sub-array shape to be copied to device ary_in_origin = 20, 13 # Sub-array origin @@ -136,6 +144,15 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + + if honor_skip and queue.device.platform.name == "Apple": + pytest.skip("Apple's CL implementation crashes on this.") + ary_in_shp = 256, 128, 31 # array shape from which sub-array copied to device sub_ary_shp = 128, 96, 20 # Sub-array shape to be copied to device ary_in_origin = 20, 13, 7 # Sub-array origin diff --git a/test/test_wrapper.py b/test/test_wrapper.py index d798a417b07b7b9c72f7fc1c453cabfdb8277d97..4b80f4d0c6eebabd1e51287f858024694e77dd8c 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range +# avoid spurious: pytest.mark.parametrize is not callable +# pylint: disable=not-callable + import numpy as np import numpy.linalg as la @@ -33,7 +33,8 @@ import pyopencl.array as cl_array import pyopencl.cltypes as cltypes import pyopencl.clrandom from pyopencl.tools import ( # noqa - pytest_generate_tests_for_pyopencl as pytest_generate_tests) + pytest_generate_tests_for_pyopencl as pytest_generate_tests, + ImmediateAllocator, DeferredAllocator) from pyopencl.characterize import get_pocl_version from pyopencl.check_concurrency import with_concurrency_check @@ -46,10 +47,17 @@ else: faulthandler.enable() -def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'): +def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"): if plat.vendor == "The pocl project": if up_to_version is None or get_pocl_version(plat) <= up_to_version: - pytest.skip(msg) + pytest.xfail(msg) + + +def _xfail_if_pocl_gpu(device, what): + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail(f"POCL's {what} support don't work right on Nvidia GPUs, " + "at least the Titan V, as of pocl 1.6, 2021-01-20") def test_get_info(ctx_factory): @@ -57,6 +65,9 @@ def test_get_info(ctx_factory): device, = ctx.devices platform = device.platform + device.persistent_unique_id + device.hashable_model_and_version_identifier + failure_count = [0] pocl_quirks = [ @@ -379,7 +390,7 @@ def test_image_2d(ctx_factory): if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP') + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image( @@ -451,7 +462,7 @@ def test_image_3d(ctx_factory): if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % 
device) - _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP') + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image_plane( @@ -571,6 +582,23 @@ def test_mempool_2(ctx_factory): assert asize < asize*(1+1/8) +@pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator]) +def test_allocator(ctx_factory, allocator_cls): + context = ctx_factory() + queue = cl.CommandQueue(context) + + if allocator_cls is DeferredAllocator: + allocator = allocator_cls(context) + else: + allocator = allocator_cls(queue) + + mem = allocator(15) + mem2 = allocator(0) + + assert mem is not None + assert mem2 is None + + def test_vector_args(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) @@ -656,8 +684,8 @@ def test_can_build_and_run_binary(ctx_factory): def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() # Still relevant on pocl 1.0RC1. - _skip_if_pocl( - ctx.devices[0].platform, (1, 0), 'pocl crashes on enqueue_barrier') + _xfail_if_pocl( + ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier") queue = cl.CommandQueue(ctx) @@ -684,7 +712,7 @@ def test_unload_compiler(platform): or cl.get_cl_header_version() < (1, 2)): from pytest import skip skip("clUnloadPlatformCompiler is only available in OpenCL 1.2") - _skip_if_pocl(platform, (0, 13), 'pocl does not support unloading compiler') + _xfail_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") if platform.vendor == "Intel(R) Corporation": from pytest import skip skip("Intel proprietary driver does not support unloading compiler") @@ -711,7 +739,7 @@ def test_platform_get_devices(ctx_factory): devs = platform.get_devices(dev_type) if dev_type in (cl.device_type.DEFAULT, cl.device_type.ALL, - getattr(cl.device_type, 'CUSTOM', None)): + getattr(cl.device_type, "CUSTOM", None)): continue for dev in devs: assert dev.type & dev_type == dev_type @@ 
-725,7 +753,7 @@ def test_user_event(ctx_factory): skip("UserEvent is only available in OpenCL 1.1") # https://github.com/pocl/pocl/issues/201 - _skip_if_pocl(ctx.devices[0].platform, (0, 13), + _xfail_if_pocl(ctx.devices[0].platform, (0, 13), "pocl's user events don't work right") status = {} @@ -744,22 +772,22 @@ def test_user_event(ctx_factory): Thread(target=event_waiter1, args=(evt, 1)).start() sleep(.05) if status.get(1, False): - raise RuntimeError('UserEvent triggered before set_status') + raise RuntimeError("UserEvent triggered before set_status") evt.set_status(cl.command_execution_status.COMPLETE) sleep(.05) if not status.get(1, False): - raise RuntimeError('UserEvent.wait timeout') + raise RuntimeError("UserEvent.wait timeout") assert evt.command_execution_status == cl.command_execution_status.COMPLETE evt = cl.UserEvent(ctx) Thread(target=event_waiter2, args=(evt, 2)).start() sleep(.05) if status.get(2, False): - raise RuntimeError('UserEvent triggered before set_status') + raise RuntimeError("UserEvent triggered before set_status") evt.set_status(cl.command_execution_status.COMPLETE) sleep(.05) if not status.get(2, False): - raise RuntimeError('cl.wait_for_events timeout on UserEvent') + raise RuntimeError("cl.wait_for_events timeout on UserEvent") assert evt.command_execution_status == cl.command_execution_status.COMPLETE @@ -775,8 +803,8 @@ def test_buffer_get_host_array(ctx_factory): buf = cl.Buffer(ctx, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=host_buf) host_buf2 = buf.get_host_array(25, np.float32) assert (host_buf == host_buf2).all() - assert (host_buf.__array_interface__['data'][0] - == host_buf.__array_interface__['data'][0]) + assert (host_buf.__array_interface__["data"][0] + == host_buf.__array_interface__["data"][0]) assert host_buf2.base is buf buf = cl.Buffer(ctx, mf.READ_WRITE | mf.ALLOC_HOST_PTR, size=100) @@ -822,6 +850,8 @@ def test_event_set_callback(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + 
_xfail_if_pocl_gpu(queue.device, "event callbacks") + if ctx._get_cl_version() < (1, 1): pytest.skip("OpenCL 1.1 or newer required for set_callback") @@ -876,6 +906,8 @@ def test_global_offset(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + _xfail_if_pocl_gpu(queue.device, "global offset") + prg = cl.Program(context, """ __kernel void mult(__global float *a) { a[get_global_id(0)] *= 2; } @@ -956,11 +988,13 @@ def test_spirv(ctx_factory): def test_coarse_grain_svm(ctx_factory): import sys - is_pypy = '__pypy__' in sys.builtin_module_names + is_pypy = "__pypy__" in sys.builtin_module_names ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "SVM") + dev = ctx.devices[0] from pyopencl.characterize import has_coarse_grain_buffer_svm @@ -1010,14 +1044,40 @@ def test_coarse_grain_svm(ctx_factory): cl.enqueue_copy(queue, new_ary, svm_ary) assert np.array_equal(orig_ary*2, new_ary) + # {{{ https://github.com/inducer/pyopencl/issues/372 + + buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32) + out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32) + + svm_buf_arr = cl.SVM(buf_arr) + svm_out_arr = cl.SVM(out_arr) + with svm_buf_arr.map_rw(queue) as ary: + ary.fill(17) + + prg_ro = cl.Program(ctx, r""" + __kernel void twice_ro(__global int *out_g, __global int *in_g) + { + out_g[get_global_id(0)] = 2*in_g[get_global_id(0)]; + } + """).build() + + prg_ro.twice_ro(queue, buf_arr.shape, None, svm_out_arr, svm_buf_arr) + + with svm_out_arr.map_ro(queue) as ary: + print(ary) + + # }}} + def test_fine_grain_svm(ctx_factory): import sys - is_pypy = '__pypy__' in sys.builtin_module_names + is_pypy = "__pypy__" in sys.builtin_module_names ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "GPU SVM") + from pyopencl.characterize import has_fine_grain_buffer_svm from pytest import skip if not has_fine_grain_buffer_svm(queue.device): @@ -1104,6 +1164,8 @@ def 
test_copy_buffer_rect(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    _xfail_if_pocl_gpu(queue.device, "rectangular copies")
+
     arr1 = cl_array.zeros(queue, (2, 3), "f")
     arr2 = cl_array.zeros(queue, (4, 5), "f")
     arr1.fill(1)
@@ -1154,6 +1216,26 @@ def test_concurrency_checker(ctx_factory):
         arr1 - arr2
 
 
+@pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)])
+def test_empty_ndrange(ctx_factory, empty_shape):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if ctx._get_cl_version() < (1, 2) or cl.get_cl_header_version() < (1, 2):
+        pytest.skip("OpenCL 1.2 required for empty NDRange support")
+
+    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+
+    prg = cl.Program(ctx, """
+    __kernel void add_two(__global float *a_g)
+    {
+        a_g[get_global_id(0)] += 2;
+    }
+    """).build()
+
+    prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True)
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
     import pyopencl  # noqa