diff --git a/.conda-ci-build-configure.sh b/.conda-ci-build-configure.sh
new file mode 100644
index 0000000000000000000000000000000000000000..80a2fb778073ec5d3277308090f6273030d331f8
--- /dev/null
+++ b/.conda-ci-build-configure.sh
@@ -0,0 +1 @@
+python ./configure.py --cl-inc-dir="$CONDA_PREFIX/include" --cl-lib-dir="$CONDA_PREFIX/lib"
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000000000000000000000000000000000000..dcbc21d86f9e4b17ea7e8803d538c4c0f0b6276a
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,32 @@
+# https://editorconfig.org/
+# https://github.com/editorconfig/editorconfig-vim 
+# https://github.com/editorconfig/editorconfig-emacs 
+
+root = true
+
+[*]
+indent_style = space
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.py]
+indent_size = 4
+
+[*.rst]
+indent_size = 4
+
+[*.cpp]
+indent_size = 2
+
+[*.hpp]
+indent_size = 2
+
+# There may be a Makefile in doc/
+[Makefile]
+indent_style = tab
+
+# https://github.com/microsoft/vscode/issues/1679
+[*.md]
+trim_trailing_whitespace = false
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d41702c9002b4d649501134dc7a47d8f4a26a90d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,90 @@
+name: CI
+on:
+    push:
+        branches:
+        - master
+    pull_request:
+        paths-ignore:
+        - 'doc/*.rst'
+    schedule:
+        - cron:  '17 3 * * 0'
+
+jobs:
+    flake8:
+        name: Flake8
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -
+            uses: actions/setup-python@v1
+            with:
+                python-version: '3.x'
+        -   name: "Main Script"
+            run: |
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
+                . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test
+
+    pylint:
+        name: Pylint
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                echo "- matplotlib" >> $CONDA_ENVIRONMENT
+                echo "- pyopengl" >> $CONDA_ENVIRONMENT
+                echo "- ipython" >> $CONDA_ENVIRONMENT
+                echo "-------------------------------------------"
+                cat $CONDA_ENVIRONMENT
+                echo "-------------------------------------------"
+                USE_CONDA_BUILD=1
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
+
+                # Pylint won't find the Cython bits without this
+                PROJECT_INSTALL_FLAGS="--editable"
+
+                . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py
+
+    pytest:
+        name: Pytest Linux
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                . ./build-and-test-py-project-within-miniconda.sh
+
+    pytest_mac:
+        name: Pytest Mac
+        runs-on: macos-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                export CC=gcc
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                ./configure.py --cxxflags= --ldflags= --cl-libname=OpenCL
+                . ./build-and-test-py-project-within-miniconda.sh
+
+    docs:
+        name: Documentation
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -
+            uses: actions/setup-python@v1
+            with:
+                python-version: '3.x'
+        -   name: "Main Script"
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh
+                . ci-support.sh
+                build_py_project_in_conda_env
+                build_docs
+
+# vim: sw=4
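
The Flake8 and Pylint jobs above hand off to helper scripts from inducer/ci-support, which fetch a shared configuration and install the tools before running them. As a rough local approximation (it skips whatever extra setup the helper scripts perform), the same lint check can be run directly with the arguments the workflow passes:

    pip install flake8
    flake8 pyopencl test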
diff --git a/.gitignore b/.gitignore
index 20a1c6e5eefc9782512a7dfb98c6d319fcf6080e..103ff507f895518184778eee49a08a5435af0d9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,4 +64,5 @@ build-and-test-py-project.sh
 cffi_build.py
 
 .cache
+.pytest_cache
 .idea
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ec1495d7891a953f7f37484403924cd37ee10d87..29c364fcabe3eeb19bc04f0261eb7d7432b1b851 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,29 +1,11 @@
-"Python 2.7 AMD CPU":
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=amd:pu
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  allow_failure: true
-  tags:
-  - python2.7
-  - amd-cl-cpu
-  - opengl
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
 Python 3 Intel CPU:
   script:
   - export PY_EXE=python3
+  - source /opt/enable-intel-cl.sh
   - export PYOPENCL_TEST="intel(r):pu"
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
-  allow_failure: true
   tags:
   - python3
   - intel-cl-cpu
@@ -33,23 +15,6 @@ Python 3 Intel CPU:
     reports:
       junit: test/pytest.xml
 
-Python 3 AMD CPU:
-  script:
-  - export PY_EXE=python3
-  - export PYOPENCL_TEST=amd:pu
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  allow_failure: true
-  tags:
-  - python3
-  - amd-cl-cpu
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
 Python 3 Titan X:
   script:
   - export PY_EXE=python3
@@ -82,26 +47,30 @@ Python 3 Titan V:
     reports:
       junit: test/pytest.xml
 
-Python 3 K40:
-  script:
-  - export PY_EXE=python3
-  - export PYOPENCL_TEST=nvi:k40
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python3
-  - nvidia-k40
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
+# https://github.com/illinois-scicomp/machine-shop-maintenance/issues/7
+
+# Python 3 K40:
+#   script:
+#   - export PY_EXE=python3
+#   - export PYOPENCL_TEST=nvi:k40
+#   - export EXTRA_INSTALL="pybind11 numpy mako"
+#   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+#   - ". ./build-and-test-py-project.sh"
+#   tags:
+#   - python3
+#   - nvidia-k40
+#   except:
+#   - tags
+#
+#   artifacts:
+#     reports:
+#       junit: test/pytest.xml
 
 Python 3 AMD GPU:
+  allow_failure: true
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=amd:fiji
+  - export PYOPENCL_TEST=amd:gfx803
   - export EXTRA_INSTALL="pybind11 numpy mako"
 
   # https://andreask.cs.illinois.edu/MachineShop/UserNotes
@@ -109,7 +78,6 @@ Python 3 AMD GPU:
 
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
-  allow_failure: true
   tags:
   - python3
   - amd-fiji
@@ -119,26 +87,10 @@ Python 3 AMD GPU:
     reports:
       junit: test/pytest.xml
 
-Python 2.7 POCL:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - pocl
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
 Python 3 POCL:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -154,7 +106,7 @@ Python 3 POCL:
 Python 3 POCL CL 1.1:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - echo "CL_PRETEND_VERSION = '1.1'" > siteconf.py
@@ -171,7 +123,7 @@ Python 3 POCL CL 1.1:
 Python 3 POCL:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -187,7 +139,7 @@ Python 3 POCL:
 Python 3 POCL (+GL and special functions):
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako scipy pyfmmlib"
   - echo "CL_ENABLE_GL = True" > siteconf.py
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
@@ -201,28 +153,12 @@ Python 3 POCL (+GL and special functions):
     reports:
       junit: test/pytest.xml
 
-Python 2.7 Apple:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=app:cpu
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - export PKG_CONFIG_PATH=/usr/local/opt/libffi/lib/pkgconfig
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - apple
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
 Python 3 Conda Apple:
   script:
   - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
   - export CC=gcc
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+  - "./configure.py --cxxflags= --ldflags= --cl-libname=OpenCL"
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
   - apple
@@ -232,13 +168,15 @@ Python 3 Conda Apple:
     reports:
       junit: test/pytest.xml
 
-PyPy POCL:
+PyPy3 POCL:
   script:
-  - export PY_EXE=pypy
-  - export PYOPENCL_TEST=portable
+  - export PY_EXE=pypy3
+  - export PYOPENCL_TEST=portable:pthread
 
-  # https://github.com/pybind/pybind11/pull/1494
-  - export EXTRA_INSTALL="git+https://github.com/inducer/pybind11 numpy mako"
+  # On pypy, this seems to install old versions from the package index
+  # independently of whether newer ones are already present.
+  - rm -f pyproject.toml
+  - export EXTRA_INSTALL="pybind11 numpy mako"
 
   - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
@@ -262,7 +200,7 @@ Pylint:
   # is only one copy of everything.
   - PROJECT_INSTALL_FLAGS="--editable"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
-  - ". ./prepare-and-run-pylint.sh pyopencl test/test_*.py"
+  - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
   tags:
   - python3
   except:
@@ -275,13 +213,11 @@ Documentation:
   - ". ./build-docs.sh"
   tags:
   - linux
-  only:
-  - master
 
 Flake8:
   script:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-  - ". ./prepare-and-run-flake8.sh pyopencl test"
+  - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test
   tags:
   - python3
   except:
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 9389377fb73abfeb2fbcc90b7ad0c20da894bbef..0000000000000000000000000000000000000000
--- a/.pylintrc
+++ /dev/null
@@ -1,813 +0,0 @@
-# Based on: https://gitlab.tiker.net/inducer/ci-support/raw/237bf5f0414a47499bc9ce4ef54ebd285e6fe648/.pylintrc-default
-
-[MASTER]
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code.
-extension-pkg-whitelist=numpy
-
-# Add files or directories to the blacklist. They should be base names, not
-# paths.
-ignore=CVS,compyte
-
-# Add files or directories matching the regex patterns to the blacklist. The
-# regex matches against base names, not paths.
-ignore-patterns=
-
-# Python code to execute, usually for sys.path manipulation such as
-# pygtk.require().
-#init-hook=
-
-# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
-# number of processors available to use.
-jobs=1
-
-# Control the amount of potential inferred values when inferring a single
-# object. This can help the performance when dealing with large functions or
-# complex, nested conditions.
-limit-inference-results=100
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Pickle collected data for later comparisons.
-persistent=no
-
-# Specify a configuration file.
-#rcfile=
-
-# When enabled, pylint would attempt to guess common misconfiguration and emit
-# user-friendly hints instead of false-positive error messages.
-suggestion-mode=yes
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
-confidence=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use "--disable=all --enable=classes
-# --disable=W".
-disable=assignment-from-no-return,  # https://github.com/PyCQA/pylint/issues/2694
-        blacklisted-name,
-        invalid-name,
-        missing-docstring,
-        empty-docstring,
-        unneeded-not,
-        singleton-comparison,
-        misplaced-comparison-constant,
-        unidiomatic-typecheck,
-        consider-using-enumerate,
-        consider-iterating-dictionary,
-        bad-classmethod-argument,
-        bad-mcs-method-argument,
-        bad-mcs-classmethod-argument,
-        single-string-used-for-slots,
-        line-too-long,
-        too-many-lines,
-        trailing-whitespace,
-        missing-final-newline,
-        trailing-newlines,
-        multiple-statements,
-        superfluous-parens,
-        bad-whitespace,
-        mixed-line-endings,
-        unexpected-line-ending-format,
-        bad-continuation,
-        wrong-spelling-in-comment,
-        wrong-spelling-in-docstring,
-        invalid-characters-in-docstring,
-        multiple-imports,
-        wrong-import-order,
-        ungrouped-imports,
-        wrong-import-position,
-        useless-import-alias,
-        len-as-condition,
-        print-statement,
-        parameter-unpacking,
-        unpacking-in-except,
-        old-raise-syntax,
-        backtick,
-        long-suffix,
-        old-ne-operator,
-        old-octal-literal,
-        import-star-module-level,
-        non-ascii-bytes-literal,
-        raw-checker-failed,
-        bad-inline-option,
-        locally-disabled,
-        file-ignored,
-        suppressed-message,
-        useless-suppression,
-        deprecated-pragma,
-        use-symbolic-message-instead,
-        c-extension-no-member,
-        literal-comparison,
-        comparison-with-itself,
-        no-self-use,
-        no-classmethod-decorator,
-        no-staticmethod-decorator,
-        useless-object-inheritance,
-        cyclic-import,
-        duplicate-code,
-        too-many-ancestors,
-        too-many-instance-attributes,
-        too-few-public-methods,
-        too-many-public-methods,
-        too-many-return-statements,
-        too-many-branches,
-        too-many-arguments,
-        too-many-locals,
-        too-many-statements,
-        too-many-boolean-expressions,
-        consider-merging-isinstance,
-        too-many-nested-blocks,
-        simplifiable-if-statement,
-        redefined-argument-from-local,
-        no-else-return,
-        consider-using-ternary,
-        trailing-comma-tuple,
-        stop-iteration-return,
-        simplify-boolean-expression,
-        inconsistent-return-statements,
-        useless-return,
-        consider-swap-variables,
-        consider-using-join,
-        consider-using-in,
-        consider-using-get,
-        chained-comparison,
-        consider-using-dict-comprehension,
-        consider-using-set-comprehension,
-        simplifiable-if-expression,
-        unreachable,
-        dangerous-default-value,
-        pointless-statement,
-        pointless-string-statement,
-        expression-not-assigned,
-        unnecessary-pass,
-        unnecessary-lambda,
-        duplicate-key,
-        assign-to-new-keyword,
-        useless-else-on-loop,
-        exec-used,
-        eval-used,
-        confusing-with-statement,
-        using-constant-test,
-        comparison-with-callable,
-        lost-exception,
-        assert-on-tuple,
-        attribute-defined-outside-init,
-        bad-staticmethod-argument,
-        protected-access,
-        arguments-differ,
-        signature-differs,
-        abstract-method,
-        super-init-not-called,
-        no-init,
-        non-parent-init-called,
-        useless-super-delegation,
-        unnecessary-semicolon,
-        bad-indentation,
-        mixed-indentation,
-        wildcard-import,
-        deprecated-module,
-        relative-import,
-        reimported,
-        import-self,
-        misplaced-future,
-        fixme,
-        invalid-encoded-data,
-        global-variable-undefined,
-        global-variable-not-assigned,
-        global-statement,
-        global-at-module-level,
-        unused-import,
-        unused-variable,
-        unused-argument,
-        unused-wildcard-import,
-        redefined-outer-name,
-        redefined-builtin,
-        redefine-in-handler,
-        undefined-loop-variable,
-        unbalanced-tuple-unpacking,
-        cell-var-from-loop,
-        possibly-unused-variable,
-        self-cls-assignment,
-        bare-except,
-        broad-except,
-        duplicate-except,
-        try-except-raise,
-        binary-op-exception,
-        raising-format-tuple,
-        keyword-arg-before-vararg,
-        logging-not-lazy,
-        logging-format-interpolation,
-        logging-fstring-interpolation,
-        bad-format-string-key,
-        unused-format-string-key,
-        bad-format-string,
-        missing-format-argument-key,
-        unused-format-string-argument,
-        format-combined-specification,
-        missing-format-attribute,
-        invalid-format-index,
-        duplicate-string-formatting-argument,
-        anomalous-backslash-in-string,
-        anomalous-unicode-escape-in-string,
-        implicit-str-concat-in-sequence,
-        bad-open-mode,
-        boolean-datetime,
-        redundant-unittest-assert,
-        deprecated-method,
-        bad-thread-instantiation,
-        shallow-copy-environ,
-        invalid-envvar-default,
-        subprocess-popen-preexec-fn,
-        apply-builtin,
-        basestring-builtin,
-        buffer-builtin,
-        cmp-builtin,
-        coerce-builtin,
-        execfile-builtin,
-        file-builtin,
-        long-builtin,
-        raw_input-builtin,
-        reduce-builtin,
-        standarderror-builtin,
-        unicode-builtin,
-        xrange-builtin,
-        coerce-method,
-        delslice-method,
-        getslice-method,
-        setslice-method,
-        no-absolute-import,
-        old-division,
-        dict-iter-method,
-        dict-view-method,
-        next-method-called,
-        metaclass-assignment,
-        indexing-exception,
-        raising-string,
-        reload-builtin,
-        oct-method,
-        hex-method,
-        nonzero-method,
-        cmp-method,
-        input-builtin,
-        round-builtin,
-        intern-builtin,
-        unichr-builtin,
-        map-builtin-not-iterating,
-        zip-builtin-not-iterating,
-        range-builtin-not-iterating,
-        filter-builtin-not-iterating,
-        using-cmp-argument,
-        eq-without-hash,
-        div-method,
-        idiv-method,
-        rdiv-method,
-        exception-message-attribute,
-        invalid-str-codec,
-        sys-max-int,
-        bad-python3-import,
-        deprecated-string-function,
-        deprecated-str-translate-call,
-        deprecated-itertools-function,
-        deprecated-types-field,
-        next-method-defined,
-        dict-items-not-iterating,
-        dict-keys-not-iterating,
-        dict-values-not-iterating,
-        deprecated-operator-function,
-        deprecated-urllib-function,
-        xreadlines-attribute,
-        deprecated-sys-function,
-        exception-escape,
-        comprehension-escape
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-enable=syntax-error,
-       unrecognized-inline-option,
-       bad-option-value,
-       init-is-generator,
-       return-in-init,
-       function-redefined,
-       not-in-loop,
-       return-outside-function,
-       yield-outside-function,
-       return-arg-in-generator,
-       nonexistent-operator,
-       duplicate-argument-name,
-       abstract-class-instantiated,
-       bad-reversed-sequence,
-       too-many-star-expressions,
-       invalid-star-assignment-target,
-       star-needs-assignment-target,
-       nonlocal-and-global,
-       continue-in-finally,
-       nonlocal-without-binding,
-       used-prior-global-declaration,
-       misplaced-format-function,
-       method-hidden,
-       access-member-before-definition,
-       no-method-argument,
-       no-self-argument,
-       invalid-slots-object,
-       assigning-non-slot,
-       invalid-slots,
-       inherit-non-class,
-       inconsistent-mro,
-       duplicate-bases,
-       non-iterator-returned,
-       unexpected-special-method-signature,
-       invalid-length-returned,
-       import-error,
-       relative-beyond-top-level,
-       used-before-assignment,
-       undefined-variable,
-       undefined-all-variable,
-       invalid-all-object,
-       no-name-in-module,
-       unpacking-non-sequence,
-       bad-except-order,
-       raising-bad-type,
-       bad-exception-context,
-       misplaced-bare-raise,
-       raising-non-exception,
-       notimplemented-raised,
-       catching-non-exception,
-       bad-super-call,
-       missing-super-argument,
-       no-member,
-       not-callable,
-       no-value-for-parameter,
-       too-many-function-args,
-       unexpected-keyword-arg,
-       redundant-keyword-arg,
-       missing-kwoa,
-       invalid-sequence-index,
-       invalid-slice-index,
-       assignment-from-none,
-       not-context-manager,
-       invalid-unary-operand-type,
-       unsupported-binary-operation,
-       repeated-keyword,
-       not-an-iterable,
-       not-a-mapping,
-       unsupported-membership-test,
-       unsubscriptable-object,
-       unsupported-assignment-operation,
-       unsupported-delete-operation,
-       invalid-metaclass,
-       unhashable-dict-key,
-       logging-unsupported-format,
-       logging-format-truncated,
-       logging-too-many-args,
-       logging-too-few-args,
-       bad-format-character,
-       truncated-format-string,
-       mixed-format-string,
-       format-needs-mapping,
-       missing-format-string-key,
-       too-many-format-args,
-       too-few-format-args,
-       bad-string-format-type,
-       bad-str-strip-call,
-       invalid-envvar-value,
-       yield-inside-async-function,
-       not-async-context-manager,
-       fatal,
-       astroid-error,
-       parse-error,
-       method-check-failed
-
-
-[REPORTS]
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details.
-#msg-template=
-
-# Set the output format. Available formats are text, parseable, colorized, json
-# and msvs (visual studio). You can also give a reporter class, e.g.
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages.
-reports=no
-
-# Activate the evaluation score.
-score=no
-
-
-[REFACTORING]
-
-# Maximum number of nested blocks for function / method body
-max-nested-blocks=5
-
-# Complete name of functions that never returns. When checking for
-# inconsistent-return-statements if a never returning function is called then
-# it will be considered as an explicit return statement and no message will be
-# printed.
-never-returning-functions=sys.exit
-
-
-[LOGGING]
-
-# Format style used to check logging format string. `old` means using %
-# formatting, while `new` is for `{}` formatting.
-logging-format-style=old
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format.
-logging-modules=logging
-
-
-[SPELLING]
-
-# Limits count of emitted suggestions for spelling mistakes.
-max-spelling-suggestions=4
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package..
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,
-      XXX,
-      TODO
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=cltypes.*,gl_platform.*,mako.template
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# Tells whether to warn about missing members when the owner of the attribute
-# is inferred to be None.
-ignore-none=yes
-
-# This flag controls whether pylint should warn about no-member and similar
-# checks whenever an opaque object is returned when inferring. The inference
-# can return multiple potential results while evaluating a Python object, but
-# some branches might not be evaluated, which results in partial inference. In
-# that case, it might be useful to still emit no-member and other checks for
-# the rest of the inferred objects.
-ignore-on-opaque-inference=yes
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# Show a hint with possible names when a member name was not found. The aspect
-# of finding the hint is based on edit distance.
-missing-member-hint=yes
-
-# The minimum edit distance a name should have in order to be considered a
-# similar match for a missing member name.
-missing-member-hint-distance=1
-
-# The total number of similar names that should be taken in consideration when
-# showing a hint for a missing member.
-missing-member-max-choices=1
-
-
-[VARIABLES]
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid defining new builtins when possible.
-additional-builtins=
-
-# Tells whether unused global variables should be treated as a violation.
-allow-global-unused-variables=yes
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,
-          _cb
-
-# A regular expression matching the name of dummy variables (i.e. expected to
-# not be used).
-dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore.
-ignored-argument-names=_.*|^ignored_|^unused_
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
-
-
-[FORMAT]
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
-
-# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=4
-
-# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
-# tab).
-indent-string='    '
-
-# Maximum number of characters on a single line.
-max-line-length=100
-
-# Maximum number of lines in a module.
-max-module-lines=1000
-
-# List of optional constructs for which whitespace checking is disabled. `dict-
-# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
-# `trailing-comma` allows a space between comma and closing bracket: (a, ).
-# `empty-line` allows space-only lines.
-no-space-check=trailing-comma,
-               dict-separator
-
-# Allow the body of a class to be on the same line as the declaration if body
-# contains single statement.
-single-line-class-stmt=no
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=no
-
-
-[SIMILARITIES]
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-
-[BASIC]
-
-# Naming style matching correct argument names.
-argument-naming-style=snake_case
-
-# Regular expression matching correct argument names. Overrides argument-
-# naming-style.
-#argument-rgx=
-
-# Naming style matching correct attribute names.
-attr-naming-style=snake_case
-
-# Regular expression matching correct attribute names. Overrides attr-naming-
-# style.
-#attr-rgx=
-
-# Bad variable names which should always be refused, separated by a comma.
-bad-names=foo,
-          bar,
-          baz,
-          toto,
-          tutu,
-          tata
-
-# Naming style matching correct class attribute names.
-class-attribute-naming-style=any
-
-# Regular expression matching correct class attribute names. Overrides class-
-# attribute-naming-style.
-#class-attribute-rgx=
-
-# Naming style matching correct class names.
-class-naming-style=PascalCase
-
-# Regular expression matching correct class names. Overrides class-naming-
-# style.
-#class-rgx=
-
-# Naming style matching correct constant names.
-const-naming-style=UPPER_CASE
-
-# Regular expression matching correct constant names. Overrides const-naming-
-# style.
-#const-rgx=
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=-1
-
-# Naming style matching correct function names.
-function-naming-style=snake_case
-
-# Regular expression matching correct function names. Overrides function-
-# naming-style.
-#function-rgx=
-
-# Good variable names which should always be accepted, separated by a comma.
-good-names=i,
-           j,
-           k,
-           ex,
-           Run,
-           _
-
-# Include a hint for the correct naming format with invalid-name.
-include-naming-hint=no
-
-# Naming style matching correct inline iteration names.
-inlinevar-naming-style=any
-
-# Regular expression matching correct inline iteration names. Overrides
-# inlinevar-naming-style.
-#inlinevar-rgx=
-
-# Naming style matching correct method names.
-method-naming-style=snake_case
-
-# Regular expression matching correct method names. Overrides method-naming-
-# style.
-#method-rgx=
-
-# Naming style matching correct module names.
-module-naming-style=snake_case
-
-# Regular expression matching correct module names. Overrides module-naming-
-# style.
-#module-rgx=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=^_
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-# These decorators are taken in consideration only for invalid-name.
-property-classes=abc.abstractproperty
-
-# Naming style matching correct variable names.
-variable-naming-style=snake_case
-
-# Regular expression matching correct variable names. Overrides variable-
-# naming-style.
-#variable-rgx=
-
-
-[IMPORTS]
-
-# Allow wildcard imports from modules that define __all__.
-allow-wildcard-with-all=no
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-# Deprecated modules which should not be used, separated by a comma.
-deprecated-modules=optparse,tkinter.tix
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled).
-ext-import-graph=
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled).
-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled).
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=cls
-
-
-[DESIGN]
-
-# Maximum number of arguments for function / method.
-max-args=5
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Maximum number of boolean expressions in an if statement.
-max-bool-expr=5
-
-# Maximum number of branch for function / method body.
-max-branches=12
-
-# Maximum number of locals for function / method body.
-max-locals=15
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-# Maximum number of return / yield for function / method body.
-max-returns=6
-
-# Maximum number of statements in function / method body.
-max-statements=50
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=2
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception".
-overgeneral-exceptions=Exception
diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8c8533810e9fdcefd769f8d64d2fcd1b159db531
--- /dev/null
+++ b/.pylintrc-local.yml
@@ -0,0 +1,7 @@
+- arg: ignore
+  val: compyte
+- arg: generated-members
+  val:
+    - cltypes.*
+    - gl_platform.*
+    - mako.template
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index b7824b0bb6b3cdbc4070215affce4e3ae7f1751d..498f1a6b333eafd594b2c7982ee3b2f93913b742 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -1,12 +1,12 @@
-name: test-conda-env-py3
+name: test-conda-env
 channels:
 - conda-forge
-- defaults
+- nodefaults
 
 dependencies:
+- python=3
 - git
-- conda-forge::numpy
+- numpy
 - pocl
-- osx-pocl-opencl
 - mako
 - pybind11
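
A minimal local sketch of what the miniconda CI helper does with this environment file (assumptions: conda is installed and initialized for the shell, and .conda-ci-build-configure.sh from this change is used to point the build at the conda-provided OpenCL headers and ICD loader; the real helper additionally handles doctests and JUnit reporting):

    conda env create --file .test-conda-env-py3.yml   # creates the "test-conda-env" environment
    conda activate test-conda-env
    bash .conda-ci-build-configure.sh                  # runs configure.py with the conda CL include/lib paths
    pip install --no-build-isolation -e .
    cd test && python -m pytest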
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..03815e36d3fb8b366a77da6c5b0aee662977861b
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,23 @@
+matrix:
+  include:
+  - sudo: required
+    services:
+      - docker
+    arch: amd64
+    env:
+      - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64
+  - sudo: required
+    services:
+      - docker
+    arch: amd64
+    env:
+      - DOCKER_IMAGE=quay.io/pypa/manylinux1_i686
+      - PRE_CMD=linux32
+install:
+  - docker pull $DOCKER_IMAGE
+script:
+  - pwd
+  - ls -la
+  - if [[ "${TRAVIS_TAG}" == "" ]]; then unset TWINE_USERNAME; fi
+  - docker run --rm -v `pwd`:/io -e TWINE_USERNAME -e TWINE_PASSWORD $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh
+  - ls wheelhouse/
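
travis/build-wheels.sh itself is not part of this diff; what follows is a hypothetical sketch of the standard manylinux wheel-building loop such a script typically performs inside the quay.io/pypa images (names and paths are illustrative, not taken from the actual script):

    # build one wheel per CPython in the manylinux image, then repair/retag it
    for PYBIN in /opt/python/cp3*/bin; do
        "$PYBIN/pip" wheel /io/ -w /io/wheelhouse/
    done
    for whl in /io/wheelhouse/pyopencl-*.whl; do
        auditwheel repair "$whl" -w /io/wheelhouse/
    done
    # uploading would only happen for tagged builds; TWINE_USERNAME is unset otherwise (see the script: step above)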
diff --git a/MANIFEST.in b/MANIFEST.in
index 89c7cb4d98214c2d9dd9b4eb5d21a63a00e433d5..ac712a49fae8cbae4349becc3afe56093578de97 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -22,6 +22,7 @@ include configure.py
 include Makefile.in
 include aksetup_helper.py
 include README_SETUP.txt
+include pyproject.toml
 include README.rst
 include LICENSE
 
diff --git a/README.rst b/README.rst
index 4f75c30d300be74d6c7e8f90dfab627141cdd62c..58fbdb0a67cfdc0f5d3431d053056b26ea648a82 100644
--- a/README.rst
+++ b/README.rst
@@ -2,17 +2,24 @@ PyOpenCL: Pythonic Access to OpenCL, with Arrays and Algorithms
 ---------------------------------------------------------------
 
 .. image:: https://gitlab.tiker.net/inducer/pyopencl/badges/master/pipeline.svg
-   :target: https://gitlab.tiker.net/inducer/pyopencl/commits/master
+    :alt: Gitlab Build Status
+    :target: https://gitlab.tiker.net/inducer/pyopencl/commits/master
+.. image:: https://github.com/inducer/pyopencl/workflows/CI/badge.svg?branch=master&event=push
+    :alt: Github Build Status
+    :target: https://github.com/inducer/pyopencl/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush
 .. image:: https://badge.fury.io/py/pyopencl.png
-  :target: http://pypi.python.org/pypi/pyopencl
+    :alt: Python Package Index Release Page
+    :target: https://pypi.org/project/pyopencl/
+
+(Also: `Travis CI <https://travis-ci.org/inducer/pyopencl/builds>`_ is used to build binary wheels for releases; see `#264 <https://github.com/inducer/pyopencl/pull/264>`_.)
 
 PyOpenCL lets you access GPUs and other massively parallel compute
 devices from Python. It tries to offer computing goodness in the
-spirit of its sister project `PyCUDA <http://mathema.tician.de/software/pycuda>`_:
+spirit of its sister project `PyCUDA <https://mathema.tician.de/software/pycuda>`_:
 
 * Object cleanup tied to lifetime of objects. This idiom, often
   called
-  `RAII <http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization>`_
+  `RAII <https://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization>`_
   in C++, makes it much easier to write correct, leak- and
   crash-free code.
 
@@ -26,11 +33,11 @@ spirit of its sister project `PyCUDA <http://mathema.tician.de/software/pycuda>`
 * Speed. PyOpenCL's base layer is written in C++, so all the niceties
   above are virtually free.
 
-* Helpful and complete `Documentation <http://documen.tician.de/pyopencl>`_
-  as well as a `Wiki <http://wiki.tiker.net/PyOpenCL>`_.
+* Helpful and complete `Documentation <https://documen.tician.de/pyopencl>`__
+  as well as a `Wiki <https://wiki.tiker.net/PyOpenCL>`_.
 
 * Liberal license. PyOpenCL is open-source under the 
-  `MIT license <http://en.wikipedia.org/wiki/MIT_License>`_
+  `MIT license <https://en.wikipedia.org/wiki/MIT_License>`_
   and free for commercial, academic, and private use.
 
 * Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's 
@@ -38,22 +45,22 @@ spirit of its sister project `PyCUDA <http://mathema.tician.de/software/pycuda>`
 
 Simple 4-step `install instructions <https://documen.tician.de/pyopencl/misc.html#installation>`_
 using Conda on Linux and macOS (that also install a working OpenCL implementation!)
-can be found in the `documentation <https://documen.tician.de/pyopencl/>`_.
+can be found in the `documentation <https://documen.tician.de/pyopencl/>`__.
 
 What you'll need if you do *not* want to use the convenient instructions above and
 instead build from source:
 
 *   gcc/g++ new enough to be compatible with pybind11
     (see their `FAQ <https://pybind11.readthedocs.io/en/stable/faq.html>`_)
-*   `numpy <http://numpy.org>`_, and
-*   an OpenCL implementation. (See this `howto <http://wiki.tiker.net/OpenCLHowTo>`_ for how to get one.)
+*   `numpy <https://numpy.org>`_, and
+*   an OpenCL implementation. (See this `howto <https://wiki.tiker.net/OpenCLHowTo>`_ for how to get one.)
 
 Places on the web related to PyOpenCL:
 
-* `Python package index <http://pypi.python.org/pypi/pyopencl>`_ (download releases)
+* `Python package index <https://pypi.python.org/pypi/pyopencl>`_ (download releases)
 
-* `Documentation <http://documen.tician.de/pyopencl>`_ (read how things work)
+* `Documentation <https://documen.tician.de/pyopencl>`__ (read how things work)
 * `Conda Forge <https://anaconda.org/conda-forge/pyopencl>`_ (download binary packages for Linux, macOS, Windows)
-* `C. Gohlke's Windows binaries <http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
-* `Github <http://github.com/inducer/pyopencl>`_ (get latest source code, file bugs)
-* `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
+* `C. Gohlke's Windows binaries <https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
+* `Github <https://github.com/inducer/pyopencl>`_ (get latest source code, file bugs)
+* `Wiki <https://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
diff --git a/aksetup_helper.py b/aksetup_helper.py
index e86f4cd8e8e193d014462b57b4b0a79357558698..a168adf7a381ca8c33d005493030bdf7fab9addf 100644
--- a/aksetup_helper.py
+++ b/aksetup_helper.py
@@ -162,10 +162,12 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3):
         from distutils import sysconfig
 
         cvars = sysconfig.get_config_vars()
-        cflags = cvars.get('OPT')
+
+        bad_prefixes = ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']
+
+        cflags = cvars.get("OPT")
         if cflags:
-            cflags = remove_prefixes(cflags.split(),
-                    ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG'])
+            cflags = remove_prefixes(cflags.split(), bad_prefixes)
             if debug:
                 cflags.append("-g")
             else:
@@ -175,11 +177,17 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3):
                     cflags.append("-O%s" % what_opt)
                     cflags.append("-DNDEBUG")
 
-            cvars['OPT'] = str.join(' ', cflags)
-            if "BASECFLAGS" in cvars:
-                cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars["OPT"]
-            else:
-                assert "CFLAGS" in cvars
+            cvars["OPT"] = str.join(' ', cflags)
+
+        cflags = cvars.get("CONFIGURE_CFLAGS")
+        if cflags:
+            cflags = remove_prefixes(cflags.split(), bad_prefixes)
+            cvars["CONFIGURE_CFLAGS"] = str.join(' ', cflags)
+
+        if "BASECFLAGS" in cvars:
+            cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars.get("OPT", "")
+        else:
+            assert "CFLAGS" in cvars
 
         if fast_link:
             for varname in ["LDSHARED", "BLDSHARED"]:
@@ -917,7 +925,9 @@ def cpp_flag(compiler):
 
 C++14 is preferred over C++11 (when it is available).
     """
-    if has_flag(compiler, '-std=c++14'):
+    if has_flag(compiler, '-std=gnu++14'):
+        return '-std=gnu++14'
+    elif has_flag(compiler, '-std=c++14'):
         return '-std=c++14'
     elif has_flag(compiler, '-std=c++11'):
         return '-std=c++11'
@@ -933,23 +943,42 @@ class PybindBuildExtCommand(NumpyBuildExtCommand):
         'unix': [],
     }
 
-    if sys.platform == 'darwin':
-        c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
-
     def build_extensions(self):
         ct = self.compiler.compiler_type
         opts = self.c_opts.get(ct, [])
+        cxx_opts = []
+
         if ct in ['unix', 'mingw32']:
             opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
-            opts.append(cpp_flag(self.compiler))
+            cxx_opts.append(cpp_flag(self.compiler))
             if has_flag(self.compiler, '-fvisibility=hidden'):
                 opts.append('-fvisibility=hidden')
+            if sys.platform == 'darwin':
+                if has_flag(self.compiler, '-stdlib=libc++'):
+                    opts.append('-stdlib=libc++')
+                if has_flag(self.compiler, '-mmacosx-version-min=10.7'):
+                    opts.append('-mmacosx-version-min=10.7')
         elif ct == 'msvc':
             opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
         for ext in self.extensions:
             ext.extra_compile_args = ext.extra_compile_args + opts
 
-        NumpyBuildExtCommand.build_extensions(self)
+        prev__compile = self.compiler._compile
+
+        # -std=... used on C files causes an error on Apple LLVM
+        # https://gitlab.tiker.net/inducer/pymetis/-/jobs/102421
+        def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+            if ext == ".cpp":
+                cc_args = cc_args + cxx_opts
+
+            return prev__compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
+
+        self.compiler._compile = _compile
+
+        try:
+            NumpyBuildExtCommand.build_extensions(self)
+        finally:
+            self.compiler._compile = prev__compile
 
 # }}}
 
diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py
index 66f6e1dbfa80c2647313177730153ee75e80f4d0..371b012034300f7db6016468eeea0c9c3d448585 100644
--- a/contrib/fortran-to-opencl/translate.py
+++ b/contrib/fortran-to-opencl/translate.py
@@ -1,8 +1,4 @@
-from __future__ import division, with_statement
-from __future__ import absolute_import
-from __future__ import print_function
 import six
-from six.moves import range
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -366,12 +362,12 @@ class ComplexCCodeMapper(CCodeMapperBase):
 
             complex_sum = self.rec(complexes[0], myprec)
             for child in complexes[1:]:
-                complex_sum = "%s_add(%s, %s)" % (
+                complex_sum = "{}_add({}, {})".format(
                         tgt_name, complex_sum,
                         self.rec(child, PREC_NONE))
 
             if real_sum:
-                result = "%s_add(%s_fromreal(%s), %s)" % (
+                result = "{}_add({}_fromreal({}), {})".format(
                         tgt_name, tgt_name, real_sum, complex_sum)
             else:
                 result = complex_sum
@@ -402,12 +398,12 @@ class ComplexCCodeMapper(CCodeMapperBase):
 
             complex_prd = self.rec(complexes[0], myprec)
             for child in complexes[1:]:
-                complex_prd = "%s_mul(%s, %s)" % (
+                complex_prd = "{}_mul({}, {})".format(
                         tgt_name, complex_prd,
                         self.rec(child, PREC_NONE))
 
             if real_prd:
-                result = "%s_rmul(%s, %s)" % (tgt_name, real_prd, complex_prd)
+                result = f"{tgt_name}_rmul({real_prd}, {complex_prd})"
             else:
                 result = complex_prd
 
@@ -423,17 +419,17 @@ class ComplexCCodeMapper(CCodeMapperBase):
         if not (n_complex or d_complex):
             return CCodeMapperBase.map_quotient(self, expr, enclosing_prec)
         elif n_complex and not d_complex:
-            return "%s_divider(%s, %s)" % (
+            return "{}_divider({}, {})".format(
                     complex_type_name(tgt_dtype),
                     self.rec(expr.numerator, PREC_NONE),
                     self.rec(expr.denominator, PREC_NONE))
         elif not n_complex and d_complex:
-            return "%s_rdivide(%s, %s)" % (
+            return "{}_rdivide({}, {})".format(
                     complex_type_name(tgt_dtype),
                     self.rec(expr.numerator, PREC_NONE),
                     self.rec(expr.denominator, PREC_NONE))
         else:
-            return "%s_divide(%s, %s)" % (
+            return "{}_divide({}, {})".format(
                     complex_type_name(tgt_dtype),
                     self.rec(expr.numerator, PREC_NONE),
                     self.rec(expr.denominator, PREC_NONE))
@@ -460,12 +456,12 @@ class ComplexCCodeMapper(CCodeMapperBase):
                 e_complex = 'c' == self.infer_type(expr.exponent).kind
 
                 if b_complex and not e_complex:
-                    return "%s_powr(%s, %s)" % (
+                    return "{}_powr({}, {})".format(
                             complex_type_name(tgt_dtype),
                             self.rec(expr.base, PREC_NONE),
                             self.rec(expr.exponent, PREC_NONE))
                 else:
-                    return "%s_pow(%s, %s)" % (
+                    return "{}_pow({}, {})".format(
                             complex_type_name(tgt_dtype),
                             self.rec(expr.base, PREC_NONE),
                             self.rec(expr.exponent, PREC_NONE))
@@ -522,7 +518,7 @@ class CCodeMapper(ComplexCCodeMapper):
             if name == "dble":
                 name = "real"
 
-            name = "%s_%s" % (
+            name = "{}_{}".format(
                     complex_type_name(tgt_dtype),
                     name)
 
@@ -532,7 +528,7 @@ class CCodeMapper(ComplexCCodeMapper):
             if name == "aimag":
                 name = "imag"
 
-            name = "%s_%s" % (
+            name = "{}_{}".format(
                     complex_type_name(arg_dtype),
                     name)
 
@@ -568,7 +564,7 @@ class CCodeMapper(ComplexCCodeMapper):
         from pymbolic.mapper.stringifier import PREC_NONE
         if expr.dtype.kind == "c":
             r, i = expr.value
-            return "%s_new(%s, %s)" % (
+            return "{}_new({}, {})".format(
                     complex_type_name(expr.dtype),
                     self.rec(r, PREC_NONE),
                     self.rec(i, PREC_NONE))
@@ -581,7 +577,7 @@ class CCodeMapper(ComplexCCodeMapper):
 
 # }}}
 
-class Scope(object):
+class Scope:
     def __init__(self, subprogram_name, arg_names=set()):
         self.subprogram_name = subprogram_name
 
@@ -608,8 +604,8 @@ class Scope(object):
 
     def known_names(self):
         return (self.used_names
-                | set(six.iterkeys(self.dim_map))
-                | set(six.iterkeys(self.type_map)))
+                | set(self.dim_map.keys())
+                | set(self.type_map.keys()))
 
     def is_known(self, name):
         return (name in self.used_names
@@ -643,12 +639,12 @@ class Scope(object):
     def translate_var_name(self, name):
         shape = self.dim_map.get(name)
         if name in self.data and shape is not None:
-            return "%s_%s" % (self.subprogram_name, name)
+            return f"{self.subprogram_name}_{name}"
         else:
             return name
 
 
-class FTreeWalkerBase(object):
+class FTreeWalkerBase:
     def __init__(self):
         self.scope_stack = []
 
@@ -675,7 +671,7 @@ class FTreeWalkerBase(object):
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)"
-            "(\((?P<shape>[-+*0-9:a-zA-Z,]+)\))?$")
+            r"(\((?P<shape>[-+*0-9:a-zA-Z,]+)\))?$")
 
     def parse_dimension_specs(self, dim_decls):
         def parse_bounds(bounds_str):
@@ -949,7 +945,7 @@ class F2CLTranslator(FTreeWalkerBase):
 
             if shape is not None:
                 dim_stmt = cgen.Statement(
-                    "dimension \"fortran\" %s[%s]" % (
+                    "dimension \"fortran\" {}[{}]".format(
                         scope.translate_var_name(name),
                         ", ".join(gen_shape(s) for s in shape)
                         ))
@@ -975,7 +971,7 @@ class F2CLTranslator(FTreeWalkerBase):
                             cgen.Initializer(
                                 CLConstant(
                                     cgen.ArrayOf(self.get_declarator(
-                                        "%s_%s" % (scope.subprogram_name, name)))),
+                                        f"{scope.subprogram_name}_{name}"))),
                                 "{ %s }" % ",\n".join(self.gen_expr(x) for x in data)
                                 ))
             else:
@@ -1231,11 +1227,11 @@ class F2CLTranslator(FTreeWalkerBase):
             cast = self.force_casts.get(
                     (node.designator, i))
             if cast is not None:
-                result = "(%s) (%s)" % (cast, result)
+                result = f"({cast}) ({result})"
 
             return result
 
-        return cgen.Statement("%s(%s)" % (
+        return cgen.Statement("{}({})".format(
             node.designator,
             ", ".join(transform_arg(i, arg_str)
                 for i, arg_str in enumerate(node.items))))
@@ -1328,9 +1324,9 @@ class F2CLTranslator(FTreeWalkerBase):
                 comp_op = "<="
 
             return cgen.For(
-                    "%s = %s" % (loop_var, self.gen_expr(start)),
-                    "%s %s %s" % (loop_var, comp_op, self.gen_expr(stop)),
-                    "%s += %s" % (loop_var, self.gen_expr(step)),
+                    "{} = {}".format(loop_var, self.gen_expr(start)),
+                    "{} {} {}".format(loop_var, comp_op, self.gen_expr(stop)),
+                    "{} += {}".format(loop_var, self.gen_expr(step)),
                     cgen.block_if_necessary(body))
 
         else:
diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index cbaf1e9305c61d0e8928bbd46da2bdb6fd2aef83..2ff63e07ef24e46ef8e1e3c63ae290d5663a8f3e 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -19,7 +19,7 @@ evaluate multi-stage expressions on one or several operands in a single pass.
     .. method:: __call__(*args, wait_for=None)
 
         Invoke the generated scalar kernel. The arguments may either be scalars or
-        :class:`GPUArray` instances.
+        :class:`pyopencl.array.Array` instances.
 
         |std-enqueue-blurb|
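
As an illustrative aside (not part of this patch), invoking such a kernel might look like the
following minimal sketch; the kernel name ``lin_comb`` and its argument list are made up for
the example::

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.elementwise import ElementwiseKernel

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a = cl_array.to_device(queue, np.random.rand(1000).astype(np.float32))
    b = cl_array.to_device(queue, np.random.rand(1000).astype(np.float32))
    out = cl_array.empty_like(a)

    # "c" is passed as a scalar; "a", "b", and "out" as Array instances.
    lin_comb = ElementwiseKernel(ctx,
            "float c, float *a, float *b, float *out",
            "out[i] = c*a[i] + b[i]",
            "lin_comb")

    evt = lin_comb(4, a, b, out)  # enqueues the kernel and returns an event
    evt.wait()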
 
@@ -110,7 +110,7 @@ Prefix Sums ("scan")
     or include statements.
 
 A prefix sum is a running sum of an array, as provided by
-e.g. :mod:`numpy.cumsum`::
+e.g. :func:`numpy.cumsum`::
 
     >>> import numpy as np
     >>> a = [1,1,1,1,1,2,2,2,2,2]
@@ -169,7 +169,8 @@ in PyOpenCL:
 * Segmented scans
 
 * Access to the previous item in *input_expr* (e.g. for comparisons)
-  See the `implementation <https://github.com/inducer/pyopencl/blob/master/pyopencl/scan.py#L1353>`_ of :func:`unique` for an example.
+  See the `implementation <https://github.com/inducer/pyopencl/blob/master/pyopencl/scan.py#L1353>`_
+  of :func:`pyopencl.algorithm.unique` for an example.
 
 Making Custom Scan Kernels
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
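
As an illustrative aside (not part of this patch), a custom scan kernel built with
``GenericScanKernel`` might look roughly like this sketch of an inclusive prefix sum;
argument names are invented for the example::

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.scan import GenericScanKernel

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # scan_expr combines two partial results, input_expr fetches one input
    # item, and output_statement stores one scanned item.
    knl = GenericScanKernel(
            ctx, np.int32,
            arguments="int *y, int *x",
            input_expr="x[i]",
            scan_expr="a+b", neutral="0",
            output_statement="y[i] = item;")

    x = cl_array.arange(queue, 10000, dtype=np.int32)
    y = cl_array.empty_like(x)
    knl(y, x, queue=queue)

    assert (y.get() == np.cumsum(x.get())).all()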
diff --git a/doc/conf.py b/doc/conf.py
index 7c3707fd55061faa25503d18422c545192a55aed..2b47e9e4bf04b05d848814a78d0ceccf0d78aa9d 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import absolute_import
-
 # PyOpenCL documentation build configuration file, created by
 # sphinx-quickstart on Fri Jun 13 00:51:19 2008.
 #
@@ -123,7 +119,7 @@ html_sidebars = {
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+#html_static_path = ['_static']
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
@@ -192,9 +188,9 @@ latex_documents = [
 #latex_use_modindex = True
 
 intersphinx_mapping = {
-        'http://docs.python.org/dev': None,
-        'http://docs.scipy.org/doc/numpy/': None,
-        'http://docs.makotemplates.org/en/latest/': None,
+        'https://docs.python.org/dev': None,
+        'https://numpy.org/doc/stable/': None,
+        'https://docs.makotemplates.org/en/latest/': None,
         }
 
 autoclass_content = "both"
diff --git a/doc/make_constants.py b/doc/make_constants.py
index 9ab78ad070ec6d0cc419458335a75ed44f9c9a16..2e20383bab957c2d26e84f066804cd4ea925be4c 100644
--- a/doc/make_constants.py
+++ b/doc/make_constants.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, print_function
-
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
diff --git a/doc/misc.rst b/doc/misc.rst
index 0b90781b543254e7399bc9b8f8672204556c26cf..b8d5c22e2477b1e9d9f2e2ca53a992702592dbb2 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -1,14 +1,19 @@
 Installation
 ============
 
+Installing from Conda Forge
+---------------------------
+
 By far the easiest way to install PyOpenCL is to use the packages available in
 `Conda Forge <https://conda-forge.org/>`_. Conda Forge is a repository of
 community-maintained packages for the `Conda <https://conda.io/docs/>`_
 package manager.
 
-On Linux and OS X, the following set of instructions should work:
+On Linux or OS X, the following set of instructions should work:
 
-#.  Install a version of `miniconda <https://conda.io/miniconda.html>`_
+#.  Install a version of
+    `miniforge <https://github.com/conda-forge/miniforge#miniforge3>`_
+    or `miniconda <https://conda.io/miniconda.html>`_
     that fits your system. Both Python 2 and Python 3 work.
     You can install these pieces of software in your user account and
     do not need root/administrator privileges.
@@ -37,20 +42,34 @@ with PyOpenCL from Conda Forge.
 
 It is important to note that OpenCL is not restricted to GPUs. In fact, no special
 hardware is required to use OpenCL for computation--your existing CPU is enough.
-On Linux, type:
+On Linux or macOS, type:
 
 #.  ``conda install pocl``
 
 to install a CPU-based OpenCL driver. On Windows, you may install e.g.
 the `CPU OpenCL driver from Intel <https://software.intel.com/en-us/articles/opencl-drivers#latest_CPU_runtime>`_.
-OS X has support for OpenCL built into the operating system and does not need
-additional software to run code based on PyOpenCL (but see below).
+On macOS, pocl can offer a marked robustness (and, sometimes, performance)
+improvement over the OpenCL drivers built into the operating system.
+
+On Linux and macOS, you can use Oclgrind to detect memory access errors.
+
+#. ``conda install oclgrind``
+
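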
+On Linux, on Intel Broadwell or newer processors with integrated Intel graphics,
+you can use NEO.
+
+#. ``conda install intel-compute-runtime``
+
+On Linux, on Intel Sandy Bridge or newer processors with integrated Intel graphics,
+you can use Beignet.
+
+#. ``conda install beignet``
 
 You are now ready to run code based on PyOpenCL, such as the `code
 examples <https://github.com/inducer/pyopencl/tree/master/examples>`_.
 
-Using vendor-supplied OpenCL drivers (Linux)
---------------------------------------------
+Using vendor-supplied OpenCL drivers (mainly on Linux)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The instructions above help you get a basic OpenCL environment going that
 will work independently of whether you have specialized hardware (such as GPUs
@@ -76,33 +95,49 @@ qualified path names of the shared library providing the OpenCL driver.
     Note that you should replace ``ENVIRONMENTNAME`` with the name of your environment,
     shown between parentheses on your command line prompt.
 
-If you have other OpenCL drivers installed (such as for your GPU), those will be
+On Linux, if you have other OpenCL drivers installed (such as for your GPU), those will be
 in :file:`/etc/OpenCL/vendors`. You can make them work with PyOpenCL from Conda Forge
-by simply copying them to the above folder.
+by using the command::
+
+    conda install ocl-icd-system
+
+This will make sure these system-wide ICDs are also visible in your conda environment.
 
 If you are looking for more information, see `ocl-icd
 <https://github.com/OCL-dev/ocl-icd>`_ and its documentation. Ocl-icd is the
-"ICD loader" used by PyOpenCL when installed from Conda Forge. It represents the
-code behind :file:`libOpenCL.so`.
+"ICD loader" used by PyOpenCL when installed from Conda Forge on Linux.
+It represents the code behind :file:`libOpenCL.so`.
+
+On macOS, the packaging of PyOpenCL for Conda Forge relies on the
+`Khronos ICD Loader <https://github.com/KhronosGroup/OpenCL-ICD-Loader>`_,
+and it is packaged so that the OpenCL drivers built into the operating system
+are automatically available, in addition to other ICDs installed manually.
 
-Getting a better CPU-based OpenCL driver (OS X)
------------------------------------------------
+Installing from PyPI with Linux wheels
+--------------------------------------
 
-OS X has support for both CPU- and GPU-based OpenCL built in. Unfortunately,
-the built-in drivers can be temperamental, and they have not advanced as quickly
-as one might like. To make PyOpenCL use a more up-to-date (and open-source)
-CPU-based OpenCL driver, type the following:
+PyOpenCL distributes manylinux1 wheels on PyPI. These wheels are compatible with
+distributions based on glibc 2.5 or newer.
 
-``conda install osx-pocl-opencl pocl pyopencl`` (OS X)
+On Linux, type
 
-Note that, by installing ``osx-pocl-opencl``, you will no longer be able to
-use PyOpenCL to talk to the system-wide Apple OpenCL drivers. To regain access
-to those drivers, simply uninstall ``osx-pocl-opencl`` and reinstall ``pyopencl``
-afterwards.
+#.  ``pip install pyopencl``
+
+The wheels come with OCL-ICD bundled and configured to use any OpenCL implementation
+supporting the ICD interface that is installed in :file:`/etc/OpenCL/vendors`.
+
+You can also install the following OpenCL implementations from PyPI, shipped as binary
+wheels. Note that PyOpenCL itself has to be installed from a wheel in order to recognize
+these wheels.
+
+To install PyOpenCL together with pocl, a CPU-based OpenCL implementation, run:
+
+#.  ``pip install pyopencl[pocl]``
+
+To install PyOpenCL together with oclgrind, an OpenCL debugger, run:
+
+#.  ``pip install pyopencl[oclgrind]``
 
-In addition, you will also be unaffected by Apple's pending deprecation of
-OpenCL functionality--you'll be able to keep using OpenCL irrespective of what
-Apple does.
 
 Installing from source
 ----------------------
@@ -213,14 +248,23 @@ other software to be turned into the corresponding :mod:`pyopencl` objects.
 User-visible Changes
 ====================
 
-Version 2018.2
+Version 2020.3
 --------------
-
 .. note::
 
     This version is currently under development. You can get snapshots from
     PyOpenCL's `git repository <https://github.com/inducer/pyopencl>`_
 
+Version 2020.2
+--------------
+
+* Drop Python 2 support.
+* Add ``allow_empty_ndrange`` to kernel enqueue.
+* Bug fixes.
+
+Version 2018.2
+--------------
+
 * Use pybind11.
 * Many bug fixes.
 * Support arrays with offsets in scan kernels.
@@ -337,8 +381,8 @@ Version 2013.1
 * Deprecated :func:`pyopencl.tools.register_dtype` in favor of
   :func:`pyopencl.tools.get_or_register_dtype`.
 * Clean up the :class:`pyopencl.array.Array` constructor interface.
-* Deprecate :class:`pyopencl.array.DefaultAllocator`.
-* Deprecate :class:`pyopencl.tools.CLAllocator`.
+* Deprecate ``pyopencl.array.DefaultAllocator``.
+* Deprecate ``pyopencl.tools.CLAllocator``.
 * Introduce :class:`pyopencl.tools.DeferredAllocator`, :class:`pyopencl.tools.ImmediateAllocator`.
 * Allow arrays whose beginning does not coincide with the beginning of their
   :attr:`pyopencl.array.Array.data` :class:`pyopencl.Buffer`.
@@ -372,7 +416,7 @@ Version 2013.1
     may take a very long time to execute. This is because :mod:`numpy` first
     builds an object array of (compute-device) scalars (!) before it decides that
     that's probably not such a bright idea and finally calls
-    :meth:`pyopencl.array.Array.__rmul__`.
+    ``pyopencl.array.Array.__rmul__``.
 
     Note that only left arithmetic operations of :class:`pyopencl.array.Array`
     by :mod:`numpy` scalars are affected. Python's number types (:class:`float` etc.)
@@ -399,7 +443,7 @@ Version 2012.1
 Version 2011.2
 --------------
 
-* Add :func:`pyopencl.enqueue_migrate_mem_object`.
+* Add :func:`pyopencl.enqueue_migrate_mem_objects`.
 * Add :func:`pyopencl.image_from_array`.
 * IMPORTANT BUGFIX: Kernel caching was broken for all the 2011.1.x releases, with
   severe consequences on the execution time of :class:`pyopencl.array.Array`
@@ -407,7 +451,7 @@ Version 2011.2
   Henrik Andresen at a `PyOpenCL workshop at DTU <http://gpulab.imm.dtu.dk/courses.html>`_
   first noticed the strange timings.
 * All comparable PyOpenCL objects are now also hashable.
-* Add :func:`pyopencl.tools.context_dependent_memoize` to the documented
+* Add ``pyopencl.tools.context_dependent_memoize`` to the documented
   functionality.
 * Base :mod:`pyopencl.clrandom` on `RANLUXCL <https://bitbucket.org/ivarun/ranluxcl>`_,
   add functionality.
@@ -415,13 +459,13 @@ Version 2011.2
 * Add :mod:`pyopencl.characterize`.
 * Ensure compatibility with OS X Lion.
 * Add :func:`pyopencl.tools.register_dtype` to enable scan/reduction on struct types.
-* :func:`pyopencl.enqueue_migrate_mem_object` was renamed
-  :func:`pyopencl.enqueue_migrate_mem_object_ext`.
-  :func:`pyopencl.enqueue_migrate_mem_object` now refers to the OpenCL 1.2 function
+* ``pyopencl.enqueue_migrate_mem_objects`` was renamed
+  ``pyopencl.enqueue_migrate_mem_objects_ext``.
+  :func:`pyopencl.enqueue_migrate_mem_objects` now refers to the OpenCL 1.2 function
   of this name, if available.
-* :func:`pyopencl.create_sub_devices` was renamed
-  :func:`pyopencl.create_sub_devices_ext`.
-  :func:`pyopencl.create_sub_devices` now refers to the OpenCL 1.2 function
+* :meth:`pyopencl.Device.create_sub_devices` was renamed
+  ``pyopencl.Device.create_sub_devices_ext``.
+  :meth:`pyopencl.Device.create_sub_devices` now refers to the OpenCL 1.2 function
   of this name, if available.
 * Alpha support for OpenCL 1.2.
 
@@ -441,14 +485,14 @@ Version 2011.1
 * All *is_blocking* parameters now default to *True* to avoid
   crashy-by-default behavior. (suggested by Jan Meinke)
   In particular, this change affects
-  :func:`pyopencl.enqueue_read_buffer`,
-  :func:`pyopencl.enqueue_write_buffer`,
-  :func:`pyopencl.enqueue_read_buffer_rect`,
-  :func:`pyopencl.enqueue_write_buffer_rect`,
-  :func:`pyopencl.enqueue_read_image`,
-  :func:`pyopencl.enqueue_write_image`,
-  :func:`pyopencl.enqueue_map_buffer`,
-  :func:`pyopencl.enqueue_map_image`.
+  ``pyopencl.enqueue_read_buffer``,
+  ``pyopencl.enqueue_write_buffer``,
+  ``pyopencl.enqueue_read_buffer_rect``,
+  ``pyopencl.enqueue_write_buffer_rect``,
+  ``pyopencl.enqueue_read_image``,
+  ``pyopencl.enqueue_write_image``,
+  ``pyopencl.enqueue_map_buffer``,
+  ``pyopencl.enqueue_map_image``.
 * Add :mod:`pyopencl.reduction`.
 * Add :ref:`reductions`.
 * Add :mod:`pyopencl.scan`.
@@ -490,7 +534,7 @@ Version 0.91.5
 * Add :attr:`pyopencl.ImageFormat.channel_count`,
   :attr:`pyopencl.ImageFormat.dtype_size`,
   :attr:`pyopencl.ImageFormat.itemsize`.
-* Add missing :func:`pyopencl.enqueue_copy_buffer`.
+* Add missing ``pyopencl.enqueue_copy_buffer``.
 * Add :func:`pyopencl.create_some_context`.
 * Add :func:`pyopencl.enqueue_barrier`, which was previously missing.
 
@@ -514,7 +558,7 @@ Version 0.91.2
 
 * :meth:`pyopencl.Program.build` now captures build logs and adds them
   to the exception text.
-* Deprecate :func:`pyopencl.create_context_from_type` in favor of second
+* Deprecate ``pyopencl.create_context_from_type`` in favor of second
   form of :class:`pyopencl.Context` constructor
 * Introduce :class:`pyopencl.LocalMemory`.
 * Document kernel invocation and :meth:`pyopencl.Kernel.set_arg`.
@@ -525,7 +569,7 @@ Version 0.91.1
 * Fixed a number of bugs, notably involving :class:`pyopencl.Sampler`.
 * :class:`pyopencl.Device`, :class:`pyopencl.Platform`,
   :class:`pyopencl.Context` now have nicer string representations.
-* Add :attr:`Image.shape`. (suggested by David Garcia)
+* Add :attr:`pyopencl.Image.shape`. (suggested by David Garcia)
 
 Version 0.91
 ------------
@@ -536,26 +580,26 @@ Version 0.91
 * Add :meth:`pyopencl.ImageFormat.__repr__`.
 * Add :meth:`pyopencl.addressing_mode.to_string` and colleagues.
 * The `pitch` arguments to
-  :func:`pyopencl.create_image_2d`,
-  :func:`pyopencl.create_image_3d`,
-  :func:`pyopencl.enqueue_read_image`, and
-  :func:`pyopencl.enqueue_write_image`
+  ``pyopencl.create_image_2d``,
+  ``pyopencl.create_image_3d``,
+  ``pyopencl.enqueue_read_image``, and
+  ``pyopencl.enqueue_write_image``
   are now defaulted to zero. The argument order of `enqueue_{read,write}_image`
   has changed for this reason.
 * Deprecate
-  :func:`pyopencl.create_image_2d`,
-  :func:`pyopencl.create_image_3d`
+  ``pyopencl.create_image_2d``,
+  ``pyopencl.create_image_3d``
   in favor of the :class:`pyopencl.Image` constructor.
 * Deprecate
-  :func:`pyopencl.create_program_with_source`,
-  :func:`pyopencl.create_program_with_binary`
+  ``pyopencl.create_program_with_source``,
+  ``pyopencl.create_program_with_binary``
   in favor of the :class:`pyopencl.Program` constructor.
 * Deprecate
-  :func:`pyopencl.create_buffer`,
-  :func:`pyopencl.create_host_buffer`
+  ``pyopencl.create_buffer``,
+  ``pyopencl.create_host_buffer``
   in favor of the :class:`pyopencl.Buffer` constructor.
-* :meth:`pyopencl.MemoryObject.get_image_info` now actually exists.
-* Add :attr:`pyopencl.MemoryObject.image.info`.
+* :meth:`pyopencl.Image.get_image_info` now actually exists.
+* Add :attr:`pyopencl.Image.info`.
 * Fix API tracing.
 * Add constructor arguments to :class:`pyopencl.ImageFormat`.  (suggested by David Garcia)
 
@@ -654,3 +698,117 @@ Andreas Klöckner's work on :mod:`pyopencl` was supported in part by
 AK also gratefully acknowledges a hardware gift from Nvidia Corporation.  The
 views and opinions expressed herein do not necessarily reflect those of the
 funding agencies.
+
+Documentation Cross-References
+==============================
+
+Numpy
+-----
+.. currentmodule:: numpy
+
+.. class:: int8
+
+    See :class:`numpy.generic`.
+
+.. class:: int32
+
+    See :class:`numpy.generic`.
+
+.. class:: float64
+
+    See :class:`numpy.generic`.
+
+OpenCL Specification
+--------------------
+.. c:type:: cl_platform_id
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#opencl-platform-layer>`__.
+
+.. c:type:: cl_device_id
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#opencl-platform-layer>`__.
+
+.. c:type:: cl_context
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_contexts>`__.
+
+.. c:type:: cl_command_queue
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_command_queues>`__.
+
+.. c:type:: cl_mem
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_memory_objects>`__.
+
+.. c:type:: cl_program
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_program_objects>`__.
+
+.. c:type:: cl_kernel
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_kernel_objects>`__.
+
+.. c:type:: cl_sampler
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_sampler_objects>`__.
+
+.. c:type:: cl_event
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#event-objects>`__.
+
+.. c:function:: void clCreateCommandQueueWithProperties()
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clCreateCommandQueueWithProperties>`__.
+
+.. c:function:: void clCreateSamplerWithProperties()
+
+   See the  `CL specification <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clCreateSamplerWithProperties>`__.
+
+Internal Types
+--------------
+
+.. currentmodule:: pyopencl._cl
+
+.. class:: Platform
+
+    See :class:`pyopencl.Platform`.
+
+.. class:: Device
+
+    See :class:`pyopencl.Device`.
+
+.. class:: CommandQueue
+
+    See :class:`pyopencl.CommandQueue`.
+
+.. class:: Context
+
+    See :class:`pyopencl.Context`.
+
+.. class:: Event
+
+    See :class:`pyopencl.Event`.
+
+.. class:: SVMAllocation
+
+    See :class:`pyopencl.SVMAllocation`.
+
+.. class:: MemoryMap
+
+    See :class:`pyopencl.MemoryMap`.
+
+.. class:: Sampler
+
+    See :class:`pyopencl.Sampler`.
+
+.. class:: Program
+
+    See :class:`pyopencl.Program`.
+
+.. class:: _Program
+
+    See :class:`pyopencl.Program`.
+
+.. class:: Kernel
+
+    See :class:`pyopencl.Kernel`.
diff --git a/doc/runtime_const.rst b/doc/runtime_const.rst
index 3001cc1143ef19a2e5caac182f2a4359b2d1f572..53ebf1fe96fa60a12598a3f776095f94244a7dcf 100644
--- a/doc/runtime_const.rst
+++ b/doc/runtime_const.rst
@@ -1,4 +1,6 @@
 OpenCL Runtime: Constants
 =========================
 
+.. currentmodule:: pyopencl
+
 .. include:: constants.inc
diff --git a/doc/runtime_gl.rst b/doc/runtime_gl.rst
index a391c173c8f7eb46aef2d5b1a8d8d32217615868..ecc3891d88e701ad6e56272b5518df032bec1bab 100644
--- a/doc/runtime_gl.rst
+++ b/doc/runtime_gl.rst
@@ -51,7 +51,7 @@ with GL support. See :func:`have_gl`.
 
     .. method:: get_gl_texture_info(param)
 
-        See :class:`gl_texture_info` for values of *param*.  Only available when PyOpenCL is compiled with GL support. See :func:`have_gl`.
+        See ``gl_texture_info`` for values of *param*.  Only available when PyOpenCL is compiled with GL support. See :func:`have_gl`.
 
 .. function:: enqueue_acquire_gl_objects(queue, mem_objects, wait_for=None)
 
diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst
index f92f13cd67f91329073d948f3f15e487b724d3e0..ce2ee2227a1311ada8b3074c4bf3ea1d095d9151 100644
--- a/doc/runtime_memory.rst
+++ b/doc/runtime_memory.rst
@@ -191,7 +191,7 @@ Image
     See :class:`mem_flags` for possible values of *flags*
     and :class:`mem_object_type` for possible values of *image_type*.
 
-.. class:: Image(context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None):
+.. class:: Image(context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None)
 
     See :class:`mem_flags` for values of *flags*.
     *shape* is a 2- or 3-tuple. *format* is an instance of :class:`ImageFormat`.
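
As an illustrative aside (not part of this patch), the easiest way to get a device image
from host data is often :func:`image_from_array`, which also sidesteps the axis-order
pitfall discussed in the note further down; shape and contents here are made up::

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()

    # numpy shape (height, width): the last, fastest-varying axis is x,
    # which is what OpenCL images expect.
    host_img = np.random.rand(512, 1024).astype(np.float32)

    img = cl.image_from_array(ctx, host_img, mode="r")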
@@ -206,7 +206,7 @@ Image
 
     .. note::
 
-        If you want to load images from :mod:`numpy.ndarray` instances or read images
+        If you want to load images from :class:`numpy.ndarray` instances or read images
         back into them, be aware that OpenCL images expect the *x* dimension to vary
         fastest, whereas in the default (C) order of :mod:`numpy` arrays, the last index
         varies fastest. If your array is arranged in the wrong order in memory,
@@ -344,7 +344,6 @@ Samplers
 
 .. class:: Sampler
 
-
     .. method:: __init__(context, normalized_coords, addressing_mode, filter_mode)
 
         *normalized_coords* is a :class:`bool` indicating whether
@@ -353,7 +352,7 @@ Samplers
         See :class:`addressing_mode` and :class:`filter_mode` for possible
         argument values.
 
-    .. method:: __init__(context, properties)
+        Also supports an alternate signature ``(context, properties)``.
 
         :arg properties: a sequence
             of keys and values from :class:`sampler_properties` as accepted
@@ -361,9 +360,11 @@ Samplers
             spec for details). The trailing *0* is added automatically
             and does not need to be included.
 
-        Requires OpenCL 2 or newer.
+        This signature requires OpenCL 2 or newer.
+
+        .. versionchanged:: 2018.2
 
-        .. versionadded:: 2018.2
+            The properties-based signature was added.
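
As an illustrative aside (not part of this patch), the two construction styles might be
compared as in the sketch below; the ``sampler_properties`` constant names are assumptions
based on PyOpenCL's usual naming, not taken from this change::

    import pyopencl as cl

    ctx = cl.create_some_context()

    # Classic three-argument form:
    smp = cl.Sampler(ctx, False,
            cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR)

    # Properties-based form (OpenCL 2 or newer); the trailing 0 is
    # appended automatically:
    smp2 = cl.Sampler(ctx, [
        cl.sampler_properties.NORMALIZED_COORDS, False,
        cl.sampler_properties.ADDRESSING_MODE, cl.addressing_mode.CLAMP_TO_EDGE,
        cl.sampler_properties.FILTER_MODE, cl.filter_mode.LINEAR,
        ])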
 
     .. attribute:: info
 
diff --git a/doc/runtime_platform.rst b/doc/runtime_platform.rst
index 6ee8fb661a842e60696019d43aef91be65981e77..51eecdba8710dc9688dd40ec3fc6e04286768b11 100644
--- a/doc/runtime_platform.rst
+++ b/doc/runtime_platform.rst
@@ -58,6 +58,18 @@ Device
     .. automethod:: from_int_ptr
     .. autoattribute:: int_ptr
 
+    .. attribute :: hashable_model_and_version_identifier
+
+        A value of unspecified type that identifies, as precisely as the
+        information available through OpenCL allows, the model and software
+        stack version of a compute device. Note that this identifier does not
+        differentiate between different instances of the same device installed
+        in a single host.
+
+        The returned value is hashable.
+
+        .. versionadded:: 2020.1
+
     .. method:: create_sub_devices(properties)
 
         *properties* is an array of one (or more) of the forms::
@@ -123,12 +135,12 @@ Context
     .. note::
 
         For
-        :attr:`context_properties.CL_GL_CONTEXT_KHR`,
-        :attr:`context_properties.CL_EGL_DISPLAY_KHR`,
-        :attr:`context_properties.CL_GLX_DISPLAY_KHR`,
-        :attr:`context_properties.CL_WGL_HDC_KHR`, and
-        :attr:`context_properties.CL_CGL_SHAREGROUP_KHR`
-        :attr:`context_properties.CL_CGL_SHAREGROUP_APPLE`
+        ``context_properties.CL_GL_CONTEXT_KHR``,
+        ``context_properties.CL_EGL_DISPLAY_KHR``,
+        ``context_properties.CL_GLX_DISPLAY_KHR``,
+        ``context_properties.CL_WGL_HDC_KHR``, and
+        ``context_properties.CL_CGL_SHAREGROUP_KHR``
+        ``context_properties.CL_CGL_SHAREGROUP_APPLE``
         the value in the key-value pair is a PyOpenGL context or display
         instance.
 
diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst
index e95468782e37f3ec9fc1ab18556dfda371dc3adf..18d831eb88ac6d3971651242aeb6367220dc3d9a 100644
--- a/doc/runtime_program.rst
+++ b/doc/runtime_program.rst
@@ -8,6 +8,21 @@ OpenCL Runtime: Programs and Kernels
 Program
 -------
 
+.. envvar:: PYOPENCL_NO_CACHE
+
+    By setting the environment variable :envvar:`PYOPENCL_NO_CACHE` to any
+    non-empty value, caching of compiled kernel binaries (see
+    :meth:`Program.build`) is suppressed.
+
+    .. versionadded:: 2013.1
+
+.. envvar:: PYOPENCL_BUILD_OPTIONS
+
+    Any options found in the environment variable
+    :envvar:`PYOPENCL_BUILD_OPTIONS` will be appended to *options*
+    in :meth:`Program.build`.
+
+    .. versionadded:: 2013.1
+
 .. class:: Program(context, src)
            Program(context, devices, binaries)
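
As an illustrative aside (not part of this patch), both environment variables can be set
from the shell or from Python before any program is built; the script name and build
option below are placeholders::

    # From the shell, for a single run:
    #
    #     PYOPENCL_NO_CACHE=1 PYOPENCL_BUILD_OPTIONS="-cl-fast-relaxed-math" \
    #         python my_script.py
    #
    # or from within Python, before any kernels are built:

    import os
    os.environ["PYOPENCL_NO_CACHE"] = "1"
    os.environ["PYOPENCL_BUILD_OPTIONS"] = "-cl-fast-relaxed-math"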
 
@@ -47,17 +62,13 @@ Program
         If passed *cache_dir* is None and context was created with None cache_dir:
         built binaries will be cached in an on-disk cache called
         :file:`pyopencl-compiler-cache-vN-uidNAME-pyVERSION` in the directory
-        returned by :func:`tempfile.gettempdir`.  By setting the environment
-        variable :envvar:`PYOPENCL_NO_CACHE` to any non-empty value, this
-        caching is suppressed.  Any options found in the environment variable
-        :envvar:`PYOPENCL_BUILD_OPTIONS` will be appended to *options*.
+        returned by :func:`tempfile.gettempdir`.
+
+        See also :envvar:`PYOPENCL_NO_CACHE`, :envvar:`PYOPENCL_BUILD_OPTIONS`.
 
         .. versionchanged:: 2011.1
-            *options* may now also be a :class:`list` of :class:`str`.
 
-        .. versionchanged:: 2013.1
-            Added :envvar:`PYOPENCL_NO_CACHE`.
-            Added :envvar:`PYOPENCL_BUILD_OPTIONS`.
+            *options* may now also be a :class:`list` of :class:`str`.
 
     .. method:: compile(self, options=[], devices=None, headers=[])
 
@@ -205,19 +216,17 @@ Kernel
                prg.kernel(queue, n_globals, None, args)
 
 
-    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
+    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
         Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using
         :meth:`set_args` to set each argument in turn. See the documentation for
         :meth:`set_arg` to see what argument types are allowed.
-        |std-enqueue-blurb|
 
-        *None* may be passed for local_size.
+        |glsize|
+
+        |empty-nd-range|
 
-        If *g_times_l* is specified, the global size will be multiplied by the
-        local size. (which makes the behavior more like Nvidia CUDA) In this case,
-        *global_size* and *local_size* also do not have to have the same number
-        of dimensions.
+        |std-enqueue-blurb|
 
         .. note::
 
@@ -233,6 +242,7 @@ Kernel
             <http://lists.tiker.net/pipermail/pyopencl/2012-October/001311.html>`_.
 
         .. versionchanged:: 0.92
+
             *local_size* was promoted to third positional argument from being a
             keyword argument. The old keyword argument usage will continue to
             be accepted with a warning throughout the 0.92 release cycle.
@@ -244,8 +254,13 @@ Kernel
             it from working.
 
         .. versionchanged:: 2011.1
+
             Added the *g_times_l* keyword arg.
 
+        .. versionchanged:: 2020.2
+
+            Added the *allow_empty_ndrange* keyword argument.
+
     .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
 
         This method supports the exact same interface as :meth:`__call__`, but
@@ -283,19 +298,18 @@ Kernel
 
         The size of local buffer in bytes to be provided.
 
-.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False)
+.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
-    |std-enqueue-blurb|
+    |glsize|
 
-    If *g_times_l* is specified, the global size will be multiplied by the
-    local size. (which makes the behavior more like Nvidia CUDA) In this case,
-    *global_size* and *local_size* also do not have to have the same number
-    of dimensions.
+    |empty-nd-range|
+
+    |std-enqueue-blurb|
 
     .. versionchanged:: 2011.1
-        Added the *g_times_l* keyword arg.
 
+        Added the *g_times_l* keyword arg.
 
-.. function:: enqueue_task(queue, kernel, wait_for=None)
+    .. versionchanged:: 2020.2
 
-    |std-enqueue-blurb|
+        Added the *allow_empty_ndrange* keyword argument.
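
As an illustrative aside (not part of this patch), a kernel enqueue using these arguments
might look like the sketch below; the kernel source and sizes are invented, and
*allow_empty_ndrange* only has an effect (and requires OpenCL 1.2) once a grid dimension
can actually be zero::

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    prg = cl.Program(ctx, """
        __kernel void twice(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()

    a = np.arange(1024, dtype=np.float32)
    a_buf = cl.Buffer(ctx,
            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)

    # global_size (1024,), local_size None: the implementation picks a
    # workgroup size. With allow_empty_ndrange=True, a zero-sized grid
    # would enqueue only a marker instead of failing.
    evt = prg.twice(queue, (1024,), None, a_buf, allow_empty_ndrange=True)
    evt.wait()
    cl.enqueue_copy(queue, a, a_buf)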
diff --git a/doc/runtime_queue.rst b/doc/runtime_queue.rst
index c0b42897d151cd95c1289665e4f5a00d801fc078..f120c61edaa3b1b876e1f482a55bc6a2be49369c 100644
--- a/doc/runtime_queue.rst
+++ b/doc/runtime_queue.rst
@@ -78,14 +78,9 @@ Event
         may be used as attributes on instances of this class
         to directly query info attributes.
 
-    .. attribute:: profile.info
+    .. attribute:: profile
 
-        Lower case versions of the :class:`profiling_info` constants
-        may be used as attributes on the attribute `profile` of this
-        class to directly query profiling info.
-
-        For example, you may use *evt.profile.end* instead of
-        *evt.get_profiling_info(pyopencl.profiling_info.END)*.
+        An instance of :class:`ProfilingInfoGetter`.
 
     .. method:: get_info(param)
 
@@ -114,6 +109,17 @@ Event
 
     |comparable|
 
+.. class:: ProfilingInfoGetter
+
+   .. attribute:: info
+
+        Lower case versions of the :class:`profiling_info` constants
+        may be used as attributes on instances of this class (available
+        as the *profile* attribute of :class:`Event`) to directly query
+        profiling info.
+
+        For example, you may use *evt.profile.end* instead of
+        *evt.get_profiling_info(pyopencl.profiling_info.END)*.
+
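
As an illustrative aside (not part of this patch), a typical use of the profiling interface
described above; the buffer size is arbitrary::

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    a = np.zeros(10**6, dtype=np.float32)
    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=a.nbytes)

    evt = cl.enqueue_copy(queue, buf, a)
    evt.wait()

    # evt.profile is a ProfilingInfoGetter; lower-case profiling_info
    # constants work as attributes on it.
    elapsed_ns = evt.profile.end - evt.profile.start
    print(f"copy took {elapsed_ns*1e-6:.3f} ms")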
 Event Subclasses
 ----------------
 
diff --git a/doc/subst.rst b/doc/subst.rst
index 4210ab24ce99a871aa4cfe318d3eb07049d5a98a..eba3536324545ccd4586244d5c2665b7db5d9d26 100644
--- a/doc/subst.rst
+++ b/doc/subst.rst
@@ -13,3 +13,24 @@
 
 .. |copy-depr| replace:: **Note:** This function is deprecated as of PyOpenCL 2011.1.
         Use :func:`enqueue_copy` instead.
+
+.. |glsize| replace:: *global_size* and *local_size* are tuples of identical length, with
+        between one and three entries. *global_size* specifies the overall size
+        of the computational grid: one work item will be launched for every
+        integer point in the grid. *local_size* specifies the workgroup size,
+        which must evenly divide the *global_size* in a dimension-by-dimension
+        manner.  *None* may be passed for *local_size*, in which case the
+        implementation will choose the workgroup size itself.
+        If *g_times_l* is *True*, the global size will be multiplied by the
+        local size (which makes the behavior more like Nvidia CUDA). In this
+        case, *global_size* and *local_size* also do not have to have the
+        same number of entries.
+
+.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating
+        how an empty NDRange is to be treated, where "empty" means that one or more
+        entries of *global_size* or *local_size* are zero. OpenCL itself does not
+        allow enqueueing kernels over empty NDRanges. Setting this flag to *True*
+        enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``)
+        to obtain the synchronization effects that would have resulted from
+        the kernel enqueue.
+        Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer.
diff --git a/doc/tools.rst b/doc/tools.rst
index ade730a14369a703b8441618104e0c19cb3948be..74a303f104a9710e807659c78ef0629993416c87 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -15,7 +15,7 @@ fresh memory area is allocated for each intermediate result. Memory pools are a
 remedy for this problem based on the observation that often many of the block
 allocations are of the same sizes as previously used ones.
 
-Then, instead of fully returning the memory to the system and incurring the 
+Then, instead of fully returning the memory to the system and incurring the
 associated reallocation overhead, the pool holds on to the memory and uses it
 to satisfy future allocations of similarly-sized blocks. The pool reacts
 appropriately to out-of-memory conditions as long as all memory allocations
@@ -36,6 +36,15 @@ not complicated::
     memory is returned to the pool. This supports the same interface
     as :class:`pyopencl.Buffer`.
 
+.. class:: AllocatorInterface
+
+    An interface implemented by various memory allocation functions
+    in :mod:`pyopencl`.
+
+    .. method:: __call__(size)
+
+        Allocate and return a :class:`pyopencl.Buffer` of the given *size*.
+
 .. class:: DeferredAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE)
 
     *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
@@ -46,14 +55,22 @@ not complicated::
     bound to contexts, not devices, and memory availability depends on which
     device the buffer is used with.)
 
-    .. versionchanged::
-        In version 2013.1, :class:`CLAllocator` was deprecated and replaced
+    Implements :class:`AllocatorInterface`.
+
+    .. versionchanged :: 2013.1
+
+        ``CLAllocator`` was deprecated and replaced
         by :class:`DeferredAllocator`.
 
     .. method:: __call__(size)
 
         Allocate a :class:`pyopencl.Buffer` of the given *size*.
 
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+
 .. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE)
 
     *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
@@ -62,13 +79,20 @@ not complicated::
     allocated memory is actually available. If no memory is available, an out-of-memory
     error is reported at allocation time.
 
+    Implements :class:`AllocatorInterface`.
+
     .. versionadded:: 2013.1
 
     .. method:: __call__(size)
 
         Allocate a :class:`pyopencl.Buffer` of the given *size*.
 
-.. class:: MemoryPool(allocator)
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+
+.. class:: MemoryPool(allocator[, leading_bits_in_bin_id])
 
     A memory pool for OpenCL device memory. *allocator* must be an instance of
     one of the above classes, and should be an :class:`ImmediateAllocator`.
@@ -76,6 +100,31 @@ not complicated::
     by the allocator immediately, and not in the OpenCL-typical
     deferred manner.
 
+    Implements :class:`AllocatorInterface`.
+
+    .. note::
+
+        The current implementation of the memory pool will retain allocated
+        memory after it is returned by the application and keep it in a bin
+        identified by the leading *leading_bits_in_bin_id* bits of the
+        allocation size. To ensure that allocations within each bin are
+        interchangeable, allocation sizes are rounded up to the largest size
+        that shares the leading bits of the requested allocation size.
+
+        The current default value of *leading_bits_in_bin_id* is
+        four, but this may change in future versions and is not
+        guaranteed.
+
+        *leading_bits_in_bin_id* must be passed by keyword,
+        and its role is purely advisory. It is not guaranteed
+        that future versions of the pool will use the
+        same allocation scheme and/or honor *leading_bits_in_bin_id*.
+
+    .. versionchanged:: 2019.1
+
+        Current bin allocation behavior documented, *leading_bits_in_bin_id*
+        added.
+
     .. attribute:: held_blocks
 
         The number of unused blocks being held by this pool.
@@ -91,7 +140,7 @@ not complicated::
 
     .. method:: __call__(size)
 
-        Synonym for :meth:`allocate` to match :class:`CLAllocator` interface.
+        Synonym for :meth:`allocate` to match the :class:`AllocatorInterface`.
 
         .. versionadded: 2011.2
 
diff --git a/doc/types.rst b/doc/types.rst
index ccc96fdd355737bc90e6ec1112b876ba30f1e7cb..dbd9794947c9188c008742e0d93b3324abcdf66b 100644
--- a/doc/types.rst
+++ b/doc/types.rst
@@ -1,7 +1,7 @@
 OpenCL Type Mapping
 ===================
 
-.. module:: pyopencl.types
+.. module:: pyopencl.cltypes
 
 .. _type-mappings:
 
@@ -21,6 +21,11 @@ see that a cl_long is 64 bit unsigned integer. Use the module as follows:
     >>> cl_long = cl.cltypes.long(1235) # maps to numpy.int64
     >>> floats = np.empty((128,), dtype=cl.cltypes.float) # array of numpy.float32
 
+.. note::
+
+    The OpenCL type ``bool`` does not have a corresponding :mod:`numpy` type defined here,
+    because OpenCL does not specify the in-memory representation (or even the storage
+    size) for this type.
 
 Vector Types
 ------------
diff --git a/examples/demo-struct-reduce.py b/examples/demo-struct-reduce.py
index 2b0d9803f1fdd32e85a2da7fe245297a8ac5cf95..c0c26e34743687a281c534c05d9c8cb74c6587ec 100644
--- a/examples/demo-struct-reduce.py
+++ b/examples/demo-struct-reduce.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 import numpy as np
 import pyopencl as cl
 
diff --git a/examples/demo.py b/examples/demo.py
index 62c0f7ee5fe975f0d4097be41396b558e82eec50..623660fee1b20b9ba140504ca594cc648e28bc45 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
-from __future__ import absolute_import, print_function
 import numpy as np
 import pyopencl as cl
 
@@ -33,3 +31,4 @@ cl.enqueue_copy(queue, res_np, res_g)
 # Check on CPU with Numpy:
 print(res_np - (a_np + b_np))
 print(np.linalg.norm(res_np - (a_np + b_np)))
+assert np.allclose(res_np, a_np + b_np)
diff --git a/examples/demo_array.py b/examples/demo_array.py
index c645b372632b8792d302658bbfa6c263b051491e..41b0f79ef2ccb74a807a8da5aff5eedf6a3bb15f 100644
--- a/examples/demo_array.py
+++ b/examples/demo_array.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 import pyopencl as cl
 import pyopencl.array as cl_array
 import numpy
diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py
index a8a3a007c094cf9b1ca7d3fc66142b7817a8b83d..21646c4f42a8cce495c02aef7beae5d4a2ceaffe 100644
--- a/examples/demo_elementwise.py
+++ b/examples/demo_elementwise.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
diff --git a/examples/demo_elementwise_complex.py b/examples/demo_elementwise_complex.py
index 9e04e2dd5a4f09c4235e860de1aa32dfc41a714f..4fe98ec9d0f0d514c84180e2775d84c7f808b152 100644
--- a/examples/demo_elementwise_complex.py
+++ b/examples/demo_elementwise_complex.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 import pyopencl as cl
 import pyopencl.array as cl_array
 import numpy
diff --git a/examples/demo_mandelbrot.py b/examples/demo_mandelbrot.py
index 802dfb215802c70e86bdc7534d401b4efe2f173b..9753b3ad5d9287f1968bf1d182bf22c50fe9bb79 100644
--- a/examples/demo_mandelbrot.py
+++ b/examples/demo_mandelbrot.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 # I found this example for PyCuda here:
 # http://wiki.tiker.net/PyCuda/Examples/Mandelbrot
 #
@@ -24,7 +22,6 @@ import time
 import numpy as np
 
 import pyopencl as cl
-from six.moves import range
 
 # You can choose a calculation routine below (calc_fractal), uncomment
 # one of the three lines to test the three variations
@@ -117,7 +114,7 @@ if __name__ == '__main__':
         import tkinter as tk
     from PIL import Image, ImageTk
 
-    class Mandelbrot(object):
+    class Mandelbrot:
         def __init__(self):
             # create window
             self.root = tk.Tk()
diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py
index 7ab9958f490bb17b5a55b18c2e9649909ac8c703..c080109b9dcfe45c16525db2eaa7709f9250b3a9 100644
--- a/examples/demo_meta_codepy.py
+++ b/examples/demo_meta_codepy.py
@@ -1,8 +1,6 @@
-from __future__ import absolute_import
 import pyopencl as cl
 import numpy
 import numpy.linalg as la
-from six.moves import range
 
 local_size = 256
 thread_strides = 32
diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py
index 76b5f65bf88ba938273b640831a998f93cd94812..fc64934385b58c7ac6a2d5b72a5b4fb1327de688 100644
--- a/examples/demo_meta_template.py
+++ b/examples/demo_meta_template.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 import pyopencl as cl
 import numpy
 import numpy.linalg as la
diff --git a/examples/download-examples-from-wiki.py b/examples/download-examples-from-wiki.py
index 0f8ea87527baeb492f2a264c476ac45f4ccff02d..13fd8fb7a09ecbd4d18f8055377047bc1a00ac4c 100755
--- a/examples/download-examples-from-wiki.py
+++ b/examples/download-examples-from-wiki.py
@@ -1,6 +1,5 @@
 #! /usr/bin/env python
 
-from __future__ import absolute_import, print_function
 
 import six.moves.xmlrpc_client
 destwiki = six.moves.xmlrpc_client.ServerProxy("http://wiki.tiker.net?action=xmlrpc2")
@@ -53,6 +52,6 @@ for page in all_pages:
                 outf.close()
 
     except Exception as e:
-        print("Error when processing %s: %s" % (page, e))
+        print(f"Error when processing {page}: {e}")
         from traceback import print_exc
         print_exc()
diff --git a/examples/dump-performance.py b/examples/dump-performance.py
index 00df1d1bad6e62fc284eb7fa7ce18731255fabc4..f582cd99fcae98df7325717b4e1541dbf873bbcb 100644
--- a/examples/dump-performance.py
+++ b/examples/dump-performance.py
@@ -1,7 +1,5 @@
-from __future__ import division, absolute_import, print_function
 import pyopencl as cl
 import pyopencl.characterize.performance as perf
-from six.moves import range
 
 
 def main():
@@ -9,7 +7,7 @@ def main():
 
     prof_overhead, latency = perf.get_profiling_overhead(ctx)
     print("command latency: %g s" % latency)
-    print("profiling overhead: %g s -> %.1f %%" % (
+    print("profiling overhead: {:g} s -> {:.1f} %".format(
             prof_overhead, 100*prof_overhead/latency))
     queue = cl.CommandQueue(
             ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
diff --git a/examples/dump-properties.py b/examples/dump-properties.py
index e64f66fa25c9d0e47af70f9409b9ddd2b5aa424d..07d9159827c315605286d46a4f7de494b7d7489e 100644
--- a/examples/dump-properties.py
+++ b/examples/dump-properties.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 import pyopencl as cl
 from optparse import OptionParser
 
@@ -21,13 +19,13 @@ def print_info(obj, info_cls):
 
             if (info_cls == cl.device_info and info_name == "PARTITION_TYPES_EXT"
                     and isinstance(info_value, list)):
-                print("%s: %s" % (info_name, [
+                print("{}: {}".format(info_name, [
                     cl.device_partition_property_ext.to_string(v,
                         "<unknown device partition property %d>")
                     for v in info_value]))
             else:
                 try:
-                    print("%s: %s" % (info_name, info_value))
+                    print(f"{info_name}: {info_value}")
                 except:
                     print("%s: <error>" % info_name)
 
@@ -72,13 +70,13 @@ for platform in cl.get_platforms():
                             return result
 
                         formats = ", ".join(
-                                "%s-%s" % (
+                                "{}-{}".format(
                                     cl.channel_order.to_string(iform.channel_order,
                                         "<unknown channel order 0x%x>"),
                                     str_chd_type(iform.channel_data_type))
                                 for iform in formats)
 
-                    print("%s %s FORMATS: %s\n" % (
+                    print("{} {} FORMATS: {}\n".format(
                             cl.mem_object_type.to_string(itype),
                             cl.mem_flags.to_string(mf),
                             formats))
diff --git a/examples/gl_interop_demo.py b/examples/gl_interop_demo.py
index da5ba3b0d6c84216a4a2273134c7cacef3a26b1f..99524cb30b3662b09aa4599d14d6df259ff6f340 100644
--- a/examples/gl_interop_demo.py
+++ b/examples/gl_interop_demo.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 from OpenGL.GL import *
 from OpenGL.GLUT import *
 from OpenGL.raw.GL.VERSION.GL_1_5 import glBufferData as rawGlBufferData
diff --git a/examples/gl_particle_animation.py b/examples/gl_particle_animation.py
index dd2f05c24686cc9cd777923b45de8963ed1f58b3..1d838a2a4a0884dc53f7d24e8319336c5b7ca3ee 100644
--- a/examples/gl_particle_animation.py
+++ b/examples/gl_particle_animation.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 # Visualization of particles with gravity
 # Source: http://enja.org/2010/08/27/adventures-in-opencl-part-2-particles-with-opengl/
 
diff --git a/examples/narray.py b/examples/narray.py
index 78b9bb9205b326b207730d411524fad93fd2c142..40ba945042b8d6337d7d4139deb1991d20532d81 100644
--- a/examples/narray.py
+++ b/examples/narray.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-from __future__ import print_function
 # example by Roger Pau Monn'e
 import pyopencl as cl
 import numpy as np
diff --git a/examples/print-binary.py b/examples/print-binary.py
index c7ea523947f522f9165399a52f483842d21d8744..d45c1d0fe67989eda42342d8b0dee4c90bfcc616 100755
--- a/examples/print-binary.py
+++ b/examples/print-binary.py
@@ -1,12 +1,11 @@
 #! /usr/bin/env python
 
-from __future__ import division
 
 import pyopencl as cl
 import sys
 
 ctx = cl.create_some_context()
-with open(sys.argv[1], "r") as inf:
+with open(sys.argv[1]) as inf:
     src = inf.read()
 
 prg = cl.Program(ctx, src).build()
diff --git a/examples/transpose.py b/examples/transpose.py
index 99f68a28e4bc97e889248e01e7d145172587cf3f..9b07e2b0566be8f0c02677a9c8cfb53448654a0e 100644
--- a/examples/transpose.py
+++ b/examples/transpose.py
@@ -1,13 +1,9 @@
 # Transposition of a matrix
 # originally for PyCUDA by Hendrik Riedmann <riedmann@dam.brown.edu>
 
-from __future__ import division
-from __future__ import absolute_import
-from __future__ import print_function
 import pyopencl as cl
 import numpy
 import numpy.linalg as la
-from six.moves import range
 
 
 
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 7f77154f66278a5fc56bf59ebecc104aa5551a6c..ef56ad0fd8782286e6d5f35b331e99b3df1b82f5 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import division, absolute_import, print_function
-
 __copyright__ = "Copyright (C) 2009-15 Andreas Kloeckner"
 
 __license__ = """
@@ -25,13 +21,19 @@ THE SOFTWARE.
 """
 
 import six
-from six.moves import input, intern
+from six.moves import intern
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
 
+# must import, otherwise dtype registry will not be fully populated
+import pyopencl.cltypes  # noqa: F401
+
 import logging
 logger = logging.getLogger(__name__)
 
+import os
+os.environ["PYOPENCL_HOME"] = os.path.dirname(os.path.abspath(__file__))
+
 try:
     import pyopencl._cl as _cl
 except ImportError:
@@ -47,8 +49,7 @@ import numpy as np
 
 import sys
 
-_PYPY = '__pypy__' in sys.builtin_module_names
-_CPY2 = not _PYPY and sys.version_info < (3,)
+_PYPY = "__pypy__" in sys.builtin_module_names
 
 from pyopencl._cl import (  # noqa
         get_cl_header_version,
@@ -81,6 +82,7 @@ from pyopencl._cl import (  # noqa
         addressing_mode,
         filter_mode,
         sampler_info,
+        sampler_properties,
         map_flags,
         program_info,
         program_build_info,
@@ -158,7 +160,7 @@ if not _PYPY:
 
 if get_cl_header_version() >= (1, 1):
     from pyopencl._cl import (  # noqa
-          UserEvent,
+        UserEvent,
         )
 if get_cl_header_version() >= (1, 2):
     from pyopencl._cl import (  # noqa
@@ -214,6 +216,21 @@ CONSTANT_CLASSES = tuple(
         if _inspect.isclass(getattr(_cl, name))
         and name[0].islower() and name not in ["zip", "map", "range"])
 
+BITFIELD_CONSTANT_CLASSES = (
+        _cl.device_type,
+        _cl.device_fp_config,
+        _cl.device_exec_capabilities,
+        _cl.command_queue_properties,
+        _cl.mem_flags,
+        _cl.map_flags,
+        _cl.kernel_arg_type_qualifier,
+        _cl.device_affinity_domain,
+        _cl.mem_migration_flags,
+        _cl.device_svm_capabilities,
+        _cl.queue_properties,
+        _cl.svm_mem_flags,
+        )
+
 
 # {{{ diagnostics
 
@@ -237,27 +254,24 @@ def compiler_output(text):
 # {{{ find pyopencl shipped source code
 
 def _find_pyopencl_include_path():
-    from pkg_resources import Requirement, resource_filename, DistributionNotFound
+    from os.path import join, abspath, dirname, exists
     try:
+        # Try to find the include path in the same directory as this file
+        include_path = join(abspath(dirname(__file__)), "cl")
+        if not exists(include_path):
+            raise OSError("unable to find pyopencl include path")
+    except Exception:
         # Try to find the resource with pkg_resources (the recommended
-        # setuptools approach)
+        # setuptools approach). This is very slow.
+        from pkg_resources import Requirement, resource_filename
         include_path = resource_filename(
                 Requirement.parse("pyopencl"), "pyopencl/cl")
-    except DistributionNotFound:
-        # If pkg_resources can't find it (e.g. if the module is part of a
-        # frozen application), try to find the include path in the same
-        # directory as this file
-        from os.path import join, abspath, dirname, exists
-
-        include_path = join(abspath(dirname(__file__)), "cl")
-        # If that doesn't exist, just re-raise the exception caught from
-        # resource_filename.
         if not exists(include_path):
-            raise
+            raise OSError("unable to find pyopencl include path")
 
     # Quote the path if it contains a space and is not quoted already.
     # See https://github.com/inducer/pyopencl/issues/250 for discussion.
-    if ' ' in include_path and not include_path.startswith('"'):
+    if " " in include_path and not include_path.startswith('"'):
         return '"' + include_path + '"'
     else:
         return include_path
@@ -265,6 +279,66 @@ def _find_pyopencl_include_path():
 # }}}
 
 
+# {{{ build option munging
+
+def _split_options_if_necessary(options):
+    if isinstance(options, str):
+        import shlex
+        options = shlex.split(options)
+
+    return options
+
+
+def _find_include_path(options):
+    def unquote(path):
+        if path.startswith('"') and path.endswith('"'):
+            return path[1:-1]
+        else:
+            return path
+
+    include_path = ["."]
+
+    option_idx = 0
+    while option_idx < len(options):
+        option = options[option_idx].strip()
+        if option.startswith("-I") or option.startswith("/I"):
+            if len(option) == 2:
+                if option_idx+1 < len(options):
+                    include_path.append(unquote(options[option_idx+1]))
+                option_idx += 2
+            else:
+                include_path.append(unquote(option[2:].lstrip()))
+                option_idx += 1
+        else:
+            option_idx += 1
+
+    return include_path
+
+
+def _options_to_bytestring(options):
+    def encode_if_necessary(s):
+        if isinstance(s, str):
+            return s.encode("utf-8")
+        else:
+            return s
+
+    return b" ".join(encode_if_necessary(s) for s in options)
+
+
+# }}}
+
+
 # {{{ Program (wrapper around _Program, adds caching support)
 
 _DEFAULT_BUILD_OPTIONS = []
@@ -298,11 +372,12 @@ def enable_debugging(platform_or_context):
                 % platform.name)
 
 
-class Program(object):
+class Program:
     def __init__(self, arg1, arg2=None, arg3=None):
         if arg2 is None:
             # 1-argument form: program
             self._prg = arg1
+            self._context = self._prg.get_info(program_info.CONTEXT)
 
         elif arg3 is None:
             # 2-argument form: context, source
@@ -310,13 +385,13 @@ class Program(object):
 
             from pyopencl.tools import is_spirv
             if is_spirv(source):
-                # no caching in SPIR-V case
+                # FIXME no caching in SPIR-V case
                 self._context = context
-                self._prg = _cl._Program(context, source)
+                self._prg = _cl._create_program_with_il(context, source)
                 return
 
             import sys
-            if isinstance(source, six.text_type) and sys.version_info < (3,):
+            if isinstance(source, str) and sys.version_info < (3,):
                 from warnings import warn
                 warn("Received OpenCL source code in Unicode, "
                      "should be ASCII string. Attempting conversion.",
@@ -344,7 +419,6 @@ class Program(object):
                     stacklevel=3)
 
             self._prg = _cl._Program(self._context, self._source)
-            del self._context
             return self._prg
 
     def get_info(self, arg):
@@ -390,25 +464,8 @@ class Program(object):
     # {{{ build
 
     @classmethod
-    def _process_build_options(cls, context, options):
-        if isinstance(options, six.string_types):
-            import shlex
-            if six.PY2:
-                # shlex.split takes bytes (py2 str) on py2
-                if isinstance(options, six.text_type):
-                    options = options.encode("utf-8")
-            else:
-                # shlex.split takes unicode (py3 str) on py3
-                if isinstance(options, six.binary_type):
-                    options = options.decode("utf-8")
-
-            options = shlex.split(options)
-
-        def encode_if_necessary(s):
-            if isinstance(s, six.text_type):
-                return s.encode("utf-8")
-            else:
-                return s
+    def _process_build_options(cls, context, options, _add_include_path=False):
+        options = _split_options_if_necessary(options)
 
         options = (options
                 + _DEFAULT_BUILD_OPTIONS
@@ -421,42 +478,16 @@ class Program(object):
         if forced_options:
             options = options + forced_options.split()
 
-        # {{{ find include path
-
-        def unquote(path):
-            if path.startswith('"') and path.endswith('"'):
-                return path[1:-1]
-            else:
-                return path
-
-        include_path = ["."]
-
-        option_idx = 0
-        while option_idx < len(options):
-            option = options[option_idx].strip()
-            if option.startswith("-I") or option.startswith("/I"):
-                if len(option) == 2:
-                    if option_idx+1 < len(options):
-                        include_path.append(unquote(options[option_idx+1]))
-                    option_idx += 2
-                else:
-                    include_path.append(unquote(option[2:].lstrip()))
-                    option_idx += 1
-            else:
-                option_idx += 1
-
-        # }}}
-
-        options = [encode_if_necessary(s) for s in options]
-
-        return b" ".join(options), include_path
+        return (
+                _options_to_bytestring(options),
+                _find_include_path(options))
 
     def build(self, options=[], devices=None, cache_dir=None):
         options_bytes, include_path = self._process_build_options(
                 self._context, options)
 
         if cache_dir is None:
-            cache_dir = getattr(self._context, 'cache_dir', None)
+            cache_dir = getattr(self._context, "cache_dir", None)
 
         import os
         build_descr = None
@@ -559,8 +590,11 @@ def create_program_with_built_in_kernels(context, devices, kernel_names):
         context, devices, kernel_names))
 
 
-def link_program(context, programs, options=[], devices=None):
-    options_bytes, _ = Program._process_build_options(context, options)
+def link_program(context, programs, options=None, devices=None):
+    if options is None:
+        options = []
+
+    options_bytes = _options_to_bytestring(_split_options_if_necessary(options))
     programs = [prg._get_prg() for prg in programs]
     raw_prg = _Program.link(context, programs, options_bytes, devices)
     return Program(raw_prg)
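
The hunk above reworks how ``link_program`` normalizes its *options*. A minimal
sketch of separate compilation and linking (assumes an OpenCL >= 1.2
implementation; ``src_a`` and ``src_b`` are placeholder OpenCL C source
strings)::

    import pyopencl as cl

    ctx = cl.create_some_context()

    # Compile two translation units separately ...
    prg_a = cl.Program(ctx, src_a)
    prg_a.compile()
    prg_b = cl.Program(ctx, src_b)
    prg_b.compile()

    # ... then link them into a single executable program.
    linked = cl.link_program(ctx, [prg_a, prg_b], options=[])
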
@@ -585,7 +619,7 @@ def _add_functionality():
     # {{{ Platform
 
     def platform_repr(self):
-        return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
+        return f"<pyopencl.Platform '{self.name}' at 0x{self.int_ptr:x}>"
 
     Platform.__repr__ = platform_repr
     Platform._get_cl_version = generic_get_cl_version
@@ -595,16 +629,25 @@ def _add_functionality():
     # {{{ Device
 
     def device_repr(self):
-        return "<pyopencl.Device '%s' on '%s' at 0x%x>" % (
+        return "<pyopencl.Device '{}' on '{}' at 0x{:x}>".format(
                 self.name.strip(), self.platform.name.strip(), self.int_ptr)
 
+    def device_hashable_model_and_version_identifier(self):
+        return ("v1", self.vendor, self.vendor_id, self.name, self.version)
+
     def device_persistent_unique_id(self):
-        return (self.vendor, self.vendor_id, self.name, self.version)
+        from warnings import warn
+        warn("Device.persistent_unique_id is deprecated. "
+                "Use Device.hashable_model_and_version_identifier instead.",
+                DeprecationWarning, stacklevel=2)
+        return device_hashable_model_and_version_identifier(self)
 
     Device.__repr__ = device_repr
 
     # undocumented for now:
     Device._get_cl_version = generic_get_cl_version
+    Device.hashable_model_and_version_identifier = property(
+            device_hashable_model_and_version_identifier)
     Device.persistent_unique_id = property(device_persistent_unique_id)
 
     # }}}
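
The hunk above deprecates ``Device.persistent_unique_id`` in favor of the new
``Device.hashable_model_and_version_identifier`` property. A sketch of how
calling code might migrate (illustrative only)::

    import pyopencl as cl

    dev = cl.create_some_context().devices[0]

    # New spelling: a versioned, hashable tuple identifying the device model.
    key = dev.hashable_model_and_version_identifier

    # Old spelling still works, but now emits a DeprecationWarning:
    # key = dev.persistent_unique_id

    per_device_cache = {key: "some per-device-model data"}
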
@@ -625,7 +668,7 @@ def _add_functionality():
         context_old_init(self, devices, properties, dev_type)
 
     def context_repr(self):
-        return "<pyopencl.Context at 0x%x on %s>" % (self.int_ptr,
+        return "<pyopencl.Context at 0x{:x} on {}>".format(self.int_ptr,
                 ", ".join(repr(dev) for dev in self.devices))
 
     def context_get_cl_version(self):
@@ -646,7 +689,7 @@ def _add_functionality():
         self.finish()
 
     def command_queue_get_cl_version(self):
-        return self.context._get_cl_version()
+        return self.device._get_cl_version()
 
     CommandQueue.__enter__ = command_queue_enter
     CommandQueue.__exit__ = command_queue_exit
@@ -674,7 +717,7 @@ def _add_functionality():
             self._build(options=options_bytes, devices=devices)
         except Error as e:
             msg = str(e) + "\n\n" + (75*"="+"\n").join(
-                    "Build on %s:\n\n%s" % (dev, log)
+                    f"Build on {dev}:\n\n{log}"
                     for dev, log in self._get_build_logs())
             code = e.code
             routine = e.routine
@@ -691,7 +734,7 @@ def _add_functionality():
             raise err
 
         message = (75*"="+"\n").join(
-                "Build on %s succeeded, but said:\n\n%s" % (dev, log)
+                f"Build on {dev} succeeded, but said:\n\n{log}"
                 for dev, log in self._get_build_logs()
                 if log is not None and log.strip())
 
@@ -700,6 +743,8 @@ def _add_functionality():
                 build_type = "From-source build"
             elif self.kind() == program_kind.BINARY:
                 build_type = "From-binary build"
+            elif self.kind() == program_kind.IL:
+                build_type = "From-IL build"
             else:
                 build_type = "Build"
 
@@ -842,7 +887,7 @@ def _add_functionality():
     # {{{ ImageFormat
 
     def image_format_repr(self):
-        return "ImageFormat(%s, %s)" % (
+        return "ImageFormat({}, {})".format(
                 channel_order.to_string(self.channel_order,
                     "<unknown channel order 0x%x>"),
                 channel_type.to_string(self.channel_data_type,
@@ -997,7 +1042,7 @@ def _add_functionality():
                         val.code(), "<unknown error %d>")
             routine = val.routine()
             if routine:
-                result = "%s failed: %s" % (routine, result)
+                result = f"{routine} failed: {result}"
             what = val.what()
             if what:
                 if result:
@@ -1069,10 +1114,8 @@ def _add_functionality():
         """
         svmallocation_old_init(self, ctx, size, alignment, flags)
 
-        read_write = (
-                flags & mem_flags.WRITE_ONLY != 0
-                or flags & mem_flags.READ_WRITE != 0)
-
+        # mem_flags.READ_ONLY applies to kernels, not the host
+        read_write = True
         _interface["data"] = (
                 int(self._ptr_as_int()), not read_write)
 
@@ -1086,8 +1129,9 @@ def _add_functionality():
     # {{{ SVM
 
     if get_cl_header_version() >= (2, 0):
-        SVM.__doc__ = """Tags an object exhibiting the Python buffer interface (such as a
-            :class:`numpy.ndarray`) as referring to shared virtual memory.
+        SVM.__doc__ = """Tags an object exhibiting the Python buffer interface
+            (such as a :class:`numpy.ndarray`) as referring to shared virtual
+            memory.
 
             Depending on the features of the OpenCL implementation, the following
             types of objects may be passed to/wrapped in this type:
@@ -1142,7 +1186,7 @@ def _add_functionality():
                 This object merely serves as a 'tag' that changes the behavior
                 of functions to which it is passed. It has no special management
                 relationship to the memory it tags. For example, it is permissible
-                to grab a :mod:`numpy.array` out of :attr:`SVM.mem` of one
+                to grab a :class:`numpy.ndarray` out of :attr:`SVM.mem` of one
                 :class:`SVM` instance and use the array to construct another.
                 Neither of the tags need to be kept alive.
 
@@ -1244,9 +1288,21 @@ def _add_functionality():
             }
 
     def to_string(cls, value, default_format=None):
-        for name in dir(cls):
-            if (not name.startswith("_") and getattr(cls, name) == value):
-                return name
+        if cls._is_bitfield:
+            names = []
+            for name in dir(cls):
+                attr = getattr(cls, name)
+                if name.startswith("_") or not isinstance(attr, int):
+                    continue
+                if attr == value or attr & value:
+                    names.append(name)
+            if names:
+                return " | ".join(names)
+        else:
+            for name in dir(cls):
+                if (not name.startswith("_")
+                        and getattr(cls, name) == value):
+                    return name
 
         if default_format is None:
             raise ValueError("a name for value %d was not found in %s"
@@ -1255,6 +1311,7 @@ def _add_functionality():
             return default_format % value
 
     for cls in CONSTANT_CLASSES:
+        cls._is_bitfield = cls in BITFIELD_CONSTANT_CLASSES
         cls.to_string = classmethod(to_string)
 
     # {{{ get_info attributes -------------------------------------------------
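
With ``_is_bitfield`` now set on the bitfield constant classes, ``to_string``
can render combined flag values. An illustrative use, assuming ``mem_flags``
is among ``BITFIELD_CONSTANT_CLASSES`` (name order follows ``dir()`` and may
vary)::

    import pyopencl as cl

    mf = cl.mem_flags
    flags = mf.READ_WRITE | mf.COPY_HOST_PTR

    # Expected to print something like "COPY_HOST_PTR | READ_WRITE".
    print(mf.to_string(flags))
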
@@ -1279,8 +1336,8 @@ def _add_functionality():
         return property(result)
 
     for cls, (info_method, info_class, cacheable_attrs) \
-            in six.iteritems(cls_to_info_cls):
-        for info_name, info_value in six.iteritems(info_class.__dict__):
+            in cls_to_info_cls.items():
+        for info_name, info_value in info_class.__dict__.items():
             if info_name == "to_string" or info_name.startswith("_"):
                 continue
 
@@ -1349,7 +1406,7 @@ def create_some_context(interactive=None, answers=None):
         if answers:
             return str(answers.pop(0))
         elif not interactive:
-            return ''
+            return ""
         else:
             user_input = input(prompt)
             user_inputs.append(user_input)
@@ -1447,7 +1504,7 @@ _csc = create_some_context
 
 # {{{ SVMMap
 
-class SVMMap(object):
+class SVMMap:
     """
     .. attribute:: event
 
@@ -1504,10 +1561,17 @@ def enqueue_copy(queue, dest, src, **kwargs):
     :arg wait_for: (optional, default empty)
     :arg is_blocking: Wait for completion. Defaults to *True*.
       (Available on any copy involving host memory)
-
     :return: A :class:`NannyEvent` if the transfer involved a
         host-side buffer, otherwise an :class:`Event`.
 
+    .. note::
+
+        Be aware that if the transfer involved a host-side buffer, deleting
+        the returned :class:`NannyEvent` will block until the transfer is
+        complete. Be sure to keep a reference to this :class:`Event` until
+        the transfer has finished.
+
     .. note::
 
         Two types of 'buffer' occur in the arguments to this function,
@@ -1670,10 +1734,11 @@ def enqueue_copy(queue, dest, src, **kwargs):
 
     elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM):
         # to SVM
-        if isinstance(src, SVM):
-            src = src.mem
+        if not isinstance(src, SVM):
+            src = SVM(src)
 
-        return _cl._enqueue_svm_memcpy(queue, dest.mem, src, **kwargs)
+        is_blocking = kwargs.pop("is_blocking", True)
+        return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs)
 
     else:
         # assume to-host
@@ -1701,7 +1766,9 @@ def enqueue_copy(queue, dest, src, **kwargs):
         elif isinstance(src, SVM):
             # from svm
             # dest is not a SVM instance, otherwise we'd be in the branch above
-            return _cl._enqueue_svm_memcpy(queue, dest, src.mem, **kwargs)
+            is_blocking = kwargs.pop("is_blocking", True)
+            return _cl._enqueue_svm_memcpy(
+                    queue, is_blocking, SVM(dest), src, **kwargs)
         else:
             # assume from-host
             raise TypeError("enqueue_copy cannot perform host-to-host transfers")
@@ -1742,9 +1809,9 @@ def image_from_array(ctx, ary, num_channels=None, mode="r", norm_int=False):
     dtype = ary.dtype
     if num_channels is None:
 
-        from pyopencl.array import vec
         try:
-            dtype, num_channels = vec.type_to_scalar_and_count[dtype]
+            dtype, num_channels = \
+                    pyopencl.cltypes.vec_type_to_scalar_and_count[dtype]
         except KeyError:
             # It must be a scalar type then.
             num_channels = 1
@@ -1897,11 +1964,7 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
         for dim in shape:
             s *= dim
     except TypeError:
-        import sys
-        if sys.version_info >= (3,):
-            admissible_types = (int, np.integer)
-        else:
-            admissible_types = (np.integer,) + six.integer_types
+        admissible_types = (int, np.integer)
 
         if not isinstance(shape, admissible_types):
             raise TypeError("shape must either be iterable or "
diff --git a/pyopencl/_buffers.py b/pyopencl/_buffers.py
deleted file mode 100644
index bbf81a2fe3bb631dd9d28f13b86caa56a4fb84bc..0000000000000000000000000000000000000000
--- a/pyopencl/_buffers.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#! /usr/bin/env python
-# Shamelessly stolen from pyopengl-ctypes on 2015-06-21.
-#
-# Original file here:
-# http://bazaar.launchpad.net/~mcfletch/pyopengl/trunk/view/head:/OpenGL/arrays/_buffers.py
-
-"""Python 3.x buffer-handling (currently just for bytes/bytearray types)
-"""
-
-import ctypes
-import sys
-
-if sys.version_info[:2] < (2, 6):
-    raise ImportError('Buffer interface only usable on Python 2.6+')
-
-PyBUF_SIMPLE = 0
-PyBUF_WRITABLE = PyBUF_WRITEABLE = 0x0001
-PyBUF_ND = 0x0008
-PyBUF_STRIDES = (0x0010 | PyBUF_ND)
-PyBUF_CONTIG = (PyBUF_ND | PyBUF_WRITABLE)
-PyBUF_CONTIG_RO = (PyBUF_ND)
-PyBUF_C_CONTIGUOUS = (0x0020 | PyBUF_STRIDES)
-PyBUF_F_CONTIGUOUS = (0x0040 | PyBUF_STRIDES)
-PyBUF_ANY_CONTIGUOUS = (0x0080 | PyBUF_STRIDES)
-PyBUF_FORMAT = 0x0004
-
-# Python 2.6 doesn't define this...
-c_ssize_t = getattr(ctypes, 'c_ssize_t', ctypes.c_ulong)
-
-_fields_ = [
-    ('buf', ctypes.c_void_p),
-    ('obj', ctypes.c_void_p),
-    ('len', c_ssize_t),
-    ('itemsize', c_ssize_t),
-
-    ('readonly', ctypes.c_int),
-    ('ndim', ctypes.c_int),
-    ('format', ctypes.c_char_p),
-    ('shape', ctypes.POINTER(c_ssize_t)),
-    ('strides', ctypes.POINTER(c_ssize_t)),
-    ('suboffsets', ctypes.POINTER(c_ssize_t)),
-]
-
-
-if sys.version_info[:2] <= (2, 6) or sys.version_info[:2] >= (3, 3):
-    # Original structure was eventually restored in 3.3, so just
-    # 2.7 through 3.2 uses the "enhanced" structure below
-    _fields_.extend([
-        ('internal', ctypes.c_void_p),
-    ])
-
-else:
-    # Sigh, this structure seems to have changed with Python 3.x...
-    _fields_.extend([
-        ('smalltable', ctypes.c_size_t*2),
-        ('internal', ctypes.c_void_p),
-    ])
-
-
-class Py_buffer(ctypes.Structure):  # noqa
-    @classmethod
-    def from_object(cls, obj, flags):
-        """Create a new Py_buffer referencing ram of object"""
-        if not CheckBuffer(obj):
-            raise TypeError(
-                    "%s type does not support Buffer Protocol" % (obj.__class__,))
-        buf = cls()
-
-        # deallocation of the buf causes glibc abort :(
-        result = GetBuffer(obj, buf, flags)
-
-        if result != 0:
-            raise ValueError("Unable to retrieve Buffer from %s" % (obj,))
-        if not buf.buf:
-            raise ValueError("Null pointer result from %s" % (obj,))
-        return buf
-
-    _fields_ = _fields_
-
-    @property
-    def dims(self):
-        return self.shape[:self.ndim]
-
-    def __len__(self):
-        return self.shape[0]
-
-    @property
-    def dim_strides(self):
-        if self.strides:
-            return self.strides[:self.ndim]
-        return None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
-        if self.obj:
-            ReleaseBuffer(self)
-
-    def __del__(self):
-        if self.obj:
-            ReleaseBuffer(self)
-
-
-try:
-    CheckBuffer = ctypes.pythonapi.PyObject_CheckBuffer
-    CheckBuffer.argtypes = [ctypes.py_object]
-    CheckBuffer.restype = ctypes.c_int
-except AttributeError:
-    # Python 2.6 doesn't appear to have CheckBuffer support...
-    def CheckBuffer(x):  # noqa
-        return True
-
-IncRef = ctypes.pythonapi.Py_IncRef
-IncRef.argtypes = [ctypes.py_object]
-
-GetBuffer = ctypes.pythonapi.PyObject_GetBuffer
-GetBuffer.argtypes = [ctypes.py_object, ctypes.POINTER(Py_buffer), ctypes.c_int]
-GetBuffer.restype = ctypes.c_int
-
-ReleaseBuffer = ctypes.pythonapi.PyBuffer_Release
-ReleaseBuffer.argtypes = [ctypes.POINTER(Py_buffer)]
-ReleaseBuffer.restype = None
diff --git a/pyopencl/_mymako.py b/pyopencl/_mymako.py
index 78061f31e6baf7e300e0caa95ce6a175f31e9823..5d5e92f81b2307d6104c2213af8a8bf8da6fd0ad 100644
--- a/pyopencl/_mymako.py
+++ b/pyopencl/_mymako.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 try:
     import mako.template  # noqa
 except ImportError:
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index dfa5d4bbfd0721915e3979c9cd027d62ecab63f1..197ad94839745eb62934db9b4993a45e56038244 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -1,12 +1,10 @@
-"""Scan primitive."""
+"""Algorithms built on scans."""
 
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-from six.moves import zip
 
-__copyright__ = """Copyright 2011-2012 Andreas Kloeckner \
-                   Copyright 2017 Hao Gao"""
+__copyright__ = """
+Copyright 2011-2012 Andreas Kloeckner
+Copyright 2017 Hao Gao
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person
@@ -31,6 +29,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 """
 
+
 import numpy as np
 import pyopencl as cl
 import pyopencl.array  # noqa
@@ -40,35 +39,41 @@ from pytools import memoize, memoize_method, Record
 from mako.template import Template
 
 
-# {{{ copy_if
+# {{{ "extra args" handling utility
 
-_copy_if_template = ScanTemplate(
-        arguments="item_t *ary, item_t *out, scan_t *count",
-        input_expr="(%(predicate)s) ? 1 : 0",
-        scan_expr="a+b", neutral="0",
-        output_statement="""
-            if (prev_item != item) out[item-1] = ary[i];
-            if (i+1 == N) *count = item;
-            """,
-        template_processor="printf")
-
-
-def extract_extra_args_types_values(extra_args):
+def _extract_extra_args_types_values(extra_args):
     from pyopencl.tools import VectorArg, ScalarArg
 
     extra_args_types = []
     extra_args_values = []
+    extra_wait_for = []
     for name, val in extra_args:
         if isinstance(val, cl.array.Array):
             extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
             extra_args_values.append(val)
+            extra_wait_for.extend(val.events)
         elif isinstance(val, np.generic):
             extra_args_types.append(ScalarArg(val.dtype, name))
             extra_args_values.append(val)
         else:
             raise RuntimeError("argument '%d' not understood" % name)
 
-    return tuple(extra_args_types), extra_args_values
+    return tuple(extra_args_types), extra_args_values, extra_wait_for
+
+# }}}
+
+
+# {{{ copy_if
+
+_copy_if_template = ScanTemplate(
+        arguments="item_t *ary, item_t *out, scan_t *count",
+        input_expr="(%(predicate)s) ? 1 : 0",
+        scan_expr="a+b", neutral="0",
+        output_statement="""
+            if (prev_item != item) out[item-1] = ary[i];
+            if (i+1 == N) *count = item;
+            """,
+        template_processor="printf")
 
 
 def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None):
@@ -94,7 +99,12 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non
     else:
         scan_dtype = np.int32
 
-    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
+    if wait_for is None:
+        wait_for = []
+
+    extra_args_types, extra_args_values, extra_wait_for = \
+        _extract_extra_args_types_values(extra_args)
+    wait_for = wait_for + extra_wait_for
 
     knl = _copy_if_template.build(ary.context,
             type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)),
@@ -104,9 +114,8 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
 
-    # **dict is a Py2.5 workaround
     evt = knl(ary, out, count, *extra_args_values,
-            **dict(queue=queue, wait_for=wait_for))
+            queue=queue, wait_for=wait_for)
 
     return out, count, evt
 
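``copy_if`` (and, in the hunks below, ``partition`` and ``unique``) now folds
the events of any ``extra_args`` arrays into *wait_for*, so callers need not
synchronize those arrays themselves. A small ``copy_if`` usage sketch (names
are illustrative)::

    import numpy as np
    import pyopencl as cl
    import pyopencl.array
    from pyopencl.algorithm import copy_if

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a_dev = cl.array.to_device(
            queue, np.random.randint(0, 100, 5000).astype(np.int32))
    cutoff = np.int32(50)

    # 'cutoff' is passed as an extra scalar kernel argument and can be
    # referenced from the predicate expression.
    selected, count, evt = copy_if(
            a_dev, "ary[i] < cutoff", extra_args=[("cutoff", cutoff)])
    print(count.get())
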
@@ -176,7 +185,12 @@ def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N
     else:
         scan_dtype = np.uint32
 
-    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
+    if wait_for is None:
+        wait_for = []
+
+    extra_args_types, extra_args_values, extra_wait_for = \
+            _extract_extra_args_types_values(extra_args)
+    wait_for = wait_for + extra_wait_for
 
     knl = _partition_template.build(
             ary.context,
@@ -189,9 +203,8 @@ def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
 
-    # **dict is a Py2.5 workaround
     evt = knl(ary, out_true, out_false, count, *extra_args_values,
-            **dict(queue=queue, wait_for=wait_for))
+            queue=queue, wait_for=wait_for)
 
     return out_true, out_false, count, evt
 
@@ -244,7 +257,12 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="",
     else:
         scan_dtype = np.uint32
 
-    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
+    if wait_for is None:
+        wait_for = []
+
+    extra_args_types, extra_args_values, extra_wait_for = \
+            _extract_extra_args_types_values(extra_args)
+    wait_for = wait_for + extra_wait_for
 
     knl = _unique_template.build(
             ary.context,
@@ -256,9 +274,8 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="",
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
 
-    # **dict is a Py2.5 workaround
     evt = knl(ary, out, count, *extra_args_values,
-            **dict(queue=queue, wait_for=wait_for))
+            queue=queue, wait_for=wait_for)
 
     return out, count, evt
 
@@ -274,13 +291,13 @@ def to_bin(n):
         digs.append(str(n % 2))
         n >>= 1
 
-    return ''.join(digs[::-1])
+    return "".join(digs[::-1])
 
 
-def _padded_bin(i, l):
+def _padded_bin(i, nbits):
     s = to_bin(i)
-    while len(s) < l:
-        s = '0' + s
+    while len(s) < nbits:
+        s = "0" + s
     return s
 
 
@@ -291,7 +308,7 @@ def _make_sort_scan_type(device, bits, index_dtype):
 
     fields = []
     for mnr in range(2**bits):
-        fields.append(('c%s' % _padded_bin(mnr, bits), index_dtype))
+        fields.append(("c%s" % _padded_bin(mnr, bits), index_dtype))
 
     dtype = np.dtype(fields)
 
@@ -402,11 +419,11 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL//
 from pyopencl.scan import GenericScanKernel
 
 
-class RadixSort(object):
+class RadixSort:
     """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_
     on the compute device.
 
-    .. seealso:: :class:`pyopencl.algorithm.BitonicSort`
+    .. seealso:: :class:`pyopencl.bitonic_sort.BitonicSort`
 
     .. versionadded:: 2013.1
     """
@@ -459,7 +476,7 @@ class RadixSort(object):
 
             boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0"
 
-            return ("((mnr < %s) ? %s : %s)" % (
+            return ("((mnr < {}) ? {} : {})".format(
                 int(boundary_mnr, 2),
                 get_count_branch(known_bits+"0"),
                 get_count_branch(known_bits+"1")))
@@ -541,7 +558,7 @@ class RadixSort(object):
             scan_args = args + sorted_args + [base_bit]
 
             last_evt = self.scan_kernel(*scan_args,
-                    **dict(queue=queue, wait_for=wait_for))
+                    queue=queue, wait_for=wait_for)
             wait_for = [last_evt]
 
             # substitute sorted
@@ -875,11 +892,7 @@ class ListOfListsBuilder:
             __global ${index_t} *compressed_indices,
             __global ${index_t} *num_non_empty_list
         """
-        from sys import version_info
-        if version_info > (3, 0):
-            arguments = Template(arguments)
-        else:
-            arguments = Template(arguments, disable_unicode=True)
+        arguments = Template(arguments)
 
         from pyopencl.scan import GenericScanKernel
         return GenericScanKernel(
@@ -918,7 +931,7 @@ class ListOfListsBuilder:
                 continue
 
             name = "plb_loc_%s_count" % name
-            user_list_args.append(OtherArg("%s *%s" % (
+            user_list_args.append(OtherArg("{} *{}".format(
                 index_ctype, name), name))
 
         kernel_name = self.name_prefix+"_count"
@@ -989,10 +1002,10 @@ class ListOfListsBuilder:
                     VectorArg(index_dtype, "%s_compressed_indices" % name))
 
             index_name = "plb_%s_index" % name
-            user_list_args.append(OtherArg("%s *%s" % (
+            user_list_args.append(OtherArg("{} *{}".format(
                 index_ctype, index_name), index_name))
 
-            kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)
+            kernel_list_arg_values += f"{list_name}, &{index_name}, "
 
         kernel_name = self.name_prefix+"_write"
 
@@ -1107,6 +1120,9 @@ class ListOfListsBuilder:
 
         if wait_for is None:
             wait_for = []
+        else:
+            # We'll be modifying it below.
+            wait_for = list(wait_for)
 
         count_kernel = self.get_count_kernel(index_dtype)
         write_kernel = self.get_write_kernel(index_dtype)
@@ -1133,6 +1149,7 @@ class ListOfListsBuilder:
                 data_args.append(arg_val.base_data)
                 if arg_descr.with_offset:
                     data_args.append(arg_val.offset)
+                wait_for.extend(arg_val.events)
             else:
                 data_args.append(arg_val)
 
@@ -1177,7 +1194,7 @@ class ListOfListsBuilder:
 
         count_event = count_kernel(queue, gsize, lsize,
                 *(tuple(count_list_args) + data_args + (n_objects,)),
-                **dict(wait_for=wait_for))
+                wait_for=wait_for)
 
         compress_events = {}
         for name, dtype in self.list_names_and_dtypes:
@@ -1283,7 +1300,7 @@ class ListOfListsBuilder:
 
         evt = write_kernel(queue, gsize, lsize,
                 *(tuple(write_list_args) + data_args + (n_objects,)),
-                **dict(wait_for=scan_events))
+                wait_for=scan_events)
 
         return result, evt
 
@@ -1309,7 +1326,7 @@ def _make_cl_int_literal(value, dtype):
     return result
 
 
-class KeyValueSorter(object):
+class KeyValueSorter:
     """Given arrays *values* and *keys* of equal length
     and a number *nkeys* of keys, returns a tuple `(starts,
     lists)`, as follows: *values* and *keys* are sorted
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 046c841c69ef449f974c777f8fba80ee35eba0f0..b10c8cc47911ea8f9fe617fb32beefdfa947b398 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2,7 +2,6 @@
 
 # pylint:disable=unexpected-keyword-arg  # for @elwise_kernel_runner
 
-from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -29,8 +28,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 """
 
-import six
-from six.moves import range, reduce
+from functools import reduce
 
 import numpy as np
 import pyopencl.elementwise as elementwise
@@ -86,7 +84,7 @@ except Exception:
         return False
 
 
-class VecLookupWarner(object):
+class VecLookupWarner:
     def __getattr__(self, name):
         from warnings import warn
         warn("pyopencl.array.vec is deprecated. "
@@ -179,7 +177,7 @@ def elwise_kernel_runner(kernel_getter):
                 actual_args.append(arg)
         actual_args.append(repr_ary.size)
 
-        return knl(queue, gs, ls, *actual_args, **dict(wait_for=wait_for))
+        return knl(queue, gs, ls, *actual_args, wait_for=wait_for)
 
     try:
         from functools import update_wrapper
@@ -227,18 +225,18 @@ class _copy_queue:  # noqa
     pass
 
 
-class Array(object):
+class Array:
     """A :class:`numpy.ndarray` work-alike that stores its data and performs
     its computations on the compute device.  *shape* and *dtype* work exactly
     as in :mod:`numpy`.  Arithmetic methods in :class:`Array` support the
     broadcasting of scalars. (e.g. `array+5`)
 
-    *cq* must be a :class:`pyopencl.CommandQueue` or a :class:`pyopencl.Context`.
+    *cq* must be a :class:`~pyopencl.CommandQueue` or a :class:`~pyopencl.Context`.
 
     If it is a queue, *cq* specifies the queue in which the array carries out
     its computations by default. If a default queue (and thereby overloaded
     operators and many other niceties) are not desired, pass a
-    :class:`Context`.
+    :class:`~pyopencl.Context`.
 
     *allocator* may be `None` or a callable that, upon being called with an
     argument of the number of bytes to be allocated, returns an
@@ -326,6 +324,7 @@ class Array(object):
     .. attribute :: T
     .. automethod :: set
     .. automethod :: get
+    .. automethod :: get_async
     .. automethod :: copy
 
     .. automethod :: __str__
@@ -361,6 +360,7 @@ class Array(object):
     .. autoattribute :: real
     .. autoattribute :: imag
     .. automethod :: conj
+    .. automethod :: conjugate
 
     .. automethod :: __getitem__
     .. automethod :: __setitem__
@@ -456,11 +456,7 @@ class Array(object):
             for dim in shape:
                 size *= dim
         except TypeError:
-            import sys
-            if sys.version_info >= (3,):
-                admissible_types = (int, np.integer)
-            else:
-                admissible_types = (np.integer,) + six.integer_types
+            admissible_types = (int, np.integer)
 
             if not isinstance(shape, admissible_types):
                 raise TypeError("shape must either be iterable or "
@@ -468,6 +464,9 @@ class Array(object):
             size = shape
             shape = (shape,)
 
+        if any(dim < 0 for dim in shape):
+            raise ValueError("negative dimensions are not allowed")
+
         if isinstance(size, np.integer):
             size = size.item()
 
@@ -503,23 +502,22 @@ class Array(object):
         self.allocator = allocator
 
         if data is None:
-            if alloc_nbytes <= 0:
-                if alloc_nbytes == 0:
-                    # Work around CL not allowing zero-sized buffers.
-                    alloc_nbytes = 1
-
-                else:
-                    raise ValueError("cannot allocate CL buffer with "
-                            "negative size")
+            if alloc_nbytes < 0:
+                raise ValueError("cannot allocate CL buffer with "
+                        "negative size")
 
-            if allocator is None:
-                if context is None and queue is not None:
-                    context = queue.context
+            elif alloc_nbytes == 0:
+                self.base_data = None
 
-                self.base_data = cl.Buffer(
-                        context, cl.mem_flags.READ_WRITE, alloc_nbytes)
             else:
-                self.base_data = self.allocator(alloc_nbytes)
+                if allocator is None:
+                    if context is None and queue is not None:
+                        context = queue.context
+
+                    self.base_data = cl.Buffer(
+                            context, cl.mem_flags.READ_WRITE, alloc_nbytes)
+                else:
+                    self.base_data = self.allocator(alloc_nbytes)
         else:
             self.base_data = data
 
@@ -604,6 +602,10 @@ class Array(object):
         *ary* must have the same dtype and size (not necessarily shape) as
         *self*.
 
+        *async_* is a Boolean indicating whether the function is allowed
+        to return before the transfer completes. To avoid synchronization
+        bugs, this defaults to *False*.
+
         .. versionchanged:: 2017.2.1
 
             Python 3.7 makes ``async`` a reserved keyword. On older Pythons,
@@ -648,23 +650,7 @@ class Array(object):
                     is_blocking=not async_)
             self.add_event(event1)
 
-    def get(self, queue=None, ary=None, async_=None, **kwargs):
-        """Transfer the contents of *self* into *ary* or a newly allocated
-        :mod:`numpy.ndarray`. If *ary* is given, it must have the same
-        shape and dtype.
-
-        .. versionchanged:: 2015.2
-
-            *ary* with different shape was deprecated.
-
-        .. versionchanged:: 2017.2.1
-
-            Python 3.7 makes ``async`` a reserved keyword. On older Pythons,
-            we will continue to  accept *async* as a parameter, however this
-            should be considered deprecated. *async_* is the new, official
-            spelling.
-        """
-
+    def _get(self, queue=None, ary=None, async_=None, **kwargs):
         # {{{ handle 'async' deprecation
 
         async_arg = kwargs.pop("async", None)
@@ -709,15 +695,66 @@ class Array(object):
                     "to associate one.")
 
         if self.size:
-            cl.enqueue_copy(queue, ary, self.base_data,
+            event1 = cl.enqueue_copy(queue, ary, self.base_data,
                     device_offset=self.offset,
                     wait_for=self.events, is_blocking=not async_)
+            self.add_event(event1)
+        else:
+            event1 = None
+
+        return ary, event1
+
+    def get(self, queue=None, ary=None, async_=None, **kwargs):
+        """Transfer the contents of *self* into *ary* or a newly allocated
+        :class:`numpy.ndarray`. If *ary* is given, it must have the same
+        shape and dtype.
+
+        .. versionchanged:: 2019.1.2
+
+            Calling with ``async_=True`` was deprecated and replaced by
+            :meth:`get_async`.
+            The event returned by :meth:`pyopencl.enqueue_copy` is now stored into
+            :attr:`events` to ensure data is not modified before the copy is
+            complete.
+
+        .. versionchanged:: 2015.2
+
+            *ary* with different shape was deprecated.
+
+        .. versionchanged:: 2017.2.1
+
+            Python 3.7 makes ``async`` a reserved keyword. On older Pythons,
+            we will continue to  accept *async* as a parameter, however this
+            should be considered deprecated. *async_* is the new, official
+            spelling.
+        """
+
+        if async_:
+            from warnings import warn
+            warn("calling pyopencl.Array.get with `async_=True` is deprecated. "
+                    "Please use pyopencl.Array.get_async for asynchronous "
+                    "device-to-host transfers",
+                    DeprecationWarning, 2)
+
+        ary, event1 = self._get(queue=queue, ary=ary, async_=async_, **kwargs)
 
         return ary
 
+    def get_async(self, queue=None, ary=None, **kwargs):
+        """
+        Asynchronous version of :meth:`get` which returns a tuple ``(ary, event)``
+        containing the host array *ary*
+        and the :class:`pyopencl.NannyEvent` *event* returned by
+        :meth:`pyopencl.enqueue_copy`.
+
+        .. versionadded:: 2019.1.2
+        """
+
+        return self._get(queue=queue, ary=ary, async_=True, **kwargs)
+
     def copy(self, queue=_copy_queue):
         """
-        :arg queue: The :class:`CommandQueue` for the returned array.
+        :arg queue: The :class:`~pyopencl.CommandQueue` for the returned array.
 
         .. versionchanged:: 2017.1.2
             Updates the queue of the returned array.
@@ -751,7 +788,7 @@ class Array(object):
         return repr(self.get())
 
     def safely_stringify_for_pudb(self):
-        return "cl.Array %s %s" % (self.dtype, self.shape)
+        return f"cl.Array {self.dtype} {self.shape}"
 
     def __hash__(self):
         raise TypeError("pyopencl arrays are not hashable.")
@@ -1223,6 +1260,9 @@ class Array(object):
     def _zero_fill(self, queue=None, wait_for=None):
         queue = queue or self.queue
 
+        if not self.size:
+            return
+
         if (
                 queue._get_cl_version() >= (1, 2)
                 and cl.get_cl_header_version() >= (1, 2)):
@@ -1467,6 +1507,8 @@ class Array(object):
         else:
             return self
 
+    conjugate = conj
+
     # }}}
 
     # {{{ event management
@@ -1941,7 +1983,7 @@ def as_strided(ary, shape=None, strides=None):
             data=ary.data, strides=strides)
 
 
-class _same_as_transfer(object):  # noqa
+class _same_as_transfer:  # noqa
     pass
 
 
@@ -1950,7 +1992,7 @@ def to_device(queue, ary, allocator=None, async_=None,
     """Return a :class:`Array` that is an exact copy of the
     :class:`numpy.ndarray` instance *ary*.
 
-    :arg array_queue: The :class:`CommandQueue` which will
+    :arg array_queue: The :class:`~pyopencl.CommandQueue` which will
         be stored in the resulting array. Useful
         to make sure there is no implicit queue associated
         with the array by passing *None*.
@@ -2042,15 +2084,16 @@ def _arange_knl(result, start, step):
 
 
 def arange(queue, *args, **kwargs):
-    """Create a :class:`Array` filled with numbers spaced `step` apart,
-    starting from `start` and ending at `stop`.
+    """arange(queue, [start, ] stop [, step], **kwargs)
+    Create a :class:`Array` filled with numbers spaced `step` apart,
+    starting from `start` and ending at `stop`. If not given, *start*
+    defaults to 0 and *step* defaults to 1.
 
     For floating point arguments, the length of the result is
     `ceil((stop - start)/step)`.  This rule may result in the last
     element of the result being greater than `stop`.
 
-    *dtype*, if not specified, is taken as the largest common type
-    of *start*, *stop* and *step*.
+    *dtype* is a required keyword argument.
 
     .. versionchanged:: 2011.1
         *context* argument was deprecated.
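
As the revised docstring says, *dtype* is a required keyword argument for
``arange``. A short illustrative example::

    import numpy as np
    import pyopencl as cl
    import pyopencl.array

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # start=0, stop=10, step=2 -> [0., 2., 4., 6., 8.]
    a = cl.array.arange(queue, 0, 10, 2, dtype=np.float32)
    print(a.get())
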
@@ -2098,7 +2141,7 @@ def arange(queue, *args, **kwargs):
         raise ValueError("too many arguments")
 
     admissible_names = ["start", "stop", "step", "dtype", "allocator"]
-    for k, v in six.iteritems(kwargs):
+    for k, v in kwargs.items():
         if k in admissible_names:
             if getattr(inf, k) is None:
                 setattr(inf, k, v)
@@ -2382,7 +2425,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + [use_fill_cla.base_data, use_fill_cla.offset]
                     + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
-                **dict(wait_for=wait_for_this))
+                wait_for=wait_for_this)
 
         for o in out[chunk_slice]:
             o.add_event(evt)
diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py
index 4c13cbaa871bd88556c87618f4a42b6d619abd68..29fff563a05ebd4393d28068ecedef68f917945a 100644
--- a/pyopencl/bitonic_sort.py
+++ b/pyopencl/bitonic_sort.py
@@ -1,5 +1,3 @@
-from __future__ import division, with_statement, absolute_import, print_function
-
 __copyright__ = """
 Copyright (c) 2011, Eric Bainville
 Copyright (c) 2015, Ilya Efimoff
@@ -50,7 +48,7 @@ def _is_power_of_2(n):
     return n == 0 or 2**bitlog2(n) == n
 
 
-class BitonicSort(object):
+class BitonicSort:
     """Sort an array (or one axis of one) using a sorting network.
 
     Will only work if the axis of the array to be sorted has a length
@@ -64,14 +62,14 @@ class BitonicSort(object):
     """
 
     kernels_srcs = {
-            'B2': _tmpl.ParallelBitonic_B2,
-            'B4': _tmpl.ParallelBitonic_B4,
-            'B8': _tmpl.ParallelBitonic_B8,
-            'B16': _tmpl.ParallelBitonic_B16,
-            'C4': _tmpl.ParallelBitonic_C4,
-            'BL': _tmpl.ParallelBitonic_Local,
-            'BLO': _tmpl.ParallelBitonic_Local_Optim,
-            'PML': _tmpl.ParallelMerge_Local
+            "B2": _tmpl.ParallelBitonic_B2,
+            "B4": _tmpl.ParallelBitonic_B4,
+            "B8": _tmpl.ParallelBitonic_B8,
+            "B16": _tmpl.ParallelBitonic_B16,
+            "C4": _tmpl.ParallelBitonic_C4,
+            "BL": _tmpl.ParallelBitonic_Local,
+            "BLO": _tmpl.ParallelBitonic_Local_Optim,
+            "PML": _tmpl.ParallelMerge_Local
             }
 
     def __init__(self, context):
@@ -162,7 +160,7 @@ class BitonicSort(object):
         key_ctype = dtype_to_ctype(key_dtype)
 
         if idx_dtype is None:
-            idx_ctype = 'uint'  # Dummy
+            idx_ctype = "uint"  # Dummy
 
         else:
             idx_ctype = dtype_to_ctype(idx_dtype)
@@ -206,7 +204,7 @@ class BitonicSort(object):
 
         length = wg >> 1
         prg = self.get_program(
-                'BLO', argsort, (1, 1, key_ctype, idx_ctype, ds, ns))
+                "BLO", argsort, (1, 1, key_ctype, idx_ctype, ds, ns))
         run_queue.append((prg.run, size, (wg,), True))
 
         while length < ds:
@@ -215,16 +213,16 @@ class BitonicSort(object):
                 ninc = 0
                 direction = length << 1
                 if allowb16 and inc >= 8 and ninc == 0:
-                    letter = 'B16'
+                    letter = "B16"
                     ninc = 4
                 elif allowb8 and inc >= 4 and ninc == 0:
-                    letter = 'B8'
+                    letter = "B8"
                     ninc = 3
                 elif allowb4 and inc >= 2 and ninc == 0:
-                    letter = 'B4'
+                    letter = "B4"
                     ninc = 2
                 elif inc >= 0:
-                    letter = 'B2'
+                    letter = "B2"
                     ninc = 1
 
                 nthreads = size >> ninc
diff --git a/pyopencl/cache.py b/pyopencl/cache.py
index fd46dc5c88fd85b3731f5758d38ee56492f9f69e..adae470b2a58f96db11f22d38a8e374712ee259f 100644
--- a/pyopencl/cache.py
+++ b/pyopencl/cache.py
@@ -1,6 +1,5 @@
 """PyOpenCL compiler cache."""
 
-from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2011 Andreas Kloeckner"
 
@@ -24,8 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
-from six.moves import zip
 import pyopencl._cl as _cl
 import re
 import sys
@@ -36,13 +33,8 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-try:
-    import hashlib
-    new_hash = hashlib.md5
-except ImportError:
-    # for Python << 2.5
-    import md5
-    new_hash = md5.new
+import hashlib
+new_hash = hashlib.md5
 
 
 def _erase_dir(dir):
@@ -54,7 +46,7 @@ def _erase_dir(dir):
 
 
 def update_checksum(checksum, obj):
-    if isinstance(obj, six.text_type):
+    if isinstance(obj, str):
         checksum.update(obj.encode("utf8"))
     else:
         checksum.update(obj)
@@ -62,7 +54,7 @@ def update_checksum(checksum, obj):
 
 # {{{ cleanup
 
-class CleanupBase(object):
+class CleanupBase:
     pass
 
 
@@ -106,6 +98,11 @@ class CacheLockManager(CleanupBase):
                     warn("could not obtain cache lock--delete '%s' if necessary"
                             % self.lock_file)
 
+                if attempts > 3 * 60:
+                    raise RuntimeError("waited more than three minutes "
+                            "on the lock file '%s'"
+                            "--something is wrong" % self.lock_file)
+
             cleanup_m.register(self)
 
     def clean_up(self):
@@ -168,8 +165,8 @@ def get_dependencies(src, include_path):
 
                 if included_file_name not in result:
                     try:
-                        src_file = open(included_file_name, "rt")
-                    except IOError:
+                        src_file = open(included_file_name)
+                    except OSError:
                         continue
 
                     try:
@@ -198,7 +195,7 @@ def get_dependencies(src, include_path):
 
     _inner(src)
 
-    result = list((name,) + vals for name, vals in six.iteritems(result))
+    result = list((name,) + vals for name, vals in result.items())
     result.sort()
 
     return result
@@ -275,7 +272,7 @@ def retrieve_from_cache(cache_dir, cache_key):
 
                 try:
                     info_file = open(info_path, "rb")
-                except IOError:
+                except OSError:
                     raise _InvalidInfoFile()
 
                 try:
@@ -330,8 +327,8 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
     if cache_dir is None:
         import appdirs
         cache_dir = join(appdirs.user_cache_dir("pyopencl", "pyopencl"),
-                "pyopencl-compiler-cache-v2-py%s" % (
-                    ".".join(str(i) for i in sys.version_info),))
+                "pyopencl-compiler-cache-v2-py{}".format(
+                    ".".join(str(i) for i in sys.version_info)))
 
     # {{{ ensure cache directory exists
 
@@ -369,7 +366,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
             logs.append(log)
 
     message = (75*"="+"\n").join(
-            "Build on %s succeeded, but said:\n\n%s" % (dev, log)
+            f"Build on {dev} succeeded, but said:\n\n{log}"
             for dev, log in zip(devices, logs)
             if log is not None and log.strip())
 
@@ -469,28 +466,36 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
 def create_built_program_from_source_cached(ctx, src, options_bytes, devices=None,
         cache_dir=None, include_path=None):
     try:
+        was_cached = False
+        already_built = False
         if cache_dir is not False:
             prg, already_built, was_cached = \
                     _create_built_program_from_source_cached(
                             ctx, src, options_bytes, devices, cache_dir,
                             include_path=include_path)
+            if was_cached and not already_built:
+                prg.build(options_bytes, devices)
+                already_built = True
         else:
             prg = _cl._Program(ctx, src)
-            was_cached = False
-            already_built = False
 
     except Exception as e:
         from pyopencl import Error
-        if (isinstance(e, Error)
-                and e.code == _cl.status_code.BUILD_PROGRAM_FAILURE):  # noqa pylint:disable=no-member
-            # no need to try again
+        build_program_failure = (isinstance(e, Error)
+                and e.code == _cl.status_code.BUILD_PROGRAM_FAILURE)  # noqa pylint:disable=no-member
+
+        # Mac error on intel CPU driver: can't build from cached version.
+        # If we get a build_program_failure from the cached version then
+        # build from source instead, otherwise report the failure.
+        if build_program_failure and not was_cached:
             raise
 
-        from warnings import warn
-        from traceback import format_exc
-        warn("PyOpenCL compiler caching failed with an exception:\n"
-                "[begin exception]\n%s[end exception]"
-                % format_exc())
+        if not build_program_failure:
+            from warnings import warn
+            from traceback import format_exc
+            warn("PyOpenCL compiler caching failed with an exception:\n"
+                    "[begin exception]\n%s[end exception]"
+                    % format_exc())
 
         prg = _cl._Program(ctx, src)
         was_cached = False
diff --git a/pyopencl/capture_call.py b/pyopencl/capture_call.py
index 09d483a57ad6d87387e519d8a72fea79142f3244..867365319f39f4e4a629aa6446b8a607c2d16b93 100644
--- a/pyopencl/capture_call.py
+++ b/pyopencl/capture_call.py
@@ -1,7 +1,3 @@
-from __future__ import with_statement, division
-from __future__ import absolute_import
-from six.moves import zip
-
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
 __license__ = """
@@ -51,8 +47,8 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs
     cg("")
 
     cg('CODE = r"""//CL//')
-    for l in source.split("\n"):
-        cg(l)
+    for line in source.split("\n"):
+        cg(line)
     cg('"""')
 
     # {{{ invocation
@@ -83,13 +79,13 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs
             elif isinstance(arg, (int, float)):
                 kernel_args.append(repr(arg))
             elif isinstance(arg, np.integer):
-                kernel_args.append("np.%s(%s)" % (
+                kernel_args.append("np.{}({})".format(
                     arg.dtype.type.__name__, repr(int(arg))))
             elif isinstance(arg, np.floating):
-                kernel_args.append("np.%s(%s)" % (
+                kernel_args.append("np.{}({})".format(
                     arg.dtype.type.__name__, repr(float(arg))))
             elif isinstance(arg, np.complexfloating):
-                kernel_args.append("np.%s(%s)" % (
+                kernel_args.append("np.{}({})".format(
                     arg.dtype.type.__name__, repr(complex(arg))))
             else:
                 try:
@@ -133,7 +129,7 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs
                     % ", ".join(
                         strify_dtype(dt) for dt in kernel._scalar_arg_dtypes))
 
-        cg("knl(queue, %s, %s," % (repr(g_size), repr(l_size)))
+        cg("knl(queue, {}, {},".format(repr(g_size), repr(l_size)))
         cg("    %s)" % ", ".join(kernel_args))
         cg("")
         cg("queue.finish()")
@@ -163,7 +159,7 @@ def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs
     # {{{ file trailer
 
     cg("")
-    cg("if __name__ == \"__main__\":")
+    cg('if __name__ == "__main__":')
     with Indentation(cg):
         cg("main()")
     cg("")
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
index eae523be2f045bcadafb28166001cc6beeaf445f..dfb8d0195a9bd7fc946ec50f9b79b2791db5e4e2 100644
--- a/pyopencl/characterize/__init__.py
+++ b/pyopencl/characterize/__init__.py
@@ -1,5 +1,3 @@
-from __future__ import division, absolute_import
-
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
@@ -24,8 +22,6 @@ THE SOFTWARE.
 
 import pyopencl as cl
 from pytools import memoize
-import six
-from six.moves import range, zip
 
 
 class CLCharacterizationWarning(UserWarning):
@@ -237,13 +233,13 @@ def why_not_local_access_conflict_free(dev, itemsize,
 
             bank = (addr // gran) % bank_count
             bank_accesses.setdefault(bank, []).append(
-                    "w.item %s -> %s" % (work_item_id, idx[::-1]))
+                    "w.item {} -> {}".format(work_item_id, idx[::-1]))
 
         conflict_multiplicity = max(
-                len(acc) for acc in six.itervalues(bank_accesses))
+                len(acc) for acc in bank_accesses.values())
 
         if conflict_multiplicity > 1:
-            for bank, acc in six.iteritems(bank_accesses):
+            for bank, acc in bank_accesses.items():
                 if len(acc) == conflict_multiplicity:
                     conflicts.append(
                             (conflict_multiplicity,
diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py
index f0c769077fe4a2d2959b5f39e2f46588c0eca3cc..f629240438ddd92404a1a8f29fa100761347c95b 100644
--- a/pyopencl/characterize/performance.py
+++ b/pyopencl/characterize/performance.py
@@ -1,5 +1,3 @@
-from __future__ import division, absolute_import, print_function
-
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
@@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from six.moves import range
 import pyopencl as cl
 import numpy as np
 
@@ -100,7 +97,7 @@ def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
 
 # {{{ transfer measurements
 
-class HostDeviceTransferBase(object):
+class HostDeviceTransferBase:
     def __init__(self, queue, block_size):
         self.queue = queue
         self.host_buf = np.empty(block_size, dtype=np.uint8)
@@ -117,7 +114,7 @@ class DeviceToHostTransfer(HostDeviceTransferBase):
         return cl.enqueue_copy(self. queue, self.host_buf, self.dev_buf)
 
 
-class DeviceToDeviceTransfer(object):
+class DeviceToDeviceTransfer:
     def __init__(self, queue, block_size):
         self.queue = queue
         mf = cl.mem_flags
diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py
index 2ae8bfbfa22fd1842134c8db96be03a2e7fb9a44..58c20ce5f48fa25379dd5d84bce95709537c6d61 100644
--- a/pyopencl/clmath.py
+++ b/pyopencl/clmath.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 # pylint:disable=unexpected-keyword-arg  # for @elwise_kernel_runner
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
@@ -35,7 +33,7 @@ def _make_unary_array_func(name):
     def knl_runner(result, arg):
         if arg.dtype.kind == "c":
             from pyopencl.elementwise import complex_dtype_to_name
-            fname = "%s_%s" % (complex_dtype_to_name(arg.dtype), name)
+            fname = "{}_{}".format(complex_dtype_to_name(arg.dtype), name)
         else:
             fname = name
 
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index 96acce1f40c15cd5d87cc71a5761e328d950146c..aada2bd850d305735a62b67517cb4418d6fe7687 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import division, absolute_import
-
 __copyright__ = "Copyright (C) 2009-16 Andreas Kloeckner"
 
 __license__ = """
@@ -26,7 +23,7 @@ THE SOFTWARE.
 
 # {{{ documentation
 
-__doc__ = u"""
+__doc__ = """
 PyOpenCL now includes and uses some of the `Random123 random number generators
 <https://www.deshawresearch.com/resources_random123.html>`_ by D.E. Shaw
 Research.  In addition to being usable through the convenience functions above,
@@ -72,7 +69,7 @@ import numpy as np
 
 # {{{ RanluxGenerator (deprecated)
 
-class RanluxGenerator(object):
+class RanluxGenerator:
     """
     .. warning::
 
@@ -382,21 +379,21 @@ class RanluxGenerator(object):
     @memoize_method
     def get_sync_kernel(self):
         src = """//CL//
-            %(defines)s
+            {defines}
 
             #include <pyopencl-ranluxcl.cl>
 
             kernel void sync(
                 global ranluxcl_state_t *ranluxcltab)
-            {
+            {{
               ranluxcl_state_t ranluxclstate;
               ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
               ranluxcl_synchronize(&ranluxclstate);
               ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
-            }
-            """ % {
-                "defines": self.generate_settings_defines(),
-                }
+            }}
+            """.format(
+                defines=self.generate_settings_defines(),
+                )
         prg = cl.Program(self.context, src).build()
         return prg.sync
 
@@ -414,7 +411,7 @@ class RanluxGenerator(object):
 
 # {{{ Random123 generators
 
-class Random123GeneratorBase(object):
+class Random123GeneratorBase:
     """
     .. versionadded:: 2016.2
 
@@ -535,9 +532,9 @@ class Random123GeneratorBase(object):
                     "unsupported RNG distribution/data type combination '%s/%s'"
                     % rng_key)
 
-        kernel_name = "rng_gen_%s_%s" % (self.generator_name, distribution)
+        kernel_name = f"rng_gen_{self.generator_name}_{distribution}"
         src = """//CL//
-            #include <%(header_name)s>
+            #include <{header_name}>
 
             #ifndef M_PI
             #ifdef M_PI_F
@@ -547,29 +544,29 @@ class Random123GeneratorBase(object):
             #endif
             #endif
 
-            typedef %(output_t)s output_t;
-            typedef %(output_t)s4 output_vec_t;
-            typedef %(gen_name)s_ctr_t ctr_t;
-            typedef %(gen_name)s_key_t key_t;
+            typedef {output_t} output_t;
+            typedef {output_t}4 output_vec_t;
+            typedef {gen_name}_ctr_t ctr_t;
+            typedef {gen_name}_key_t key_t;
 
             uint4 gen_bits(key_t *key, ctr_t *ctr)
-            {
-                union {
+            {{
+                union {{
                     ctr_t ctr_el;
                     uint4 vec_el;
-                } u;
+                }} u;
 
-                u.ctr_el = %(gen_name)s(*ctr, *key);
+                u.ctr_el = {gen_name}(*ctr, *key);
                 if (++ctr->v[0] == 0)
                     if (++ctr->v[1] == 0)
                         ++ctr->v[2];
 
                 return u.vec_el;
-            }
+            }}
 
-            #if %(include_box_muller)s
+            #if {include_box_muller}
             output_vec_t box_muller(output_vec_t x)
-            {
+            {{
                 #define BOX_MULLER(I, COMPA, COMPB) \
                     output_t r##I = sqrt(-2*log(x.COMPA)); \
                     output_t c##I; \
@@ -578,14 +575,14 @@ class Random123GeneratorBase(object):
                 BOX_MULLER(0, x, y);
                 BOX_MULLER(1, z, w);
                 return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1);
-            }
+            }}
             #endif
 
-            #define GET_RANDOM_NUM(gen) %(rng_expr)s
+            #define GET_RANDOM_NUM(gen) {rng_expr}
 
-            kernel void %(kernel_name)s(
+            kernel void {kernel_name}(
                 int k1,
-                #if %(key_length)s > 2
+                #if {key_length} > 2
                 int k2, int k3,
                 #endif
                 int c0, int c1, int c2, int c3,
@@ -593,23 +590,23 @@ class Random123GeneratorBase(object):
                 long out_size,
                 output_t scale,
                 output_t shift)
-            {
-                #if %(key_length)s == 2
-                key_t k = {{get_global_id(0), k1}};
+            {{
+                #if {key_length} == 2
+                key_t k = {{{{get_global_id(0), k1}}}};
                 #else
-                key_t k = {{get_global_id(0), k1, k2, k3}};
+                key_t k = {{{{get_global_id(0), k1, k2, k3}}}};
                 #endif
 
-                ctr_t c = {{c0, c1, c2, c3}};
+                ctr_t c = {{{{c0, c1, c2, c3}}}};
 
                 // output bulk
                 unsigned long idx = get_global_id(0)*4;
                 while (idx + 4 < out_size)
-                {
+                {{
                     output_vec_t ran = GET_RANDOM_NUM(gen_bits(&k, &c));
                     vstore4(ran, 0, &output[idx]);
                     idx += 4*get_global_size(0);
-                }
+                }}
 
                 // output tail
                 output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c));
@@ -621,16 +618,16 @@ class Random123GeneratorBase(object):
                   output[idx+2] = tail_ran.z;
                 if (idx+3 < out_size)
                   output[idx+3] = tail_ran.w;
-            }
-            """ % {
-                "kernel_name": kernel_name,
-                "gen_name": self.generator_name,
-                "header_name": self.header_name,
-                "output_t": c_type,
-                "key_length": self.key_length,
-                "include_box_muller": int(distribution == "normal"),
-                "rng_expr": rng_expr
-                }
+            }}
+            """.format(
+                kernel_name=kernel_name,
+                gen_name=self.generator_name,
+                header_name=self.header_name,
+                output_t=c_type,
+                key_length=self.key_length,
+                include_box_muller=int(distribution == "normal"),
+                rng_expr=rng_expr
+                )
 
         prg = cl.Program(self.context, src).build()
         knl = getattr(prg, kernel_name)
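The kernel-source templates in this file move from %-interpolation to str.format(): every brace that must survive into the OpenCL source is doubled, and C initializers that already used {{...}} therefore appear as {{{{...}}}} in the template. A minimal sketch of the escaping rule (illustrative, using a made-up kernel, not taken from the diff):

    # Under str.format(), "{{" and "}}" are literal braces; single braces
    # would be read as replacement fields and break on the C code below.
    template = "kernel void {name}() {{ int2 p = {{0, 1}}; }}"
    print(template.format(name="demo"))
    # -> kernel void demo() { int2 p = {0, 1}; }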
diff --git a/pyopencl/cltypes.py b/pyopencl/cltypes.py
index d1ba79f3f8e3905bdee8f119dca3e57a8dda6509..fed1834ca087fed17791171536e70e0446c580c6 100644
--- a/pyopencl/cltypes.py
+++ b/pyopencl/cltypes.py
@@ -1,5 +1,3 @@
-# encoding: utf8
-
 __copyright__ = "Copyright (C) 2016 Jonathan Mackenzie"
 
 __license__ = """
@@ -24,7 +22,7 @@ import numpy as np
 from pyopencl.tools import get_or_register_dtype
 import warnings
 
-if __file__.endswith('array.py'):
+if __file__.endswith("array.py"):
     warnings.warn("pyopencl.array.vec is deprecated. Please use pyopencl.cltypes")
 
 """
@@ -48,8 +46,8 @@ double = np.float64
 
 def _create_vector_types():
     _mapping = [(k, globals()[k]) for k in
-                ['char', 'uchar', 'short', 'ushort', 'int',
-                 'uint', 'long', 'ulong', 'float', 'double']]
+                ["char", "uchar", "short", "ushort", "int",
+                 "uint", "long", "ulong", "float", "double"]]
 
     def set_global(key, val):
         globals()[key] = val
diff --git a/pyopencl/compyte b/pyopencl/compyte
index 49e670e0ab7bbc822032196b3478522c04168d6f..3367a19729cfe42d51133453b7bdfa1756a853d9 160000
--- a/pyopencl/compyte
+++ b/pyopencl/compyte
@@ -1 +1 @@
-Subproject commit 49e670e0ab7bbc822032196b3478522c04168d6f
+Subproject commit 3367a19729cfe42d51133453b7bdfa1756a853d9
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index cbd8d74666d3b7eca797672ca4f80224dd3f150c..357aa2bbf17477713905d040376ec199a518f877 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -1,9 +1,5 @@
 """Elementwise functionality."""
 
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-from six.moves import zip
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -83,29 +79,29 @@ def get_elwise_program(context, arguments, operation,
                 stacklevel=3)
 
     source = ("""//CL//
-        %(preamble)s
+        {preamble}
 
         #define PYOPENCL_ELWISE_CONTINUE continue
 
-        __kernel void %(name)s(%(arguments)s)
-        {
+        __kernel void {name}({arguments})
+        {{
           int lid = get_local_id(0);
           int gsize = get_global_size(0);
           int work_group_start = get_local_size(0)*get_group_id(0);
           long i;
 
-          %(loop_prep)s;
-          %(body)s
-          %(after_loop)s;
-        }
-        """ % {
-            "arguments": ", ".join(arg.declarator() for arg in arguments),
-            "name": name,
-            "preamble": preamble,
-            "loop_prep": loop_prep,
-            "after_loop": after_loop,
-            "body": body % dict(operation=operation),
-            })
+          {loop_prep};
+          {body}
+          {after_loop};
+        }}
+        """.format(
+            arguments=", ".join(arg.declarator() for arg in arguments),
+            name=name,
+            preamble=preamble,
+            loop_prep=loop_prep,
+            after_loop=after_loop,
+            body=body % dict(operation=operation),
+            ))
 
     from pyopencl import Program
     return Program(context, source).build(options)
@@ -136,7 +132,7 @@ def get_elwise_kernel_and_types(context, arguments, operation,
                         #define PYOPENCL_DEFINE_CDOUBLE
                         """)
                     have_double_pragma = True
-            if arg.dtype.kind == 'c':
+            if arg.dtype.kind == "c":
                 if not have_complex_include:
                     includes.append("#include <pyopencl-complex.h>\n")
                     have_complex_include = True
@@ -250,6 +246,15 @@ class ElementwiseKernel:
         use_range = range_ is not None or slice_ is not None
         kernel, arg_descrs = self.get_kernel(use_range)
 
+        queue = kwargs.pop("queue", None)
+        wait_for = kwargs.pop("wait_for", None)
+
+        if wait_for is None:
+            wait_for = []
+        else:
+            # We'll be modifying it below.
+            wait_for = list(wait_for)
+
         # {{{ assemble arg array
 
         invocation_args = []
@@ -265,13 +270,12 @@ class ElementwiseKernel:
                 invocation_args.append(arg.base_data)
                 if arg_descr.with_offset:
                     invocation_args.append(arg.offset)
+                wait_for.extend(arg.events)
             else:
                 invocation_args.append(arg)
 
         # }}}
 
-        queue = kwargs.pop("queue", None)
-        wait_for = kwargs.pop("wait_for", None)
         if kwargs:
             raise TypeError("unknown keyword arguments: '%s'"
                     % ", ".join(kwargs))
@@ -465,20 +469,20 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
 def get_copy_kernel(context, dtype_dest, dtype_src):
     src = "src[i]"
     if dtype_dest.kind == "c" != dtype_src.kind:
-        src = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), src)
+        src = "{}_fromreal({})".format(complex_dtype_to_name(dtype_dest), src)
 
     if dtype_dest.kind == "c" and dtype_src != dtype_dest:
-        src = "%s_cast(%s)" % (complex_dtype_to_name(dtype_dest), src),
+        src = "{}_cast({})".format(complex_dtype_to_name(dtype_dest), src),
 
     if dtype_dest != dtype_src and (
             dtype_dest.kind == "V" or dtype_src.kind == "V"):
         raise TypeError("copying between non-identical struct types")
 
     return get_elwise_kernel(context,
-            "%(tp_dest)s *dest, %(tp_src)s *src" % {
-                "tp_dest": dtype_to_ctype(dtype_dest),
-                "tp_src": dtype_to_ctype(dtype_src),
-                },
+            "{tp_dest} *dest, {tp_src} *src".format(
+                tp_dest=dtype_to_ctype(dtype_dest),
+                tp_src=dtype_to_ctype(dtype_src),
+                ),
             "dest[i] = %s" % src,
             preamble=dtype_to_c_struct(context.devices[0], dtype_dest),
             name="copy")
@@ -512,10 +516,10 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
         by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y)
 
     if x_is_complex and not y_is_complex:
-        by = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_x), by)
+        by = "{}_fromreal({})".format(complex_dtype_to_name(dtype_x), by)
 
     if not x_is_complex and y_is_complex:
-        ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax)
+        ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_y), ax)
 
     if x_is_complex or y_is_complex:
         result = (
@@ -525,14 +529,14 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
                     by=by,
                     root=complex_dtype_to_name(dtype_z)))
     else:
-        result = "%s + %s" % (ax, by)
+        result = f"{ax} + {by}"
 
     return get_elwise_kernel(context,
-            "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                "tp_z": dtype_to_ctype(dtype_z),
-                },
+            "{tp_z} *z, {tp_x} a, {tp_x} *x, {tp_y} b, {tp_y} *y".format(
+                tp_x=dtype_to_ctype(dtype_x),
+                tp_y=dtype_to_ctype(dtype_y),
+                tp_z=dtype_to_ctype(dtype_z),
+                ),
             "z[i] = %s" % result,
             name="axpbyz")
 
@@ -551,33 +555,33 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
         x = "x[i]"
 
         if dtype_x != dtype_z:
-            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
+            x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x)
 
         if a_is_complex:
             if dtype_a != dtype_z:
-                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
+                a = "{}_cast({})".format(complex_dtype_to_name(dtype_z), a)
 
-            ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+            ax = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_z), a, x)
         else:
-            ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+            ax = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_z), a, x)
     elif a_is_complex:
         a = "a"
         x = "x[i]"
 
         if dtype_a != dtype_z:
-            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
-        ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+            a = "{}_cast({})".format(complex_dtype_to_name(dtype_z), a)
+        ax = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_z), a, x)
 
     b = "b"
     if z_is_complex and not b_is_complex:
-        b = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), b)
+        b = "{}_fromreal({})".format(complex_dtype_to_name(dtype_z), b)
 
     if z_is_complex and not (a_is_complex or x_is_complex):
-        ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), ax)
+        ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_z), ax)
 
     if z_is_complex:
-        ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax)
-        b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b)
+        ax = "{}_cast({})".format(complex_dtype_to_name(dtype_z), ax)
+        b = "{}_cast({})".format(complex_dtype_to_name(dtype_z), b)
 
     if a_is_complex or x_is_complex or b_is_complex:
         expr = "{root}_add({ax}, {b})".format(
@@ -585,15 +589,15 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
                 b=b,
                 root=complex_dtype_to_name(dtype_z))
     else:
-        expr = "%s + %s" % (ax, b)
+        expr = f"{ax} + {b}"
 
     return get_elwise_kernel(context,
-            "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % {
-                "tp_a": dtype_to_ctype(dtype_a),
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_b": dtype_to_ctype(dtype_b),
-                "tp_z": dtype_to_ctype(dtype_z),
-                },
+            "{tp_z} *z, {tp_a} a, {tp_x} *x,{tp_b} b".format(
+                tp_a=dtype_to_ctype(dtype_a),
+                tp_x=dtype_to_ctype(dtype_x),
+                tp_b=dtype_to_ctype(dtype_b),
+                tp_z=dtype_to_ctype(dtype_z),
+                ),
             "z[i] = " + expr,
             name="axpb")
 
@@ -607,25 +611,25 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
     y = "y[i]"
 
     if x_is_complex and dtype_x != dtype_z:
-        x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
+        x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x)
     if y_is_complex and dtype_y != dtype_z:
-        y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y)
+        y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y)
 
     if x_is_complex and y_is_complex:
-        xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xy = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif x_is_complex and not y_is_complex:
-        xy = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xy = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif not x_is_complex and y_is_complex:
-        xy = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xy = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     else:
-        xy = "%s * %s" % (x, y)
+        xy = f"{x} * {y}"
 
     return get_elwise_kernel(context,
-            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                "tp_z": dtype_to_ctype(dtype_z),
-                },
+            "{tp_z} *z, {tp_x} *x, {tp_y} *y".format(
+                tp_x=dtype_to_ctype(dtype_x),
+                tp_y=dtype_to_ctype(dtype_y),
+                tp_z=dtype_to_ctype(dtype_z),
+                ),
             "z[i] = %s" % xy,
             name="multiply")
 
@@ -641,28 +645,28 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
 
     if z_is_complex and dtype_x != dtype_y:
         if x_is_complex and dtype_x != dtype_z:
-            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
+            x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x)
         if y_is_complex and dtype_y != dtype_z:
-            y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y)
+            y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y)
 
     if x_is_complex and y_is_complex:
-        xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xoy = "{}_divide({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif not x_is_complex and y_is_complex:
-        xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xoy = "{}_rdivide({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif x_is_complex and not y_is_complex:
-        xoy = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        xoy = "{}_divider({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     else:
-        xoy = "%s / %s" % (x, y)
+        xoy = f"{x} / {y}"
 
     if z_is_complex:
-        xoy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xoy)
+        xoy = "{}_cast({})".format(complex_dtype_to_name(dtype_z), xoy)
 
     return get_elwise_kernel(context,
-            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                "tp_z": dtype_to_ctype(dtype_z),
-                },
+            "{tp_z} *z, {tp_x} *x, {tp_y} *y".format(
+                tp_x=dtype_to_ctype(dtype_x),
+                tp_y=dtype_to_ctype(dtype_y),
+                tp_z=dtype_to_ctype(dtype_z),
+                ),
             "z[i] = %s" % xoy,
             name="divide")
 
@@ -679,25 +683,25 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z):
 
     if z_is_complex and dtype_x != dtype_y:
         if x_is_complex and dtype_x != dtype_z:
-            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
+            x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x)
         if y_is_complex and dtype_y != dtype_z:
-            y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y)
+            y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y)
 
     if x_is_complex and y_is_complex:
-        yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
+        yox = "{}_divide({}, {})".format(complex_dtype_to_name(dtype_z), y, x)
     elif not y_is_complex and x_is_complex:
-        yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
+        yox = "{}_rdivide({}, {})".format(complex_dtype_to_name(dtype_z), y, x)
     elif y_is_complex and not x_is_complex:
-        yox = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
+        yox = "{}_divider({}, {})".format(complex_dtype_to_name(dtype_z), y, x)
     else:
-        yox = "%s / %s" % (y, x)
+        yox = f"{y} / {x}"
 
     return get_elwise_kernel(context,
-            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s y" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                "tp_z": dtype_to_ctype(dtype_z),
-                },
+            "{tp_z} *z, {tp_x} *x, {tp_y} y".format(
+                tp_x=dtype_to_ctype(dtype_x),
+                tp_y=dtype_to_ctype(dtype_y),
+                tp_z=dtype_to_ctype(dtype_z),
+                ),
             "z[i] = %s" % yox,
             name="divide_r")
 
@@ -705,9 +709,9 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z):
 @context_dependent_memoize
 def get_fill_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s *z, %(tp)s a" % {
-                "tp": dtype_to_ctype(dtype),
-                },
+            "{tp} *z, {tp} a".format(
+                tp=dtype_to_ctype(dtype),
+                ),
             "z[i] = a",
             preamble=dtype_to_c_struct(context.devices[0], dtype),
             name="fill")
@@ -716,9 +720,9 @@ def get_fill_kernel(context, dtype):
 @context_dependent_memoize
 def get_reverse_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s *z, %(tp)s *y" % {
-                "tp": dtype_to_ctype(dtype),
-                },
+            "{tp} *z, {tp} *y".format(
+                tp=dtype_to_ctype(dtype),
+                ),
             "z[i] = y[n-1-i]",
             name="reverse")
 
@@ -764,23 +768,23 @@ def get_pow_kernel(context, dtype_x, dtype_y, dtype_z,
 
     if z_is_complex and dtype_x != dtype_y:
         if x_is_complex and dtype_x != dtype_z:
-            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
+            x = "{}_cast({})".format(complex_dtype_to_name(dtype_z), x)
         if y_is_complex and dtype_y != dtype_z:
-            y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y)
+            y = "{}_cast({})".format(complex_dtype_to_name(dtype_z), y)
     elif dtype_x != dtype_y:
         if dtype_x != dtype_z:
-            x = "(%s) (%s)" % (dtype_to_ctype(dtype_z), x)
+            x = "({}) ({})".format(dtype_to_ctype(dtype_z), x)
         if dtype_y != dtype_z:
-            y = "(%s) (%s)" % (dtype_to_ctype(dtype_z), y)
+            y = "({}) ({})".format(dtype_to_ctype(dtype_z), y)
 
     if x_is_complex and y_is_complex:
-        result = "%s_pow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        result = "{}_pow({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif x_is_complex and not y_is_complex:
-        result = "%s_powr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        result = "{}_powr({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     elif not x_is_complex and y_is_complex:
-        result = "%s_rpow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+        result = "{}_rpow({}, {})".format(complex_dtype_to_name(dtype_z), x, y)
     else:
-        result = "pow(%s, %s)" % (x, y)
+        result = f"pow({x}, {y})"
 
     return get_elwise_kernel(context,
             ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) % {
@@ -876,7 +880,7 @@ def get_binary_func_kernel(context, func_name, x_dtype, y_dtype, out_dtype,
 def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype,
                                  out_dtype, preamble="", name=None):
     if (np.array(0, x_dtype) * np.array(0, y_dtype)).itemsize > 4:
-        arg_type = 'double'
+        arg_type = "double"
         preamble = """
         #if __OPENCL_C_VERSION__ < 120
         #pragma OPENCL EXTENSION cl_khr_fp64: enable
@@ -884,13 +888,13 @@ def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype,
         #define PYOPENCL_DEFINE_CDOUBLE
         """ + preamble
     else:
-        arg_type = 'float'
+        arg_type = "float"
     return get_elwise_kernel(context, [
         VectorArg(out_dtype, "z", with_offset=True),
         VectorArg(x_dtype, "x", with_offset=True),
         VectorArg(y_dtype, "y", with_offset=True),
         ],
-        "z[i] = %s((%s)x[i], (%s)y[i])" % (func_name, arg_type, arg_type),
+        f"z[i] = {func_name}(({arg_type})x[i], ({arg_type})y[i])",
         name="%s_kernel" % func_name if name is None else name,
         preamble=preamble)
 
@@ -898,7 +902,7 @@ def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype,
 @context_dependent_memoize
 def get_fmod_kernel(context, out_dtype=np.float32, arg_dtype=np.float32,
                     mod_dtype=np.float32):
-    return get_float_binary_func_kernel(context, 'fmod', arg_dtype,
+    return get_float_binary_func_kernel(context, "fmod", arg_dtype,
                                         mod_dtype, out_dtype)
 
 
@@ -936,7 +940,7 @@ def get_frexp_kernel(context, sign_dtype=np.float32, exp_dtype=np.float32,
 def get_ldexp_kernel(context, out_dtype=np.float32, sig_dtype=np.float32,
                      expt_dtype=np.float32):
     return get_binary_func_kernel(
-        context, '_PYOCL_LDEXP', sig_dtype, expt_dtype, out_dtype,
+        context, "_PYOCL_LDEXP", sig_dtype, expt_dtype, out_dtype,
         preamble="#define _PYOCL_LDEXP(x, y) ldexp(x, (int)(y))",
         name="ldexp_kernel")
 
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index b580c5375e298ff5d5864c52cebd656af42eac89..c996768d97d8f9a3e58a99e4839db5f37143128e 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -1,5 +1,3 @@
-from __future__ import division, absolute_import
-
 __copyright__ = """
 Copyright (C) 2017 Andreas Kloeckner
 """
@@ -32,22 +30,20 @@ import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
 from pyopencl.tools import _NumpyTypesKeyBuilder
 
-_PYPY = '__pypy__' in sys.builtin_module_names
-_CPY2 = not _PYPY and sys.version_info < (3,)
-_CPY26 = _CPY2 and sys.version_info < (2, 7)
+_PYPY = "__pypy__" in sys.builtin_module_names
 
 
 # {{{ arg packing helpers
 
 _size_t_char = ({
-    8: 'Q',
-    4: 'L',
-    2: 'H',
-    1: 'B',
+    8: "Q",
+    4: "L",
+    2: "H",
+    1: "B",
 })[_cl._sizeof_size_t()]
 _type_char_map = {
-    'n': _size_t_char.lower(),
-    'N': _size_t_char
+    "n": _size_t_char.lower(),
+    "N": _size_t_char
 }
 del _size_t_char
 
@@ -59,9 +55,9 @@ del _size_t_char
 def generate_buffer_arg_setter(gen, arg_idx, buf_var):
     from pytools.py_codegen import Indentation
 
-    if _CPY2 or _PYPY:
+    if _PYPY:
         # https://github.com/numpy/numpy/issues/5381
-        gen("if isinstance({buf_var}, np.generic):".format(buf_var=buf_var))
+        gen(f"if isinstance({buf_var}, np.generic):")
         with Indentation(gen):
             if _PYPY:
                 gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var))
@@ -109,9 +105,9 @@ def generate_generic_arg_handling_body(num_args):
         gen("pass")
 
     for i in range(num_args):
-        gen("# process argument {arg_idx}".format(arg_idx=i))
+        gen(f"# process argument {i}")
         gen("")
-        gen("current_arg = {arg_idx}".format(arg_idx=i))
+        gen(f"current_arg = {i}")
         generate_generic_arg_handler(gen, i, "arg%d" % i)
         gen("")
 
@@ -139,9 +135,9 @@ def generate_specific_arg_handling_body(function_name,
         gen("pass")
 
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
-        gen("# process argument {arg_idx}".format(arg_idx=arg_idx))
+        gen(f"# process argument {arg_idx}")
         gen("")
-        gen("current_arg = {arg_idx}".format(arg_idx=arg_idx))
+        gen(f"current_arg = {arg_idx}")
         arg_var = "arg%d" % arg_idx
 
         if arg_dtype is None:
@@ -162,7 +158,7 @@ def generate_specific_arg_handling_body(function_name,
                         "some (but not all) of the target devices mishandle "
                         "struct kernel arguments (hence the workaround is "
                         "disabled".format(
-                            knl_name=function_name, stacklevel=2))
+                            knl_name=function_name), stacklevel=2)
 
             if arg_dtype == np.complex64:
                 arg_char = "f"
@@ -204,16 +200,6 @@ def generate_specific_arg_handling_body(function_name,
 
             fp_arg_count += 2
 
-        elif arg_dtype.char in "IL" and _CPY26:
-            # Prevent SystemError: ../Objects/longobject.c:336: bad
-            # argument to internal function
-
-            gen(
-                    "buf = pack('{arg_char}', long({arg_var}))"
-                    .format(arg_char=arg_dtype.char, arg_var=arg_var))
-            generate_bytes_arg_setter(gen, cl_arg_idx, "buf")
-            cl_arg_idx += 1
-
         else:
             if arg_dtype.kind == "f":
                 fp_arg_count += 1
@@ -322,7 +308,9 @@ def _generate_enqueue_and_set_args_module(function_name,
                 ", ".join(
                     ["self", "queue", "global_size", "local_size"]
                     + arg_names
-                    + ["global_offset=None", "g_times_l=None",
+                    + ["global_offset=None",
+                        "g_times_l=None",
+                        "allow_empty_ndrange=False",
                         "wait_for=None"])))
 
     with Indentation(gen):
@@ -331,7 +319,8 @@ def _generate_enqueue_and_set_args_module(function_name,
 
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                    global_offset, wait_for, g_times_l=g_times_l)
+                    global_offset, wait_for, g_times_l=g_times_l,
+                    allow_empty_ndrange=allow_empty_ndrange)
             """)
 
     # }}}
@@ -352,7 +341,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v6",
+        "pyopencl-invoker-cache-v7",
         key_builder=_NumpyTypesKeyBuilder())
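Two related changes here: the generated enqueue wrapper now accepts and forwards allow_empty_ndrange, and the persistent invoker cache name is bumped from v6 to v7 so wrappers generated under the old signature are not reused. A hedged usage sketch (prg, twice and a_buf are placeholders, assuming a built program containing that kernel):

    # With allow_empty_ndrange=True, a zero-sized global work size is
    # accepted and the enqueue degenerates to a no-op instead of raising.
    n = 0
    evt = prg.twice(queue, (n,), None, a_buf, allow_empty_ndrange=True)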
 
 
diff --git a/pyopencl/ipython_ext.py b/pyopencl/ipython_ext.py
index ce80fc07a8b774996f0154f34fec24d2d2b98e1f..619ac5908b2bc1925ad302146b6b116e638b532e 100644
--- a/pyopencl/ipython_ext.py
+++ b/pyopencl/ipython_ext.py
@@ -1,15 +1,11 @@
-from __future__ import division
-from __future__ import absolute_import
-
 from IPython.core.magic import (magics_class, Magics, cell_magic, line_magic)
 
 import pyopencl as cl
 import sys
-import six
 
 
 def _try_to_utf8(text):
-    if isinstance(text, six.text_type):
+    if isinstance(text, str):
         return text.encode("utf8")
     return text
 
@@ -48,16 +44,16 @@ class PyOpenCLMagics(Magics):
     def cl_kernel(self, line, cell):
         kernel = cell
 
-        opts, args = self.parse_options(line, 'o:')
-        build_options = opts.get('o', '')
+        opts, args = self.parse_options(line, "o:")
+        build_options = opts.get("o", "")
 
         self._run_kernel(kernel, build_options)
 
     def _load_kernel_and_options(self, line):
-        opts, args = self.parse_options(line, 'o:f:')
+        opts, args = self.parse_options(line, "o:f:")
 
-        build_options = opts.get('o')
-        kernel = self.shell.find_user_code(opts.get('f') or args)
+        build_options = opts.get("o")
+        kernel = self.shell.find_user_code(opts.get("f") or args)
 
         return kernel, build_options
 
@@ -72,9 +68,9 @@ class PyOpenCLMagics(Magics):
         header = "%%cl_kernel"
 
         if build_options:
-            header = '%s -o "%s"' % (header, build_options)
+            header = f'{header} -o "{build_options}"'
 
-        content = "%s\n\n%s" % (header, kernel)
+        content = f"{header}\n\n{kernel}"
 
         self.shell.set_next_input(content)
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 7c017419359bdd5b6baacf419a08980890cdadbe..00995450831b101cefca040dfc756a4261e52fd7 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -1,8 +1,5 @@
 """Computation of reductions on vectors."""
 
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import zip
 
 __copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
 
@@ -304,6 +301,12 @@ class ReductionKernel:
         return_event = kwargs.pop("return_event", False)
         out = kwargs.pop("out", None)
 
+        if wait_for is None:
+            wait_for = []
+        else:
+            # We'll be modifying it below.
+            wait_for = list(wait_for)
+
         range_ = kwargs.pop("range", None)
         slice_ = kwargs.pop("slice", None)
 
@@ -327,6 +330,7 @@ class ReductionKernel:
                     invocation_args.append(arg.base_data)
                     if arg_tp.with_offset:
                         invocation_args.append(arg.offset)
+                    wait_for.extend(arg.events)
                 else:
                     invocation_args.append(arg)
 
@@ -410,9 +414,11 @@ class ReductionKernel:
                     (stage_inf.group_size,),
                     *([result.base_data, result.offset]
                         + invocation_args + size_args),
-                    **dict(wait_for=wait_for))
+                    wait_for=wait_for)
             wait_for = [last_evt]
 
+            result.add_event(last_evt)
+
             if group_count == 1:
                 if return_event:
                     return result, last_evt
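Each reduction stage now records its event on the result array via result.add_event(last_evt), complementing the wait_for.extend(arg.events) above: later operations that consume the result pick the event up automatically. A small end-to-end sketch (assuming any working OpenCL device):

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.reduction import ReductionKernel

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    krnl = ReductionKernel(ctx, np.float32, neutral="0",
                           reduce_expr="a+b", map_expr="x[i]*x[i]",
                           arguments="__global const float *x")

    x = cl_array.arange(queue, 1000, dtype=np.float32)
    sum_sq = krnl(x)      # 1-element Array; stage events are attached to it
    print(sum_sq.get())   # implicitly waits on the recorded events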
@@ -528,22 +534,22 @@ def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
     b = "b[%s]" % index_expr
 
     if a_is_complex and (dtype_a != dtype_out):
-        a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
+        a = "{}_cast({})".format(complex_dtype_to_name(dtype_out), a)
     if b_is_complex and (dtype_b != dtype_out):
-        b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
+        b = "{}_cast({})".format(complex_dtype_to_name(dtype_out), b)
 
     if a_is_complex and conjugate_first and a_is_complex:
-        a = "%s_conj(%s)" % (
+        a = "{}_conj({})".format(
                 complex_dtype_to_name(dtype_out), a)
 
     if a_is_complex and not b_is_complex:
-        map_expr = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+        map_expr = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
     elif not a_is_complex and b_is_complex:
-        map_expr = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+        map_expr = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
     elif a_is_complex and b_is_complex:
-        map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+        map_expr = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
     else:
-        map_expr = "%s*%s" % (a, b)
+        map_expr = f"{a}*{b}"
 
     return map_expr, dtype_out, dtype_b
 
@@ -625,10 +631,10 @@ def get_minmax_kernel(ctx, what, dtype):
 
     return ReductionKernel(ctx, dtype,
             neutral=get_minmax_neutral(what, dtype),
-            reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
-            arguments="const %(tp)s *in" % {
-                "tp": dtype_to_ctype(dtype),
-                }, preamble="#define MY_INFINITY (1./0)")
+            reduce_expr=f"{reduce_expr}",
+            arguments="const {tp} *in".format(
+                tp=dtype_to_ctype(dtype),
+                ), preamble="#define MY_INFINITY (1./0)")
 
 
 @context_dependent_memoize
@@ -642,7 +648,7 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
 
     return ReductionKernel(ctx, dtype,
             neutral=get_minmax_neutral(what, dtype),
-            reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
+            reduce_expr=f"{reduce_expr}",
             map_expr="in[lookup_tbl[i]]",
             arguments=(
                 "const %(tp_lut)s *lookup_tbl, "
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 71460f25beaadff9258aa79d3f6d34b58e362546..24b759069d54303c2607a36d8c2013f71a192dee 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -1,6 +1,5 @@
 """Scan primitive."""
 
-from __future__ import division, absolute_import
 
 __copyright__ = """
 Copyright 2011-2012 Andreas Kloeckner
@@ -21,12 +20,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 
 Derived from code within the Thrust project, https://github.com/thrust/thrust/
-
 """
 
-import six
-from six.moves import range, zip
-
 import numpy as np
 
 import pyopencl as cl
@@ -940,7 +935,7 @@ class ScanPerformanceWarning(UserWarning):
     pass
 
 
-class _GenericScanKernelBase(object):
+class _GenericScanKernelBase:
     # {{{ constructor, argument processing
 
     def __init__(self, ctx, dtype,
@@ -1469,6 +1464,11 @@ class GenericScanKernel(_GenericScanKernelBase):
         n = kwargs.get("size")
         wait_for = kwargs.get("wait_for")
 
+        if wait_for is None:
+            wait_for = []
+        else:
+            wait_for = list(wait_for)
+
         if len(args) != len(self.parsed_args):
             raise TypeError("expected %d arguments, got %d" %
                     (len(self.parsed_args), len(args)))
@@ -1491,6 +1491,7 @@ class GenericScanKernel(_GenericScanKernelBase):
                 data_args.append(arg_val.base_data)
                 if arg_descr.with_offset:
                     data_args.append(arg_val.offset)
+                wait_for.extend(arg_val.events)
             else:
                 data_args.append(arg_val)
 
@@ -1541,7 +1542,7 @@ class GenericScanKernel(_GenericScanKernelBase):
 
         l1_evt = l1_info.kernel(
                 queue, (num_intervals,), (l1_info.wg_size,),
-                *scan1_args, **dict(g_times_l=True, wait_for=wait_for))
+                *scan1_args, g_times_l=True, wait_for=wait_for)
 
         # }}}
 
@@ -1561,7 +1562,7 @@ class GenericScanKernel(_GenericScanKernelBase):
 
         l2_evt = l2_info.kernel(
                 queue, (1,), (l1_info.wg_size,),
-                *scan2_args, **dict(g_times_l=True, wait_for=[l1_evt]))
+                *scan2_args, g_times_l=True, wait_for=[l1_evt])
 
         # }}}
 
@@ -1577,7 +1578,7 @@ class GenericScanKernel(_GenericScanKernelBase):
         return self.final_update_info.kernel(
                 queue, (num_intervals,),
                 (self.final_update_info.update_wg_size,),
-                *upd_args, **dict(g_times_l=True, wait_for=[l2_evt]))
+                *upd_args, g_times_l=True, wait_for=[l2_evt])
 
         # }}}
 
@@ -1679,6 +1680,12 @@ class GenericDebugScanKernel(_GenericScanKernelBase):
         n = kwargs.get("size")
         wait_for = kwargs.get("wait_for")
 
+        if wait_for is None:
+            wait_for = []
+        else:
+            # We'll be modifying it below.
+            wait_for = list(wait_for)
+
         if len(args) != len(self.parsed_args):
             raise TypeError("expected %d arguments, got %d" %
                     (len(self.parsed_args), len(args)))
@@ -1701,13 +1708,14 @@ class GenericDebugScanKernel(_GenericScanKernelBase):
                 data_args.append(arg_val.base_data)
                 if arg_descr.with_offset:
                     data_args.append(arg_val.offset)
+                wait_for.extend(arg_val.events)
             else:
                 data_args.append(arg_val)
 
         # }}}
 
         return self.kernel(queue, (1,), (1,),
-                *(data_args + [n]), **dict(wait_for=wait_for))
+                *(data_args + [n]), wait_for=wait_for)
 
 # }}}
 
@@ -1721,7 +1729,7 @@ class _LegacyScanKernelBase(GenericScanKernel):
         scan_ctype = dtype_to_ctype(dtype)
         GenericScanKernel.__init__(self,
                 ctx, dtype,
-                arguments="__global %s *input_ary, __global %s *output_ary" % (
+                arguments="__global {} *input_ary, __global {} *output_ary".format(
                     scan_ctype, scan_ctype),
                 input_expr="input_ary[i]",
                 scan_expr=scan_expr,
@@ -1740,7 +1748,7 @@ class _LegacyScanKernelBase(GenericScanKernel):
         if output_ary is None:
             output_ary = input_ary
 
-        if isinstance(output_ary, (str, six.text_type)) and output_ary == "new":
+        if isinstance(output_ary, str) and output_ary == "new":
             output_ary = cl.array.empty_like(input_ary, allocator=allocator)
 
         if input_ary.shape != output_ary.shape:
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 9dce9216e5df9465557d94dbb35331582479bde4..5f5e7f675d48124a43a43fa82dd6519eed625d5b 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -1,6 +1,5 @@
 """Various helpful bits and pieces without much of a common theme."""
 
-from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
 
@@ -28,12 +27,12 @@ OTHER DEALINGS IN THE SOFTWARE.
 """
 
 
-import six
-from six.moves import zip, intern
+from sys import intern
+
+# Do not add a pyopencl import here: This will add an import cycle.
 
 import numpy as np
 from decorator import decorator
-import pyopencl as cl
 from pytools import memoize, memoize_method
 from pyopencl._cl import bitlog2  # noqa: F401
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
@@ -171,12 +170,39 @@ atexit.register(clear_first_arg_caches)
 # }}}
 
 
+# {{{ pytest fixtures
+
+class _ContextFactory:
+    def __init__(self, device):
+        self.device = device
+
+    def __call__(self):
+        # Get rid of leftovers from past tests.
+        # CL implementations are surprisingly limited in how many
+        # simultaneous contexts they allow...
+        clear_first_arg_caches()
+
+        from gc import collect
+        collect()
+
+        import pyopencl as cl
+        return cl.Context([self.device])
+
+    def __str__(self):
+        # Don't show address, so that parallel test collection works
+        return ("<context factory for <pyopencl.Device '%s' on '%s'>" %
+                (self.device.name.strip(),
+                 self.device.platform.name.strip()))
+
+
 def get_test_platforms_and_devices(plat_dev_string=None):
     """Parse a string of the form 'PYOPENCL_TEST=0:0,1;intel:i5'.
 
     :return: list of tuples (platform, [device, device, ...])
     """
 
+    import pyopencl as cl
+
     if plat_dev_string is None:
         import os
         plat_dev_string = os.environ.get("PYOPENCL_TEST", None)
@@ -191,7 +217,7 @@ def get_test_platforms_and_devices(plat_dev_string=None):
 
         found = False
         for obj in objs:
-            if identifier.lower() in (obj.name + ' ' + obj.vendor).lower():
+            if identifier.lower() in (obj.name + " " + obj.vendor).lower():
                 return obj
         if not found:
             raise RuntimeError("object '%s' not found" % identifier)
@@ -226,35 +252,18 @@ def get_test_platforms_and_devices(plat_dev_string=None):
                 for platform in cl.get_platforms()]
 
 
-def pytest_generate_tests_for_pyopencl(metafunc):
-    class ContextFactory:
-        def __init__(self, device):
-            self.device = device
-
-        def __call__(self):
-            # Get rid of leftovers from past tests.
-            # CL implementations are surprisingly limited in how many
-            # simultaneous contexts they allow...
-
-            clear_first_arg_caches()
-
-            from gc import collect
-            collect()
+def get_pyopencl_fixture_arg_names(metafunc, extra_arg_names=None):
+    if extra_arg_names is None:
+        extra_arg_names = []
 
-            return cl.Context([self.device])
-
-        def __str__(self):
-            # Don't show address, so that parallel test collection works
-            return ("<context factory for <pyopencl.Device '%s' on '%s'>" %
-                    (self.device.name.strip(),
-                     self.device.platform.name.strip()))
-
-    test_plat_and_dev = get_test_platforms_and_devices()
+    supported_arg_names = [
+            "platform", "device",
+            "ctx_factory", "ctx_getter",
+            ] + extra_arg_names
 
     arg_names = []
-
-    for arg in ("platform", "device", "ctx_factory", "ctx_getter"):
-        if arg not in metafunc.funcargnames:
+    for arg in supported_arg_names:
+        if arg not in metafunc.fixturenames:
             continue
 
         if arg == "ctx_getter":
@@ -265,29 +274,52 @@ def pytest_generate_tests_for_pyopencl(metafunc):
 
         arg_names.append(arg)
 
-    arg_values = []
+    return arg_names
 
-    for platform, plat_devs in test_plat_and_dev:
-        if arg_names == ["platform"]:
-            arg_values.append((platform,))
-            continue
 
+def get_pyopencl_fixture_arg_values():
+    import pyopencl as cl
+
+    arg_values = []
+    for platform, devices in get_test_platforms_and_devices():
         arg_dict = {"platform": platform}
 
-        for device in plat_devs:
+        for device in devices:
             arg_dict["device"] = device
-            arg_dict["ctx_factory"] = ContextFactory(device)
-            arg_dict["ctx_getter"] = ContextFactory(device)
+            arg_dict["ctx_factory"] = _ContextFactory(device)
+            arg_dict["ctx_getter"] = _ContextFactory(device)
+            arg_values.append(arg_dict.copy())
+
+    def idfn(val):
+        if isinstance(val, cl.Platform):
+            # Don't show address, so that parallel test collection works
+            return f"<pyopencl.Platform '{val.name}'>"
+        else:
+            return str(val)
+
+    return arg_values, idfn
 
-            arg_values.append(tuple(arg_dict[name] for name in arg_names))
 
-    if arg_names:
-        metafunc.parametrize(arg_names, arg_values, ids=str)
+def pytest_generate_tests_for_pyopencl(metafunc):
+    arg_names = get_pyopencl_fixture_arg_names(metafunc)
+    if not arg_names:
+        return
+
+    arg_values, ids = get_pyopencl_fixture_arg_values()
+    arg_values = [
+            tuple(arg_dict[name] for name in arg_names)
+            for arg_dict in arg_values
+            ]
+
+    metafunc.parametrize(arg_names, arg_values, ids=ids)
+
+# }}}
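The parametrization logic is now split into get_pyopencl_fixture_arg_names and get_pyopencl_fixture_arg_values, with pytest_generate_tests_for_pyopencl as a thin wrapper, so callers can parametrize additional fixture names via extra_arg_names. Typical usage in a test module is unchanged (this mirrors the convention in PyOpenCL's own test suite):

    # test_something.py
    import pyopencl as cl
    from pyopencl.tools import (  # noqa: F401
            pytest_generate_tests_for_pyopencl as pytest_generate_tests)


    def test_roundtrip(ctx_factory):
        ctx = ctx_factory()
        queue = cl.CommandQueue(ctx)
        # ... exercise the context/queue here ...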
 
 
 # {{{ C argument lists
 
-class Argument(object):
+class Argument:
     pass
 
 
@@ -297,7 +329,7 @@ class DtypedArgument(Argument):
         self.name = name
 
     def __repr__(self):
-        return "%s(%r, %s)" % (
+        return "{}({!r}, {})".format(
                 self.__class__.__name__,
                 self.name,
                 self.dtype)
@@ -311,17 +343,17 @@ class VectorArg(DtypedArgument):
     def declarator(self):
         if self.with_offset:
             # Two underscores -> less likelihood of a name clash.
-            return "__global %s *%s__base, long %s__offset" % (
+            return "__global {} *{}__base, long {}__offset".format(
                     dtype_to_ctype(self.dtype), self.name, self.name)
         else:
-            result = "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name)
+            result = "__global {} *{}".format(dtype_to_ctype(self.dtype), self.name)
 
         return result
 
 
 class ScalarArg(DtypedArgument):
     def declarator(self):
-        return "%s %s" % (dtype_to_ctype(self.dtype), self.name)
+        return "{} {}".format(dtype_to_ctype(self.dtype), self.name)
 
 
 class OtherArg(Argument):
@@ -404,6 +436,8 @@ def get_arg_offset_adjuster_code(arg_types):
 
 
 def get_gl_sharing_context_properties():
+    import pyopencl as cl
+
     ctx_props = cl.context_properties
 
     from OpenGL import platform as gl_platform
@@ -463,7 +497,11 @@ class _CDeclList:
         if dtype in pyopencl.cltypes.vec_type_to_scalar_and_count:
             return
 
-        for name, field_data in sorted(six.iteritems(dtype.fields)):
+        if hasattr(dtype, "subdtype") and dtype.subdtype is not None:
+            self.add_dtype(dtype.subdtype[0])
+            return
+
+        for name, field_data in sorted(dtype.fields.items()):
             field_dtype, offset = field_data[:2]
             self.add_dtype(field_dtype)
 
@@ -541,15 +579,33 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
     function, not the original one.
     """
 
-    fields = sorted(six.iteritems(dtype.fields),
+    import pyopencl as cl
+
+    fields = sorted(dtype.fields.items(),
             key=lambda name_dtype_offset: name_dtype_offset[1][1])
 
     c_fields = []
     for field_name, dtype_and_offset in fields:
         field_dtype, offset = dtype_and_offset[:2]
-        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))
+        if hasattr(field_dtype, "subdtype") and field_dtype.subdtype is not None:
+            array_dtype = field_dtype.subdtype[0]
+            if hasattr(array_dtype, "subdtype") and array_dtype.subdtype is not None:
+                raise NotImplementedError("nested array dtypes are not supported")
+            array_dims = field_dtype.subdtype[1]
+            dims_str = ""
+            try:
+                for dim in array_dims:
+                    dims_str += "[%d]" % dim
+            except TypeError:
+                dims_str = "[%d]" % array_dims
+            c_fields.append("  {} {}{};".format(
+                dtype_to_ctype(array_dtype), field_name, dims_str)
+            )
+        else:
+            c_fields.append(
+                    "  {} {};".format(dtype_to_ctype(field_dtype), field_name))
 
-    c_decl = "typedef struct {\n%s\n} %s;\n\n" % (
+    c_decl = "typedef struct {{\n{}\n}} {};\n\n".format(
             "\n".join(c_fields),
             name)
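match_dtype_to_c_struct now maps numpy sub-array fields onto fixed-size C array members (nested sub-arrays still raise NotImplementedError). A hedged example of the mapping (the C declaration is paraphrased, assuming a hypothetical struct name "particle"):

    import numpy as np

    # A struct dtype with a 3-element float field ...
    dtype = np.dtype([("id", np.int32), ("pos", np.float32, (3,))])

    # ... passed through match_dtype_to_c_struct(device, "particle", dtype)
    # should yield roughly this declaration:
    #
    #   typedef struct {
    #     int id;
    #     float pos[3];
    #   } particle;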
 
@@ -626,12 +682,12 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
 
     try:
         dtype_arg_dict = {
-            'names': [field_name
+            "names": [field_name
                       for field_name, (field_dtype, offset) in fields],
-            'formats': [field_dtype
+            "formats": [field_dtype
                         for field_name, (field_dtype, offset) in fields],
-            'offsets': [int(x) for x in offsets],
-            'itemsize': int(size_and_offsets[0]),
+            "offsets": [int(x) for x in offsets],
+            "itemsize": int(size_and_offsets[0]),
             }
         dtype = np.dtype(dtype_arg_dict)
         if dtype.itemsize != size_and_offsets[0]:
@@ -647,8 +703,8 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
             for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                 if offset > total_size:
                     padding_count += 1
-                    yield ('__pycl_padding%d' % padding_count,
-                           'V%d' % offset - total_size)
+                    yield ("__pycl_padding%d" % padding_count,
+                           "V%d" % offset - total_size)
                 yield field_name, field_dtype
                 total_size = field_dtype.itemsize + offset
         dtype = np.dtype(list(calc_field_type()))
@@ -674,7 +730,7 @@ def dtype_to_c_struct(device, dtype):
     def dtypes_match():
         result = len(dtype.fields) == len(matched_dtype.fields)
 
-        for name, val in six.iteritems(dtype.fields):
+        for name, val in dtype.fields.items():
             result = result and matched_dtype.fields[name] == val
 
         return result
@@ -745,7 +801,7 @@ class _ScalarArgPlaceholder(_ArgumentPlaceholder):
     target_class = ScalarArg
 
 
-class _TemplateRenderer(object):
+class _TemplateRenderer:
     def __init__(self, template, type_aliases, var_values, context=None,
             options=[]):
         self.template = template
@@ -769,6 +825,7 @@ class _TemplateRenderer(object):
         return str(result)
 
     def get_rendered_kernel(self, txt, kernel_name):
+        import pyopencl as cl
         prg = cl.Program(self.context, self(txt)).build(self.options)
 
         kernel_name_prefix = self.var_dict.get("kernel_name_prefix")
@@ -851,18 +908,18 @@ class _TemplateRenderer(object):
         if arguments is not None:
             cdl.visit_arguments(arguments)
 
-        for _, tv in sorted(six.iteritems(self.type_aliases)):
+        for _, tv in sorted(self.type_aliases.items()):
             cdl.add_dtype(tv)
 
         type_alias_decls = [
-                "typedef %s %s;" % (dtype_to_ctype(val), name)
-                for name, val in sorted(six.iteritems(self.type_aliases))
+                "typedef {} {};".format(dtype_to_ctype(val), name)
+                for name, val in sorted(self.type_aliases.items())
                 ]
 
         return cdl.get_declarations() + "\n" + "\n".join(type_alias_decls)
 
 
-class KernelTemplateBase(object):
+class KernelTemplateBase:
     def __init__(self, template_processor=None):
         self.template_processor = template_processor
 
@@ -905,7 +962,7 @@ class KernelTemplateBase(object):
     def build(self, context, *args, **kwargs):
         """Provide caching for an :meth:`build_inner`."""
 
-        cache_key = (context, args, tuple(sorted(six.iteritems(kwargs))))
+        cache_key = (context, args, tuple(sorted(kwargs.items())))
         try:
             return self.build_cache[cache_key]
         except KeyError:
@@ -960,7 +1017,7 @@ def array_module(a):
 def is_spirv(s):
     spirv_magic = b"\x07\x23\x02\x03"
     return (
-            isinstance(s, six.binary_type)
+            isinstance(s, bytes)
             and (
                 s[:4] == spirv_magic
                 or s[:4] == spirv_magic[::-1]))
diff --git a/pyopencl/version.py b/pyopencl/version.py
index 6467468d7fb2ee8f1a4ba23c4a11de9f177cbaee..0668cdc6768acd4654d85574dc4fd69788891456 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2018, 2, 2)
+VERSION = (2020, 2, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/setup.cfg b/setup.cfg
index 2bc760d67cfc68d91478948399e51cf470abfe07..ad68ea236494ef28401610d829b2463349b45b36 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,3 +2,9 @@
 ignore = E126,E127,E128,E123,E226,E241,E242,E265,W503,E402
 max-line-length=85
 exclude=pyopencl/compyte/ndarray,pyopencl/compyte/array.py
+
+
+inline-quotes = "
+docstring-quotes = """
+multiline-quotes = """
+
diff --git a/setup.py b/setup.py
index f01702113c5d050d776829e1c2f5e0b1221b08a5..2bb4252aaa42a3161bfd65c56ea0ad5dd930d4da 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
-from __future__ import absolute_import, print_function
 
 __copyright__ = """
 Copyright (C) 2009-15 Andreas Kloeckner
@@ -44,10 +42,10 @@ def get_config_schema():
             "-fvisibility=hidden"
             ]
 
-    if 'darwin' in sys.platform:
+    if "darwin" in sys.platform:
         import platform
         osx_ver, _, _ = platform.mac_ver()
-        osx_ver = '.'.join(osx_ver.split('.')[:2])
+        osx_ver = ".".join(osx_ver.split(".")[:2])
 
         sysroot_paths = [
                 "/Applications/Xcode.app/Contents/Developer/Platforms/"
@@ -57,14 +55,14 @@ def get_config_schema():
 
         default_libs = []
         default_cxxflags = default_cxxflags + [
-                '-stdlib=libc++', '-mmacosx-version-min=10.7',
-                '-arch', 'i386', '-arch', 'x86_64'
+                "-stdlib=libc++", "-mmacosx-version-min=10.7",
+                "-arch', 'i386", "-arch", "x86_64"
                 ]
 
         from os.path import isdir
         for srp in sysroot_paths:
             if isdir(srp):
-                default_cxxflags.extend(['-isysroot', srp])
+                default_cxxflags.extend(["-isysroot", srp])
                 break
 
         default_ldflags = default_cxxflags[:] + ["-Wl,-framework,OpenCL"]
@@ -151,7 +149,7 @@ def main():
     finally:
         version_file.close()
 
-    exec(compile(version_file_contents, "pyopencl/version.py", 'exec'), ver_dic)
+    exec(compile(version_file_contents, "pyopencl/version.py", "exec"), ver_dic)
 
     try:
         import mako  # noqa
@@ -194,29 +192,25 @@ def main():
             # metadata
             version=ver_dic["VERSION_TEXT"],
             description="Python wrapper for OpenCL",
-            long_description=open("README.rst", "rt").read(),
+            long_description=open("README.rst").read(),
             author="Andreas Kloeckner",
             author_email="inform@tiker.net",
             license="MIT",
             url="http://mathema.tician.de/software/pyopencl",
             classifiers=[
-                'Environment :: Console',
-                'Development Status :: 5 - Production/Stable',
-                'Intended Audience :: Developers',
-                'Intended Audience :: Other Audience',
-                'Intended Audience :: Science/Research',
-                'License :: OSI Approved :: MIT License',
-                'Natural Language :: English',
-                'Programming Language :: C++',
-                'Programming Language :: Python',
-                'Programming Language :: Python :: 2',
-                'Programming Language :: Python :: 2.7',
-                'Programming Language :: Python :: 3',
-                'Programming Language :: Python :: 3.2',
-                'Programming Language :: Python :: 3.3',
-                'Topic :: Scientific/Engineering',
-                'Topic :: Scientific/Engineering :: Mathematics',
-                'Topic :: Scientific/Engineering :: Physics',
+                "Environment :: Console",
+                "Development Status :: 5 - Production/Stable",
+                "Intended Audience :: Developers",
+                "Intended Audience :: Other Audience",
+                "Intended Audience :: Science/Research",
+                "License :: OSI Approved :: MIT License",
+                "Natural Language :: English",
+                "Programming Language :: C++",
+                "Programming Language :: Python",
+                "Programming Language :: Python :: 3",
+                "Topic :: Scientific/Engineering",
+                "Topic :: Scientific/Engineering :: Mathematics",
+                "Topic :: Scientific/Engineering :: Physics",
                 ],
 
             # build info
@@ -234,31 +228,33 @@ def main():
                         ],
                     include_dirs=INCLUDE_DIRS + [
                         get_pybind_include(),
-                        get_pybind_include(user=True)
                         ],
                     library_dirs=conf["CL_LIB_DIR"],
                     libraries=conf["CL_LIBNAME"],
                     define_macros=list(conf["EXTRA_DEFINES"].items()),
                     extra_compile_args=conf["CXXFLAGS"],
                     extra_link_args=conf["LDFLAGS"],
-                    language='c++',
+                    language="c++",
                     ),
                 ],
 
             setup_requires=[
-                "pybind11",
+                "pybind11>=2.5.0",
                 "numpy",
                 ],
 
+            python_requires="~=3.6",
             install_requires=[
                 "numpy",
                 "pytools>=2017.6",
                 "decorator>=3.2.0",
                 "appdirs>=1.4.0",
-                "six>=1.9.0",
                 # "Mako>=0.3.6",
                 ],
-
+            extras_require={
+                "pocl":  ["pocl_binary_distribution>=1.2"],
+                "oclgrind":  ["oclgrind_binary_distribution>=18.3"],
+            },
             include_package_data=True,
             package_data={
                     "pyopencl": [
@@ -269,11 +265,11 @@ def main():
                         ]
                     },
 
-            cmdclass={'build_ext': PybindBuildExtCommand},
+            cmdclass={"build_ext": PybindBuildExtCommand},
             zip_safe=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
 
 # vim: foldmethod=marker
diff --git a/src/clinfo_ext.h b/src/clinfo_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..43b7b6082fda28ad433f26c5d9a5e2e743e24940
--- /dev/null
+++ b/src/clinfo_ext.h
@@ -0,0 +1,129 @@
+/* Include the OpenCL header and define OpenCL extensions, since what is and
+ * is not available in the official headers is very system-dependent. */
+
+#ifndef _EXT_H
+#define _EXT_H
+
+#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+/* These two defines were introduced in the 1.2 headers
+ * on 2012-11-30, so earlier versions don't have them
+ * (e.g. Debian wheezy)
+ */
+
+#ifndef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT          0x104B
+#endif
+
+/*
+ * Extensions
+ */
+
+/* cl_khr_icd */
+#define CL_PLATFORM_ICD_SUFFIX_KHR			0x0920
+#define CL_PLATFORM_NOT_FOUND_KHR			-1001
+
+
+/* cl_khr_fp64 */
+#define CL_DEVICE_DOUBLE_FP_CONFIG			0x1032
+
+/* cl_khr_fp16 */
+#define CL_DEVICE_HALF_FP_CONFIG			0x1033
+
+/* cl_khr_terminate_context */
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR		0x200F
+
+/* cl_nv_device_attribute_query */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV		0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV		0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV		0x4002
+#define CL_DEVICE_WARP_SIZE_NV				0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV			0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV		0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV			0x4006
+#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV	0x4007
+#define CL_DEVICE_PCI_BUS_ID_NV				0x4008
+#define CL_DEVICE_PCI_SLOT_ID_NV			0x4009
+
+/* cl_ext_atomic_counters_{32,64} */
+#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT		0x4032
+
+/* cl_amd_device_attribute_query */
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD		0x4036
+#define CL_DEVICE_TOPOLOGY_AMD				0x4037
+#define CL_DEVICE_BOARD_NAME_AMD			0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD		0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD		0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD			0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD		0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD			0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD		0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD		0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD	0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD	0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD			0x4048
+#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD		0x4049
+#define CL_DEVICE_GFXIP_MAJOR_AMD			0x404A
+#define CL_DEVICE_GFXIP_MINOR_AMD			0x404B
+#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD		0x404C
+
+#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
+#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD		1
+
+typedef union
+{
+	struct { cl_uint type; cl_uint data[5]; } raw;
+	struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
+} cl_device_topology_amd;
+#endif
+
+/* cl_amd_offline_devices */
+#define CL_CONTEXT_OFFLINE_DEVICES_AMD			0x403F
+
+/* cl_ext_device_fission */
+#define cl_ext_device_fission				1
+
+typedef cl_ulong  cl_device_partition_property_ext;
+
+#define CL_DEVICE_PARTITION_EQUALLY_EXT			0x4050
+#define CL_DEVICE_PARTITION_BY_COUNTS_EXT		0x4051
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT		0x4052
+#define CL_DEVICE_PARTITION_BY_NAMES_INTEL		0x4052 /* cl_intel_device_partition_by_names */
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT	0x4053
+
+#define CL_DEVICE_PARENT_DEVICE_EXT			0x4054
+#define CL_DEVICE_PARTITION_TYPES_EXT			0x4055
+#define CL_DEVICE_AFFINITY_DOMAINS_EXT			0x4056
+#define CL_DEVICE_REFERENCE_COUNT_EXT			0x4057
+#define CL_DEVICE_PARTITION_STYLE_EXT			0x4058
+
+#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT			0x1
+#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT			0x2
+#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT			0x3
+#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT			0x4
+#define CL_AFFINITY_DOMAIN_NUMA_EXT			0x10
+#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT		0x100
+
+/* cl_intel_advanced_motion_estimation */
+#define CL_DEVICE_ME_VERSION_INTEL			0x407E
+
+/* cl_qcom_ext_host_ptr */
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM		0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM			0x40A1
+
+/* cl_khr_spir */
+#define CL_DEVICE_SPIR_VERSIONS				0x40E0
+
+/* cl_altera_device_temperature */
+#define CL_DEVICE_CORE_TEMPERATURE_ALTERA		0x40F3
+
+/* cl_intel_simultaneous_sharing */
+#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL		0x4104
+#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL	0x4105
+
+#endif
diff --git a/src/mempool.hpp b/src/mempool.hpp
index 3469af8c1d5599d7b79e9cbd1f057df912b7d1f3..3491c69db8f6372aa0132f046d7714494081d41e 100644
--- a/src/mempool.hpp
+++ b/src/mempool.hpp
@@ -91,11 +91,13 @@ namespace PYGPU_PACKAGE
       bool m_stop_holding;
       int m_trace;
 
+      unsigned m_leading_bits_in_bin_id;
+
     public:
-      memory_pool(Allocator const &alloc=Allocator())
+      memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4)
         : m_allocator(alloc.copy()),
         m_held_blocks(0), m_active_blocks(0), m_stop_holding(false),
-        m_trace(false)
+        m_trace(false), m_leading_bits_in_bin_id(leading_bits_in_bin_id)
       {
         if (m_allocator->is_deferred())
         {
@@ -109,17 +111,21 @@ namespace PYGPU_PACKAGE
       virtual ~memory_pool()
       { free_held(); }
 
-      static const unsigned mantissa_bits = 2;
-      static const unsigned mantissa_mask = (1 << mantissa_bits) - 1;
+    private:
+      unsigned mantissa_mask() const
+      {
+        return (1 << m_leading_bits_in_bin_id) - 1;
+      }
 
-      static bin_nr_t bin_number(size_type size)
+    public:
+      bin_nr_t bin_number(size_type size)
       {
         signed l = bitlog2(size);
-        size_type shifted = signed_right_shift(size, l-signed(mantissa_bits));
-        if (size && (shifted & (1 << mantissa_bits)) == 0)
+        size_type shifted = signed_right_shift(size, l-signed(m_leading_bits_in_bin_id));
+        if (size && (shifted & (1 << m_leading_bits_in_bin_id)) == 0)
           throw std::runtime_error("memory_pool::bin_number: bitlog2 fault");
-        size_type chopped = shifted & mantissa_mask;
-        return l << mantissa_bits | chopped;
+        size_type chopped = shifted & mantissa_mask();
+        return l << m_leading_bits_in_bin_id | chopped;
       }
 
       void set_trace(bool flag)
@@ -130,19 +136,19 @@ namespace PYGPU_PACKAGE
           --m_trace;
       }
 
-      static size_type alloc_size(bin_nr_t bin)
+      size_type alloc_size(bin_nr_t bin)
       {
-        bin_nr_t exponent = bin >> mantissa_bits;
-        bin_nr_t mantissa = bin & mantissa_mask;
+        bin_nr_t exponent = bin >> m_leading_bits_in_bin_id;
+        bin_nr_t mantissa = bin & mantissa_mask();
 
         size_type ones = signed_left_shift(1,
-            signed(exponent)-signed(mantissa_bits)
+            signed(exponent)-signed(m_leading_bits_in_bin_id)
             );
         if (ones) ones -= 1;
 
         size_type head = signed_left_shift(
-           (1<<mantissa_bits) | mantissa,
-            signed(exponent)-signed(mantissa_bits));
+           (1<<m_leading_bits_in_bin_id) | mantissa,
+            signed(exponent)-signed(m_leading_bits_in_bin_id));
         if (ones & head)
           throw std::runtime_error("memory_pool::alloc_size: bit-counting fault");
         return head | ones;
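
For context on the memory-pool change above: a bin id packs floor(log2(size))
into its high bits and the next leading_bits_in_bin_id bits of the size into
its low bits, so the new run-time parameter (default 4, previously the
hard-coded mantissa_bits = 2) controls how finely each power-of-two range is
subdivided and therefore how much an allocation gets rounded up. A minimal
Python sketch of that arithmetic, with _sshift standing in for the signed-shift
helpers used in the C++ (illustration only, not part of the change):

    def _sshift(x, s):                        # signed left shift
        return x << s if s >= 0 else x >> -s

    def bin_number(size, leading_bits=4):     # assumes size > 0
        log = size.bit_length() - 1           # bitlog2(size)
        shifted = _sshift(size, leading_bits - log)
        return log << leading_bits | (shifted & ((1 << leading_bits) - 1))

    def alloc_size(bin_nr, leading_bits=4):
        exponent = bin_nr >> leading_bits
        mantissa = bin_nr & ((1 << leading_bits) - 1)
        ones = _sshift(1, exponent - leading_bits)
        if ones:
            ones -= 1
        head = _sshift((1 << leading_bits) | mantissa, exponent - leading_bits)
        return head | ones

    assert alloc_size(bin_number(1000)) == 1023   # rounded up to the bin's top
    assert bin_number(1023) == bin_number(1000)
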
diff --git a/src/numpy_init.hpp b/src/numpy_init.hpp
deleted file mode 100644
index 2cf7fe0760ea8e37618960eae240c0ff5b329bd6..0000000000000000000000000000000000000000
--- a/src/numpy_init.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// Numpy import/init
-//
-// Copyright (C) 2009 Andreas Kloeckner
-//
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-//
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP
-#define _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP
-
-
-// #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/arrayobject.h>
-#include <stdexcept>
-
-
-namespace
-{
-  static struct pyublas_array_importer
-  {
-    static bool do_import_array()
-    {
-#ifdef PYPY_VERSION
-      import_array();
-#else
-      import_array1(false);
-#endif
-      return true;
-    }
-
-    pyublas_array_importer()
-    {
-      if (!do_import_array())
-        throw std::runtime_error("numpy failed to initialize");
-    }
-  } _array_importer;
-}
-
-
-
-
-#endif
diff --git a/src/tools.hpp b/src/tools.hpp
index 7fd906ef3263376304d8c05dda28156976a67976..f64f443fd98bca9b0ddd0c66cedaa01c276521df 100644
--- a/src/tools.hpp
+++ b/src/tools.hpp
@@ -31,8 +31,7 @@
 #include <pybind11/pybind11.h>
 
 #include <numeric>
-#include "numpy_init.hpp"
-
+#include <numpy/arrayobject.h>
 
 
 
diff --git a/src/wrap_cl.cpp b/src/wrap_cl.cpp
index 2ea3d5913524a2000d03f719165a6208e5d1381a..50a482016ebda749ec4031f9d7cb92ab200dfa12 100644
--- a/src/wrap_cl.cpp
+++ b/src/wrap_cl.cpp
@@ -24,6 +24,8 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 
+#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
+
 #include "wrap_cl.hpp"
 
 
@@ -39,8 +41,17 @@ extern void pyopencl_expose_part_1(py::module &m);
 extern void pyopencl_expose_part_2(py::module &m);
 extern void pyopencl_expose_mempool(py::module &m);
 
+static bool import_numpy_helper()
+{
+  import_array1(false);
+  return true;
+}
+
 PYBIND11_MODULE(_cl, m)
 {
+  if (!import_numpy_helper())
+    throw py::error_already_set();
+
   pyopencl_expose_constants(m);
   pyopencl_expose_part_1(m);
   pyopencl_expose_part_2(m);
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 145c0b9c264e024c93298cccf34a39fa281f331d..1c7482d78b1bc68b3dedc2add7367bf277e5111b 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -84,7 +84,7 @@
 #include <utility>
 #include <numeric>
 #include "wrap_helpers.hpp"
-#include "numpy_init.hpp"
+#include <numpy/arrayobject.h>
 #include "tools.hpp"
 
 #ifdef PYOPENCL_PRETEND_CL_VERSION
@@ -109,7 +109,6 @@
 
 
 #if (PY_VERSION_HEX >= 0x03000000) or defined(PYPY_VERSION)
-#define PYOPENCL_USE_NEW_BUFFER_INTERFACE
 #define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) std::move(s)
 #else
 #define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) (s)
@@ -487,8 +486,8 @@ namespace pyopencl
 
 
   // {{{ buffer interface helper
-  //
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+
+
   class py_buffer_wrapper : public noncopyable
   {
     private:
@@ -529,7 +528,7 @@ namespace pyopencl
         PyBuffer_Release(&m_buf);
     }
   };
-#endif
+
 
   // }}}
 
@@ -1560,6 +1559,14 @@ namespace pyopencl
         PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (1, &m_event));
       }
 
+      // Called from a destructor context below:
+      // - Should not release the GIL
+      // - Should fail gracefully in the face of errors
+      virtual void wait_during_cleanup_without_releasing_the_gil()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clWaitForEvents, (1, &m_event));
+      }
+
 #if PYOPENCL_CL_VERSION >= 0x1010
     // {{{ set_callback, by way of a thread-based construction
 
@@ -1672,7 +1679,6 @@ namespace pyopencl
 #endif
   };
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
   class nanny_event : public event
   {
     // In addition to everything an event does, the nanny event holds a reference
@@ -1688,7 +1694,11 @@ namespace pyopencl
       { }
 
       ~nanny_event()
-      { wait(); }
+      {
+        // It appears that Pybind can get very confused if we release the GIL here:
+        // https://github.com/inducer/pyopencl/issues/296
+        wait_during_cleanup_without_releasing_the_gil();
+      }
 
       py::object get_ward() const
       {
@@ -1705,39 +1715,13 @@ namespace pyopencl
         event::wait();
         m_ward.reset();
       }
-  };
-#else
-  class nanny_event : public event
-  {
-    // In addition to everything an event does, the nanny event holds a reference
-    // to a Python object and waits for its own completion upon destruction.
-
-    protected:
-      py::object        m_ward;
-
-    public:
-
-      nanny_event(cl_event evt, bool retain, py::object ward)
-        : event(evt, retain), m_ward(ward)
-      { }
-
-      nanny_event(nanny_event const &src)
-        : event(src), m_ward(src.m_ward)
-      { }
 
-      ~nanny_event()
-      { wait(); }
-
-      py::object get_ward() const
-      { return m_ward; }
-
-      virtual void wait()
+      virtual void wait_during_cleanup_without_releasing_the_gil()
       {
-        event::wait();
-        m_ward = py::none();
+        event::wait_during_cleanup_without_releasing_the_gil();
+        m_ward.reset();
       }
   };
-#endif
 
 
 
@@ -1894,11 +1878,7 @@ namespace pyopencl
   class memory_object : noncopyable, public memory_object_holder
   {
     public:
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
       typedef std::unique_ptr<py_buffer_wrapper> hostbuf_t;
-#else
-      typedef py::object hostbuf_t;
-#endif
 
     private:
       bool m_valid;
@@ -1945,14 +1925,10 @@ namespace pyopencl
 
       py::object hostbuf()
       {
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
         if (m_hostbuf.get())
           return py::reinterpret_borrow<py::object>(m_hostbuf->m_buf.obj);
         else
           return py::none();
-#else
-        return m_hostbuf;
-#endif
       }
 
       const cl_mem data() const
@@ -2136,7 +2112,6 @@ namespace pyopencl
 
     void *buf = 0;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
     if (py_hostbuf.ptr() != Py_None)
     {
@@ -2158,42 +2133,11 @@ namespace pyopencl
       if (size == 0)
         size = retained_buf_obj->m_buf.len;
     }
-#else
-    py::object retained_buf_obj;
-    if (py_hostbuf.ptr() != Py_None)
-    {
-      PYOPENCL_BUFFER_SIZE_T len;
-      if ((flags & CL_MEM_USE_HOST_PTR)
-          && ((flags & CL_MEM_READ_WRITE)
-            || (flags & CL_MEM_WRITE_ONLY)))
-      {
-        if (PyObject_AsWriteBuffer(py_hostbuf.ptr(), &buf, &len))
-          throw py::error_already_set();
-      }
-      else
-      {
-        if (PyObject_AsReadBuffer(
-              py_hostbuf.ptr(), const_cast<const void **>(&buf), &len))
-          throw py::error_already_set();
-      }
-
-      if (flags & CL_MEM_USE_HOST_PTR)
-        retained_buf_obj = py_hostbuf;
-
-      if (size > size_t(len))
-        throw pyopencl::error("Buffer", CL_INVALID_VALUE,
-            "specified size is greater than host buffer size");
-      if (size == 0)
-        size = len;
-    }
-#endif
 
     cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf);
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     if (!(flags & CL_MEM_USE_HOST_PTR))
       retained_buf_obj.reset();
-#endif
 
     try
     {
@@ -2226,18 +2170,12 @@ namespace pyopencl
     void *buf;
     PYOPENCL_BUFFER_SIZE_T len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
 
     buf = ward->m_buf.buf;
     len = ward->m_buf.len;
-#else
-    py::object ward = buffer;
-    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -2269,18 +2207,12 @@ namespace pyopencl
     const void *buf;
     PYOPENCL_BUFFER_SIZE_T len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     buf = ward->m_buf.buf;
     len = ward->m_buf.len;
-#else
-    py::object ward = buffer;
-    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -2363,19 +2295,11 @@ namespace pyopencl
 
     void *buf;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
 
     buf = ward->m_buf.buf;
-#else
-    py::object ward = buffer;
-
-    PYOPENCL_BUFFER_SIZE_T len;
-    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -2404,8 +2328,8 @@ namespace pyopencl
       py::object py_buffer_origin,
       py::object py_host_origin,
       py::object py_region,
-      py::sequence py_buffer_pitches,
-      py::sequence py_host_pitches,
+      py::object py_buffer_pitches,
+      py::object py_host_pitches,
       py::object py_wait_for,
       bool is_blocking
       )
@@ -2419,18 +2343,11 @@ namespace pyopencl
 
     const void *buf;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     buf = ward->m_buf.buf;
-#else
-    py::object ward = buffer;
-    PYOPENCL_BUFFER_SIZE_T len;
-    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -2459,8 +2376,8 @@ namespace pyopencl
       py::object py_src_origin,
       py::object py_dst_origin,
       py::object py_region,
-      py::sequence py_src_pitches,
-      py::sequence py_dst_pitches,
+      py::object py_src_pitches,
+      py::object py_dst_pitches,
       py::object py_wait_for)
   {
     PYOPENCL_PARSE_WAIT_FOR;
@@ -2508,17 +2425,12 @@ namespace pyopencl
     const void *pattern_buf;
     PYOPENCL_BUFFER_SIZE_T pattern_len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(pattern.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     pattern_buf = ward->m_buf.buf;
     pattern_len = ward->m_buf.len;
-#else
-    if (PyObject_AsReadBuffer(pattern.ptr(), &pattern_buf, &pattern_len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -2697,7 +2609,6 @@ namespace pyopencl
     void *buf = 0;
     PYOPENCL_BUFFER_SIZE_T len = 0;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
     if (buffer.ptr() != Py_None)
     {
@@ -2714,28 +2625,6 @@ namespace pyopencl
       buf = retained_buf_obj->m_buf.buf;
       len = retained_buf_obj->m_buf.len;
     }
-#else
-    py::object retained_buf_obj;
-    if (buffer.ptr() != Py_None)
-    {
-      if ((flags & CL_MEM_USE_HOST_PTR)
-          && ((flags & CL_MEM_READ_WRITE)
-            || (flags & CL_MEM_WRITE_ONLY)))
-      {
-        if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
-          throw py::error_already_set();
-      }
-      else
-      {
-        if (PyObject_AsReadBuffer(
-              buffer.ptr(), const_cast<const void **>(&buf), &len))
-          throw py::error_already_set();
-      }
-
-      if (flags & CL_MEM_USE_HOST_PTR)
-        retained_buf_obj = buffer;
-    }
-#endif
 
     unsigned dims = py::len(shape);
     cl_int status_code;
@@ -2810,10 +2699,8 @@ namespace pyopencl
       throw pyopencl::error("Image", CL_INVALID_VALUE,
           "invalid dimension");
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     if (!(flags & CL_MEM_USE_HOST_PTR))
       retained_buf_obj.reset();
-#endif
 
     try
     {
@@ -2843,7 +2730,6 @@ namespace pyopencl
 
     void *buf = 0;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
     if (buffer.ptr() != Py_None)
     {
@@ -2859,29 +2745,6 @@ namespace pyopencl
 
       buf = retained_buf_obj->m_buf.buf;
     }
-#else
-    py::object retained_buf_obj;
-    PYOPENCL_BUFFER_SIZE_T len;
-    if (buffer.ptr() != Py_None)
-    {
-      if ((flags & CL_MEM_USE_HOST_PTR)
-          && ((flags & CL_MEM_READ_WRITE)
-            || (flags & CL_MEM_WRITE_ONLY)))
-      {
-        if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
-          throw py::error_already_set();
-      }
-      else
-      {
-        if (PyObject_AsReadBuffer(
-              buffer.ptr(), const_cast<const void **>(&buf), &len))
-          throw py::error_already_set();
-      }
-
-      if (flags & CL_MEM_USE_HOST_PTR)
-        retained_buf_obj = buffer;
-    }
-#endif
 
     PYOPENCL_PRINT_CALL_TRACE("clCreateImage");
     cl_int status_code;
@@ -2889,10 +2752,8 @@ namespace pyopencl
     if (status_code != CL_SUCCESS)
       throw pyopencl::error("clCreateImage", status_code);
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     if (!(flags & CL_MEM_USE_HOST_PTR))
       retained_buf_obj.reset();
-#endif
 
     try
     {
@@ -2927,18 +2788,11 @@ namespace pyopencl
 
     void *buf;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
 
     buf = ward->m_buf.buf;
-#else
-    py::object ward = buffer;
-    PYOPENCL_BUFFER_SIZE_T len;
-    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
 
@@ -2973,18 +2827,11 @@ namespace pyopencl
 
     const void *buf;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     buf = ward->m_buf.buf;
-#else
-    py::object ward = buffer;
-    PYOPENCL_BUFFER_SIZE_T len;
-    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -3106,17 +2953,11 @@ namespace pyopencl
 
     const void *color_buf;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
 
     ward->get(color.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     color_buf = ward->m_buf.buf;
-#else
-    PYOPENCL_BUFFER_SIZE_T color_len;
-    if (PyObject_AsReadBuffer(color.ptr(), &color_buf, &color_len))
-      throw py::error_already_set();
-#endif
 
     cl_event evt;
     PYOPENCL_RETRY_IF_MEM_ERROR(
@@ -3342,14 +3183,11 @@ namespace pyopencl
     private:
       void *m_ptr;
       PYOPENCL_BUFFER_SIZE_T m_size;
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
-        std::unique_ptr<py_buffer_wrapper> ward;
-#endif
+      std::unique_ptr<py_buffer_wrapper> ward;
 
     public:
       svm_arg_wrapper(py::object holder)
       {
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
         ward = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);
 #ifdef PYPY_VERSION
         // FIXME: get a read-only buffer
@@ -3361,11 +3199,6 @@ namespace pyopencl
 #endif
         m_ptr = ward->m_buf.buf;
         m_size = ward->m_buf.len;
-#else
-        py::object ward = holder;
-        if (PyObject_AsWriteBuffer(holder.ptr(), &m_ptr, &m_size))
-          throw py::error_already_set();
-#endif
       }
 
       void *ptr() const
@@ -3497,18 +3330,12 @@ namespace pyopencl
     const void *pattern_ptr;
     PYOPENCL_BUFFER_SIZE_T pattern_len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
     std::unique_ptr<py_buffer_wrapper> pattern_ward(new py_buffer_wrapper);
 
     pattern_ward->get(py_pattern.ptr(), PyBUF_ANY_CONTIGUOUS);
 
     pattern_ptr = pattern_ward->m_buf.buf;
     pattern_len = pattern_ward->m_buf.len;
-#else
-    py::object pattern_ward = py_pattern;
-    if (PyObject_AsReadBuffer(py_pattern.ptr(), &pattern_ptr, &pattern_len))
-      throw py::error_already_set();
-#endif
 
     size_t fill_size = dst.size();
     if (!byte_count.is_none())
@@ -3768,7 +3595,7 @@ namespace pyopencl
   class program : noncopyable
   {
     public:
-      enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY };
+      enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY, KND_IL };
 
     private:
       cl_program m_program;
@@ -4025,18 +3852,12 @@ namespace pyopencl
       const void *buf;
       PYOPENCL_BUFFER_SIZE_T len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
       py_buffer_wrapper buf_wrapper;
 
       buf_wrapper.get(py::object(py_binaries[i]).ptr(), PyBUF_ANY_CONTIGUOUS);
 
       buf = buf_wrapper.m_buf.buf;
       len = buf_wrapper.m_buf.len;
-#else
-      if (PyObject_AsReadBuffer(
-            py::object(py_binaries[i]).ptr(), &buf, &len))
-        throw py::error_already_set();
-#endif
 
       binaries.push_back(reinterpret_cast<const unsigned char *>(buf));
       sizes.push_back(len);
@@ -4106,6 +3927,35 @@ namespace pyopencl
 
 
 
+#if (PYOPENCL_CL_VERSION >= 0x2010)
+  inline
+  program *create_program_with_il(
+      context &ctx,
+      std::string const &src)
+  {
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithIL");
+    cl_program result = clCreateProgramWithIL(
+        ctx.data(), src.c_str(), src.size(), &status_code);
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithIL", status_code);
+
+    try
+    {
+      return new program(result, false, program::KND_IL);
+    }
+    catch (...)
+    {
+      clReleaseProgram(result);
+      throw;
+    }
+  }
+#endif
+
+
+
+
+
 #if PYOPENCL_CL_VERSION >= 0x1020
   inline
   program *link_program(
@@ -4255,7 +4105,6 @@ namespace pyopencl
         const void *buf;
         PYOPENCL_BUFFER_SIZE_T len;
 
-#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
         py_buffer_wrapper buf_wrapper;
 
         try
@@ -4271,14 +4120,6 @@ namespace pyopencl
 
         buf = buf_wrapper.m_buf.buf;
         len = buf_wrapper.m_buf.len;
-#else
-        if (PyObject_AsReadBuffer(py_buffer.ptr(), &buf, &len))
-        {
-          PyErr_Clear();
-          throw error("Kernel.set_arg", CL_INVALID_VALUE,
-              "invalid kernel argument");
-        }
-#endif
 
         PYOPENCL_CALL_GUARDED(clSetKernelArg,
             (m_kernel, arg_index, len, buf));
@@ -4427,6 +4268,11 @@ namespace pyopencl
           case CL_KERNEL_ARG_TYPE_NAME:
           case CL_KERNEL_ARG_NAME:
             PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);
+
+          case CL_KERNEL_ARG_TYPE_QUALIFIER:
+            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_kernel_arg_type_qualifier);
 #undef PYOPENCL_FIRST_ARG
           default:
             throw error("Kernel.get_arg_info", CL_INVALID_VALUE);
@@ -4465,7 +4311,8 @@ namespace pyopencl
       py::object py_local_work_size,
       py::object py_global_work_offset,
       py::object py_wait_for,
-      bool g_times_l)
+      bool g_times_l,
+      bool allow_empty_ndrange)
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
@@ -4520,6 +4367,33 @@ namespace pyopencl
       global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
     }
 
+    if (allow_empty_ndrange)
+    {
+#if PYOPENCL_CL_VERSION >= 0x1020
+      bool is_empty = false;
+      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+        if (global_work_size[work_axis] == 0)
+          is_empty = true;
+      if (local_work_size_ptr)
+        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+          if (local_work_size_ptr[work_axis] == 0)
+            is_empty = true;
+
+      if (is_empty)
+      {
+        cl_event evt;
+        PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
+              cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+        PYOPENCL_RETURN_NEW_EVENT(evt);
+      }
+#else
+      // Emulating this via clEnqueueWaitForEvents + clEnqueueMarker is not
+      // equivalent in the case of an out-of-order queue.
+      throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+          "allow_empty_ndrange requires OpenCL 1.2");
+#endif
+    }
+
     PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
           cl_event evt;
           PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
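
The allow_empty_ndrange branch above turns a launch with a zero-sized global or
local axis into a marker on the same wait list, so callers still get a usable
event back instead of an error from clEnqueueNDRangeKernel. A hedged usage
sketch, assuming the binding is reachable as cl.enqueue_nd_range_kernel with
the argument names shown in wrap_cl_part_2.cpp below:

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, """
        __kernel void twice(__global float *a) { a[get_global_id(0)] *= 2; }
        """).build()

    a = cl_array.arange(queue, 16, dtype=np.float32)
    knl = prg.twice
    knl.set_args(a.data)

    # n may legitimately be 0 in generated code; with allow_empty_ndrange=True
    # the zero-sized launch degenerates into a marker (OpenCL >= 1.2 only).
    n = 0
    evt = cl.enqueue_nd_range_kernel(
            queue, knl, (n,), None, allow_empty_ndrange=True)
    evt.wait()
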
diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp
index a87158f95344ba3e0c1edcffbffc98966bf22c81..9079d058b8b6cf2f835613584b991ede77c66798 100644
--- a/src/wrap_cl_part_1.cpp
+++ b/src/wrap_cl_part_1.cpp
@@ -24,6 +24,9 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 
+#define NO_IMPORT_ARRAY
+#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
+
 #include "wrap_cl.hpp"
 
 
@@ -218,7 +221,7 @@ void pyopencl_expose_part_1(py::module &m)
       .def_static("from_int_ptr", memory_object_from_int,
         "(static method) Return a new Python object referencing the C-level "
         ":c:type:`cl_mem` object at the location pointed to "
-        "by *int_ptr_value*. The relevant :c:func:`clRetain*` function "
+        "by *int_ptr_value*. The relevant ``clRetain*`` function "
         "will be called if *retain* is True."
         "If the previous owner of the object will *not* release the reference, "
         "*retain* should be set to *False*, to effectively transfer ownership to "
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index edef9ab7d8ce013a80be87647af6c9e3a6b20d06..cbd1f9a40f85ee0c71be5adfe38bc8cd1cd20e50 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -24,6 +24,9 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 
+#define NO_IMPORT_ARRAY
+#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
+
 #include "wrap_cl.hpp"
 
 
@@ -284,7 +287,7 @@ void pyopencl_expose_part_2(py::module &m)
       ;
   }
 
-  m.def("_enqueue_svm_memcpyw", enqueue_svm_memcpy,
+  m.def("_enqueue_svm_memcpy", enqueue_svm_memcpy,
       py::arg("queue"),
       py::arg("is_blocking"),
       py::arg("dst"),
@@ -351,6 +354,7 @@ void pyopencl_expose_part_2(py::module &m)
       .value("UNKNOWN", cls::KND_UNKNOWN)
       .value("SOURCE", cls::KND_SOURCE)
       .value("BINARY", cls::KND_BINARY)
+      .value("IL", cls::KND_IL)
       ;
 
     py::class_<cls>(m, "_Program", py::dynamic_attr())
@@ -405,6 +409,10 @@ void pyopencl_expose_part_2(py::module &m)
       ;
   }
 
+#if (PYOPENCL_CL_VERSION >= 0x2010)
+  m.def("_create_program_with_il", create_program_with_il);
+#endif
+
 #if PYOPENCL_CL_VERSION >= 0x1020
   m.def("unload_platform_compiler", unload_platform_compiler);
 #endif
@@ -453,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("local_work_size"),
       py::arg("global_work_offset")=py::none(),
       py::arg("wait_for")=py::none(),
-      py::arg("g_times_l")=false
+      py::arg("g_times_l")=false,
+      py::arg("allow_empty_ndrange")=false
       );
 
   // TODO: clEnqueueNativeKernel
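
The newly bound _create_program_with_il above exposes clCreateProgramWithIL on
OpenCL 2.1+ platforms, and the program_kind enum gains a matching IL value. A
minimal sketch of driving it through the low-level module (the high-level
convenience path, if any, is not part of this diff, so the call goes straight
to _cl; the .spv file name is purely illustrative):

    import pyopencl as cl
    import pyopencl._cl as _cl

    ctx = cl.create_some_context()

    # Raw SPIR-V bytes, e.g. emitted by clang + llvm-spirv.
    with open("kernel.spv", "rb") as f:
        spirv_blob = f.read()

    if ctx._get_cl_version() >= (2, 1) and cl.get_cl_header_version() >= (2, 1):
        # Returns a low-level _Program whose kind is the new IL value; it
        # still needs to be built before kernels can be extracted from it.
        prg = _cl._create_program_with_il(ctx, spirv_blob)
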
diff --git a/src/wrap_constants.cpp b/src/wrap_constants.cpp
index 48a165c08afbef8b11648e4cad3c979fb727ea3e..2420be38b9849a32a46c135735b163aa4daec974 100644
--- a/src/wrap_constants.cpp
+++ b/src/wrap_constants.cpp
@@ -24,6 +24,9 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 
+#define NO_IMPORT_ARRAY
+#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
+
 #include "wrap_cl.hpp"
 
 
@@ -58,6 +61,7 @@ namespace
   class addressing_mode { };
   class filter_mode { };
   class sampler_info { };
+  class sampler_properties { };
   class map_flags { };
   class program_info { };
   class program_build_info { };
@@ -759,6 +763,13 @@ void pyopencl_expose_constants(py::module &m)
 #endif
   }
 
+  {
+    py::class_<sampler_properties> cls(m, "sampler_properties");
+    ADD_ATTR(SAMPLER_, NORMALIZED_COORDS);
+    ADD_ATTR(SAMPLER_, ADDRESSING_MODE);
+    ADD_ATTR(SAMPLER_, FILTER_MODE);
+  }
+
   {
     py::class_<map_flags> cls(m, "map_flags");
     ADD_ATTR(MAP_, READ);
diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp
index bf6853ea919509b679f799d7937c109bffbba8ad..4799244ff72c583a73a9d92d43229a171be4ee9c 100644
--- a/src/wrap_helpers.hpp
+++ b/src/wrap_helpers.hpp
@@ -173,7 +173,7 @@ namespace
       py::arg("retain")=true, \
       "(static method) Return a new Python object referencing the C-level " \
       ":c:type:`" #CL_TYPENAME "` object at the location pointed to " \
-      "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " \
+      "by *int_ptr_value*. The relevant ``clRetain*`` function " \
       "will be called if *retain* is True." \
       "If the previous owner of the object will *not* release the reference, " \
       "*retain* should be set to *False*, to effectively transfer ownership to " \
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index a6db0924e0aae8ac8ce3dfe50d4380234be6aa79..3f26a2f94a2eac2209f5e55563f04daf05e5d942 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -28,6 +28,9 @@
 // first to prevent OS X from overriding a bunch of macros. (e.g. isspace)
 #include <Python.h>
 
+#define NO_IMPORT_ARRAY
+#define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
+
 #include <memory>
 #include <vector>
 #include "wrap_helpers.hpp"
@@ -101,6 +104,9 @@ namespace
 
       pointer_type allocate(size_type s)
       {
+        if (s == 0)
+          return nullptr;
+
         return pyopencl::create_buffer(m_context->data(), m_flags, s, 0);
       }
   };
@@ -134,6 +140,9 @@ namespace
 
       pointer_type allocate(size_type s)
       {
+        if (s == 0)
+          return nullptr;
+
         pointer_type ptr =  pyopencl::create_buffer(
             m_context->data(), m_flags, s, 0);
 
@@ -141,7 +150,10 @@ namespace
         // This looks (and is) expensive. But immediate allocators
         // have their main use in memory pools, whose basic assumption
         // is that allocation is too expensive anyway--but they rely
-        // on exact 'out-of-memory' information.
+        // on 'out-of-memory' being reported at allocation time. (If it is
+        // reported in a deferred manner, the pool has no way to react,
+        // e.g. by freeing unused memory, because it is no longer part of
+        // the call stack.)
         unsigned zero = 0;
         PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, (
               m_queue.data(),
@@ -185,6 +197,15 @@ namespace
       alloc.try_release_blocks();
     }
 
+    if (!mem)
+    {
+      if (size == 0)
+        return nullptr;
+      else
+        throw pyopencl::error("Allocator", CL_INVALID_VALUE,
+            "allocator succeeded but returned NULL cl_mem");
+    }
+
     try
     {
       return new pyopencl::buffer(mem, false);
@@ -238,8 +259,8 @@ namespace
     wrapper
       .def_property_readonly("held_blocks", &cls::held_blocks)
       .def_property_readonly("active_blocks", &cls::active_blocks)
-      .DEF_SIMPLE_STATIC_METHOD(bin_number)
-      .DEF_SIMPLE_STATIC_METHOD(alloc_size)
+      .DEF_SIMPLE_METHOD(bin_number)
+      .DEF_SIMPLE_METHOD(alloc_size)
       .DEF_SIMPLE_METHOD(free_held)
       .DEF_SIMPLE_METHOD(stop_holding)
       ;
@@ -272,7 +293,8 @@ void pyopencl_expose_mempool(py::module &m)
           std::shared_ptr<pyopencl::context> const &>())
       .def(py::init<
           std::shared_ptr<pyopencl::context> const &,
-          cl_mem_flags>())
+          cl_mem_flags>(),
+          py::arg("queue"), py::arg("mem_flags"))
       ;
   }
 
@@ -282,7 +304,8 @@ void pyopencl_expose_mempool(py::module &m)
         m, "_tools_ImmediateAllocator");
     wrapper
       .def(py::init<pyopencl::command_queue &>())
-      .def(py::init<pyopencl::command_queue &, cl_mem_flags>())
+      .def(py::init<pyopencl::command_queue &, cl_mem_flags>(),
+          py::arg("queue"), py::arg("mem_flags"))
       ;
   }
 
@@ -293,7 +316,10 @@ void pyopencl_expose_mempool(py::module &m)
       cls, /* boost::noncopyable, */
       std::shared_ptr<cls>> wrapper( m, "MemoryPool");
     wrapper
-      .def(py::init<cl_allocator_base const &>())
+      .def(py::init<cl_allocator_base const &, unsigned>(),
+          py::arg("allocator"),
+          py::arg("leading_bits_in_bin_id")=4
+          )
       .def("allocate", device_pool_allocate)
       .def("__call__", device_pool_allocate)
       // undoc for now
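
Taken together, the wrap_mempool.cpp changes make bin_number/alloc_size
instance methods, give the allocator constructors keyword names, and expose
leading_bits_in_bin_id on MemoryPool. A short usage sketch along the lines of
the updated test_mempool_2 further down in this diff:

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.tools import ImmediateAllocator, MemoryPool

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # leading_bits_in_bin_id=4 is the default; larger values mean finer bins
    # and less rounding waste, at the cost of more bins being kept around.
    pool = MemoryPool(ImmediateAllocator(queue), leading_bits_in_bin_id=4)

    a = cl_array.zeros(queue, 1000, np.float32, allocator=pool)

    nbin = pool.bin_number(4096)           # instance methods now, not static
    assert pool.alloc_size(nbin) >= 4096

    del a                                  # storage returns to the pool
    pool.free_held()                       # hand held blocks back to the driver
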
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 0360d6a348b1e1e1ab46f6bb4df0785252b2bea4..660c7dfc868f9ae8cf356b4199da68f385e5d8dd 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -1,6 +1,5 @@
 #! /usr/bin/env python
 
-from __future__ import division, with_statement, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
@@ -24,7 +23,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from six.moves import range, zip
+# avoid spurious: pytest.mark.parametrize is not callable
+# pylint: disable=not-callable
+
 import numpy as np
 import numpy.linalg as la
 import sys
@@ -75,7 +76,7 @@ def test_elwise_kernel_with_options(ctx_factory):
 
     in_gpu = clrand(queue, (50,), np.float32)
 
-    options = ['-D', 'ADD_ONE']
+    options = ["-D", "ADD_ONE"]
     add_one = ElementwiseKernel(
         context,
         "float* out, const float *in",
@@ -378,7 +379,7 @@ def test_dot(ctx_factory):
                 vdot_ab = np.vdot(a, b)
             except NotImplementedError:
                 import sys
-                is_pypy = '__pypy__' in sys.builtin_module_names
+                is_pypy = "__pypy__" in sys.builtin_module_names
                 if is_pypy:
                     print("PYPY: VDOT UNIMPLEMENTED")
                     continue
@@ -500,7 +501,7 @@ def summarize_error(obtained, desired, orig, thresh=1e-5):
             bad_count += 1
 
             if bad_count < bad_limit:
-                entries.append("%r (want: %r, got: %r, orig: %r)" % (
+                entries.append("{!r} (want: {!r}, got: {!r}, orig: {!r})".format(
                     obtained[i], desired[i], obtained[i], orig[i]))
         else:
             if bad_count:
@@ -849,7 +850,7 @@ def test_sort(ctx_factory, scan_kernel):
 
         numpy_elapsed = numpy_end-dev_end
         dev_elapsed = dev_end-dev_start
-        print("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
+        print("  dev: {:.2f} MKeys/s numpy: {:.2f} MKeys/s ratio: {:.2f}x".format(
                 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed))
         assert (a_dev_sorted.get() == a_sorted).all()
 
@@ -1070,7 +1071,7 @@ def test_bitonic_sort(ctx_factory, size, dtype):
 @pytest.mark.bitonic
 def test_bitonic_argsort(ctx_factory, size, dtype):
     import sys
-    is_pypy = '__pypy__' in sys.builtin_module_names
+    is_pypy = "__pypy__" in sys.builtin_module_names
 
     if not size and is_pypy:
         # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
diff --git a/test/test_array.py b/test/test_array.py
index 9c53e9f57a94821930b86044d2180c31ab61e9c6..39f8fd74e572c49e19d1614d1c352fb625f5553b 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1,5 +1,4 @@
 #! /usr/bin/env python
-from __future__ import division, with_statement, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -23,11 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+# avoid spurious: pytest.mark.parametrize is not callable
+# pylint: disable=not-callable
+
 import numpy as np
 import numpy.linalg as la
 import sys
 
-from six.moves import range
 import pytest
 
 import pyopencl as cl
@@ -154,7 +155,7 @@ def test_mix_complex(ctx_factory):
                         # served a Python complex that is really a
                         # smaller numpy complex.
 
-                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
+                        print("HOST_DTYPE: {} DEV_DTYPE: {}".format(
                                 host_result.dtype, dev_result.dtype))
 
                         dev_result = dev_result.astype(host_result.dtype)
@@ -627,8 +628,8 @@ def test_bitwise(ctx_factory):
 
         a = a_dev.get()
         b = b_dev.get()
-        s = int((clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64)
-                 .astype(b_dtype).get()))
+        s = int(clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64)
+                 .astype(b_dtype).get())
 
         import operator as o
 
@@ -809,7 +810,7 @@ def test_nan_arithmetic(ctx_factory):
         a = np.random.randn(*shape).astype(np.float32)
         from random import randrange
         for i in range(size // 10):
-            a[randrange(0, size)] = float('nan')
+            a[randrange(0, size)] = float("nan")
         return a
 
     size = 1 << 20
@@ -868,7 +869,7 @@ def test_diff(ctx_factory):
     a = a_dev.get()
 
     err = la.norm(
-            (cl.array.diff(a_dev).get() - np.diff(a)))
+            cl.array.diff(a_dev).get() - np.diff(a))
     assert err < 1e-4
 
 
@@ -1152,7 +1153,7 @@ def test_reshape(ctx_factory):
     # using -1 as unknown dimension
     assert a_dev.reshape(-1, 32).shape == (4, 32)
     assert a_dev.reshape((32, -1)).shape == (32, 4)
-    assert a_dev.reshape(((8, -1, 4))).shape == (8, 4, 4)
+    assert a_dev.reshape((8, -1, 4)).shape == (8, 4, 4)
 
     import pytest
     with pytest.raises(ValueError):
@@ -1314,6 +1315,32 @@ def test_multi_put(ctx_factory):
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
 
+def test_get_async(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    a = np.random.rand(10**6).astype(np.dtype("float32"))
+    a_gpu = cl_array.to_device(queue, a)
+    b = a + a**5 + 1
+    b_gpu = a_gpu + a_gpu**5 + 1
+
+    # get(async_=True) is deprecated, but test it anyway
+    b1 = b_gpu.get(async_=True)  # testing that this waits for events
+    b_gpu.finish()
+    assert np.abs(b1 - b).mean() < 1e-5
+
+    b1, evt = b_gpu.get_async()  # testing that this waits for events
+    evt.wait()
+    assert np.abs(b1 - b).mean() < 1e-5
+
+    wait_event = cl.UserEvent(context)
+    b_gpu.add_event(wait_event)
+    b, evt = b_gpu.get_async()  # testing that this doesn't hang
+    wait_event.set_status(cl.command_execution_status.COMPLETE)
+    evt.wait()
+    assert np.abs(b1 - b).mean() < 1e-5
+
+
 def test_outoforderqueue_get(ctx_factory):
     context = ctx_factory()
     try:
@@ -1321,7 +1348,7 @@ def test_outoforderqueue_get(ctx_factory):
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
-    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a = np.random.rand(10**6).astype(np.dtype("float32"))
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = a_gpu + a_gpu**5 + 1
     b1 = b_gpu.get()  # testing that this waits for events
@@ -1336,7 +1363,7 @@ def test_outoforderqueue_copy(ctx_factory):
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
-    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a = np.random.rand(10**6).astype(np.dtype("float32"))
     a_gpu = cl_array.to_device(queue, a)
     c_gpu = a_gpu**2 - 7
     b_gpu = c_gpu.copy()  # testing that this waits for and creates events
@@ -1354,8 +1381,8 @@ def test_outoforderqueue_indexing(ctx_factory):
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
-    a = np.random.rand(10**6).astype(np.dtype('float32'))
-    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32'))
+    a = np.random.rand(10**6).astype(np.dtype("float32"))
+    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype("int32"))
     a_gpu = cl_array.to_device(queue, a)
     i_gpu = cl_array.to_device(queue, i)
     c_gpu = (a_gpu**2)[i_gpu - 10000]
@@ -1378,7 +1405,7 @@ def test_outoforderqueue_reductions(ctx_factory):
     except Exception:
         pytest.skip("out-of-order queue not available")
     # 0/1 values to avoid accumulated rounding error
-    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
+    a = (np.random.rand(10**6) > 0.5).astype(np.dtype("float32"))
     a[800000] = 10  # all<5 looks true until near the end
     a_gpu = cl_array.to_device(queue, a)
     b1 = cl_array.sum(a_gpu).get()
@@ -1387,9 +1414,39 @@ def test_outoforderqueue_reductions(ctx_factory):
     assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
 
 
+def test_negative_dim_rejection(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    with pytest.raises(ValueError):
+        cl_array.Array(queue, shape=-10, dtype=np.float64)
+
+    with pytest.raises(ValueError):
+        cl_array.Array(queue, shape=(-10,), dtype=np.float64)
+
+    for left_dim in (-1, 0, 1):
+        with pytest.raises(ValueError):
+            cl_array.Array(queue, shape=(left_dim, -1), dtype=np.float64)
+
+    for right_dim in (-1, 0, 1):
+        with pytest.raises(ValueError):
+            cl_array.Array(queue, shape=(-1, right_dim), dtype=np.float64)
+
+
+@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2)])
+def test_zero_size_array(ctx_factory, empty_shape):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+    b = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+    b.fill(1)
+    c = a + b
+    c_host = c.get()
+    cl_array.to_device(queue, c_host)
+
+
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the
-    # tests.
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
diff --git a/test/test_arrays_in_structs.py b/test/test_arrays_in_structs.py
new file mode 100644
index 0000000000000000000000000000000000000000..625b6105448080bf361aa02d66950dba28207fe9
--- /dev/null
+++ b/test/test_arrays_in_structs.py
@@ -0,0 +1,101 @@
+__copyright__ = "Copyright (C) 2020 Sotiris Niarchos"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+
+import pyopencl as cl
+import pyopencl.cltypes as cltypes
+import pyopencl.tools as cl_tools
+from pyopencl import mem_flags
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+
+def test_struct_with_array_fields(ctx_factory):
+    #
+    # typedef struct {
+    #     uint x[2];
+    #     float y;
+    #     uint z[3][4];
+    # } my_struct;
+    #
+    cl_ctx = ctx_factory()
+    device = cl_ctx.devices[0]
+    queue = cl.CommandQueue(cl_ctx)
+
+    my_struct = np.dtype([
+        ("x", cltypes.uint, 2),
+        ("y", cltypes.int),
+        ("z", cltypes.uint, (3, 4))
+    ])
+    my_struct, cdecl = cl_tools.match_dtype_to_c_struct(
+        device, "my_struct", my_struct
+    )
+
+    # a random buffer of 4 structs
+    my_struct_arr = np.array([
+        ([81, 24], -57, [[15, 28, 45,  7], [71, 95, 65, 84], [2, 11, 59,  9]]),
+        ([5, 20],  47, [[15, 53,  7, 59], [73, 22, 27, 86], [59,  6, 39, 49]]),
+        ([11, 99], -32, [[73, 83,  4, 65], [19, 21, 22, 27], [1, 55,  6, 64]]),
+        ([57, 38], -54, [[74, 90, 38, 67], [77, 30, 99, 18], [91,  3, 63, 67]])
+    ], dtype=my_struct)
+
+    expected_res = []
+    for x in my_struct_arr:
+        expected_res.append(int(np.sum(x[0]) + x[1] + np.sum(x[2])))
+    expected_res = np.array(expected_res, dtype=cltypes.int)
+
+    kernel_src = """%s
+    // this kernel sums every number contained in each struct
+    __kernel void array_structs(__global my_struct *structs, __global int *res) {
+        int i = get_global_id(0);
+        my_struct s = structs[i];
+        res[i] = s.x[0] + s.x[1] + s.y;
+        for (int r = 0; r < 3; r++)
+            for (int c = 0; c < 4; c++)
+                res[i] += s.z[r][c];
+    }""" % cdecl
+
+    mem_flags1 = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
+    mem_flags2 = mem_flags.WRITE_ONLY
+
+    my_struct_buf = cl.Buffer(cl_ctx, mem_flags1, hostbuf=my_struct_arr)
+    res_buf = cl.Buffer(cl_ctx, mem_flags2, size=expected_res.nbytes)
+
+    program = cl.Program(cl_ctx, kernel_src).build()
+    kernel = program.array_structs
+    kernel(queue, (4,), None, my_struct_buf, res_buf)
+
+    res = np.empty_like(expected_res)
+    cl.enqueue_copy(queue, res, res_buf)
+
+    assert (res == expected_res).all()
+
+
+if __name__ == "__main__":
+
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 9c844016077ffaf31252b7852fd43138eed62fbd..409875f8a1c1ff842982dbf4247637f99a7b6cd5 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -1,5 +1,3 @@
-from __future__ import division, print_function, absolute_import
-
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
@@ -22,7 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from six.moves import range
+# avoid spurious: pytest.mark.parametrize is not callable
+# avoid spurious: Module 'scipy.special' has no 'jn' member; maybe 'jv'
+# pylint: disable=not-callable,no-member
+
 
 import math
 import numpy as np
@@ -341,7 +342,7 @@ def test_complex_bessel(ctx_factory, ref_src):
     if ref_src == "pyfmmlib":
         pyfmmlib = pytest.importorskip("pyfmmlib")
 
-        jv_ref = np.zeros(len(z), 'complex')
+        jv_ref = np.zeros(len(z), "complex")
 
         vin = v+1
 
@@ -453,7 +454,7 @@ def test_outoforderqueue_clmath(ctx_factory):
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
-    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a = np.random.rand(10**6).astype(np.dtype("float32"))
     a_gpu = cl_array.to_device(queue, a)
     # testing that clmath functions wait for and create events
     b_gpu = clmath.fabs(clmath.sin(a_gpu * 5))
diff --git a/test/test_clrandom.py b/test/test_clrandom.py
index b6b2094e2b0de7630f66c0db876452d81226bbc0..1ce479b2b1f18b099c9457d43a03c4b2327ea77f 100644
--- a/test/test_clrandom.py
+++ b/test/test_clrandom.py
@@ -1,5 +1,3 @@
-from __future__ import division, print_function, absolute_import
-
 __copyright__ = "Copyright (C) 2018 Matt Wala"
 
 __license__ = """
@@ -22,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+# avoid spurious: pytest.mark.parametrize is not callable
+# pylint: disable=not-callable
+
 import numpy as np
 import pytest
 
diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py
index bfbf4f16edd757c3bf8e8bc59fb2d2ed311c0d29..162e5292af57f4586269a0ce2a72b2a5ecf2faf5 100644
--- a/test/test_enqueue_copy.py
+++ b/test/test_enqueue_copy.py
@@ -1,5 +1,4 @@
 #! /usr/bin/env python
-from __future__ import division, with_statement, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2016 Shane J. Latham"
 
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index ee3219e971b931c50351332f140f4cfdf4d9591f..193f198fa8d305780b7bac2dd252c0eeba9b7f91 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -1,5 +1,3 @@
-from __future__ import division, absolute_import, print_function
-
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
@@ -22,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from six.moves import range
+# avoid spurious: pytest.mark.parametrize is not callable
+# pylint: disable=not-callable
+
 
 import numpy as np
 import numpy.linalg as la
@@ -33,7 +33,8 @@ import pyopencl.array as cl_array
 import pyopencl.cltypes as cltypes
 import pyopencl.clrandom
 from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests,
+        ImmediateAllocator, DeferredAllocator)
 from pyopencl.characterize import get_pocl_version
 
 # Are CL implementations crashy? You be the judge. :)
@@ -45,7 +46,7 @@ else:
     faulthandler.enable()
 
 
-def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'):
+def _skip_if_pocl(plat, up_to_version, msg="unsupported by pocl"):
     if plat.vendor == "The pocl project":
         if up_to_version is None or get_pocl_version(plat) <= up_to_version:
             pytest.skip(msg)
@@ -56,6 +57,9 @@ def test_get_info(ctx_factory):
     device, = ctx.devices
     platform = device.platform
 
+    device.persistent_unique_id
+    device.hashable_model_and_version_identifier
+
     failure_count = [0]
 
     pocl_quirks = [
@@ -378,7 +382,7 @@ def test_image_2d(ctx_factory):
     if "Intel" in device.vendor and "31360.31426" in device.version:
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP')
+    _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
 
     prg = cl.Program(context, """
         __kernel void copy_image(
@@ -450,7 +454,7 @@ def test_image_3d(ctx_factory):
     if device.platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP')
+    _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
 
     prg = cl.Program(context, """
         __kernel void copy_image_plane(
@@ -551,20 +555,42 @@ def test_mempool(ctx_factory):
     pool.stop_holding()
 
 
-def test_mempool_2():
-    from pyopencl.tools import MemoryPool
+def test_mempool_2(ctx_factory):
+    from pyopencl.tools import MemoryPool, ImmediateAllocator
     from random import randrange
 
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    pool = MemoryPool(ImmediateAllocator(queue))
+
     for i in range(2000):
         s = randrange(1 << 31) >> randrange(32)
-        bin_nr = MemoryPool.bin_number(s)
-        asize = MemoryPool.alloc_size(bin_nr)
+        bin_nr = pool.bin_number(s)
+        asize = pool.alloc_size(bin_nr)
 
         assert asize >= s, s
-        assert MemoryPool.bin_number(asize) == bin_nr, s
+        assert pool.bin_number(asize) == bin_nr, s
         assert asize < asize*(1+1/8)
 
 
+@pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator])
+def test_allocator(ctx_factory, allocator_cls):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    if allocator_cls is DeferredAllocator:
+        allocator = allocator_cls(context)
+    else:
+        allocator = allocator_cls(queue)
+
+    mem = allocator(15)
+    mem2 = allocator(0)
+
+    assert mem is not None
+    assert mem2 is None
+
+
 def test_vector_args(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -623,9 +649,11 @@ def test_context_dep_memoize(ctx_factory):
     assert counter[0] == 1
 
 
-def test_can_build_binary(ctx_factory):
+def test_can_build_and_run_binary(ctx_factory):
     ctx = ctx_factory()
-    device, = ctx.devices
+    queue = cl.CommandQueue(ctx)
+
+    device = queue.device
 
     program = cl.Program(ctx, """
     __kernel void simple(__global float *in, __global float *out)
@@ -638,12 +666,18 @@ def test_can_build_binary(ctx_factory):
     foo = cl.Program(ctx, [device], [binary])
     foo.build()
 
+    n = 256
+    a_dev = cl.clrandom.rand(queue, n, np.float32)
+    dest_dev = cl_array.empty_like(a_dev)
+
+    foo.simple(queue, (n,), (16,), a_dev.data, dest_dev.data)
+
 
 def test_enqueue_barrier_marker(ctx_factory):
     ctx = ctx_factory()
     # Still relevant on pocl 1.0RC1.
     _skip_if_pocl(
-            ctx.devices[0].platform, (1, 0), 'pocl crashes on enqueue_barrier')
+            ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier")
 
     queue = cl.CommandQueue(ctx)
 
@@ -670,7 +704,7 @@ def test_unload_compiler(platform):
             or cl.get_cl_header_version() < (1, 2)):
         from pytest import skip
         skip("clUnloadPlatformCompiler is only available in OpenCL 1.2")
-    _skip_if_pocl(platform, (0, 13), 'pocl does not support unloading compiler')
+    _skip_if_pocl(platform, (0, 13), "pocl does not support unloading compiler")
     if platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("Intel proprietary driver does not support unloading compiler")
@@ -697,7 +731,7 @@ def test_platform_get_devices(ctx_factory):
         devs = platform.get_devices(dev_type)
         if dev_type in (cl.device_type.DEFAULT,
                         cl.device_type.ALL,
-                        getattr(cl.device_type, 'CUSTOM', None)):
+                        getattr(cl.device_type, "CUSTOM", None)):
             continue
         for dev in devs:
             assert dev.type & dev_type == dev_type
@@ -730,22 +764,22 @@ def test_user_event(ctx_factory):
     Thread(target=event_waiter1, args=(evt, 1)).start()
     sleep(.05)
     if status.get(1, False):
-        raise RuntimeError('UserEvent triggered before set_status')
+        raise RuntimeError("UserEvent triggered before set_status")
     evt.set_status(cl.command_execution_status.COMPLETE)
     sleep(.05)
     if not status.get(1, False):
-        raise RuntimeError('UserEvent.wait timeout')
+        raise RuntimeError("UserEvent.wait timeout")
     assert evt.command_execution_status == cl.command_execution_status.COMPLETE
 
     evt = cl.UserEvent(ctx)
     Thread(target=event_waiter2, args=(evt, 2)).start()
     sleep(.05)
     if status.get(2, False):
-        raise RuntimeError('UserEvent triggered before set_status')
+        raise RuntimeError("UserEvent triggered before set_status")
     evt.set_status(cl.command_execution_status.COMPLETE)
     sleep(.05)
     if not status.get(2, False):
-        raise RuntimeError('cl.wait_for_events timeout on UserEvent')
+        raise RuntimeError("cl.wait_for_events timeout on UserEvent")
     assert evt.command_execution_status == cl.command_execution_status.COMPLETE
 
 
@@ -761,8 +795,8 @@ def test_buffer_get_host_array(ctx_factory):
     buf = cl.Buffer(ctx, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=host_buf)
     host_buf2 = buf.get_host_array(25, np.float32)
     assert (host_buf == host_buf2).all()
-    assert (host_buf.__array_interface__['data'][0]
-            == host_buf.__array_interface__['data'][0])
+    assert (host_buf.__array_interface__["data"][0]
+            == host_buf2.__array_interface__["data"][0])
     assert host_buf2.base is buf
 
     buf = cl.Buffer(ctx, mf.READ_WRITE | mf.ALLOC_HOST_PTR, size=100)
@@ -918,8 +952,8 @@ def test_spirv(ctx_factory):
 
     if (ctx._get_cl_version() < (2, 1)
             or cl.get_cl_header_version() < (2, 1)):
-        from pytest import skip
-        skip("SPIR-V program creation only available in OpenCL 2.1 and higher")
+        pytest.skip("SPIR-V program creation only available "
+                "in OpenCL 2.1 and higher")
 
     n = 50000
 
@@ -930,7 +964,10 @@ def test_spirv(ctx_factory):
     with open("add-vectors-%d.spv" % queue.device.address_bits, "rb") as spv_file:
         spv = spv_file.read()
 
-    prg = cl.Program(ctx, spv)
+    prg = cl.Program(ctx, spv).build()
+    if (not prg.all_kernels()
+            and queue.device.platform.name.startswith("AMD Accelerated")):
+        pytest.skip("SPIR-V program creation on AMD did not result in any kernels")
 
     prg.sum(queue, a_dev.shape, None, a_dev.data, b_dev.data, dest_dev.data)
 
@@ -939,7 +976,7 @@ def test_spirv(ctx_factory):
 
 def test_coarse_grain_svm(ctx_factory):
     import sys
-    is_pypy = '__pypy__' in sys.builtin_module_names
+    is_pypy = "__pypy__" in sys.builtin_module_names
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -954,6 +991,9 @@ def test_coarse_grain_svm(ctx_factory):
     if ("AMD" in dev.platform.name
             and dev.type & cl.device_type.CPU):
         pytest.xfail("AMD CPU doesn't do coarse-grain SVM")
+    if ("AMD" in dev.platform.name
+            and dev.type & cl.device_type.GPU):
+        pytest.xfail("AMD GPU crashes on SVM unmap")
 
     n = 3000
     svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64))
@@ -990,10 +1030,30 @@ def test_coarse_grain_svm(ctx_factory):
         cl.enqueue_copy(queue, new_ary, svm_ary)
         assert np.array_equal(orig_ary*2, new_ary)
 
+    # {{{ https://github.com/inducer/pyopencl/issues/372
+
+    svm_buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32)
+    svm_out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32)
+
+    with cl.SVM(svm_buf_arr).map_rw(queue) as ary:
+        ary.fill(17)
+
+    prg_ro = cl.Program(ctx, """
+        __kernel void twice_ro(__global int *out_g, __global int *in_g)
+        {
+          out_g[get_global_id(0)] = 2*in_g[get_global_id(0)];
+        }
+        """).build()
+
+    prg_ro.twice_ro(queue, svm_buf_arr.shape, None,
+            cl.SVM(svm_out_arr), cl.SVM(svm_buf_arr))
+
+    # }}}
+
 
 def test_fine_grain_svm(ctx_factory):
     import sys
-    is_pypy = '__pypy__' in sys.builtin_module_names
+    is_pypy = "__pypy__" in sys.builtin_module_names
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -1080,6 +1140,68 @@ def test_compile_link(ctx_factory):
     queue.finish()
 
 
+def test_copy_buffer_rect(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    arr1 = cl_array.zeros(queue, (2, 3), "f")
+    arr2 = cl_array.zeros(queue, (4, 5), "f")
+    arr1.fill(1)
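+    # Smoke-test the rectangular-copy path of enqueue_copy (the result is not checked).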
+    cl.enqueue_copy(
+            queue, arr2.data, arr1.data,
+            src_origin=(0, 0), dst_origin=(1, 1),
+            region=arr1.shape[::-1])
+
+
+def test_threaded_nanny_events(ctx_factory):
+    # https://github.com/inducer/pyopencl/issues/296
+
+    import gc
+    import threading
+
+    def create_arrays_thread(n1=10, n2=20):
+        ctx = ctx_factory()
+        queue = cl.CommandQueue(ctx)
+        for i1 in range(n2):
+            for i in range(n1):
+                acl = cl.array.zeros(queue, 10, dtype=np.float32)
+                acl.get()
+            # Garbage collection triggers the error
+            print("collected ", str(gc.collect()))
+            print("stats ", gc.get_stats())
+
+    t1 = threading.Thread(target=create_arrays_thread)
+    t2 = threading.Thread(target=create_arrays_thread)
+
+    t1.start()
+    t2.start()
+
+    t1.join()
+    t2.join()
+
+
+@pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)])
+def test_empty_ndrange(ctx_factory, empty_shape):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if ctx._get_cl_version() < (1, 2) or cl.get_cl_header_version() < (1, 2):
+        pytest.skip("OpenCL 1.2 required for empty NDRange suuport")
+
+    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+
+    prg = cl.Program(ctx, """
+        __kernel void add_two(__global float *a_g)
+        {
+          a_g[get_global_id(0)] += 2;
+        }
+        """).build()
+
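+    # allow_empty_ndrange permits a zero-sized launch, which should complete without error.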
+    prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True)
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.
     import pyopencl  # noqa
diff --git a/travis/build-wheels.sh b/travis/build-wheels.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a7a5a943d7252245b60e2d4af43ff20b155ac746
--- /dev/null
+++ b/travis/build-wheels.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+set -e -x
+
+mkdir -p /deps
+cd /deps
+
+function start_spinner {
+    if [ -n "$SPINNER_PID" ]; then
+        return
+    fi
+
+    >&2 echo "Building libraries..."
+    # Start a process that runs as a keep-alive
+    # to avoid travis quitting if there is no output
+    (while true; do
+        sleep 60
+        >&2 echo "Still building..."
+    done) &
+    SPINNER_PID=$!
+    disown
+}
+
+function stop_spinner {
+    if [ ! -n "$SPINNER_PID" ]; then
+        return
+    fi
+
+    kill $SPINNER_PID
+    unset SPINNER_PID
+
+    >&2 echo "Building libraries finished."
+}
+
+start_spinner
+
+yum install -y git yum openssl-devel
+curl -L -O http://cache.ruby-lang.org/pub/ruby/2.1/ruby-2.1.2.tar.gz
+tar -xf ruby-2.1.2.tar.gz
+cd ruby-2.1.2
+./configure --disable-install-doc --disable-install-rdoc
+make -j4
+make install
+cd ..
+rm -rf ruby-2.1.2
+
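+# Build and install the ocl-icd OpenCL ICD loader so the wheel build can link against libOpenCL.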
+git clone --branch v2.2.12 https://github.com/OCL-dev/ocl-icd
+cd ocl-icd
+curl -L -O https://raw.githubusercontent.com/conda-forge/ocl-icd-feedstock/22625432a0ae85920825dfeb103af9fe7bd6a950/recipe/install-headers.patch
+git apply install-headers.patch
+curl -L -O https://github.com/isuruf/ocl-icd/commit/3862386b51930f95d9ad1089f7157a98165d5a6b.patch
+git apply 3862386b51930f95d9ad1089f7157a98165d5a6b.patch
+autoreconf -i
+chmod +x configure
+./configure --prefix=/usr
+make -j4
+make install
+cd ..
+
+# Compile wheels
+for PYBIN in /opt/python/*/bin; do
+    if [[ "${PYBIN}" == *cp36* ]]; then
+        NUMPY_VERSION="1.11.3"
+    elif [[ "${PYBIN}" == *cp37* ]]; then
+        NUMPY_VERSION="1.14.5"
+    elif [[ "${PYBIN}" == *cp38* ]]; then
+        NUMPY_VERSION="1.17.3"
+    else
+        continue
+    fi
+    # Build with the oldest numpy available to be compatible with newer ones
+    "${PYBIN}/pip" install "numpy==${NUMPY_VERSION}" pybind11 mako
+    "${PYBIN}/pip" wheel /io/ -w wheelhouse/ --no-deps
+done
+
+# Bundle external shared libraries into the wheels
+for whl in wheelhouse/pyopencl*.whl; do
+    auditwheel repair "$whl" -w /io/wheelhouse/
+done
+
+# Bundle license files
+
+/opt/python/cp37-cp37m/bin/pip install delocate
+/opt/python/cp37-cp37m/bin/python /io/travis/fix-wheel.py /deps/ocl-icd/COPYING
+
+if [[ "${TWINE_USERNAME}" == "" ]]; then
+    echo "TWINE_USERNAME not set. Skipping uploading wheels"
+    exit 0
+fi
+
+/opt/python/cp37-cp37m/bin/pip install twine
+for WHEEL in /io/wheelhouse/pyopencl*.whl; do
+    # dev
+    # /opt/python/cp37-cp37m/bin/twine upload \
+    #     --skip-existing \
+    #     --repository-url https://test.pypi.org/legacy/ \
+    #     -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" \
+    #     "${WHEEL}"
+    # prod
+    /opt/python/cp37-cp37m/bin/twine upload \
+        --skip-existing \
+        -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" \
+        "${WHEEL}"
+done
+
+stop_spinner
diff --git a/travis/fix-wheel.py b/travis/fix-wheel.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f8bf10d1a1cb2a8244c1c92a3ed428fd9c5d503
--- /dev/null
+++ b/travis/fix-wheel.py
@@ -0,0 +1,27 @@
+import sys
+import os.path
+import shutil
+from glob import glob
+
+from delocate import wheeltools
+
+
+def add_library(paths):
+    # Copy the given files (e.g. license texts) into the directory where
+    # auditwheel bundled the shared libraries in each pyopencl wheel.
+    wheel_fnames = glob("/io/wheelhouse/pyopencl*.whl")
+    for fname in wheel_fnames:
+        print("Processing", fname)
+        with wheeltools.InWheel(fname, fname):
+            for lib_path in paths:
+                shutil.copy2(lib_path, os.path.join("pyopencl", ".libs"))
+
+
+def main():
+    # File paths to bundle are passed on the command line
+    # (see travis/build-wheels.sh).
+    add_library(sys.argv[1:])
+
+
+if __name__ == "__main__":
+    main()