diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..dcbc21d86f9e4b17ea7e8803d538c4c0f0b6276a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,32 @@ +# https://editorconfig.org/ +# https://github.com/editorconfig/editorconfig-vim +# https://github.com/editorconfig/editorconfig-emacs + +root = true + +[*] +indent_style = space +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.py] +indent_size = 4 + +[*.rst] +indent_size = 4 + +[*.cpp] +indent_size = 2 + +[*.hpp] +indent_size = 2 + +# There may be one in doc/ +[Makefile] +indent_style = tab + +# https://github.com/microsoft/vscode/issues/1679 +[*.md] +trim_trailing_whitespace = false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c85e22c12d78cb2e5a3ef753bc8baf4ee4cb3780..7d8101763de864e20bd92c6be0d1fef0e31d1b31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,11 +18,11 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.x' + python-version: '3.x' - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh ./loopy ./test + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples pylint: name: Pylint @@ -35,10 +35,10 @@ jobs: CONDA_ENVIRONMENT=.test-conda-env.yml USE_CONDA_BUILD=1 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - . ./prepare-and-run-pylint.sh loopy test/test_*.py + . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py - pytest3: - name: Conda Pytest Py3 + pytest: + name: Conda Pytest runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -48,29 +48,58 @@ jobs: curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh . ./build-and-test-py-project-within-miniconda.sh - pytest2: - name: Conda Pytest Py2 + pytest_twice: + name: Conda Pytest Twice (for cache behavior) runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: "Main Script" run: | - sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml - cat .test-conda-env-py2.yml - CONDA_ENVIRONMENT=.test-conda-env-py2.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ./ci-support.sh + build_py_project_in_conda_env + ( test_py_project ) + ( test_py_project ) - pytest_twice: - name: Pytest twice (for cache behavior) on Py${{ matrix.python-version }} + examples: + name: Conda Examples runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: "Main Script" run: | CONDA_ENVIRONMENT=.test-conda-env-py3.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES + EXTRA_INSTALL="matplotlib ipykernel nbconvert" + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
./ci-support.sh + build_py_project_in_conda_env + + curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - + export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + + export PYOPENCL_TEST=portable:pthread + + . ./build-py-project-and-run-examples.sh + run_py_examples + run_ipynb_examples + run_floopy_examples + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ci-support.sh + build_py_project_in_conda_env + build_docs # vim: sw=4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c11e507ee79cdc6f1567acbf6c12bbd7ed22f1cc..d69f0b8c489c07d3aa1512f6f1cbb8ced0f6a2e9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,26 +1,7 @@ -Python 2.7 POCL: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako" - - export LOOPY_NO_CACHE=1 - - export NO_DOCTESTS=1 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - pocl - except: - - tags - artifacts: - reports: - junit: test/pytest.xml - - Python 3 POCL: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh @@ -55,15 +36,15 @@ Python 3 Intel: Python 3 POCL Twice With Cache: - script: - - export PY_EXE=python3 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - - "cd .." - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" + script: | + export PY_EXE=python3 + export PYOPENCL_TEST=portable:pthread + export EXTRA_INSTALL="pybind11 numpy mako" + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ./ci-support.sh + build_py_project_in_venv + ( test_py_project ) + ( test_py_project ) tags: - python3 - pocl @@ -77,7 +58,7 @@ Python 3 POCL Twice With Cache: # PyPy POCL: # script: # - export PY_EXE=pypy -# - export PYOPENCL_TEST=portable +# - export PYOPENCL_TEST=portable:pthread # - export EXTRA_INSTALL="pybind11 numpy mako" # - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh # - ". ./build-and-test-py-project.sh" @@ -88,16 +69,26 @@ Python 3 POCL Twice With Cache: # - tags Python 3 POCL Examples: - script: - - export PY_EXE=python3 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - - ". ./build-py-project-and-run-examples.sh" + script: | + export PY_EXE=python3 + export PYOPENCL_TEST=portable:pthread + export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" + + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
./ci-support.sh + build_py_project_in_venv + + curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - + export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + + . ./build-py-project-and-run-examples.sh + run_py_examples + run_ipynb_examples + run_floopy_examples tags: - python3 - pocl - large-node - - ispc except: - tags @@ -108,26 +99,12 @@ Pylint: - export PY_EXE=python3 - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - - ". ./prepare-and-run-pylint.sh loopy test/test_*.py" + - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 except: - tags -CentOS binary: - script: - - (cd build-helpers; ./make-linux-build-docker.sh --nodate) - - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy) - artifacts: - expire_in: 4 weeks - paths: - - build-helpers/loopy-centos6 - tags: - - docker - only: - - master - retry: 2 - Documentation: script: - EXTRA_INSTALL="pybind11 numpy" @@ -135,13 +112,11 @@ Documentation: - ". ./build-docs.sh" tags: - python3 - only: - - master Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh loopy test" + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples tags: - python3 except: diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index a1fe086b4ac4562aaa8fafd32657aebbd1068e8a..0688c79603a66aabd0e021855e543f751cd76542 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -1,12 +1,12 @@ name: test-conda-env channels: - conda-forge -- defaults +- nodefaults dependencies: - python=3 - git -- conda-forge::numpy +- numpy - pocl - mako - pyopencl @@ -16,13 +16,3 @@ dependencies: - matplotlib - ipykernel - ply - -- pip - -- pip: - - git+https://github.com/inducer/pytools.git - - git+https://github.com/inducer/cgen.git - - git+https://github.com/inducer/pymbolic.git - - git+https://github.com/inducer/genpy.git - - git+https://github.com/inducer/codepy.git - - git+https://github.com/inducer/f2py diff --git a/MANIFEST.in b/MANIFEST.in index 119fb6a1dda0b5b9efd95c5908da4d3563e6a543..293d43ffc8130de870932cc17db18ebe35fd0058 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ include test/*.py include test/*.f90 -recursive-include examples *.py *.cl *.floopy *.sh *.ipynb *.cpp -recursive-include contrib *.vim +recursive-include examples *.py *.cl *.floopy *.sh *.ipynb *.cpp *.loopy +recursive-include contrib *.vim *.py include build-helpers/*.sh include build-helpers/*.spec @@ -18,4 +18,5 @@ include doc/images/*.png include configure.py include Makefile.in include README.rst +include LICENSE include requirements*.txt diff --git a/README.rst b/README.rst index 4aa93e0888a0063c6c0af2a2c8916b85018b182e..46204c29e166f86e170441ee1b54fc51def5f844 100644 --- a/README.rst +++ b/README.rst @@ -4,12 +4,12 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. image:: https://gitlab.tiker.net/inducer/loopy/badges/master/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/loopy/commits/master -.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master +.. 
image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master&event=push :alt: Github Build Status - :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI -.. image:: https://badge.fury.io/py/loo.py.png + :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush +.. image:: https://badge.fury.io/py/loopy.png :alt: Python Package Index Release Page - :target: https://pypi.org/project/loo.py/ + :target: https://pypi.org/project/loopy/ Loopy lets you easily generate the tedious, complicated code that is necessary to get good performance out of GPUs and multi-core CPUs. @@ -38,23 +38,21 @@ Loopy targets array-type computations, such as the following: It is not (and does not want to be) a general-purpose programming language. Loopy is licensed under the liberal `MIT license -`_ and free for commercial, academic, +`_ and free for commercial, academic, and private use. All of Loopy's dependencies can be automatically installed from the package index after using:: - pip install loo.py + pip install loopy In addition, Loopy is compatible with and enhances -`pyopencl `_. +`pyopencl `_. --- Places on the web related to Loopy: -* `Python package index `_ (download releases) Note the extra '.' in the PyPI identifier! - -* `Documentation `_ (read how things work) -* `Github `_ (get latest source code, file bugs) -* `Wiki `_ (read installation tips, get examples, read FAQ) -* `Homepage `_ +* `Python package index `_ (download releases) +* `Documentation `_ (read how things work) +* `Github `_ (get latest source code, file bugs) +* `Homepage `_ diff --git a/build-helpers/.gitignore b/build-helpers/.gitignore deleted file mode 100644 index fef83014eecb14936006b90afc65595dd7d30b77..0000000000000000000000000000000000000000 --- a/build-helpers/.gitignore +++ /dev/null @@ -1 +0,0 @@ -loopy-*-20[0-9][0-9]* diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec deleted file mode 100644 index 08c0b6efe0efd3ad419b6565fd396c2f805eeab7..0000000000000000000000000000000000000000 --- a/build-helpers/loopy.spec +++ /dev/null @@ -1,70 +0,0 @@ -# -*- mode: python -*- - -from os.path import basename, dirname, join -from glob import glob - -single_file = True - -# This makes the executable spew debug info. 
-debug = False - -from os.path import expanduser - -import packaging # pip install packaging to add - -a = Analysis(['../bin/loopy'], - pathex=[expanduser('~/src/loopy')], - hiddenimports=[ - "decorator", - "appdirs", - "packaging.markers", - "packaging.specifiers", - "packaging.version", - "packaging.requirements", - ], - hookspath=None, - runtime_hooks=None, - excludes=["hedge", "meshpy", "pyopencl", "PIL"] - ) - -import ply.lex -import ply.yacc - - -a.datas += [ - (join("py-src", "ply", "lex", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.lex.__file__), "*.py")) - ] + [ - (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.yacc.__file__), "*.py")) - ] - -pyz = PYZ(a.pure) - -if single_file: - exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) -else: - exe = EXE(pyz, - a.scripts, - exclude_binaries=True, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) - coll = COLLECT(exe, - a.binaries, - a.zipfiles, - a.datas, - strip=None, - upx=True, - name='loopy') diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh deleted file mode 100755 index 035634b16072e0188270abd8736dab99ce31dada..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ /dev/null @@ -1,35 +0,0 @@ -#! /bin/bash - -set -e -set -x - -VENV_VERSION="virtualenv-15.2.0" -rm -Rf "$VENV_VERSION" -curl -k https://files.pythonhosted.org/packages/b1/72/2d70c5a1de409ceb3a27ff2ec007ecdd5cc52239e7c74990e32af57affe9/$VENV_VERSION.tar.gz | tar xfz - - -$VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env - -source .env/bin/activate - -curl -k https://bootstrap.pypa.io/ez_setup.py | python - -curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python - - -pip install packaging - -PYTHON_VER=$(python -c 'import sys; print(".".join(str(s) for s in sys.version_info[:2]))') -pip install git+https://github.com/pyinstaller/pyinstaller.git@413c37bec126c0bd26084813593f65128966b4b7 - -git clone --recursive git://github.com/inducer/loopy -cd loopy - -grep -v pyopencl requirements.txt > myreq.txt - -# needed for pyinstaller package to be usable -echo packaging >> myreq.txt - -pip install -r myreq.txt -python setup.py install - -chown -R user /tmp/build - -su user -p -c "cd /tmp/build && source .env/bin/activate && cd loopy && ./build-helpers/run-pyinstaller.sh" diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh deleted file mode 100755 index a7f621b1ef21676898d2283d93f8a54f086e5d9d..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner.sh +++ /dev/null @@ -1,15 +0,0 @@ -#! /bin/bash - -set -e -set -x - -mkdir /tmp/build -cd /tmp/build - -useradd -d /home/user -m -s /bin/bash user - -yum install -y centos-release-scl -yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel - -scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh - diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh deleted file mode 100755 index fb0cfb587d654698800bfdc827259691bc056fb7..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker.sh +++ /dev/null @@ -1,28 +0,0 @@ -#! 
/bin/bash - -# should be run in this directory (build-helpers) - -if test "$1" = "--nodate"; then - TGT_NAME=loopy-centos6 -else - TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d") -fi - -echo "Generating $TGT_NAME..." - -set -e -set -x - -docker pull centos:6 - -CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh) -echo "working in container $CNT" - -docker start -i $CNT - -docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true - -mv loopy $TGT_NAME - -docker rm $CNT - diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh deleted file mode 100755 index 50f9d85dccc503be2a2ccfb6c0e3d6aa28216981..0000000000000000000000000000000000000000 --- a/build-helpers/run-pyinstaller.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /bin/bash - -# run this from the loopy root directory - -rm -Rf dist build - -pyinstaller \ - --workpath=build/pyinstaller \ - build-helpers/loopy.spec diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh deleted file mode 100755 index 57b8a873b9395954d76a8fd16f8ca9a261e8baa3..0000000000000000000000000000000000000000 --- a/build-helpers/upload.sh +++ /dev/null @@ -1,5 +0,0 @@ -#! /bin/bash - -set -e - -scp "$1" tiker.net:public_html/pub/loopy-binaries/ diff --git a/build-py-project-and-run-examples.sh b/build-py-project-and-run-examples.sh index e51a86d2085364ca142f5bfde3380a9fade0de01..a3ddf75875a657bdd7134d0580f6bdabfd2af25d 100644 --- a/build-py-project-and-run-examples.sh +++ b/build-py-project-and-run-examples.sh @@ -2,9 +2,6 @@ set -e -curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project.sh -source build-py-project.sh - function run_examples() { PATTERN=$1 @@ -25,13 +22,10 @@ function run_py_examples() } function run_ipynb_examples() { - run_examples "*.ipynb" "${PY_EXE} -m nbconvert --execute" + run_examples "*.ipynb" "${PY_EXE} -m nbconvert --to html --execute" } function run_floopy_examples() { run_examples "*.floopy" "${PY_EXE} -m loopy" } -run_py_examples -run_ipynb_examples -run_floopy_examples diff --git a/contrib/c-integer-semantics.py b/contrib/c-integer-semantics.py index 5e05ec6884c3c6b5b6c58d0080c6c0a52b91e2e4..23c7cb319177b762e83583e7bb5ea3eecd1d46da 100644 --- a/contrib/c-integer-semantics.py +++ b/contrib/c-integer-semantics.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 from os import system import ctypes diff --git a/doc/conf.py b/doc/conf.py index a2807b076f562abf8b9250f64e4ea7c16073a7b8..942afcd3ce11056c65c6a7500bb5ed312dc40187 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # loopy documentation build configuration file, created by # sphinx-quickstart on Tue Aug 9 13:40:49 2011. @@ -46,8 +45,8 @@ source_suffix = '.rst' master_doc = 'index' # General information about the project. -project = u'loopy' -copyright = u'2016, Andreas Klöckner' +project = 'loopy' +copyright = '2016, Andreas Klöckner' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -108,7 +107,7 @@ html_theme = "alabaster" html_theme_options = { "extra_nav_links": { "🚀 Github": "https://github.com/inducer/loopy", - "💾 Download Releases": "https://pypi.python.org/pypi/loo.py", + "💾 Download Releases": "https://pypi.org/project/loopy", } } @@ -148,7 +147,7 @@ html_sidebars = { # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -206,8 +205,8 @@ htmlhelp_basename = 'loopydoc' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'loopy.tex', u'loopy Documentation', - u'Andreas Kloeckner', 'manual'), + ('index', 'loopy.tex', 'loopy Documentation', + 'Andreas Kloeckner', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -239,8 +238,8 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'loopy', u'loopy Documentation', - [u'Andreas Kloeckner'], 1) + ('index', 'loopy', 'loopy Documentation', + ['Andreas Kloeckner'], 1) ] @@ -251,6 +250,8 @@ intersphinx_mapping = { 'https://documen.tician.de/pyopencl': None, 'https://documen.tician.de/cgen': None, 'https://docs.scipy.org/doc/numpy/': None, + 'https://documen.tician.de/pymbolic': None, + 'https://documen.tician.de/pytools': None, } autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 9a10116d916468fd46b9b23ad113f3d9085ae699..1c64134a34086b59f9b0dd1a7010e49f037b751f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -20,43 +20,18 @@ When you run this script, the following kernel is generated, compiled, and execu .. _static-binary: -Want to try out loopy? ----------------------- - -There's no need to go through :ref:`installation` if you'd just like to get a -feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. -This is purposefully built on an ancient Linux distribution, so it should work -on most versions of Linux that are currently out there. - -Once you have the binary, do the following:: - - chmod +x ./loopy-centos6 - ./loopy-centos6 --target=opencl hello-loopy.loopy - ./loopy-centos6 --target=cuda hello-loopy.loopy - ./loopy-centos6 --target=ispc hello-loopy.loopy - -Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`. - -You may also donwload the most recent version by going to the `list of builds -`_, clicking on the newest one -of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then -navigating to "build-helpers", and downloading the binary from there. - Places on the web related to Loopy ---------------------------------- -* `Python package index `_ (download releases) Note the extra '.' in the PyPI identifier! - -* `Github `_ (get latest source code, file bugs) -* `Wiki `_ (read installation tips, get examples, read FAQ) -* `Homepage `_ +* `Python package index `_ (download releases) +* `Github `_ (get latest source code, file bugs) +* `Homepage `_ Table of Contents ----------------- If you're only just learning about loopy, consider the following `paper -`_ on loo.py that may serve as a good +`_ on loopy that may serve as a good introduction. Please check :ref:`installation` to get started. @@ -71,6 +46,7 @@ Please check :ref:`installation` to get started. 
ref_call ref_other misc + ref_internals Indices and tables ================== diff --git a/doc/misc.rst b/doc/misc.rst index 62e5a1fa20f2709c4933e21f43175fc1f870c348..4c8c9867f3ceee2447f9249097c7c30f4d6f501d 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -17,9 +17,7 @@ Option 1: From Source, no PyOpenCL integration This command should install :mod:`loopy`:: - pip install loo.py - -(Note the extra "."!) + pip install loopy You may need to run this with :command:`sudo`. If you don't already have `pip `_, @@ -29,14 +27,13 @@ run this beforehand:: python get-pip.py For a more manual installation, `download the source -`_, unpack it, and say:: +`_, unpack it, and say:: python setup.py install You may also clone its git repository:: - git clone --recursive git://github.com/inducer/loopy - git clone --recursive http://git.tiker.net/trees/loopy.git + git clone --recursive https://github.com/inducer/loopy.git Option 2: From Conda Forge, with PyOpenCL integration ----------------------------------------------------- @@ -256,7 +253,7 @@ This list is always growing, but here are a few pointers: * Precompute subexpressions: Use a :ref:`substitution-rule` to assign a name to a subexpression, - using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`. + using may be :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`. Then use :func:`loopy.precompute` to create an (array or scalar) temporary with precomputed values. @@ -295,12 +292,12 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag. Note that the corresponding axis of an array must also be tagged using the ``"vec"`` array axis tag - (using :func:`tag_array_axes`) in order for vector code to be + (using :func:`loopy.tag_array_axes`) in order for vector code to be generated. Vectorized loops (and array axes) must have a fixed size. (See either - :func:`split_iname` or :func:`fix_parameters` along with - :func:`split_array_axis`.) + :func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with + :func:`loopy.split_array_axis`.) * Reuse of Temporary Storage @@ -309,7 +306,7 @@ This list is always growing, but here are a few pointers: * SoA $\leftrightarrow$ AoS - Use :func:`tag_array_axes` with the ``"sep"`` array axis tag + Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag to generate separate arrays for each entry of a short, fixed-length array axis. @@ -320,7 +317,7 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.tag_inames` with the ``"ilp"`` tag. ILP loops must have a fixed size. (See either - :func:`split_iname` or :func:`fix_parameters`.) + :func:`loopy.split_iname` or :func:`loopy.fix_parameters`.) * Type inference @@ -445,7 +442,7 @@ If you use loopy for your work and find its approach helpful, please consider citing the following article. A. Klöckner. `Loo.py: transformation-based code generation for GPUs and - CPUs `_. Proceedings of ARRAY '14: ACM + CPUs `_. Proceedings of ARRAY '14: ACM SIGPLAN Workshop on Libraries, Languages, and Compilers for Array Programming. Edinburgh, Scotland. @@ -478,3 +475,16 @@ Andreas Klöckner's work on :mod:`loopy` was supported in part by AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The views and opinions expressed herein do not necessarily reflect those of the funding agencies. + +Cross-References to Other Documentation +======================================= + +.. currentmodule:: numpy + +.. 
class:: int16 + + See :class:`numpy.generic`. + +.. class:: complex128 + + See :class:`numpy.generic`. diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst index 6b715033cce60fa3a369f2abc4edbecbf4c9a0d3..05e0edb88245086cabea806e5aa108fa6688a9a8 100644 --- a/doc/ref_creation.rst +++ b/doc/ref_creation.rst @@ -1,6 +1,4 @@ -.. module:: loopy -.. moduleauthor:: Andreas Kloeckner - +.. currentmodule:: loopy .. _creating-kernels: Reference: Creating Kernels diff --git a/doc/ref_internals.rst b/doc/ref_internals.rst new file mode 100644 index 0000000000000000000000000000000000000000..3dc0a2bd7306e4b7e68d44e5956fe69e32c9c97f --- /dev/null +++ b/doc/ref_internals.rst @@ -0,0 +1,55 @@ +Reference: Documentation for Internal API +========================================= + +Targets +------- + +See also :ref:`targets`. + +.. automodule:: loopy.target.c + +Symbolic +-------- + +See also :ref:`expression-syntax`. + +.. automodule:: loopy.symbolic + +Types +----- + +DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in +the codegen pipeline user-provided types are converted to +:class:`loopy.types.LoopyType`. + +.. automodule:: loopy.types + +Codegen +------- + +.. automodule:: loopy.codegen + +Reduction Operation +------------------- + +.. automodule:: loopy.library.reduction + +Iname Tags +---------- + +.. automodule:: loopy.kernel.data + +Array +----- + +.. automodule:: loopy.kernel.array + +Checks +------ + +.. automodule:: loopy.check + +Schedule +-------- + +.. automodule:: loopy.schedule diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 409cbef576d654be973dd6d1424ac40d3ea60982..d339e1b19caae740401c5b98ffbf8927d2477551 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -151,6 +151,42 @@ Tag Meaning .. }}} +Identifiers +----------- + +Reserved Identifiers +^^^^^^^^^^^^^^^^^^^^ + +The identifier prefix ``_lp_`` is reserved for internal usage; when creating +*inames*, *argument names*, *temporary variable names*, *substitution rule +names*, *instruction IDs*, and other identifiers, users should *not* use names +beginning with ``_lp_``. This prefix is used for identifiers created +internally when operating on Loopy's kernel IR. For Loopy developers, further +information on name prefixes used within submodules is below. + +Identifier Registry +^^^^^^^^^^^^^^^^^^^ + +Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for +all internally-created identifiers. Additionally, each name beginning with +``_lp_`` must start with one of the reserved prefixes below. New prefixes may +be registered by adding them to the table below. New prefixes may not themselves +be the prefix of an existing prefix. + +**Reserved Identifier Prefixes** + +======================= ================================== +Reserved Prefix Usage (module or purpose) +======================= ================================== +``_lp_linchk_`` ``loopy.linearization.checker`` +======================= ================================== + +.. note:: + + Existing Loopy code may not yet fully satisfy these naming requirements. + Name changes are in progress, and prefixes will be added to this registry + as they are created. + .. _instructions: Instructions @@ -358,6 +394,7 @@ TODO: Reductions Function Call Instructions ^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: loopy .. autoclass:: CallInstruction C Block Instructions @@ -374,6 +411,8 @@ Atomic Operations .. autoclass:: VarAtomicity +.. autoclass:: OrderedAtomic + .. autoclass:: AtomicInit .. 
autoclass:: AtomicUpdate @@ -431,7 +470,7 @@ Temporary Variables Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both have the lifetime of a kernel invocation. -.. autoclass:: temp_var_scope +.. autoclass:: AddressSpace .. autoclass:: TemporaryVariable :members: @@ -597,8 +636,8 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to :members: :undoc-members: -Implementation Detail: The Base Array -------------------------------------- +Implementation Details: The Base Array +-------------------------------------- All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and :class:`TemporaryVariable`) derive from single, shared base array type, @@ -608,4 +647,5 @@ described next. .. autoclass:: ArrayBase + .. vim: tw=75:spell:fdm=marker diff --git a/doc/ref_other.rst b/doc/ref_other.rst index 71d6c54b11dcd15977bdb375cea2207d881b5696..64367c752ba4bfa24cb5957a950f67db701966de 100644 --- a/doc/ref_other.rst +++ b/doc/ref_other.rst @@ -16,7 +16,7 @@ Controlling caching Running Kernels --------------- -In addition to simply calling kernels using :class:`LoopKernel.__call__`, +In addition to simply calling kernels using :meth:`LoopKernel.__call__`, the following underlying functionality may be used: .. autoclass:: CompiledKernel diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 565f81db46c4ffbee805bbb1f4e34419d2d6b049..1e1489d372e51dc7bd3bcc3ee43c5f7620df4ea9 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -179,11 +179,11 @@ for good measure. >>> assert (out.get() == (2*x_vec_dev).get()).all() We can have loopy print the OpenCL kernel it generated -by passing :attr:`loopy.Options.write_cl`. +by passing :attr:`loopy.Options.write_code`. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -227,7 +227,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: .. doctest:: - >>> knl = lp.set_options(knl, write_wrapper=True, write_cl=False) + >>> knl = lp.set_options(knl, write_wrapper=True, write_code=False) >>> evt, (out,) = knl(queue, a=x_vec_host) from __future__ import division ... @@ -246,18 +246,26 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: # }}} ... +You can also pass options to the OpenCL implementation +by passing :attr:`loopy.Options.build_options`. + +.. doctest:: + + >>> knl = lp.set_options(knl, build_options=["-cl-mad-enable"]) + + Generating code ~~~~~~~~~~~~~~~ Instead of using loopy to run the code it generates, you can also just use loopy as a code generator and take care of executing the generated kernels yourself. In this case, make sure loopy knows about all types, and then -call :func:`loopy.generate_code`: +call :func:`loopy.generate_code_v2`: .. doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) - >>> code, _ = lp.generate_code(typed_knl) + >>> code = lp.generate_code_v2(typed_knl).device_code() >>> print(code) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -355,7 +363,7 @@ loopy can also show an instruction dependency graph, using Dependencies are shown as arrows from prerequisite to dependent in the graph. This functionality requires the open-source `graphviz -`_ graph drawing tools to be installed. The generated +`_ graph drawing tools to be installed. The generated graph will open in a browser window. 
Since manually notating lots of dependencies is cumbersome, loopy has @@ -380,7 +388,7 @@ Let us take a look at the generated code for the above kernel: .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl = lp.prioritize_loops(knl, "i,j") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -430,7 +438,7 @@ Now the intended code is generated and our test passes. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -485,7 +493,7 @@ ambiguous. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -523,7 +531,7 @@ is overwritten with the new kernel:: knl = lp.do_something(knl, arguments...) We've already seen an example of a transformation above: -For instance, :func:`prioritize_loops` fit the pattern. +For instance, :func:`loopy.prioritize_loops` fit the pattern. :func:`loopy.split_iname` is another fundamental (and useful) transformation. It turns one existing iname (recall that this is loopy's word for a 'loop @@ -543,7 +551,7 @@ Consider this example: ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -574,7 +582,7 @@ relation to loop nesting. For example, it's perfectly possible to request ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.prioritize_loops(knl, "i_inner,i_outer") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -599,7 +607,7 @@ commonly called 'loop tiling': >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -641,7 +649,7 @@ loop's tag to ``"unr"``: >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -716,7 +724,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size ... "a[i] = 0", assumptions="n>=0") >>> knl = lp.split_iname(knl, "i", 128, ... outer_tag="g.0", inner_tag="l.0") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... 
@@ -762,7 +770,7 @@ assumption: >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -781,7 +789,7 @@ assumption: While these conditionals enable the generated code to deal with arbitrary *n*, they come at a performance cost. Loopy allows generating separate code for the last iteration of the *i_outer* loop, by using the *slabs* keyword -argument to :func:`split_iname`. Since this last iteration of *i_outer* is +argument to :func:`loopy.split_iname`. Since this last iteration of *i_outer* is the only iteration for which ``i_inner + 4*i_outer`` can become larger than *n*, only the (now separate) code for that iteration contains conditionals, enabling some cost savings: @@ -790,7 +798,7 @@ enabling some cost savings: >>> knl = orig_knl >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) @@ -886,7 +894,7 @@ memory, local to each work item. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -947,7 +955,7 @@ Consider the following example: ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) >>> knl = lp.set_temporary_scope(knl, "a_temp", "local") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -1012,7 +1020,7 @@ transformation exists in :func:`loopy.add_prefetch`: ... out[16*i_outer + i_inner] = sum(k, a[16*i_outer + i_inner]) ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl_pf = lp.add_prefetch(knl, "a") >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) @@ -1110,7 +1118,7 @@ work item: * *Local barriers* ensure consistency of memory accesses to items within *the same* work group. This synchronizes with all instructions in the work group. The type of memory (local or global) may be specified by the - :attr:`loopy.instruction.BarrierInstruction.mem_kind` + :attr:`loopy.BarrierInstruction.mem_kind` * *Global barriers* ensure consistency of memory accesses across *all* work groups, i.e. it synchronizes with every work item @@ -1360,7 +1368,7 @@ a loopy kernel by simply calling them, e.g.:: Additionally, all functions of one variable are currently recognized during code-generation however additional implementation may be required for custom functions. The full lists of available functions may be found in a the -:class:`TargetBase` implementation (e.g. :class:`CudaTarget`) +:class:`loopy.TargetBase` implementation (e.g. :class:`loopy.CudaTarget`) Custom user functions may be represented using the method described in :ref:`functions` @@ -1470,7 +1478,7 @@ When we ask to see the code, the issue becomes apparent: .. 
doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> from warnings import catch_warnings >>> with catch_warnings(): ... filterwarnings("always", category=lp.LoopyWarning) @@ -1568,13 +1576,13 @@ number of operations matching the characteristics of the :class:`loopy.Op` specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). :class:`loopy.Op` attributes include: -- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the +- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type operated on. - name: A :class:`str` that specifies the kind of arithmetic operation as *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. -One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1659,7 +1667,7 @@ Each line of output will look roughly like:: - mtype: A :class:`str` that specifies the memory type accessed as **global** or **local** -- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the +- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. - lid_strides: A :class:`dict` of **{** :class:`int` **:** @@ -1681,7 +1689,7 @@ Each line of output will look roughly like:: - variable: A :class:`str` that specifies the variable name of the data accessed. -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1850,7 +1858,7 @@ kernel from the previous example: Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } -We can evaluate this polynomial using :func:`islpy.eval_with_dict`: +We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1915,7 +1923,7 @@ Based on the kernel code printed above, we would expect each work-item to encounter 50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_map`. In this case, the number of barriers does not depend on any inames, so we can pass an empty dictionary to -:func:`islpy.eval_with_dict`. +:meth:`islpy.PwQPolynomial.eval_with_dict`. .. 
}}} diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py new file mode 100644 index 0000000000000000000000000000000000000000..111ac241198581a75ad42d91f9db8e4e89a3cbf2 --- /dev/null +++ b/examples/fortran/matmul-driver.py @@ -0,0 +1,35 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array +import pyopencl.clrandom +import loopy as lp + + +def main(): + fn = "matmul.floopy" + with open(fn) as inf: + source = inf.read() + + dgemm, = lp.parse_transformed_fortran(source, filename=fn) + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + n = 2048 + a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F") + cl.clrandom.fill_rand(a) + cl.clrandom.fill_rand(b) + + dgemm = lp.set_options(dgemm, write_code=True) + + dgemm(queue, a=a, b=b, alpha=1, c=c) + + c_ref = (a.get() @ b.get()) + assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10 + + +if __name__ == "__main__": + main() diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index a8377beddb912a2d6b1d9255694336313089a0f9..733cdaac4d9153803dcb54d5c114a33871403bbf 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,7 +22,11 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") ! 
RESULT = dgemm !$loopy end diff --git a/examples/python/call-external.py b/examples/python/call-external.py index c13d99bd06295096c26d6e113841c853f80645fc..104d12f38a96b6a70aa2313c6ab3a8884e67c696 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -68,8 +68,8 @@ class BLASCallable(lp.ScalarCallable): par_dtype).expr for par, par_dtype in zip( parameters, par_dtypes)] - c_parameters.insert(0, var('CblasRowMajor')) - c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(0, var("CblasRowMajor")) + c_parameters.insert(1, var("CblasNoTrans")) c_parameters.insert(2, mat_descr.shape[0]) c_parameters.insert(3, mat_descr.shape[1]) c_parameters.insert(4, 1) @@ -85,8 +85,8 @@ class BLASCallable(lp.ScalarCallable): def blas_fn_lookup(target, identifier): - if identifier == 'gemv': - return BLASCallable(name='gemv') + if identifier == "gemv": + return BLASCallable(name="gemv") return None # }}} @@ -99,9 +99,9 @@ knl = lp.make_kernel( """ y[:] = gemv(A[:, :], x[:]) """, [ - lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), - lp.GlobalArg('x', dtype=np.float64, shape=(n, )), - lp.GlobalArg('y', shape=(n, )), ...], + lp.GlobalArg("A", dtype=np.float64, shape=(n, n)), + lp.GlobalArg("x", dtype=np.float64, shape=(n, )), + lp.GlobalArg("y", shape=(n, )), ...], target=CTarget(), lang_version=(2018, 2)) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 764cea0e63036ff1a1338cce1210c9e198e954a7..ad0028d19a466474eed5e49cf9526424de4a60a7 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -2,7 +2,7 @@ import numpy as np import loopy as lp import pyopencl as cl import pyopencl.array -from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 # setup # ----- diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 90f31f0946d06edf5565e744b9080c59c66818ca..ce40487b1f41a6a591134a21eeb14113fd8be4fa 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -54,7 +54,7 @@ else: def main(): - with open("tasksys.cpp", "r") as ts_file: + with open("tasksys.cpp") as ts_file: tasksys_source = ts_file.read() def make_knl(name, insn, vars): diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index b8da89c6c75986e3baf5e35ee76b680d08c51632..aa2a650feb165684a9d65207772e093568b9f98e 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -33,8 +33,10 @@ evt, (c,) = knl(queue, a=a, b=b) split_knl = knl # PREFETCH1BEGIN -knl = lp.add_prefetch(knl, "a") -knl = lp.add_prefetch(knl, "b") +knl = lp.add_prefetch(knl, "a", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") +knl = lp.add_prefetch(knl, "b", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") # PREFETCH1END knl = lp.set_options(knl, write_code=True) @@ -43,8 +45,14 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) @@ -58,8 
+66,10 @@ knl = lp.split_iname(knl, "i", 256, knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") diff --git a/loopy/__init__.py b/loopy/__init__.py index 819eccbd3a0feb303a528098b70cb8d3d411f079..36eabd0a38ba60d995f55218ab67acbb4162609f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,9 +21,6 @@ THE SOFTWARE. """ -import six -from six.moves import range, zip - from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning @@ -36,7 +31,7 @@ from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel.instruction import ( MemoryOrdering, memory_ordering, MemoryScope, memory_scope, - VarAtomicity, AtomicInit, AtomicUpdate, + VarAtomicity, OrderedAtomic, AtomicInit, AtomicUpdate, InstructionBase, MultiAssignmentBase, Assignment, ExpressionInstruction, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) @@ -79,7 +74,7 @@ from loopy.transform.iname import ( affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, has_schedulable_iname_nesting, get_iname_duplication_options, - add_inames_to_insn) + add_inames_to_insn, add_inames_for_unused_hw_axes) from loopy.transform.instruction import ( find_instructions, map_instructions, @@ -171,7 +166,7 @@ __all__ = [ "MemoryScope", "memory_scope", # lower case is deprecated "VarAtomicity", - "AtomicInit", "AtomicUpdate", + "OrderedAtomic", "AtomicInit", "AtomicUpdate", "InstructionBase", "MultiAssignmentBase", "Assignment", "ExpressionInstruction", "CallInstruction", "CInstruction", "NoOpInstruction", @@ -204,7 +199,7 @@ __all__ = [ "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", "has_schedulable_iname_nesting", "get_iname_duplication_options", - "add_inames_to_insn", + "add_inames_to_insn", "add_inames_for_unused_hw_axes", "add_prefetch", "change_arg_to_image", "tag_array_axes", "tag_data_axes", @@ -334,7 +329,7 @@ def set_options(kernel, *args, **kwargs): from loopy.options import _apply_legacy_map, Options kwargs = _apply_legacy_map(Options._legacy_options_map, kwargs) - for key, val in six.iteritems(kwargs): + for key, val in kwargs.items(): if not hasattr(new_opt, key): raise ValueError("unknown option '%s'" % key) @@ -440,7 +435,7 @@ def set_caching_enabled(flag): CACHING_ENABLED = flag -class CacheMode(object): +class CacheMode: """A context manager for setting whether :mod:`loopy` is allowed to use disk caches. 
""" @@ -487,10 +482,10 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None): shape = ["n%d" % i for i in range(rank)] commad_indices = ", ".join(indices) bounds = " and ".join( - "0<=%s<%s" % (ind, shape_i) + f"0<={ind}<{shape_i}" for ind, shape_i in zip(indices, shape)) - set_str = "{[%s]: %s}" % ( + set_str = "{{[{}]: {}}}".format( commad_indices, bounds ) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index ebc07e1fce26f8d0f405ca5e699e32480e21fa4d..73b11b70bbfc8110f7bfed272c88d79d267a218a 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range, zip import os from warnings import warn @@ -452,7 +449,7 @@ def auto_test_vs_ref( ref_implemented_data_info = ref_codegen_result.implemented_data_info - logger.info("%s (ref): trying %s for the reference calculation" % ( + logger.info("{} (ref): trying {} for the reference calculation".format( ref_prog.name, dev)) if not quiet and print_ref_code: @@ -490,7 +487,7 @@ def auto_test_vs_ref( ref_queue.finish() - logger.info("%s (ref): using %s for the reference calculation" % ( + logger.info("{} (ref): using {} for the reference calculation".format( ref_prog.name, dev)) logger.info("%s (ref): run" % ref_prog.name) @@ -526,6 +523,16 @@ def auto_test_vs_ref( queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + from loopy.kernel import KernelState + from loopy.target.pyopencl import PyOpenCLTarget + if test_prog.state not in [ + KernelState.PREPROCESSED, + KernelState.LINEARIZED]: + if isinstance(test_prog.target, PyOpenCLTarget): + test_prog = test_prog.copy(target=PyOpenCLTarget(ctx.devices[0])) + + test_prog = lp.preprocess_kernel(test_prog) + from loopy.type_inference import infer_unknown_types test_prog = infer_unknown_types(test_prog, expect_completion=True) @@ -634,7 +641,7 @@ def auto_test_vs_ref( rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: def format_float_or_none(v): @@ -652,10 +659,28 @@ def auto_test_vs_ref( if do_check: ref_rates = "" for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) + if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + def format_float_or_none(v): + if v is None: + return "" + else: + return "%g" % v + + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) + + if do_check: + ref_rates = "" + for cnt, lbl in zip(op_count, op_label): + ref_rates += " {:g} {}/s".format(cnt/ref_elapsed_event, lbl) + if not quiet: + print("ref: elapsed: {:g} s event, {:g} s wall{}".format( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index b0cf68f08224c44f8e2186a80fd31e99b7f67ebf..44fbfe155fd778350cc6fee642b4fa856ebb6fc3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import, division, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" 
 __license__ = """
@@ -22,16 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
-from six.moves import range
 from islpy import dim_type
 import islpy as isl
 from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction
-from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel
+from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning,
+        warn_with_kernel, ExpressionToAffineConversionError)
 from loopy.type_inference import TypeInferenceMapper
 from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction,
         CInstruction, _DataObliviousInstruction)
+from warnings import warn
 
 from functools import reduce
 
@@ -39,6 +37,35 @@
 import logging
 logger = logging.getLogger(__name__)
 
 
+__doc__ = """
+.. currentmodule:: loopy.check
+
+.. autofunction:: check_for_integer_subscript_indices
+
+.. autofunction:: check_for_duplicate_insn_ids
+
+.. autofunction:: check_for_double_use_of_hw_axes
+
+.. autofunction:: check_insn_attributes
+
+.. autofunction:: check_loop_priority_inames_known
+
+.. autofunction:: check_multiple_tags_allowed
+
+.. autofunction:: check_for_inactive_iname_access
+
+.. autofunction:: check_for_unused_inames
+
+.. autofunction:: check_for_write_races
+
+.. autofunction:: check_for_data_dependent_parallel_bounds
+
+.. autofunction:: check_bounds
+
+.. autofunction:: check_variable_access_ordered
+"""
+
+
 # {{{ sanity checks run before preprocessing
 
 def check_identifiers_in_subst_rules(knl):
@@ -50,7 +77,7 @@ def check_identifiers_in_subst_rules(knl):
 
     allowed_identifiers = knl.all_variable_names()
 
-    for rule in six.itervalues(knl.substitutions):
+    for rule in knl.substitutions.values():
         deps = get_dependencies(rule.expression)
 
         rule_allowed_identifiers = allowed_identifiers | frozenset(rule.arguments)
@@ -84,11 +111,11 @@ class UnscopedCallCollector(CombineMapper):
     def map_call_with_kwargs(self, expr):
         if not isinstance(expr.function, ResolvedFunction):
             return (frozenset([expr.function.name]) |
-                    self.combine((self.rec(child) for child in expr.parameters
-                        + tuple(expr.kw_parameters.values()))))
+                    self.combine(self.rec(child) for child in expr.parameters
+                        + tuple(expr.kw_parameters.values())))
         else:
-            return self.combine((self.rec(child) for child in
-                expr.parameters+tuple(expr.kw_parameters.values())))
+            return self.combine(self.rec(child) for child in
+                expr.parameters+tuple(expr.kw_parameters.values()))
 
     def map_constant(self, expr):
         return frozenset()
@@ -136,7 +163,13 @@ VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"])
 class SubscriptIndicesIsIntChecker(TypeInferenceMapper):
     def map_subscript(self, expr):
         for idx in expr.index_tuple:
-            if not self.rec(idx)[0].is_integral():
+            type_inf_result = self.rec(idx)
+            if not type_inf_result:
+                raise LoopyError(
+                        "When checking that subscript indices are integral: "
+                        "Type inference did not find type of '%s'"
+                        % idx)
+            if not type_inf_result[0].is_integral():
                 raise LoopyError("Non-integral array indices obtained in"
                         " {}.".format(expr))
 
@@ -144,6 +177,12 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper):
 
 
 def check_for_integer_subscript_indices(kernel, callables_table):
+    """
+    Checks if every array access is of type :class:`int`.
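+
+    For example, a (hypothetical) access like ``a[0.5*i]`` would be rejected
+    here, since the subscript ``0.5*i`` is inferred to have a floating-point
+    type.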
+    """
     from pymbolic.primitives import Subscript
     idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table)
     for insn in kernel.instructions:
@@ -160,7 +196,10 @@


 def check_insn_attributes(kernel):
-    all_insn_ids = set(insn.id for insn in kernel.instructions)
+    """
+    Check for legality of attributes of every instruction in *kernel*.
+    """
+    all_insn_ids = {insn.id for insn in kernel.instructions}

     for insn in kernel.instructions:
         if not insn.within_inames <= kernel.all_inames():
@@ -175,14 +214,14 @@ def check_insn_attributes(kernel):
                     % (insn.id, ", ".join(
                         insn.depends_on - all_insn_ids)))

-        no_sync_with_insn_ids = set(id for id, scope in insn.no_sync_with)
+        no_sync_with_insn_ids = {id for id, scope in insn.no_sync_with}
         if not no_sync_with_insn_ids <= all_insn_ids:
             raise LoopyError("insn '%s' has nosync directive with unknown "
                     "instruction ids: %s"
                     % (insn.id,
                         ", ".join(no_sync_with_insn_ids - all_insn_ids)))

-        no_sync_with_scopes = set(scope for id, scope in insn.no_sync_with)
+        no_sync_with_scopes = {scope for id, scope in insn.no_sync_with}
         if not no_sync_with_scopes <= VALID_NOSYNC_SCOPES:
             raise LoopyError("insn '%s' has invalid nosync scopes: %s"
                     % (insn.id,
@@ -190,6 +229,10 @@ def check_insn_attributes(kernel):


 def check_for_duplicate_insn_ids(knl):
+    """
+    Check if multiple instructions of *knl* have the same
+    :attr:`loopy.InstructionBase.id`.
+    """
     insn_ids = set()

     for insn in knl.instructions:
@@ -201,6 +244,10 @@ def check_for_duplicate_insn_ids(knl):


 def check_loop_priority_inames_known(kernel):
+    """
+    Checks if the inames in :attr:`loopy.LoopKernel.loop_priority` are part of
+    the *kernel*'s domain.
+    """
     for prio in kernel.loop_priority:
         for iname in prio:
             if iname not in kernel.all_inames():
@@ -215,26 +262,33 @@ def _get_all_unique_iname_tags(kernel):
     from itertools import chain
     iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, [])
                               for iname in kernel.all_inames())))
-    return set(
+    return {
             tag for tag in iname_tags if
-            isinstance(tag, UniqueTag))
+            isinstance(tag, UniqueTag)}


 def check_multiple_tags_allowed(kernel):
+    """
+    Checks that multiple tags of an iname are compatible.
+    """
     from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag,
                                 UnrollTag, ForceSequentialTag, IlpBaseTag,
                                 filter_iname_tags_by_type)
     illegal_combinations = [
         (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag),
         (IlpBaseTag, ForceSequentialTag)
     ]
-    for iname, tags in six.iteritems(kernel.iname_to_tags):
+    for iname, tags in kernel.iname_to_tags.items():
         for comb in illegal_combinations:
             if len(filter_iname_tags_by_type(tags, comb)) > 1:
-                raise LoopyError("iname {0} has illegal combination of "
-                        "tags: {1}".format(iname, tags))
+                raise LoopyError("iname {} has illegal combination of "
+                        "tags: {}".format(iname, tags))


 def check_for_double_use_of_hw_axes(kernel, callables_table):
+    """
+    Check if any instruction of *kernel* is within multiple inames tagged with
+    the same hw axis tag.
+    """
     from loopy.kernel.data import UniqueTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.kernel.function_interface import CallableKernel
@@ -267,6 +321,9 @@


 def check_for_inactive_iname_access(kernel):
+    """
+    Check if any instruction accesses an iname but is not within it.
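+    (For example, an instruction that reads iname ``j`` while it is only
+    nested within the ``i`` loop.)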
+ """ for insn in kernel.instructions: expression_inames = insn.read_dependency_names() & kernel.all_inames() @@ -280,6 +337,22 @@ def check_for_inactive_iname_access(kernel): kernel.insn_inames(insn)), kernel.name)) +def check_for_unused_inames(kernel): + """ + Check if there are any unused inames in the kernel. + """ + # Warn if kernel has unused inames + from loopy.transform.iname import get_used_inames + unused_inames = kernel.all_inames() - get_used_inames(kernel) + if unused_inames: + warn_with_kernel( + kernel, "unused_inames", + "Found unused inames in kernel: %s " + "Unused inames during linearization will be prohibited in " + "Loopy version 2021.X." + % unused_inames) + + def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) @@ -307,6 +380,9 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): + """ + Check if any memory accesses lead to write races. + """ from loopy.kernel.data import ConcurrentTag for insn in kernel.instructions: @@ -324,16 +400,16 @@ def check_for_write_races(kernel): # Any concurrent tags that are not depended upon by the assignee # will cause write races. - raceable_parallel_insn_inames = set( + raceable_parallel_insn_inames = { iname for iname in kernel.insn_inames(insn) - if kernel.iname_tags_of_type(iname, ConcurrentTag)) + if kernel.iname_tags_of_type(iname, ConcurrentTag)} elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] - raceable_parallel_insn_inames = set( + raceable_parallel_insn_inames = { iname for iname in kernel.insn_inames(insn) if any(_is_racing_iname_tag(temp_var, tag) - for tag in kernel.iname_tags(iname))) + for tag in kernel.iname_tags(iname))} else: raise LoopyError("invalid assignee name in instruction '%s'" @@ -355,7 +431,7 @@ def check_for_orphaned_user_hardware_axes(kernel): from loopy.kernel.data import LocalIndexTag for axis in kernel.local_sizes: found = False - for tags in six.itervalues(kernel.iname_to_tags): + for tags in kernel.iname_to_tags.values(): for tag in tags: if isinstance(tag, LocalIndexTag) and tag.axis == axis: found = True @@ -369,13 +445,17 @@ def check_for_orphaned_user_hardware_axes(kernel): def check_for_data_dependent_parallel_bounds(kernel): + """ + Check that inames tagged as hw axes have bounds that are known at kernel + launch. 
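+
+    (For example, a concurrently-tagged iname whose bound is a temporary
+    computed by the kernel itself cannot be mapped to a hardware grid axis.)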
+    """
     from loopy.kernel.data import ConcurrentTag

     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))

-        par_inames = set(
+        par_inames = {
                 iname for iname in dom_inames
-                if kernel.iname_tags_of_type(iname, ConcurrentTag))
+                if kernel.iname_tags_of_type(iname, ConcurrentTag)}

         if not par_inames:
             continue
@@ -392,13 +472,12 @@

 # {{{ check access bounds

 class _AccessCheckMapper(WalkMapper):
-    def __init__(self, kernel, domain, insn_id):
+    def __init__(self, kernel, insn_id):
         self.kernel = kernel
-        self.domain = domain
         self.insn_id = insn_id

-    def map_subscript(self, expr):
-        WalkMapper.map_subscript(self, expr)
+    def map_subscript(self, expr, domain):
+        WalkMapper.map_subscript(self, expr, domain)

         from pymbolic.primitives import Variable
         assert isinstance(expr.aggregate, Variable)
@@ -421,7 +500,7 @@ class _AccessCheckMapper(WalkMapper):
             from loopy.symbolic import (get_dependencies, get_access_range,
                     UnableToDetermineAccessRange)

-            available_vars = set(self.domain.get_var_dict())
+            available_vars = set(domain.get_var_dict())
             shape_deps = set()
             for shape_axis in shape:
                 if shape_axis is not None:
@@ -438,8 +517,7 @@ class _AccessCheckMapper(WalkMapper):
                             len(subscript), len(shape)))

             try:
-                access_range = get_access_range(self.domain, subscript,
-                        self.kernel.assumptions)
+                access_range = get_access_range(domain, subscript)
             except UnableToDetermineAccessRange:
                 # Likely: index was non-affine, nothing we can do.
                 return
@@ -462,8 +540,29 @@ class _AccessCheckMapper(WalkMapper):
                             " establish '%s' is a subset of '%s')."
                             % (expr, self.insn_id, access_range, shape_domain))

+    def map_if(self, expr, domain):
+        from loopy.symbolic import get_dependencies
+        if get_dependencies(expr.condition) <= frozenset(
+                domain.space.get_var_dict()):
+            try:
+                from loopy.symbolic import isl_set_from_expr
+                then_set = isl_set_from_expr(domain.space, expr.condition)
+                else_set = then_set.complement()
+            except ExpressionToAffineConversionError:
+                # non-affine condition: can't do much
+                then_set = else_set = isl.BasicSet.universe(domain.space)
+        else:
+            # data-dependent condition: can't do much
+            then_set = else_set = isl.BasicSet.universe(domain.space)
+
+        self.rec(expr.then, domain & then_set)
+        self.rec(expr.else_, domain & else_set)
+

 def check_bounds(kernel):
+    """
+    Performs an out-of-bounds check for every array access.
+    """
     temp_var_names = set(kernel.temporary_variables)
     for insn in kernel.instructions:
         domain = kernel.get_inames_domain(kernel.insn_inames(insn))
@@ -472,10 +571,12 @@ def check_bounds(kernel):
         if set(domain.get_var_names(dim_type.param)) & temp_var_names:
             continue

-        acm = _AccessCheckMapper(kernel, domain, insn.id)
+        acm = _AccessCheckMapper(kernel, insn.id)
+        domain, assumptions = isl.align_two(domain, kernel.assumptions)
+        domain_with_assumptions = domain & assumptions

         def run_acm(expr):
-            acm(expr)
+            acm(expr, domain_with_assumptions)
             return expr

         insn.with_transformed_expressions(run_acm)
@@ -519,7 +620,7 @@ def check_has_schedulable_iname_nesting(kernel):
     if not has_schedulable_iname_nesting_for_single_kernel(kernel):
         import itertools as it
         opt = get_iname_duplication_options_for_single_kernel(kernel)
-        opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w)
+        opt_str = "\n".join(f"* Duplicate {i} within instructions {w}"
                 for i, w in it.islice(opt, 3))
         raise LoopyError("Kernel does not have a schedulable iname nesting. 
" "In order for there to exist a feasible loop nesting, you " @@ -533,45 +634,9 @@ def check_has_schedulable_iname_nesting(kernel): # {{{ check_variable_access_ordered -class IndirectDependencyEdgeFinder(object): - def __init__(self, kernel): - self.kernel = kernel - self.dep_edge_cache = {} - - def __call__(self, depender_id, dependee_id): - cache_key = (depender_id, dependee_id) - - try: - result = self.dep_edge_cache[cache_key] - except KeyError: - pass - else: - if result is None: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("when " - "checking for dependency edge between " - "depender '%s' and dependee '%s'" - % (depender_id, dependee_id)) - else: - return result - - depender = self.kernel.id_to_insn[depender_id] - - if dependee_id in depender.depends_on: - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = None - for dep in depender.depends_on: - if self(dep, dependee_id): - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = False - return False - - def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): + dep_a = kernel.id_to_insn[dep_a] + dep_b = kernel.id_to_insn[dep_b] from loopy.kernel.data import AddressSpace if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] @@ -594,127 +659,215 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync +def _get_address_space(kernel, var): + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg + if var in kernel.temporary_variables: + address_space = kernel.temporary_variables[var].address_space + else: + arg = kernel.arg_dict[var] + if isinstance(arg, ArrayArg): + address_space = arg.address_space + elif isinstance(arg, ValueArg): + address_space = AddressSpace.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine address_space of '%s'" % var) + return address_space + + +def _get_topological_order(kernel): + """ + Returns a :class:`list` of insn ids of *kernel* in a topological sort + order. + + If there is a dependency cycle within the instructions of *kernel* raises a + :class:`loopy.diagnostic.DependencyCycleFound` exception. + """ + from pytools.graph import compute_sccs + from loopy.diagnostic import DependencyCycleFound + + dep_map = {insn.id: insn.depends_on for insn in kernel.instructions} + + # pytools.graph.compute_sccs serves 2 purposes: + # 1. computes topological sort order of instructions. + # 2. provides info. about any cycles in the graph. + sccs = compute_sccs(dep_map) + order = [] + + for scc in sccs: + if len(scc) != 1: + raise DependencyCycleFound(", ".join(scc)) + order.append(scc[0]) + + return order + + def _check_variable_access_ordered_inner(kernel): - logger.debug("%s: check_variable_access_ordered: start" % kernel.name) + from loopy.kernel.tools import find_aliasing_equivalence_classes + from loopy.symbolic import AccessRangeOverlapChecker + overlap_checker = AccessRangeOverlapChecker(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) - checked_variables = kernel.get_written_variables() & ( - set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + # dep_reqs_to_vars: A mapping (writer_id, dep_req_id) -> set of variable names, + # where the tuple denotes a pair of instructions IDs, and the variable + # names are the ones that necessitate a dependency. 
+ # + # Note: This can be worst-case O(n^2) in the number of instructions. + dep_reqs_to_vars = {} wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg - from loopy.kernel.tools import find_aliasing_equivalence_classes - - depfind = IndirectDependencyEdgeFinder(kernel) - aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + # {{{ populate 'dep_reqs_to_vars' - for name in checked_variables: - # This is a tad redundant in that this could probably be restructured - # to iterate only over equivalence classes and not individual variables. - # But then the access-range overlap check below would have to be smarter. - eq_class = aliasing_equiv_classes[name] + for var in kernel.get_written_variables(): + address_space = _get_address_space(kernel, var) + eq_class = aliasing_equiv_classes[var] readers = set.union( *[rmap.get(eq_name, set()) for eq_name in eq_class]) writers = set.union( *[wmap.get(eq_name, set()) for eq_name in eq_class]) - unaliased_readers = rmap.get(name, set()) - unaliased_writers = wmap.get(name, set()) - - if not writers: - continue - if name in kernel.temporary_variables: - address_space = kernel.temporary_variables[name].address_space - else: - arg = kernel.arg_dict[name] - if isinstance(arg, ArrayArg): - address_space = arg.address_space - elif isinstance(arg, ValueArg): - address_space = AddressSpace.PRIVATE - else: - # No need to consider ConstantArg and ImageArg (for now) - # because those won't be written. - raise ValueError("could not determine address_space of '%s'" % name) - - # Check even for PRIVATE address space, to ensure intentional program order. - - from loopy.symbolic import AccessRangeOverlapChecker - overlap_checker = AccessRangeOverlapChecker(kernel) + for writer in writers: + required_deps = (readers | writers) - {writer} + required_deps = {req_dep + for req_dep in required_deps + if not declares_nosync_with(kernel, address_space, writer, + req_dep)} - for writer_id in writers: - for other_id in readers | writers: - if writer_id == other_id: - continue - - writer = kernel.id_to_insn[writer_id] - other = kernel.id_to_insn[other_id] + for req_dep in required_deps: + dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var) - has_dependency_relationship = ( - declares_nosync_with(kernel, address_space, other, writer) - or - depfind(writer_id, other_id) - or - depfind(other_id, writer_id) - ) + # }}} - if has_dependency_relationship: - continue + # depends_on: mapping from insn_ids to their dependencies + depends_on = {insn.id: set() for insn in + kernel.instructions} + # rev_depends: mapping from insn_ids to their reverse deps. 
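+    # (i.e. "b" is in rev_depends["a"] iff instruction "b" directly
+    # depends on instruction "a")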
+    rev_depends = {insn.id: set() for insn in
+            kernel.instructions}

-            is_relationship_by_aliasing = not (
-                    writer_id in unaliased_writers
-                    and (other_id in unaliased_writers
-                        or other_id in unaliased_readers))
+    # {{{ populate rev_depends, depends_on

-            # Do not enforce ordering for disjoint access ranges
-            if (not is_relationship_by_aliasing and not
-                    overlap_checker.do_access_ranges_overlap_conservative(
-                        writer_id, "w", other_id, "any", name)):
-                continue
+    for insn in kernel.instructions:
+        depends_on[insn.id].update(insn.depends_on)
+        for dep in insn.depends_on:
+            rev_depends[dep].add(insn.id)
+
+    # }}}
+
+    # {{{ remove pairs from dep_reqs_to_vars for which dependencies exist
+
+    topological_order = _get_topological_order(kernel)
+
+    def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
+        """
+        Remove the dependency requirements of each insn_id on all of its
+        direct/indirect predecessors in the directed graph with insn_ids as
+        nodes and *edges* as the connectivity.
+
+        :arg order: An instance of :class:`list` of instruction ids in which
+            the *edges* graph is to be traversed.
+        """
+        # predecessors: mapping from insn_id to its direct/indirect
+        # predecessors
+        predecessors = {}
+
+        for insn_id in order:
+            # insn_predecessors: insn_id's direct+indirect predecessors
+
+            # This set of predecessors is complete because we're
+            # traversing in topological order: No predecessor
+            # can occur after the instruction itself.
+            insn_predecessors = predecessors.pop(insn_id, set())
+
+            for pred in insn_predecessors:
+                dep_reqs_to_vars.pop(
+                        (insn_id, pred),
+                        # don't fail if pair doesn't exist
+                        None)
+
+            for successor in edges[insn_id]:
+                predecessors.setdefault(successor, set()).update(
+                        insn_predecessors | {insn_id})
+
+    # forward dep. graph traversal in reverse topological sort order
+    # (proceeds "end of program" -> "beginning of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on,
+            topological_order[::-1])
+
+    # reverse dep. graph traversal in topological sort order
+    # (proceeds "beginning of program" -> "end of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order)
+
+    # }}}
+
+    # {{{ handle dependency requirements that weren't satisfied
+
+    for (writer_id, other_id), variables in dep_reqs_to_vars.items():
+        writer = kernel.id_to_insn[writer_id]
+        other = kernel.id_to_insn[other_id]
+
+        for var in variables:
+            eq_class = aliasing_equiv_classes[var]
+            unaliased_readers = rmap.get(var, set())
+            unaliased_writers = wmap.get(var, set())
+
+            is_relationship_by_aliasing = not (
+                    writer_id in unaliased_writers
+                    and (other_id in unaliased_writers
+                        or other_id in unaliased_readers))
+
+            # Do not enforce ordering for disjoint access ranges
+            if (not is_relationship_by_aliasing and not
+                    overlap_checker.do_access_ranges_overlap_conservative(
+                        writer_id, "w", other_id, "any", var)):
+                continue

-            # Do not enforce ordering for aliasing-based relationships
-            # in different groups.
-            if (is_relationship_by_aliasing and (
-                    bool(writer.groups & other.conflicts_with_groups)
-                    or
-                    bool(other.groups & writer.conflicts_with_groups))):
-                continue
+            # Do not enforce ordering for aliasing-based relationships
+            # in different groups.
+            if (is_relationship_by_aliasing and (
+                    bool(writer.groups & other.conflicts_with_groups)
+                    or
+                    bool(other.groups & writer.conflicts_with_groups))):
+                continue

-            msg = ("No dependency relationship found between "
-                    "'{writer_id}' which writes {var} and "
-                    "'{other_id}' which also accesses {var}. 
" - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % name - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) - - logger.debug("%s: check_variable_access_ordered: done" % kernel.name) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. " + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + + # }}} def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed - through a bi-directional :attr:`loopy.Instruction.no_sync_with`) + through a bi-directional :attr:`loopy.InstructionBase.no_sync_with`) """ if kernel.options.enforce_variable_access_ordered not in [ @@ -728,30 +881,17 @@ def check_variable_access_ordered(kernel): if kernel.options.enforce_variable_access_ordered == "no_check": return - if kernel.options.enforce_variable_access_ordered: - try: - _check_variable_access_ordered_inner(kernel) - except RuntimeError as e: - if isinstance(e.args[0], str) and ( - e.args[0].startswith('maximum recursion depth exceeded')): - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) - else: - raise e - else: - from loopy.diagnostic import VariableAccessNotOrdered - try: + from pytools import ProcessLogger + with ProcessLogger(logger, "%s: check variable access ordered" % kernel.name): + if kernel.options.enforce_variable_access_ordered: _check_variable_access_ordered_inner(kernel) - except VariableAccessNotOrdered as e: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "variable_access_ordered", str(e)) - except RuntimeError as e: - if isinstance(e.args[0], str) and ( - e.args[0].startswith('maximum recursion depth exceeded')): + else: + from loopy.diagnostic import VariableAccessNotOrdered + try: + _check_variable_access_ordered_inner(kernel) + except VariableAccessNotOrdered as e: from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) - else: - raise e + warn_with_kernel(kernel, "variable_access_ordered", str(e)) # }}} @@ -770,6 +910,7 @@ def pre_schedule_checks(kernel, callables_table): check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) 
check_for_inactive_iname_access(kernel)
+        check_for_unused_inames(kernel)
         check_for_write_races(kernel)
         check_for_data_dependent_parallel_bounds(kernel)
         check_bounds(kernel)
@@ -793,12 +934,75 @@

 # {{{ check for unused hw axes

+# {{{ find boostable insn ids
+
+def _find_boostable_insn_ids(kernel):
+    """There used to exist a broken heuristic called "boostability" that allowed
+    instructions to be pushed into hardware-parallel loops. This function
+    survives from that era, for now, to provide a thin veneer of compatibility.
+    """
+    logger.debug("%s: idempotence" % kernel.name)
+
+    writer_map = kernel.writer_map()
+
+    arg_names = {arg.name for arg in kernel.args}
+
+    var_names = arg_names | set(kernel.temporary_variables.keys())
+
+    reads_map = {
+            insn.id: insn.read_dependency_names() & var_names
+            for insn in kernel.instructions}
+
+    from collections import defaultdict
+    dep_graph = defaultdict(set)
+
+    for insn in kernel.instructions:
+        dep_graph[insn.id] = {writer_id
+                for var in reads_map[insn.id]
+                for writer_id in writer_map.get(var, set())}
+
+    # Find SCCs of dep_graph. These are used for checking if the instruction is
+    # in a dependency cycle.
+    from pytools.graph import compute_sccs
+
+    sccs = {item: scc
+            for scc in compute_sccs(dep_graph)
+            for item in scc}
+
+    non_idempotently_updated_vars = set()
+    boostable_insn_ids = set()
+
+    for insn in kernel.instructions:
+        boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id]
+
+        if boostable:
+            boostable_insn_ids.add(insn.id)
+        else:
+            non_idempotently_updated_vars.update(
+                    insn.assignee_var_names())
+
+    # {{{ remove boostability from insns that access non-idempotently updated vars
+
+    for insn_id in boostable_insn_ids.copy():
+        insn = kernel.id_to_insn[insn_id]
+        if bool(non_idempotently_updated_vars & insn.dependency_names()):
+            boostable_insn_ids.remove(insn_id)
+
+    # }}}
+
+    return boostable_insn_ids
+
+# }}}
+
+
 def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
         sched_index=None):
     from loopy.schedule import (CallKernel, RunInstruction,
             Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
             get_insn_ids_for_block_at, gather_schedule_block)

+    boostable_insn_ids = _find_boostable_insn_ids(kernel)
+
     if sched_index is None:
         group_axes = set()
         local_axes = set()
@@ -812,8 +1016,8 @@
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
                 callables_table)

-        group_axes = set(ax for ax, length in enumerate(group_size))
-        local_axes = set(ax for ax, length in enumerate(local_size))
+        group_axes = {ax for ax, length in enumerate(group_size)}
+        local_axes = {ax for ax, length in enumerate(local_size)}

         i = sched_index + 1
         assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
@@ -834,9 +1038,6 @@
             insn = kernel.id_to_insn[sched_item.insn_id]
             i += 1

-            if insn.boostable:
-                continue
-
             group_axes_used = set()
             local_axes_used = set()

@@ -856,17 +1057,44 @@
                     raise LoopyError("auto local tag encountered")

             if group_axes != group_axes_used:
-                raise LoopyError("instruction '%s' does not use all group hw axes "
-                        "(available: %s used:%s)"
-                        % (insn.id,
-                            ",".join(str(i) for i in group_axes),
-                            ",".join(str(i) for i in group_axes_used)))
+                if insn.id in boostable_insn_ids:
+                    warn("instruction '%s' does not use all group hw axes"
+                            " (available: %s used:%s). 
Loopy will generate code" + " with the instruction executed along all the" + " missing hw axes. This will result in an" + " error from 2021.x onwards, calling" + " loopy.add_inames_for_unused_hw_axes(...)" + " might help in the transition." + % (insn.id, + ",".join(str(i) for i in group_axes), + ",".join(str(i) for i in group_axes_used)), + DeprecationWarning, stacklevel=2) + else: + raise LoopyError("instruction '%s' does not use all group" + " hw axes (available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in group_axes), + ",".join(str(i) for i in group_axes_used))) + if local_axes != local_axes_used: - raise LoopyError("instruction '%s' does not use all local hw axes " - "(available: %s used:%s)" - % (insn.id, - ",".join(str(i) for i in local_axes), - ",".join(str(i) for i in local_axes_used))) + if insn.id in boostable_insn_ids: + warn("instruction '%s' does not use all local hw axes" + " (available: %s used:%s). Loopy will generate code" + " with the instruction executed along all the" + " missing hw axes. This will result in an" + " error from 2021.x onwards, calling" + " loopy.add_inames_for_unused_hw_axes(...)" + " might help in the transition." + % (insn.id, + ",".join(str(i) for i in local_axes), + ",".join(str(i) for i in local_axes_used)), + DeprecationWarning, stacklevel=2) + else: + raise LoopyError("instruction '%s' does not use all local" + " hw axes (available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in local_axes), + ",".join(str(i) for i in local_axes_used))) elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)): i += 1 @@ -893,18 +1121,18 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): from loopy.kernel.data import ArrayBase, Assignment from loopy.types import AtomicType atomicity_candidates = ( - set(v.name for v in six.itervalues(kernel.temporary_variables) - if isinstance(v.dtype, AtomicType)) + {v.name for v in kernel.temporary_variables.values() + if isinstance(v.dtype, AtomicType)} | - set(v.name for v in kernel.args + {v.name for v in kernel.args if isinstance(v, ArrayBase) - and isinstance(v.dtype, AtomicType))) + and isinstance(v.dtype, AtomicType)}) for insn in kernel.instructions: if not isinstance(insn, Assignment): continue - atomic_accesses = set(a.var_name for a in insn.atomicity) + atomic_accesses = {a.var_name for a in insn.atomicity} if not atomic_accesses <= atomicity_candidates: raise LoopyError("atomic access in instruction '%s' to " "non-atomic variable(s) '%s'" @@ -970,12 +1198,12 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): def check_that_all_insns_are_scheduled(kernel): - all_schedulable_insns = set(insn.id for insn in kernel.instructions) + all_schedulable_insns = {insn.id for insn in kernel.instructions} from loopy.schedule import sched_item_to_insn_id - scheduled_insns = set( + scheduled_insns = { insn_id for sched_item in kernel.schedule - for insn_id in sched_item_to_insn_id(sched_item)) + for insn_id in sched_item_to_insn_id(sched_item)} assert scheduled_insns <= all_schedulable_insns @@ -983,7 +1211,7 @@ def check_that_all_insns_are_scheduled(kernel): from loopy.diagnostic import UnscheduledInstructionError raise UnscheduledInstructionError( "unscheduled instructions: '%s'" - % ', '.join(all_schedulable_insns - scheduled_insns)) + % ", ".join(all_schedulable_insns - scheduled_insns)) # }}} @@ -996,11 +1224,11 @@ def check_that_shapes_and_strides_are_arguments(kernel): from loopy.symbolic import get_dependencies import loopy as lp - integer_arg_names = set( + 
integer_arg_names = { arg.name for arg in kernel.args if isinstance(arg, ValueArg) - and arg.dtype.is_integral()) + and arg.dtype.is_integral()} for arg in kernel.args: if isinstance(arg, ArrayBase): @@ -1069,7 +1297,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None): last_idomains = None last_insn_inames = None - for insn_id, idomains in six.iteritems(implemented_domains): + for insn_id, idomains in implemented_domains.items(): insn = kernel.id_to_insn[insn_id] assert idomains @@ -1127,9 +1355,9 @@ def check_implemented_domains(kernel, implemented_domains, code=None): i_minus_d = insn_impl_domain - desired_domain d_minus_i = desired_domain - insn_impl_domain - parameter_inames = set( + parameter_inames = { insn_domain.get_dim_name(dim_type.param, i) - for i in range(insn_impl_domain.dim(dim_type.param))) + for i in range(insn_impl_domain.dim(dim_type.param))} lines = [] for bigger, smaller, diff_set, gist_domain in [ @@ -1157,10 +1385,10 @@ def check_implemented_domains(kernel, implemented_domains, code=None): iname, pt.get_coordinate_val(tp, dim).to_python())) lines.append( - "sample point in %s but not %s: %s" % ( + "sample point in {} but not {}: {}".format( bigger, smaller, ", ".join(point_axes))) lines.append( - "gist of constraints in %s but not %s: %s" % ( + "gist of constraints in {} but not {}: {}".format( smaller, bigger, gist_domain)) if code is not None: diff --git a/loopy/cli.py b/loopy/cli.py index 3dbdeb41e37aebc0e3c2b0b8b3fc68866dfec080..a7d209ae87b2120f90a8d360c3ff9eb13bc925f5 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import sys import loopy as lp @@ -39,16 +37,16 @@ def defines_to_python_code(defines_str): import re define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$") result = [] - for l in defines_str.split("\n"): - if not l.strip(): + for line in defines_str.split("\n"): + if not line.strip(): continue - match = define_re.match(l) + match = define_re.match(line) if match is None: - raise RuntimeError("#define not understood: '%s'" % l) + raise RuntimeError("#define not understood: '%s'" % line) result.append( - "%s = %s" % (match.group(1), to_python_literal(match.group(2)))) + "{} = {}".format(match.group(1), to_python_literal(match.group(2)))) return "\n".join(result) @@ -60,7 +58,7 @@ def main(): parser.add_argument("infile", metavar="INPUT_FILE") parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE", - help="Defaults to stdout ('-').", nargs='?') + help="Defaults to stdout ('-').", nargs="?") parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran") parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), @@ -112,7 +110,7 @@ def main(): ".f77": "fortran", ".F77": "fortran", }.get(ext) - with open(args.infile, "r") as infile_fd: + with open(args.infile) as infile_fd: infile_content = infile_fd.read() if args.lang is not None: @@ -143,15 +141,15 @@ def main(): data_dic["np"] = np if args.occa_defines: - with open(args.occa_defines, "r") as defines_fd: + with open(args.occa_defines) as defines_fd: occa_define_code = defines_to_python_code(defines_fd.read()) exec(compile(occa_define_code, args.occa_defines, "exec"), data_dic) - with open(args.infile, "r") as infile_fd: + with open(args.infile) as infile_fd: exec(compile(infile_content, args.infile, "exec"), data_dic) if args.transform: - with open(args.transform, "r") as xform_fd: + with open(args.transform) as xform_fd: exec(compile(xform_fd.read(), 
args.transform, "exec"), data_dic) @@ -166,14 +164,14 @@ def main(): elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None if args.transform: - with open(args.transform, "r") as xform_fd: + with open(args.transform) as xform_fd: pre_transform_code = xform_fd.read() if args.occa_defines: if pre_transform_code is None: pre_transform_code = "" - with open(args.occa_defines, "r") as defines_fd: + with open(args.occa_defines) as defines_fd: pre_transform_code = ( defines_to_python_code(defines_fd.read()) + pre_transform_code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 70cd7cc956acdfdc59402851b081602ca78ce187..e324c6d77248711a18b7d1ca29702791d9688e9e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -25,8 +23,6 @@ THE SOFTWARE. import logging logger = logging.getLogger(__name__) -import six - from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord import islpy as isl @@ -44,6 +40,22 @@ from cgen import Collection from pytools import ProcessLogger +__doc__ = """ +.. currentmodule:: loopy.codegen + +.. autoclass:: ImplementedDataInfo + +.. autoclass:: PreambleInfo + +.. autoclass:: VectorizationInfo + +.. autoclass:: SeenFunction + +.. autoclass:: CodeGenerationState + +.. automodule:: loopy.codegen.result +""" + # {{{ implemented data info @@ -123,7 +135,7 @@ class Unvectorizable(Exception): pass -class VectorizationInfo(object): +class VectorizationInfo: """ .. attribute:: iname .. attribute:: length @@ -152,7 +164,7 @@ class SeenFunction(ImmutableRecord): arg_dtypes=arg_dtypes) -class CodeGenerationState(object): +class CodeGenerationState: """ .. attribute:: kernel .. 
attribute:: target @@ -436,7 +448,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel, callables_table) - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") @@ -488,9 +500,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): raise ValueError("argument type not understood: '%s'" % type(arg)) allow_complex = False - for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - dtype = var.dtype - if dtype.involves_complex(): + for var in kernel.args + list(kernel.temporary_variables.values()): + if var.dtype.involves_complex(): allow_complex = True # }}} @@ -534,10 +545,12 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): # {{{ handle preambles - for arg in kernel.args: - seen_dtypes.add(arg.dtype) - for tv in six.itervalues(kernel.temporary_variables): - seen_dtypes.add(tv.dtype) + for idi in codegen_state.implemented_data_info: + seen_dtypes.add(idi.dtype) + + for tv in kernel.temporary_variables.values(): + for idi in tv.decl_info(kernel.target, index_dtype=kernel.index_dtype): + seen_dtypes.add(idi.dtype) preambles = kernel.preambles[:] diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec1dadb842e12453fbec3b68e831338f6..b02c13b389266379a03c41b6f60c2163b16b2986 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -29,11 +27,13 @@ from islpy import dim_type # {{{ approximate, convex bounds check generator -def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domain): +def get_approximate_convex_bounds_checks(domain, check_inames, + implemented_domain, op_cache_manager): if isinstance(domain, isl.BasicSet): domain = isl.Set.from_basic_set(domain) domain = domain.remove_redundancies() - result = domain.eliminate_except(check_inames, [dim_type.set]) + result = op_cache_manager.eliminate_except(domain, check_inames, + (dim_type.set,)) # This is ok, because we're really looking for the # projection, with no remaining constraints from diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 3bad73462598c61895f2d274c13941e433986cb4..c2006df518f19d19e241ce8c699243314076b1ce 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -1,6 +1,5 @@ """Loop nest build top-level control/hoisting.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -201,14 +200,14 @@ def get_required_predicates(kernel, sched_index): return result -def group_by(l, key, merge): - if not l: - return l +def group_by(entry, key, merge): + if not entry: + return entry result = [] - previous = l[0] + previous = entry[0] - for item in l[1:]: + for item in entry[1:]: if key(previous) == key(item): previous = merge(previous, item) @@ -329,7 +328,7 @@ def build_loop_nest(codegen_state, schedule_index): # Each instruction individually gets its bounds checks, # so we can safely overapproximate here. 
return get_approximate_convex_bounds_checks(domain, - check_inames, self.impl_domain) + check_inames, self.impl_domain, self.kernel.cache_manager) def build_insn_group(sched_index_info_entries, codegen_state, done_group_lengths=set()): @@ -475,7 +474,7 @@ def build_loop_nest(codegen_state, schedule_index): sched_index_info_entries[0:group_length], inner_codegen_state, done_group_lengths=( - done_group_lengths | set([group_length]))) + done_group_lengths | {group_length})) # gen_code returns a list diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 5e0747246160ddc2934c3d545c03a2a9b4090d5d..71133ef7cf2a29be1a8673e99a81f21544f5404a 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -1,6 +1,5 @@ """Code generation for Instruction objects.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,7 +24,6 @@ THE SOFTWARE. """ -from six.moves import range import islpy as isl dim_type = isl.dim_type from loopy.codegen import Unvectorizable @@ -39,7 +37,8 @@ def to_codegen_result( chk_domain = isl.Set.from_basic_set(domain) chk_domain = chk_domain.remove_redundancies() - chk_domain = chk_domain.eliminate_except(check_inames, [dim_type.set]) + chk_domain = codegen_state.kernel.cache_manager.eliminate_except(chk_domain, + check_inames, (dim_type.set,)) chk_domain, implemented_domain = isl.align_two( chk_domain, codegen_state.implemented_domain) @@ -171,7 +170,7 @@ def generate_assignment_instruction_code(codegen_state, insn): gs, ls = kernel.get_grid_size_upper_bounds() - printf_format = "%s.%s[%s][%s]: %s" % ( + printf_format = "{}.{}[{}][{}]: {}".format( kernel.name, insn.id, ", ".join("gid%d=%%d" % i for i in range(len(gs))), @@ -208,7 +207,7 @@ def generate_assignment_instruction_code(codegen_state, insn): else: printf_args_str = "" - printf_insn = S("printf(\"%s\\n\"%s)" % ( + printf_insn = S('printf("{}\\n"{})'.format( printf_format, printf_args_str)) from cgen import Block @@ -274,7 +273,7 @@ def generate_c_instruction_code(codegen_state, insn): if body: body.append(Line()) - body.extend(Line(l) for l in insn.code.split("\n")) + body.extend(Line(line) for line in insn.code.split("\n")) return Block(body) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 5796f5133a1d82890c55accf28072dd5db582ee4..59dd33c95507e0e9b790ec5740f2256279393e67 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range from loopy.diagnostic import warn, LoopyError from loopy.codegen.result import merge_codegen_results @@ -80,11 +77,16 @@ def get_slab_decomposition(kernel, iname): if upper_incr: assert upper_incr > 0 - upper_slab = ("final", isl.BasicSet.universe(space) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(space, - iname, ">", upper_bound_aff-upper_incr)))) + upper_bset = isl.BasicSet.universe(space).add_constraint( + isl.Constraint.inequality_from_aff( + iname_rel_aff(space, + iname, ">", upper_bound_aff-upper_incr))) + if lower_incr: + # Ensure that this slab is actually distinct from the + # lower one, if it exists. 
+ _, lower_bset = lower_slab + upper_bset, = upper_bset.subtract(lower_bset).get_basic_sets() + upper_slab = ("final", upper_bset) upper_bulk_bound = ( isl.Constraint.inequality_from_aff( iname_rel_aff(space, @@ -320,7 +322,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, if len(slabs) > 1: result.append( codegen_state.ast_builder.emit_comment( - "%s slab for '%s'" % (slab_name, iname))) + f"{slab_name} slab for '{iname}'")) # Have the conditional infrastructure generate the # slabbing conditionals. @@ -359,7 +361,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): result = [] for slab_name, slab in slabs: - cmt = "%s slab for '%s'" % (slab_name, loop_iname) + cmt = f"{slab_name} slab for '{loop_iname}'" if len(slabs) == 1: cmt = None diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7950c56b3b62693f974cbcc5ab8686f30fa42cbe..d7314fb9750d63dd2f42282be6e1340e2ce073de 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six from pytools import ImmutableRecord @@ -43,6 +40,19 @@ def process_preambles(preambles): for lines in dedup_preambles] +__doc__ = """ +.. currentmodule:: loopy.codegen.result + +.. autoclass:: GeneratedProgram + +.. autoclass:: CodeGenerationResult + +.. autofunction:: merge_codegen_results + +.. autofunction:: generate_host_or_device_program +""" + + # {{{ code generation result class GeneratedProgram(ImmutableRecord): @@ -218,7 +228,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): el.current_program(codegen_state).name == codegen_result.current_program(codegen_state).name) - for insn_id, idoms in six.iteritems(el.implemented_domains): + for insn_id, idoms in el.implemented_domains.items(): implemented_domains.setdefault(insn_id, []).extend(idoms) if not codegen_state.is_generating_device_code: diff --git a/loopy/compiled.py b/loopy/compiled.py index 613bca56fc1de23a66d45d8f990f91f9d3f9b949..f9313c6c95612ddba6566d7c8175d998e8312147 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -30,11 +28,14 @@ from loopy.target.pyopencl_execution import ( # noqa # {{{ compatibility class CompiledKernel(PyOpenCLKernelExecutor): + """ + .. automethod:: __call__ + """ def __init__(self, context, kernel): from warnings import warn warn("CompiledKernel is deprecated. 
Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super(CompiledKernel, self).__init__(context, kernel) + super().__init__(context, kernel) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 561bbc7cc56a8338593a80b7d5890553af89c79b..0ae2e530ad5e3c0de73b3d0d064f7dd85e055894 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -60,7 +58,7 @@ def warn_with_kernel(kernel, id, text, type=LoopyWarning): % id) from warnings import warn - warn("in kernel %s: %s" % (kernel.name, text), type, stacklevel=2) + warn(f"in kernel {kernel.name}: {text}", type, stacklevel=2) warn = MovedFunctionDeprecationWrapper(warn_with_kernel) diff --git a/loopy/expression.py b/loopy/expression.py index 8414efaa5dd614d39e93f55aea3836141e5a6d6e..10e19301470eadecc6f3d206373fb7c5df1c5ae8 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner" __license__ = """ @@ -32,20 +30,20 @@ from loopy.diagnostic import LoopyError # type_context may be: -# - 'i' for integer - -# - 'f' for single-precision floating point -# - 'd' for double-precision floating point +# - "i" for integer - +# - "f" for single-precision floating point +# - "d" for double-precision floating point # or None for 'no known context'. def dtype_to_type_context(target, dtype): from loopy.types import NumpyType if dtype.is_integral(): - return 'i' + return "i" if isinstance(dtype, NumpyType) and dtype.dtype in [np.float64, np.complex128]: - return 'd' + return "d" if isinstance(dtype, NumpyType) and dtype.dtype in [np.float32, np.complex64]: - return 'f' + return "f" if target.is_vector_dtype(dtype): return dtype_to_type_context( target, NumpyType(dtype.numpy_dtype.fields["x"][0])) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 9b63c10f8422d0a17c295e1ef9a4609f5db90e2b..c8fda36d070c3aab49fec4f9d828d9130ad8358c 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -90,17 +88,17 @@ def _extract_loopy_lines(source): loopy_lines = [] in_loopy_code = False - for l in lines: - comment_match = comment_re.match(l) + for line in lines: + comment_match = comment_re.match(line) if comment_match is None: if in_loopy_code: raise LoopyError("non-comment source line in loopy block") - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) continue cmt = comment_match.group(1) @@ -112,7 +110,7 @@ def _extract_loopy_lines(source): in_loopy_code = True # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif cmt_stripped == "$loopy end": if not in_loopy_code: @@ -120,16 +118,16 @@ def _extract_loopy_lines(source): in_loopy_code = False # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif in_loopy_code: loopy_lines.append(cmt) else: - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# 
"+line) return "\n".join(remaining_lines), "\n".join(loopy_lines) @@ -322,9 +320,9 @@ def parse_fortran(source, filename="", free_form=None, strict=None, import logging console = logging.StreamHandler() console.setLevel(logging.INFO) - formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + formatter = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s") console.setFormatter(formatter) - logging.getLogger('fparser').addHandler(console) + logging.getLogger("fparser").addHandler(console) from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, diff --git a/loopy/frontend/fortran/diagnostic.py b/loopy/frontend/fortran/diagnostic.py index 7cb3c79cc646f0959f69614e5141441e8fc3261b..b2ea02c05b53e132dddaa5d8102620e4941f35cd 100644 --- a/loopy/frontend/fortran/diagnostic.py +++ b/loopy/frontend/fortran/diagnostic.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index 1400fb3b71416355229f11a1e6bbd74e62b4897f..cc93e914d0470c423812b69913a7185dca9c7b67 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -25,7 +23,7 @@ THE SOFTWARE. from pymbolic.parser import Parser as ExpressionParserBase from loopy.frontend.fortran.diagnostic import TranslationError -from six.moves import intern +from sys import intern import numpy as np import pytools.lex diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 39c2c62d97b23cd44f64ab59920e4336991a47b5..8e3ef5728fa9e0b5ebfc4348f6cc0daf03733ddd 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -24,8 +22,7 @@ THE SOFTWARE. 
import re -import six -from six.moves import intern +from sys import intern import loopy as lp import numpy as np @@ -125,7 +122,7 @@ class SubscriptIndexAdjuster(IdentityMapper): # {{{ scope -class Scope(object): +class Scope: def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name @@ -163,8 +160,8 @@ class Scope(object): def known_names(self): return (self.used_names - | set(six.iterkeys(self.dim_map)) - | set(six.iterkeys(self.type_map))) + | set(self.dim_map.keys()) + | set(self.type_map.keys())) def is_known(self, name): return (name in self.used_names diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index a124757f4729d270b0ab47c7e07cf1c436733045..f4eea255b9b89dba0300f1e81194b0ff64d7007d 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -27,7 +25,7 @@ import re from loopy.diagnostic import LoopyError -class FTreeWalkerBase(object): +class FTreeWalkerBase: def __init__(self, filename): from loopy.frontend.fortran.expression import FortranExpressionParser self.expr_parser = FortranExpressionParser(self) diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f512e18079f44b94b298e876776cae35..7f9177e0ef8430cc450cb462641b12ed1a9f9b28 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -1,5 +1,3 @@ -from __future__ import division - from IPython.core.magic import (magics_class, Magics, cell_magic) import loopy as lp diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 4d57de26b6cfa3d8932ba4f85ed02b97ddcda975..59748e01baa7d387514a7a0619f8482d58c363e7 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -1,6 +1,5 @@ """isl helpers""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,13 +24,6 @@ THE SOFTWARE. 
""" -import six -import numpy as np -from six.moves import range, zip - -from pymbolic.mapper.evaluator import \ - EvaluationMapper as EvaluationMapperBase - from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl @@ -499,130 +491,6 @@ def obj_involves_variable(obj, var_name): return False -# {{{ performance tweak for dim_{min,max}: project first - -def _runs_in_integer_set(s, max_int=None): - if not s: - return - - if max_int is None: - max_int = max(s) - - i = 0 - while i < max_int: - if i in s: - start = i - - i += 1 - while i < max_int and i in s: - i += 1 - - end = i - - yield (start, end-start) - - else: - i += 1 - - -class TooManyInteractingDims(Exception): - pass - - -def _find_aff_dims(aff, dim_types_and_gen_dim_types): - result = [] - - for dt, gen_dt in dim_types_and_gen_dim_types: - for i in range(aff.dim(dt)): - if not aff.get_coefficient_val(dt, i).is_zero(): - result.append((gen_dt, i)) - - result = set(result) - - for i in range(aff.dim(dim_type.div)): - if not aff.get_coefficient_val(dim_type.div, i).is_zero(): - result.update(_find_aff_dims( - aff.get_div(i), - dim_types_and_gen_dim_types)) - - return result - - -def _transitive_closure(graph_dict): - pass - - -def _find_noninteracting_dims(obj, dt, idx, other_dt, stop_at=6): - if isinstance(obj, isl.BasicSet): - basics = [obj] - elif isinstance(obj, isl.Set): - basics = obj.get_basic_sets() - else: - raise TypeError("unsupported arg type '%s'" % type(obj)) - - connections = [] - for bs in basics: - for c in bs.get_constraints(): - conn = _find_aff_dims( - c.get_aff(), - [(dim_type.param, dim_type.param), (dim_type.in_, dim_type.set)]) - if len(conn) > 1: - connections.append(conn) - - interacting = set([(dt, idx)]) - - while True: - changed_something = False - - # Compute the connected component near (dt, idx) by fixed point iteration - - for conn in connections: - prev_len = len(interacting) - - overlap = interacting & conn - if overlap: - interacting.update(conn) - - if len(interacting) != prev_len: - changed_something = True - - if len(interacting) >= stop_at: - raise TooManyInteractingDims() - - if not changed_something: - break - - return set(range(obj.dim(other_dt))) - set( - idx for dt, idx in interacting - if dt == other_dt) - - -def _eliminate_noninteracting(obj, dt, idx, other_dt): - obj = obj.compute_divs() - try: - nonint = _find_noninteracting_dims(obj, dt, idx, other_dt) - - except TooManyInteractingDims: - return obj - - for first, n in _runs_in_integer_set(nonint): - obj = obj.eliminate(other_dt, first, n) - - return obj - - -def dim_min_with_elimination(obj, idx): - obj_elim = _eliminate_noninteracting(obj, dim_type.out, idx, dim_type.param) - return obj_elim.dim_min(idx) - - -def dim_max_with_elimination(obj, idx): - obj_elim = _eliminate_noninteracting(obj, dim_type.out, idx, dim_type.param) - return obj_elim.dim_max(idx) - -# }}} - - # {{{ get_simple_strides def get_simple_strides(bset, key_by="name"): @@ -718,7 +586,7 @@ def get_simple_strides(bset, key_by="name"): # }}} -# {{{{ find_max_of_pwaff_with_params +# {{{ find_max_of_pwaff_with_params def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): if n_allowed_params is None: @@ -743,30 +611,6 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): # {{{ subst_into_pwqpolynomial -class QPolynomialEvaluationMapper(EvaluationMapperBase): - def __init__(self, space): - self.zero = isl.QPolynomial.zero_on_domain(space) - - context = {} - for name, (dt, pos) in six.iteritems(space.get_var_dict()): - if dt == 
dim_type.set:
-                dt = dim_type.in_
-
-            context[name] = isl.QPolynomial.var_on_domain(space, dt, pos)
-
-        super(QPolynomialEvaluationMapper, self).__init__(context)
-
-    def map_constant(self, expr):
-        if isinstance(expr, np.integer):
-            expr = int(expr)
-
-        return self.zero + expr
-
-    def map_quotient(self, expr):
-        raise TypeError("true division in '%s' not supported "
-                "for as-pwaff evaluation" % expr)
-
-
 def get_param_subst_domain(new_space, base_obj, subst_dict):
     """Modify the :mod:`islpy` object *base_obj* to incorporate parameters for
     the keys of *subst_dict*, and rename existing parameters to include a
@@ -828,8 +672,18 @@ def get_param_subst_domain(new_space, base_obj, subst_dict):


 def subst_into_pwqpolynomial(new_space, poly, subst_dict):
+    """
+    Returns an instance of :class:`islpy.PwQPolynomial` with substitutions from
+    *subst_dict* substituted into *poly*.
+
+    :arg poly: an instance of :class:`islpy.PwQPolynomial`
+    :arg subst_dict: a mapping from parameters of *poly* to
+        :class:`pymbolic.primitives.Expression` made up of terms comprising the
+        parameters of *new_space*. The expression must be affine in the param
+        dims of *new_space*.
+    """
     if not poly.get_pieces():
-        assert new_space.is_params()
+        # pw poly is universally zero
         result = isl.PwQPolynomial.zero(new_space.insert_dims(dim_type.out, 0, 1))
         assert result.dim(dim_type.out) == 1
         return result
@@ -839,7 +693,7 @@
     poly, subst_domain, subst_dict = get_param_subst_domain(
             new_space, poly, subst_dict)

-    from loopy.symbolic import qpolynomial_to_expr
+    from loopy.symbolic import qpolynomial_to_expr, qpolynomial_from_expr
     new_pieces = []
     for valid_set, qpoly in poly.get_pieces():
         valid_set = valid_set & subst_domain
@@ -851,7 +705,7 @@
                 SubstitutionMapper, make_subst_func)
         sub_mapper = SubstitutionMapper(make_subst_func(subst_dict))
         expr = sub_mapper(qpolynomial_to_expr(qpoly))
-        qpoly = QPolynomialEvaluationMapper(valid_set.space)(expr)
+        qpoly = qpolynomial_from_expr(valid_set.space, expr)

         new_pieces.append((valid_set, qpoly))

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 0ce06a126ef435f99b32c89bcd576beba648a3bb..1eac93e415663bbca818c591c050d2543a469683 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1,7 +1,5 @@
 """Kernel object."""

-from __future__ import division, absolute_import
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -24,8 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
""" -import six -from six.moves import range, zip, intern +from sys import intern from collections import defaultdict @@ -49,7 +46,7 @@ from warnings import warn class _UniqueVarNameGenerator(UniqueNameGenerator): def __init__(self, existing_names=set(), forced_prefix=""): - super(_UniqueVarNameGenerator, self).__init__(existing_names, forced_prefix) + super().__init__(existing_names, forced_prefix) array_prefix_pattern = re.compile("(.*)_s[0-9]+$") array_prefixes = set() @@ -95,7 +92,7 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class _deprecated_KernelState_SCHEDULED(object): # noqa +class _deprecated_KernelState_SCHEDULED: # noqa def __init__(self, f): self.f = f @@ -117,7 +114,7 @@ class KernelState: # noqa # {{{ kernel_state, KernelState compataibility -class _deperecated_kernel_state_class_method(object): # noqa +class _deperecated_kernel_state_class_method: # noqa def __init__(self, f): self.f = f @@ -127,7 +124,7 @@ class _deperecated_kernel_state_class_method(object): # noqa return self.f() -class kernel_state(object): # noqa +class kernel_state: # noqa """Deprecated. Use :class:`loopy.kernel.KernelState` instead. """ @@ -241,6 +238,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): would be called from other top level kernels. Default value is *True*. + .. automethod:: __call__ + .. automethod:: copy """ # {{{ constructor @@ -351,7 +350,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, KernelState.PREPROCESSED, - KernelState.SCHEDULED, + KernelState.LINEARIZED, ]: raise ValueError("invalid value for 'state'") @@ -375,7 +374,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): from collections import defaultdict assert not isinstance(iname_to_tags, defaultdict) - for iname, tags in six.iteritems(iname_to_tags): + for iname, tags in iname_to_tags.items(): # don't tolerate empty sets assert tags assert isinstance(tags, frozenset) @@ -479,25 +478,25 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def non_iname_variable_names(self): - return (set(six.iterkeys(self.arg_dict)) - | set(six.iterkeys(self.temporary_variables))) + return (set(self.arg_dict.keys()) + | set(self.temporary_variables.keys())) @memoize_method def all_variable_names(self, include_temp_storage=True): return ( - set(six.iterkeys(self.temporary_variables)) - | set(tv.base_storage - for tv in six.itervalues(self.temporary_variables) - if tv.base_storage is not None and include_temp_storage) - | set(six.iterkeys(self.substitutions)) - | set(arg.name for arg in self.args) + set(self.temporary_variables.keys()) + | {tv.base_storage + for tv in self.temporary_variables.values() + if tv.base_storage is not None and include_temp_storage} + | set(self.substitutions.keys()) + | {arg.name for arg in self.args} | set(self.all_inames())) def get_var_name_generator(self): return _UniqueVarNameGenerator(self.all_variable_names()) def get_instruction_id_generator(self, based_on="insn"): - used_ids = set(insn.id for insn in self.instructions) + used_ids = {insn.id for insn in self.instructions} return UniqueNameGenerator(used_ids) @@ -506,7 +505,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if insns is None: insns = self.instructions - used_ids = set(insn.id for insn in insns) | extra_used_ids + used_ids = {insn.id for insn in insns} | extra_used_ids for id_str in generate_unique_names(based_on): if id_str not in used_ids: @@ -554,7 +553,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property @memoize_method 
def id_to_insn(self): - return dict((insn.id, insn) for insn in self.instructions) + return {insn.id: insn for insn in self.instructions} # }}} @@ -649,10 +648,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def _get_home_domain_map(self): - return dict( - (iname, i_domain) + return { + iname: i_domain for i_domain, dom in enumerate(self.domains) - for iname in dom.get_var_names(dim_type.set)) + for iname in dom.get_var_names(dim_type.set)} def get_home_domain_index(self, iname): return self._get_home_domain_map()[iname] @@ -828,7 +827,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def all_referenced_inames(self): result = set() - for inames in six.itervalues(self.all_insn_inames()): + for inames in self.all_insn_inames().values(): result.update(inames) return result @@ -839,8 +838,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def iname_to_insns(self): - result = dict( - (iname, set()) for iname in self.all_inames()) + result = { + iname: set() for iname in self.all_inames()} for insn in self.instructions: for iname in self.insn_inames(insn): result[iname].add(insn.id) @@ -866,9 +865,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): tag, = tags tag_key_uses[tag.key].append(iname) - multi_use_keys = set( - key for key, user_inames in six.iteritems(tag_key_uses) - if len(user_inames) > 1) + multi_use_keys = { + key for key, user_inames in tag_key_uses.items() + if len(user_inames) > 1} multi_use_inames = set() for iname in cond_inames: @@ -888,13 +887,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): warn("Since version 2018.1, inames can hold multiple tags. Use " "iname_to_tags['iname'] instead. iname_to_tag.get('iname') will be " "removed at version 2019.0.", DeprecationWarning) - for iname, tags in six.iteritems(self.iname_to_tags): + for iname, tags in self.iname_to_tags.items(): if len(tags) > 1: raise LoopyError( - "iname {0} has multiple tags: {1}. " + "iname {} has multiple tags: {}. 
" "Use iname_to_tags['iname'] instead.".format(iname, tags)) - return dict((k, next(iter(v))) - for k, v in six.iteritems(self.iname_to_tags) if v) + return {k: next(iter(v)) + for k, v in self.iname_to_tags.items() if v} # }}} @@ -944,8 +943,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): result = {} admissible_vars = ( - set(arg.name for arg in self.args) - | set(six.iterkeys(self.temporary_variables))) + {arg.name for arg in self.args} + | set(self.temporary_variables.keys())) for insn in self.instructions: for var_name in insn.read_dependency_names() & admissible_vars: @@ -987,7 +986,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_temporary_to_base_storage_map(self): result = {} - for tv in six.itervalues(self.temporary_variables): + for tv in self.temporary_variables.values(): if tv.base_storage: result[tv.name] = tv.base_storage @@ -998,10 +997,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): written_vars = self.get_written_variables() from loopy.kernel.data import ValueArg - return set( + return { arg.name for arg in self.args - if isinstance(arg, ValueArg) and arg.name not in written_vars) + if isinstance(arg, ValueArg) and arg.name not in written_vars} # }}} @@ -1010,7 +1009,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property @memoize_method def arg_dict(self): - return dict((arg.name, arg) for arg in self.args) + return {arg.name: arg for arg in self.args} @property @memoize_method @@ -1032,14 +1031,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.kernel.data import ArrayArg return ( - set( + { arg.name for arg in self.args - if isinstance(arg, ArrayArg) - and arg.address_space == AddressSpace.GLOBAL) - | set( + if (isinstance(arg, ArrayArg) + and arg.address_space == AddressSpace.GLOBAL)} + | { tv.name - for tv in six.itervalues(self.temporary_variables) - if tv.address_space == AddressSpace.GLOBAL)) + for tv in self.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL}) # }}} @@ -1207,7 +1206,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): forced_sizes = forced_sizes.copy() size_list = [] - sorted_axes = sorted(six.iterkeys(size_dict)) + sorted_axes = sorted(size_dict.keys()) while sorted_axes or forced_sizes: if sorted_axes: @@ -1286,15 +1285,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def local_var_names(self): from loopy.kernel.data import AddressSpace - return set( + return { tv.name - for tv in six.itervalues(self.temporary_variables) - if tv.address_space == AddressSpace.LOCAL) + for tv in self.temporary_variables.values() + if tv.address_space == AddressSpace.LOCAL} def local_mem_use(self): from loopy.kernel.data import AddressSpace return sum( - tv.nbytes for tv in six.itervalues(self.temporary_variables) + tv.nbytes for tv in self.temporary_variables.values() if tv.address_space == AddressSpace.LOCAL) # }}} @@ -1327,13 +1326,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): "consistent iname nesting order. 
This is a possible indication " "that the kernel may not schedule successfully, but for now " "it only impacts printing of the kernel.") - embedding = dict((iname, iname) for iname in self.all_inames()) + embedding = {iname: iname for iname in self.all_inames()} return embedding def stringify(self, what=None, with_dependencies=False, use_separators=True, show_labels=True): - all_what = set([ + all_what = { "name", "arguments", "domains", @@ -1343,10 +1342,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): "instructions", "Dependencies", "schedule", - ]) + } - first_letter_to_what = dict( - (w[0], w) for w in all_what) + first_letter_to_what = { + w[0]: w for w in all_what} assert len(first_letter_to_what) == len(all_what) if what is None: @@ -1357,11 +1356,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): if isinstance(what, str): if "," in what: what = what.split(",") - what = set(s.strip() for s in what) + what = {s.strip() for s in what} else: - what = set( + what = { first_letter_to_what[w] - for w in what) + for w in what} if not (what <= all_what): raise LoopyError("invalid 'what' passed: %s" @@ -1406,14 +1405,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): else: tags_str = ", ".join(str(tag) for tag in tags) - line = "%s: %s" % (iname, tags_str) + line = f"{iname}: {tags_str}" lines.append(line) if "variables" in what and kernel.temporary_variables: lines.extend(sep) if show_labels: lines.append("TEMPORARIES:") - for tv in natsorted(six.itervalues(kernel.temporary_variables), + for tv in natsorted(kernel.temporary_variables.values(), key=lambda tv: tv.name): lines.append(str(tv)) @@ -1421,7 +1420,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): lines.extend(sep) if show_labels: lines.append("SUBSTITUTION RULES:") - for rule_name in natsorted(six.iterkeys(kernel.substitutions)): + for rule_name in natsorted(kernel.substitutions.keys()): lines.append(str(kernel.substitutions[rule_name])) if "instructions" in what: @@ -1435,7 +1434,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): dep_lines = [] for insn in kernel.instructions: if insn.depends_on: - dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on))) + dep_lines.append("{} : {}".format( + insn.id, ",".join(insn.depends_on))) if "Dependencies" in what and dep_lines: lines.extend(sep) @@ -1456,11 +1456,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return "\n".join(lines) def __str__(self): - if six.PY3: - return self.stringify() - else: - # Path of least resistance... - return self.stringify().encode("utf-8") + return self.stringify() def __unicode__(self): return self.stringify() @@ -1478,6 +1474,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): + """ + Execute the :class:`LoopKernel`. 
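+
+        A minimal usage sketch (illustrative only; *queue* and the argument
+        name *a* are hypothetical)::
+
+            evt, (out,) = knl(queue, a=a_dev)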
+ """ warn("Calling a LoopKernel is deprecated, call a Program " "instead.", DeprecationWarning, stacklevel=2) from loopy.program import make_program @@ -1489,10 +1488,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ pickling def __getstate__(self): - result = dict( - (key, getattr(self, key)) + result = { + key: getattr(self, key) for key in self.__class__.fields - if hasattr(self, key)) + if hasattr(self, key)} result.pop("cache_manager", None) @@ -1523,7 +1522,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): new_fields = set() - for k, v in six.iteritems(attribs): + for k, v in attribs.items(): setattr(self, k, v) new_fields.add(k) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 2e43d9b605c313add3d353da38b138f9d57bb9b7..eabaa0900d9238a6b01b2784c2d46deedff701e0 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,6 +1,5 @@ """Implementation tagging of array axes.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -26,10 +25,6 @@ THE SOFTWARE. import re -import six -from six.moves import range, zip -from six import iteritems - from pytools import ImmutableRecord, memoize_method import numpy as np # noqa @@ -38,6 +33,25 @@ from loopy.diagnostic import LoopyError from loopy.tools import is_integer +__doc__ = """ +.. currentmodule:: loopy.kernel.array + +.. autoclass:: ArrayDimImplementationTag + +.. autoclass:: _StrideArrayDimTagBase + +.. autoclass:: FixedStrideArrayDimTag + +.. autoclass:: ComputedStrideArrayDimTag + +.. autoclass:: SeparateArrayArrayDimTag + +.. autoclass:: VectorArrayDimTag + +.. autofunction:: parse_array_dim_tags +""" + + # {{{ array dimension tags class ArrayDimImplementationTag(ImmutableRecord): @@ -69,9 +83,8 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): The lowest nesting level varies fastest when viewed in linear memory. - May be None on :class:`FixedStrideArrayDimTag`, in which - case no :class:`ComputedStrideArrayDimTag` instances may - occur. + May be None on :class:`FixedStrideArrayDimTag`, in which case no + :class:`ComputedStrideArrayDimTag` instances may occur. """ @@ -132,8 +145,8 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): :attr:`ArrayBase.dtype` granularity to which to pad this dimension - This type of stride arg dim gets converted to :class:`FixedStrideArrayDimTag` - on input to :class:`ArrayBase` subclasses. + This type of stride arg dim gets converted to + :class:`FixedStrideArrayDimTag` on input to :class:`ArrayBase` subclasses. """ def __init__(self, layout_nesting_level, pad_to=None, target_axis=0, ): @@ -304,7 +317,7 @@ def parse_array_dim_tags(dim_tags, n_axes=None, use_increasing_target_axes=False assert n_axes == len(dim_names) dim_tags = [None]*n_axes - for dim_name, val in six.iteritems(dim_tags_dict): + for dim_name, val in dim_tags_dict.items(): try: dim_idx = dim_names.index(dim_name) except ValueError: @@ -370,7 +383,7 @@ def parse_array_dim_tags(dim_tags, n_axes=None, use_increasing_target_axes=False # {{{ check contiguity of nesting levels - for target_axis, ta_nesting_levels in iteritems(nesting_levels): + for target_axis, ta_nesting_levels in nesting_levels.items(): if sorted(ta_nesting_levels) != list( range( min(ta_nesting_levels), @@ -653,7 +666,7 @@ class ArrayBase(ImmutableRecord): or a string which can be parsed into the previous form. :arg dim_tags: A comma-separated list of tags as understood by - :func:`parse_array_dim_tag`. + :func:`loopy.kernel.array.parse_array_dim_tags`. 
:arg strides: May be one of the following: @@ -881,7 +894,7 @@ class ArrayBase(ImmutableRecord): if self.dim_names is not None: info_entries.append("shape: (%s)" % ", ".join( - "%s:%s" % (n, i) + f"{n}:{i}" for n, i in zip(self.dim_names, self.shape))) else: info_entries.append("shape: (%s)" @@ -895,7 +908,7 @@ class ArrayBase(ImmutableRecord): if self.offset: info_entries.append("offset: %s" % self.offset) - return "%s: %s" % (self.name, ", ".join(info_entries)) + return "{}: {}".format(self.name, ", ".join(info_entries)) def __str__(self): return self.stringify(include_typename=True) @@ -935,7 +948,8 @@ class ArrayBase(ImmutableRecord): return len(target_axes) def num_user_axes(self, require_answer=True): - if self.shape is not None: + from loopy import auto + if self.shape not in (None, auto): return len(self.shape) if self.dim_tags is not None: return len(self.dim_tags) @@ -1088,8 +1102,7 @@ class ArrayBase(ImmutableRecord): offset_for_name=full_name, is_written=False) - for sa in stride_args: - yield sa + yield from stride_args # }}} @@ -1115,13 +1128,12 @@ class ArrayBase(ImmutableRecord): new_stride_arg_axes = stride_arg_axes new_stride_axis = dim_tag.stride - for res in gen_decls(name_suffix, + yield from gen_decls(name_suffix, shape + (new_shape_axis,), strides + (new_stride_axis,), unvec_shape + (new_shape_axis,), unvec_strides + (new_stride_axis,), new_stride_arg_axes, - dtype, user_index + (None,)): - yield res + dtype, user_index + (None,)) elif isinstance(dim_tag, SeparateArrayArrayDimTag): shape_i = array_shape[user_axis] @@ -1131,11 +1143,10 @@ class ArrayBase(ImmutableRecord): self.name, user_axis)) for i in range(shape_i): - for res in gen_decls(name_suffix + "_s%d" % i, + yield from gen_decls(name_suffix + "_s%d" % i, shape, strides, unvec_shape, unvec_strides, stride_arg_axes, dtype, - user_index + (i,)): - yield res + user_index + (i,)) elif isinstance(dim_tag, VectorArrayDimTag): shape_i = array_shape[user_axis] @@ -1144,26 +1155,24 @@ class ArrayBase(ImmutableRecord): "integer axis %d (0-based)" % ( self.name, user_axis)) - for res in gen_decls(name_suffix, + yield from gen_decls(name_suffix, shape, strides, unvec_shape + (shape_i,), # vectors always have stride 1 unvec_strides + (1,), stride_arg_axes, target.vector_dtype(dtype, shape_i), - user_index + (None,)): - yield res + user_index + (None,)) else: raise LoopyError("unsupported array dim implementation tag '%s' " "in array '%s'" % (dim_tag, self.name)) - for res in gen_decls(name_suffix="", + yield from gen_decls(name_suffix="", shape=(), strides=(), unvec_shape=(), unvec_strides=(), stride_arg_axes=(), - dtype=self.dtype, user_index=()): - yield res + dtype=self.dtype, user_index=()) @memoize_method def sep_shape(self): @@ -1194,11 +1203,10 @@ class ArrayBase(ImmutableRecord): else: return idx - from pytools import indices_in_shape return [ (unwrap_1d_indices(i), self.name + "".join("_s%d" % sub_i for sub_i in i)) - for i in indices_in_shape(sep_shape)] + for i in np.ndindex(sep_shape)] # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e0834ba9dfbc2d68c063623f77889f04b977b156..0f7a0deff491c68ad171c79c4775deafafac2dfc 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1,6 +1,5 @@ """UI for kernel creation.""" -from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -43,8 +42,7 @@ import islpy as isl from islpy import dim_type from pytools import ProcessLogger -import six -from six.moves import 
range, zip, intern
+from sys import intern

 import loopy.version
 import re
@@ -202,7 +200,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
                 raise ValueError(
                         "unknown scope for nosync option: '%s' "
                         "(allowable scopes are %s)" %
-                        (scope, ', '.join("'%s'" % s for s in allowable_scopes)))
+                        (scope, ", ".join("'%s'" % s for s in allowable_scopes)))

         return _NosyncParseResult(expr, scope)

     for option in options_str.split(","):
@@ -363,7 +361,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):

         elif opt_key == "mem_kind":
             opt_value = opt_value.lower().strip()
-            if opt_value not in ['local', 'global']:
+            if opt_value not in ["local", "global"]:
                 raise LoopyError("Unknown memory synchronization type %s specified"
                     " expected, 'local' or 'global'." % opt_value)

@@ -439,13 +437,13 @@ SUBST_RE = re.compile(

 def check_illegal_options(insn_options, insn_type):
     illegal_options = []
-    if insn_type not in ['gbarrier', 'lbarrier']:
-        illegal_options.append('mem_kind')
+    if insn_type not in ["gbarrier", "lbarrier"]:
+        illegal_options.append("mem_kind")

     bad_options = [x for x in illegal_options if x in insn_options]
     if bad_options:
         raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" %
-                         ', '.join(bad_options), insn_type)
+                         (", ".join(bad_options), insn_type))


 def parse_insn(groups, insn_options):
@@ -520,7 +518,7 @@ def parse_insn(groups, insn_options):
             assignee_names=assignee_names)

     # check for bad options
-    check_illegal_options(insn_options, 'assignment')
+    check_illegal_options(insn_options, "assignment")

     insn_id = insn_options.pop("insn_id", None)
     inames_to_dup = insn_options.pop("inames_to_dup", [])
@@ -761,8 +759,8 @@ def parse_instructions(instructions, defines):

     insn_options_stack = [get_default_insn_options_dict()]
     if_predicates_stack = [
-            {'predicates': frozenset(),
-                'insn_predicates': frozenset()}]
+            {"predicates": frozenset(),
+                "insn_predicates": frozenset()}]

     for insn in instructions:
         if isinstance(insn, InstructionBase):
@@ -823,7 +821,7 @@ def parse_instructions(instructions, defines):
                             insn_options_stack[-1],
                             with_options_match.group("options")))
                 # check for bad options
-                check_illegal_options(insn_options_stack[-1], 'with-block')
+                check_illegal_options(insn_options_stack[-1], "with-block")
                 continue

             for_match = FOR_RE.match(insn)
@@ -863,7 +861,7 @@ def parse_instructions(instructions, defines):

                 #add to the if_stack
                 if_options = options.copy()
-                if_options['insn_predicates'] = options["predicates"]
+                if_options["insn_predicates"] = options["predicates"]
                 if_predicates_stack.append(if_options)
                 del options
                 del predicate
@@ -927,9 +925,9 @@ def parse_instructions(instructions, defines):
             if insn == "end":
                 obj = insn_options_stack.pop()
                 #if this object is the end of an if statement
-                if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
+                if obj["predicates"] == if_predicates_stack[-1]["insn_predicates"] and\
                         if_predicates_stack[-1]["insn_predicates"] and\
-                        obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
+                        obj["within_inames"] == if_predicates_stack[-1]["within_inames"]:
                     if_predicates_stack.pop()
                 continue

@@ -991,8 +989,8 @@ def _find_inames_in_set(dom_str):
     if match is None:
         raise RuntimeError("invalid syntax for domain '%s'" % dom_str)

-    result = set(iname.strip() for iname in match.group(1).split(",")
-            if iname.strip())
+    result = {iname.strip() for iname in match.group(1).split(",")
+            if iname.strip()}

     return result

@@ -1001,7 +999,7 @@ EX_QUANT_RE = re.compile(r"\bexists\s+([a-zA-Z0-9])\s*\:")
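
The regular expressions around this hunk do a lightweight scan of ISL-style
domain strings before the string is handed to islpy proper. A minimal,
self-contained sketch of the same idea; the patterns below are simplified
stand-ins, not the exact ones loopy uses::

    import re

    INAMES_RE = re.compile(r"\{\s*\[([^\]]*)\]")             # hypothetical
    EXISTS_RE = re.compile(r"\bexists\s+([a-zA-Z0-9])\s*:")  # hypothetical

    def inames_in(dom_str):
        match = INAMES_RE.search(dom_str)
        if match is None:
            raise RuntimeError("invalid syntax for domain '%s'" % dom_str)
        # inames are the comma-separated names inside the leading "[...]"
        return {s.strip() for s in match.group(1).split(",") if s.strip()}

    dom = "{ [i, j]: 0 <= i and exists k: j = 2k }"
    print(inames_in(dom))                                 # {'i', 'j'}
    print({m.group(1) for m in EXISTS_RE.finditer(dom)})  # {'k'}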
def _find_existentially_quantified_inames(dom_str): - return set(ex_quant.group(1) for ex_quant in EX_QUANT_RE.finditer(dom_str)) + return {ex_quant.group(1) for ex_quant in EX_QUANT_RE.finditer(dom_str)} def parse_domains(domains, defines): @@ -1020,7 +1018,7 @@ def parse_domains(domains, defines): parameters = (_gather_isl_identifiers(dom) - _find_inames_in_set(dom) - _find_existentially_quantified_inames(dom)) - dom = "[%s] -> %s" % (",".join(sorted(parameters)), dom) + dom = "[{}] -> {}".format(",".join(sorted(parameters)), dom) try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) @@ -1182,7 +1180,7 @@ class ArgumentGuesser: # {{{ find names that are *not* arguments - temp_var_names = set(six.iterkeys(self.temporary_variables)) + temp_var_names = set(self.temporary_variables.keys()) for insn in self.instructions: if isinstance(insn, MultiAssignmentBase): @@ -1276,8 +1274,8 @@ def check_for_multiple_writes_to_loop_bounds(knl): def check_written_variable_names(knl): admissible_vars = ( - set(arg.name for arg in knl.args) - | set(six.iterkeys(knl.temporary_variables))) + {arg.name for arg in knl.args} + | set(knl.temporary_variables.keys())) for insn in knl.instructions: for var_name in insn.assignee_var_names(): @@ -1298,7 +1296,7 @@ class CSEToAssignmentMapper(IdentityMapper): def map_reduction(self, expr, additional_inames): additional_inames = additional_inames | frozenset(expr.inames) - return super(CSEToAssignmentMapper, self).map_reduction( + return super().map_reduction( expr, additional_inames) def map_common_subexpression(self, expr, additional_inames): @@ -1521,7 +1519,7 @@ def determine_shapes_of_temporaries(knl): vars_needing_shape_inference = set() - for tv in six.itervalues(knl.temporary_variables): + for tv in knl.temporary_variables.values(): if tv.shape is lp.auto or tv.base_indices is lp.auto: vars_needing_shape_inference.add(tv.name) @@ -1539,8 +1537,7 @@ def determine_shapes_of_temporaries(knl): if len(var_to_error) > 0: vars_needing_shape_inference = set(var_to_error.keys()) - from six import iteritems - for varname, err in iteritems(var_to_error): + for varname, err in var_to_error.items(): warn_with_kernel(knl, "temp_shape_fallback", "Had to fall back to legacy method of determining " "shape of temporary '%s' because: %s" @@ -1558,7 +1555,7 @@ def determine_shapes_of_temporaries(knl): if len(var_to_error) > 0: # No way around errors: propagate an exception upward. 
formatted_errors = ( - "\n\n".join("'%s': %s" % (varname, var_to_error[varname]) + "\n\n".join("'{}': {}".format(varname, var_to_error[varname]) for varname in sorted(var_to_error.keys()))) raise LoopyError("got the following exception(s) trying to find the " @@ -1571,7 +1568,7 @@ def determine_shapes_of_temporaries(knl): new_temp_vars = {} - for tv in six.itervalues(knl.temporary_variables): + for tv in knl.temporary_variables.values(): if tv.base_indices is lp.auto: tv = tv.copy(base_indices=var_to_base_indices[tv.name]) if tv.shape is lp.auto: @@ -1600,7 +1597,7 @@ def expand_defines_in_shapes(kernel, defines): processed_args.append(arg) processed_temp_vars = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): processed_temp_vars[tv.name] = tv.map_exprs(expr_map) return kernel.copy( @@ -1763,13 +1760,13 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): writer_map = kernel.writer_map() - arg_names = set(arg.name for arg in kernel.args) + arg_names = {arg.name for arg in kernel.args} - var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) + var_names = arg_names | set(kernel.temporary_variables.keys()) - dep_map = dict( - (insn.id, insn.read_dependency_names() & var_names) - for insn in expanded_kernel.instructions) + dep_map = { + insn.id: insn.read_dependency_names() & var_names + for insn in expanded_kernel.instructions} new_insns = [] for insn in kernel.instructions: @@ -1793,7 +1790,7 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): if len(var_writers) == 1: auto_deps.update( var_writers - - set([insn.id])) + - {insn.id}) # }}} @@ -2128,7 +2125,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): breaking language changes *will* apply to your kernel without asking, likely breaking your code.) - If not given, this value defaults to version **(2017, 2, 1)** and + If not given, this value defaults to version **(2018, 2)** and a warning will be issued. To set the kernel version for all :mod:`loopy` kernels in a (Python) source @@ -2194,9 +2191,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = { + getattr(loopy.version, lvs): lvs + for lvs in LANGUAGE_VERSION_SYMBOLS} lang_version = kwargs.pop("lang_version", None) if lang_version is None: @@ -2236,11 +2233,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): lang_version = FALLBACK_LANGUAGE_VERSION if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." 
% (lang_version,))
-    if lang_version >= (2018, 1):
-        options = options.copy(enforce_variable_access_ordered=True)
-    if lang_version >= (2018, 2):
-        options = options.copy(ignore_boostable_into=True)
+        raise LoopyError(f"Language version '{lang_version}' is not known.")

     # }}}
@@ -2398,7 +2391,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):

 def make_function(*args, **kwargs):
-    kwargs['is_callee_kernel'] = True
+    kwargs["is_callee_kernel"] = True
     return make_kernel(*args, **kwargs)

 # }}}
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 6c0fa0a303d22fa931fe797ff3653d2819d4aa8d..073dc6f6579e005f7c627412ff763d6f019f95fd 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -1,6 +1,5 @@
 """Data used by the kernel object."""

-from __future__ import division

 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

@@ -25,7 +24,7 @@ THE SOFTWARE.
 """


-from six.moves import intern
+from sys import intern
 import numpy as np  # noqa
 from pytools import ImmutableRecord
 from loopy.kernel.array import ArrayBase
@@ -45,8 +44,30 @@ from loopy.kernel.instruction import (  # noqa
         CInstruction)
 from warnings import warn

+__doc__ = """
+.. currentmodule:: loopy.kernel.data

-class auto(object):  # noqa
+.. autofunction:: filter_iname_tags_by_type
+
+.. autoclass:: IndexTag
+
+.. autoclass:: ConcurrentTag
+
+.. autoclass:: UniqueTag
+
+.. autoclass:: AxisTag
+
+.. autoclass:: LocalIndexTag
+
+.. autoclass:: GroupIndexTag
+
+.. autoclass:: VectorizeTag
+
+.. autoclass:: UnrollTag
+"""
+
+
+class auto:  # noqa
     """A generic placeholder object for something that should be automatically
     determined.  See, for example, the *shape* or *strides* argument of
     :class:`ArrayArg`.
@@ -67,7 +88,7 @@ def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):

     :arg min_num: the minimum number of tags expected to be found.
     """

-    result = set(tag for tag in tags if isinstance(tag, tag_type))
+    result = {tag for tag in tags if isinstance(tag, tag_type)}

     def strify_tag_type():
         if isinstance(tag_type, tuple):
@@ -77,12 +98,12 @@

     if max_num is not None:
         if len(result) > max_num:
-            raise LoopyError("cannot have more than {0} tags "
-                    "of type(s): {1}".format(max_num, strify_tag_type()))
+            raise LoopyError("cannot have more than {} tags "
+                    "of type(s): {}".format(max_num, strify_tag_type()))
     if min_num is not None:
         if len(result) < min_num:
-            raise LoopyError("must have more than {0} tags "
-                    "of type(s): {1}".format(max_num, strify_tag_type()))
+            raise LoopyError("must have at least {} tags "
+                    "of type(s): {}".format(min_num, strify_tag_type()))

     return result

@@ -244,7 +265,7 @@ def parse_tag(tag):

 # {{{ memory address space

-class AddressSpace(object):
+class AddressSpace:
     """Storage location of a variable.

     .. attribute:: PRIVATE
@@ -271,7 +292,7 @@
         raise ValueError("unexpected value of AddressSpace")


-class _deprecated_temp_var_scope_class_method(object):  # noqa
+class _deprecated_temp_var_scope_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -281,8 +302,8 @@

         return self.f()


-class temp_var_scope(object):  # noqa
-    """Deprecated. Use :class:`AddressSpace` instead.
+class temp_var_scope:  # noqa
+    """Deprecated. Use :class:`loopy.AddressSpace` instead.
""" @_deprecated_temp_var_scope_class_method @@ -318,8 +339,8 @@ class KernelArgument(ImmutableRecord): dtype = kwargs.pop("dtype", None) - if 'for_atomic' in kwargs: - for_atomic = kwargs['for_atomic'] + if "for_atomic" in kwargs: + for_atomic = kwargs["for_atomic"] else: for_atomic = False @@ -384,7 +405,7 @@ class ArrayArg(ArrayBase, KernelArgument): kwargs["is_output"] = kwargs.pop("is_output", None) kwargs["is_input"] = kwargs.pop("is_input", None) - super(ArrayArg, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) min_target_axes = 0 max_target_axes = 1 @@ -408,7 +429,7 @@ class ArrayArg(ArrayBase, KernelArgument): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. """ - super(ArrayArg, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.is_output) key_builder.rec(key_hash, self.is_input) @@ -474,7 +495,7 @@ class ValueArg(KernelArgument): else: type_str = str(self.dtype) - return "%s: ValueArg, type: %s" % (self.name, type_str) + return f"{self.name}: ValueArg, type: {type_str}" def __repr__(self): return "<%s>" % self.__str__() @@ -550,7 +571,7 @@ class TemporaryVariable(ArrayBase): "_base_storage_access_may_be_aliasing", ] - def __init__(self, name, dtype=None, shape=(), address_space=None, + def __init__(self, name, dtype=None, shape=auto, address_space=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, base_storage=None, initializer=None, read_only=False, @@ -604,7 +625,10 @@ class TemporaryVariable(ArrayBase): if shape is auto: shape = initializer.shape - + else: + if shape != initializer.shape: + raise LoopyError("Shape of '{}' does not match that of the" + " initializer.".format(name)) else: raise LoopyError( "temporary variable '%s': " @@ -614,7 +638,7 @@ class TemporaryVariable(ArrayBase): if order is None: order = "C" - if base_indices is None: + if base_indices is None and shape is not auto: base_indices = (0,) * len(shape) if not read_only and initializer is not None: @@ -680,7 +704,7 @@ class TemporaryVariable(ArrayBase): if address_space is not None: kwargs["address_space"] = address_space - return super(TemporaryVariable, self).copy(**kwargs) + return super().copy(**kwargs) @property def nbytes(self): @@ -692,7 +716,7 @@ class TemporaryVariable(ArrayBase): return product(si for si in shape)*self.dtype.itemsize def decl_info(self, target, index_dtype): - return super(TemporaryVariable, self).decl_info( + return super().decl_info( target, is_written=True, index_dtype=index_dtype, shape_override=self.storage_shape) @@ -717,7 +741,7 @@ class TemporaryVariable(ArrayBase): def __eq__(self, other): return ( - super(TemporaryVariable, self).__eq__(other) + super().__eq__(other) and self.storage_shape == other.storage_shape and self.base_indices == other.base_indices and self.address_space == other.address_space @@ -735,7 +759,7 @@ class TemporaryVariable(ArrayBase): :class:`pytools.persistent_dict.PersistentDict`. 
""" - super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) @@ -783,7 +807,7 @@ class SubstitutionRule(ImmutableRecord): name=name, arguments=arguments, expression=expression) def __str__(self): - return "%s(%s) := %s" % ( + return "{}({}) := {}".format( self.name, ", ".join(self.arguments), self.expression) def update_persistent_hash(self, key_hash, key_builder): @@ -809,19 +833,19 @@ class CallMangleInfo(ImmutableRecord): .. attribute:: result_dtypes - A tuple of :class:`LoopyType` instances indicating what + A tuple of :class:`loopy.types.LoopyType` instances indicating what types of values the function returns. .. attribute:: arg_dtypes - A tuple of :class:`LoopyType` instances indicating what + A tuple of :class:`loopy.types.LoopyType` instances indicating what types of arguments the function actually receives. """ def __init__(self, target_name, result_dtypes, arg_dtypes): assert isinstance(result_dtypes, tuple) - super(CallMangleInfo, self).__init__( + super().__init__( target_name=target_name, result_dtypes=result_dtypes, arg_dtypes=arg_dtypes) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a1e2213020d50e8e564214e3ecddb75acc065c6b..f48e8852f0fb756142f505ed76798760251e4674 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" __license__ = """ @@ -22,10 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -from six.moves import zip import islpy as isl - from pytools import ImmutableRecord from loopy.diagnostic import LoopyError @@ -83,7 +78,7 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - fields = set(['shape', 'address_space', 'dim_tags']) + fields = {"shape", "address_space", "dim_tags"} def __init__(self, shape, address_space, dim_tags): @@ -100,7 +95,7 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} - super(ArrayArgDescriptor, self).__init__( + super().__init__( shape=shape, address_space=address_space, dim_tags=dim_tags) @@ -266,7 +261,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): This class acts as a pseudo-callable and its significance lies in solving picklability issues. """ - fields = set(["local_size", "global_size"]) + fields = {"local_size", "global_size"} def __init__(self, global_size, local_size): self.global_size = global_size @@ -319,12 +314,12 @@ class InKernelCallable(ImmutableRecord): .. 
automethod:: is_ready_for_codegen """ - fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): - super(InKernelCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -396,8 +391,8 @@ class InKernelCallable(ImmutableRecord): new_arg_id_to_dtype = None if self.arg_id_to_dtype is not None: - new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, - dtype in self.arg_id_to_dtype.items()) + new_arg_id_to_dtype = {id: with_target_if_not_None(dtype) for id, + dtype in self.arg_id_to_dtype.items()} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) @@ -463,7 +458,7 @@ class ScalarCallable(InKernelCallable): derived subclasses. """ - fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") hash_fields = fields @@ -471,7 +466,7 @@ class ScalarCallable(InKernelCallable): def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(ScalarCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -629,7 +624,7 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") hash_fields = fields @@ -637,7 +632,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=None): assert isinstance(subkernel, LoopKernel) - super(CallableKernel, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -731,8 +726,8 @@ class CallableKernel(InKernelCallable): subst_mapper = SubstitutionMapper(subst_func) - arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for - arg_id, descr in arg_id_to_descr.items()) + arg_id_to_descr = {arg_id: descr.map_expr(subst_mapper) for + arg_id, descr in arg_id_to_descr.items()} # }}} @@ -795,8 +790,8 @@ class CallableKernel(InKernelCallable): callables_table)) if assumptions: - args_added_knl = assume(args_added_knl, ' and '.join([ - '{0}={1}'.format(key, val) for key, val in assumptions.items()])) + args_added_knl = assume(args_added_knl, " and ".join([ + f"{key}={val}" for key, val in assumptions.items()])) return ( self.copy( @@ -904,19 +899,19 @@ class ManglerCallable(ScalarCallable): A function of signature ``(kernel, name , arg_dtypes)`` and returns an instance of ``loopy.CallMangleInfo``. 
""" - fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + fields = {"name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + hash_fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} def __init__(self, name, function_mangler, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): self.function_mangler = function_mangler - super(ManglerCallable, self).__init__( + super().__init__( name=name, arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr, @@ -945,8 +940,8 @@ class ManglerCallable(ScalarCallable): arg_dtypes) if mangle_result: new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) - new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in - enumerate(mangle_result.result_dtypes))) + new_arg_id_to_dtype.update({-i-1: dtype for i, dtype in + enumerate(mangle_result.result_dtypes)}) return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c5599863c9260086015e53efc413faf667a80738..6c1fa64e3afcbf86febf4511053c57b261238228 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import intern +from sys import intern from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError from loopy.tools import Optional @@ -39,14 +36,14 @@ class InstructionBase(ImmutableRecord): .. attribute:: id An (otherwise meaningless) identifier that is unique within - a :class:`loopy.kernel.LoopKernel`. + a :class:`loopy.LoopKernel`. .. rubric:: Instruction ordering .. attribute:: depends_on - a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - that *must* be executed before this one. Note that + a :class:`frozenset` of :attr:`id` values of :class:`InstructionBase` + instances that *must* be executed before this one. Note that :func:`loopy.preprocess_kernel` (usually invoked automatically) augments this by adding dependencies on any writes to temporaries read by this instruction. @@ -82,7 +79,7 @@ class InstructionBase(ImmutableRecord): .. attribute:: conflicts_with_groups A :class:`frozenset` of strings indicating which instruction groups - (see :class:`InstructionBase.groups`) may not be active when this + (see :attr:`groups`) may not be active when this instruction is scheduled. .. attribute:: priority @@ -95,7 +92,7 @@ class InstructionBase(ImmutableRecord): .. attribute:: no_sync_with a :class:`frozenset` of tuples of the form ``(insn_id, scope)``, where - `insn_id` refers to :attr:`id` of :class:`Instruction` instances + ``insn_id`` refers to :attr:`id` of :class:`InstructionBase` instances and `scope` is one of the following strings: - `"local"` @@ -114,7 +111,7 @@ class InstructionBase(ImmutableRecord): and match expression, just like :attr:`depends_on`. This data is used specifically by barrier insertion and - :func:`loopy.check.enforce_variable_access_ordered`. 
+ :func:`loopy.check.check_variable_access_ordered`. .. rubric:: Conditionals @@ -152,51 +149,27 @@ class InstructionBase(ImmutableRecord): .. automethod:: copy """ - # within_inames_is_final, boostable and boostable_into are deprecated and - # will be removed in version 2017.x. + # within_inames_is_final is deprecated and will be removed in version 2017.x. fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " "within_inames_is_final within_inames " - "priority boostable boostable_into".split()) + "priority".split()) # Names of fields that are pymbolic expressions. Needed for key building pymbolic_fields = set("") # Names of fields that are sets of pymbolic expressions. Needed for key building - pymbolic_set_fields = set(["predicates"]) + pymbolic_set_fields = {"predicates"} def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, within_inames_is_final, within_inames, priority, - boostable, boostable_into, predicates, tags, - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, forced_iname_deps_is_final=None): - - # {{{ backwards compatibility goop - - if depends_on is not None and insn_deps is not None: - raise LoopyError("may not specify both insn_deps and depends_on") - elif insn_deps is not None: - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - depends_on = insn_deps - depends_on_is_final = insn_deps_is_final - - if forced_iname_deps is not None and within_inames is not None: - raise LoopyError("may not specify both forced_iname_deps " - "and within_inames") - elif forced_iname_deps is not None: - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - - within_inames = forced_iname_deps - within_inames_is_final = forced_iname_deps_is_final + predicates, tags): if predicates is None: predicates = frozenset() @@ -218,8 +191,6 @@ class InstructionBase(ImmutableRecord): predicates = frozenset(new_predicates) del new_predicates - # }}} - if depends_on is None: depends_on = frozenset() @@ -284,42 +255,9 @@ class InstructionBase(ImmutableRecord): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) - # {{{ backwards compatibility goop - - @property - def insn_deps(self): - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - return self.depends_on - - # legacy - @property - def insn_deps_is_final(self): - warn("insn_deps_is_final is deprecated, use depends_on_is_final", - DeprecationWarning, stacklevel=2) - - return self.depends_on_is_final - - @property - def forced_iname_deps(self): - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - return self.within_inames - - @property - def forced_iname_deps_is_final(self): - warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final", - DeprecationWarning, stacklevel=2) - return self.within_inames_is_final - - # }}} - # {{{ abstract interface def read_dependency_names(self): @@ -346,10 +284,13 @@ class InstructionBase(ImmutableRecord): """ raise NotImplementedError - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): """Return a new copy of *self* where *f* has been applied to every expression occurring in *self*. 
+
+        If *assignee_f* is passed, then left-hand sides of assignments are
+        passed to it. If it is not given, it defaults to the same as *f*.
         """
         raise NotImplementedError

@@ -393,18 +334,6 @@ class InstructionBase(ImmutableRecord):
     def get_str_options(self):
         result = []

-        if self.boostable is True:
-            if self.boostable_into:
-                result.append("boostable into '%s'" % ",".join(self.boostable_into))
-            else:
-                result.append("boostable")
-        elif self.boostable is False:
-            result.append("not boostable")
-        elif self.boostable is None:
-            pass
-        else:
-            raise RuntimeError("unexpected value for Instruction.boostable")
-
         if self.depends_on:
             result.append("dep="+":".join(self.depends_on))
         if self.no_sync_with:
@@ -466,23 +395,8 @@ class InstructionBase(ImmutableRecord):

     # }}}

-    def copy(self, **kwargs):
-        if "insn_deps" in kwargs:
-            warn("insn_deps is deprecated, use depends_on",
-                    DeprecationWarning, stacklevel=2)
-
-            kwargs["depends_on"] = kwargs.pop("insn_deps")
-
-        if "insn_deps_is_final" in kwargs:
-            warn("insn_deps_is_final is deprecated, use depends_on",
-                    DeprecationWarning, stacklevel=2)
-
-            kwargs["depends_on_is_final"] = kwargs.pop("insn_deps_is_final")
-
-        return super(InstructionBase, self).copy(**kwargs)
-
     def __setstate__(self, val):
-        super(InstructionBase, self).__setstate__(val)
+        super().__setstate__(val)

         from loopy.tools import intern_frozenset_of_ids

@@ -582,7 +496,7 @@ class MemoryOrdering:  # noqa

 # {{{ memory_ordering, MemoryOrdering compatibility

-class _deprecated_memory_ordering_class_method(object):  # noqa
+class _deprecated_memory_ordering_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -592,7 +506,7 @@ class _deprecated_memory_ordering_class_method(object):  # noqa
         return self.f()


-class memory_ordering(object):  # noqa
+class memory_ordering:  # noqa
     """Deprecated. Use :class:`MemoryOrdering` instead.
     """

@@ -659,7 +573,7 @@ class MemoryScope:  # noqa

 # {{{ memory_scope, MemoryScope compatibility

-class _deprecated_memory_scope_class_method(object):  # noqa
+class _deprecated_memory_scope_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -669,7 +583,7 @@ class _deprecated_memory_scope_class_method(object):  # noqa
         return self.f()


-class memory_scope(object):  # noqa
+class memory_scope:  # noqa
     """Deprecated. Use :class:`MemoryScope` instead.
     """

@@ -702,7 +616,7 @@ class memory_scope(object):  # noqa

 # }}}


-class VarAtomicity(object):
+class VarAtomicity:
     """A base class for the description of how atomic access to :attr:`var_name`
     shall proceed.

@@ -747,13 +661,13 @@ class OrderedAtomic(VarAtomicity):
             :class:`pytools.persistent_dict.PersistentDict`.
""" - super(OrderedAtomic, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, str(self.__class__.__name__)) key_builder.rec(key_hash, self.ordering) key_builder.rec(key_hash, self.scope) def __eq__(self, other): - return (super(OrderedAtomic, self).__eq__(other) + return (super().__eq__(other) and self.ordering == other.ordering and self.scope == other.scope) @@ -762,7 +676,7 @@ class OrderedAtomic(VarAtomicity): raise NotImplementedError def __str__(self): - return "%s[%s]%s/%s" % ( + return "{}[{}]{}/{}".format( self.op_name, self.var_name, MemoryOrdering.to_string(self.ordering), @@ -781,11 +695,12 @@ class AtomicInit(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'init' + op_name = "init" class AtomicUpdate(OrderedAtomic): - """Properties of an atomic update. A subclass of :class:`OrderedAtomic`. + """Properties of an atomic update. A subclass of + :class:`OrderedAtomic`. .. attribute:: ordering @@ -795,7 +710,7 @@ class AtomicUpdate(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'update' + op_name = "update" class AtomicLoad(OrderedAtomic): @@ -809,7 +724,7 @@ class AtomicLoad(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'load' + op_name = "load" # }}} @@ -819,14 +734,14 @@ class AtomicLoad(OrderedAtomic): class MultiAssignmentBase(InstructionBase): """An assignment instruction with an expression as a right-hand side.""" - fields = InstructionBase.fields | set(["expression"]) - pymbolic_fields = InstructionBase.pymbolic_fields | set(["expression"]) + fields = InstructionBase.fields | {"expression"} + pymbolic_fields = InstructionBase.pymbolic_fields | {"expression"} @memoize_method def read_dependency_names(self): from loopy.symbolic import get_dependencies result = ( - super(MultiAssignmentBase, self).read_dependency_names() + super().read_dependency_names() | get_dependencies(self.expression)) for subscript_deps in self.assignee_subscript_deps(): @@ -908,7 +823,7 @@ class Assignment(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) - pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"]) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignee"} def __init__(self, assignee, expression, @@ -920,13 +835,11 @@ class Assignment(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_type=Optional(), atomicity=(), - priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, forced_iname_deps_is_final=None): + priority=0, predicates=frozenset()): - super(Assignment, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -935,15 +848,9 @@ class Assignment(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, - forced_iname_deps=forced_iname_deps, - forced_iname_deps_is_final=forced_iname_deps_is_final) + tags=tags) from loopy.symbolic import parse if isinstance(assignee, str): @@ -971,17 +878,20 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return 
(_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args, **kwargs): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignee=f(self.assignee, *args, **kwargs), - expression=f(self.expression, *args, **kwargs), + assignee=assignee_f(self.assignee), + expression=f(self.expression), predicates=frozenset( - f(pred, *args, **kwargs) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - result = "%s <- %s" % (self.assignee, self.expression) + result = f"{self.assignee} <- {self.expression}" if self.id is not None: result = "%s: " % self.id + result @@ -1013,7 +923,7 @@ class ExpressionInstruction(Assignment): warn("ExpressionInstruction is deprecated. Use Assignment instead", DeprecationWarning, stacklevel=2) - super(ExpressionInstruction, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) # }}} @@ -1044,7 +954,7 @@ class CallInstruction(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignees temp_var_types".split()) - pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignees"]) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignees"} def __init__(self, assignees, expression, @@ -1056,14 +966,11 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_types=None, - priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, - forced_iname_deps_is_final=None): + priority=0, predicates=frozenset()): - super(CallInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1072,15 +979,9 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, - forced_iname_deps=forced_iname_deps, - forced_iname_deps_is_final=forced_iname_deps_is_final) + tags=tags) from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction @@ -1128,17 +1029,20 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args, **kwargs): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignees=f(self.assignees, *args, **kwargs), - expression=f(self.expression, *args, **kwargs), + assignees=assignee_f(self.assignees), + expression=f(self.expression), predicates=frozenset( - f(pred, *args, **kwargs) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - result = "%s: %s <- %s" % (self.id, + result = "{}: {} <- {}".format(self.id, ", ".join(str(a) for a in self.assignees), self.expression) @@ -1159,7 +1063,7 @@ class CallInstruction(MultiAssignmentBase): from pymbolic.primitives import CallWithKwargs arg_id_to_val = dict(enumerate(self.expression.parameters)) if isinstance(self.expression, CallWithKwargs): - for kw, val in six.iteritems(self.expression.kw_parameters): + for kw, val in self.expression.kw_parameters.items(): arg_id_to_val[kw] = val for i, arg in 
enumerate(self.assignees): arg_id_to_val[-i-1] = arg @@ -1338,9 +1242,8 @@ class CInstruction(InstructionBase): groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, - priority=0, boostable=None, boostable_into=None, - predicates=frozenset(), tags=None, - insn_deps=None, insn_deps_is_final=None): + priority=0, + predicates=frozenset(), tags=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1359,11 +1262,7 @@ class CInstruction(InstructionBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, - priority=priority, predicates=predicates, tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final) + priority=priority, predicates=predicates, tags=tags) # {{{ normalize iname_exprs @@ -1406,7 +1305,7 @@ class CInstruction(InstructionBase): def read_dependency_names(self): result = ( - super(CInstruction, self).read_dependency_names() + super().read_dependency_names() | frozenset(self.read_variables)) from loopy.symbolic import get_dependencies @@ -1429,22 +1328,25 @@ class CInstruction(InstructionBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( iname_exprs=[ - (name, f(expr, *args)) + (name, f(expr)) for name, expr in self.iname_exprs], - assignees=[f(a, *args) for a in self.assignees], + assignees=[assignee_f(a) for a in self.assignees], predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - first_line = "%s: %s <- CODE(%s|%s)" % (self.id, + first_line = "{}: {} <- CODE({}|{})".format(self.id, ", ".join(str(a) for a in self.assignees), ", ".join(str(x) for x in self.read_variables), - ", ".join("%s=%s" % (name, expr) + ", ".join(f"{name}={expr}" for name, expr in self.iname_exprs)) options = self.get_str_options() @@ -1471,7 +1373,7 @@ class _DataObliviousInstruction(InstructionBase): def assignee_subscript_deps(self): return frozenset() - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): return self.copy( predicates=frozenset( f(pred) for pred in self.predicates)) @@ -1500,9 +1402,8 @@ class NoOpInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None): - super(NoOpInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1512,8 +1413,6 @@ class NoOpInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) @@ -1554,22 +1453,21 @@ class BarrierInstruction(_DataObliviousInstruction): ... 
lbarrier {mem_kind=global} """ - fields = _DataObliviousInstruction.fields | set(["synchronization_kind", - "mem_kind"]) + fields = _DataObliviousInstruction.fields | {"synchronization_kind", + "mem_kind"} def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None, synchronization_kind="global", mem_kind="local"): if predicates: raise LoopyError("conditional barriers are not supported") - super(BarrierInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1579,8 +1477,6 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags ) @@ -1589,12 +1485,13 @@ class BarrierInstruction(_DataObliviousInstruction): self.mem_kind = mem_kind def __str__(self): - first_line = "%s: ... %sbarrier" % (self.id, self.synchronization_kind[0]) + first_line = \ + "{}: ... {}barrier".format(self.id, self.synchronization_kind[0]) options = self.get_str_options() if self.synchronization_kind == "local": # add the memory kind - options += ['mem_kind={}'.format(self.mem_kind)] + options += [f"mem_kind={self.mem_kind}"] if options: first_line += " {%s}" % (": ".join(options)) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ead996445844e1cc3d09b5a7683b40201dcb6d34..84792cb4b1d137155378026ff21a3accb4680dc5 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1,7 +1,5 @@ -# coding=utf-8 """Operations on the kernel object.""" -from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,8 +25,7 @@ THE SOFTWARE. 
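As an aside to the `with_transformed_expressions` hunks above: the new `assignee_f` keyword lets a caller rewrite right-hand sides (and predicates) while leaving assignees untouched. A minimal, illustrative sketch, not part of the patch; the instruction and the doubling transform are made up, and it assumes `Assignment`'s existing behavior of parsing string arguments into pymbolic expressions:

    from loopy.kernel.instruction import Assignment

    # string assignee/expression are parsed into pymbolic expressions
    insn = Assignment("tmp", "2*a + b")

    # double the RHS; the identity assignee_f keeps the LHS as-is
    insn2 = insn.with_transformed_expressions(
            lambda expr: 2*expr, assignee_f=lambda expr: expr)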
import sys -import six -from six.moves import intern +from sys import intern import numpy as np import islpy as isl @@ -60,7 +57,6 @@ def add_dtypes(program, dtype_dict): if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - root_kernel root_kernel_with_added_dtypes = ( root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) @@ -68,21 +64,21 @@ def add_dtypes(program, dtype_dict): return program.with_root_kernel(root_kernel_with_added_dtypes) -def _add_dtypes_overdetermined(knl, dtype_dict): - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) +def _add_dtypes_overdetermined(kernel, dtype_dict): + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict) # do not throw error for unused args - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + return kernel.copy(args=new_args, temporary_variables=new_temp_vars) -def _add_dtypes(knl, dtype_dict): +def _add_dtypes(kernel, dtype_dict): dtype_dict = dtype_dict.copy() new_args = [] from loopy.types import to_loopy_type - for arg in knl.args: + for arg in kernel.args: new_dtype = dtype_dict.pop(arg.name, None) if new_dtype is not None: - new_dtype = to_loopy_type(new_dtype, target=knl.target) + new_dtype = to_loopy_type(new_dtype, target=kernel.target) if arg.dtype is not None and arg.dtype != new_dtype: raise RuntimeError( "argument '%s' already has a different dtype " @@ -92,10 +88,10 @@ def _add_dtypes(knl, dtype_dict): new_args.append(arg) - new_temp_vars = knl.temporary_variables.copy() + new_temp_vars = kernel.temporary_variables.copy() import loopy as lp - for tv_name in knl.temporary_variables: + for tv_name in kernel.temporary_variables: new_dtype = dtype_dict.pop(tv_name, None) if new_dtype is not None: new_dtype = np.dtype(new_dtype) @@ -112,8 +108,8 @@ def _add_dtypes(knl, dtype_dict): return dtype_dict, new_args, new_temp_vars -def get_arguments_with_incomplete_dtype(knl): - return [arg.name for arg in knl.args +def get_arguments_with_incomplete_dtype(kernel): + return [arg.name for arg in kernel.args if arg.dtype is None] @@ -121,7 +117,7 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): assert isinstance(prog, Program) processed_dtype_dict = {} - for k, v in six.iteritems(dtype_dict): + for k, v in dtype_dict.items(): for subkey in k.split(","): subkey = subkey.strip() if subkey: @@ -133,11 +129,11 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): return infer_unknown_types(prog, expect_completion=expect_completion) -def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): - knl = _add_dtypes_overdetermined(knl, dtype_dict) +def _add_and_infer_dtypes_overdetermined(kernel, dtype_dict): + kernel = _add_dtypes_overdetermined(kernel, dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=True) + return infer_unknown_types(kernel, expect_completion=True) # }}} @@ -300,7 +296,7 @@ def find_all_insn_inames(kernel): logger.debug("%s: find_all_insn_inames: done" % kernel.name) - for v in six.itervalues(insn_id_to_inames): + for v in insn_id_to_inames.values(): assert isinstance(v, frozenset) return insn_id_to_inames @@ -310,53 +306,65 @@ def find_all_insn_inames(kernel): # {{{ set operation cache +def _eliminate_except(set_, except_inames, dts): + return set_.eliminate_except(except_inames, dts) + + +def _get_dim_max(set_, idx): + return set_.dim_max(idx) + + +def _get_dim_min(set_, idx): + return 
set_.dim_min(idx) + + class SetOperationCacheManager: def __init__(self): - # mapping: set hash -> [(set, op, args, result)] + # mapping: set hash -> [(set, result)] self.cache = {} - def op(self, set, op_name, op, args): - hashval = hash(set) + def op(self, set_, op, args): + hashval = hash((set_, op, args)) bucket = self.cache.setdefault(hashval, []) - for bkt_set, bkt_op, bkt_args, result in bucket: - if set.plain_is_equal(bkt_set) and op == bkt_op and args == bkt_args: + for bkt_set, result in bucket: + if set_.plain_is_equal(bkt_set): return result - #print op, set.get_dim_name(dim_type.set, args[0]) - result = op(set, *args) - bucket.append((set, op_name, args, result)) + result = op(set_, *args) + bucket.append((set_, result)) return result - def dim_min(self, set, *args): - if set.plain_is_empty(): - raise LoopyError("domain '%s' is empty" % set) + def dim_min(self, set_, *args): + if set_.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set_) + + return self.op(set_, _get_dim_min, args) - from loopy.isl_helpers import dim_min_with_elimination - return self.op(set, "dim_min", dim_min_with_elimination, args) + def dim_max(self, set_, *args): + if set_.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set_) - def dim_max(self, set, *args): - if set.plain_is_empty(): - raise LoopyError("domain '%s' is empty" % set) + return self.op(set_, _get_dim_max, args) - from loopy.isl_helpers import dim_max_with_elimination - return self.op(set, "dim_max", dim_max_with_elimination, args) + def eliminate_except(self, set_, *args): + return self.op(set_, _eliminate_except, args) - def base_index_and_length(self, set, iname, context=None, + def base_index_and_length(self, set_, iname, context=None, n_allowed_params_in_length=None): """ :arg n_allowed_params_in_length: Simplifies the 'length' argument so that only the first that many params - (in the domain of *set*) occur. + (in the domain of *set_*) occur. """ if not isinstance(iname, int): - iname_to_dim = set.space.get_var_dict() + iname_to_dim = set_.space.get_var_dict() idx = iname_to_dim[iname][1] else: idx = iname - lower_bound_pw_aff = self.dim_min(set, idx) - upper_bound_pw_aff = self.dim_max(set, idx) + lower_bound_pw_aff = self.dim_min(set_, idx) + upper_bound_pw_aff = self.dim_max(set_, idx) from loopy.diagnostic import StaticValueFindingError from loopy.isl_helpers import ( @@ -469,7 +477,7 @@ class DomainChanger: @iterate_over_kernels_if_given_program def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, use_insn_id=False): - """Return a string in the `dot `_ language depicting + """Return a string in the `dot `_ language depicting dependencies among kernel instructions. """ @@ -495,8 +503,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): - lhs = ', '.join(str(assignee) for assignee in insn.assignees) - op = "%s <- %s" % (lhs, insn.expression) + lhs = ", ".join(str(assignee) for assignee in insn.assignees) + op = f"{lhs} <- {insn.expression}" if len(op) > 200: op = op[:200] + "..." 
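The `get_dot_dependency_graph` changes above are cosmetic, but for context, a hypothetical usage sketch under this branch's transitional signature; `knl` and `callables_table` are assumed to already exist, e.g. from a preprocessed program:

    from loopy.kernel.tools import get_dot_dependency_graph

    # iname_cluster=True walks kernel.schedule, so pass False for a
    # kernel that has not been scheduled/linearized yet
    dot_source = get_dot_dependency_graph(
            knl, callables_table, iname_cluster=False)

    with open("insn-deps.dot", "w") as outf:
        outf.write(dot_source)
    # render with Graphviz, e.g.: dot -Tsvg insn-deps.dot -o insn-deps.svg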
@@ -512,7 +520,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, insn_label = op tooltip = insn.id - lines.append("\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];" + lines.append('"%s" [label="%s",shape="box",tooltip="%s"];' % ( insn.id, repr(insn_label)[1:-1], @@ -547,7 +555,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for insn_1 in dep_graph: for insn_2 in dep_graph.get(insn_1, set()): - lines.append("%s -> %s" % (insn_2, insn_1)) + lines.append(f"{insn_2} -> {insn_1}") if iname_cluster: from loopy.schedule import ( @@ -556,7 +564,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for sched_item in kernel.schedule: if isinstance(sched_item, EnterLoop): - lines.append("subgraph cluster_%s { label=\"%s\"" + lines.append('subgraph cluster_%s { label="%s"' % (sched_item.iname, sched_item.iname)) elif isinstance(sched_item, LeaveLoop): lines.append("}") @@ -567,7 +575,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, else: raise LoopyError("schedule item not understood: %r" % sched_item) - return "digraph %s {\n%s\n}" % ( + return "digraph {} {{\n{}\n}}".format( kernel.name, "\n".join(lines) ) @@ -689,9 +697,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # {{{ figure out automatic-axis inames from loopy.kernel.data import AutoLocalIndexTagBase - auto_axis_inames = set( + auto_axis_inames = { iname for iname in kernel.insn_inames(insn) - if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)) + if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)} # }}} @@ -730,7 +738,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): if stride is None: continue coeffs = CoefficientCollector()(iexpr_i) - for var, coeff in six.iteritems(coeffs): + for var, coeff in coeffs.items(): if (isinstance(var, Variable) and var.name in auto_axis_inames): # excludes '1', i.e. the constant @@ -742,7 +750,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} from pymbolic import evaluate - for iname, stride_expr in six.iteritems(iname_to_stride_expr): + for iname, stride_expr in iname_to_stride_expr.items(): stride = evaluate(stride_expr, approximate_arg_values) aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride @@ -954,7 +962,7 @@ def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): # {{{ array modifier -class ArrayChanger(object): +class ArrayChanger: def __init__(self, kernel, array_name): self.kernel = kernel self.array_name = array_name @@ -1036,8 +1044,8 @@ def guess_var_shape(kernel, var_name): % (var_name, ", ".join( str(i) for i in armap.bad_subscripts))) - n_axes_in_subscripts = set( - len(sub.index_tuple) for sub in armap.bad_subscripts) + n_axes_in_subscripts = { + len(sub.index_tuple) for sub in armap.bad_subscripts} if len(n_axes_in_subscripts) != 1: raise RuntimeError("subscripts of '%s' with differing " @@ -1088,7 +1096,7 @@ def guess_var_shape(kernel, var_name): # {{{ loop nest tracker -class SetTrie(object): +class SetTrie: """ Similar to a trie, but uses an unordered sequence as the key. 
""" @@ -1103,9 +1111,8 @@ class SetTrie(object): def descend(self, on_found=lambda prefix: None, prefix=frozenset()): on_found(prefix) - from six import iteritems for prefix, child in sorted( - iteritems(self.children), + self.children.items(), key=lambda it: sorted(it[0])): child.descend(on_found, prefix=prefix) @@ -1117,9 +1124,7 @@ class SetTrie(object): if len(key) == 0: return - from six import iteritems - - for child_key, child in iteritems(self.children): + for child_key, child in self.children.items(): common = child_key & key if common: break @@ -1178,16 +1183,16 @@ def get_visual_iname_order_embedding(kernel): iname_trie = SetTrie() for insn in kernel.instructions: - within_inames = set( + within_inames = { iname for iname in insn.within_inames - if iname not in ilp_inames) + if iname not in ilp_inames} iname_trie.add_or_update(within_inames) embedding = {} def update_embedding(inames): embedding.update( - dict((iname, (len(embedding), iname)) for iname in inames)) + {iname: (len(embedding), iname) for iname in inames}) iname_trie.descend(update_embedding) @@ -1288,8 +1293,8 @@ def draw_dependencies_as_unicode_arrows( def make_extender(): result = n_columns[0] * [" "] - for col, (_, pointed_at_insn_id) in six.iteritems(columns_in_use): - result[col] = do_flag_downward(u"│", pointed_at_insn_id) + for col, (_, pointed_at_insn_id) in columns_in_use.items(): + result[col] = do_flag_downward("│", pointed_at_insn_id) return result @@ -1321,28 +1326,28 @@ def draw_dependencies_as_unicode_arrows( # }}} - for col, (starts, pointed_at_insn_id) in list(six.iteritems(columns_in_use)): + for col, (starts, pointed_at_insn_id) in list(columns_in_use.items()): if insn.id == pointed_at_insn_id: if starts: # will continue downward - row[col] = do_flag_downward(u">", pointed_at_insn_id) + row[col] = do_flag_downward(">", pointed_at_insn_id) else: # stops here # placeholder, pending deletion columns_in_use[col] = None - row[col] = do_flag_downward(u"↳", pointed_at_insn_id) + row[col] = do_flag_downward("↳", pointed_at_insn_id) elif insn.id in starts: starts.remove(insn.id) if starts or pointed_at_insn_id not in processed_ids: # will continue downward - row[col] = do_flag_downward(u"├", pointed_at_insn_id) + row[col] = do_flag_downward("├", pointed_at_insn_id) else: # stops here - row[col] = u"└" + row[col] = "└" # placeholder, pending deletion columns_in_use[col] = None @@ -1352,7 +1357,7 @@ def draw_dependencies_as_unicode_arrows( if dep_key not in dep_to_column and rdeps: col = dep_to_column[dep_key] = find_free_column() columns_in_use[col] = (rdeps, insn.id) - row[col] = u"↱" + row[col] = "↱" # }}} @@ -1368,13 +1373,13 @@ def draw_dependencies_as_unicode_arrows( # we're currently handling it. columns_in_use[col] = (set(), dep) - row[col] = do_flag_downward(u"┌", dep) + row[col] = do_flag_downward("┌", dep) # }}} # {{{ delete columns_in_use entry for end-of-life columns - for col, value in list(six.iteritems(columns_in_use)): + for col, value in list(columns_in_use.items()): if value is None: del columns_in_use[col] @@ -1398,7 +1403,7 @@ def draw_dependencies_as_unicode_arrows( .replace(style.RESET_ALL, "")) return len(s) - def truncate_without_color_escapes(s, l): + def truncate_without_color_escapes(s, length): # FIXME: This is a bit dumb--it removes color escapes when truncation # is needed. 
@@ -1406,7 +1411,7 @@ def draw_dependencies_as_unicode_arrows( .replace(fore.RED, "") .replace(style.RESET_ALL, "")) - return s[:l] + u"…" + return s[:length] + "…" def conform_to_uniform_length(s): len_s = len_without_color_escapes(s) @@ -1445,6 +1450,8 @@ def stringify_instruction_list(kernel): def insert_insn_into_order(insn): if insn.id in printed_insn_ids: + # Note: dependency cycles are deliberately ignored so that printing + # succeeds. return printed_insn_ids.add(insn.id) @@ -1523,12 +1530,12 @@ def stringify_instruction_list(kernel): trailing = [] elif isinstance(insn, lp.CInstruction): lhs = ", ".join(str(a) for a in insn.assignees) - rhs = "CODE(%s|%s)" % ( + rhs = "CODE({}|{})".format( ", ".join(str(x) for x in insn.read_variables), - ", ".join("%s=%s" % (name, expr) + ", ".join(f"{name}={expr}" for name, expr in insn.iname_exprs)) - trailing = [l for l in insn.code.split("\n")] + trailing = insn.code.split("\n") elif isinstance(insn, lp.BarrierInstruction): lhs = "" rhs = "... %sbarrier" % insn.synchronization_kind[0] @@ -1562,11 +1569,11 @@ def stringify_instruction_list(kernel): options.append("no_sync_with=%s" % ":".join( "%s@%s" % entry for entry in sorted(insn.no_sync_with))) if isinstance(insn, lp.BarrierInstruction) and \ - insn.synchronization_kind == 'local': - options.append('mem_kind=%s' % insn.mem_kind) + insn.synchronization_kind == "local": + options.append("mem_kind=%s" % insn.mem_kind) if lhs: - core = "%s = %s" % ( + core = "{} = {}".format( Fore.CYAN+lhs+Style.RESET_ALL, Fore.MAGENTA+rhs+Style.RESET_ALL, ) @@ -1600,6 +1607,13 @@ def stringify_instruction_list(kernel): # {{{ global barrier order finding +def _is_global_barrier(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" + + @memoize_on_first_arg def get_global_barrier_order(kernel): """Return a :class:`tuple` of the listing the ids of global barrier instructions @@ -1607,49 +1621,27 @@ def get_global_barrier_order(kernel): See also :class:`loopy.instruction.BarrierInstruction`. """ - barriers = [] - visiting = set() - visited = set() - - unvisited = set(insn.id for insn in kernel.instructions) - - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - - while unvisited: - stack = [unvisited.pop()] - - while stack: - top = stack[-1] - - if top in visiting: - visiting.remove(top) - if is_barrier(top): - barriers.append(top) + dep_graph = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + dep_graph[dep].add(insn.id) - if top in visited: - stack.pop() - continue + from pytools.graph import compute_topological_order + order = compute_topological_order(dep_graph) - visited.add(top) - visiting.add(top) + barriers = [ + insn_id for insn_id in order + if _is_global_barrier(kernel, insn_id)] - for child in kernel.id_to_insn[top].depends_on: - # Check for no cycles. - assert child not in visiting - stack.append(child) + del order # Ensure this is the only possible order. # # We do this by looking at the barriers in order. # We check for each adjacent pair (a,b) in the order if a < b, # i.e. if a is reachable by a chain of dependencies from b. 
- - visiting.clear() - visited.clear() + visited = set() + visiting = set() for prev_barrier, barrier in zip(barriers, barriers[1:]): # Check if prev_barrier is reachable from barrier. @@ -1707,22 +1699,16 @@ def find_most_recent_global_barrier(kernel, insn_id): if len(insn.depends_on) == 0: return None - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - - global_barrier_to_ordinal = dict( - (b, i) for i, b in enumerate(global_barrier_order)) + global_barrier_to_ordinal = { + b: i for i, b in enumerate(global_barrier_order)} def get_barrier_ordinal(barrier_id): return (global_barrier_to_ordinal[barrier_id] if barrier_id is not None else -1) - direct_barrier_dependencies = set( - dep for dep in insn.depends_on if is_barrier(dep)) + direct_barrier_dependencies = { + dep for dep in insn.depends_on if _is_global_barrier(kernel, dep)} if len(direct_barrier_dependencies) > 0: return max(direct_barrier_dependencies, key=get_barrier_ordinal) @@ -1744,8 +1730,8 @@ def get_subkernels(kernel): See also :class:`loopy.schedule.CallKernel`. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: - raise LoopyError("Kernel must be scheduled") + if kernel.state != KernelState.LINEARIZED: + raise LoopyError("Kernel must be linearized") from loopy.schedule import CallKernel @@ -1761,7 +1747,7 @@ def get_subkernel_to_insn_id_map(kernel): - kernel must be scheduled. + kernel must be linearized. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: - raise LoopyError("Kernel must be scheduled") + raise LoopyError("Kernel must be linearized") from loopy.schedule import ( @@ -1792,7 +1778,7 @@ def get_subkernel_to_insn_id_map(kernel): # {{{ find aliasing equivalence classes -class DisjointSets(object): +class DisjointSets: """ .. automethod:: __getitem__ .. 
automethod:: find_leader_or_create_group @@ -1814,7 +1800,7 @@ class DisjointSets(object): try: leader = self.element_to_leader[item] except KeyError: - return set([item]) + return {item} else: return self.leader_to_group[leader] @@ -1825,7 +1811,7 @@ class DisjointSets(object): pass self.element_to_leader[el] = el - self.leader_to_group[el] = set([el]) + self.leader_to_group[el] = {el} return el def union(self, a, b): @@ -1864,7 +1850,7 @@ class DisjointSets(object): def find_aliasing_equivalence_classes(kernel): return DisjointSets().union_many( (tv.base_storage, tv.name) - for tv in six.itervalues(kernel.temporary_variables) + for tv in kernel.temporary_variables.values() if tv.base_storage is not None) # }}} @@ -1991,8 +1977,8 @@ class CallCollector(CombineMapper): def map_call_with_kwargs(self, expr): return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) def map_constant(self, expr): return frozenset() diff --git a/loopy/library/function.py b/loopy/library/function.py index 378b7de5897912e2e04314b066f40e5ea6b0c785..291f0c372bdac74a79f25da361bb381c5646ed58 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -38,8 +36,8 @@ class MakeTupleCallable(ScalarCallable): def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor - new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), - (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), @@ -48,8 +46,8 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, callables_table): - new_arg_id_to_dtype = dict((i, dtype) for i, dtype in - arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype = {i: dtype for i, dtype in + arg_id_to_dtype.items() if dtype is not None} new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/library/random123.py b/loopy/library/random123.py index e59a892bb4c7b3bd7222bf61b29e0ade92195240..6ec8affe35982c1412112fd07f93458cb6a63cde 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -1,6 +1,5 @@ """Library integration with Random123.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -63,12 +62,12 @@ RNG_VARIANTS = [ _threefry_base_info.copy(width=4, bits=64), ] -FUNC_NAMES_TO_RNG = dict( - (v.full_name + suffix, v) +FUNC_NAMES_TO_RNG = { + v.full_name + suffix: v for v in RNG_VARIANTS for suffix in [ "", "_f32", "_f64", - ]) + ]} # }}} diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 5b78c08f4d3588123a6eaf1d6dccda239ef6fed7..f44d243230fb31264a7e2a588e6086b2173daa2a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -33,8 +31,24 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from 
loopy.tools import update_persistent_hash +__doc__ = """ +.. currentmodule:: loopy.library.reduction + +.. autoclass:: ReductionOperation + +.. autoclass:: ScalarReductionOperation + +.. autoclass:: SumReductionOperation + +.. autoclass:: ProductReductionOperation + +.. autoclass:: MaxReductionOperation + +.. autoclass:: MinReductionOperation +""" + -class ReductionOperation(object): +class ReductionOperation: """Subclasses of this type have to be hashable, picklable, and equality-comparable. """ @@ -122,7 +136,7 @@ class ScalarReductionOperation(ReductionOperation): result = type(self).__name__.replace("ReductionOperation", "").lower() if self.forced_result_type is not None: - result = "%s<%s>" % (result, str(self.forced_result_type)) + result = "{}<{}>".format(result, str(self.forced_result_type)) return result @@ -154,11 +168,11 @@ def get_le_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MAX") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MAX') + # 64 bit integer + return var("LONG_MAX") else: raise NotImplementedError("less") @@ -172,11 +186,11 @@ def get_ge_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MIN") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MIN') + # 64 bit integer + return var("LONG_MIN") else: raise NotImplementedError("less") @@ -255,7 +269,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return 2 def prefix(self, scalar_dtype, segment_flag_dtype): - return "loopy_segmented_%s_%s_%s" % (self.which, + return "loopy_segmented_{}_{}_{}".format(self.which, scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) @@ -328,7 +342,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): raise NotImplementedError def prefix(self, scalar_dtype, index_dtype): - return "loopy_arg%s_%s_%s" % (self.which, + return "loopy_arg{}_{}_{}".format(self.which, scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) @@ -406,7 +420,7 @@ _REDUCTION_OP_PARSERS = [ def register_reduction_parser(parser): - """Register a new :class:`ReductionOperation`. + """Register a new :class:`loopy.library.reduction.ReductionOperation`. :arg parser: A function that receives a string and returns a subclass of ReductionOperation. 
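A sketch of how the `register_reduction_parser` hook documented above might be used; `MyOpReductionOperation` is hypothetical, and the parser is assumed (like the built-in parsers) to return None for names it does not recognize:

    from loopy.library.reduction import register_reduction_parser

    def parse_myop(name):
        if name == "myop":
            return MyOpReductionOperation()  # hypothetical subclass
        return None

    register_reduction_parser(parse_myop)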
@@ -472,28 +486,28 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, index_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {index_t} index1, + {scalar_t} op2, {index_t} index2, + {index_t} *index_out) + {{ + if (op2 {comp} op1) + {{ *index_out = index2; return op2; - } + }} else - { + {{ *index_out = index1; return op1; - } - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) elif isinstance(self.name, SegmentedOp): op = self.name.reduction_op scalar_dtype = self.arg_id_to_dtype[-1] @@ -501,20 +515,20 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, segment_flag_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + return segment_flag2 ? op2 : {combined}; + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) return diff --git a/loopy/loop.py b/loopy/loop.py index 24cbe730f7679ba9b9931f7493d3c793ce3718c9..73ca8d72824071b36bf91798ba9a1ea14e624db7 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -24,7 +22,6 @@ THE SOFTWARE. import islpy as isl -import six from loopy.program import iterate_over_kernels_if_given_program @@ -71,7 +68,7 @@ def merge_loop_domains(kernel): new_domains = None - for inner_iname, outer_inames in six.iteritems(lnm): + for inner_iname, outer_inames in lnm.items(): for outer_iname in outer_inames: # {{{ check if it's safe to merge diff --git a/loopy/match.py b/loopy/match.py index 9766fac2b57f5cb55eebb09f4ab32880ef3c2038..f13d56053c7e87333192cc3980a26fc2c18f7a51 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -1,7 +1,6 @@ """Matching functionality for instruction ids and substitution rule invocations stacks.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range, intern +from sys import intern NoneType = type(None) @@ -116,7 +115,7 @@ _PREC_NOT = 30 # {{{ match expression -class MatchExpressionBase(object): +class MatchExpressionBase: def __call__(self, kernel, matchable): raise NotImplementedError @@ -162,7 +161,7 @@ class MultiChildMatchExpressionBase(MatchExpressionBase): return "(%s)" % (joiner.join(str(ch) for ch in self.children)) def __repr__(self): - return "%s(%s)" % ( + return "{}({})".format( type(self).__name__, ", ".join(repr(ch) for ch in self.children)) @@ -199,7 +198,7 @@ class Not(MatchExpressionBase): return "(not %s)" % str(self.child) def __repr__(self): - return "%s(%r)" % (type(self).__name__, self.child) + return "{}({!r})".format(type(self).__name__, self.child) def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "not_match_expr") @@ -226,7 +225,7 @@ class GlobMatchExpressionBase(MatchExpressionBase): return descr.lower() + ":" + self.glob def __repr__(self): - return "%s(%r)" % (type(self).__name__, self. glob) + return "{}({!r})".format(type(self).__name__, self. glob) def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, type(self).__name__) @@ -386,7 +385,7 @@ def parse_match(expr): # {{{ stack match objects -class StackMatchComponent(object): +class StackMatchComponent: def __ne__(self, other): return not self.__eq__(other) @@ -455,7 +454,7 @@ class StackWildcardMatchComponent(StackMatchComponent): # {{{ stack matcher -class RuleInvocationMatchable(object): +class RuleInvocationMatchable: def __init__(self, id, tags): self.id = id self.tags = tags @@ -470,7 +469,7 @@ class RuleInvocationMatchable(object): raise TypeError("inames: query may not be applied to rule invocations") -class StackMatch(object): +class StackMatch: def __init__(self, root_component): self.root_component = root_component diff --git a/loopy/maxima.py b/loopy/maxima.py deleted file mode 100644 index c74360a731fa06644065e743fb9397ea170fb7f3..0000000000000000000000000000000000000000 --- a/loopy/maxima.py +++ /dev/null @@ -1,105 +0,0 @@ -# pylint: disable=all # This code needs porting to modern loopy -"""Export to maxima.""" - -from __future__ import division - -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-""" - - -from pymbolic.interop.maxima import \ - MaximaStringifyMapper as MaximaStringifyMapperBase - - -class MaximaStringifyMapper(MaximaStringifyMapperBase): - def map_subscript(self, expr, enclosing_prec): - res = self.rec(expr.aggregate, enclosing_prec) - idx = expr.index - if not isinstance(idx, tuple): - idx = (idx,) - for i in idx: - if isinstance(i, int): - res += "_%d" % i - - return res - - -def get_loopy_instructions_as_maxima(kernel, prefix): - """Sample use for code comparison:: - - load("knl-optFalse.mac"); - load("knl-optTrue.mac"); - - vname: bessel_j_8; - - un_name : concat(''un_, vname); - opt_name : concat(''opt_, vname); - - print(ratsimp(ev(un_name - opt_name))); - """ - from loopy.preprocess import add_boostability_and_automatic_dependencies - kernel = add_boostability_and_automatic_dependencies(kernel) - - my_variable_names = ( - avn - for insn in kernel.instructions - for avn in insn.assignee_var_names() - ) - - from pymbolic import var - subst_dict = dict( - (vn, var(prefix+vn)) for vn in my_variable_names) - - mstr = MaximaStringifyMapper() - from loopy.symbolic import SubstitutionMapper - from pymbolic.mapper.substitutor import make_subst_func - substitute = SubstitutionMapper(make_subst_func(subst_dict)) - - result = ["ratprint:false;"] - - written_insn_ids = set() - - from loopy.kernel import InstructionBase, Assignment - - def write_insn(insn): - if not isinstance(insn, InstructionBase): - insn = kernel.id_to_insn[insn] - if not isinstance(insn, Assignment): - raise RuntimeError("non-single-output assignment not supported " - "in maxima export") - - for dep in insn.depends_on: - if dep not in written_insn_ids: - write_insn(dep) - - aname, = insn.assignee_var_names() - result.append("%s%s : %s;" % ( - prefix, aname, - mstr(substitute(insn.expression)))) - - written_insn_ids.add(insn.id) - - for insn in kernel.instructions: - if insn.id not in written_insn_ids: - write_insn(insn) - - return "\n".join(result) diff --git a/loopy/options.py b/loopy/options.py index 63089d94d3487e77a1def39a98fe24631c508398..2dc8f22cd8a205da89d86b5157af8792a37111ed 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from pytools import ImmutableRecord import re @@ -31,7 +28,7 @@ import re ALLOW_TERMINAL_COLORS = True -class _ColoramaStub(object): +class _ColoramaStub: def __getattribute__(self, name): return "" @@ -39,7 +36,7 @@ class _ColoramaStub(object): def _apply_legacy_map(lmap, kwargs): result = {} - for name, val in six.iteritems(kwargs): + for name, val in kwargs.items(): try: lmap_value = lmap[name] except KeyError: @@ -89,12 +86,6 @@ class Options(ImmutableRecord): Like :attr:`trace_assignments`, but also trace the assigned values. - .. attribute:: ignore_boostable_into - - Ignore the boostable_into field of the kernel, when - determining whether an iname duplication is necessary - for the kernel to be schedulable. - .. attribute:: check_dep_resolution Whether loopy should issue an error if a dependency @@ -117,7 +108,7 @@ class Options(ImmutableRecord): .. attribute:: cl_exec_manage_array_events Within the PyOpenCL executor, respect and udpate - :attr:`pyopencl.array.Array.event`. + :attr:`pyopencl.array.Array.events`. Defaults to *True*. @@ -146,7 +137,7 @@ class Options(ImmutableRecord): .. 
attribute:: edit_code Invoke an editor (given by the environment variable - :envvar:`EDITOR`) on the generated kernel code, + ``EDITOR``) on the generated kernel code, allowing for tweaks before the code is passed on to the target for compilation. @@ -211,7 +202,6 @@ class Options(ImmutableRecord): annotate_inames=kwargs.get("annotate_inames", False), trace_assignments=kwargs.get("trace_assignments", False), trace_assignment_values=kwargs.get("trace_assignment_values", False), - ignore_boostable_into=kwargs.get("ignore_boostable_into", False), skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), @@ -228,7 +218,7 @@ class Options(ImmutableRecord): check_dep_resolution=kwargs.get("check_dep_resolution", True), enforce_variable_access_ordered=kwargs.get( - "enforce_variable_access_ordered", False), + "enforce_variable_access_ordered", True), ) # {{{ legacy compatibility diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 504b361fb001f6683d6ee8837d7af1c3b51d83ef..0d55d5c92bd9c43219e2d57c3a20ac8248856dfb 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. import logging logger = logging.getLogger(__name__) -import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) @@ -39,6 +36,7 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types from loopy.symbolic import RuleAwareIdentityMapper +from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -69,7 +67,7 @@ def prepare_for_caching(kernel): new_args.append(arg) new_temporary_variables = {} - for name, temp in six.iteritems(kernel.temporary_variables): + for name, temp in kernel.temporary_variables.items(): dtype = temp.dtype if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt) @@ -127,7 +125,7 @@ def check_reduction_iname_uniqueness(kernel): for insn in kernel.instructions: insn.with_transformed_expressions(cb_mapper) - for iname, count in six.iteritems(iname_to_reduction_count): + for iname, count in iname_to_reduction_count.items(): nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0) if nonsimul_count and count > 1: @@ -146,18 +144,18 @@ def check_reduction_iname_uniqueness(kernel): # {{{ decide temporary address space def _get_compute_inames_tagged(kernel, insn, tag_base): - return set(iname for iname in kernel.insn_inames(insn.id) - if kernel.iname_tags_of_type(iname, tag_base)) + return {iname for iname in kernel.insn_inames(insn.id) + if kernel.iname_tags_of_type(iname, tag_base)} def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): - return set(iname + return {iname for aname, adeps in zip( insn.assignee_var_names(), insn.assignee_subscript_deps()) for iname in adeps & kernel.all_inames() if aname in tv_names - if kernel.iname_tags_of_type(iname, tag_base)) + if kernel.iname_tags_of_type(iname, tag_base)} def find_temporary_address_space(kernel): @@ -174,7 +172,7 @@ def find_temporary_address_space(kernel): kernel_var_names = kernel.all_variable_names(include_temp_storage=False) - for temp_var in 
six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): if temp_var.base_storage is not None: # no nesting allowed if temp_var.base_storage in kernel_var_names: @@ -185,7 +183,7 @@ def find_temporary_address_space(kernel): base_storage_to_aliases.setdefault( temp_var.base_storage, []).append(temp_var.name) - for temp_var in six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): # Only fill out for variables that do not yet know if they're # local. (I.e. those generated by implicit temporary generation.) @@ -237,7 +235,7 @@ def find_temporary_address_space(kernel): if (apin != cpin and bool(apin)): warn_with_kernel( kernel, - "write_race_%s(%s)" % (aspace_descr, insn_id), + f"write_race_{aspace_descr}({insn_id})", "instruction '%s' looks invalid: " "it assigns to indices based on %s IDs, but " "its temporary '%s' cannot be made %s because " @@ -452,7 +450,7 @@ def _try_infer_scan_candidate_from_expr( if len(expr.inames) != 1: raise ValueError( - "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + "Multiple inames in reduction: '{}'".format(", ".join(expr.inames))) scan_iname, = expr.inames @@ -501,9 +499,9 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): sweep_iname_candidate = None for constr in constrs: - candidate_vars = set([ + candidate_vars = { var for var in constr.get_var_dict() - if var in candidate_inames]) + if var in candidate_inames} # Irrelevant constraint - skip if scan_iname not in candidate_vars: @@ -720,13 +718,12 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): new_or_updated_instructions = {} new_temporaries = {} - dep_map = dict( - (insn.id, insn.depends_on) for insn in kernel.instructions) + dep_map = { + insn.id: insn.depends_on for insn in kernel.instructions} - inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions) + inverse_dep_map = {insn.id: set() for insn in kernel.instructions} - import six - for insn_id, deps in six.iteritems(dep_map): + for insn_id, deps in dep_map.items(): for dep in deps: inverse_dep_map[dep].add(insn_id) @@ -892,6 +889,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +@remove_any_newly_unused_inames def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -952,7 +950,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # items that are not "plain" tuples here. 
if not isinstance(inner_expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, "_".join(expr.inames))) + "{}_{}_get".format(insn.id, "_".join(expr.inames))) inner_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1037,7 +1035,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( - "%s_%s_init" % (insn.id, "_".join(expr.inames))) + "{}_{}_init".format(insn.id, "_".join(expr.inames))) init_insn = make_assignment( id=init_id, @@ -1051,20 +1049,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, generated_insns.append(init_insn) update_id = insn_id_gen( - based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames) if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | set(expr.inames) - reduction_insn_depends_on = set([init_id]) + reduction_insn_depends_on = {init_id} # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, "_".join(expr.inames))) + "{}_{}_get".format(insn.id, "_".join(expr.inames))) reduction_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1113,7 +1111,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, static_max_of_pw_aff( kernel.get_iname_bounds(iname).size, constants_only=True)) - assert isinstance(size, six.integer_types) + assert isinstance(size, int) return size def _make_slab_set(iname, size): @@ -1184,7 +1182,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = outer_insn_inames - frozenset(expr.inames) neutral = expr.operation.neutral_element(*arg_dtypes) - init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) + init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1198,7 +1196,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_insn) - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) + init_neutral_id = insn_id_gen(f"{insn.id}_{red_iname}_init_neutral") init_neutral_insn = make_assignment( id=init_neutral_id, assignees=tuple(var(nvn) for nvn in neutral_var_names), @@ -1210,14 +1208,14 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_neutral_insn) - transfer_depends_on = set([init_neutral_id, init_id]) + transfer_depends_on = {init_neutral_id, init_id} # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. 
if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, red_iname)) + f"{insn.id}_{red_iname}_get") reduction_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1235,7 +1233,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: reduction_expr = expr.expr - transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") transfer_insn = make_assignment( id=transfer_id, assignees=tuple( @@ -1380,7 +1378,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, track_iname = var_name_gen( "{sweep_iname}__seq_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, @@ -1398,7 +1396,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( - "%s_%s_init" % (insn.id, "_".join(expr.inames))) + "{}_{}_init".format(insn.id, "_".join(expr.inames))) init_insn_depends_on = frozenset() @@ -1420,18 +1418,18 @@ def realize_reduction_for_single_kernel(kernel, callables_table, generated_insns.append(init_insn) - update_insn_depends_on = set([init_insn.id]) | insn.depends_on + update_insn_depends_on = {init_insn.id} | insn.depends_on updated_inner_exprs = ( preprocess_scan_arguments(insn, expr.expr, nresults, scan_iname, track_iname, update_insn_depends_on)) update_id = insn_id_gen( - based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) - update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + update_insn_iname_deps = temp_kernel.insn_inames(insn) | {track_iname} if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | set([track_iname]) + update_insn_iname_deps = insn.within_inames | {track_iname} scan_insn = make_assignment( id=update_id, @@ -1490,7 +1488,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, track_iname = var_name_gen( "{sweep_iname}__pre_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, @@ -1538,7 +1536,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) - init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) + init_id = insn_id_gen(f"{insn.id}_{scan_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1552,7 +1550,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_insn) - transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on + transfer_insn_depends_on = {init_insn.id} | insn.depends_on updated_inner_exprs = ( preprocess_scan_arguments(insn, expr.expr, nresults, @@ -1563,7 +1561,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from loopy.symbolic import pw_aff_to_expr sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) - transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) + transfer_id = insn_id_gen(f"{insn.id}_{scan_iname}_transfer") transfer_insn = make_assignment( id=transfer_id, assignees=tuple( @@ -1942,8 +1940,6 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from 
loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_tags) - # TODO: remove unused inames... - kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) @@ -1987,7 +1983,7 @@ def realize_ilp(kernel): filter_iname_tags_by_type) privatizing_inames = frozenset( - iname for iname, tags in six.iteritems(kernel.iname_to_tags) + iname for iname, tags in kernel.iname_to_tags.items() if filter_iname_tags_by_type(tags, (IlpBaseTag, VectorizeTag)) ) @@ -1997,114 +1993,6 @@ def realize_ilp(kernel): # }}} -# {{{ find idempotence ("boostability") of instructions - -def find_idempotence(kernel): - logger.debug("%s: idempotence" % kernel.name) - - writer_map = kernel.writer_map() - - arg_names = set(arg.name for arg in kernel.args) - - var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) - - reads_map = dict( - (insn.id, insn.read_dependency_names() & var_names) - for insn in kernel.instructions) - - from collections import defaultdict - dep_graph = defaultdict(set) - - for insn in kernel.instructions: - dep_graph[insn.id] = set(writer_id - for var in reads_map[insn.id] - for writer_id in writer_map.get(var, set())) - - # Find SCCs of dep_graph. These are used for checking if the instruction is - # in a dependency cycle. - from loopy.tools import compute_sccs - - sccs = dict((item, scc) - for scc in compute_sccs(dep_graph) - for item in scc) - - non_idempotently_updated_vars = set() - - new_insns = [] - for insn in kernel.instructions: - boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id] - - if not boostable: - non_idempotently_updated_vars.update( - insn.assignee_var_names()) - - new_insns.append(insn.copy(boostable=boostable)) - - # {{{ remove boostability from isns that access non-idempotently updated vars - - new2_insns = [] - for insn in new_insns: - if insn.boostable and bool( - non_idempotently_updated_vars & insn.dependency_names()): - new2_insns.append(insn.copy(boostable=False)) - else: - new2_insns.append(insn) - - # }}} - - return kernel.copy(instructions=new2_insns) - -# }}} - - -# {{{ limit boostability - -def limit_boostability(kernel): - """Finds out which other inames an instruction's inames occur with - and then limits boostability to just those inames. - """ - - logger.debug("%s: limit boostability" % kernel.name) - - iname_occurs_with = {} - for insn in kernel.instructions: - insn_inames = kernel.insn_inames(insn) - for iname in insn_inames: - iname_occurs_with.setdefault(iname, set()).update(insn_inames) - - iname_use_counts = {} - for insn in kernel.instructions: - for iname in kernel.insn_inames(insn): - iname_use_counts[iname] = iname_use_counts.get(iname, 0) + 1 - - single_use_inames = set(iname for iname, uc in six.iteritems(iname_use_counts) - if uc == 1) - - new_insns = [] - for insn in kernel.instructions: - if insn.boostable is None: - raise LoopyError("insn '%s' has undetermined boostability" % insn.id) - elif insn.boostable: - boostable_into = set() - for iname in kernel.insn_inames(insn): - boostable_into.update(iname_occurs_with[iname]) - - boostable_into -= kernel.insn_inames(insn) | single_use_inames - - # Even if boostable_into is empty, leave boostable flag on--it is used - # for boosting into unused hw axes. 
- - insn = insn.copy(boostable_into=boostable_into) - else: - insn = insn.copy(boostable_into=set()) - - new_insns.append(insn) - - return kernel.copy(instructions=new_insns) - -# }}} - - # {{{ check for loads of atomic variables def check_atomic_loads(kernel): @@ -2119,25 +2007,25 @@ def check_atomic_loads(kernel): # find atomic variables atomicity_candidates = ( - set(v.name for v in six.itervalues(kernel.temporary_variables) - if isinstance(v.dtype, AtomicType)) + {v.name for v in kernel.temporary_variables.values() + if isinstance(v.dtype, AtomicType)} | - set(v.name for v in kernel.args + {v.name for v in kernel.args if isinstance(v, ArrayBase) - and isinstance(v.dtype, AtomicType))) + and isinstance(v.dtype, AtomicType)}) new_insns = [] for insn in kernel.instructions: if isinstance(insn, Assignment): # look for atomic variables - atomic_accesses = set(a.var_name for a in insn.atomicity) + atomic_accesses = {a.var_name for a in insn.atomicity} accessed_atomic_vars = (insn.dependency_names() & atomicity_candidates)\ - - set([insn.assignee_var_names()[0]]) + - {insn.assignee_var_names()[0]} if not accessed_atomic_vars <= atomic_accesses: #if we're missing some missed = accessed_atomic_vars - atomic_accesses for x in missed: - if set([x]) & atomicity_candidates: + if {x} & atomicity_candidates: insn = insn.copy( atomicity=insn.atomicity + (AtomicLoad(x),)) @@ -2157,7 +2045,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, caller_kernel, callables_table): - super(ArgDescrInferenceMapper, self).__init__( + super().__init__( rule_mapping_context) self.caller_kernel = caller_kernel self.callables_table = callables_table @@ -2168,23 +2056,23 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction - return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + return super().map_call(expr, expn_state) arg_id_to_val = dict(enumerate(expr.parameters)) if isinstance(expr, CallWithKwargs): arg_id_to_val.update(expr.kw_parameters) - if 'assignees' in kwargs: + if "assignees" in kwargs: # If supplied with assignees then this is a CallInstruction - assignees = kwargs['assignees'] + assignees = kwargs["assignees"] for i, arg in enumerate(assignees): arg_id_to_val[-i-1] = arg from loopy.kernel.function_interface import get_arg_descriptor_for_expression - arg_id_to_descr = dict( - (arg_id, get_arg_descriptor_for_expression( - self.caller_kernel, arg)) - for arg_id, arg in six.iteritems(arg_id_to_val)) + arg_id_to_descr = { + arg_id: get_arg_descriptor_for_expression( + self.caller_kernel, arg) + for arg_id, arg in arg_id_to_val.items()} # specializing the function according to the parameter description in_knl_callable = self.callables_table[expr.function.name] @@ -2210,9 +2098,9 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) map_call_with_kwargs = map_call @@ -2323,7 +2211,7 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # {{{ check that there are no l.auto-tagged inames from loopy.kernel.data import AutoLocalIndexTagBase - for iname, tags in six.iteritems(kernel.iname_to_tags): + for iname, tags in kernel.iname_to_tags.items(): 
if (filter_iname_tags_by_type(tags, AutoLocalIndexTagBase) and iname in kernel.all_inames()): raise LoopyError("kernel with automatically-assigned " @@ -2363,10 +2251,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): kernel = find_temporary_address_space(kernel) - # boostability should be removed in 2017.x. - kernel = find_idempotence(kernel) - kernel = limit_boostability(kernel) - # check for atomic loads, much easier to do here now that the dependencies # have been established kernel = check_atomic_loads(kernel) diff --git a/loopy/program.py b/loopy/program.py index f862144037aee21e113ad2ccd43b79ceefd39b55..a8bdf91a2a570493e82d25b31784a21dea40801c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six import re from pytools import ImmutableRecord, memoize_method @@ -76,7 +73,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): - super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.kernel = kernel self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( @@ -131,13 +128,13 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) # this is an unknown function as of yet, do not modify it - return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + return super().map_call_with_kwargs(expr, expn_state) def map_reduction(self, expr, expn_state): @@ -148,7 +145,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.callables_table, _ = ( self.callables_table.with_added_callable(func_id, in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def _default_func_id_to_kernel_callable_mappers(target): @@ -243,7 +240,7 @@ class Program(ImmutableRecord): assert name in callables_table - super(Program, self).__init__( + super().__init__( name=name, callables_table=callables_table, target=target, @@ -260,10 +257,10 @@ class Program(ImmutableRecord): update_persistent_hash = update_persistent_hash def copy(self, **kwargs): - if 'target' in kwargs: + if "target" in kwargs: # target attribute of all the callable kernels should be updated. 
- target = kwargs['target'] - new_self = super(Program, self).copy(**kwargs) + target = kwargs["target"] + new_self = super().copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( new_self.callables_table.items()): @@ -280,7 +277,7 @@ class Program(ImmutableRecord): return super(Program, new_self).copy( callables_table=callables_table) else: - return super(Program, self).copy(**kwargs) + return super().copy(**kwargs) def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -371,7 +368,7 @@ class Program(ImmutableRecord): resolved_functions=new_resolved_functions)) def __iter__(self): - return six.iterkeys(self.callables_table.resolved_functions) + return self.callables_table.resolved_functions.keys() def __getitem__(self, name): result = self.callables_table[name] @@ -432,13 +429,13 @@ def next_indexed_function_identifier(function_id): match = func_name.match(function_id) if match is None: - if function_id[-1] == '_': - return "{old_name}0".format(old_name=function_id) + if function_id[-1] == "_": + return f"{function_id}0" else: - return "{old_name}_0".format(old_name=function_id) + return f"{function_id}_0" - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) + return "{alpha}_{num}".format(alpha=match.group("alpha"), + num=int(match.group("num"))+1) class ResolvedFunctionRenamer(RuleAwareIdentityMapper): @@ -447,7 +444,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): *renaming_dict*. """ def __init__(self, rule_mapping_context, renaming_dict): - super(ResolvedFunctionRenamer, self).__init__( + super().__init__( rule_mapping_context) self.renaming_dict = renaming_dict @@ -455,7 +452,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): if expr.name in self.renaming_dict: return ResolvedFunction(self.renaming_dict[expr.name]) else: - return super(ResolvedFunctionRenamer, self).map_resolved_function( + return super().map_resolved_function( expr, expn_state) @@ -504,8 +501,8 @@ class CallablesCountingMapper(CombineMapper): in_knl_callable = self.callables_table[expr.function.name] if isinstance(in_knl_callable, ScalarCallable): return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) elif isinstance(in_knl_callable, CallableKernel): @@ -516,22 +513,22 @@ class CallablesCountingMapper(CombineMapper): self.callables_table)) return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + ( + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) + ( callables_count_in_subkernel) else: raise NotImplementedError("Unknown callable type %s." 
% ( type)) else: return ( - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) map_call_with_kwargs = map_call def map_reduction(self, expr): return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + super().map_reduction(expr)) def map_constant(self, expr): return Counter() @@ -609,10 +606,10 @@ class CallablesTable(ImmutableRecord): history=None, is_being_edited=False): if history is None: - history = dict((func_id, frozenset([func_id])) for func_id in - resolved_functions) + history = {func_id: frozenset([func_id]) for func_id in + resolved_functions} - super(CallablesTable, self).__init__( + super().__init__( resolved_functions=resolved_functions, history=history, is_being_edited=is_being_edited) @@ -624,8 +621,8 @@ class CallablesTable(ImmutableRecord): def __hash__(self): return hash(( - frozenset(six.iteritems(self.resolved_functions)), - frozenset(six.iteritems(self.history)), + frozenset(self.resolved_functions.items()), + frozenset(self.history.items()), self.is_being_edited )) @@ -785,8 +782,8 @@ class CallablesTable(ImmutableRecord): # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) - print('New: ', in_kernel_callable) + print("Old: ", self.resolved_functions[function.name]) + print("New: ", in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") # }}} @@ -874,7 +871,7 @@ class CallablesTable(ImmutableRecord): # this implies that all the function instances having the name # "func_id" have been renamed to something else. for new_func_id in ( - six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + new_callables_count.keys()-renames_needed.keys()): if old_func_id in self.history[new_func_id]: renames_needed[new_func_id] = old_func_id break @@ -931,13 +928,13 @@ class CallablesTable(ImmutableRecord): return item in self.resolved_functions def items(self): - return six.iteritems(self.resolved_functions) + return self.resolved_functions.items() def values(self): - return six.itervalues(self.resolved_functions) + return self.resolved_functions.values() def keys(self): - return six.iterkeys(self.resolved_functions) + return self.resolved_functions.keys() # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 5348443c66127baea5068c0bc5bd491abd4b4678..94bdef9043563d2a16d535d14a1eb4fa4f88e801 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from pytools import ImmutableRecord import sys import islpy as isl @@ -39,6 +36,15 @@ import logging logger = logging.getLogger(__name__) +__doc__ = """ +.. currentmodule:: loopy.schedule + +.. autoclass:: ScheduleItem + +.. 
autoclass:: MinRecursionLimitForScheduling +""" + + # {{{ schedule items class ScheduleItem(ImmutableRecord): @@ -214,17 +220,17 @@ def find_loop_nest_with_map(kernel): from loopy.kernel.data import ConcurrentTag, IlpBaseTag - all_nonpar_inames = set( + all_nonpar_inames = { iname for iname in kernel.all_inames() if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag))) + (ConcurrentTag, IlpBaseTag))} iname_to_insns = kernel.iname_to_insns() for iname in all_nonpar_inames: - result[iname] = set(other_iname + result[iname] = {other_iname for insn in iname_to_insns[iname] - for other_iname in kernel.insn_inames(insn) & all_nonpar_inames) + for other_iname in kernel.insn_inames(insn) & all_nonpar_inames} return result @@ -358,8 +364,7 @@ def gen_dependencies_except(kernel, insn_id, except_insn_ids): yield dep_id - for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids): - yield sub_dep_id + yield from gen_dependencies_except(kernel, dep_id, except_insn_ids) def get_priority_tiers(wanted, priorities): @@ -401,8 +406,7 @@ def get_priority_tiers(wanted, priorities): wanted = wanted - candidates # Yield recursively - for tier in get_priority_tiers(wanted, priorities): - yield tier + yield from get_priority_tiers(wanted, priorities) def sched_item_to_insn_id(sched_item): @@ -433,25 +437,25 @@ def format_insn(kernel, insn_id): from loopy.kernel.instruction import ( MultiAssignmentBase, NoOpInstruction, BarrierInstruction) if isinstance(insn, MultiAssignmentBase): - return "%s%s%s = %s%s%s {id=%s}" % ( + return "{}{}{} = {}{}{} {{id={}}}".format( Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): - mem_kind = '' - if insn.synchronization_kind == 'local': - mem_kind = '{mem_kind=%s}' % insn.mem_kind + mem_kind = "" + if insn.synchronization_kind == "local": + mem_kind = "{mem_kind=%s}" % insn.mem_kind - return "[%s] %s... %sbarrier%s%s" % ( + return "[{}] {}... {}barrier{}{}".format( format_insn_id(kernel, insn_id), Fore.MAGENTA, insn.synchronization_kind[0], mem_kind, Style.RESET_ALL) elif isinstance(insn, NoOpInstruction): - return "[%s] %s... nop%s" % ( + return "[{}] {}... nop{}".format( format_insn_id(kernel, insn_id), Fore.MAGENTA, Style.RESET_ALL) else: - return "[%s] %s%s%s" % ( + return "[{}] {}{}{}".format( format_insn_id(kernel, insn_id), Fore.CYAN, str(insn), Style.RESET_ALL) @@ -470,7 +474,7 @@ def dump_schedule(kernel, schedule): lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + - "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( + "CALL KERNEL {}(extra_args={}, extra_inames={})".format( sched_item.kernel_name, sched_item.extra_args, sched_item.extra_inames)) @@ -561,7 +565,7 @@ class ScheduleDebugInput(Exception): # }}} -# {{{ scheduling algorithm +# {{{ scheduler state class SchedulerState(ImmutableRecord): """ @@ -569,10 +573,6 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_nest_around_map - .. attribute:: loop_priority - - See :func:`loop_nest_around_map`. - .. attribute:: breakable_inames .. attribute:: ilp_inames @@ -586,6 +586,11 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + :class:`list` of unscheduled instruction ids in a decreasing priority + order. + .. attribute:: active_inames A tuple of active inames. 
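The scheduler state above keeps its candidate instructions ordered: `insn_ids_to_try` is priority-ordered, and the hunks that follow add `insns_in_topologically_sorted_order`, a dependency-respecting order with instruction priority as the tie breaker. For readers who want the idea in isolation, here is a minimal, self-contained sketch of such an ordering. `ToyInsn` and `toposort_with_priorities` are illustrative names invented for this note, not loopy API; the real code delegates to `pytools.graph.compute_topological_order`.

    import heapq
    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class ToyInsn:
        id: str
        priority: int = 0
        depends_on: frozenset = field(default_factory=frozenset)

    def toposort_with_priorities(insns):
        # Kahn's algorithm; among the currently schedulable instructions,
        # pick the highest-priority one first (hence the negated priority
        # in the heap key, mirroring the 'key' callback in the hunk below).
        by_id = {insn.id: insn for insn in insns}
        indegree = {insn.id: len(insn.depends_on) for insn in insns}
        dependents = {insn.id: set() for insn in insns}
        for insn in insns:
            for dep in insn.depends_on:
                dependents[dep].add(insn.id)

        heap = [(-by_id[i].priority, i) for i, d in indegree.items() if d == 0]
        heapq.heapify(heap)
        order = []
        while heap:
            _, iid = heapq.heappop(heap)
            order.append(by_id[iid])
            for succ in dependents[iid]:
                indegree[succ] -= 1
                if indegree[succ] == 0:
                    heapq.heappush(heap, (-by_id[succ].priority, succ))
        assert len(order) == len(insns), "dependency cycle"
        return order

    insns = [
        ToyInsn("a"),
        ToyInsn("b", priority=5, depends_on=frozenset({"a"})),
        ToyInsn("c", priority=1, depends_on=frozenset({"a"})),
    ]
    print([insn.id for insn in toposort_with_priorities(insns)])
    # prints ['a', 'b', 'c']: 'b' wins the tie against 'c' by priority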
@@ -637,10 +642,10 @@ class SchedulerState(ImmutableRecord):
         in them that are left to schedule. If a group name occurs in this
         mapping, that group is considered active.

-    .. attribute:: uses_of_boostability
+    .. attribute:: insns_in_topologically_sorted_order

-        Used to produce warnings about deprecated 'boosting' behavior
-        Should be removed along with boostability in 2017.x.
+        A list of loopy :class:`Instruction` objects in topologically sorted
+        order, with instruction priorities as the tie breaker.
     """

     @property
@@ -650,25 +655,172 @@ class SchedulerState(ImmutableRecord):
         else:
             return None

+# }}}
+
+
+def get_insns_in_topologically_sorted_order(kernel):
+    from pytools.graph import compute_topological_order
+
+    rev_dep_map = {insn.id: set() for insn in kernel.instructions}
+    for insn in kernel.instructions:
+        for dep in insn.depends_on:
+            rev_dep_map[dep].add(insn.id)
+
+    def key(insn_id):
+        # negative of insn.priority because
+        # pytools.graph.compute_topological_order schedules the nodes with
+        # lower 'key' first in case of a tie.
+        return (-kernel.id_to_insn[insn_id].priority, insn_id)
+
+    ids = compute_topological_order(rev_dep_map, key=key)
+    return [kernel.id_to_insn[insn_id] for insn_id in ids]
+
+
+# {{{ schedule_as_many_run_insns_as_possible
+
+def schedule_as_many_run_insns_as_possible(sched_state, template_insn):
+    """
+    Returns an instance of :class:`loopy.schedule.SchedulerState` obtained by
+    appending all reachable instructions that are similar to *template_insn*.
+    We define two instructions to be similar if:
+
+    * Both are within the same set of non-parallel inames.
+    * Both belong to the same groups.
+    * Both conflict with the same groups.
+    """
+
+    # {{{ bail when implementation is unsupported
+
+    next_preschedule_item = (
+        sched_state.preschedule[0]
+        if sched_state.preschedule
+        else None)
+
+    if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel,
+            Barrier, EnterLoop, LeaveLoop)):
+        return sched_state
+
+    if not sched_state.within_subkernel:
+        # cannot schedule RunInstructions when not in subkernel
+        return sched_state
+
+    # }}}
+
+    preschedule = sched_state.preschedule[:]
+    have_inames = template_insn.within_inames - sched_state.parallel_inames
+    toposorted_insns = sched_state.insns_in_topologically_sorted_order
+
+    # {{{ helpers
+
+    def next_preschedule_insn_id():
+        return (next(iter(sched_item_to_insn_id(preschedule[0])), None)
+                if preschedule
+                else None)
+
+    def is_similar_to_template(insn):
+        if ((insn.within_inames - sched_state.parallel_inames)
+                != have_inames):
+            # sched_state.parallel_inames contains inames for which no
+            # EnterLoop/LeaveLoop nodes occur.
+            # FIXME: Should really rename that
+            return False
+        if insn.groups != template_insn.groups:
+            return False
+        if insn.conflicts_with_groups != template_insn.conflicts_with_groups:
+            return False
+
+        return True
+
+    # }}}
+
+    # select only the leading instructions in toposorted_insns whose
+    # inames match the active inames of sched_state
+    newly_scheduled_insn_ids = []
+    ignored_unscheduled_insn_ids = set()
+
+    # left_over_toposorted_insns: unscheduled insns in topologically sorted
+    # order
+    left_over_toposorted_insns = []
+
+    for i, insn in enumerate(toposorted_insns):
+        assert insn.id not in sched_state.scheduled_insn_ids
+
+        if is_similar_to_template(insn):
+            # check reachability
+            if not (insn.depends_on & ignored_unscheduled_insn_ids):
+                if insn.id in sched_state.prescheduled_insn_ids:
+                    if next_preschedule_insn_id() == insn.id:
+                        preschedule.pop(0)
+                        newly_scheduled_insn_ids.append(insn.id)
+                        continue
+                else:
+                    newly_scheduled_insn_ids.append(insn.id)
+                    continue
+
+        left_over_toposorted_insns.append(insn)
+        ignored_unscheduled_insn_ids.add(insn.id)
+
+        # HEURISTIC: To avoid quadratic time complexity, we bail out of
+        # adding new instructions once the number of ignored unscheduled
+        # insn ids exceeds 5.
+        # TODO: Find a stronger solution that answers in O(1) time and
+        # O(N) space when "no further instructions can be scheduled",
+        # i.e. when either:
+        # - No similar instructions are present in toposorted_insns.
+        # - No instruction in toposorted_insns is reachable due to
+        #   instructions that were ignored.
+        if len(ignored_unscheduled_insn_ids) > 5:
+            left_over_toposorted_insns.extend(toposorted_insns[i+1:])
+            break
+
+    sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in
+            newly_scheduled_insn_ids)
+
+    updated_schedule = sched_state.schedule + sched_items
+    updated_scheduled_insn_ids = (sched_state.scheduled_insn_ids
+            | frozenset(newly_scheduled_insn_ids))
+    updated_unscheduled_insn_ids = (
+            sched_state.unscheduled_insn_ids
+            - frozenset(newly_scheduled_insn_ids))
+    new_insn_ids_to_try = (None if newly_scheduled_insn_ids
+            else sched_state.insn_ids_to_try)
+
+    new_active_group_counts = sched_state.active_group_counts.copy()
+    if newly_scheduled_insn_ids:
+        # all the newly scheduled insns belong to the same groups as
+        # template_insn
+        for grp in template_insn.groups:
+            new_active_group_counts[grp] -= len(newly_scheduled_insn_ids)
+            if new_active_group_counts[grp] == 0:
+                new_active_group_counts.pop(grp)
+
+    return sched_state.copy(
+            schedule=updated_schedule,
+            scheduled_insn_ids=updated_scheduled_insn_ids,
+            unscheduled_insn_ids=updated_unscheduled_insn_ids,
+            preschedule=preschedule,
+            insn_ids_to_try=new_insn_ids_to_try,
+            active_group_counts=new_active_group_counts,
+            insns_in_topologically_sorted_order=left_over_toposorted_insns
+            )
+
+# }}}
+
+
+# {{{ scheduling algorithm

 def generate_loop_schedules_internal(
-        sched_state, allow_boost=False, debug=None):
+        sched_state, debug=None):
     # allow_insn is set to False initially and after entering each loop
     # to give loops containing high-priority instructions a chance.
kernel = sched_state.kernel Fore = kernel.options._fore # noqa Style = kernel.options._style # noqa - if allow_boost is None: - rec_allow_boost = None - else: - rec_allow_boost = False - active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( sched_state.preschedule[0] - if len(sched_state.preschedule) > 0 + if sched_state.preschedule else None) # {{{ decide about debug mode @@ -693,11 +845,10 @@ def generate_loop_schedules_internal( print(75*"=") print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") print(dump_schedule(sched_state.kernel, sched_state.preschedule)) - #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") - for iname, val in six.iteritems(sched_state.loop_nest_around_map): - print("%s : %s" % (iname, ", ".join(val))) + for iname, val in sched_state.loop_nest_around_map.items(): + print("{} : {}".format(iname, ", ".join(val))) print(75*"=") if debug.debug_length == len(debug.longest_rejected_schedule): @@ -712,30 +863,26 @@ def generate_loop_schedules_internal( if isinstance(next_preschedule_item, CallKernel): assert sched_state.within_subkernel is False - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], within_subkernel=True, may_schedule_global_barriers=False, enclosing_subkernel_inames=sched_state.active_inames), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) if isinstance(next_preschedule_item, ReturnFromKernel): assert sched_state.within_subkernel is True # Make sure all subkernel inames have finished. if sched_state.active_inames == sched_state.enclosing_subkernel_inames: - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], within_subkernel=False, may_schedule_global_barriers=True), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) # }}} @@ -748,13 +895,11 @@ def generate_loop_schedules_internal( if ( isinstance(next_preschedule_item, Barrier) and next_preschedule_item.originating_insn_id is None): - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:]), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) # }}} @@ -793,28 +938,11 @@ def generate_loop_schedules_internal( is_ready = insn.depends_on <= sched_state.scheduled_insn_ids if not is_ready: - if debug_mode: - # These are not that interesting when understanding scheduler - # failures. - - # print("instruction '%s' is missing insn depedencies '%s'" % ( - # format_insn(kernel, insn.id), ",".join( - # insn.depends_on - sched_state.scheduled_insn_ids))) - pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames have = active_inames_set - sched_state.parallel_inames - # If insn is boostable, it may be placed inside a more deeply - # nested loop without harm. - - orig_have = have - if allow_boost: - # Note that the inames in 'insn.boostable_into' necessarily won't - # be contained in 'want'. 
- have = have - insn.boostable_into - if want != have: is_ready = False @@ -908,7 +1036,7 @@ def generate_loop_schedules_internal( # }}} - # {{{ update instruction_ids_to_try + # {{{ update instruction_ids_to_try/toposorted_insns new_insn_ids_to_try = list(insn_ids_to_try) new_insn_ids_to_try.remove(insn.id) @@ -918,13 +1046,10 @@ def generate_loop_schedules_internal( sched_state.active_group_counts.keys()): new_insn_ids_to_try = None - # }}} + new_toposorted_insns = sched_state.insns_in_topologically_sorted_order[:] + new_toposorted_insns.remove(insn) - new_uses_of_boostability = [] - if allow_boost: - if orig_have & insn.boostable_into: - new_uses_of_boostability.append( - (insn.id, orig_have & insn.boostable_into)) + # }}} new_sched_state = sched_state.copy( scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, @@ -937,17 +1062,18 @@ def generate_loop_schedules_internal( if insn_id not in sched_state.prescheduled_insn_ids else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, - uses_of_boostability=( - sched_state.uses_of_boostability - + new_uses_of_boostability) + insns_in_topologically_sorted_order=new_toposorted_insns, ) + new_sched_state = schedule_as_many_run_insns_as_possible(new_sched_state, + insn) + # Don't be eager about entering/leaving loops--if progress has been # made, revert to top of scheduler and see if more progress can be # made. for sub_sched in generate_loop_schedules_internal( new_sched_state, - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -989,12 +1115,10 @@ def generate_loop_schedules_internal( # outside of last_entered_loop. for subdep_id in gen_dependencies_except(kernel, insn_id, sched_state.scheduled_insn_ids): - subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - sched_state.parallel_inames) if ( - last_entered_loop not in want and - last_entered_loop not in subdep.boostable_into): + last_entered_loop not in want): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " @@ -1048,13 +1172,14 @@ def generate_loop_schedules_internal( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), active_inames=sched_state.active_inames[:-1], + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if last_entered_loop not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched return @@ -1083,7 +1208,7 @@ def generate_loop_schedules_internal( print("reachable insns:", ",".join(reachable_insn_ids)) print("active groups (with insn counts):", ",".join( "%s: %d" % (grp, c) - for grp, c in six.iteritems(sched_state.active_group_counts))) + for grp, c in sched_state.active_group_counts.items())) print(75*"-") if needed_inames: @@ -1165,11 +1290,11 @@ def generate_loop_schedules_internal( usefulness = None # highest insn priority enabled by iname - hypothetically_active_loops = active_inames_set | set([iname]) + hypothetically_active_loops = active_inames_set | {iname} for insn_id in reachable_insn_ids: insn = kernel.id_to_insn[insn_id] - want = kernel.insn_inames(insn) | insn.boostable_into + want = kernel.insn_inames(insn) if hypothetically_active_loops <= want: if usefulness is None: @@ -1193,7 +1318,7 @@ def generate_loop_schedules_internal( loop_priority_set = set().union(*[set(prio) for prio in sched_state.kernel.loop_priority]) - useful_loops_set = set(six.iterkeys(iname_to_usefulness)) + useful_loops_set = 
set(iname_to_usefulness.keys()) useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: @@ -1264,12 +1389,12 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True yield sub_sched @@ -1281,7 +1406,7 @@ def generate_loop_schedules_internal( if debug_mode: print(75*"=") - inp = six.moves.input("Hit Enter for next schedule, " + inp = input("Hit Enter for next schedule, " "or enter a number to examine schedules of a " "different length:") if inp: @@ -1294,28 +1419,11 @@ def generate_loop_schedules_internal( # if done, yield result debug.log_success(sched_state.schedule) - for boost_insn_id, boost_inames in sched_state.uses_of_boostability: - warn_with_kernel( - kernel, "used_boostability", - "instruction '%s' was implicitly nested inside " - "inames '%s' based on an idempotence heuristic. " - "This is deprecated and will stop working in loopy 2017.x." - % (boost_insn_id, ", ".join(boost_inames)), - DeprecationWarning) - yield sched_state.schedule else: - if not allow_boost and allow_boost is not None: - # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, - allow_boost=True, debug=debug): - yield sub_sched - else: - # dead end - if debug is not None: - debug.log_dead_end(sched_state.schedule) + if debug is not None: + debug.log_dead_end(sched_state.schedule) # }}} @@ -1379,7 +1487,7 @@ class DependencyRecord(ImmutableRecord): var_kind=var_kind) -class DependencyTracker(object): +class DependencyTracker: """ A utility to help track dependencies between originating from a set of sources (as defined by :meth:`add_source`. 
For each target, @@ -1487,9 +1595,8 @@ class DependencyTracker(object): ("w", "any", self.base_access_map), ]: - for dep in self.get_conflicting_accesses( - target, tgt_dir, src_dir, src_base_var_to_accessor_map): - yield dep + yield from self.get_conflicting_accesses( + target, tgt_dir, src_dir, src_base_var_to_accessor_map) def get_conflicting_accesses(self, target, tgt_dir, src_dir, src_base_var_to_accessor_map): @@ -1503,11 +1610,11 @@ class DependencyTracker(object): dir_to_getter = {"w": get_written_names, "any": get_accessed_names} def filter_var_set_for_base_storage(var_name_set, base_storage_name): - return set( + return { name for name in var_name_set if (self.temp_to_base_storage.get(name, name) - == base_storage_name)) + == base_storage_name)} tgt_accessed_vars = dir_to_getter[tgt_dir](target) tgt_accessed_vars_base = self.map_to_base_storage(tgt_accessed_vars) @@ -1637,8 +1744,8 @@ def _insn_ids_reaching_end(schedule, kind, reverse): sched_item.synchronization_kind, kind): insn_ids_alive_at_scope[-1].clear() else: - insn_ids_alive_at_scope[-1] |= set( - insn_id for insn_id in sched_item_to_insn_id(sched_item)) + insn_ids_alive_at_scope[-1] |= { + insn_id for insn_id in sched_item_to_insn_id(sched_item)} assert len(insn_ids_alive_at_scope) == 1 return insn_ids_alive_at_scope[-1] @@ -1660,7 +1767,7 @@ def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): dep.variable, dep.var_kind)) else: - comment = "for %s (%s)" % ( + comment = "for {} ({})".format( dep.variable, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id)) schedule.append(Barrier( @@ -1830,22 +1937,21 @@ def generate_loop_schedules(kernel, callables_table, debug_args={}): .. warning:: This function needs to be called inside (another layer) of a - :class:`MinRecursionLimitForScheduling` context manager, and the - context manager needs to end *after* the last reference to the + :class:`loopy.schedule.MinRecursionLimitForScheduling` context manager, + and the context manager needs to end *after* the last reference to the generators has gone out of scope. Otherwise, the high-recursion-limit generator chain may not be successfully garbage-collected and cause an internal error in the Python runtime. 
""" with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, - callables_table, debug_args=debug_args): - yield sched + yield from generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args) def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState - if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): + if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1856,32 +1962,32 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): debug = ScheduleDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () + preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else () - prescheduled_inames = set( + prescheduled_inames = { insn.iname for insn in preschedule - if isinstance(insn, EnterLoop)) + if isinstance(insn, EnterLoop)} - prescheduled_insn_ids = set( + prescheduled_insn_ids = { insn_id for item in preschedule - for insn_id in sched_item_to_insn_id(item)) + for insn_id in sched_item_to_insn_id(item)} from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, filter_iname_tags_by_type) - ilp_inames = set( + ilp_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, IlpBaseTag)) - vec_inames = set( + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, IlpBaseTag)} + vec_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, VectorizeTag)) - parallel_inames = set( + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, VectorizeTag)} + parallel_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, ConcurrentTag)) + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, ConcurrentTag)} loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) @@ -1906,9 +2012,9 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): schedule=(), - unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), + unscheduled_insn_ids={insn.id for insn in kernel.instructions}, scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != KernelState.SCHEDULED, + within_subkernel=kernel.state != KernelState.LINEARIZED, may_schedule_global_barriers=True, preschedule=preschedule, @@ -1920,15 +2026,15 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): group_insn_counts=group_insn_counts(kernel), active_group_counts={}, - uses_of_boostability=[]) + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), + ) schedule_gen_kwargs = {} - if kernel.options.ignore_boostable_into: - schedule_gen_kwargs["allow_boost"] = None def print_longest_dead_end(): if debug.interactive: - print("Loo.py will now show you the scheduler state at the point") + print("Loopy will now show you the scheduler state at the point") print("where the longest (dead-end) schedule was generated, in the") print("the hope that some of this makes sense and helps you find") print("the issue.") @@ -1937,7 +2043,7 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): print(" 
debug_args=dict(interactive=False)") print("to generate_loop_schedules().") print(75*"-") - six.moves.input("Enter:") + input("Enter:") print() print() @@ -1978,11 +2084,11 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): new_kernel = kernel.copy( schedule=gen_sched, - state=KernelState.SCHEDULED) + state=KernelState.LINEARIZED) from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: # Device mapper only gets run once. new_kernel = map_schedule_onto_host_or_device(new_kernel) @@ -2005,7 +2111,7 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): debug.done_scheduling() if not schedule_count: print(75*"-") - print("ERROR: Sorry--loo.py did not find a schedule for your kernel.") + print("ERROR: Sorry--loopy did not find a schedule for your kernel.") print(75*"-") print_longest_dead_end() raise RuntimeError("no valid schedules found") diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..089d4e600a13f8cf605b85fe29389bb28e39481a 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -31,7 +29,7 @@ from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): # FIXME: Should be idempotent. from loopy.kernel import KernelState - assert kernel.state == KernelState.SCHEDULED + assert kernel.state == KernelState.LINEARIZED from functools import partial device_prog_name_gen = partial( diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index e0129fd98417f26a501138a92de4a67614f1a139..afcdfb07bbde81b8211bff0909ae26a5a7a67a07 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -88,7 +86,7 @@ def add_extra_args_to_schedule(kernel): temporaries_read_in_subkernel(kernel, subkernel) | temporaries_written_in_subkernel(kernel, subkernel)) - more_args = set(tv + more_args = {tv for tv in used_temporaries if kernel.temporary_variables[tv].address_space @@ -96,7 +94,7 @@ def add_extra_args_to_schedule(kernel): and kernel.temporary_variables[tv].initializer is None and - tv not in sched_item.extra_args) + tv not in sched_item.extra_args} new_schedule.append(sched_item.copy( extra_args=sched_item.extra_args + sorted(more_args))) diff --git a/loopy/statistics.py b/loopy/statistics.py index 86f39e55bd0e5de2773ee3b5b42a08885191a9c6..a1c86d88bb6e8c97d757683d3fa2aebdee7f9a7a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,11 +1,10 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = """ Copyright (C) 2015 James Stevens Copyright (C) 2018 Kaushik Kulkarni Copyright (C) 2019 Andreas Kloeckner """ + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,9 +25,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from functools import partial -import six - import loopy as lp from islpy import dim_type import islpy as isl @@ -41,6 +37,7 @@ from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel from loopy.kernel import LoopKernel from loopy.program import make_program +from functools import partial __doc__ = """ @@ -100,7 +97,7 @@ def _get_param_tuple(obj): for i in range(obj.dim(dim_type.param))) -class GuardedPwQPolynomial(object): +class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain @@ -153,7 +150,7 @@ class GuardedPwQPolynomial(object): @staticmethod def zero(): - p = isl.PwQPolynomial('{ 0 }') + p = isl.PwQPolynomial("{ 0 }") return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) def __str__(self): @@ -167,7 +164,7 @@ class GuardedPwQPolynomial(object): # {{{ ToCountMap -class ToCountMap(object): +class ToCountMap: """A map from work descriptors like :class:`Op` and :class:`MemAccess` to any arithmetic type. @@ -203,23 +200,28 @@ class ToCountMap(object): def __add__(self, other): result = self.count_map.copy() - for k, v in six.iteritems(other.count_map): + for k, v in other.count_map.items(): result[k] = self.count_map.get(k, 0) + v return self.copy(count_map=result) def __radd__(self, other): if other != 0: raise ValueError("ToCountMap: Attempted to add ToCountMap " - "to {0} {1}. ToCountMap may only be added to " + "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) return self def __mul__(self, other): - return self.copy(dict( - (index, value*other) - for index, value in six.iteritems(self.count_map))) + if isinstance(other, GuardedPwQPolynomial): + return self.copy({ + index: value*other + for index, value in self.count_map.items()}) + else: + raise ValueError("ToCountMap: Attempted to multiply " + "ToCountMap by {} {}." + .format(type(other), other)) __rmul__ = __mul__ @@ -231,8 +233,8 @@ class ToCountMap(object): def __str__(self): return "\n".join( - "%s: %s" % (k, v) - for k, v in sorted(six.iteritems(self.count_map), + f"{k}: {v}" + for k, v in sorted(self.count_map.items(), key=lambda k: str(k))) def __len__(self): @@ -257,9 +259,9 @@ class ToCountMap(object): return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return self.copy(count_map=dict( - (key.copy(**kwargs), val) - for key, val in six.iteritems(self.count_map))) + return self.copy(count_map={ + key.copy(**kwargs): val + for key, val in self.count_map.items()}) def filter_by(self, **kwargs): """Remove items without specified key fields. 
@@ -276,10 +278,10 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a','g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a","g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -292,16 +294,16 @@ class ToCountMap(object): pass new_kwargs = {} - for arg_field, allowable_vals in six.iteritems(kwargs): + for arg_field, allowable_vals in kwargs.items(): if arg_field == "dtype": from loopy.types import to_loopy_type allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] new_kwargs[arg_field] = allowable_vals - for key, val in six.iteritems(self.count_map): + for key, val in self.count_map.items(): if all(getattr(key, arg_field, _Sentinel) in allowable_vals - for arg_field, allowable_vals in six.iteritems(new_kwargs)): + for arg_field, allowable_vals in new_kwargs.items()): new_count_map[key] = val return self.copy(count_map=new_count_map) @@ -319,7 +321,7 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) def filter_func(key): return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4: @@ -333,7 +335,7 @@ class ToCountMap(object): new_count_map = {} - for self_key, self_val in six.iteritems(self.count_map): + for self_key, self_val in self.count_map.items(): if func(self_key): new_count_map[self_key] = self_val @@ -353,29 +355,29 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) - grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + grouped_map = mem_map.group_by("mtype", "dtype", "direction") - f32_global_ld = grouped_map[MemAccess(mtype='global', + f32_global_ld = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_global_st = grouped_map[MemAccess(mtype='global', + f32_global_st = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) - f32_local_ld = grouped_map[MemAccess(mtype='local', + f32_local_ld = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_local_st = grouped_map[MemAccess(mtype='local', + f32_local_st = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) op_map = get_op_map(knl) - ops_dtype = op_map.group_by('dtype') + ops_dtype = op_map.group_by("dtype") f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) @@ -396,11 +398,11 @@ class ToCountMap(object): else: return self - for self_key, self_val in six.iteritems(self.count_map): + for self_key, self_val in self.count_map.items(): new_key = key_type( - **dict( - (field, getattr(self_key, field)) - for field in args)) + **{ + field: getattr(self_key, field) + for field in args}) new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val @@ -418,20 +420,20 @@ class ToCountMap(object): # (first create loopy kernel and specify 
array data types) bytes_map = get_mem_access_map(knl).to_bytes() - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} s1_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["load"]).eval_and_sum(params) s2_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["load"]).eval_and_sum(params) s1_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["store"]).eval_and_sum(params) s2_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["store"]).eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -439,7 +441,7 @@ class ToCountMap(object): new_count_map = {} - for key, val in six.iteritems(self.count_map): + for key, val in self.count_map.items(): new_count_map[key] = int(key.dtype.itemsize) * val return self.copy(new_count_map) @@ -449,7 +451,7 @@ class ToCountMap(object): total = self._zero() - for k, v in six.iteritems(self.count_map): + for k, v in self.count_map.items(): total += v return total @@ -475,7 +477,7 @@ class ToCountPolynomialMap(ToCountMap): space_param_tuple = _get_param_tuple(space) - for key, val in six.iteritems(count_map): + for key, val in count_map.items(): if isinstance(val, isl.PwQPolynomial): assert val.dim(dim_type.out) == 1 elif isinstance(val, GuardedPwQPolynomial): @@ -485,7 +487,7 @@ class ToCountPolynomialMap(ToCountMap): assert _get_param_tuple(val.space) == space_param_tuple - super(ToCountPolynomialMap, self).__init__(count_map) + super().__init__(count_map) def _zero(self): space = self.space.insert_dims(dim_type.out, 0, 1) @@ -510,10 +512,10 @@ class ToCountPolynomialMap(ToCountMap): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a', 'g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a", "g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -549,7 +551,7 @@ def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): def subst_into_to_count_map(space, tcm, subst_dict): from loopy.isl_helpers import subst_into_pwqpolynomial new_count_map = {} - for key, value in six.iteritems(tcm.count_map): + for key, value in tcm.count_map.items(): if isinstance(value, GuardedPwQPolynomial): new_count_map[key] = subst_into_guarded_pwqpolynomial( space, value, subst_dict) @@ -576,13 +578,13 @@ def stringify_stats_mapping(m): result = "" for key in sorted(m.keys(), key=lambda k: str(k)): - result += ("%s : %s\n" % (key, m[key])) + result += ("{} : {}\n".format(key, m[key])) return result # {{{ CountGranularity -class CountGranularity(object): +class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -618,7 +620,7 @@ class Op(ImmutableRecord): .. 
attribute:: dtype - A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type operated on. .. attribute:: name @@ -632,7 +634,7 @@ class Op(ImmutableRecord): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an @@ -656,17 +658,17 @@ class Op(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(Op, self).__init__(dtype=dtype, name=name, + super().__init__(dtype=dtype, name=name, count_granularity=count_granularity, kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness if self.kernel_name is not None: - return "Op(%s, %s, %s, %s)" % ( - self.dtype, self.name, self.count_granularity, self.kernel_name) + return (f"Op({self.dtype}, {self.name}, {self.count_granularity}," + f" {self.kernel_name})") else: - return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) + return f"Op({self.dtype}, {self.name}, {self.count_granularity})" # }}} @@ -683,7 +685,7 @@ class MemAccess(ImmutableRecord): .. attribute:: dtype - A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. .. attribute:: lid_strides @@ -717,7 +719,7 @@ class MemAccess(ImmutableRecord): .. attribute:: variable_tag A :class:`str` that specifies the variable tag of a - :class:`pymbolic.primitives.TaggedVariable`. + :class:`loopy.symbolic.TaggedVariable`. .. attribute:: count_granularity @@ -725,7 +727,7 @@ class MemAccess(ImmutableRecord): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. 
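To make the count_granularity description above concrete: an event counted once per sub-group happens `work_group_size // subgroup_size` times per work-group, versus once per work-item for WORKITEM granularity. A toy sketch of that scaling, with made-up names and numbers (loopy performs the equivalent scaling internally when evaluating counts):

    WORKITEM, SUBGROUP, WORKGROUP = "workitem", "subgroup", "workgroup"

    def total_events(count, granularity, work_group_size, subgroup_size):
        # Scale a per-<granularity> count up to a per-work-group total.
        if granularity == WORKITEM:
            return count * work_group_size
        if granularity == SUBGROUP:
            return count * (work_group_size // subgroup_size)
        if granularity == WORKGROUP:
            return count
        raise ValueError(granularity)

    # 128 work-items in sub-groups of 32: a per-sub-group count of 10
    # contributes 10 * (128 // 32) = 40 events per work-group.
    print(total_events(10, SUBGROUP, work_group_size=128, subgroup_size=32))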
A sub-group is an @@ -750,7 +752,7 @@ class MemAccess(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(MemAccess, self).__init__(mtype=mtype, dtype=dtype, + super().__init__(mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tag=variable_tag, @@ -763,13 +765,13 @@ class MemAccess(ImmutableRecord): def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess({}, {}, {}, {}, {}, {}, {}, {}, {})".format( self.mtype, self.dtype, None if self.lid_strides is None else dict( - sorted(six.iteritems(self.lid_strides))), + sorted(self.lid_strides.items())), None if self.gid_strides is None else dict( - sorted(six.iteritems(self.gid_strides))), + sorted(self.gid_strides.items())), self.direction, self.variable, self.variable_tag, @@ -795,11 +797,11 @@ class Sync(ImmutableRecord): """ def __init__(self, kind=None, kernel_name=None): - super(Sync, self).__init__(kind=kind, kernel_name=kernel_name) + super().__init__(kind=kind, kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "Sync(%s, %s)" % (self.kind, self.kernel_name) + return f"Sync({self.kind}, {self.kernel_name})" # }}} @@ -844,12 +846,12 @@ class CounterBase(CombineMapper): if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) - arg_dict = dict( - (arg.name, value) + arg_dict = { + arg.name: value for arg, value in zip( clbl.subkernel.args, expr.parameters) - if isinstance(arg, ValueArg)) + if isinstance(arg, ValueArg)} return subst_into_to_count_map( self.param_space, @@ -909,7 +911,7 @@ class CounterBase(CombineMapper): class ExpressionOpCounter(CounterBase): def __init__(self, knl, callables_table, kernel_rec, count_within_subscripts=True): - super(ExpressionOpCounter, self).__init__( + super().__init__( knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts @@ -933,12 +935,12 @@ class ExpressionOpCounter(CounterBase): if not isinstance(clbl, CallableKernel): return self.new_poly_map( {Op(dtype=self.type_inf(expr), - name='func:'+clbl.name, + name="func:"+clbl.name, count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one} ) + self.rec(expr.parameters) else: - return super(ExpressionOpCounter, self).map_call(expr) + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: @@ -954,7 +956,7 @@ class ExpressionOpCounter(CounterBase): assert expr.children return self.new_poly_map( {Op(dtype=self.type_inf(expr), - name='add', + name="add", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.zero + (len(expr.children)-1)} @@ -964,7 +966,7 @@ class ExpressionOpCounter(CounterBase): from pymbolic.primitives import is_zero assert expr.children return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=( self.arithmetic_count_granularity), kernel_name=self.knl.name): self.one}) @@ -972,14 +974,14 @@ class ExpressionOpCounter(CounterBase): for child in expr.children if not is_zero(child + 1)) + \ self.new_poly_map({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=( self.arithmetic_count_granularity), kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): return 
self.new_poly_map({Op(dtype=self.type_inf(expr), - name='div', + name="div", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ @@ -990,7 +992,7 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='pow', + name="pow", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ @@ -998,7 +1000,7 @@ class ExpressionOpCounter(CounterBase): def map_left_shift(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='shift', + name="shift", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ @@ -1008,14 +1010,14 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.zero + (len(expr.children)-1)}) \ @@ -1040,7 +1042,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='maxmin', + name="maxmin", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): len(expr.children)-1}) \ @@ -1119,14 +1121,22 @@ def _get_lid_and_gid_strides(knl, array, index): def get_iname_strides(tag_to_iname_dict): tag_to_stride_dict = {} - for tag, iname in six.iteritems(tag_to_iname_dict): + + if array.dim_tags is None: + assert len(index) <= 1 + dim_tags = (None,) * len(index) + else: + dim_tags = array.dim_tags + + for tag, iname in tag_to_iname_dict.items(): total_iname_stride = 0 # find total stride of this iname for each axis - for idx, axis_tag in zip(index, array.dim_tags): + for idx, axis_tag in zip(index, dim_tags): # collect index coefficients try: - coeffs = _IndexStrideCoefficientCollector()( - simplify_using_aff(knl, idx)) + coeffs = _IndexStrideCoefficientCollector( + [tag_to_iname_dict[tag]])( + simplify_using_aff(knl, idx)) except ExpressionNotAffineError: total_iname_stride = None break @@ -1142,6 +1152,14 @@ def _get_lid_and_gid_strides(knl, array, index): # now determine stride if isinstance(axis_tag, FixedStrideArrayDimTag): axis_tag_stride = axis_tag.stride + + if axis_tag_stride is lp.auto: + total_iname_stride = None + break + + elif axis_tag is None: + axis_tag_stride = 1 + else: continue @@ -1172,7 +1190,7 @@ class MemAccessCounterBase(CounterBase): if not isinstance(clbl, CallableKernel): return self.rec(expr.parameters) else: - return super(MemAccessCounterBase, self).map_call(expr) + return super().map_call(expr) # }}} @@ -1191,7 +1209,7 @@ class LocalMemAccessCounter(MemAccessCounterBase): if index is None: # no subscript count_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, count_granularity=self.local_mem_count_granularity, kernel_name=self.knl.name)] = self.one @@ -1208,10 +1226,10 @@ class LocalMemAccessCounter(MemAccessCounterBase): self.knl, array, index_tuple) count_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, - lid_strides=dict(sorted(six.iteritems(lid_strides))), - gid_strides=dict(sorted(six.iteritems(gid_strides))), + 
lid_strides=dict(sorted(lid_strides.items())),
+                gid_strides=dict(sorted(gid_strides.items())),
                 variable=name,
                 count_granularity=self.local_mem_count_granularity,
                 kernel_name=self.knl.name)] = self.one
@@ -1249,7 +1267,7 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
             # this array is not in global memory
             return self.new_zero_poly_map()

-        return self.new_poly_map({MemAccess(mtype='global',
+        return self.new_poly_map({MemAccess(mtype="global",
                                    dtype=self.type_inf(expr),
                                    lid_strides={},
                                    gid_strides={},
                                    variable=name,
                                    count_granularity=CountGranularity.WORKITEM,
@@ -1292,10 +1310,10 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
                 ) else global_access_count_granularity

         return self.new_poly_map({MemAccess(
-            mtype='global',
+            mtype="global",
             dtype=self.type_inf(expr),
-            lid_strides=dict(sorted(six.iteritems(lid_strides))),
-            gid_strides=dict(sorted(six.iteritems(gid_strides))),
+            lid_strides=dict(sorted(lid_strides.items())),
+            gid_strides=dict(sorted(gid_strides.items())),
             variable=name,
             variable_tag=var_tag,
             count_granularity=count_granularity,
@@ -1321,7 +1339,7 @@ class AccessFootprintGatherer(CombineMapper):
         def merge_dicts(a, b):
             result = a.copy()

-            for var_name, footprint in six.iteritems(b):
+            for var_name, footprint in b.items():
                 if var_name in result:
                     result[var_name] = result[var_name] | footprint
                 else:
@@ -1641,7 +1659,7 @@ def _get_op_map_for_single_kernel(knl, callables_table,
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignees) + op_counter(insn.expression)
-            for key, val in six.iteritems(ops.count_map):
+            for key, val in ops.count_map.items():
                 count = _get_insn_count(knl, callables_table, insn.id,
                         subgroup_size, count_redundant_work,
                         key.count_granularity)
@@ -1673,14 +1691,14 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
        count operations inside array indices.

    :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
-        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
         a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
         made, if this fails an error will be raised. If a :class:`str`
-        ``'guess'`` is passed as the subgroup_size, :func:`get_op_map` will
+        ``"guess"`` is passed as the subgroup_size, :func:`get_op_map` will
         attempt to find the sub-group size using the device and, if
         unsuccessful, will make a wild guess.

@@ -1699,13 +1717,13 @@
         # (first create loopy kernel and specify array data types)

         op_map = get_op_map(knl)
-        params = {'n': 512, 'm': 256, 'l': 128}
+        params = {"n": 512, "m": 256, "l": 128}
         f32add = op_map[Op(np.float32,
-                           'add',
+                           "add",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)
         f32mul = op_map[Op(np.float32,
-                           'mul',
+                           "mul",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)

@@ -1746,7 +1764,7 @@ def _find_subgroup_size_for_knl(knl):
         subgroup_size_guess = get_simd_group_size(knl.target.device, None)
         warn_with_kernel(knl, "getting_subgroup_size_from_device",
                          "Device: %s. 
Using sub-group size given by " - "pyopencl.characterize.get_simd_group_size(): %d" + "pyopencl.characterize.get_simd_group_size(): %s" % (knl.target.device, subgroup_size_guess)) return subgroup_size_guess else: @@ -1764,7 +1782,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): if subgroup_size_requested is None: if subgroup_size_guess is None: - # 'guess' was not passed and either no target device found + # "guess" was not passed and either no target device found # or get_simd_group_size returned None raise ValueError("No sub-group size passed, no target device found. " "Either (1) pass integer value for subgroup_size, " @@ -1774,7 +1792,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): else: return subgroup_size_guess - elif subgroup_size_requested == 'guess': + elif subgroup_size_requested == "guess": if subgroup_size_guess is None: # unable to get subgroup_size from device, so guess subgroup_size_guess = 32 @@ -1831,7 +1849,7 @@ def _get_mem_access_map_for_single_kernel(knl, callables_table, + access_counter_l(insn.assignee) ).with_set_attributes(direction="store") - for key, val in six.iteritems(insn_access_map.count_map): + for key, val in insn_access_map.count_map.items(): count = _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity) @@ -1860,14 +1878,14 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as + an error will be raised. If a :class:`str` ``"guess"`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
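The subgroup_size resolution just described (an integer, the string "guess", or *None*) reduces to a few cases. The sketch below restates the documented behavior in plain code; it is a simplification, not the actual `_process_subgroup_size` implementation, and `device_guess` stands in for the result of the device query:

    def resolve_subgroup_size(requested, device_guess):
        # requested: an int, the string "guess", or None.
        # device_guess: an int from the device query (e.g. what
        # pyopencl.characterize.get_simd_group_size() returned), or None.
        if isinstance(requested, int):
            return requested
        if requested is None:
            if device_guess is None:
                raise ValueError("no sub-group size passed "
                        "and no target device found")
            return device_guess
        if requested == "guess":
            # fall back to a wild guess when the device query failed
            return device_guess if device_guess is not None else 32
        raise ValueError(f"invalid subgroup_size: {requested!r}")

    print(resolve_subgroup_size("guess", None))   # 32 (the wild guess)
    print(resolve_subgroup_size(None, 64))        # 64 (from the device)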
@@ -1886,43 +1904,43 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='a', + direction="load", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='a', + direction="store", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='x', + direction="load", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='x', + direction="store", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) @@ -1954,10 +1972,6 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, def _get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - knl = lp.get_one_scheduled_kernel(knl, callables_table) from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, @@ -2011,14 +2025,14 @@ def get_synchronization_map(program, subgroup_size=None): :arg program: A :class:`loopy.LoopKernel` whose barriers are to be counted. :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` - ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None*, an attempt to find the sub-group size using the device will be made; if this fails, an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``"guess"`` is passed as the subgroup_size, get_synchronization_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
@@ -2034,8 +2048,8 @@ def get_synchronization_map(program, subgroup_size=None): # (first create loopy kernel and specify array data types) sync_map = get_synchronization_map(knl) - params = {'n': 512, 'm': 256, 'l': 128} - barrier_ct = sync_map['barrier_local'].eval_with_dict(params) + params = {"n": 512, "m": 256, "l": 128} + barrier_ct = sync_map["barrier_local"].eval_with_dict(params) # (now use this count to, e.g., predict performance) @@ -2119,10 +2133,10 @@ def gather_access_footprints(program, ignore_uncountable=False): result = {} - for vname, footprint in six.iteritems(write_footprints): + for vname, footprint in write_footprints.items(): result[(vname, "write")] = footprint - for vname, footprint in six.iteritems(read_footprints): + for vname, footprint in read_footprints.items(): result[(vname, "read")] = footprint return result @@ -2166,5 +2180,4 @@ def gather_access_footprint_bytes(program, ignore_uncountable=False): # }}} - # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b8341bcd15b76eeb1705b8ca26d69237a6b97c0d..165b8ea4415547f5557e73ef987c4348493376d9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,6 +1,5 @@ """Pymbolic mappers for loopy.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,8 +24,8 @@ THE SOFTWARE. """ -import six -from six.moves import range, zip, reduce, intern +from functools import reduce +from sys import intern from pytools import memoize, memoize_method, ImmutableRecord import pytools.lex @@ -65,10 +64,36 @@ from islpy import dim_type import re import numpy as np +__doc__ = """ +.. currentmodule:: loopy.symbolic + +.. autoclass:: Literal + +.. autoclass:: ArrayLiteral + +.. autoclass:: FunctionIdentifier + +.. autoclass:: TypedCSE + +.. autoclass:: TypeCast + +.. autoclass:: TaggedVariable + +.. autoclass:: Reduction + +.. autoclass:: LinearSubscript + +.. autoclass:: RuleArgument + +.. autoclass:: ExpansionState + +.. 
autoclass:: RuleAwareIdentityMapper +""" + # {{{ mappers with support for loopy-specific primitives -class IdentityMapperMixin(object): +class IdentityMapperMixin: def map_literal(self, expr, *args, **kwargs): return expr @@ -232,13 +257,13 @@ class StringifyMapper(StringifyMapperBase): def map_reduction(self, expr, prec): from pymbolic.mapper.stringifier import PREC_NONE - return "%sreduce(%s, [%s], %s)" % ( + return "{}reduce({}, [{}], {})".format( "simul_" if expr.allow_simultaneous else "", expr.operation, ", ".join(expr.inames), self.rec(expr.expr, PREC_NONE)) def map_tagged_variable(self, expr, prec): - return "%s$%s" % (expr.name, expr.tag) + return f"{expr.name}${expr.tag}" def map_linear_subscript(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE @@ -249,7 +274,7 @@ class StringifyMapper(StringifyMapperBase): enclosing_prec, PREC_CALL) def map_loopy_function_identifier(self, expr, enclosing_prec): - return "%s<%s>" % ( + return "{}<{}>".format( type(expr).__name__, ", ".join(str(a) for a in expr.__getinitargs__())) @@ -258,14 +283,15 @@ class StringifyMapper(StringifyMapperBase): def map_type_cast(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + return "cast({}, {})".format( + repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_resolved_function(self, expr, prec): return expr.name def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( - inames=','.join(self.rec(iname, prec) for iname in + inames=",".join(self.rec(iname, prec) for iname in expr.swept_inames), subscr=self.rec(expr.subscript, prec)) @@ -282,7 +308,7 @@ class EqualityPreservingStringifyMapper(StringifyMapperBase): """ def __init__(self): - super(EqualityPreservingStringifyMapper, self).__init__() + super().__init__() def map_constant(self, expr, enclosing_prec): if isinstance(expr, np.generic): @@ -290,7 +316,7 @@ class EqualityPreservingStringifyMapper(StringifyMapperBase): # FIXME: This syntax cannot currently be parsed. 
- return "%s(%s)" % (type(expr).__name__, repr(expr)) + return "{}({})".format(type(expr).__name__, repr(expr)) else: result = repr(expr) @@ -308,8 +334,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.function) != type(other.function) # noqa - ): + or type(expr.function) != type(other.function)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -353,11 +378,10 @@ class DependencyMapper(DependencyMapperBase): def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) - - return deps - set(p.Variable(iname) for iname in expr.inames) + return deps - {p.Variable(iname) for iname in expr.inames} def map_tagged_variable(self, expr, *args, **kwargs): - return set([expr]) + return {expr} def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() @@ -374,6 +398,9 @@ class DependencyMapper(DependencyMapperBase): def map_resolved_function(self, expr): return self.rec(expr.function) + def map_literal(self, expr): + return set() + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -383,7 +410,7 @@ class SubstitutionRuleExpander(IdentityMapper): if expr.name in self.rules: return self.map_substitution(expr.name, self.rules[expr.name], ()) else: - return super(SubstitutionRuleExpander, self).map_variable(expr) + return super().map_variable(expr) def map_call(self, expr): if expr.function.name in self.rules: @@ -392,7 +419,7 @@ class SubstitutionRuleExpander(IdentityMapper): self.rules[expr.function.name], expr.parameters) else: - return super(SubstitutionRuleExpander, self).map_call(expr) + return super().map_call(expr) def map_substitution(self, name, rule, arguments): if len(rule.arguments) != len(arguments): @@ -429,7 +456,7 @@ class Literal(LoopyExpressionBase): .. note:: Only used in the output of - :mod:`loopy.target.c.expression.ExpressionToCExpressionMapper` (and + :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and similar mappers). Not for use in Loopy source representation. """ @@ -450,7 +477,7 @@ class ArrayLiteral(LoopyExpressionBase): .. note:: Only used in the output of - :mod:`loopy.target.c.expression.ExpressionToCExpressionMapper` (and + :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and similar mappers). Not for use in Loopy source representation. 
""" @@ -511,7 +538,7 @@ class TypedCSE(LoopyExpressionBase, p.CommonSubexpression): """ def __init__(self, child, prefix=None, dtype=None): - super(TypedCSE, self).__init__(child, prefix) + super().__init__(child, prefix) self.dtype = dtype def __getinitargs__(self): @@ -527,7 +554,7 @@ class TypeAnnotation(LoopyExpressionBase): """ def __init__(self, type, child): - super(TypeAnnotation, self).__init__() + super().__init__() self.type = type self.child = child @@ -547,7 +574,7 @@ class TypeCast(LoopyExpressionBase): """ def __init__(self, type, child): - super(TypeCast, self).__init__() + super().__init__() from loopy.types import to_loopy_type, NumpyType type = to_loopy_type(type) @@ -587,7 +614,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable): init_arg_names = ("name", "tag") def __init__(self, name, tag): - super(TaggedVariable, self).__init__(name) + super().__init__(name) self.tag = tag def __getinitargs__(self): @@ -597,8 +624,8 @@ class TaggedVariable(LoopyExpressionBase, p.Variable): class Reduction(LoopyExpressionBase): - """Represents a reduction operation on :attr:`exprs` - across :attr:`inames`. + """ + Represents a reduction operation on :attr:`expr` across :attr:`inames`. .. attribute:: operation an instance of :class:`loopy.library.reduction.ReductionOperation` @@ -612,9 +639,9 @@ class Reduction(LoopyExpressionBase): An expression which may have tuple type. If the expression has tuple type, it must be one of the following: - * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or - * a :class:`loopy.symbolic.Reduction`, or - * a function call or substitution rule invocation. + * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or + * a :class:`loopy.symbolic.Reduction`, or + * a function call or substitution rule invocation. .. 
attribute:: allow_simultaneous @@ -813,7 +840,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): or expr.aggregate.name not in self.target_names): return {1: expr} - return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) + return super().map_algebraic_leaf(expr) def get_start_subscript_from_sar(sar, kernel): @@ -981,11 +1008,11 @@ def rename_subst_rules_in_instructions(insns, renames): for insn in insns] -class SubstitutionRuleMappingContext(object): +class SubstitutionRuleMappingContext: def _get_subst_rule_key(self, args, body): - subst_dict = dict( - (arg, RuleArgument(i)) - for i, arg in enumerate(args)) + subst_dict = { + arg: RuleArgument(i) + for i, arg in enumerate(args)} from pymbolic.mapper.substitutor import make_subst_func arg_subst_map = SubstitutionMapper(make_subst_func(subst_dict)) @@ -997,10 +1024,10 @@ class SubstitutionRuleMappingContext(object): self.make_unique_var_name = make_unique_var_name # maps subst rule (args, bodies) to (names, original_name) - self.subst_rule_registry = dict( - (self._get_subst_rule_key(rule.arguments, rule.expression), - (name, rule.arguments, rule.expression)) - for name, rule in six.iteritems(old_subst_rules)) + self.subst_rule_registry = { + self._get_subst_rule_key(rule.arguments, rule.expression): + (name, rule.arguments, rule.expression) + for name, rule in old_subst_rules.items()} # maps subst rule (args, bodies) to a list of old names, # which doubles as (a) a histogram of uses and (b) a way @@ -1049,8 +1076,7 @@ class SubstitutionRuleMappingContext(object): used_names = set() - for key, (name, args, body) in six.iteritems( - self.subst_rule_registry): + for key, (name, args, body) in self.subst_rule_registry.items(): orig_names = self.subst_rule_old_names.get(key, []) # If no orig_names are found, then this particular @@ -1077,7 +1103,7 @@ class SubstitutionRuleMappingContext(object): subst_renamer = SubstitutionRuleRenamer(renames) renamed_result = {} - for name, rule in six.iteritems(result): + for name, rule in result.items(): renamed_result[name] = rule.copy( expression=subst_renamer(rule.expression)) @@ -1122,7 +1148,7 @@ class RuleAwareIdentityMapper(IdentityMapper): name, tag = parse_tagged_name(expr.function) if name not in self.rule_mapping_context.old_subst_rules: - return super(RuleAwareIdentityMapper, self).map_call(expr, expn_state) + return super().map_call(expr, expn_state) else: return self.map_substitution(name, tag, self.rec( expr.parameters, expn_state), expn_state) @@ -1135,9 +1161,9 @@ class RuleAwareIdentityMapper(IdentityMapper): from pymbolic.mapper.substitutor import make_subst_func arg_subst_map = SubstitutionMapper(make_subst_func(arg_context)) - return dict( - (formal_arg_name, arg_subst_map(arg_value)) - for formal_arg_name, arg_value in zip(arg_names, arguments)) + return { + formal_arg_name: arg_subst_map(arg_value) + for formal_arg_name, arg_value in zip(arg_names, arguments)} def map_substitution(self, name, tag, arguments, expn_state): rule = self.rule_mapping_context.old_subst_rules[name] @@ -1189,7 +1215,8 @@ class RuleAwareIdentityMapper(IdentityMapper): # may perform tasks entirely unrelated to subst rules, so # we must map assignees, too. 
self.map_instruction(kernel, - insn.with_transformed_expressions(self, kernel, insn)) + insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn))) for insn in kernel.instructions] return kernel.copy(instructions=new_insns) @@ -1197,7 +1224,7 @@ class RuleAwareIdentityMapper(IdentityMapper): class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, subst_func, within): - super(RuleAwareSubstitutionMapper, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.subst_func = subst_func self.within = within @@ -1206,20 +1233,20 @@ class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper): if (expr.name in expn_state.arg_context or not self.within( expn_state.kernel, expn_state.instruction, expn_state.stack)): - return super(RuleAwareSubstitutionMapper, self).map_variable( + return super().map_variable( expr, expn_state) result = self.subst_func(expr) if result is not None: return result else: - return super(RuleAwareSubstitutionMapper, self).map_variable( + return super().map_variable( expr, expn_state) class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, rules, within): - super(RuleAwareSubstitutionRuleExpander, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.rules = rules self.within = within @@ -1252,7 +1279,7 @@ class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper): else: # do not expand - return super(RuleAwareSubstitutionRuleExpander, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # }}} @@ -1433,7 +1460,7 @@ class LoopyParser(ParserBase): return SubArrayRef(swept_inames, subscript) else: - return super(LoopyParser, self).parse_prefix(pstate) + return super().parse_prefix(pstate) def parse_postfix(self, pstate, min_precedence, left_exp): from pymbolic.parser import _PREC_CALL, _closebracket @@ -1493,7 +1520,7 @@ class ArrayAccessFinder(CombineMapper): if self.tgt_vector_name is None \ or expr.aggregate.name == self.tgt_vector_name: - return set([expr]) | self.rec(expr.index) + return {expr} | self.rec(expr.index) else: return CombineMapper.map_subscript(self, expr) @@ -1571,7 +1598,7 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): self.zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space)) context = {} - for name, (dt, pos) in six.iteritems(space.get_var_dict()): + for name, (dt, pos) in space.get_var_dict().items(): if dt == dim_type.set: dt = dim_type.in_ @@ -1583,7 +1610,7 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): self.pw_zero = isl.PwAff.from_aff(self.zero) - super(PwAffEvaluationMapper, self).__init__(context) + super().__init__(context) def map_constant(self, expr): if isinstance(expr, np.integer): @@ -1624,6 +1651,10 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): return num.mod_val(denom) + def map_literal(self, expr): + raise TypeError("literal '%s' not supported " + "for as-pwaff evaluation" % expr) + def aff_from_expr(space, expr, vars_to_zero=None): if vars_to_zero is None: @@ -1682,6 +1713,56 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # }}} +# {{{ (pw_)?qpoly_from_expr + +class PwQPolyEvaluationMapper(EvaluationMapperBase): + def __init__(self, space, vars_to_zero): + zero_qpoly = isl.QPolynomial.zero_on_domain(space) + + context = {} + for name, (dt, pos) in space.get_var_dict().items(): + if dt == dim_type.set: + dt = 
dim_type.in_ + + context[name] = isl.PwQPolynomial.from_qpolynomial( + isl.QPolynomial.var_on_domain(space, dt, pos)) + + for v in vars_to_zero: + context[v] = zero_qpoly + + self.pw_zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) + + super().__init__(context) + + def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + + return self.pw_zero + expr + + def map_quotient(self, expr): + raise TypeError("true division in '%s' not supported " + "for as-pwqpoly evaluation" % expr) + + +def pw_qpolynomial_from_expr(space, expr, vars_to_zero=frozenset()): + return PwQPolyEvaluationMapper(space, vars_to_zero)(expr) + + +def qpolynomial_from_expr(space, expr): + pw_qpoly = pw_qpolynomial_from_expr(space, expr).coalesce() + + pieces = pw_qpoly.get_pieces() + if len(pieces) == 1: + (s, qpoly), = pieces + return qpoly + else: + raise RuntimeError("expression '%s' could not be converted to a " + "non-piecewise quasi-polynomial expression" % expr) + +# }}} + + # {{{ simplify using aff # FIXME: redundant with simplify_via_aff @@ -1716,9 +1797,8 @@ def _term_to_expr(space, term): result = result*Variable(space.get_dim_name(dt, i))**exp for i in range(term.dim(dim_type.div)): - raise NotImplementedError("divs in terms") - # FIXME print the qpoly, match the semantics - result += aff_to_expr(term.get_div(i)) + exp = term.get_exp(dim_type.div, i) + result *= (aff_to_expr(term.get_div(i))**exp) return result @@ -1751,6 +1831,101 @@ def constraint_to_cond_expr(cns): # }}} + + +# {{{ isl_set_from_expr + +class ConditionExpressionToBooleanOpsExpression(IdentityMapper): + """ + Mapper to convert expressions into a composition of boolean operation nodes + according to C-semantics. + + For example: + - ``i`` becomes ``i != 0`` + - ``i>10 and j`` becomes ``i>10 and j!=0`` + """ + + def map_comparison(self, expr): + return expr + + def _get_expr_neq_0(self, expr): + return p.Comparison(expr, "!=", 0) + + map_variable = _get_expr_neq_0 + map_subscript = _get_expr_neq_0 + map_sum = _get_expr_neq_0 + map_product = _get_expr_neq_0 + map_constant = _get_expr_neq_0 + map_call = _get_expr_neq_0 + map_power = _get_expr_neq_0 + + +class AffineConditionToISLSetMapper(IdentityMapper): + """ + Mapper to convert a condition :class:`~pymbolic.primitives.Expression` to a + :class:`~islpy.Set`. + """ + + def __init__(self, space): + self.space = space + super().__init__() + + def map_comparison(self, expr): + if expr.operator == "!=": + return self.rec(p.LogicalNot(p.Comparison(expr.left, "==", expr.right))) + + left_aff = guarded_aff_from_expr(self.space, expr.left) + right_aff = guarded_aff_from_expr(self.space, expr.right) + + if expr.operator == "==": + cnst = isl.Constraint.equality_from_aff(left_aff-right_aff) + elif expr.operator == ">=": + cnst = isl.Constraint.inequality_from_aff(left_aff-right_aff) + elif expr.operator == ">": + cnst = isl.Constraint.inequality_from_aff(left_aff-right_aff-1) + elif expr.operator == "<=": + cnst = isl.Constraint.inequality_from_aff(right_aff-left_aff) + elif expr.operator == "<": + cnst = isl.Constraint.inequality_from_aff(right_aff-left_aff-1) + else: + assert False + + return isl.Set.universe(self.space).add_constraint(cnst) + + def _map_logical_reduce(self, expr, f): + """ + :arg f: Reduction callable. 
+ """ + sets = [self.rec(child) for child in expr.children] + return reduce(f, sets) + + def map_logical_or(self, expr): + import operator + return self._map_logical_reduce(expr, operator.or_) + + def map_logical_and(self, expr): + import operator + return self._map_logical_reduce(expr, operator.and_) + + def map_logical_not(self, expr): + set_ = self.rec(expr.child) + return set_.complement() + + +def isl_set_from_expr(space, expr): + """ + :arg expr: An instance of :class:`pymbolic.primitives.Expression` whose + boolean value is evaluated according to C-semantics. + """ + mapper = AffineConditionToISLSetMapper(space) + expr = ConditionExpressionToBooleanOpsExpression()(expr) + set_ = mapper(expr) + assert isinstance(set_, isl.Set) + + return set_ + +# }}} + + # {{{ set_to_cond_expr def basic_set_to_cond_expr(isl_basicset): @@ -1881,9 +2056,11 @@ class UnableToDetermineAccessRange(Exception): pass -def get_access_range(domain, subscript, assumptions, shape=None, +def get_access_range(domain, subscript, assumptions=None, shape=None, allowed_constant_names=None): """ + :arg assumptions: An instance of :class:`islpy.BasicSet` or *None*. *None* + is equivalent to the universal set over *domain*'s space. :arg shape: if not *None*, indicates that it is desired to return an overestimate of the access range based on the shape if a precise range cannot be determined. @@ -1891,10 +2068,11 @@ def get_access_range(domain, subscript, assumptions, shape=None, permitted in the access range expressions. Names that are already parameters of *domain* may be repeated without ill effects. """ - domain, assumptions = isl.align_two(domain, - assumptions) - domain = domain & assumptions - del assumptions + if assumptions is not None: + domain, assumptions = isl.align_two(domain, + assumptions) + domain = domain & assumptions + del assumptions dims = len(subscript) @@ -1908,9 +2086,9 @@ def get_access_range(domain, subscript, assumptions, shape=None, access_map = isl.Set.from_basic_set(access_map) if allowed_constant_names is not None: - allowed_constant_names = set(allowed_constant_names) - set( + allowed_constant_names = set(allowed_constant_names) - { access_map.get_dim_name(dim_type.param, i) - for i in range(access_map.dim(dim_type.param))) + for i in range(access_map.dim(dim_type.param))} par_base = access_map.dim(dim_type.param) access_map = access_map.insert_dims(dim_type.param, par_base, @@ -2045,11 +2223,11 @@ class BatchedAccessRangeMapper(WalkMapper): return self.rec(expr.child, inames) def map_sub_array_ref(self, expr, inames): - total_inames = inames | set([iname.name for iname in expr.swept_inames]) + total_inames = inames | {iname.name for iname in expr.swept_inames} return self.rec(expr.subscript, total_inames) -class AccessRangeMapper(object): +class AccessRangeMapper: """**IMPORTANT** Using this class *will likely* lead to performance bottlenecks. 
@@ -2084,7 +2262,7 @@ class AccessRangeMapper(object): # {{{ check if access ranges overlap -class AccessRangeOverlapChecker(object): +class AccessRangeOverlapChecker: """Used for checking for overlap between access ranges of instructions.""" def __init__(self, kernel): @@ -2108,7 +2286,7 @@ class AccessRangeOverlapChecker(object): for expr in exprs: arm(expr, self.kernel.insn_inames(insn)) - for name, arange in six.iteritems(arm.access_ranges): + for name, arange in arm.access_ranges.items(): if arm.bad_subscripts[name]: aranges[name] = True continue diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 20220d41838783389da955f78773ca18b67d04c6..a05bc66a22e8a96919c2dd0af5cc1e4c1166e710 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -1,6 +1,5 @@ """Base target interface.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -30,7 +29,6 @@ __doc__ = """ .. autoclass:: TargetBase .. autoclass:: ASTBuilderBase - .. autoclass:: CFamilyTarget .. autoclass:: CTarget .. autoclass:: ExecutableCTarget @@ -44,7 +42,7 @@ """ -class TargetBase(object): +class TargetBase: """Base class for all targets, i.e. different combinations of code that loopy can generate. @@ -142,7 +140,7 @@ class TargetBase(object): raise NotImplementedError() -class ASTBuilderBase(object): +class ASTBuilderBase: """An interface for generating (host or device) ASTs. """ @@ -249,14 +247,14 @@ class ASTBuilderBase(object): # {{{ dummy host ast builder -class _DummyExpressionToCodeMapper(object): +class _DummyExpressionToCodeMapper: def rec(self, expr, prec, type_context=None, needed_dtype=None): return "" __call__ = rec -class _DummyASTBlock(object): +class _DummyASTBlock: def __init__(self, arg): self.contents = [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c8aa041da632b7d6896376761117b416dd56eade..37997d7abeb6b22a304f6160af28a214eaf4c50d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1,6 +1,5 @@ """Plain C target and base for other C-family languages.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -24,8 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six import numpy as np # noqa from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError @@ -39,10 +36,20 @@ import pymbolic.primitives as p from pytools import memoize_method +__doc__ = """ +.. currentmodule:: loopy.target.c + +.. autoclass:: POD + +.. autoclass:: ScopingBlock + +.. 
automodule:: loopy.target.c.codegen.expression +""" + # {{{ dtype registry wrapper -class DTypeRegistryWrapper(object): +class DTypeRegistryWrapper: def __init__(self, wrapped_registry): self.wrapped_registry = wrapped_registry @@ -82,6 +89,11 @@ class DTypeRegistryWrapper(object): def c99_preamble_generator(preamble_info): if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes): yield("10_stdint", "#include <stdint.h>") + if any(dtype.numpy_dtype == np.dtype("bool") + for dtype in preamble_info.seen_dtypes): + yield("10_stdbool", "#include <stdbool.h>") + if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes): + yield("10_complex", "#include <complex.h>") def _preamble_generator(preamble_info): @@ -153,9 +165,9 @@ """, } - c_funcs = set(func.c_name for func in preamble_info.seen_functions) + c_funcs = {func.c_name for func in preamble_info.seen_functions} - for func_name, func_body in six.iteritems(function_defs): + for func_name, func_body in function_defs.items(): if any((func_name + "_" + tpname) in c_funcs for tpname in integer_type_names): yield def_integer_types_macro @@ -204,7 +216,7 @@ class POD(Declarator): class ScopingBlock(Block): """A block that is mandatory for scoping and may not be simplified away - by :func:`loopy.codegen.results.merge_codegen_results`. + by :func:`loopy.codegen.result.merge_codegen_results`. """ @@ -248,8 +260,7 @@ def generate_linearized_array(array, value): assert array.offset == 0 - from pytools import indices_in_shape - for ituple in indices_in_shape(value.shape): + for ituple in np.ndindex(value.shape): i = sum(i_ax * strd_ax for i_ax, strd_ax in zip(ituple, strides)) data[i] = value[ituple] @@ -310,7 +321,7 @@ class ASTSubscriptCollector(CASTIdentityMapper): # {{{ lazy expression generation -class CExpression(object): +class CExpression: def __init__(self, to_code_mapper, expr): self.to_code_mapper = to_code_mapper self.expr = expr @@ -332,7 +343,7 @@ class CFamilyTarget(TargetBase): def __init__(self, fortran_abi=False): self.fortran_abi = fortran_abi - super(CFamilyTarget, self).__init__() + super().__init__() def split_kernel_at_global_barriers(self): return False @@ -425,7 +436,7 @@ class CMathCallable(ScalarCallable): for id in arg_id_to_dtype: if not -1 <= id <= 0: - raise LoopyError("%s can take only one argument." 
% name) + raise LoopyError(f"'{name}' can take only one argument.") if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the @@ -437,11 +448,11 @@ class CMathCallable(ScalarCallable): dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype - if dtype.kind in ('u', 'i'): + if dtype.kind in ("u", "i"): # ints and unsigned casted to float32 dtype = np.float32 - elif dtype.kind == 'c': - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): @@ -453,7 +464,7 @@ class CMathCallable(ScalarCallable): elif dtype == np.float128: # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" % (name, + raise LoopyTypeError("{} does not support type {}".format(name, dtype)) return ( @@ -530,20 +541,19 @@ class CFamilyASTBuilder(ASTBuilderBase): def symbol_manglers(self): return ( - super(CFamilyASTBuilder, self).symbol_manglers() + [ + super().symbol_manglers() + [ c_symbol_mangler ]) def preamble_generators(self): return ( - super(CFamilyASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _preamble_generator, ]) def function_id_in_knl_callable_mapper(self): return ( - super(CFamilyASTBuilder, - self).function_id_in_knl_callable_mapper() + [ + super().function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} @@ -577,7 +587,7 @@ class CFamilyASTBuilder(ASTBuilderBase): break if is_first_dev_prog: for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): if tv.address_space == AddressSpace.GLOBAL and ( @@ -671,7 +681,7 @@ class CFamilyASTBuilder(ASTBuilderBase): | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) @@ -734,7 +744,7 @@ class CFamilyASTBuilder(ASTBuilderBase): cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, - "(%s %s) (%s + %s)" % ( + "({} {}) ({} + {})".format( " ".join(cast_tp), cast_d, tv.base_storage, offset)) @@ -748,7 +758,7 @@ class CFamilyASTBuilder(ASTBuilderBase): ecm = self.get_expression_to_code_mapper(codegen_state) - for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): + for bs_name, bs_sizes in sorted(base_storage_sizes.items()): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( @@ -957,7 +967,7 @@ class CFamilyASTBuilder(ASTBuilderBase): in_knl_callable = codegen_state.callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( - in_knl_callable.name_in_target == 'loopy_make_tuple'): + in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) # takes "is_returned" to infer whether insn.assignees[0] is a part of @@ -1046,7 +1056,7 @@ class CFunctionDeclExtractor(CASTIdentityMapper): def map_function_decl_wrapper(self, node): self.decls.append(node.subdecl) - return super(CFunctionDeclExtractor, self)\ + return super()\ .map_function_decl_wrapper(node) @@ -1054,7 +1064,7 @@ def generate_header(kernel, codegen_result=None): """ :arg kernel: a :class:`loopy.LoopKernel` :arg 
codegen_result: an instance of :class:`loopy.CodeGenerationResult` - :returns: a list of AST nodes (which may have :func:`str` + :returns: a list of AST nodes (which may have :class:`str` called on them to produce a string) representing function declarations for the generated device functions. @@ -1062,7 +1072,7 @@ if not isinstance(kernel.target, CFamilyTarget): raise LoopyError( - 'Header generation for non C-based languages are not implemented') + "Header generation for non C-based languages are not implemented") if codegen_result is None: from loopy.codegen import generate_code_v2 @@ -1091,16 +1101,18 @@ class CTarget(CFamilyTarget): @memoize_method def get_dtype_registry(self): from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_registry_with_c99_stdint_types) + DTypeRegistry, fill_registry_with_c99_stdint_types, + fill_registry_with_c99_complex_types) result = DTypeRegistry() fill_registry_with_c99_stdint_types(result) + fill_registry_with_c99_complex_types(result) return DTypeRegistryWrapper(result) class CASTBuilder(CFamilyASTBuilder): def preamble_generators(self): return ( - super(CASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ c99_preamble_generator, ]) @@ -1115,7 +1127,7 @@ class ExecutableCTarget(CTarget): """ def __init__(self, compiler=None, fortran_abi=False): - super(ExecutableCTarget, self).__init__(fortran_abi=fortran_abi) + super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler self.compiler = compiler or CCompiler() diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index b6525b5d183c803955317a92ed10d8206eecba65..2031b3703f1e431c2b3a1979282af85e4b167edd 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ @@ -32,7 +30,6 @@ from pytools.py_codegen import (Indentation) from pytools.prefork import ExecError from codepy.toolchain import guess_toolchain, ToolchainGuessError, GCCToolchain from codepy.jit import compile_from_string -import six import ctypes import numpy as np @@ -49,12 +46,12 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def __init__(self): system_args = ["_lpy_c_kernels"] - super(CExecutionWrapperGenerator, self).__init__(system_args) + super().__init__(system_args) def python_dtype_str(self, dtype): if np.dtype(str(dtype)).isbuiltin: return "_lpy_np."+dtype.name - raise Exception('dtype: {0} not recognized'.format(dtype)) + raise Exception(f"dtype: {dtype} not recognized") # {{{ handle non numpy arguments @@ -110,7 +107,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): var("_lpy_expected_strides_%s" % i) for i in range(num_axes)) - gen("%s = %s.strides" % (strify(expected_strides), arg.name)) + gen("{} = {}.strides".format(strify(expected_strides), arg.name)) # check strides if not skip_arg_checks: @@ -149,7 +146,7 @@ kernel, implemented_data_info): gen("for knl in _lpy_c_kernels:") with Indentation(gen): - gen('knl({args})'.format( + gen("knl({args})".format( args=", ".join(args))) # }}} @@ -163,7 +160,7 @@ if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join(f'"{arg.name}": {arg.name}' for arg in 
implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in @@ -191,7 +188,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): return arg.name -class CCompiler(object): +class CCompiler: """ The compiler module handles invocation of compilers to generate a shared lib using codepy, which can subsequently be loaded via ctypes. @@ -212,10 +209,10 @@ class CCompiler(object): """ def __init__(self, toolchain=None, - cc='gcc', cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), libraries=[], + cc="gcc", cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='c'): + source_suffix="c"): # try to get a default toolchain # or subclass supplied version if available self.toolchain = toolchain @@ -226,36 +223,36 @@ class CCompiler(object): # missing compiler python was built with (likely, Conda) # use a default GCCToolchain logger = logging.getLogger(__name__) - logger.warn('Default toolchain guessed from python config ' - 'not found, replacing with default GCCToolchain.') + logger.warn("Default toolchain guessed from python config " + "not found, replacing with default GCCToolchain.") # this is ugly, but I'm not sure there's a clean way to copy the # default args self.toolchain = GCCToolchain( - cc='gcc', - cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), + cc="gcc", + cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], library_dirs=[], defines=[], undefines=[], - source_suffix='c', - so_ext='.so', - o_ext='.o', + source_suffix="c", + so_ext=".so", + o_ext=".o", include_dirs=[]) if toolchain is None: # copy in all differing values - diff = {'cc': cc, - 'cflags': cflags, - 'ldflags': ldflags, - 'libraries': libraries, - 'include_dirs': include_dirs, - 'library_dirs': library_dirs, - 'defines': defines} + diff = {"cc": cc, + "cflags": cflags, + "ldflags": ldflags, + "libraries": libraries, + "include_dirs": include_dirs, + "library_dirs": library_dirs, + "defines": defines} # filter empty and those equal to toolchain defaults - diff = dict((k, v) for k, v in six.iteritems(diff) + diff = {k: v for k, v in diff.items() if v and (not hasattr(self.toolchain, k) or - getattr(self.toolchain, k) != v)) + getattr(self.toolchain, k) != v)} self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") self.source_suffix = source_suffix @@ -268,7 +265,7 @@ class CCompiler(object): debug_recompile=True): """Compile code, build and load shared library.""" logger.debug(code) - c_fname = self._tempname('code.' + self.source_suffix) + c_fname = self._tempname("code." 
+ self.source_suffix) # build object _, mod_name, ext_file, recompiled = \ @@ -277,9 +274,9 @@ debug_recompile, False) if recompiled: - logger.debug('Kernel {0} compiled from source'.format(name)) + logger.debug(f"Kernel {name} compiled from source") else: - logger.debug('Kernel {0} retrieved from cache'.format(name)) + logger.debug(f"Kernel {name} retrieved from cache") # and return compiled return ctypes.CDLL(ext_file) @@ -289,18 +286,18 @@ class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" def __init__(self, toolchain=None, - cc='g++', cflags='-std=c++98 -O3 -fPIC'.split(), + cc="g++", cflags="-std=c++98 -O3 -fPIC".split(), ldflags=[], libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='cpp'): + source_suffix="cpp"): - super(CPlusPlusCompiler, self).__init__( + super().__init__( toolchain=toolchain, cc=cc, cflags=cflags, ldflags=ldflags, libraries=libraries, include_dirs=include_dirs, library_dirs=library_dirs, defines=defines, source_suffix=source_suffix) -class IDIToCDLL(object): +class IDIToCDLL: """ A utility class that extracts argument and return type info from a :class:`ImplementedDataInfo` in order to create a :class:`ctypes.CDLL` @@ -323,14 +320,14 @@ def _dtype_to_ctype(self, dtype, pointer=False): """Map NumPy dtype to equivalent ctypes type.""" typename = self.registry.dtype_to_ctype(dtype) - typename = {'unsigned': 'uint'}.get(typename, typename) - basetype = getattr(ctypes, 'c_' + typename) + typename = {"unsigned": "uint"}.get(typename, typename) + basetype = getattr(ctypes, "c_" + typename) if pointer: return ctypes.POINTER(basetype) return basetype -class CompiledCKernel(object): +class CompiledCKernel: """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a @@ -360,7 +357,7 @@ """Execute kernel with given args mapped to ctypes equivalents.""" args_ = [] for arg, arg_t in zip(args, self._fn.argtypes): - if hasattr(arg, 'ctypes'): + if hasattr(arg, "ctypes"): if arg.size == 0: # TODO eliminate unused arguments from kernel arg_ = arg_t(0.0) @@ -389,12 +386,15 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(program) + super().__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) + def get_wrapper_generator(self): + return CExecutionWrapperGenerator() + @memoize_method def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): program = self.get_typed_and_scheduled_program(arg_to_dtype_set) @@ -404,7 +404,7 @@ class CKernelExecutor(KernelExecutorBase): dev_code = codegen_result.device_code() host_code = codegen_result.host_code() - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) if self.program.root_kernel.options.write_cl: output = all_code @@ -421,7 +421,7 @@ from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] for dp in codegen_result.device_programs: diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 
b0bc187ebe71c2e9751ce95abe0050b5c06d6f26..046dfa455eb0b7bcf7015f17758650941f4e1ce6 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. """ -from six.moves import range - import numpy as np from pymbolic.mapper import RecursiveMapper, IdentityMapper @@ -44,11 +40,23 @@ from loopy.type_inference import TypeInferenceMapper from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType +from loopy.target.c import CExpression + + +__doc__ = """ +.. currentmodule:: loopy.target.c.codegen.expression + +.. autoclass:: ExpressionToCExpressionMapper +""" # {{{ Loopy expression to C expression mapper class ExpressionToCExpressionMapper(IdentityMapper): + """ + Mapper that converts a loopy-semantic expression to a C-semantic expression + with typecasts, appropriate arithmetic semantic mapping, etc. + """ def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): self.kernel = codegen_state.kernel self.codegen_state = codegen_state @@ -113,7 +121,6 @@ class ExpressionToCExpressionMapper(IdentityMapper): prec = PREC_NONE assert prec == PREC_NONE - from loopy.target.c import CExpression return CExpression( self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), self.rec(expr, type_context, needed_dtype)) @@ -127,7 +134,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): if expr.name in self.codegen_state.var_subst_map: if self.kernel.options.annotate_inames: return var( - "/* %s */ %s" % ( + "/* {} */ {}".format( expr.name, self.rec(self.codegen_state.var_subst_map[expr.name], type_context))) @@ -173,7 +180,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_subscript(self, expr, type_context): def base_impl(expr, type_context): - return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] + return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] def make_var(name): from loopy import TaggedVariable @@ -221,7 +228,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): base_access = var("read_imagef")( var(ary.name), var("loopy_sampler"), - var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i'))) + var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, "i"))) if ary.dtype.numpy_dtype == np.float32: return base_access.attr("x") @@ -236,8 +243,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: if ( - (isinstance(ary, (ConstantArg, ArrayArg)) or - (isinstance(ary, TemporaryVariable) and ary.base_storage))): + isinstance(ary, (ConstantArg, ArrayArg)) or + (isinstance(ary, TemporaryVariable) and ary.base_storage)): # unsubscripted global args are pointers result = self.make_subscript( ary, @@ -255,7 +262,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary, make_var(access_info.array_name), simplify_using_aff( - self.kernel, self.rec(subscript, 'i'))) + self.kernel, self.rec(subscript, "i"))) if access_info.vector_index is not None: return self.codegen_state.ast_builder.add_vector_access( @@ -290,7 +297,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): return self.make_subscript( arg, var(expr.aggregate.name), - self.rec(offset + expr.index, 'i')) + self.rec(offset + expr.index, "i")) elif expr.aggregate.name in self.kernel.temporary_variables: raise 
RuntimeError("linear indexing is not supported on temporaries: %s" @@ -323,7 +330,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.codegen import SeenFunction self.codegen_state.seen_functions.add( SeenFunction( - name, "%s_%s" % (name, suffix), + name, f"{name}_{suffix}", (result_dtype, result_dtype))) if den_nonneg: @@ -333,14 +340,14 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.rec(expr.denominator, type_context)) else: seen_func("%s_pos_b" % base_func_name) - return var("%s_pos_b_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + return var(f"{base_func_name}_pos_b_{suffix}")( + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) else: seen_func(base_func_name) - return var("%s_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + return var(f"{base_func_name}_{suffix}")( + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) def map_floor_div(self, expr, type_context): import operator @@ -605,8 +612,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): if not self.allow_complex: return base_impl(expr, type_context) - n_complex = 'c' == n_dtype.kind - d_complex = 'c' == d_dtype.kind + n_complex = "c" == n_dtype.kind + d_complex = "c" == d_dtype.kind tgt_dtype = self.infer_type(expr) @@ -721,7 +728,7 @@ class CExpressionToCodeMapper(RecursiveMapper): func = self.rec(expr.function, PREC_CALL+1) return self.parenthesize_if_needed( - "%s(%s)" % ( + "{}({})".format( func, self.join_rec(", ", expr.parameters, PREC_NONE)), enclosing_prec, PREC_CALL) @@ -737,13 +744,13 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_lookup(self, expr, enclosing_prec): return self.parenthesize_if_needed( - "%s.%s" % ( + "{}.{}".format( self.rec(expr.aggregate, PREC_CALL), expr.name), enclosing_prec, PREC_CALL) def map_subscript(self, expr, enclosing_prec): return self.parenthesize_if_needed( - "%s[%s]" % ( + "{}[{}]".format( self.rec(expr.aggregate, PREC_CALL+1), self.rec(expr.index, PREC_NONE)), enclosing_prec, PREC_CALL) @@ -755,7 +762,7 @@ class CExpressionToCodeMapper(RecursiveMapper): result = self.rec(children.pop(), PREC_NONE) while children: - result = "%s(%s, %s)" % (what, + result = "{}({}, {})".format(what, self.rec(children.pop(), PREC_NONE), result) @@ -765,7 +772,7 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_if(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - return "(%s ? %s : %s)" % ( + return "({} ? {} : {})".format( self.rec(expr.condition, PREC_NONE), self.rec(expr.then, PREC_NONE), self.rec(expr.else_, PREC_NONE), @@ -775,7 +782,7 @@ class CExpressionToCodeMapper(RecursiveMapper): from pymbolic.mapper.stringifier import PREC_COMPARISON return self.parenthesize_if_needed( - "%s %s %s" % ( + "{} {} {}".format( self.rec(expr.left, PREC_COMPARISON), expr.operator, self.rec(expr.right, PREC_COMPARISON)), @@ -860,7 +867,7 @@ class CExpressionToCodeMapper(RecursiveMapper): force_parens_around=self.multiplicative_primitives) return self.parenthesize_if_needed( - "%s %s %s" % ( + "{} {} {}".format( # Space is necessary--otherwise '/*' # (i.e. divide-dererference) becomes # start-of-comment in C. 
@@ -879,7 +886,7 @@ class CExpressionToCodeMapper(RecursiveMapper): return self._map_division_operator("%", expr, enclosing_prec) def map_power(self, expr, enclosing_prec): - return "pow(%s, %s)" % ( + return "pow({}, {})".format( self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372..d1f993daecc03947d9e6e3e60d2a5145ecbf3786 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372 +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d713e06c08b6a16962043103e9bde440011e5359..83697e60161a12a38fbc240d086e6b6bce4876b1 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -1,6 +1,5 @@ """CUDA target independent of PyCUDA.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -59,18 +58,18 @@ def _create_vector_types(): vec.type_to_scalar_and_count = {} for base_name, base_type, counts in [ - ('char', np.int8, [1, 2, 3, 4]), - ('uchar', np.uint8, [1, 2, 3, 4]), - ('short', np.int16, [1, 2, 3, 4]), - ('ushort', np.uint16, [1, 2, 3, 4]), - ('int', np.int32, [1, 2, 3, 4]), - ('uint', np.uint32, [1, 2, 3, 4]), - ('long', long_dtype, [1, 2, 3, 4]), - ('ulong', ulong_dtype, [1, 2, 3, 4]), - ('longlong', np.int64, [1, 2]), - ('ulonglong', np.uint64, [1, 2]), - ('float', np.float32, [1, 2, 3, 4]), - ('double', np.float64, [1, 2]), + ("char", np.int8, [1, 2, 3, 4]), + ("uchar", np.uint8, [1, 2, 3, 4]), + ("short", np.int16, [1, 2, 3, 4]), + ("ushort", np.uint16, [1, 2, 3, 4]), + ("int", np.int32, [1, 2, 3, 4]), + ("uint", np.uint32, [1, 2, 3, 4]), + ("long", long_dtype, [1, 2, 3, 4]), + ("ulong", ulong_dtype, [1, 2, 3, 4]), + ("longlong", np.int64, [1, 2]), + ("ulonglong", np.uint64, [1, 2]), + ("float", np.float32, [1, 2, 3, 4]), + ("double", np.float64, [1, 2]), ]: for count in counts: name = "%s%d" % (base_name, count) @@ -171,8 +170,8 @@ class CudaCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -185,7 +184,7 @@ class CudaCallable(ScalarCallable): def scope_cuda_functions(target, identifier): - if identifier in set(["dot"]) | set( + if identifier in {"dot"} | set( _CUDA_SPECIFIC_FUNCTIONS): return CudaCallable(name=identifier) @@ -209,12 +208,12 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): raise LoopyError("unexpected index type") def map_group_hw_index(self, expr, type_context): - return var("((%s) blockIdx.%s)" % ( + return var("(({}) blockIdx.{})".format( self._get_index_ctype(self.kernel), self._GRID_AXES[expr.axis])) def map_local_hw_index(self, expr, type_context): - return var("((%s) threadIdx.%s)" % ( + return var("(({}) threadIdx.{})".format( self._get_index_ctype(self.kernel), self._GRID_AXES[expr.axis])) @@ -233,7 +232,7 @@ class CudaTarget(CFamilyTarget): """ self.extern_c = extern_c - super(CudaTarget, self).__init__() + super().__init__() def split_kernel_at_global_barriers(self): return True @@ -313,7 +312,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def function_id_in_knl_callable_mapper(self): return [scope_cuda_functions] + ( - super(CUDACASTBuilder, 
self).function_id_in_knl_callable_mapper()) + super().function_id_in_knl_callable_mapper()) # }}} @@ -321,7 +320,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def get_function_declaration(self, codegen_state, codegen_result, schedule_index): - fdecl = super(CUDACASTBuilder, self).get_function_declaration( + fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -356,7 +355,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def preamble_generators(self): return ( - super(CUDACASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ cuda_preamble_generator]) # }}} @@ -456,7 +455,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): lhs_expr_code = ecm(lhs_expr) rhs_expr_code = ecm(new_rhs_expr) - return Statement("atomicAdd(&{0}, {1})".format( + return Statement("atomicAdd(&{}, {})".format( lhs_expr_code, rhs_expr_code)) else: from cgen import Block, DoWhile, Assign diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 96f6e065c9cabccbe48071d6d4be10a059813cf3..1a98ffdc90eec2db9e83f014c07b3da1b0c8108e 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2012-17 Andreas Kloeckner, Nick Curtis" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six import numpy as np from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError @@ -51,7 +48,7 @@ class _PackingInfo(ImmutableRecord): """ -class SeparateArrayPackingController(object): +class SeparateArrayPackingController: """For argument arrays with axes tagged to be implemented as separate arrays, this class provides preprocessing of the incoming arguments so that all sub-arrays may be passed in one object array (under the original, @@ -91,7 +88,7 @@ class SeparateArrayPackingController(object): kernel_kwargs = kernel_kwargs.copy() - for packing_info in six.itervalues(self.packing_info): + for packing_info in self.packing_info.values(): arg_name = packing_info.name if packing_info.name in kernel_kwargs: arg = kernel_kwargs[arg_name] @@ -106,7 +103,7 @@ class SeparateArrayPackingController(object): if not self.packing_info: return outputs - for packing_info in six.itervalues(self.packing_info): + for packing_info in self.packing_info.values(): if not packing_info.is_written: continue @@ -123,7 +120,7 @@ class SeparateArrayPackingController(object): # {{{ ExecutionWrapperGeneratorBase -class ExecutionWrapperGeneratorBase(object): +class ExecutionWrapperGeneratorBase: """ A set of common methods for generating a wrapper for execution @@ -195,12 +192,12 @@ class ExecutionWrapperGeneratorBase(object): gen("# {{{ find integer arguments from shapes") gen("") - for iarg_name, sources in six.iteritems(iarg_to_sources): + for iarg_name, sources in iarg_to_sources.items(): gen("if %s is None:" % iarg_name) with Indentation(gen): if_stmt = "if" for arg_name, value_expr in sources: - gen("%s %s is not None:" % (if_stmt, arg_name)) + gen(f"{if_stmt} {arg_name} is not None:") with Indentation(gen): gen("%s = %s" % (iarg_name, StringifyMapper()(value_expr))) @@ -236,7 +233,7 @@ class ExecutionWrapperGeneratorBase(object): gen("else:") with Indentation(gen): if not options.no_numpy: - gen("_lpy_offset = getattr(%s, \"offset\", 0)" + gen('_lpy_offset = getattr(%s, "offset", 0)' % impl_array_name) else: gen("_lpy_offset = %s.offset" % impl_array_name) @@ -248,7 +245,7 @@ 
class ExecutionWrapperGeneratorBase(object): % (arg.name, base_arg.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - "not divisible by its dtype itemsize\"" + 'not divisible by its dtype itemsize"' % impl_array_name) gen("del _lpy_remdr") else: @@ -283,7 +280,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise RuntimeError(\"required stride '%s' for " "argument '%s' not given or deducible from " - "passed array\")" + 'passed array")' % (arg.name, impl_array_name)) base_arg = program.impl_arg_to_arg[impl_array_name] @@ -294,7 +291,7 @@ class ExecutionWrapperGeneratorBase(object): base_arg.dtype.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' " - " is not divisible by its dtype itemsize\"" + ' is not divisible by its dtype itemsize"' % (stride_impl_axis, impl_array_name)) gen("del _lpy_remdr") else: @@ -326,7 +323,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise TypeError(\"value argument '%s' " "was not given and could not be automatically " - "determined\")" % arg.name) + 'determined")' % arg.name) gen("# }}}") gen("") @@ -411,7 +408,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if (is_written @@ -420,14 +417,14 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written image '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if is_written and arg.shape is None and not options.skip_arg_checks: gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written argument '%s' has " - "unknown shape and must be supplied\")" % arg.name) + 'unknown shape and must be supplied")' % arg.name) gen("") possibly_made_by_loopy = False @@ -470,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" + '(got: %%s, expected: %s)" %% %s.dtype)' % (arg.name, arg.dtype, arg.name)) # {{{ generate shape checking code @@ -491,7 +488,7 @@ class ExecutionWrapperGeneratorBase(object): shape_mismatch_msg = ( "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " + '(got: %%s, expected: %%s)" ' "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) @@ -530,8 +527,9 @@ class ExecutionWrapperGeneratorBase(object): shape = ["_lpy_shape_%d" % i for i in range(ndim)] strides = ["_lpy_stride_%d" % i for i in range(ndim)] - gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) - gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) + gen("({},) = {}.shape".format(", ".join(shape), arg.name)) + gen("({},) = {}.strides".format( + ", ".join(strides), arg.name)) gen("if not (%s):" % self.get_strides_check_expr( @@ -547,21 +545,21 @@ class ExecutionWrapperGeneratorBase(object): "if dim > 1)" % (arg.name, strify_tuple(sym_strides))) - gen("raise TypeError(\"strides mismatch on " + gen('raise TypeError("strides mismatch on ' "argument '%s' " "(after removing unit length dims, " - "got: %%s, expected: %%s)\" " + 'got: %%s, expected: %%s)" ' "%% (_lpy_got, _lpy_expected))" % arg.name) if not arg.allows_offset: - gen("if hasattr(%s, 'offset') and %s.offset:" % ( + gen("if hasattr({}, 'offset') and 
{}.offset:".format( arg.name, arg.name)) with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_program()." - "\")" % arg.name) + "default_offset=loopy.auto to make_kernel()." + '")' % arg.name) gen("") # }}} @@ -691,7 +689,7 @@ class _KernelInfo(ImmutableRecord): pass -class _Kernels(object): +class _Kernels: pass @@ -707,7 +705,7 @@ invoker_cache = WriteOncePersistentDict( # {{{ kernel executor -class KernelExecutorBase(object): +class KernelExecutorBase: """An object connecting a kernel to a :class:`pyopencl.Context` for execution. @@ -797,7 +795,7 @@ class KernelExecutorBase(object): impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} - for arg_name, val in six.iteritems(kwargs): + for arg_name, val in kwargs.items(): arg = impl_arg_to_arg.get(arg_name, None) if arg is None: @@ -812,7 +810,7 @@ class KernelExecutorBase(object): else: arg_to_dtype[arg_name] = dtype - return frozenset(six.iteritems(arg_to_dtype)) + return frozenset(arg_to_dtype.items()) # {{{ debugging aids @@ -833,7 +831,7 @@ class KernelExecutorBase(object): if arg_to_dtype is not None: arg_to_dtype = frozenset( - (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) + (k, process_dtype(v)) for k, v in arg_to_dtype.items()) kernel = self.get_typed_and_scheduled_program(arg_to_dtype) @@ -844,6 +842,9 @@ class KernelExecutorBase(object): def get_invoker_uncached(self, kernel, *args): raise NotImplementedError() + def get_wrapper_generator(self): + raise NotImplementedError() + def get_invoker(self, kernel, *args): from loopy import CACHING_ENABLED diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 812bf3a560b191bd61f5b86cb401983ad97467a2..6558ac0ec40ca39d7ae429edb5e401ade7d16958 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -1,6 +1,5 @@ """Target for Intel ISPC.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -93,7 +92,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): return expr else: - return super(ExprToISPCExprMapper, self).map_variable( + return super().map_variable( expr, type_context) def map_subscript(self, expr, type_context): @@ -117,7 +116,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): subscript, = access_info.subscripts result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + var("programIndex") + self.rec(lsize*subscript, "i")] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -125,7 +124,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): else: return result - return super(ExprToISPCExprMapper, self).map_subscript( + return super().map_subscript( expr, type_context) # }}} @@ -167,7 +166,7 @@ class ISPCTarget(CFamilyTarget): """ self.occa_mode = occa_mode - super(ISPCTarget, self).__init__() + super().__init__() host_program_name_suffix = "" device_program_name_suffix = "_inner" @@ -274,7 +273,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): result.append( ISPCLaunch( tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), - "%s(%s)" % ( + "{}({})".format( name, ", ".join(arg_names) ))) @@ -352,7 +351,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): dtype, is_written) def get_value_arg_decl(self, name, shape, dtype, is_written): - result = super(ISPCASTBuilder, self).get_value_arg_decl( + result = super().get_value_arg_decl( name, shape, dtype, is_written) from cgen import Reference, Const @@ 
-476,7 +475,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): "streaming_store(%s + %s, %s)" % ( access_info.array_name, - ecm(flattened_sum(new_terms), PREC_NONE, 'i'), + ecm(flattened_sum(new_terms), PREC_NONE, "i"), rhs_code)) # }}} diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 6946063ee04f52a4890344b4cbff9446bacb6923..2df81ec1f332be87d8ca361480a37b68b369b56f 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -1,6 +1,5 @@ """Python host AST builder for integration with PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -44,7 +43,7 @@ def _base_numba_preamble_generator(preamble_info): class NumbaBaseASTBuilder(PythonASTBuilderBase): def preamble_generators(self): return ( - super(NumbaBaseASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _base_numba_preamble_generator ]) @@ -72,7 +71,7 @@ class NumbaBaseASTBuilder(PythonASTBuilderBase): implemented_data_info = codegen_state.implemented_data_info return Statement( - "%s[%s, %s](%s)" % ( + "{}[{}, {}]({})".format( name, ecm(gsize, PREC_NONE), ecm(lsize, PREC_NONE), @@ -155,7 +154,7 @@ def _cuda_numba_preamble_generator(preamble_info): class NumbaCudaASTBuilder(NumbaBaseASTBuilder): def preamble_generators(self): return ( - super(NumbaCudaASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _cuda_numba_preamble_generator ]) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6b7ef1b886d0620ef4fdbb9ccb1d208bba43f14f..0cc93ca289d641fda488e08115df3834371aacb8 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -1,6 +1,5 @@ """OpenCL target independent of PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -48,7 +47,7 @@ class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper): return super(self.wrapped_registry.get_or_register_dtype( names, NumpyType(dtype.dtype))) - return super(DTypeRegistryWrapperWithAtomics, self).get_or_register_dtype( + return super().get_or_register_dtype( names, dtype) @@ -59,7 +58,7 @@ class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics): if isinstance(dtype, AtomicNumpyType): return "volatile " + self.wrapped_registry.dtype_to_ctype(dtype) else: - return super(DTypeRegistryWrapperWithCL1Atomics, self).dtype_to_ctype( + return super().dtype_to_ctype( dtype) # }}} @@ -81,16 +80,16 @@ def _create_vector_types(): counts = [2, 3, 4, 8, 16] for base_name, base_type in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ]: for count in counts: name = "%s%d" % (base_name, count) @@ -148,22 +147,22 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { } -VECTOR_LITERAL_FUNCS = dict( - ("make_%s%d" % (name, count), (name, dtype, count)) +VECTOR_LITERAL_FUNCS = { + "make_%s%d" % (name, count): (name, dtype, count) for name, dtype in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', 
np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ] for count in [2, 3, 4, 8, 16] - ) + } class OpenCLCallable(ScalarCallable): @@ -187,9 +186,9 @@ class OpenCLCallable(ScalarCallable): [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': - name = 'f'+name + if dtype.kind in ["u", "i", "f"]: + if dtype.kind == "f": + name = "f"+name dtype = NumpyType(dtype) return ( self.copy(name_in_target=name, @@ -243,8 +242,8 @@ class OpenCLCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -267,8 +266,8 @@ class OpenCLCallable(ScalarCallable): self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in - range(count)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in + range(count)} updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( NumpyType(dtype), count) @@ -289,7 +288,7 @@ def scope_opencl_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. """ - opencl_function_ids = set(["max", "min", "dot"]) | set( + opencl_function_ids = {"max", "min", "dot"} | set( _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) if identifier in opencl_function_ids: @@ -391,7 +390,7 @@ class OpenCLTarget(CFamilyTarget): for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using double-exchange for floating point--not yet supported). 
""" - super(OpenCLTarget, self).__init__() + super().__init__() if atomics_flavor is None: atomics_flavor = "cl1" @@ -443,19 +442,19 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): def function_id_in_knl_callable_mapper(self): return ( - [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + [scope_opencl_functions] + + super().function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( - super(OpenCLCASTBuilder, self).symbol_manglers() + [ + super().symbol_manglers() + [ opencl_symbol_mangler ]) def preamble_generators(self): return ( - super(OpenCLCASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ opencl_preamble_generator]) # }}} @@ -464,7 +463,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): def get_function_declaration(self, codegen_state, codegen_result, schedule_index): - fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( + fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -529,7 +528,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): mem_kind = mem_kind.upper() from cgen import Statement - return Statement("barrier(CLK_%s_MEM_FENCE)%s" % (mem_kind, comment)) + return Statement(f"barrier(CLK_{mem_kind}_MEM_FENCE){comment}") elif synchronization_kind == "global": raise LoopyError("OpenCL does not have global barriers") else: @@ -554,13 +553,13 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.kernel.data import AddressSpace if mem_address_space == AddressSpace.LOCAL: - return CLLocal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + return CLLocal(super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) elif mem_address_space == AddressSpace.PRIVATE: - return super(OpenCLCASTBuilder, self).get_array_arg_decl( + return super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written) elif mem_address_space == AddressSpace.GLOBAL: - return CLGlobal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + return CLGlobal(super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) else: raise ValueError("unexpected array argument scope: %s" @@ -627,8 +626,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.kernel.data import TemporaryVariable, AddressSpace ecm = codegen_state.expression_to_code_mapper.with_assignments( { - old_val_var: TemporaryVariable(old_val_var, lhs_dtype), - new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + old_val_var: TemporaryVariable(old_val_var, lhs_dtype, + shape=()), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype, + shape=()), }) lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) @@ -688,7 +689,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): old_val = "*(%s *) &" % ctype + old_val new_val = "*(%s *) &" % ctype + new_val - cast_str = "(%s %s *) " % (var_kind, ctype) + cast_str = f"({var_kind} {ctype} *) " return Block([ POD(self, NumpyType(lhs_dtype.dtype, target=self.target), diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c11c309614a78486635e330ed00aac46abc123fb..2008c92246daed4ae853177d351f43103cb73db3 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -1,9 +1,5 @@ """OpenCL target integrated with PyOpenCL.""" -from __future__ import division, absolute_import - -import sys - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -26,9 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import range - import numpy as np from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder @@ -55,7 +48,7 @@ def adjust_local_temp_var_storage(kernel, device): from loopy.kernel.data import AddressSpace lmem_size = cl_char.usable_local_mem_size(device) - for temp_var in six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): if temp_var.address_space != AddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) @@ -68,7 +61,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes - for tv in six.itervalues(kernel.temporary_variables) + for tv in kernel.temporary_variables.values() if tv.address_space == AddressSpace.LOCAL and tv.name != temp_var.name] @@ -236,7 +229,7 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: NumpyType( np.dtype(dtype.numpy_dtype.type(0).real))}), callables_table) @@ -255,16 +248,16 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: dtype}), callables_table) else: # function calls for floating parameters. numpy_dtype = dtype.numpy_dtype - if numpy_dtype.kind in ('u', 'i'): + if numpy_dtype.kind in ("u", "i"): dtype = dtype.copy(numpy_dtype=np.float32) - if name == 'abs': - name = 'fabs' + if name == "abs": + name = "fabs" return ( self.copy(name_in_target=name, arg_id_to_dtype={0: dtype, -1: dtype}), @@ -316,7 +309,7 @@ def pyopencl_preamble_generator(preamble_info): # {{{ pyopencl tools -class _LegacyTypeRegistryStub(object): +class _LegacyTypeRegistryStub: """Adapts legacy PyOpenCL type registry to be usable with PyOpenCLTarget.""" def get_or_register_dtype(self, names, dtype=None): @@ -338,6 +331,9 @@ class PyOpenCLTarget(OpenCLTarget): warnings) and support for complex numbers. """ + # FIXME make prefixes conform to naming rules + # (see Reference: Loopy’s Model of a Kernel) + host_program_name_prefix = "_lpy_host_" host_program_name_suffix = "" @@ -346,7 +342,7 @@ class PyOpenCLTarget(OpenCLTarget): # This ensures the dtype registry is populated. 
import pyopencl.tools # noqa - super(PyOpenCLTarget, self).__init__( + super().__init__( atomics_flavor=atomics_flavor) self.device = device @@ -359,7 +355,7 @@ class PyOpenCLTarget(OpenCLTarget): "pyopencl_module_name",) def __eq__(self, other): - if not super(PyOpenCLTarget, self).__eq__(other): + if not super().__eq__(other): return False if (self.device is None) != (other.device is None): @@ -367,20 +363,21 @@ class PyOpenCLTarget(OpenCLTarget): if self.device is not None: assert other.device is not None - return (self.device.persistent_unique_id - == other.device.persistent_unique_id) + return (self.device.hashable_model_and_version_identifier + == other.device.hashable_model_and_version_identifier) else: assert other.device is None return True def update_persistent_hash(self, key_hash, key_builder): - super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder) - key_builder.rec(key_hash, getattr(self.device, "persistent_unique_id", None)) + super().update_persistent_hash(key_hash, key_builder) + key_builder.rec(key_hash, getattr( + self.device, "hashable_model_and_version_identifier", None)) def __getstate__(self): dev_id = None if self.device is not None: - dev_id = self.device.persistent_unique_id + dev_id = self.device.hashable_model_and_version_identifier return { "device_id": dev_id, @@ -403,7 +400,7 @@ class PyOpenCLTarget(OpenCLTarget): dev for plat in cl.get_platforms() for dev in plat.get_devices() - if dev.persistent_unique_id == dev_id] + if dev.hashable_model_and_version_identifier == dev_id] if matches: self.device = matches[0] @@ -568,12 +565,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info): if idi.dtype.is_integral(): gen(Comment("cast to Python int to avoid trouble " "with struct packing or Boost.Python")) - if sys.version_info < (3,): - py_type = "long" - else: - py_type = "int" + py_type = "int" - gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name))) + gen(Assign(idi.name, f"{py_type}({idi.name})")) gen(Line()) if idi.dtype.is_composite(): @@ -692,7 +686,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): + ["wait_for=None", "allocator=None"]) from genpy import (For, Function, Suite, Import, ImportAs, Return, - FromImport, If, Assign, Line, Statement as S) + FromImport, Line, Statement as S) return Function( codegen_result.current_program(codegen_state).name, args, @@ -701,11 +695,6 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ImportAs("pyopencl", "_lpy_cl"), Import("pyopencl.tools"), Line(), - If("allocator is None", - Assign( - "allocator", - "_lpy_cl_tools.DeferredAllocator(queue.context)")), - Line(), ] + [ Line(), function_body, @@ -728,14 +717,14 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Assign, Comment, Line def alloc_nbytes(tv): - from six.moves import reduce + from functools import reduce from operator import mul return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from loopy.kernel.data import AddressSpace global_temporaries = sorted( - (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) + (tv for tv in codegen_state.kernel.temporary_variables.values() if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) @@ -780,6 +769,13 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Suite, Assign, Assert, Line, Comment from pymbolic.mapper.stringifier import PREC_NONE + import pyopencl.version as cl_ver + if cl_ver.VERSION < (2020, 2): + from warnings import warn + warn("Your kernel invocation will likely 
fail because your " + "version of PyOpenCL does not support allow_empty_ndrange. " + "Please upgrade to version 2020.2 or newer.") + # TODO: Generate finer-grained dependency structure return Suite([ Comment("{{{ enqueue %s" % name), @@ -791,7 +787,8 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): arry_arg_code, Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel(" "queue, _lpy_knl, " - "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" + "%(gsize)s, %(lsize)s, wait_for=wait_for, " + "g_times_l=True, allow_empty_ndrange=True)" % dict( pyopencl_module_name=self.target.pyopencl_module_name, gsize=ecm(gsize, prec=PREC_NONE, type_context="i"), @@ -820,83 +817,19 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): random123_function_id_to_in_knl_callable_mapper) return ( [pyopencl_function_id_to_in_knl_callable_mapper, - random123_function_id_to_in_knl_callable_mapper] + super( - PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + random123_function_id_to_in_knl_callable_mapper] + + super().function_id_in_knl_callable_mapper()) def preamble_generators(self): return ([ pyopencl_preamble_generator, - ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) + ] + super().preamble_generators()) # }}} # }}} -class NvidiaPyOpenCLTarget(PyOpenCLTarget): - def __init__(self, device, pyopencl_module_name="_lpy_cl", - atomics_flavor=None): - import pyopencl as cl - assert isinstance(device, cl.Device) - assert device.vendor == 'NVIDIA Corporation' - - super(NvidiaPyOpenCLTarget, self).__init__(device, - pyopencl_module_name, atomics_flavor) - - def preprocess(self, kernel): - from loopy import set_options - if self.device.compute_capability_major_nv >= 6: - build_options = ['-cl-nv-arch', 'sm_60'] + ( - kernel.options.cl_build_options) - kernel = set_options(kernel, cl_build_options=build_options) - return super(NvidiaPyOpenCLTarget, self).preprocess(kernel) - - def get_device_ast_builder(self): - # here we should have an if else condition - if self.device.compute_capability_major_nv >= 6: - return NvidiaPyOpenCLCASTBuilder(self) - else: - return super(NvidiaPyOpenCLTarget, self).get_device_ast_builder() - - -class NvidiaPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): - - from pymbolic.primitives import Sum - from cgen import Statement, Block, Assign - from loopy.target.c import POD - - if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype == np.float64: - # atomicAdd - if isinstance(rhs_expr, Sum): - - old_val_var = codegen_state.var_name_generator("loopy_old_val") - - from loopy.kernel.data import TemporaryVariable - ecm = codegen_state.expression_to_code_mapper.with_assignments( - { - old_val_var: TemporaryVariable(old_val_var, lhs_dtype), - }) - - new_rhs_expr = Sum(tuple(c for c in rhs_expr.children - if c != lhs_expr)) - lhs_expr_code = ecm(lhs_expr) - rhs_expr_code = ecm(new_rhs_expr) - - return Block([ - POD(self, NumpyType(lhs_dtype.dtype, target=self.target), - old_val_var), - Assign(old_val_var, lhs_expr_code), - Statement('asm volatile("atom.global.add.f64 %0, [%1], %2;" :' - '"=d"({0}) : "l"(&{1}) , "d"({2}))'.format( - old_val_var, lhs_expr_code, rhs_expr_code))]) - - return super(NvidiaPyOpenCLCASTBuilder, - self).emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) - - # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): 
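A note on the loopy/target/pyopencl.py hunks above: the generated invoker now
passes allow_empty_ndrange=True to pyopencl.enqueue_nd_range_kernel and warns
when PyOpenCL is older than 2020.2, the first release that accepts the flag.
The following minimal sketch, separate from this patch, shows the PyOpenCL
behavior being relied on; the kernel, buffer, and sizes are illustrative only.

import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
    __kernel void fill(__global float *a)
    { a[get_global_id(0)] = 1.0f; }
    """).build()

buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=4 * 16)
knl = prg.fill
knl.set_args(buf)

# A zero-size grid can arise when a parametric loop bound is 0 at run time.
# With allow_empty_ndrange (PyOpenCL >= 2020.2) the enqueue becomes a no-op
# instead of raising.
evt = cl.enqueue_nd_range_kernel(
        queue, knl, (0,), (1,), allow_empty_ndrange=True)
evt.wait()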
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index b7006575bb05561e29320f092935e7bb5dcab006..269a0ef0956fd106e284d3b1c7ae513e83a71234 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range, zip from pytools import memoize_method from pytools.py_codegen import Indentation @@ -50,14 +47,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # ignored if options.no_numpy "out_host=None" ] - super(PyOpenCLExecutionWrapperGenerator, self).__init__(system_args) + super().__init__(system_args) def python_dtype_str(self, dtype): import pyopencl.tools as cl_tools if dtype.isbuiltin: return "_lpy_np."+dtype.name else: - return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" + return ('_lpy_cl_tools.get_or_register_dtype("%s")' % cl_tools.dtype_to_ctype(dtype)) # {{{ handle non-numpy args @@ -65,6 +62,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def handle_non_numpy_arg(self, gen, arg): gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): + gen("# retain originally passed array") + gen(f"_lpy_{arg.name}_np_input = {arg.name}") gen("# synchronous, nothing to worry about") gen("%s = _lpy_cl_array.to_device(" "queue, %s, allocator=allocator)" @@ -73,16 +72,20 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("elif %s is not None:" % arg.name) with Indentation(gen): gen("_lpy_encountered_dev = True") + gen("_lpy_%s_np_input = None" % arg.name) + gen("else:") + with Indentation(gen): + gen("_lpy_%s_np_input = None" % arg.name) gen("") # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): """ - Handle allocation of non-specified arguements for pyopencl execution + Handle allocation of non-specified arguments for pyopencl execution """ from pymbolic import var @@ -142,7 +145,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def initialize_system_args(self, gen): """ - Initializes possibly empty system arguements + Initializes possibly empty system arguments """ gen("if allocator is None:") with Indentation(gen): @@ -184,7 +187,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if (issubclass(arg.arg_class, ArrayArg) and arg.base_name in ( program.root_kernel.get_written_variables())): - gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) + gen(f"{arg.name}.add_event(_lpy_evt)") # }}} @@ -201,23 +204,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): with Indentation(gen): gen("out_host = True") - gen("if out_host:") - with Indentation(gen): - gen("pass") # if no outputs (?!) 
- for arg in implemented_data_info: - if not issubclass(arg.arg_class, KernelArgument): - continue - - is_written = arg.base_name in ( - program.root_kernel.get_written_variables()) - if is_written: - gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) + for arg in implemented_data_info: + if not issubclass(arg.arg_class, KernelArgument): + continue + + is_written = (arg.base_name in + program.root_kernel.get_written_variables()) + if is_written: + np_name = "_lpy_%s_np_input" % arg.name + gen("if out_host or %s is not None:" % np_name) + with Indentation(gen): + gen("%s = %s.get(queue=queue, ary=%s)" + % (arg.name, arg.name, np_name)) gen("") if options.return_dict: gen("return _lpy_evt, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join(f'"{arg.name}": {arg.name}' for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in @@ -264,7 +268,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(program) + super().__init__(program) self.context = context @@ -277,6 +281,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) + def get_wrapper_generator(self): + return PyOpenCLExecutionWrapperGenerator() + @memoize_method def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): program = self.get_typed_and_scheduled_program(arg_to_dtype_set) @@ -321,7 +328,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): def __call__(self, queue, **kwargs): """ :arg allocator: a callable passed a byte count and returning - a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator + a :class:`pyopencl.Buffer`. A :mod:`pyopencl` allocator maybe. :arg wait_for: A list of :class:`pyopencl.Event` instances for which to wait. diff --git a/loopy/target/python.py b/loopy/target/python.py index 1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd..c27b4484d29b8dae7ddc83c4ae80221c9afb8e29 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -1,6 +1,5 @@ """Python host AST builder for integration with PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -24,7 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six import numpy as np from pymbolic.mapper import Mapper @@ -52,7 +50,7 @@ class ExpressionToPythonMapper(StringifyMapper): return Mapper.handle_unsupported_expression(self, victim, enclosing_prec) def rec(self, expr, prec, type_context=None, needed_dtype=None): - return super(ExpressionToPythonMapper, self).rec(expr, prec) + return super().rec(expr, prec) __call__ = rec @@ -67,19 +65,19 @@ class ExpressionToPythonMapper(StringifyMapper): enclosing_prec)) if expr.name in self.kernel.all_inames(): - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) var_descr = self.kernel.get_var_descriptor(expr.name) if isinstance(var_descr, ValueArg): - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) def map_subscript(self, expr, enclosing_prec): - return super(ExpressionToPythonMapper, self).map_subscript( + return super().map_subscript( expr, enclosing_prec) def map_call(self, expr, enclosing_prec): @@ -113,7 +111,8 @@ class ExpressionToPythonMapper(StringifyMapper): str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) + return "{}({})".format(in_knl_callable.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -144,8 +143,7 @@ class ExpressionToPythonMapper(StringifyMapper): class Collection(Suite): def generate(self): for item in self.contents: - for item_line in item.generate(): - yield item_line + yield from item.generate() # }}} @@ -183,13 +181,12 @@ class PythonASTBuilderBase(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, - self).function_id_in_knl_callable_mapper() + + super().function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): return ( - super(PythonASTBuilderBase, self).preamble_generators() + [ + super().preamble_generators() + [ _base_python_preamble_generator ]) @@ -219,7 +216,7 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import Assign for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): if tv.shape: result.append( diff --git a/loopy/tools.py b/loopy/tools.py index 524638e4a9fb2c624c49a3da53fd3cbdeca907c8..e8d529d2da3ad87c649371bcee7bdcc22ad407fb 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,13 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six - -try: - import collections.abc as abc -except ImportError: - # Python 2 - import collections as abc +import collections.abc as abc import numpy as np from pytools import memoize_method @@ -36,18 +28,11 @@ from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from loopy.symbolic import WalkMapper as LoopyWalkMapper from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) -import six # noqa -from six.moves import intern -import re -from mako.template import Template -import loopy as lp +from sys import intern + -if six.PY2: - def is_integer(obj): - return isinstance(obj, (int, long, np.integer)) # noqa pylint:disable=undefined-variable -else: - def is_integer(obj): - return isinstance(obj, (int, np.integer)) +def is_integer(obj): + return isinstance(obj, (int, np.integer)) def update_persistent_hash(obj, key_hash, key_builder): @@ -91,7 +76,7 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key), key=lambda obj: + for dict_key in sorted(key.keys(), key=lambda obj: type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) @@ -121,7 +106,7 @@ class LoopyKeyBuilder(KeyBuilderBase): % type(key)) def update_for_type_auto(self, key_hash, key): - key_hash.update("auto".encode("utf8")) + key_hash.update(b"auto") def update_for_pymbolic_expression(self, key_hash, key): if key is None: @@ -130,7 +115,7 @@ class LoopyKeyBuilder(KeyBuilderBase): PersistentHashWalkMapper(key_hash)(key) -class PymbolicExpressionHashWrapper(object): +class PymbolicExpressionHashWrapper: def __init__(self, expression): self.expression = expression @@ -149,7 +134,7 @@ class PymbolicExpressionHashWrapper(object): # {{{ eq key builder -class LoopyEqKeyBuilder(object): +class LoopyEqKeyBuilder: """Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in equality comparison, such that `key(a) == key(b)` if and only if `a == b`. The types of objects being compared should satisfy structural equality. @@ -229,11 +214,11 @@ def remove_common_indentation(code, require_leading_newline=True, test_line = None if ignore_lines_starting_with: - for l in lines: - strip_l = l.lstrip() + for line in lines: + strip_l = line.lstrip() if (strip_l and not strip_l.startswith(ignore_lines_starting_with)): - test_line = l + test_line = line break else: @@ -336,8 +321,8 @@ def cptr_from_numpy(obj): # https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 -def empty_aligned(shape, dtype, order='C', n=64): - '''empty_aligned(shape, dtype='float64', order='C', n=None) +def empty_aligned(shape, dtype, order="C", n=64): + """empty_aligned(shape, dtype='float64', order="C", n=None) Function that returns an empty numpy array that is n-byte aligned, where ``n`` is determined by inspecting the CPU if it is not provided. @@ -345,7 +330,7 @@ def empty_aligned(shape, dtype, order='C', n=64): ``n`` is not provided then this function will inspect the CPU to determine alignment. The rest of the arguments are as per :func:`numpy.empty`. 
- ''' + """ itemsize = np.dtype(dtype).itemsize # Apparently there is an issue with numpy.prod wrapping around on 32-bits @@ -374,68 +359,9 @@ def empty_aligned(shape, dtype, order='C', n=64): # }}} -# {{{ compute SCCs with Tarjan's algorithm - -def compute_sccs(graph): - to_search = set(graph.keys()) - visit_order = {} - scc_root = {} - sccs = [] - - while to_search: - top = next(iter(to_search)) - call_stack = [(top, iter(graph[top]), None)] - visit_stack = [] - visiting = set() - - scc = [] - - while call_stack: - top, children, last_popped_child = call_stack.pop() - - if top not in visiting: - # Unvisited: mark as visited, initialize SCC root. - count = len(visit_order) - visit_stack.append(top) - visit_order[top] = count - scc_root[top] = count - visiting.add(top) - to_search.discard(top) - - # Returned from a recursion, update SCC. - if last_popped_child is not None: - scc_root[top] = min( - scc_root[top], - scc_root[last_popped_child]) - - for child in children: - if child not in visit_order: - # Recurse. - call_stack.append((top, children, child)) - call_stack.append((child, iter(graph[child]), None)) - break - if child in visiting: - scc_root[top] = min( - scc_root[top], - visit_order[child]) - else: - if scc_root[top] == visit_order[top]: - scc = [] - while visit_stack[-1] != top: - scc.append(visit_stack.pop()) - scc.append(visit_stack.pop()) - for item in scc: - visiting.remove(item) - sccs.append(scc) - - return sccs - -# }}} - - # {{{ pickled container value -class _PickledObject(object): +class _PickledObject: """A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and :class:`LazilyUnpicklingList`). """ @@ -508,9 +434,9 @@ class LazilyUnpicklingDict(abc.MutableMapping): return iter(self._map) def __getstate__(self): - return {"_map": dict( - (key, _PickledObject(val)) - for key, val in six.iteritems(self._map))} + return {"_map": { + key: _PickledObject(val) + for key, val in self._map.items()}} # }}} @@ -610,11 +536,11 @@ class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList): # {{{ optional object -class _no_value(object): # noqa +class _no_value: # noqa pass -class Optional(object): +class Optional: """A wrapper for an optionally present object. .. attribute:: has_value @@ -681,7 +607,7 @@ class Optional(object): def unpickles_equally(obj): - from six.moves.cPickle import loads, dumps + from pickle import loads, dumps return loads(dumps(obj)) == obj @@ -692,123 +618,4 @@ def is_interned(s): def intern_frozenset_of_ids(fs): return frozenset(intern(s) for s in fs) - -def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - -def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - - -def dump_as_python(kernel, filename=None): - """ - Generates a python code for generating *kernel* for sharing kernels. - - :arg kernel: An instance of :class:`loopy.LoopKernel` - :arg filename: An instance of :class:`str`. If *None*, then prints the - python file to *stdout*. 
- """ - - options = [] - - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - for insn in printed_insn_order: - option = 'id=%s, ' % insn.id - if insn.depends_on: - option += ("dep="+":".join(insn.depends_on)+", ") - if insn.tags: - option += ("tags="+":".join(insn.tags)+", ") - if insn.within_inames: - option += ("inames="+":".join(insn.within_inames)+", ") - if isinstance(insn, lp.MultiAssignmentBase): - if insn.atomicity: - option += "atomic, " - elif isinstance(insn, lp.BarrierInstruction): - option += ("mem_kind=%s, " % insn.mem_kind) - options.append(option[:-2]) - - insn_x_options = zip(printed_insn_order, options) - - python_code = r'''<%! import loopy as lp %>import loopy as lp - import numpy as np - <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', - 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> - knl = lp.make_kernel( - [ - % for dom in domains: - "${str(dom)}", - % endfor - ], - """ - % for insn, opts in insn_x_opts: - % if isinstance(insn, lp.Assignment): - ${insn.assignee} = ${insn.expression} {${opts}} - % elif isinstance(insn, lp.BarrierInstruction): - ... ${insn.synchronization_kind[0]}barrier{${opts}} - % elif isinstance(insn, lp.NoOpInstruction): - ... nop {${opts}} - % else: - **Not implemented for ${type(insn)}** - % endif - %endfor - """, [ - % for arg in args: - % if isinstance(arg, lp.ValueArg): - lp.ValueArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), - % else: - lp.GlobalArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, - shape=${arg.shape}, for_atomic=${arg.for_atomic}), - % endif - % endfor - % for tv in temp_vars: - lp.TemporaryVariable( - name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, - shape=${tv.shape}, for_atomic=${tv.for_atomic}, - address_space=${tv_scope[tv.address_space]}, - read_only=${tv.read_only}, - % if tv.initializer is not None: - initializer=${"np."+str((tv.initializer).__repr__())}, - % endif - ), - % endfor - ], lang_version=${lp.VERSION})''' - - python_code = Template(python_code).render(insn_x_opts=insn_x_options, - domains=kernel.domains, args=kernel.args, - temp_vars=[k for k in kernel.temporary_variables.values()]) - - python_code = re.sub("\\n ", "\n", python_code) - if filename: - with open(filename, 'w') as f: - f.write(python_code) - else: - print(python_code) - - # vim: foldmethod=marker diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py index f42fd3c8d2943bb37b75e9ef0003b88985950926..625781167db6aa502153cdcebd225d79e95c46b6 100644 --- a/loopy/transform/__init__.py +++ b/loopy/transform/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index f4a184f632d251bed7ec7d6ace718b3851c5c0d8..1e03ade94710b25cd56eecc7079afdadf567a82c 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2017 Kaushik Kulkarni" __license__ = """ @@ -39,9 +37,8 @@ __doc__ = """ # {{{ add_barrier @iterate_over_kernels_if_given_program -def 
add_barrier(knl, insn_before="", insn_after="", - id_based_on=None, tags=None, synchronization_kind="global", - mem_kind=None): +def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, + tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -59,19 +56,19 @@ def add_barrier(knl, insn_before="", insn_after="", for "global" bariers. If not supplied, defaults to *synchronization_kind* """ - assert isinstance(knl, LoopKernel) + assert isinstance(kernel, LoopKernel) if mem_kind is None: mem_kind = synchronization_kind if id_based_on is None: - id = knl.make_unique_instruction_id( + id = kernel.make_unique_instruction_id( based_on=synchronization_kind[0]+"_barrier") else: - id = knl.make_unique_instruction_id(based_on=id_based_on) + id = kernel.make_unique_instruction_id(based_on=id_based_on) match = parse_match(insn_before) - insn_before_list = [insn.id for insn in knl.instructions if match(knl, + insn_before_list = [insn.id for insn in kernel.instructions if match(kernel, insn)] barrier_to_add = BarrierInstruction(depends_on=frozenset(insn_before_list), @@ -81,12 +78,12 @@ def add_barrier(knl, insn_before="", insn_after="", synchronization_kind=synchronization_kind, mem_kind=mem_kind) - new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(new_knl, + new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add]) + new_kernel = add_dependency(kernel=new_kernel, insn_match=insn_after, depends_on="id:"+id) - return new_knl + return new_kernel # }}} diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 3df86e7ae04073e654f91b30c584719c165269d0..8376688198c3cff232d9f9006883d1b236efe367 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six - from loopy.diagnostic import LoopyError from loopy.program import iterate_over_kernels_if_given_program @@ -42,10 +38,10 @@ def fold_constants(kernel): insn.with_transformed_expressions(cfm) for insn in kernel.instructions] - new_substs = dict( - (sub.name, - sub.copy(expression=cfm(sub.expression))) - for sub in six.itervalues(kernel.substitutions)) + new_substs = { + sub.name: + sub.copy(expression=cfm(sub.expression)) + for sub in kernel.substitutions.values()} return kernel.copy( instructions=new_insns, @@ -80,9 +76,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): from loopy.kernel.array import ArrayBase if isinstance(var_descr, ArrayBase): if var_descr.dim_names is not None: - name_to_index = dict( - (name, idx) - for idx, name in enumerate(var_descr.dim_names)) + name_to_index = { + name: idx + for idx, name in enumerate(var_descr.dim_names)} else: name_to_index = {} @@ -146,8 +142,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): def iterate_as(cls, expr): if isinstance(expr, cls): - for ch in expr.children: - yield ch + yield from expr.children else: yield expr @@ -222,9 +217,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): product_parts = set(iterate_as(Product, term)) - my_common_factors = set( + my_common_factors = { cf for cf in my_common_factors - if unif_subst_map(cf) in product_parts) + if unif_subst_map(cf) in product_parts} common_factors[cf_index] = (index_key, my_common_factors) @@ -269,9 +264,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): unif_subst_map = SubstitutionMapper( make_subst_func(unif_result.lmap)) - mapped_my_common_factors = set( + mapped_my_common_factors = { unif_subst_map(cf) - for cf in my_common_factors) + for cf in my_common_factors} new_sum_terms = [] diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index b62e13d6b268a9b84e209a5c8958dc949114eecf..4ef5fac77c3af646352e00d595e028223dd9a316 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range, zip import islpy as isl from islpy import dim_type @@ -195,7 +192,7 @@ def compute_bounds(kernel, domain, stor2sweep, # {{{ array-to-buffer map -class ArrayToBufferMap(object): +class ArrayToBufferMap: def __init__(self, kernel, domain, sweep_inames, access_descriptors, storage_axis_count): self.kernel = kernel @@ -218,8 +215,8 @@ class ArrayToBufferMap(object): self.primed_sweep_inames) self.prime_sweep_inames = SubstitutionMapper(make_subst_func( - dict((sin, var(psin)) - for sin, psin in zip(sweep_inames, self.primed_sweep_inames)))) + {sin: var(psin) + for sin, psin in zip(sweep_inames, self.primed_sweep_inames)})) # # }}} @@ -403,7 +400,7 @@ class ArrayToBufferMap(object): aligned_g_s2s_parm_dom) -class NoOpArrayToBufferMap(object): +class NoOpArrayToBufferMap: non1_storage_axis_names = () storage_base_indices = () non1_storage_shape = () diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index d5a97b773e7447833c96405920e1efad1b382baa..5da142e3d400edf151ee755990d1fa4845aa147e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,10 +21,7 @@ THE SOFTWARE. """ -import six - -from loopy.symbolic import (RuleAwareIdentityMapper, - SubstitutionRuleMappingContext, pw_aff_to_expr) +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl @@ -37,7 +32,6 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: to_batched -.. autofunction:: save_temporaries_in_loop """ @@ -59,15 +53,13 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr, sequential, batch_varying_temps=None, within=None): - super(_BatchVariableChanger, self).__init__(rule_mapping_context) + batch_iname_expr, sequential): + super().__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr self.sequential = sequential - self.batch_varying_temps = batch_varying_temps - self.within = within def needs_batch_subscript(self, name): tv = self.kernel.temporary_variables.get(name) @@ -77,20 +69,15 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.sequential: if tv is None: return False - if self.batch_varying_temps: - return tv.name in self.batch_varying_temps - else: - if not temp_needs_batching_if_not_sequential(tv, - self.batch_varying_args): - return False + if not temp_needs_batching_if_not_sequential(tv, + self.batch_varying_args): + return False return True def map_subscript(self, expr, expn_state): - if not self.needs_batch_subscript(expr.aggregate.name) or not ( - self.within(expn_state.kernel, expn_state.instruction, - expn_state.stack)): - return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) + if not self.needs_batch_subscript(expr.aggregate.name): + return super().map_subscript(expr, expn_state) idx = self.rec(expr.index, expn_state) if not isinstance(idx, tuple): @@ -99,10 +86,8 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) def map_variable(self, expr, expn_state): - if not self.needs_batch_subscript(expr.name) or not ( - self.within(expn_state.kernel, 
expn_state.instruction, - expn_state.stack)): - return super(_BatchVariableChanger, self).map_variable(expr, expn_state) + if not self.needs_batch_subscript(expr.name): + return super().map_variable(expr, expn_state) return expr[self.batch_iname_expr] @@ -117,8 +102,8 @@ def _add_unique_dim_name(name, dim_names): @iterate_over_kernels_if_given_program -def to_batched(knl, nbatches, batch_varying_args, - batch_iname_prefix="ibatch", sequential=False, within=None): +def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", + sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. @@ -138,29 +123,29 @@ def to_batched(knl, nbatches, batch_varying_args, from pymbolic import var - vng = knl.get_var_name_generator() + vng = kernel.get_var_name_generator() batch_iname = vng(batch_iname_prefix) batch_iname_expr = var(batch_iname) new_args = [] - batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % { - "iname": batch_iname, - "nbatches": nbatches, - } + batch_dom_str = "{{[{iname}]: 0 <= {iname} < {nbatches}}}".format( + iname=batch_iname, + nbatches=nbatches, + ) if not isinstance(nbatches, int): batch_dom_str = "[%s] -> " % nbatches + batch_dom_str - new_args.append(ValueArg(nbatches, dtype=knl.index_dtype)) + new_args.append(ValueArg(nbatches, dtype=kernel.index_dtype)) nbatches_expr = var(nbatches) else: nbatches_expr = nbatches batch_domain = isl.BasicSet(batch_dom_str) - new_domains = [batch_domain] + knl.domains + new_domains = [batch_domain] + kernel.domains - for arg in knl.args: + for arg in kernel.args: if arg.name in batch_varying_args: if isinstance(arg, ValueArg): arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,), @@ -173,14 +158,14 @@ def to_batched(knl, nbatches, batch_varying_args, new_args.append(arg) - knl = knl.copy( + kernel = kernel.copy( domains=new_domains, args=new_args) if not sequential: new_temps = {} - for temp in six.itervalues(knl.temporary_variables): + for temp in kernel.temporary_variables.values(): if temp_needs_batching_if_not_sequential(temp, batch_varying_args): new_temps[temp.name] = temp.copy( shape=(nbatches_expr,) + temp.shape, @@ -189,90 +174,28 @@ def to_batched(knl, nbatches, batch_varying_args, else: new_temps[temp.name] = temp - knl = knl.copy(temporary_variables=new_temps) + kernel = kernel.copy(temporary_variables=new_temps) else: import loopy as lp from loopy.kernel.data import ForceSequentialTag - knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) - - from loopy.match import parse_stack_match, parse_match + kernel = lp.tag_inames(kernel, [(batch_iname, ForceSequentialTag())]) rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, vng) + kernel.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, - knl, batch_varying_args, batch_iname_expr, - sequential=sequential, within=parse_stack_match(within)) + kernel, batch_varying_args, batch_iname_expr, + sequential=sequential) kernel = rule_mapping_context.finish_kernel( - bvc.map_kernel(knl)) + bvc.map_kernel(kernel)) batch_iname_set = frozenset([batch_iname]) - within = parse_match(within) kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) - if within(kernel, insn) else insn for insn in kernel.instructions]) + for insn in kernel.instructions]) return kernel # }}} - -@iterate_over_kernels_if_given_program -def save_temporaries_in_loop(knl, iname, temps_to_save, within=None): - """ - 
Returns a kernel with the temporary variables in *temps_to_save* batched - within the iname *iname*. - - :arg iname: An instance of :class:`str1 for the loop across which the - values of the temporaries are to be saved. - - :arg temps_to_save: An iterable containing the temporaries that are to be - saved for each loop iteration defined by *iname*. - - :arg within: If not None, limit the action of the transformation to - matching contexts. See :func:`loopy.match.parse_stack_match` - for syntax. - """ - from loopy.match import parse_match, parse_stack_match - from pymbolic import var - from loopy.isl_helpers import static_max_of_pw_aff - - batch_iname_expr = var(iname) - - bounds = knl.get_iname_bounds(iname, constants_only=False) - nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, - constants_only=False)) - - new_temps = {} - - for temp in six.itervalues(knl.temporary_variables): - if temp.name in temps_to_save: - new_temps[temp.name] = temp.copy( - shape=(nbatches_expr,) + temp.shape, - dim_tags=("c",) * (len(temp.shape) + 1), - dim_names=_add_unique_dim_name("itemp_save", temp.dim_names)) - else: - new_temps[temp.name] = temp - - knl = knl.copy(temporary_variables=new_temps) - - rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, knl.get_var_name_generator) - bvc = _BatchVariableChanger(rule_mapping_context, - knl, [], batch_iname_expr, - sequential=False, batch_varying_temps=temps_to_save, - within=parse_stack_match(within)) - kernel = rule_mapping_context.finish_kernel( - bvc.map_kernel(knl)) - - within = parse_match(within) - - batch_iname_set = frozenset([iname]) - kernel = kernel.copy( - instructions=[ - insn.copy(within_inames=insn.within_inames | batch_iname_set) - if within(kernel, insn) else insn for insn in kernel.instructions]) - - return kernel - # vim: foldmethod=marker diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index a1c90d791a9d4097398610badc421aa4600e2097..7f1ca059acf95f39dfb050c1889149f7a2ed03de 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -1,6 +1,3 @@ -from __future__ import division, absolute_import -from six.moves import range - __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __license__ = """ @@ -48,7 +45,7 @@ logger = logging.getLogger(__name__) class ArrayAccessReplacer(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, var_name, within, array_base_map, buf_var): - super(ArrayAccessReplacer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.within = within @@ -68,7 +65,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): result = self.map_array_access((), expn_state) if result is None: - return super(ArrayAccessReplacer, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) else: self.modified_insn_ids.add(expn_state.insn_id) return result @@ -82,7 +79,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): result = self.map_array_access(expr.index_tuple, expn_state) if result is None: - return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) else: self.modified_insn_ids.add(expn_state.insn_id) return result @@ -309,8 +306,8 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None: dim_name = var_descr.dim_names[i] - init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name)) - store_iname = var_name_gen("%s_store_%s" % 
(var_name, dim_name)) + init_iname = var_name_gen(f"{var_name}_init_{dim_name}") + store_iname = var_name_gen(f"{var_name}_store_{dim_name}") new_iname_to_tag[init_iname] = default_tag new_iname_to_tag[store_iname] = default_tag diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1bbdb12010818d92b989f898ab874b10c5c2a31c..461a4cb5fc4236db4b2dbeea2c8180ce77f308a3 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - import islpy as isl from pymbolic.primitives import CallWithKwargs @@ -63,10 +59,10 @@ def _resolve_callables_from_function_lookup(program, """ callables_table = program.callables_table - callable_knls = dict( - (func_id, in_knl_callable) for func_id, in_knl_callable in + callable_knls = { + func_id: in_knl_callable for func_id, in_knl_callable in callables_table.items() if isinstance(in_knl_callable, - CallableKernel)) + CallableKernel)} edited_callable_knls = {} for func_id, in_knl_callable in callable_knls.items(): @@ -143,7 +139,7 @@ class _RegisterCalleeKernel(ImmutableRecord): :func:`loopy.transform.register_callable_kernel` picklable. As python cannot pickle lexical closures. """ - fields = set(['callable_kernel']) + fields = {"callable_kernel"} def __init__(self, callable_kernel): self.callable_kernel = callable_kernel @@ -255,8 +251,7 @@ def register_callable_kernel(program, callee_kernel): # {{{ sanity checks assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel), ('{0} !=' - '{1}'.format(type(callee_kernel), LoopKernel)) + assert isinstance(callee_kernel, LoopKernel) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): @@ -328,7 +323,7 @@ class KernelInliner(SubstitutionMapper): """ def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) + super().__init__(subst_func) self.caller = caller self.arg_map = arg_map self.arg_dict = arg_dict @@ -352,7 +347,7 @@ class KernelInliner(SubstitutionMapper): from numbers import Integral if not all(isinstance(d, Integral) for d in callee_arg.shape): raise LoopyError( - "Argument: {0} in callee kernel does not have " + "Argument: {} in callee kernel does not have " "constant shape.".format(callee_arg)) flatten_index = 0 @@ -378,7 +373,7 @@ class KernelInliner(SubstitutionMapper): return aggregate.index(tuple(new_indices)) else: - return super(KernelInliner, self).map_subscript(expr) + return super().map_subscript(expr) # }}} @@ -427,7 +422,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): temp_map = {} new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): + for name, temp in callee_knl.temporary_variables.items(): new_name = vng(callee_label+name) temp_map[name] = new_name new_temps[new_name] = temp.copy(name=new_name) @@ -469,11 +464,11 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): import pymbolic.primitives as p from pymbolic.mapper.substitutor import make_subst_func - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - for k, v in six.iteritems(arg_map): + 
var_map = {p.Variable(k): p.Variable(v) + for k, v in iname_map.items()} + var_map.update({p.Variable(k): p.Variable(v) + for k, v in temp_map.items()}) + for k, v in arg_map.items(): if isinstance(v, SubArrayRef): var_map[p.Variable(k)] = v.subscript.aggregate else: @@ -490,10 +485,10 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): dep_map = callee_knl.recursive_insn_dep_map() # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + heads = {insn for insn, deps in dep_map.items() if not deps} # leaves have nothing that depends on them tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): + for insn, deps in dep_map.items(): tails = tails - deps # }}} @@ -523,7 +518,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( instruction.depends_on) if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) + depends_on = depends_on | {noop_start.id} new_atomicity = tuple( type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) @@ -663,7 +658,7 @@ class DimChanger(IdentityMapper): def map_subscript(self, expr): if expr.aggregate.name not in self.callee_arg_dict: - return super(DimChanger, self).map_subscript(expr) + return super().map_subscript(expr) callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) @@ -710,7 +705,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( get_kw_pos_association) _, pos_to_kw = get_kw_pos_association(callee_knl) arg_id_to_shape = {} - for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + for arg_id, arg in insn.arg_id_to_val().items(): arg_id = pos_to_kw[arg_id] arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 2c9499d9d92d73eeea1ce5344ca8475e60dedbd0..0ed1159446f8f4bd26b480d3e08bd5d7f1c008b7 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six # noqa - from loopy.diagnostic import LoopyError from islpy import dim_type @@ -290,15 +286,15 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, if temporary_name is None: temporary_name = var_name_gen("%s_fetch" % c_name) - arg = kernel.arg_dict[var_name] + var_descr = kernel.get_var_descriptor(var_name) # {{{ make parameter names and unification template parameters = [] - for i in range(arg.num_user_axes()): + for i in range(var_descr.num_user_axes()): based_on = "%s_dim_%d" % (c_name, i) - if arg.dim_names is not None: - based_on = "%s_dim_%s" % (c_name, arg.dim_names[i]) + if var_descr.dim_names is not None: + based_on = "{}_dim_{}".format(c_name, var_descr.dim_names[i]) if dim_arg_names is not None and i < len(dim_arg_names): based_on = dim_arg_names[i] @@ -327,7 +323,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, kernel, subst_use, sweep_inames, inames_to_be_removed = \ _process_footprint_subscripts( kernel, rule_name, sweep_inames, - footprint_subscripts, arg) + footprint_subscripts, var_descr) # Our _not_provided is actually a different object from the one in the # precompute module, but precompute acutally uses that to adjust its @@ -336,7 +332,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, from loopy.transform.precompute import precompute_for_single_kernel new_kernel = precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames, precompute_inames=dim_arg_names, - default_tag=default_tag, dtype=arg.dtype, + default_tag=default_tag, dtype=var_descr.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, temporary_address_space=temporary_address_space, @@ -398,9 +394,9 @@ def add_prefetch(program, *args, **kwargs): # {{{ change variable kinds -def change_arg_to_image(knl, name): +def change_arg_to_image(kernel, name): new_args = [] - for arg in knl.args: + for arg in kernel.args: if arg.name == name: assert arg.offset == 0 assert arg.shape is not None @@ -408,7 +404,7 @@ def change_arg_to_image(knl, name): else: new_args.append(arg) - return knl.copy(args=new_args) + return kernel.copy(args=new_args) # }}} @@ -416,11 +412,11 @@ def change_arg_to_image(knl, name): # {{{ tag array axes @iterate_over_kernels_if_given_program -def tag_array_axes(knl, ary_names, dim_tags): +def tag_array_axes(kernel, ary_names, dim_tags): """ :arg dim_tags: a tuple of :class:`loopy.kernel.array.ArrayDimImplementationTag` or a string that - parses to one. See :func:`loopy.kernel.array.parse_dim_tags` for a + parses to one. See :func:`loopy.kernel.array.parse_array_dim_tags` for a description of the allowed string format. For example, *dim_tags* could be ``"N2,N0,N1"`` to determine @@ -429,7 +425,7 @@ def tag_array_axes(knl, ary_names, dim_tags): .. versionchanged:: 2016.2 - This function was called :func:`tag_data_axes` before version 2016.2. + This function was called ``tag_data_axes`` before version 2016.2. 
""" from loopy.kernel.tools import ArrayChanger @@ -438,7 +434,7 @@ def tag_array_axes(knl, ary_names, dim_tags): ary_names = [ary_name.strip() for ary_name in ary_names.split(",")] for ary_name in ary_names: - achng = ArrayChanger(knl, ary_name) + achng = ArrayChanger(kernel, ary_name) ary = achng.get() from loopy.kernel.array import parse_array_dim_tags @@ -449,9 +445,9 @@ def tag_array_axes(knl, ary_names, dim_tags): ary = ary.copy(dim_tags=tuple(new_dim_tags)) - knl = achng.with_changed_array(ary) + kernel = achng.with_changed_array(ary) - return knl + return kernel tag_data_axes = ( @@ -467,7 +463,7 @@ def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 - This function was called :func:`set_array_dim_names` before version 2016.2. + This function was called ``set_array_dim_names`` before version 2016.2. """ from loopy.kernel.tools import ArrayChanger if isinstance(ary_names, str): @@ -496,14 +492,14 @@ set_array_dim_names = (MovedFunctionDeprecationWrapper( # {{{ remove_unused_arguments @iterate_over_kernels_if_given_program -def remove_unused_arguments(knl): +def remove_unused_arguments(kernel): new_args = [] import loopy as lp - exp_knl = lp.expand_subst(knl) + exp_kernel = lp.expand_subst(kernel) - refd_vars = set(knl.all_params()) - for insn in exp_knl.instructions: + refd_vars = set(kernel.all_params()) + for insn in exp_kernel.instructions: refd_vars.update(insn.dependency_names()) from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag @@ -515,7 +511,7 @@ def remove_unused_arguments(knl): return set() return get_dependencies(expr) - for ary in chain(knl.args, six.itervalues(knl.temporary_variables)): + for ary in chain(kernel.args, kernel.temporary_variables.values()): if isinstance(ary, ArrayBase): refd_vars.update( tolerant_get_deps(ary.shape) @@ -526,11 +522,11 @@ def remove_unused_arguments(knl): refd_vars.update( tolerant_get_deps(dim_tag.stride)) - for arg in knl.args: + for arg in kernel.args: if arg.name in refd_vars: new_args.append(arg) - return knl.copy(args=new_args) + return kernel.copy(args=new_args) # }}} @@ -538,7 +534,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries @iterate_over_kernels_if_given_program -def alias_temporaries(knl, names, base_name_prefix=None, +def alias_temporaries(kernel, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of storage. @@ -558,20 +554,20 @@ def alias_temporaries(knl, names, base_name_prefix=None, ``synchronize_for_exclusive_use=True`` was the previous default behavior. 
""" - gng = knl.get_group_name_generator() + gng = kernel.get_group_name_generator() group_names = [gng("tmpgrp_"+name) for name in names] if base_name_prefix is None: base_name_prefix = "temp_storage" - vng = knl.get_var_name_generator() + vng = kernel.get_var_name_generator() base_name = vng(base_name_prefix) names_set = set(names) if synchronize_for_exclusive_use: new_insns = [] - for insn in knl.instructions: + for insn in kernel.instructions: temp_deps = insn.dependency_names() & names_set if not temp_deps: @@ -598,10 +594,10 @@ def alias_temporaries(knl, names, base_name_prefix=None, conflicts_with_groups=( insn.conflicts_with_groups | other_group_names))) else: - new_insns = knl.instructions + new_insns = kernel.instructions new_temporary_variables = {} - for tv in six.itervalues(knl.temporary_variables): + for tv in kernel.temporary_variables.values(): if tv.name in names_set: if tv.base_storage is not None: raise LoopyError("temporary variable '{tv}' already has " @@ -613,7 +609,7 @@ def alias_temporaries(knl, names, base_name_prefix=None, else: new_temporary_variables[tv.name] = tv - return knl.copy( + return kernel.copy( instructions=new_insns, temporary_variables=new_temporary_variables) @@ -686,7 +682,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): kernel.substitutions, var_name_gen) smap = RuleAwareSubstitutionMapper(rule_mapping_context, make_subst_func(subst_dict), - within=lambda knl, insn, stack: True) + within=lambda kernel, insn, stack: True) kernel = smap.map_kernel(kernel) @@ -710,7 +706,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the scope is to be set. - :arg scope: One of the values from :class:`AddressSpace`, or one + :arg scope: One of the values from :class:`loopy.AddressSpace`, or one of the strings ``"private"``, ``"local"``, or ``"global"``. 
""" @@ -747,15 +743,16 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule @iterate_over_kernels_if_given_program -def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): +def reduction_arg_to_subst_rule( + kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] inames_set = frozenset(inames) - substs = knl.substitutions.copy() + substs = kernel.substitutions.copy() - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() def map_reduction(expr, rec, nresults=1): if frozenset(expr.inames) != inames_set: @@ -796,13 +793,13 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No from loopy.kernel.data import MultiAssignmentBase new_insns = [] - for insn in knl.instructions: + for insn in kernel.instructions: if not isinstance(insn, MultiAssignmentBase): new_insns.append(insn) else: new_insns.append(insn.copy(expression=cb_mapper(insn.expression))) - return knl.copy( + return kernel.copy( instructions=new_insns, substitutions=substs) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 33bd519b2d84bf9d64a65214897e9084375dd6a4..5a42973526a6c2c174bc67b76db1a46ebb7f181a 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -167,7 +165,7 @@ class LoopyDiffMapper(DifferentiationMapper, RuleAwareIdentityMapper): # {{{ diff context -class DifferentiationContext(object): +class DifferentiationContext: def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix, additional_shape): self.kernel = kernel @@ -369,7 +367,7 @@ class DifferentiationContext(object): # {{{ entrypoint -def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", +def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", batch_axes_in_by=frozenset(), copy_outputs=set()): """ @@ -380,25 +378,25 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ - assert isinstance(knl, LoopKernel) + assert isinstance(kernel, LoopKernel) from loopy.kernel.creation import apply_single_writer_depencency_heuristic - knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) + kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) if isinstance(diff_outputs, str): diff_outputs = [ dout.strip() for dout in diff_outputs.split(",") if dout.strip()] - by_arg = knl.arg_dict[by] + by_arg = kernel.arg_dict[by] additional_shape = by_arg.shape - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() # {{{ differentiate instructions diff_context = DifferentiationContext( - knl, var_name_gen, by, diff_iname_prefix=diff_iname_prefix, + kernel, var_name_gen, by, diff_iname_prefix=diff_iname_prefix, additional_shape=additional_shape) result = {} diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 921117f9ed6f5a0c4ca54d04e15e94f25237f3cb..9d4c083889d9ca425da1654edfaf9e848fa6210b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six - import islpy as isl from islpy import dim_type @@ -55,7 +51,7 @@ def _rename_temporaries(kernel, suffix, all_identifiers): vng = kernel.get_var_name_generator() new_temporaries = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): if tv.name in all_identifiers: new_tv_name = vng(tv.name+suffix) else: @@ -107,7 +103,7 @@ def _ordered_merge_lists(list_a, list_b): def _merge_dicts(item_name, dict_a, dict_b): result = dict_a.copy() - for k, v in six.iteritems(dict_b): + for k, v in dict_b.items(): if k in result: if v != result[k]: raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s" @@ -131,16 +127,16 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion -def _fuse_two_kernels(knla, knlb): +def _fuse_two_kernels(kernela, kernelb): from loopy.kernel import KernelState - if knla.state != KernelState.INITIAL or knlb.state != KernelState.INITIAL: + if kernela.state != KernelState.INITIAL or kernelb.state != KernelState.INITIAL: raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains - new_domains = knla.domains[:] + new_domains = kernela.domains[:] - for dom_b in knlb.domains: + for dom_b in kernelb.domains: i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains) if i_fuse is None: new_domains.append(dom_b) @@ -166,14 +162,14 @@ def _fuse_two_kernels(knla, knlb): # }}} - vng = knla.get_var_name_generator() + vng = kernela.get_var_name_generator() b_var_renames = {} # {{{ fuse args - new_args = knla.args[:] - for b_arg in knlb.args: - if b_arg.name not in knla.arg_dict: + new_args = kernela.args[:] + for b_arg in kernelb.args: + if b_arg.name not in kernela.arg_dict: new_arg_name = vng(b_arg.name) if new_arg_name != b_arg.name: @@ -181,21 +177,21 @@ def _fuse_two_kernels(knla, knlb): new_args.append(b_arg.copy(name=new_arg_name)) else: - if b_arg != knla.arg_dict[b_arg.name]: + if b_arg != kernela.arg_dict[b_arg.name]: raise LoopyError( "argument '{arg_name}' has inconsistent definition between " "the two kernels being merged ({arg_a} <-> {arg_b})" .format( arg_name=b_arg.name, - arg_a=str(knla.arg_dict[b_arg.name]), + arg_a=str(kernela.arg_dict[b_arg.name]), arg_b=str(b_arg))) # }}} # {{{ fuse temporaries - new_temporaries = knla.temporary_variables.copy() - for b_name, b_tv in six.iteritems(knlb.temporary_variables): + new_temporaries = kernela.temporary_variables.copy() + for b_name, b_tv in kernelb.temporary_variables.items(): assert b_name == b_tv.name new_tv_name = vng(b_name) @@ -208,18 +204,18 @@ def _fuse_two_kernels(knla, knlb): # }}} - knlb = _apply_renames_in_exprs(knlb, b_var_renames) + kernelb = _apply_renames_in_exprs(kernelb, b_var_renames) from pymbolic.imperative.transform import \ fuse_statement_streams_with_unique_ids new_instructions, old_b_id_to_new_b_id = \ fuse_statement_streams_with_unique_ids( - knla.instructions, knlb.instructions) + kernela.instructions, kernelb.instructions) # {{{ fuse assumptions - assump_a = knla.assumptions - assump_b = knlb.assumptions + assump_a = kernela.assumptions + assump_b = kernelb.assumptions assump_a, assump_b = isl.align_two(assump_a, assump_b) shared_param_names = list( @@ -242,49 +238,49 @@ def _fuse_two_kernels(knla, knlb): domains=new_domains, instructions=new_instructions, args=new_args, - name="%s_and_%s" % (knla.name, knlb.name), - preambles=_ordered_merge_lists(knla.preambles, knlb.preambles), + name=f"{kernela.name}_and_{kernelb.name}", + preambles=_ordered_merge_lists(kernela.preambles, 
kernelb.preambles), preamble_generators=_ordered_merge_lists( - knla.preamble_generators, knlb.preamble_generators), + kernela.preamble_generators, kernelb.preamble_generators), assumptions=new_assumptions, local_sizes=_merge_dicts( - "local size", knla.local_sizes, knlb.local_sizes), + "local size", kernela.local_sizes, kernelb.local_sizes), temporary_variables=new_temporaries, iname_to_tags=_merge_dicts( "iname-to-tag mapping", - knla.iname_to_tags, - knlb.iname_to_tags), + kernela.iname_to_tags, + kernelb.iname_to_tags), substitutions=_merge_dicts( "substitution", - knla.substitutions, - knlb.substitutions), + kernela.substitutions, + kernelb.substitutions), function_manglers=_ordered_merge_lists( - knla.function_manglers, - knlb.function_manglers), + kernela.function_manglers, + kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( - knla.symbol_manglers, - knlb.symbol_manglers), + kernela.symbol_manglers, + kernelb.symbol_manglers), iname_slab_increments=_merge_dicts( "iname slab increment", - knla.iname_slab_increments, - knlb.iname_slab_increments), - loop_priority=knla.loop_priority.union(knlb.loop_priority), + kernela.iname_slab_increments, + kernelb.iname_slab_increments), + loop_priority=kernela.loop_priority.union(kernelb.loop_priority), silenced_warnings=_ordered_merge_lists( - knla.silenced_warnings, - knlb.silenced_warnings), + kernela.silenced_warnings, + kernelb.silenced_warnings), applied_iname_rewrites=_ordered_merge_lists( - knla.applied_iname_rewrites, - knlb.applied_iname_rewrites), + kernela.applied_iname_rewrites, + kernelb.applied_iname_rewrites), index_dtype=_merge_values( "index dtype", - knla.index_dtype, - knlb.index_dtype), + kernela.index_dtype, + kernelb.index_dtype), target=_merge_values( "target", - knla.target, - knlb.target), - options=knla.options), old_b_id_to_new_b_id + kernela.target, + kernelb.target), + options=kernela.options), old_b_id_to_new_b_id # }}} @@ -375,19 +371,19 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): kernel_insn_ids = [] result = None - for knlb in kernels: + for kernelb in kernels: if result is None: - result = knlb + result = kernelb kernel_insn_ids.append([ - insn.id for insn in knlb.instructions]) + insn.id for insn in kernelb.instructions]) else: result, old_b_id_to_new_b_id = _fuse_two_kernels( - knla=result, - knlb=knlb) + kernela=result, + kernelb=kernelb) kernel_insn_ids.append([ old_b_id_to_new_b_id[insn.id] - for insn in knlb.instructions]) + for insn in kernelb.instructions]) # {{{ realize data_flow dependencies diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 584aca6a4f6914e61ff50b593c500e342bf495fd..473dbbca7a69816836b13d5496562656e1f03a72 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,9 +21,6 @@ THE SOFTWARE. """ -import six -from six.moves import zip - import islpy as isl from islpy import dim_type @@ -76,6 +71,8 @@ __doc__ = """ .. autofunction:: add_inames_to_insn +.. 
autofunction:: add_inames_for_unused_hw_axes + """ @@ -129,7 +126,7 @@ def prioritize_loops(kernel, loop_priority): class _InameSplitter(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, within, split_iname, outer_iname, inner_iname, replacement_index): - super(_InameSplitter, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.within = within @@ -154,7 +151,7 @@ class _InameSplitter(RuleAwareIdentityMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameSplitter, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): if (expr.name == self.split_iname @@ -164,7 +161,7 @@ class _InameSplitter(RuleAwareIdentityMapper): expn_state.instruction)): return self.replacement_index else: - return super(_InameSplitter, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) def _split_iname_backend(kernel, split_iname, @@ -474,7 +471,7 @@ def chunk_iname(kernel, split_iname, num_chunks, class _InameJoiner(RuleAwareSubstitutionMapper): def __init__(self, rule_mapping_context, within, subst_func, joined_inames, new_iname): - super(_InameJoiner, self).__init__(rule_mapping_context, + super().__init__(rule_mapping_context, subst_func, within) self.joined_inames = set(joined_inames) @@ -505,7 +502,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameJoiner, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) @iterate_over_kernels_if_given_program @@ -655,7 +652,7 @@ def untag_inames(kernel, iname_to_untag, tag_type): knl_iname_to_tags = kernel.iname_to_tags.copy() old_tags = knl_iname_to_tags.get(iname_to_untag, frozenset()) - old_tags = set(tag for tag in old_tags if not isinstance(tag, tag_type)) + old_tags = {tag for tag in old_tags if not isinstance(tag, tag_type)} if old_tags: knl_iname_to_tags[iname_to_untag] = old_tags @@ -703,7 +700,7 @@ def tag_inames(kernel, iname_to_tag, force=False, # convert dict to list of tuples if isinstance(iname_to_tag, dict): - iname_to_tag = list(six.iteritems(iname_to_tag)) + iname_to_tag = list(iname_to_tag.items()) # flatten iterables of tags for each iname @@ -752,7 +749,7 @@ def tag_inames(kernel, iname_to_tag, force=False, from loopy.match import re_from_glob new_iname_to_tag = {} for iname, new_tag in iname_to_tag: - if '*' in iname or '?' in iname: + if "*" in iname or "?" 
in iname: match_re = re_from_glob(iname) for sub_iname in all_inames: if match_re.match(sub_iname): @@ -773,7 +770,7 @@ def tag_inames(kernel, iname_to_tag, force=False, # }}} knl_iname_to_tags = kernel.iname_to_tags.copy() - for iname, new_tag in six.iteritems(iname_to_tag): + for iname, new_tag in iname_to_tag.items(): if not new_tag: continue @@ -806,10 +803,10 @@ def tag_inames(kernel, iname_to_tag, force=False, class _InameDuplicator(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, old_to_new, within): - super(_InameDuplicator, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.old_to_new = old_to_new - self.old_inames_set = set(six.iterkeys(old_to_new)) + self.old_inames_set = set(old_to_new.keys()) self.within = within def map_reduction(self, expr, expn_state): @@ -829,7 +826,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameDuplicator, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): new_name = self.old_to_new.get(expr.name) @@ -840,7 +837,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): expn_state.kernel, expn_state.instruction, expn_state.stack)): - return super(_InameDuplicator, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) else: from pymbolic import var return var(new_name) @@ -856,8 +853,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): @iterate_over_kernels_if_given_program -def duplicate_inames(knl, inames, within, new_inames=None, - suffix=None, +def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -881,7 +877,7 @@ def duplicate_inames(knl, inames, within, new_inames=None, if len(new_inames) != len(inames): raise ValueError("new_inames must have the same number of entries as inames") - name_gen = knl.get_var_name_generator() + name_gen = kernel.get_var_name_generator() for i, iname in enumerate(inames): new_iname = new_inames[i] @@ -909,10 +905,10 @@ def duplicate_inames(knl, inames, within, new_inames=None, for old_iname, new_iname in zip(inames, new_inames): from loopy.kernel.tools import DomainChanger - domch = DomainChanger(knl, frozenset([old_iname])) + domch = DomainChanger(kernel, frozenset([old_iname])) from loopy.isl_helpers import duplicate_axes - knl = knl.copy( + kernel = kernel.copy( domains=domch.get_domains_with( duplicate_axes(domch.domain, [old_iname], [new_iname]))) @@ -921,13 +917,13 @@ def duplicate_inames(knl, inames, within, new_inames=None, # {{{ change the inames in the code rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, name_gen) + kernel.substitutions, name_gen) indup = _InameDuplicator(rule_mapping_context, old_to_new=dict(list(zip(inames, new_inames))), within=within) - knl = rule_mapping_context.finish_kernel( - indup.map_kernel(knl)) + kernel = rule_mapping_context.finish_kernel( + indup.map_kernel(kernel)) # }}} @@ -936,11 +932,11 @@ def duplicate_inames(knl, inames, within, new_inames=None, for old_iname, new_iname in zip(inames, new_inames): new_tag = tags.get(old_iname) if new_tag is not None: - knl = tag_inames(knl, {new_iname: new_tag}) + kernel = tag_inames(kernel, {new_iname: new_tag}) # }}} - return knl + return kernel # }}} @@ -963,8 +959,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( common = common.union(old_common_inames) 
# Go into recursion - for option in _get_iname_duplication_options(insn_iname_sets, common): - yield option + yield from _get_iname_duplication_options(insn_iname_sets, common) # Do not yield anything beyond here! return @@ -991,9 +986,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( if len(partitioning) > 1: for part in partitioning: working_set = frozenset(s for s in insn_iname_sets if s <= part) - for option in _get_iname_duplication_options(working_set, - old_common_inames): - yield option + yield from _get_iname_duplication_options(working_set, + old_common_inames) # If exactly one set was found, an iname duplication is necessary elif len(partitioning) == 1: inames, = partitioning @@ -1010,8 +1004,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # is inspected. For each element of the power set without the # empty and the full set, one duplication option is generated. for insns_to_dup in it.chain.from_iterable( - it.combinations(iname_insns, l) - for l in range(1, len(iname_insns))): + it.combinations(iname_insns, i) + for i in range(1, len(iname_insns))): yield ( iname, tuple(insn | old_common_inames for insn in insns_to_dup)) @@ -1019,7 +1013,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(kernel, + use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1049,66 +1044,45 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if use_boostable_into: + raise LoopyError("'use_boostable_into=True' is no longer supported.") + + if use_boostable_into is False: + from warnings import warn + warn("passing 'use_boostable_into=False' to 'get_iname_duplication_options'" + " is deprecated. 
The argument will go away in 2021.", + DeprecationWarning, stacklevel=2) + from loopy.kernel.data import ConcurrentTag - concurrent_inames = set( + concurrent_inames = { iname - for iname in knl.all_inames() - if knl.iname_tags_of_type(iname, ConcurrentTag)) + for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} # First we extract the minimal necessary information from the kernel - if use_boostable_into: - insn_iname_sets = ( - frozenset( - (insn.within_inames - | insn.boostable_into if insn.boostable_into is not None - else frozenset([])) - - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) - else: - insn_iname_sets = ( - frozenset( - insn.within_inames - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) + insn_iname_sets = ( + frozenset( + insn.within_inames - concurrent_inames + for insn in kernel.instructions) + - + frozenset([frozenset([])])) # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_iname_sets): # Check whether this iname has a parallel tag and discard it if so - if (iname in knl.iname_to_tags - and knl.iname_tags_of_type(iname, ConcurrentTag)): + if (iname in kernel.iname_to_tags + and kernel.iname_tags_of_type(iname, ConcurrentTag)): continue - # If we find a duplication option and to not use boostable_into - # information, we restart this generator with use_boostable_into=True - if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options_for_single_kernel(knl, True): - yield option - - # Emit a warning that we needed boostable_into - from warnings import warn - from loopy.diagnostic import LoopyWarning - warn("Kernel '%s' required the deprecated 'boostable_into' " - "instruction attribute in order to be schedulable!" % knl.name, - LoopyWarning) - - # Return to avoid yielding the duplication - # options without boostable_into - return - # Reconstruct an object that may be passed to the within parameter of # loopy.duplicate_inames from loopy.match import Id, Or within = Or(tuple( - Id(insn.id) for insn in knl.instructions + Id(insn.id) for insn in kernel.instructions if insn.within_inames in insns)) - # Only yield the result if an instruction matched. With - # use_boostable_into=True this is not always true. - + # Only yield the result if an instruction matched. if within.children: yield iname, within @@ -1116,9 +1090,8 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): - for option in get_iname_duplication_options_for_single_kernel( - in_knl_callable.subkernel, use_boostable_into): - yield option + yield from get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into) elif isinstance(in_knl_callable, ScalarCallable): pass else: @@ -1128,12 +1101,12 @@ def get_iname_duplication_options(program, use_boostable_into=False): return -def has_schedulable_iname_nesting_for_single_kernel(knl): +def has_schedulable_iname_nesting_for_single_kernel(kernel): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
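
    A typical driver loop combining this check with the duplication
    machinery above, sketched under the assumption that a duplication
    option is always found::

        import loopy as lp

        while not lp.has_schedulable_iname_nesting(prog):
            iname, within = next(lp.get_iname_duplication_options(prog))
            prog = lp.duplicate_inames(prog, iname, within)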
""" - return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + return not bool(next(get_iname_duplication_options_for_single_kernel(kernel), False)) @@ -1149,19 +1122,19 @@ def has_schedulable_iname_nesting(program): # {{{ rename_inames @iterate_over_kernels_if_given_program -def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): +def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. :arg existing_ok: execute even if *new_iname* already exists """ - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() # FIXME: Distinguish existing iname vs. existing other variable does_exist = var_name_gen.is_name_conflicting(new_iname) - if old_iname not in knl.all_inames(): + if old_iname not in kernel.all_inames(): raise LoopyError("old iname '%s' does not exist" % old_iname) if does_exist and not existing_ok: @@ -1171,7 +1144,7 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): if does_exist: # {{{ check that the domains match up - dom = knl.get_inames_domain(frozenset((old_iname, new_iname))) + dom = kernel.get_inames_domain(frozenset((old_iname, new_iname))) var_dict = dom.get_var_dict() _, old_idx = var_dict[old_iname] @@ -1208,17 +1181,17 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): from pymbolic.mapper.substitutor import make_subst_func rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, var_name_gen) + kernel.substitutions, var_name_gen) smap = RuleAwareSubstitutionMapper(rule_mapping_context, make_subst_func(subst_dict), within) - knl = rule_mapping_context.finish_kernel( - smap.map_kernel(knl)) + kernel = rule_mapping_context.finish_kernel( + smap.map_kernel(kernel)) new_instructions = [] - for insn in knl.instructions: + for insn in kernel.instructions: if (old_iname in insn.within_inames - and within(knl, insn, ())): + and within(kernel, insn, ())): insn = insn.copy( within_inames=( (insn.within_inames - frozenset([old_iname])) @@ -1226,22 +1199,35 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): new_instructions.append(insn) - knl = knl.copy(instructions=new_instructions) + kernel = kernel.copy(instructions=new_instructions) else: - knl = duplicate_inames( - knl, [old_iname], within=within, new_inames=[new_iname]) + kernel = duplicate_inames( + kernel, [old_iname], within=within, new_inames=[new_iname]) - knl = remove_unused_inames(knl, [old_iname]) + kernel = remove_unused_inames(kernel, [old_iname]) - return knl + return kernel # }}} # {{{ remove unused inames -def remove_unused_inames(knl, inames=None): +def get_used_inames(kernel): + import loopy as lp + exp_kernel = lp.expand_subst(kernel) + + used_inames = set() + for insn in exp_kernel.instructions: + used_inames.update( + exp_kernel.insn_inames(insn.id) + | insn.reduction_inames()) + + return used_inames + + +def remove_unused_inames(kernel, inames=None): """Delete those among *inames* that are unused, i.e. project them out of the domain. 
If these inames pose implicit restrictions on other inames, these restrictions will persist as existentially @@ -1253,7 +1239,7 @@ def remove_unused_inames(knl, inames=None): # {{{ normalize arguments if inames is None: - inames = knl.all_inames() + inames = kernel.all_inames() elif isinstance(inames, str): inames = inames.split(",") @@ -1261,17 +1247,7 @@ def remove_unused_inames(knl, inames=None): # {{{ check which inames are unused - import loopy as lp - exp_knl = lp.expand_subst(knl) - - inames = set(inames) - used_inames = set() - for insn in exp_knl.instructions: - used_inames.update( - exp_knl.insn_inames(insn.id) - | insn.reduction_inames()) - - unused_inames = inames - used_inames + unused_inames = set(inames) - get_used_inames(kernel) # }}} @@ -1280,17 +1256,44 @@ def remove_unused_inames(knl, inames=None): from loopy.kernel.tools import DomainChanger for iname in unused_inames: - domch = DomainChanger(knl, (iname,)) + domch = DomainChanger(kernel, (iname,)) dom = domch.domain dt, idx = dom.get_var_dict()[iname] dom = dom.project_out(dt, idx, 1) - knl = knl.copy(domains=domch.get_domains_with(dom)) + kernel = kernel.copy(domains=domch.get_domains_with(dom)) # }}} - return knl + return kernel + + +def remove_any_newly_unused_inames(transformation_func): + from functools import wraps + + @wraps(transformation_func) + def wrapper(kernel, *args, **kwargs): + + # check for remove_unused_inames argument, default: True + remove_newly_unused_inames = kwargs.pop("remove_newly_unused_inames", True) + + if remove_newly_unused_inames: + # determine which inames were already unused + inames_already_unused = kernel.all_inames() - get_used_inames(kernel) + + # call transform + transformed_kernel = transformation_func(kernel, *args, **kwargs) + + # Remove inames that are unused due to transform + return remove_unused_inames( + transformed_kernel, + transformed_kernel.all_inames()-inames_already_unused) + else: + # call transform + return transformation_func(kernel, *args, **kwargs) + + return wrapper # }}} @@ -1299,7 +1302,7 @@ def remove_unused_inames(knl, inames=None): class _ReductionSplitter(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, within, inames, direction): - super(_ReductionSplitter, self).__init__( + super().__init__( rule_mapping_context) self.within = within @@ -1334,7 +1337,7 @@ class _ReductionSplitter(RuleAwareIdentityMapper): else: assert False else: - return super(_ReductionSplitter, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def _split_reduction(kernel, inames, direction, within=None): @@ -1475,9 +1478,9 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): from pymbolic.algorithm import solve_affine_equations_for old_inames_to_expr = solve_affine_equations_for(old_inames, equations) - subst_dict = dict( - (v.name, expr) - for v, expr in old_inames_to_expr.items()) + subst_dict = { + v.name: expr + for v, expr in old_inames_to_expr.items()} var_name_gen = kernel.get_var_name_generator() @@ -1533,9 +1536,9 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): if dom_old_inames: dom_equations.append((lhs, rhs)) - this_eqn_old_iname_dim_types = set( + this_eqn_old_iname_dim_types = { dom_var_dict[old_iname][0] - for old_iname in eqn_deps & old_inames_set) + for old_iname in eqn_deps & old_inames_set} if this_eqn_old_iname_dim_types: if len(this_eqn_old_iname_dim_types) > 1: @@ -1621,9 +1624,9 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): :func:`loopy.match.parse_match`. 
:arg kind: may be "l" or "g", or the corresponding tag class name - :returns: an :class:`GroupIndexTag` or :class:`LocalIndexTag` - that is not being used within the instructions matched by - *insn_match*. + :returns: an :class:`loopy.kernel.data.GroupIndexTag` or + :class:`loopy.kernel.data.LocalIndexTag` that is not being used within + the instructions matched by *insn_match*. """ used_axes = set() @@ -1679,7 +1682,7 @@ def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count): class _ReductionInameUniquifier(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, inames, within): - super(_ReductionInameUniquifier, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.inames = inames self.old_to_new = [] @@ -1731,7 +1734,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expn_state), expr.allow_simultaneous) else: - return super(_ReductionInameUniquifier, self).map_reduction( + return super().map_reduction( expr, expn_state) @@ -1783,7 +1786,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn @iterate_over_kernels_if_given_program -def add_inames_to_insn(knl, inames, insn_match): +def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the instructions matched by *insn_match*, or a comma-separated @@ -1791,9 +1794,9 @@ def add_inames_to_insn(knl, inames, insn_match): :arg insn_match: An instruction match as understood by :func:`loopy.match.parse_match`. - :returns: an :class:`GroupIndexTag` or :class:`LocalIndexTag` - that is not being used within the instructions matched by - *insn_match*. + :returns: an :class:`loopy.kernel.data.GroupIndexTag` or + :class:`loopy.kernel.data.LocalIndexTag` that is not being used within + the instructions matched by *insn_match*. .. versionadded:: 2016.3 """ @@ -1809,16 +1812,125 @@ def add_inames_to_insn(knl, inames, insn_match): new_instructions = [] - for insn in knl.instructions: - if match(knl, insn): + for insn in kernel.instructions: + if match(kernel, insn): new_instructions.append( insn.copy(within_inames=insn.within_inames | inames)) else: new_instructions.append(insn) - return knl.copy(instructions=new_instructions) + return kernel.copy(instructions=new_instructions) # }}} +def add_inames_for_unused_hw_axes(kernel, within=None): + """ + Returns a kernel with inames added to each instruction + corresponding to any hardware-parallel iname tags + (:class:`loopy.kernel.data.GroupIndexTag`, + :class:`loopy.kernel.data.LocalIndexTag`) unused + in the instruction but used elsewhere in the kernel. + + Current limitations: + + * Only one iname in the kernel may be tagged with each of the unused hw axes. + * Occurence of an ``l.auto`` tag when an instruction is missing one of the + local hw axes. + + :arg within: An instruction match as understood by + :func:`loopy.match.parse_match`. 
+ """ + from loopy.kernel.data import (LocalIndexTag, GroupIndexTag, + AutoFitLocalIndexTag) + + n_local_axes = max([tag.axis + for tags in kernel.iname_to_tags.values() + for tag in tags + if isinstance(tag, LocalIndexTag)], + default=-1) + 1 + + n_group_axes = max([tag.axis + for tags in kernel.iname_to_tags.values() + for tag in tags + if isinstance(tag, GroupIndexTag)], + default=-1) + 1 + + contains_auto_local_tag = any([isinstance(tag, AutoFitLocalIndexTag) + for tags in kernel.iname_to_tags + for tag in tags]) + + if contains_auto_local_tag: + raise LoopyError("Kernels containing l.auto tags are invalid" + " arguments.") + + # {{{ fill axes_to_inames + + # local_axes_to_inames: ith entry contains the iname tagged with l.i or None + # if multiple inames are tagged with l.i + local_axes_to_inames = [] + # group_axes_to_inames: ith entry contains the iname tagged with g.i or None + # if multiple inames are tagged with g.i + group_axes_to_inames = [] + + for i in range(n_local_axes): + ith_local_axes_tag = LocalIndexTag(i) + inames = [iname + for iname, tags in kernel.iname_to_tags.items() + if ith_local_axes_tag in tags] + if not inames: + raise LoopyError(f"Unused local hw axes {i}.") + + local_axes_to_inames.append(inames[0] if len(inames) == 1 else None) + + for i in range(n_group_axes): + ith_group_axes_tag = GroupIndexTag(i) + inames = [iname + for iname, tags in kernel.iname_to_tags.items() + if ith_group_axes_tag in tags] + if not inames: + raise LoopyError(f"Unused group hw axes {i}.") + + group_axes_to_inames.append(inames[0] if len(inames) == 1 else None) + + # }}} + + from loopy.match import parse_match + within = parse_match(within) + + new_insns = [] + + for insn in kernel.instructions: + if within(kernel, insn): + within_tags = frozenset().union(*(kernel.iname_to_tags.get(iname, + frozenset()) for iname in insn.within_inames)) + missing_local_axes = [i for i in range(n_local_axes) + if LocalIndexTag(i) not in within_tags] + missing_group_axes = [i for i in range(n_group_axes) + if GroupIndexTag(i) not in within_tags] + + for axis in missing_local_axes: + iname = local_axes_to_inames[axis] + if iname: + insn = insn.copy(within_inames=insn.within_inames | + frozenset([iname])) + else: + raise LoopyError("Multiple inames tagged with l.%d while" + " adding unused local hw axes to instruction '%s'." + % (axis, insn.id)) + + for axis in missing_group_axes: + iname = group_axes_to_inames[axis] + if iname is not None: + insn = insn.copy(within_inames=insn.within_inames | + frozenset([iname])) + else: + raise LoopyError("Multiple inames tagged with g.%d while" + " adding unused group hw axes to instruction '%s'." + % (axis, insn.id)) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + # vim: foldmethod=marker diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index f73110ecdff79d7c029c0dd0d895ef71ea68326b..c84c1b9c69fc833877d42daf4c83b7dce5af3d4e 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six # noqa - from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) @@ -362,9 +358,9 @@ def uniquify_instruction_ids(kernel): from loopy.kernel.creation import UniqueName - insn_ids = set( + insn_ids = { insn.id for insn in kernel.instructions - if insn.id is not None and not isinstance(insn.id, UniqueName)) + if insn.id is not None and not isinstance(insn.id, UniqueName)} from pytools import UniqueNameGenerator insn_id_gen = UniqueNameGenerator(insn_ids) diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py index d0e7d1bc2ec5d1b5815ec8c8c30fecc198014c86..b8db7f43f90a5a1203dea470c9a0ba6f8fa21cae 100644 --- a/loopy/transform/make_scalar.py +++ b/loopy/transform/make_scalar.py @@ -7,13 +7,13 @@ from loopy.transform.iname import remove_unused_inames class ScalarChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, var_name): self.var_name = var_name - super(ScalarChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) def map_subscript(self, expr, expn_state): if expr.aggregate.name == self.var_name: return Variable(self.var_name) - return super(ScalarChanger, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) def make_scalar(kernel, var_name): diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index a18326187379cac0b4be46bbfe244bcc2d9e7684..6fb4988f0f8d82b5a1169f92a52c7eb649a861e1 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" __license__ = """ @@ -121,9 +119,9 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, from pymbolic import var dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in insn.within_inames + ilp_inames = {iname for iname in insn.within_inames if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) - for tag in kernel.iname_to_tags.get(iname, []))) + for tag in kernel.iname_to_tags.get(iname, []))} new_ilp_inames = set() ilp_inames_map = {} for iname in ilp_inames: @@ -156,10 +154,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname - new_pack_inames = dict((iname, var(vng(iname.name + - "_pack"))) for iname in p.swept_inames) - new_unpack_inames = dict((iname, var(vng(iname.name + - "_unpack"))) for iname in p.swept_inames) + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} # Updating the domains corresponding to the new inames. 
for iname in p.swept_inames: @@ -228,8 +226,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, packing_insns.append(Assignment( assignee=pack_lhs_assignee, expression=pack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_pack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), depends_on=insn.depends_on, id=ing(insn.id+"_pack"), @@ -240,8 +238,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, unpacking_insns.append(Assignment( expression=unpack_rhs, assignee=unpack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_unpack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), id=ing(insn.id+"_unpack"), depends_on=frozenset([insn.id]), @@ -282,8 +280,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) for i, _ in enumerate(insn.assignees)) new_call_insn = new_call_insn.copy( - depends_on=new_call_insn.depends_on | set( - pack.id for pack in packing_insns), + depends_on=new_call_insn.depends_on | { + pack.id for pack in packing_insns}, within_inames=new_call_insn.within_inames - ilp_inames | ( new_ilp_inames), expression=new_call_insn.expression.function(*new_params), diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 2ee3bd9b153907b564f4ca25c4c3720a6910d509..1e267321596d7e551645200e117055378c7c5c1e 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -1,7 +1,3 @@ -from __future__ import division -from __future__ import absolute_import -import six - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -34,7 +30,7 @@ from loopy.kernel import LoopKernel class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): - super(ArrayAxisSplitHelper, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.arg_names = arg_names self.handler = handler @@ -42,7 +38,7 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): if expr.aggregate.name in self.arg_names: return self.handler(expr) else: - return super(ArrayAxisSplitHelper, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) # {{{ split_array_dim (deprecated since June 2016) @@ -93,8 +89,8 @@ def split_array_dim(kernel, arrays_and_axes, count, if isinstance(arrays_and_axes, tuple): arrays_and_axes = [arrays_and_axes] - array_to_rest = dict( - (tup[0], normalize_rest(tup[1:])) for tup in arrays_and_axes) + array_to_rest = { + tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes} if len(arrays_and_axes) != len(array_to_rest): raise RuntimeError("cannot split multiple axes of the same variable") @@ -107,7 +103,7 @@ def split_array_dim(kernel, arrays_and_axes, count, from loopy.kernel.tools import ArrayChanger - for array_name, (axis, order) in six.iteritems(array_to_rest): + for array_name, (axis, order) in array_to_rest.items(): achng = ArrayChanger(kernel, array_name) ary = achng.get() @@ -238,12 +234,12 @@ def split_array_dim(kernel, arrays_and_axes, count, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, - 
set(six.iterkeys(array_to_rest)), split_access_axis) + set(array_to_rest.keys()), split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy.transform.iname import split_iname - for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): + from loopy import split_iname + for iname, (outer_iname, inner_iname) in split_vars.items(): kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, **split_kwargs) @@ -369,7 +365,7 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, - set([array_name]), split_access_axis) + {array_name}, split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) return kernel @@ -391,8 +387,9 @@ def split_array_axis(kernel, array_names, axis_nr, count, .. versionchanged:: 2016.2 - There was a more complicated, dumber function called :func:`split_array_dim` - that had the role of this function in versions prior to 2016.2. + There was a more complicated, dumber function called + ``loopy.split_array_dim`` that had the role of this function in + versions prior to 2016.2. """ assert isinstance(kernel, LoopKernel) @@ -449,7 +446,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 @iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): - arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) + arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] new_args = kernel.args[:] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 5c5e94028e5dfaa87f802f88bae715cfe733d6af..d93513f9833861fa3511280f36d3473f7d00cd3a 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl @@ -117,7 +114,7 @@ def _fix_parameter(kernel, name, value, remove_argument): new_args.append(arg.map_exprs(map_expr)) new_temp_vars = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): new_temp_vars[tv.name] = tv.map_exprs(map_expr) from loopy.match import parse_stack_match @@ -155,7 +152,7 @@ def fix_parameters(kernel, **value_dict): remove_arg = value_dict.pop("_remove", True) - for name, value in six.iteritems(value_dict): + for name, value in value_dict.items(): kernel = _fix_parameter(kernel, name, value, remove_arg) return kernel diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 8837cc3d574d751ac9bf5af4dc04250e6ef87d33..7d052730fc6ca808080f4f5a343d41266ae02dba 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six -from six.moves import range, zip import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, @@ -66,7 +62,7 @@ def storage_axis_exprs(storage_axis_sources, args): class RuleInvocationGatherer(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within): - super(RuleInvocationGatherer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) from loopy.symbolic import SubstitutionRuleExpander self.subst_expander = SubstitutionRuleExpander( @@ -91,7 +87,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): expn_state.stack) if not process_me: - return super(RuleInvocationGatherer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) rule = self.rule_mapping_context.old_subst_rules[name] @@ -99,7 +95,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): name, rule.arguments, arguments, expn_state.arg_context) arg_deps = set() - for arg_val in six.itervalues(arg_context): + for arg_val in arg_context.values(): arg_deps = (arg_deps | get_dependencies(self.subst_expander(arg_val))) @@ -116,7 +112,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): ", ".join(arg_deps - self.kernel.all_inames()), )) - return super(RuleInvocationGatherer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) args = [arg_context[arg_name] for arg_name in rule.arguments] @@ -141,7 +137,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): non1_storage_axis_names, temporary_name, compute_insn_id, compute_dep_id, compute_read_variables): - super(RuleInvocationReplacer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.subst_name = subst_name self.subst_tag = subst_tag @@ -169,7 +165,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): expn_state.instruction, expn_state.stack) and (self.subst_tag is None or self.subst_tag == tag)): - return super(RuleInvocationReplacer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # {{{ check if in footprint @@ -184,7 +180,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.storage_axis_sources, args)) if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): - return super(RuleInvocationReplacer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # }}} @@ -227,12 +223,13 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): def map_kernel(self, kernel): new_insns = [] - excluded_insn_ids = set([self.compute_insn_id, self.compute_dep_id]) + excluded_insn_ids = {self.compute_insn_id, self.compute_dep_id} for insn in kernel.instructions: self.replaced_something = False - insn = insn.with_transformed_expressions(self, kernel, insn) + insn = insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn)) if self.replaced_something: insn = insn.copy( @@ -257,7 +254,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # }}} -class _not_provided(object): # noqa: N801 +class _not_provided: # noqa: N801 pass @@ -618,7 +615,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, name = old_name = subst.arguments[saxis] else: old_name = saxis - name = "%s_%s" % (c_subst_name, old_name) + name = f"{c_subst_name}_{old_name}" if (precompute_inames is not None and i < len(precompute_inames) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py 
index d4128bd115666cf66c6f06a40823ed9d5929faab..8527023bc789c9b3c9e18fe7ad6827c82a6e7a55 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from loopy.diagnostic import LoopyError import logging @@ -120,7 +117,7 @@ def privatize_temporaries_with_inames( # {{{ find variables that need extra indices - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): if only_var_names is not None and tv.name not in only_var_names: continue @@ -158,7 +155,7 @@ def privatize_temporaries_with_inames( from loopy.symbolic import pw_aff_to_expr priv_axis_iname_to_length = {} - for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname): + for priv_axis_inames in var_to_new_priv_axis_iname.values(): for iname in priv_axis_inames: if iname in priv_axis_iname_to_length: continue @@ -177,7 +174,7 @@ def privatize_temporaries_with_inames( from loopy.kernel.data import VectorizeTag new_temp_vars = kernel.temporary_variables.copy() - for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname): + for tv_name, inames in var_to_new_priv_axis_iname.items(): tv = new_temp_vars[tv_name] extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames) @@ -199,9 +196,9 @@ def privatize_temporaries_with_inames( # }}} from pymbolic import var - var_to_extra_iname = dict( - (var_name, tuple(var(iname) for iname in inames)) - for var_name, inames in six.iteritems(var_to_new_priv_axis_iname)) + var_to_extra_iname = { + var_name: tuple(var(iname) for iname in inames) + for var_name, inames in var_to_new_priv_axis_iname.items()} new_insns = [] diff --git a/loopy/transform/save.py b/loopy/transform/save.py index c8e9a11a052817456b77af8f0722e802b2d180fd..35a175b68fa4f81f8d14cc688856265738147716 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. 
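# Sketch of privatize_temporaries_with_inames (modernized above): give a
# temporary one copy per value of an ILP iname so the unrolled instances do
# not clobber each other (hypothetical kernel, for illustration only):
import numpy as np
import loopy as lp

knl = lp.make_kernel(
        "{[i,j]: 0<=i<16 and 0<=j<4}",
        """
        <> acc = 0 {id=init}
        acc = acc + a[i,j] {id=update, dep=init}
        out[i,j] = acc {dep=update}
        """,
        [lp.GlobalArg("a", np.float64, shape=(16, 4)),
         lp.GlobalArg("out", np.float64, shape=(16, 4))],
        lang_version=(2018, 2))
knl = lp.tag_inames(knl, {"j": "ilp"})
knl = lp.privatize_temporaries_with_inames(
        knl, frozenset(["j"]), only_var_names=frozenset(["acc"]))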
from loopy.diagnostic import LoopyError import loopy as lp -import six from loopy.kernel.data import auto, AddressSpace from pytools import memoize_method, Record @@ -60,7 +57,7 @@ class LivenessResult(dict): for idx in range(nscheditems)) -class LivenessAnalysis(object): +class LivenessAnalysis: def __init__(self, kernel): self.kernel = kernel @@ -82,10 +79,10 @@ class LivenessAnalysis(object): elif isinstance(next_item, EnterLoop): # Account for empty loop loop_end = block_bounds[sched_idx + 1] - after = successors[loop_end] | set([sched_idx + 1]) + after = successors[loop_end] | {sched_idx + 1} elif isinstance(next_item, (LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier)): - after = set([sched_idx + 1]) + after = {sched_idx + 1} else: raise LoopyError("unexpected type of schedule item: {ty}" .format(ty=type(next_item).__name__)) @@ -94,7 +91,7 @@ class LivenessAnalysis(object): if isinstance(item, LeaveLoop): # Account for loop loop_begin = block_bounds[sched_idx] - after |= set([loop_begin]) + after |= {loop_begin} elif not isinstance(item, (EnterLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier)): raise LoopyError("unexpected type of schedule item: {ty}" @@ -105,8 +102,8 @@ class LivenessAnalysis(object): return successors def get_gen_and_kill_sets(self): - gen = dict((idx, set()) for idx in range(len(self.schedule))) - kill = dict((idx, set()) for idx in range(len(self.schedule))) + gen = {idx: set() for idx in range(len(self.schedule))} + kill = {idx: set() for idx in range(len(self.schedule))} for sched_idx, sched_item in enumerate(self.schedule): if not isinstance(sched_item, RunInstruction): @@ -186,7 +183,7 @@ class LivenessAnalysis(object): # {{{ save and reload implementation -class TemporarySaver(object): +class TemporarySaver: class PromotedTemporary(Record): """ @@ -265,15 +262,15 @@ class TemporarySaver(object): isl.Space.create_from_names( isl.DEFAULT_CONTEXT, set=[], - params=set( + params={ arg.name for arg in kernel.args - if isinstance(arg, ValueArg))))) + if isinstance(arg, ValueArg)}))) def find_accessing_instructions_in_subkernel(self, temporary, subkernel): # Find all accessing instructions in the subkernel. If base_storage is # present, this includes instructions that access aliasing memory. - aliasing_names = set([temporary]) + aliasing_names = {temporary} base_storage = self.kernel.temporary_variables[temporary].base_storage if base_storage is not None: @@ -305,7 +302,7 @@ class TemporarySaver(object): result = defaultdict(set) - for temporary in six.itervalues(self.kernel.temporary_variables): + for temporary in self.kernel.temporary_variables.values(): if temporary.base_storage is None: continue result[temporary.base_storage].add(temporary.name) @@ -512,7 +509,7 @@ class TemporarySaver(object): self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( - "{name}.{mode}".format(name=temporary, mode=mode)) + f"{temporary}.{mode}") def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable @@ -550,10 +547,10 @@ class TemporarySaver(object): pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: - depends_on |= set([pre_barrier]) + depends_on |= {pre_barrier} if post_barrier is not None: - update_deps |= set([post_barrier]) + update_deps |= {post_barrier} # Create the load / store instruction. 
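# The successor/gen/kill machinery above feeds a standard backward data-flow
# fixed point. Schematic version (a generic sketch, not loopy's exact
# implementation; integer indices stand for schedule items):

def solve_liveness(successors, gen, kill, nitems):
    live_in = {i: set() for i in range(nitems)}
    live_out = {i: set() for i in range(nitems)}
    changed = True
    while changed:
        changed = False
        # Iterate in reverse schedule order: liveness flows backwards.
        for i in reversed(range(nitems)):
            new_out = set()
            for succ in successors.get(i, ()):
                new_out |= live_in[succ]
            new_in = gen[i] | (new_out - kill[i])
            if new_in != live_in[i] or new_out != live_out[i]:
                live_in[i], live_out[i] = new_in, new_out
                changed = True
    return live_in, live_out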
from loopy.kernel.data import Assignment @@ -564,9 +561,7 @@ class TemporarySaver(object): self.subkernel_to_surrounding_inames[subkernel] | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, - depends_on=depends_on, - boostable=False, - boostable_into=frozenset()) + depends_on=depends_on) if mode == "save": self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) @@ -591,7 +586,7 @@ class TemporarySaver(object): def finish(self): new_instructions = [] - insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) + insns_to_insert = {insn.id: insn for insn in self.insns_to_insert} for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: @@ -764,7 +759,7 @@ def save_and_reload_temporaries(program): from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(knl.schedule): + for sched_idx, sched_item in enumerate(program.root_kernel.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -775,25 +770,26 @@ def save_and_reload_temporaries(program): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(knl, subkernel) - | temporaries_written_in_subkernel(knl, subkernel)) + temporaries_read_in_subkernel(program.root_kernel, subkernel) + | temporaries_written_in_subkernel(program.root_kernel, + subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: - logger.info("reloading {0} at entry of {1}" + logger.info("reloading {} at entry of {}" .format(temporary, sched_item.kernel_name)) saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(knl.schedule) - 1: + if sched_idx == len(program.root_kernel.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(knl, subkernel)) + temporaries_written_in_subkernel(program.root_kernel, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: - logger.info("saving {0} before return of {1}" + logger.info("saving {} before return of {}" .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 725e6792055b6ea7dc0b8663204a264540f79fd3..d7aaf6093fbcd8cd84667e55f44e13c129d3bef0 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,11 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six - from loopy.symbolic import ( RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var @@ -103,8 +100,8 @@ def extract_subst(kernel, subst_name, template, parameters=()): ExprDescriptor( insn=insn, expr=expr, - unif_var_dict=dict((lhs.name, rhs) - for lhs, rhs in urec.equations))) + unif_var_dict={lhs.name: rhs + for lhs, rhs in urec.equations})) else: mapper.fallback_mapper(expr) # can't nest, don't recurse @@ -117,7 +114,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper(insn.assignees) dfmapper(insn.expression) - for sr in six.itervalues(kernel.substitutions): + for sr in kernel.substitutions.values(): dfmapper(sr.expression) # }}} @@ -151,8 +148,30 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] + def transform_assignee(expr): + # Assignment LHS's cannot be subst rules. Treat them + # specially. + + import pymbolic.primitives as prim + if isinstance(expr, tuple): + return tuple( + transform_assignee(expr_i) + for expr_i in expr) + + elif isinstance(expr, prim.Subscript): + return type(expr)( + expr.aggregate, + cbmapper(expr.index)) + + elif isinstance(expr, prim.Variable): + return expr + else: + raise ValueError("assignment LHS not understood") + for insn in kernel.instructions: - new_insns.append(insn.with_transformed_expressions(cbmapper)) + new_insns.append( + insn.with_transformed_expressions( + cbmapper, assignee_f=transform_assignee)) from loopy.kernel.data import SubstitutionRule new_substs = { @@ -162,7 +181,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): expression=template, )} - for subst in six.itervalues(kernel.substitutions): + for subst in kernel.substitutions.values(): new_substs[subst.name] = subst.copy( expression=cbmapper(subst.expression)) @@ -183,7 +202,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): usage_to_definition, extra_arguments, within): self.var_name_gen = rule_mapping_context.make_unique_var_name - super(AssignmentToSubstChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.lhs_name = lhs_name self.definition_insn_ids = definition_insn_ids @@ -215,7 +234,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): if result is not None: return result - return super(AssignmentToSubstChanger, self).map_variable( + return super().map_variable( expr, expn_state) def map_subscript(self, expr, expn_state): @@ -225,7 +244,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): if result is not None: return result - return super(AssignmentToSubstChanger, self).map_subscript( + return super().map_subscript( expr, expn_state) def transform_access(self, index, expn_state): @@ -261,6 +280,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): @iterate_over_kernels_if_given_program +@remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -363,7 +383,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ create new substitution rules new_substs = kernel.substitutions.copy() - for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name): + for def_id, subst_name in tts.definition_insn_id_to_subst_name.items(): def_insn = kernel.id_to_insn[def_id] 
from loopy.kernel.data import Assignment @@ -404,7 +424,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, new_args = kernel.args if lhs_name in kernel.temporary_variables: - if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + if not any(tts.saw_unmatched_usage_sites.values()): # All usage sites matched--they're now substitution rules. # We can get rid of the variable. @@ -412,7 +432,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, del new_temp_vars[lhs_name] if lhs_name in kernel.arg_dict and not force_retain_argument: - if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + if not any(tts.saw_unmatched_usage_sites.values()): # All usage sites matched--they're now substitution rules. # We can get rid of the argument @@ -427,11 +447,10 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, import loopy as lp kernel = lp.remove_instructions( kernel, - set( + { insn_id - for insn_id, still_used in six.iteritems( - tts.saw_unmatched_usage_sites) - if not still_used)) + for insn_id, still_used in tts.saw_unmatched_usage_sites.items() + if not still_used}) return kernel.copy( substitutions=new_substs, @@ -475,7 +494,7 @@ def expand_subst(kernel, within=None): # {{{ find substitution rules by glob patterns -def find_rules_matching(knl, pattern): +def find_rules_matching(kernel, pattern): """ :pattern: A shell-style glob pattern. """ @@ -483,7 +502,7 @@ def find_rules_matching(knl, pattern): from loopy.match import re_from_glob pattern = re_from_glob(pattern) - return [r for r in knl.substitutions if pattern.match(r)] + return [r for r in kernel.substitutions if pattern.match(r)] def find_one_rule_matching(program, pattern): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0d4430e0dd61f35d6c53d8d176449fbd67722cf9..7718988aab98fbf26c221110d4f02487cc675fa3 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - from pymbolic.mapper import CombineMapper import numpy as np @@ -49,7 +45,7 @@ logger = logging.getLogger(__name__) def _debug(kernel, s, *args): if logger.isEnabledFor(logging.DEBUG): logstr = s % args - logger.debug("%s: %s" % (kernel.name, logstr)) + logger.debug(f"{kernel.name}: {logstr}") def get_return_types_as_tuple(arg_id_to_dtype): @@ -58,8 +54,8 @@ def get_return_types_as_tuple(arg_id_to_dtype): :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" - return_arg_id_to_dtype = dict((id, dtype) for id, dtype in - arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) @@ -75,7 +71,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, calls_to_new_names, subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.calls_to_new_names = calls_to_new_names self.subst_expander = subst_expander @@ -98,7 +94,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expanded_expr.parameters)) else: - return super(FunctionNameChanger, self).map_call( + return super().map_call( expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -110,12 +106,12 @@ class FunctionNameChanger(RuleAwareIdentityMapper): ResolvedFunction(self.calls_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) else: - return super(FunctionNameChanger, self).map_call_with_kwargs( + return super().map_call_with_kwargs( expr, expn_state) @@ -219,7 +215,7 @@ class TypeInferenceMapper(CombineMapper): if return_tuple: kwargs["return_tuple"] = True - result = super(TypeInferenceMapper, self).__call__( + result = super().__call__( expr, **kwargs) assert isinstance(result, list) @@ -396,7 +392,7 @@ class TypeInferenceMapper(CombineMapper): def map_type_cast(self, expr): subtype, = self.rec(expr.child) if not issubclass(subtype.dtype.type, np.number): - raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) + raise LoopyError(f"Can't cast a '{subtype}' to '{expr.type}'") return [expr.type] def map_subscript(self, expr): @@ -426,8 +422,8 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in - tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): @@ -525,11 +521,11 @@ class TypeInferenceMapper(CombineMapper): ValueArgDescriptor) # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes - arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) - for i, dt in enumerate(mangle_result.arg_dtypes)) - arg_id_to_dtype.update(dict((-i-1, - dtype.with_target(self.kernel.target)) for i, dtype in enumerate( - mangle_result.result_dtypes))) + arg_id_to_dtype = {i: dt.with_target(self.kernel.target) + for i, dt in enumerate(mangle_result.arg_dtypes)} + arg_id_to_dtype.update({-i-1: + dtype.with_target(self.kernel.target) for i, dtype in enumerate( + mangle_result.result_dtypes)}) arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in enumerate(mangle_result.arg_dtypes)) res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in @@ -726,11 +722,11 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, 
return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): - return_dtype_set = type_inf_mapper(expr, return_tuple=True, + return_dtype_sets = type_inf_mapper(expr, return_tuple=True, return_dtype_set=True) result = [] - for return_dtype_set in return_dtype_set: + for return_dtype_set in return_dtype_sets: result_i = None found = False for assignee, comp_dtype_set in zip( @@ -810,7 +806,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, names_for_type_inference = [] import loopy as lp - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): assert tv.dtype is not lp.auto if tv.dtype is None: names_for_type_inference.append(tv.name) @@ -827,15 +823,15 @@ writer_map = kernel.writer_map() - dep_graph = dict( - (written_var, set( + dep_graph = { + written_var: { read_var for insn_id in writer_map.get(written_var, []) for read_var in kernel.id_to_insn[insn_id].read_dependency_names() - if read_var in names_for_type_inference)) - for written_var in names_for_type_inference) + if read_var in names_for_type_inference} + for written_var in names_for_type_inference} - from loopy.tools import compute_sccs + from pytools.graph import compute_sccs # To speed up processing, we sort the variables by computing the SCCs of the # type dependency graph. Each SCC represents a set of variables whose types diff --git a/loopy/types.py b/loopy/types.py index 4e77317c105a1f8b6acb61029ae6d81533d60372..2457049073eab8c73202e324514526097b56c4d1 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,13 +20,28 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa import numpy as np from loopy.diagnostic import LoopyError +__doc__ = """ +.. currentmodule:: loopy.types + +.. autoclass:: LoopyType + +.. autoclass:: NumpyType + +.. autoclass:: AtomicType + +.. autoclass:: AtomicNumpyType +""" -class LoopyType(object): + +class LoopyType: + """ + Abstract class for dtypes of variables encountered in a + :class:`loopy.LoopKernel`. + """ def with_target(self, target): return self @@ -55,7 +68,10 @@ class LoopyType(object): class AtomicType(LoopyType): - pass + """ + Abstract class for dtypes of variables encountered in a :class:`loopy.LoopKernel` + on which atomic operations are performed.
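# Usage sketch of the compute_sccs now imported from pytools.graph, on a toy
# dependency graph (mapping node -> set of successors):
from pytools.graph import compute_sccs

toy_dep_graph = {"a": {"b"}, "b": {"a"}, "c": {"a"}}
# Returns one list per strongly connected component, e.g.
# [["a", "b"], ["c"]] (component ordering may vary).
print(compute_sccs(toy_dep_graph))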
+ """ # {{{ numpy-based dtype @@ -137,7 +153,7 @@ class NumpyType(LoopyType): else: return any( dtype_involves_complex(f[0]) - for f in six.itervalues(dtype.fields)) + for f in dtype.fields.values()) return dtype_involves_complex(self.dtype) diff --git a/loopy/version.py b/loopy/version.py index 29abbc2de889b884de93e5fe39a1d996811c93c9..fddd44479adcae87ec96f470a690274b154fde54 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2019, 1) +VERSION = (2020, 2, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -60,21 +60,17 @@ except ImportError: else: _cgen_version = cgen.version.VERSION_TEXT -DATA_MODEL_VERSION = "%s-islpy%s-cgen%s-%s-v0" % ( +DATA_MODEL_VERSION = "{}-islpy{}-cgen{}-{}-v1".format( VERSION_TEXT, _islpy_version, _cgen_version, _git_rev) -FALLBACK_LANGUAGE_VERSION = (2017, 2, 1) +FALLBACK_LANGUAGE_VERSION = (2018, 2) MOST_RECENT_LANGUAGE_VERSION = (2018, 2) LOOPY_USE_LANGUAGE_VERSION_2018_2 = (2018, 2) -LOOPY_USE_LANGUAGE_VERSION_2018_1 = (2018, 1) -LOOPY_USE_LANGUAGE_VERSION_2017_2_1 = (2017, 2, 1) LANGUAGE_VERSION_SYMBOLS = [ "LOOPY_USE_LANGUAGE_VERSION_2018_2", - "LOOPY_USE_LANGUAGE_VERSION_2018_1", - "LOOPY_USE_LANGUAGE_VERSION_2017_2_1", ] __doc__ = """ @@ -102,7 +98,7 @@ language version to let them take advantage of this check. As a result, :mod:`loopy` will now issue a warning when a call to :func:`loopy.make_kernel` does not declare a language version. Such kernels -will (indefinitely) default to language version 2017.2.1. If passing a +will (indefinitely) default to language version 2018.2. If passing a language version to :func:`make_kernel` is impractical, you may also import one of the ``LOOPY_USE_LANGUAGE_VERSION_...`` symbols given below using:: @@ -129,14 +125,16 @@ History of Language Versions .. data:: LOOPY_USE_LANGUAGE_VERSION_2018_2 - :attr:`loopy.Options.ignore_boostable_into` is turned on by default. + ``loopy.Options.ignore_boostable_into`` is turned on by default. .. data:: LOOPY_USE_LANGUAGE_VERSION_2018_1 - :attr:`loopy.Options.enforce_variable_access_ordered` - is turned on by default. + :attr:`loopy.Options.enforce_variable_access_ordered` is turned on by + default. Unsupported from :mod:`loopy` version 2020.2 onwards. .. data:: LOOPY_USE_LANGUAGE_VERSION_2017_2_1 - Initial legacy language version. + Initial legacy language version. Unsupported from :mod:`loopy` version + 2020.2 onwards. 
+ """ diff --git a/proto-tests/test_fem_assembly.py b/proto-tests/test_fem_assembly.py index 18f2a5bfabdd52abad9d78aacf4f1d5be53b5ac1..dde093d53be125c2b1eaf13022d51b3300b61314 100644 --- a/proto-tests/test_fem_assembly.py +++ b/proto-tests/test_fem_assembly.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_sem.py b/proto-tests/test_sem.py index 4613b74ae787fe086ead935ddec61ff1a5438521..b84d072d0546270e6d21702f7b0f5b6354f7a238 100644 --- a/proto-tests/test_sem.py +++ b/proto-tests/test_sem.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_sem_tim.py b/proto-tests/test_sem_tim.py index 1bfb437fb6de1cb5511d108eb35a8ad32326122e..9d8dfcfa680fc484f20c9511b34210b15af8d635 100644 --- a/proto-tests/test_sem_tim.py +++ b/proto-tests/test_sem_tim.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index d7061933e5667a623b4157ea6900a4b13c55e6c4..773821dce08adb758f49da6e8a6102011005beec 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/requirements.txt b/requirements.txt index 97c2024764715d0a715520800e2e1dd467183479..2105aede063c65752ef4a9262eb960f749778a8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -git+https://github.com/inducer/pytools.git -git+https://github.com/inducer/islpy.git -git+https://github.com/inducer/cgen.git -git+https://github.com/inducer/pyopencl.git -git+https://github.com/inducer/pymbolic.git -git+https://github.com/inducer/genpy.git -git+https://github.com/inducer/codepy.git +git+https://github.com/inducer/pytools.git#egg=pytools +git+https://github.com/inducer/islpy.git#egg=islpy +git+https://github.com/inducer/cgen.git#egg=cgen +git+https://github.com/inducer/pyopencl.git#egg=pyopencl +git+https://github.com/inducer/pymbolic.git#egg=pymbolic +git+https://github.com/inducer/genpy.git#egg=genpy +git+https://github.com/inducer/codepy.git#egg=codepy git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 \ No newline at end of file +ply>=3.6 diff --git a/setup.cfg b/setup.cfg index a0d95746e1a399d6a2d7c315bffc9b834d2f5487..9495d106cf389d485037db16a35a14b4aaf6c873 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,7 @@ max-line-length=85 exclude= loopy/target/c/compyte/ndarray, loopy/target/c/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ diff --git a/setup.py b/setup.py index 92c16a0f5d03f84d87106b6ec9d25b95a00a5872..ddc47fefca853321d383bad4aeaa6f24f6d5c901 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import os from setuptools import setup, find_packages @@ -12,7 +11,7 @@ finally: version_file.close() os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1" -exec(compile(version_file_contents, "loopy/version.py", 'exec'), ver_dic) +exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic) # {{{ capture git revision at install time @@ -34,9 +33,7 @@ def find_git_revision(tree_root): cwd=tree_root) (git_rev, _) = p.communicate() - import sys - if sys.version_info >= (3,): - git_rev = git_rev.decode() + git_rev = git_rev.decode() git_rev = git_rev.rstrip() @@ 
-56,7 +53,7 @@ def write_git_revision(package_name): git_rev = find_git_revision(dn) with open(join(dn, package_name, "_git_rev.py"), "w") as outf: - outf.write("GIT_REVISION = %s\n" % repr(git_rev)) + outf.write('GIT_REVISION = "%s"\n' % git_rev) write_git_revision("loopy") @@ -64,37 +61,34 @@ write_git_revision("loopy") # }}} -setup(name="loo.py", +setup(name="loopy", version=ver_dic["VERSION_TEXT"], description="A code generator for array-based code on CPUs and GPUs", - long_description=open("README.rst", "rt").read(), + long_description=open("README.rst").read(), classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Visualization', - 'Topic :: Software Development :: Libraries', - 'Topic :: Utilities', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Visualization", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", ], + python_requires="~=3.6", install_requires=[ - "pytools>=2020.1", + "pytools>=2020.4", "pymbolic>=2019.2", "genpy>=2016.1.2", "cgen>=2016.1", "islpy>=2019.1", - "six>=1.8.0", "codepy>=2017.1", "colorama", "Mako", @@ -102,7 +96,7 @@ setup(name="loo.py", extras_require={ "pyopencl": [ - "pyopencl>=2015.2", + "pyopencl>=2020.2", ], "fortran": [ # Note that this is *not* regular 'f2py2e', this is @@ -120,7 +114,7 @@ setup(name="loo.py", scripts=["bin/loopy"], author="Andreas Kloeckner", - url="http://mathema.tician.de/software/loopy", + url="https://mathema.tician.de/software/loopy", author_email="inform@tiker.net", license="MIT", packages=find_packages(), diff --git a/test/test_apps.py b/test/test_apps.py index c1ff4b893c459f9860fca3fbda8d06406676b8b5..c1d3410d9b931012874fe5ffd9a6ae866e221be3 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -47,7 +45,7 @@ from loopy.diagnostic import LoopyError __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -101,8 +99,11 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", + fetch_outer_inames="im_x_outer, im_y_outer, ifeat", + default_tag="l.auto") knl = lp.add_prefetch(knl, "img", 
"im_x_inner, im_y_inner, f_x, f_y", + fetch_outer_inames="iimg, im_x_outer, im_y_outer, ifeat, icolor", default_tag="l.auto") return knl @@ -567,7 +568,7 @@ def test_poisson_fem(ctx_factory): sdim = 3 knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ + "{ [c,i,j,k,ell,ell2]: \ 0 <= c < nels and \ 0 <= i < nbf and \ 0 <= j < nbf and \ @@ -590,12 +591,12 @@ def test_poisson_fem(ctx_factory): knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"]) def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl @@ -631,10 +632,10 @@ def test_domain_tree_nesting(): TV = lp.TemporaryVariable # noqa - knl = lp.make_kernel(['{[i]: 0 <= i < 12}', - '{[j]: 0 <= j < 100}', - '{[a_count]: 0 <= a_count < a_end}', - '{[b_count]: 0 <= b_count < b_end}'], + knl = lp.make_kernel(["{[i]: 0 <= i < 12}", + "{[j]: 0 <= j < 100}", + "{[a_count]: 0 <= a_count < a_end}", + "{[b_count]: 0 <= b_count < b_end}"], """ for j for i @@ -653,15 +654,15 @@ def test_domain_tree_nesting(): end """, [ - TV('out_map', initializer=out_map, read_only=True, address_space=AS.PRIVATE), - TV('if_val', initializer=if_val, read_only=True, address_space=AS.PRIVATE), - TV('vals', initializer=vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals', initializer=num_vals, read_only=True, + TV("out_map", initializer=out_map, read_only=True, address_space=AS.PRIVATE), + TV("if_val", initializer=if_val, read_only=True, address_space=AS.PRIVATE), + TV("vals", initializer=vals, read_only=True, address_space=AS.PRIVATE), + TV("num_vals", initializer=num_vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals_offset', initializer=num_vals_offset, read_only=True, + TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), - lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), - lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) parents_per_domain = knl.root_kernel.parents_per_domain() diff --git a/test/test_c_execution.py b/test/test_c_execution.py index b1f335bbb7fdcd9cf1e53603d5b70d1a224ee140..75b4571004cfd046ba35f9407ce614bac0f5d2df 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. 
import numpy as np import loopy as lp import sys -import six import pytest from loopy import CACHING_ENABLED @@ -63,30 +60,29 @@ def test_c_target(): def test_c_target_strides(): from loopy.target.c import ExecutableCTarget - def __get_kernel(order='C'): + def __get_kernel(order="C"): return lp.make_kernel( "{ [i,j]: 0<=i,j self.ubound: raise BoundsCheckError() @@ -326,11 +321,11 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): shape=())) data.extend([ lp.TemporaryVariable(name, get_numpy_type(val)) - for name, val in six.iteritems(var_values) + for name, val in var_values.items() ]) instructions.extend([ lp.Assignment(name, get_numpy_type(val)(val)) - for name, val in six.iteritems(var_values) + for name, val in var_values.items() ]) instructions.append(lp.Assignment(var_name, expr)) @@ -350,7 +345,7 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): print(knl) evt, lp_values = knl(queue, out_host=True) - for name, ref_value in six.iteritems(ref_values): + for name, ref_value in ref_values.items(): lp_value = lp_values[name] if expr_type in ["real", "complex"]: err = abs(ref_value-lp_value)/abs(ref_value) @@ -365,7 +360,7 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): print(80*"-") print(lp.generate_code_v2(knl).device_code()) print(80*"-") - print("WRONG: %s rel error=%g" % (name, err)) + print(f"WRONG: {name} rel error={err:g}") print("reference=%r" % ref_value) print("loopy=%r" % lp_value) print(80*"-") @@ -381,8 +376,8 @@ def test_sci_notation_literal(ctx_factory): queue = cl.CommandQueue(ctx) set_kernel = lp.make_kernel( - ''' { [i]: 0<=i<12 } ''', - ''' out[i] = 1e-12''') + """ { [i]: 0<=i<12 } """, + """ out[i] = 1e-12""") set_kernel = lp.set_options(set_kernel, write_cl=True) @@ -396,8 +391,8 @@ def test_indexof(ctx_factory): queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j<5 } ''', - ''' out[i,j] = indexof(out[i,j])''') + """ { [i,j]: 0<=i,j<5 } """, + """ out[i,j] = indexof(out[i,j])""") knl = lp.set_options(knl, write_cl=True) @@ -420,8 +415,8 @@ def test_indexof_vec(ctx_factory): pytest.skip("target ICD miscompiles vector code") knl = lp.make_kernel( - ''' { [i,j,k]: 0<=i,j,k<4 } ''', - ''' out[i,j,k] = indexof_vec(out[i,j,k])''') + """ { [i,j,k]: 0<=i,j,k<4 } """, + """ out[i,j,k] = indexof_vec(out[i,j,k])""") knl = lp.tag_inames(knl, {"i": "vec"}) knl = lp.tag_data_axes(knl, "out", "vec,c,c") @@ -479,7 +474,7 @@ def test_divide_precedence(ctx_factory): x[0] = c*(a/b) y[0] = c*(a%b) """, - [lp.ValueArg('a, b, c', np.int32), lp.GlobalArg('x, y', np.int32)]) + [lp.ValueArg("a, b, c", np.int32), lp.GlobalArg("x, y", np.int32)]) print(lp.generate_code_v2(knl).device_code()) evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5) diff --git a/test/test_fortran.py b/test/test_fortran.py index 2e67116969d6b5f8fd8d7854bc2617431e3c14d9..a0f3cc7bd615c69e80f1ff2f7b706939e34501de 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -38,11 +36,11 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] -pytestmark = pytest.mark.importorskip("fparser") +pytest.importorskip("fparser") def test_fp_prec_comparison(): @@ -407,8 +405,12 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", 
parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -586,9 +588,11 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", precompute_inames="ktemp,itemp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", precompute_inames="itemp,k2temp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") ref_knl = knl diff --git a/test/test_isl.py b/test/test_isl.py index ff58a1bb315d992051018ca38992820156393192..b55224654fea8b22684fdc693704afcb558e00c6 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -54,18 +54,21 @@ def test_pw_aff_to_conditional_expr(): def test_subst_into_pwqpolynomial(): from pymbolic.primitives import Variable arg_dict = { - 'm': 3*Variable("nx"), - 'n': 3*Variable("ny"), - 'nx': Variable('nx'), - 'ny': Variable('ny'), - 'nz': Variable('nz')} + "m": 3*Variable("nx"), + "n": 3*Variable("ny"), + "nx": Variable("nx"), + "ny": Variable("ny"), + "nz": Variable("nz")} space = isl.Set("[nx, ny, nz] -> { []: }").space poly = isl.PwQPolynomial("[m, n] -> { (256 * m + 256 * m * n) : " "m > 0 and n > 0; 256 * m : m > 0 and n <= 0 }") from loopy.isl_helpers import subst_into_pwqpolynomial result = subst_into_pwqpolynomial(space, poly, arg_dict) - assert "(768 * nx + 2304 * nx * ny)" in str(result) + expected_pwqpoly = isl.PwQPolynomial("[nx, ny, nz] -> {" + "(768 * nx + 2304 * nx * ny) : nx > 0 and ny > 0;" + "768 * nx : nx > 0 and ny <= 0 }") + assert (result - expected_pwqpoly).is_zero() if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/test_linalg.py index f075d3493195ec3364c4de0d26f92c4a987e7187..9146e84bff1bba14807504978ed5da09bc31ace4 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -148,7 +146,7 @@ def test_transpose(ctx_factory): outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"], + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -186,8 +184,10 @@ def test_plain_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -223,8 +223,12 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") - knl = 
lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -263,8 +267,10 @@ def test_funny_shape_matrix_mul(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -307,8 +313,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a") - knl = lp.add_prefetch(knl, "b") + knl = lp.add_prefetch(knl, "a", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") + knl = lp.add_prefetch(knl, "b", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") return knl def variant_3(knl): @@ -317,8 +325,15 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + return knl def variant_4(knl): @@ -327,8 +342,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") @@ -384,7 +401,8 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) - knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -424,15 +442,17 @@ def test_intel_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") - knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], + knl = lp.add_prefetch(knl, "a", ["i_inner_inner", "k_inner", "i_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"], + knl = lp.add_prefetch(knl, "b", ["j_inner_inner", "k_inner", 
"j_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") # FIXME: Grouped prefetch - #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")], + #knl = lp.add_prefetch(knl, "a", ["k_inner", ("i_inner_inner", "i_inner_outer")], # default_tag="l.auto") - #knl = lp.add_prefetch(knl, 'b', + #knl = lp.add_prefetch(knl, "b", # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") #hints=["k_outer", "k_inner_outer", "k_inner_inner"] @@ -484,9 +504,9 @@ def test_magma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") # FIXME - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"], # default_tag="l.auto") - #knl = lp.add_prefetch(knl, 'b', + #knl = lp.add_prefetch(knl, "b", # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -528,8 +548,12 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -574,8 +598,8 @@ def no_test_image_matrix_mul_ilp(ctx_factory): outer_tag="ilp", inner_tag="l.0") knl = lp.split_iname(knl, "k", 2) # conflict-free? 
- knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"], + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner_outer", "j_inner_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -608,8 +632,12 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["k_inner", "j_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -640,7 +668,7 @@ def test_small_batched_matvec(ctx_factory): seq_knl = knl align_bytes = 64 - knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto") + knl = lp.add_prefetch(knl, "d[:,:]", default_tag="l.auto") pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes) knl = lp.split_array_dim(knl, ("f", 0), pad_mult) knl = lp.add_padding(knl, "f", 0, align_bytes) diff --git a/test/test_loopy.py b/test/test_loopy.py index 3c985640bae6cdb07939e1a3a752b642f6dac2e6..09d926b1376689fb7289b95910f4f9e33b651166 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,9 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa: F401 -from six.moves import range - import sys import numpy as np import loopy as lp @@ -48,7 +43,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -68,16 +63,16 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): out[ii] = 2*out[ii]+cnst[ii]{id=second} """, [lp.TemporaryVariable( - 'cnst', shape=('n'), initializer=cnst, - address_space=lp.AddressSpace.GLOBAL, - read_only=True), '...']) + "cnst", initializer=cnst, + scope=lp.AddressSpace.GLOBAL, + read_only=True), "..."]) knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0") evt, (out,) = knl(queue, a=a) - assert np.linalg.norm(out-((2*(a+cnst)+cnst))) <= 1e-15 + assert np.linalg.norm(out-(2*(a+cnst)+cnst)) <= 1e-15 def test_complicated_subst(ctx_factory): @@ -182,7 +177,7 @@ def test_simple_side_effect(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j<100}", + "{[i]: 0<=i<100}", """ a[i] = a[i] + 1 """, @@ -458,7 +453,7 @@ def test_nonlinear_index(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j src_ibox = source_boxes[isrc_box] @@ -769,7 +764,7 @@ def test_multiple_writes_to_local_temporary(): # writes are OK. 
knl = lp.make_kernel( - "{[i,e]: 0<=i<5 and 0<=e temp[i, 0] = 17 temp[i, 1] = 15 @@ -845,8 +840,8 @@ def test_auto_test_zero_warmup_rounds(ctx_factory): def test_variable_size_temporary(): knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j z[i] = z[i+1] + z[i] {id=wr_z,dep=top} <> v[i] = 11 {id=wr_v,dep=top} - ... gbarrier {dep=wr_z:wr_v,id=yoink} - z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=wr_z} + ... gbarrier {id=yoink,dep=wr_z:wr_v} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} end ... gbarrier {dep=iupd,id=postloop} z[i] = z[i] - z[i+1] + v[i] {dep=postloop} @@ -1754,7 +1749,7 @@ def test_index_cse(ctx_factory): def test_ilp_and_conditionals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond = T[k] < 0.5 @@ -1769,7 +1764,7 @@ def test_ilp_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp') + knl = lp.split_iname(knl, "k", 2, inner_tag="ilp") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1777,7 +1772,7 @@ def test_ilp_and_conditionals(ctx_factory): def test_unr_and_conditionals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1792,7 +1787,7 @@ def test_unr_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='unr') + knl = lp.split_iname(knl, "k", 2, inner_tag="unr") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1800,7 +1795,7 @@ def test_unr_and_conditionals(ctx_factory): def test_constant_array_args(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1809,8 +1804,8 @@ def test_constant_array_args(ctx_factory): end end """, - [lp.ConstantArg('T', shape=(200,), dtype=np.float32), - '...']) + [lp.ConstantArg("T", shape=(200,), dtype=np.float32), + "..."]) knl = lp.fix_parameters(knl, n=200) @@ -1871,33 +1866,33 @@ def test_const_temp_with_initializer_not_saved(): def test_header_extract(): - knl = lp.make_kernel('{[k]: 0<=k {[]: }")], # empty (domain w/unused inames errors) "a = 1", [lp.TemporaryVariable("a", dtype=np.float64, shape=(), base_storage="base")]) @@ -2090,38 +2086,37 @@ def test_integer_reduction(ctx_factory): n = 200 for vtype in [np.int32, np.int64]: var_int = np.random.randint(1000, size=n).astype(vtype) - var_lp = lp.TemporaryVariable('var', initializer=var_int, + var_lp = lp.TemporaryVariable("var", initializer=var_int, read_only=True, address_space=lp.AddressSpace.PRIVATE, dtype=to_loopy_type(vtype), shape=lp.auto) from collections import namedtuple - ReductionTest = namedtuple('ReductionTest', 'kind, check, args') + ReductionTest = namedtuple("ReductionTest", "kind, check, args") reductions = [ - ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'), - ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'), - ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'), - ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'), - ReductionTest('argmax', + ReductionTest("max", lambda x: x == np.max(var_int), args="var[k]"), + ReductionTest("min", lambda x: x == np.min(var_int), args="var[k]"), + ReductionTest("sum", lambda x: x == np.sum(var_int), args="var[k]"), + ReductionTest("product", lambda x: x == np.prod(var_int), args="var[k]"), + ReductionTest("argmax", lambda x: ( x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), - args='var[k], k'), - ReductionTest('argmin', + args="var[k], k"), + ReductionTest("argmin", lambda x: ( x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), - args='var[k], k') + 
args="var[k], k") ] for reduction, function, args in reductions: - kstr = ("out" if 'arg' not in reduction + kstr = ("out" if "arg" not in reduction else "out[0], out[1]") - kstr += ' = {0}(k, {1})'.format(reduction, args) - knl = lp.make_kernel('{[k]: 0<=kind = indirect(offsets[i], offsets[i + 1], 1) out[i] = data[ind] end """, - [lp.GlobalArg('out', shape=('n',)), + [lp.GlobalArg("out", shape=("n",)), lp.TemporaryVariable( - 'offsets', shape=(offsets.size,), initializer=offsets, + "offsets", shape=(offsets.size,), initializer=offsets, address_space=lp.AddressSpace.GLOBAL, read_only=True), - lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)], ) # fixt params, and add manglers / preamble @@ -2557,13 +2549,13 @@ def test_preamble_with_separate_temporaries(ctx_factory): SeparateTemporariesPreambleTestPreambleGenerator, ) func_info = dict( - func_name='indirect', + func_name="indirect", func_arg_dtypes=(np.int32, np.int32, np.int32), func_result_dtypes=(np.int32,), arr=lookup ) - kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.fix_parameters(kernel, **{"n": n}) kernel = lp.register_preamble_generators( kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)]) kernel = lp.register_function_manglers( @@ -2575,7 +2567,7 @@ def test_preamble_with_separate_temporaries(ctx_factory): queue = cl.CommandQueue(ctx) # check that it actually performs the lookup correctly assert np.allclose(kernel( - queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1]) def test_arg_inference_for_predicates(): @@ -2705,7 +2697,7 @@ def test_dep_cycle_printing_and_error(): # https://gitlab.tiker.net/inducer/loopy/issues/140 # This kernel has two dep cycles. - knl = lp.make_kernel('{[i,j,k]: 0 <= i,j,k < 12}', + knl = lp.make_kernel("{[i,j,k]: 0 <= i,j,k < 12}", """ for j for i @@ -2725,11 +2717,11 @@ def test_dep_cycle_printing_and_error(): end end """, - [lp.GlobalArg('a', shape=(12, 12), dtype=np.int32)]) + [lp.GlobalArg("a", shape=(12, 12), dtype=np.int32)]) - knl = lp.split_iname(knl, 'j', 4, inner_tag='vec') - knl = lp.split_array_axis(knl, 'a', 1, 4) - knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.split_iname(knl, "j", 4, inner_tag="vec") + knl = lp.split_array_axis(knl, "a", 1, 4) + knl = lp.tag_array_axes(knl, "a", "N1,N0,vec") knl = lp.preprocess_kernel(knl) from loopy.diagnostic import DependencyCycleFound @@ -2748,7 +2740,7 @@ def test_backwards_dep_printing_and_error(): d[i] = 7*a[i ] {id=insn5, dep=insn4} a[i] = a[i] + d[i] {id=insn6, dep=insn5} """, [ - lp.GlobalArg('a, b', dtype=np.float64), + lp.GlobalArg("a, b", dtype=np.float64), "..." 
]) @@ -2829,9 +2821,9 @@ def test_shape_mismatch_check(ctx_factory): def test_array_arg_extra_kwargs_persis_hash(): from loopy.tools import LoopyKeyBuilder - a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.LOCAL) - not_a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + not_a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.PRIVATE) key_builder = LoopyKeyBuilder() @@ -2887,13 +2879,79 @@ def test_non_integral_array_idx_raises(): """ out[j] = 0 {id=init} out[i] = a[1.94**i-1] {dep=init} - """, [lp.GlobalArg('a', np.float64), '...']) + """, [lp.GlobalArg("a", np.float64), "..."]) from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): print(lp.generate_code_v2(knl).device_code()) +@pytest.mark.parametrize("tag", ["for", "l.0", "g.0", "fixed"]) +def test_empty_domain(ctx_factory, tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + prg = lp.make_kernel( + "{[i,j]: 0 <= i < n}", + """ + for i + c = 1 + end + """) + + if tag == "fixed": + prg = lp.fix_parameters(prg, n=0) + kwargs = {} + else: + prg = lp.tag_inames(prg, {"i": tag}) + kwargs = {"n": 0} + + prg = lp.set_options(prg, write_code=True) + c = cl.array.zeros(queue, (), dtype=np.int32) + prg(queue, c=c, **kwargs) + + assert (c.get() == 0).all() + + +def test_access_check_with_conditionals(): + legal_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i < 10 else y[i-10] + z[i] = x[i] if 0 else 2.0f + z[i] = in[i-1] if i else 3.14f + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + lp.GlobalArg("in", shape=(19,), dtype=float), + ...], seq_dependencies=True) + lp.generate_code_v2(legal_knl) + + illegal_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i < 10 else y[i] + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + ...]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(illegal_knl) + + # current limitation: cannot handle non-affine conditions + legal_but_nonaffine_condition_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i*i < 100 else y[i-10] + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + ...]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(legal_but_nonaffine_condition_knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_misc.py b/test/test_misc.py index 7a834a6f5d393298e97df22d47a1de3b64354a42..58ba732ac1ddd0f1f0eaff9aeb83b9b38902cb49 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -22,9 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa import pytest -from six.moves import range import sys @@ -35,67 +31,23 @@ logger = logging.getLogger(__name__) from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -def test_compute_sccs(): - from loopy.tools import compute_sccs - import random - - rng = random.Random(0) - - def generate_random_graph(nnodes): - graph = dict((i, set()) for i in range(nnodes)) - for i in range(nnodes): - for j in range(nnodes): - # Edge probability 2/n: Generates decently interesting inputs. 
- if rng.randint(0, nnodes - 1) <= 1: - graph[i].add(j) - return graph - - def verify_sccs(graph, sccs): - visited = set() - - def visit(node): - if node in visited: - return [] - else: - visited.add(node) - result = [] - for child in graph[node]: - result = result + visit(child) - return result + [node] - - for scc in sccs: - scc = set(scc) - assert not scc & visited - # Check that starting from each element of the SCC results - # in the same set of reachable nodes. - for scc_root in scc: - visited.difference_update(scc) - result = visit(scc_root) - assert set(result) == scc, (set(result), scc) - - for nnodes in range(10, 20): - for i in range(40): - graph = generate_random_graph(nnodes) - verify_sccs(graph, compute_sccs(graph)) - - def test_SetTrie(): from loopy.kernel.tools import SetTrie s = SetTrie() - s.add_or_update(set([1, 2, 3])) - s.add_or_update(set([4, 2, 1])) - s.add_or_update(set([1, 5])) + s.add_or_update({1, 2, 3}) + s.add_or_update({4, 2, 1}) + s.add_or_update({1, 5}) result = [] s.descend(lambda prefix: result.extend(prefix)) assert result == [1, 2, 3, 4, 5] with pytest.raises(ValueError): - s.add_or_update(set([1, 4])) + s.add_or_update({1, 4}) -class PickleDetector(object): +class PickleDetector: """Contains a class attribute which flags if any instance was unpickled. """ diff --git a/test/test_nbody.py b/test/test_nbody.py index 5b36ed4163c650317d8656883eeda599a3c21faa..1254be7d37e2800dda163598b2a75c44a29641b6 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -77,7 +75,8 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], - ["x_fetch_j", "x_fetch_k"], default_tag=None) + ["x_fetch_j", "x_fetch_k"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index de0bcf70a7f3f86152e86524486e2730522df325..74d53b07018a72ee189eefb0ea02b194bb663629 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -1,6 +1,5 @@ """gNUMA differentiation kernel, wrapped up as a test.""" -from __future__ import division __copyright__ = "Copyright (C) 2015 Andreas Kloeckner, Lucas Wilcox" @@ -30,8 +29,6 @@ import pyopencl as cl import sys import os -pytestmark = pytest.mark.importorskip("fparser") - import logging logger = logging.getLogger(__name__) @@ -51,10 +48,11 @@ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa @pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa + pytest.importorskip("fparser") ctx = ctx_factory() filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90") - with open(filename, "r") as sourcef: + with open(filename) as sourcef: source = sourcef.read() source = source.replace("datafloat", "real*4") @@ -91,7 +89,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") + hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e", + default_tag="l.auto") if opt_level == 1: tap_hsv = hsv diff --git a/test/test_reduction.py 
b/test/test_reduction.py index 96bf7d70909eada3c77048d6ccb459a6f7a69367..965e5f1ab90ce2b4afe0f519ca85623d51ceb70d 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ diff --git a/test/test_scan.py b/test/test_scan.py index 101d8fc35f224c02ac6e836cbb49f65b3dd387a4..31875ce5d8ccaf824d090c17a57dfd7e347ba4d3 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = """ Copyright (C) 2012 Andreas Kloeckner Copyright (C) 2016, 2017 Matt Wala @@ -366,7 +364,7 @@ def test_argmax(ctx_factory, i_tag): def check_segmented_scan_output(arr, segment_boundaries_indices, out): - class SegmentGrouper(object): + class SegmentGrouper: def __init__(self): self.seg_idx = 0 diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index e022e92f3712d984c1ad68061d0052240ff9d20c..901affc57a63a45f4147940cfc0b9c03e57522d0 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -48,7 +46,7 @@ def test_tim2d(ctx_factory): # K - run-time symbolic knl = lp.make_kernel( - "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2= 0 and n >= k") + return k + + +@pytest.fixture +def split(vanilla): + k = lp.split_iname(vanilla, "i", 4, slabs=(1, 1)) + k = lp.prioritize_loops(k, "i_outer,i_inner") + return k + + +@pytest.fixture(params=[(1, 4), (1, 5), (4, 8)], + ids=lambda x: "{k=%s, n=%s}" % x) +def parameters(request): + return dict(zip("kn", request.param)) + + +def test_split_slabs(ctx_factory, vanilla, split, parameters): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + expect = clarray.zeros(queue, 8, dtype=np.int32) + actual = clarray.zeros(queue, 8, dtype=np.int32) + _, (expect, ) = vanilla(queue, a=expect, **parameters) + _, (actual, ) = split(queue, a=actual, **parameters) + assert np.array_equal(expect.get(), actual.get()) diff --git a/test/test_statistics.py b/test/test_statistics.py index ef5450599126df9f1acbfbcb544b2362438f2f90..c1ca86d35bf8687bda11b5068b81f9d48cfe8113 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1,5 +1,3 @@ -from __future__ import division, print_function - __copyright__ = "Copyright (C) 2015 James Stevens" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six import sys from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl @@ -66,16 +63,16 @@ def test_op_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name) + f64mul = op_map[lp.Op(np.dtype(np.float64), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -101,15 +98,15 @@ def test_op_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name) + f32mul = op_map[lp.Op(np.dtype(np.float32), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) assert f32 == f32add + f32mul @@ -137,14 +134,14 @@ def test_op_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f64add = op_map[lp.Op(np.float64, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name) + f64div = op_map[lp.Op(np.dtype(np.float64), "div", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -177,22 +174,22 @@ def test_op_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f32add = op_map[lp.Op(np.float32, "add", 
CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP, knl.name)].eval_with_dict( + f64pow = op_map[lp.Op(np.float64, "pow", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP, knl.name) + f64add = op_map[lp.Op(np.dtype(np.float64), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP, knl.name) + f64rsq = op_map[lp.Op(np.dtype(np.float64), "func:rsqrt", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP, knl.name) + f64sin = op_map[lp.Op(np.dtype(np.float64), "func:sin", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups @@ -227,25 +224,25 @@ def test_op_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} print(op_map) i32add = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP, 'bitwise') + lp.Op(np.int32, "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i32bw = op_map[ - lp.Op(np.int32, 'bw', CG.SUBGROUP, 'bitwise') + lp.Op(np.int32, "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64bw = op_map[ - lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64mul = op_map[ - lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "mul", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64add = op_map[ - lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64shift = op_map[ - lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "shift", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert i32add == n*m*ell*n_subgroups @@ -280,7 +277,7 @@ def test_op_counter_triangular_domain(): knl, subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)] + )[lp.Op(np.float64, "mul", CG.SUBGROUP, knl.name)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -316,34 +313,34 @@ def test_mem_access_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64l = mem_map[lp.MemAccess('global', np.float64, + f64l = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", 
count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64l += mem_map[lp.MemAccess('global', np.float64, + f64l += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -352,15 +349,15 @@ def test_mem_access_counter_basic(): assert f32l == (3*n*m*ell)*n_subgroups assert f64l == (2*n*m)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64s = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -386,22 +383,22 @@ def test_mem_access_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -409,9 +406,9 @@ def test_mem_access_counter_reduction(): # uniform: (count-per-sub-group)*n_subgroups assert f32l == (2*n*m*ell)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -419,9 +416,9 @@ def test_mem_access_counter_reduction(): # uniform: (count-per-sub-group)*n_subgroups assert f32s == (n*ell)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"] ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*f32l assert st_bytes == 4*f32s @@ -447,23 +444,23 @@ def test_mem_access_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") - f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), - direction='load') + f32_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float32), + 
direction="load") ].eval_with_dict(params) - f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='load') + f64_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="load") ].eval_with_dict(params) - f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='store') + f64_g_s = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="store") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -492,34 +489,34 @@ def test_mem_access_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32 += mem_map[lp.MemAccess('global', np.float32, + f32 += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 += mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -528,15 +525,15 @@ def test_mem_access_counter_specialops(): assert f32 == (2*n*m*ell)*n_subgroups assert f64 == (2*n*m)*n_subgroups - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.float64, + f64 = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -545,7 +542,7 @@ def test_mem_access_counter_specialops(): assert f32 == (n*m*ell)*n_subgroups assert f64 == (n*m)*n_subgroups - filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], + filtered_map = mem_map.filter_by(direction=["load"], variable=["a", "g"], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) @@ -575,34 +572,34 @@ def test_mem_access_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, 
kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), + i32 += mem_map[lp.MemAccess("global", np.dtype(np.int32), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -610,15 +607,15 @@ def test_mem_access_counter_bitwise(): # uniform: (count-per-sub-group)*n_subgroups assert i32 == (4*n*m+2*n*m*ell)*n_subgroups - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -650,7 +647,7 @@ def test_mem_access_counter_mixed(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 @@ -659,37 +656,37 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64uniform += mem_map[lp.MemAccess('global', np.float64, + f64uniform += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32uniform = mem_map[lp.MemAccess('global', np.float32, + f32uniform = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='x', + direction="load", variable="x", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='a', + f32nonconsec = mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='b', + f32nonconsec += 
mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -715,17 +712,17 @@ def test_mem_access_counter_mixed(): else: assert f32nonconsec == 3*n*m*ell - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='store', - variable='c', + f32nonconsec = mem_map[lp.MemAccess("global", np.float32, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="store", + variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -764,55 +761,55 @@ def test_mem_access_counter_nonconsec(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='g', + params = {"n": n, "m": m, "ell": ell} + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='h', + f64nonconsec += mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='a', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='b', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='store', - variable='e', + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="store", + variable="e", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', 
np.float32, - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='store', variable='c', + "global", np.float32, + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -822,40 +819,40 @@ def test_mem_access_counter_nonconsec(): mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='g', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='h', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='a', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='b', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -879,39 +876,39 @@ def test_mem_access_counter_consec(): knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size='guess') + subgroup_size="guess") n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='g', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='h', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='a', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="a", count_granularity=CG.WORKITEM, 
kernel_name=knl.name) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), + "global", np.dtype(np.float32), lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='b', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -919,17 +916,17 @@ def test_mem_access_counter_consec(): assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='store', variable='e', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="store", variable="e", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='store', variable='c', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -945,7 +942,7 @@ def test_count_granularity_val_checks(): lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True - lp.MemAccess(count_granularity='bushel') + lp.MemAccess(count_granularity="bushel") assert False except ValueError: assert True @@ -956,7 +953,7 @@ def test_count_granularity_val_checks(): lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True - lp.Op(count_granularity='bushel') + lp.Op(count_granularity="bushel") assert False except ValueError: assert True @@ -980,7 +977,7 @@ def test_barrier_counter_nobarriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} assert len(sync_map) == 1 assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 @@ -1006,7 +1003,7 @@ def test_barrier_counter_barriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 @@ -1044,7 +1041,7 @@ def test_all_counters_parallel_matmul(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -1057,16 +1054,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name) + lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name) + lp.Op(np.float32, "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name) + lp.Op(np.int32, "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1075,28 +1072,28 @@ def test_all_counters_parallel_matmul(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, 
subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', count_granularity=CG.WORKITEM, + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -1105,26 +1102,26 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=SGS).filter_by(mtype=['local']) + subgroup_size=SGS).filter_by(mtype=["local"]) - local_mem_l = local_mem_map.filter_by(direction=['load'] + local_mem_l = local_mem_map.filter_by(direction=["load"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert local_mem_l == m*2*n_subgroups - local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_a = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={1: 16}, gid_strides={}, - variable='a_fetch', + variable="a_fetch", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={0: 1}, gid_strides={}, - variable='b_fetch', + variable="b_fetch", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -1132,7 +1129,7 @@ def test_all_counters_parallel_matmul(): # (count-per-sub-group)*n_subgroups assert local_mem_l_a == local_mem_l_b == m*n_subgroups - local_mem_s = local_mem_map.filter_by(direction=['store'] + local_mem_s = local_mem_map.filter_by(direction=["store"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups @@ -1200,7 +1197,7 @@ def test_mem_access_tagged_variables(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -1209,20 +1206,20 @@ def test_mem_access_tagged_variables(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1}, gid_strides={1: bsize}, - direction='load', variable='b', - variable_tag='mmbload', + 
direction="load", variable="b", + variable_tag="mmbload", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', - variable_tag='mmaload', + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", + variable_tag="mmaload", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -1232,11 +1229,11 @@ def test_mem_access_tagged_variables(): # uniform: (count-per-sub-group)*n_subgroups assert f32s1la == m*n_subgroups - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', - variable_tag='mmresult', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", + variable_tag="mmresult", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -1256,7 +1253,7 @@ def test_gather_access_footprint(): from loopy.statistics import gather_access_footprints, count fp = gather_access_footprints(knl) - for key, footprint in six.iteritems(fp): + for key, footprint in fp.item(): print(key, count(knl.root_kernel, footprint)) @@ -1271,7 +1268,7 @@ def test_gather_access_footprint_2(): fp = gather_access_footprints(knl) params = {"n": 200} - for key, footprint in six.iteritems(fp): + for key, footprint in fp.items(): assert count(knl.root_kernel, footprint).eval_with_dict(params) == 200 print(key, count(knl.root_kernel, footprint)) @@ -1294,7 +1291,7 @@ def test_summations_and_filters(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 @@ -1304,24 +1301,24 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - loads_a = mem_map.filter_by(direction=['load'], variable=['a'], + loads_a = mem_map.filter_by(direction=["load"], variable=["a"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert loads_a == (2*n*m*ell)*n_subgroups - global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], + global_stores = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert global_stores == (n*m*ell + n*m)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) @@ -1330,10 +1327,10 @@ def test_summations_and_filters(): assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups # ignore stride and variable names in this map - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') - f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") + f32lall = 
reduced_map[lp.MemAccess("global", np.float32, direction="load") ].eval_with_dict(params) - f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') + f64lall = reduced_map[lp.MemAccess("global", np.float64, direction="load") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1345,7 +1342,7 @@ def test_summations_and_filters(): #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) @@ -1353,7 +1350,7 @@ def test_summations_and_filters(): assert f64 == n*m assert i32 == n*m*2 - addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params) + addsub_all = op_map.filter_by(name=["add", "sub"]).eval_and_sum(params) f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params) assert addsub_all == n*m*ell + n*m*2 assert f32ops_all == n*m*ell*3 @@ -1361,16 +1358,16 @@ def test_summations_and_filters(): non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params) assert non_field == 0 - ops_nodtype = op_map.group_by('name') - ops_noname = op_map.group_by('dtype') - mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) + ops_nodtype = op_map.group_by("name") + ops_noname = op_map.group_by("dtype") + mul_all = ops_nodtype[lp.Op(name="mul")].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) assert mul_all == n*m*ell + n*m assert f64ops_all == n*m def func_filter(key): return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \ - key.direction == 'load' + key.direction == "load" f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1394,7 +1391,7 @@ def test_strided_footprint(): knl = lp.split_iname(knl, "i_inner", bx, outer_tag="unr", inner_tag="l.0") footprints = lp.gather_access_footprints(knl) - x_l_foot = footprints[('x', 'read')] + x_l_foot = footprints[("x", "read")] from loopy.statistics import count num = count(knl.root_kernel, x_l_foot).eval_with_dict(param_dict) diff --git a/test/test_target.py b/test/test_target.py index 0d34310664e027f8e7ee133da871c91723295d10..e5b743d37fcae25db088854199ae8f15ed387d8a 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -48,7 +46,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -225,8 +223,9 @@ def test_tuple(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + import islpy as isl knl = lp.make_kernel( - "{ [i]: 0 = i }", + [isl.BasicSet("[] -> {[]: }")], """ a, b = make_tuple(1, 2.) 
""") @@ -272,9 +271,11 @@ def test_numba_cuda_target(): target=lp.NumbaCudaTarget()) knl = lp.assume(knl, "M>0") - knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') - knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1)) + knl = lp.add_prefetch(knl, "X[i,:]", + fetch_outer_inames="i_inner, i_outer, j_inner", + default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") @@ -318,7 +319,7 @@ def test_child_invalid_type_cast(): def test_target_invalid_type_cast(): - dtype = np.dtype([('', ' 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index 9300f45c33b80195facc70a44c360363a69b2396..ff593a0c85852701ada5cbecee6d4869941c29af 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -46,7 +44,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -75,7 +73,7 @@ def test_collect_common_factors(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j out_tmp = 0 {id=out_init,inames=i} out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} @@ -99,8 +97,8 @@ def test_to_batched(ctx_factory): queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j {[i,j]: 0<=i,j alpha = 2.0 {id=init_alpha} + for i + for j + c[i, j] = alpha*a[i]*b[j] {id=outerproduct} + end + end + """ + ], + [ + lp.GlobalArg("a", dtype, shape=("n",), order=order), + lp.GlobalArg("b", dtype, shape=("n",), order=order), + lp.GlobalArg("c", dtype, shape=("n, n"), order=order), + lp.ValueArg("n", np.int32, approximately=n), + ], + name="rank_one", + assumptions="n >= 16", + lang_version=(2018, 2)) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, + outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, + outer_tag="g.1", inner_tag="l.1") + + knl = lp.add_prefetch(knl, "a") + knl = lp.add_prefetch(knl, "b") + + knl = lp.add_inames_for_unused_hw_axes(knl) + + assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], + parameters={"n": n}) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/testlib.py b/test/testlib.py index c66367a7ccaeae2794c3f5ffa82e5670ada721c2..2d2a535fb2369e526c4b9304a60d680763cd8461 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -4,7 +4,7 @@ import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel -class GridOverride(object): +class GridOverride: def __init__(self, clean, vecsize): self.clean = clean self.vecsize = vecsize @@ -59,15 +59,15 @@ class SeparateTemporariesPreambleTestMangler( # check types if len(arg_dtypes) != len(arg_dtypes): - raise Exception('Unexpected number of arguments provided 
to mangler ' - '{}, expected {}, got {}'.format( + raise Exception("Unexpected number of arguments provided to mangler " + "{}, expected {}, got {}".format( self.func_name, len(self.func_arg_dtypes), len(arg_dtypes))) for i, (d1, d2) in enumerate(zip(self.func_arg_dtypes, arg_dtypes)): if not __compare(d1, d2): - raise Exception('Argument at index {} for mangler {} does not ' - 'match expected dtype. Expected {}, got {}'. + raise Exception("Argument at index {} for mangler {} does not " + "match expected dtype. Expected {}, got {}". format(i, self.func_name, str(d1), str(d2))) # get target for creation @@ -87,7 +87,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( func_match = next( (x for x in preamble_info.seen_functions if x.name == self.func_name), None) - desc = 'custom_funcs_indirect' + desc = "custom_funcs_indirect" if func_match is not None: from loopy.types import to_loopy_type # check types @@ -95,7 +95,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( func_match.arg_dtypes: # if match, create our temporary var = lp.TemporaryVariable( - 'lookup', initializer=self.arr, dtype=self.arr.dtype, + "lookup", initializer=self.arr, dtype=self.arr.dtype, shape=self.arr.shape, address_space=lp.AddressSpace.GLOBAL, read_only=True) # and code @@ -129,7 +129,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( decl = Initializer(decl, generate_array_literal( codegen_state, var, var.initializer)) # return generated code - yield (desc, '\n'.join([str(decl), code])) + yield (desc, "\n".join([str(decl), code])) # }}}
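
A minimal sketch (not part of the patch) of the counting interface that the test_statistics.py hunks above exercise: lp.get_op_map returns a map that can be indexed by a fully specified lp.Op, or collapsed with group_by/filter_by before evaluation. The kernel, the subgroup size of 32, and the parameter value below are illustrative assumptions; CG abbreviates loopy.statistics.CountGranularity, which the test module imports outside this excerpt.

    import numpy as np
    import loopy as lp
    from loopy.statistics import CountGranularity as CG

    # Illustrative kernel; any small elementwise kernel behaves the same way.
    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i] + b[i]",
            [lp.GlobalArg("a,b,out", dtype=np.float32, shape=("n",)), ...],
            lang_version=(2018, 2))

    op_map = lp.get_op_map(knl, subgroup_size=32, count_redundant_work=True)
    params = {"n": 512}

    # Look up one fully specified operation and evaluate its symbolic count...
    f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)
            ].eval_with_dict(params)

    # ...or collapse all attributes except the dtype before looking up.
    f32all = op_map.group_by("dtype")[lp.Op(dtype=np.float32)
            ].eval_with_dict(params)
    assert f32add <= f32all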
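Similarly, a hedged sketch of the gather_access_footprints interface behind the fp.items() fix above (the kernel and the n=200 value are made up for illustration): the function returns a plain dict keyed by (variable_name, "read"/"write") tuples, which is why Python 3's dict.items() is the drop-in replacement for six.iteritems.

    import numpy as np
    import loopy as lp
    from loopy.statistics import gather_access_footprints, count

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "y[i] = x[i] + x[i+1]",
            [lp.GlobalArg("x", np.float32, shape="n+1"),
             lp.GlobalArg("y", np.float32, shape="n"), ...],
            lang_version=(2018, 2))

    fp = gather_access_footprints(knl)
    for (vname, direction), footprint in fp.items():
        # count() turns the footprint set into a symbolic count;
        # evaluate it for a concrete parameter value.
        print(vname, direction,
              count(knl.root_kernel, footprint).eval_with_dict({"n": 200}))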
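Finally, the test_tuple hunk replaces the string domain "{ [i]: 0 = i }" with a zero-dimensional isl set. A small sketch of that idiom, assuming only islpy and loopy as above: a domain with no inames makes the instruction run exactly once, with no pinned dummy iname to schedule around.

    import islpy as isl
    import loopy as lp

    # "[] -> {[]: }": no parameters, no set dimensions -- a single point.
    knl = lp.make_kernel(
            [isl.BasicSet("[] -> {[]: }")],
            """
            a, b = make_tuple(1, 2.)
            """,
            lang_version=(2018, 2))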