diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d0481192817877edea3b8deaaaf86b480fab2a11
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,63 @@
+name: CI
+on:
+    push:
+        branches:
+        - master
+    pull_request:
+        paths-ignore:
+        - 'doc/*.rst'  # PRs touching only top-level doc/*.rst files do not trigger CI
+    schedule:
+        - cron:  '17 3 * * 0'  # weekly: Sundays at 03:17 (GitHub evaluates cron in UTC)
+
+jobs:
+    flake8:
+        name: Flake8
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -
+            uses: actions/setup-python@v1
+            with:
+                python-version: '3.x'  # quoted so YAML keeps it a string
+        -   name: "Main Script"  # downloads the shared ci-support helper and sources it
+            run: |
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
+                . ./prepare-and-run-flake8.sh ./loopy ./test
+
+    pylint:
+        name: Pylint
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"  # writes a Python 3.7 variant of the conda env file, then sources the pylint helper
+            run: |
+                sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
+                CONDA_ENVIRONMENT=.test-conda-env.yml
+                USE_CONDA_BUILD=1
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
+                . ./prepare-and-run-pylint.sh loopy test/test_*.py
+
+    pytest3:
+        name: Conda Pytest Py3
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"  # sets CONDA_ENVIRONMENT, then sources the downloaded ci-support script
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                . ./build-and-test-py-project-within-miniconda.sh
+
+    pytest_twice:
+        name: Pytest twice (for cache behavior) on Py3
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"  # sources the ci-support script, then invokes pytest once more
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                . ./build-and-test-py-project-within-miniconda.sh
+                ${PY_EXE} -m pytest -rw --durations=10 --tb=native  --junitxml=pytest.xml -rxs $TESTABLES
+
+# vim: sw=4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c11e507ee79cdc6f1567acbf6c12bbd7ed22f1cc..48bee8638df08ebe8c03a17f84c78851ff36466e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,26 +1,7 @@
-Python 2.7 POCL:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - export LOOPY_NO_CACHE=1
-  - export NO_DOCTESTS=1
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - pocl
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
-
 Python 3 POCL:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
@@ -57,7 +38,7 @@ Python 3 Intel:
 Python 3 POCL Twice With Cache:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -77,7 +58,7 @@ Python 3 POCL Twice With Cache:
 # PyPy POCL:
 #   script:
 #   - export PY_EXE=pypy
-#   - export PYOPENCL_TEST=portable
+#   - export PYOPENCL_TEST=portable:pthread
 #   - export EXTRA_INSTALL="pybind11 numpy mako"
 #   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
 #   - ". ./build-and-test-py-project.sh"
@@ -90,7 +71,7 @@ Python 3 POCL Twice With Cache:
 Python 3 POCL Examples:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
@@ -114,20 +95,6 @@ Pylint:
   except:
   - tags
 
-CentOS binary:
-  script:
-  - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
-  - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy)
-  artifacts:
-    expire_in: 4 weeks
-    paths:
-    - build-helpers/loopy-centos6
-  tags:
-  - docker
-  only:
-  - master
-  retry: 2
-
 Documentation:
   script:
   - EXTRA_INSTALL="pybind11 numpy"
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index a1fe086b4ac4562aaa8fafd32657aebbd1068e8a..ccbbc933aae2d3c0a28d7d30f178661950c76542 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -1,12 +1,12 @@
 name: test-conda-env
 channels:
 - conda-forge
-- defaults
+- nodefaults
 
 dependencies:
 - python=3
 - git
-- conda-forge::numpy
+- numpy
 - pocl
 - mako
 - pyopencl
diff --git a/README.rst b/README.rst
index fe7eb751a7144d9758df91914b643392de421450..3240983638e1f6f96ba7fec410c5c893db19c044 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
 .. image:: https://gitlab.tiker.net/inducer/loopy/badges/master/pipeline.svg
     :alt: Gitlab Build Status
     :target: https://gitlab.tiker.net/inducer/loopy/commits/master
-.. image:: https://dev.azure.com/ak-spam/inducer/_apis/build/status/inducer.loopy?branchName=master
-    :alt: Azure Build Status
-    :target: https://dev.azure.com/ak-spam/inducer/_build/latest?definitionId=10&branchName=master
+.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master&event=push
+    :alt: GitHub Build Status
+    :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush
 .. image:: https://badge.fury.io/py/loo.py.png
     :alt: Python Package Index Release Page
     :target: https://pypi.org/project/loo.py/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 0dfb2455568b275b40e699683071da3a1cd2f483..0000000000000000000000000000000000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-jobs:
--
-    job: 'Python2'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml
-            cat .test-conda-env-py2.yml
-            CONDA_ENVIRONMENT=.test-conda-env-py2.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-
-        displayName: 'Pytest Conda'
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Python3'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-
-        displayName: 'Pytest Conda'
-
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Python3Twice'
-    displayName: "Python3 - run tests twice to test cache behavior"
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-            ${PY_EXE} -m pytest -rw --durations=10 --tb=native  --junitxml=pytest.xml -rxs $TESTABLES
-
-        displayName: 'Pytest Conda'
-
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Flake8'
-    pool:
-        vmImage: 'ubuntu-latest'
-    strategy:
-        matrix:
-            Python37:
-                python.version: '3.7'
-
-    steps:
-    -
-        task: UsePythonVersion@0
-        inputs:
-            versionSpec: '$(python.version)'
-
-    -
-        script: |
-            set -e
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-            . ./prepare-and-run-flake8.sh loopy test
-
-        displayName: 'Flake8'
-
--
-    job: 'Pylint'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
-            CONDA_ENVIRONMENT=.test-conda-env.yml
-            USE_CONDA_BUILD=1
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
-            . ./prepare-and-run-pylint.sh loopy test/test_*.py
-
-        displayName: 'Pylint'
-
-schedules:
--
-    cron: "0 0 * * 0"
-    displayName: Weekly build
-    branches:
-        include:
-        - master
diff --git a/build-helpers/.gitignore b/build-helpers/.gitignore
deleted file mode 100644
index fef83014eecb14936006b90afc65595dd7d30b77..0000000000000000000000000000000000000000
--- a/build-helpers/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-loopy-*-20[0-9][0-9]*
diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec
deleted file mode 100644
index 08c0b6efe0efd3ad419b6565fd396c2f805eeab7..0000000000000000000000000000000000000000
--- a/build-helpers/loopy.spec
+++ /dev/null
@@ -1,70 +0,0 @@
-# -*- mode: python -*-
-
-from os.path import basename, dirname, join
-from glob import glob
-
-single_file = True
-
-# This makes the executable spew debug info.
-debug = False
-
-from os.path import expanduser
-
-import packaging # pip install packaging to add
-
-a = Analysis(['../bin/loopy'],
-             pathex=[expanduser('~/src/loopy')],
-             hiddenimports=[
-                "decorator",
-                "appdirs",
-                "packaging.markers",
-                "packaging.specifiers",
-                "packaging.version",
-                "packaging.requirements",
-                ],
-             hookspath=None,
-             runtime_hooks=None,
-             excludes=["hedge", "meshpy", "pyopencl", "PIL"]
-             )
-
-import ply.lex
-import ply.yacc
-
-
-a.datas += [
-  (join("py-src", "ply", "lex", basename(fn)), fn, "DATA")
-  for fn in glob(join(dirname(ply.lex.__file__), "*.py"))
-  ] + [
-  (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA")
-  for fn in glob(join(dirname(ply.yacc.__file__), "*.py"))
-  ]
-
-pyz = PYZ(a.pure)
-
-if single_file:
-    exe = EXE(pyz,
-              a.scripts,
-              a.binaries,
-              a.zipfiles,
-              a.datas,
-              name='loopy',
-              debug=debug,
-              strip=None,
-              upx=True,
-              console=True)
-else:
-    exe = EXE(pyz,
-              a.scripts,
-              exclude_binaries=True,
-              name='loopy',
-              debug=debug,
-              strip=None,
-              upx=True,
-              console=True)
-    coll = COLLECT(exe,
-                   a.binaries,
-                   a.zipfiles,
-                   a.datas,
-                   strip=None,
-                   upx=True,
-                   name='loopy')
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
deleted file mode 100755
index 035634b16072e0188270abd8736dab99ce31dada..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#! /bin/bash
-
-set -e
-set -x
-
-VENV_VERSION="virtualenv-15.2.0"
-rm -Rf "$VENV_VERSION"
-curl -k https://files.pythonhosted.org/packages/b1/72/2d70c5a1de409ceb3a27ff2ec007ecdd5cc52239e7c74990e32af57affe9/$VENV_VERSION.tar.gz | tar xfz -
-
-$VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env
-
-source .env/bin/activate
-
-curl -k https://bootstrap.pypa.io/ez_setup.py | python -
-curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python -
-
-pip install packaging
-
-PYTHON_VER=$(python -c 'import sys; print(".".join(str(s) for s in sys.version_info[:2]))')
-pip install git+https://github.com/pyinstaller/pyinstaller.git@413c37bec126c0bd26084813593f65128966b4b7
-
-git clone --recursive git://github.com/inducer/loopy
-cd loopy
-
-grep -v pyopencl requirements.txt > myreq.txt
-
-# needed for pyinstaller package to be usable
-echo packaging >> myreq.txt
-
-pip install -r myreq.txt
-python setup.py install
-
-chown -R user /tmp/build
-
-su user -p -c "cd /tmp/build && source .env/bin/activate && cd loopy && ./build-helpers/run-pyinstaller.sh"
diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh
deleted file mode 100755
index a7f621b1ef21676898d2283d93f8a54f086e5d9d..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker-inner.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#! /bin/bash
-
-set -e
-set -x
-
-mkdir /tmp/build
-cd /tmp/build
-
-useradd -d /home/user -m -s /bin/bash user
-
-yum install -y centos-release-scl
-yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel
-
-scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh
-
diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh
deleted file mode 100755
index fb0cfb587d654698800bfdc827259691bc056fb7..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#! /bin/bash
-
-# should be run in this directory (build-helpers)
-
-if test "$1" = "--nodate"; then
-  TGT_NAME=loopy-centos6
-else
-  TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
-fi
-
-echo "Generating $TGT_NAME..."
-
-set -e
-set -x
-
-docker pull centos:6
-
-CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
-echo "working in container $CNT"
-
-docker start -i $CNT
-
-docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true
-
-mv loopy $TGT_NAME
-
-docker rm $CNT
-
diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh
deleted file mode 100755
index 50f9d85dccc503be2a2ccfb6c0e3d6aa28216981..0000000000000000000000000000000000000000
--- a/build-helpers/run-pyinstaller.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /bin/bash
-
-# run this from the loopy root directory
-
-rm -Rf dist build
-
-pyinstaller \
-  --workpath=build/pyinstaller \
-  build-helpers/loopy.spec
diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh
deleted file mode 100755
index 57b8a873b9395954d76a8fd16f8ca9a261e8baa3..0000000000000000000000000000000000000000
--- a/build-helpers/upload.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#! /bin/bash
-
-set -e
-
-scp "$1" tiker.net:public_html/pub/loopy-binaries/
diff --git a/doc/index.rst b/doc/index.rst
index b77bbb16f413defe5010c75d28464051553b4486..8f114eb72cdc530dd4109257c4981118c5046f06 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -20,29 +20,6 @@ When you run this script, the following kernel is generated, compiled, and execu
 
 .. _static-binary:
 
-Want to try out loopy?
-----------------------
-
-There's no need to go through :ref:`installation` if you'd just like to get a
-feel for what loopy is.  Instead, you may
-`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/-/jobs/66778/artifacts/browse/build-helpers/>`_.
-This is purposefully built on an ancient Linux distribution, so it should work
-on most versions of Linux that are currently out there.
-
-Once you have the binary, do the following::
-
-    chmod +x ./loopy-centos6
-    ./loopy-centos6 --target=opencl hello-loopy.loopy
-    ./loopy-centos6 --target=cuda hello-loopy.loopy
-    ./loopy-centos6 --target=ispc hello-loopy.loopy
-
-Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`.
-
-You may also donwload the most recent version by going to the `list of builds
-<https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
-of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
-navigating to "build-helpers", and downloading the binary from there.
-
 Places on the web related to Loopy
 ----------------------------------
 
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 409cbef576d654be973dd6d1424ac40d3ea60982..af35221ad5dcd736190e40a454656a7fa069a787 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -151,6 +151,42 @@ Tag                             Meaning
 
 .. }}}
 
+Identifiers
+-----------
+
+Reserved Identifiers
+^^^^^^^^^^^^^^^^^^^^
+
+The identifier prefix ``_lp_`` is reserved for internal usage; when creating
+*inames*, *argument names*, *temporary variable names*, *substitution rule
+names*, *instruction IDs*, and other identifiers, users should *not* use names
+beginning with ``_lp_``.  This prefix is used for identifiers created
+internally when operating on Loopy's kernel IR. For Loopy developers, further
+information on name prefixes used within submodules is below.
+
+Identifier Registry
+^^^^^^^^^^^^^^^^^^^
+
+Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
+all internally-created identifiers. Additionally, each name beginning with
+``_lp_`` must start with one of the reserved prefixes below. New prefixes may
+be registered by adding them to the table below. New prefixes may not themselves
+be the prefix of an existing prefix.
+
+**Reserved Identifier Prefixes**
+
+======================= ==================================
+Reserved Prefix         Usage (module or purpose)
+======================= ==================================
+``_lp_linchk_``         :mod:`loopy.linearization.checker`
+======================= ==================================
+
+.. note::
+
+    Existing Loopy code may not yet fully satisfy these naming requirements.
+    Name changes are in progress, and prefixes will be added to this registry
+    as they are created.
+
 .. _instructions:
 
 Instructions
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index 740c5cb5848dbb7c6f657011bfc23fa88ca173ec..57d33b53999e06cbb07cc8363bbc46c091033cb3 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -118,7 +118,7 @@ Finishing up
 
 .. autofunction:: generate_loop_schedules
 
-.. autofunction:: get_one_scheduled_kernel
+.. autofunction:: get_one_linearized_kernel
 
 .. autofunction:: save_and_reload_temporaries
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 753b09b5da42835b88a000bc0400fa18a254d80f..1b017f701f8161e93c4fdc1c14644dfe4b4fa74c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1204,16 +1204,16 @@ Here is what happens when we try to generate code for the kernel:
 
 This happens due to the kernel splitting done by :mod:`loopy`. The splitting
 happens when the instruction schedule is generated. To see the schedule, we
-should call :func:`loopy.get_one_scheduled_kernel`:
+should call :func:`loopy.get_one_linearized_kernel`:
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+   >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl))
    >>> print(knl)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
    ---------------------------------------------------------------------------
    ...
    ---------------------------------------------------------------------------
-   SCHEDULE:
+   LINEARIZATION:
       0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
       1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
       2: RETURN FROM KERNEL rotate_v2
@@ -1233,12 +1233,12 @@ goes for local temporaries).
 :func:`loopy.save_and_reload_temporaries` for the purpose of handling the
 task of saving and restoring temporary values across global barriers. This
 function adds instructions to the kernel without scheduling them. That means
-that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
+that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to
 put those instructions into the schedule.
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+   >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl))
    >>> knl = lp.save_and_reload_temporaries(knl)
-   >>> knl = lp.get_one_scheduled_kernel(knl)  # Schedule added instructions
+   >>> knl = lp.get_one_linearized_kernel(knl)  # Schedule added instructions
    >>> print(knl)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
@@ -1251,7 +1251,7 @@ put those instructions into the schedule.
    ---------------------------------------------------------------------------
    ...
    ---------------------------------------------------------------------------
-   SCHEDULE:
+   LINEARIZATION:
       0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
       1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
       2:     tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp  {id=tmp.save}
diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db569480d521e58210c030e742386cd12dc8d37
--- /dev/null
+++ b/examples/fortran/matmul-driver.py
@@ -0,0 +1,35 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.clrandom
+import loopy as lp
+
+
+def main():
+    fn = "matmul.floopy"  # opened relative to the current working directory
+    with open(fn, "r") as inf:
+        source = inf.read()
+
+    dgemm, = lp.parse_transformed_fortran(source, filename=fn)
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    n = 2048  # a, b and c are all n x n
+    a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+    c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")
+    cl.clrandom.fill_rand(a)
+    cl.clrandom.fill_rand(b)
+
+    dgemm = lp.set_options(dgemm, write_code=True)
+
+    dgemm(queue, a=a, b=b, alpha=1, c=c)
+
+    c_ref = (a.get() @ b.get())  # reference product computed on the host
+    assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10  # relative error
+
+
+if __name__ == "__main__":
+    main()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..807ce88341a8845a154d853077aea649c0938064 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -123,12 +123,12 @@ from loopy.transform.add_barrier import add_barrier
 
 from loopy.type_inference import infer_unknown_types
 from loopy.preprocess import preprocess_kernel, realize_reduction
-from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping,
-        Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
-        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map,
-        get_synchronization_poly, get_synchronization_map,
-        gather_access_footprints, gather_access_footprint_bytes)
+from loopy.schedule import (
+    generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel)
+from loopy.statistics import (ToCountMap, CountGranularity,
+        stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map,
+        get_synchronization_map, gather_access_footprints,
+        gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
         generate_code, generate_code_v2, generate_body)
@@ -248,16 +248,16 @@ __all__ = [
         "infer_unknown_types",
 
         "preprocess_kernel", "realize_reduction",
-        "generate_loop_schedules", "get_one_scheduled_kernel",
+        "generate_loop_schedules",
+        "get_one_scheduled_kernel", "get_one_linearized_kernel",
         "GeneratedProgram", "CodeGenerationResult",
         "PreambleInfo",
         "generate_code", "generate_code_v2", "generate_body",
 
         "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op",
-        "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly",
-        "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map",
-        "get_synchronization_poly", "get_synchronization_map",
-        "gather_access_footprints", "gather_access_footprint_bytes",
+        "MemAccess",  "get_op_map", "get_mem_access_map",
+        "get_synchronization_map", "gather_access_footprints",
+        "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 6837b99a026debf32b12aceef00ed3863c620639..ca70c8489238ee6f1fd95f52b02dbe451ddf13ef 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -534,7 +534,7 @@ def auto_test_vs_ref(
     from loopy.target.pyopencl import PyOpenCLTarget
     if test_knl.state not in [
             KernelState.PREPROCESSED,
-            KernelState.SCHEDULED]:
+            KernelState.LINEARIZED]:
         if isinstance(test_knl.target, PyOpenCLTarget):
             test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
 
diff --git a/loopy/check.py b/loopy/check.py
index cc87ad9872668bf5323aefd79944e3bbd71b1153..da49c1d116df1a9fbf92e8ef41822b6741405604 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -184,6 +184,19 @@ def check_for_inactive_iname_access(kernel):
                         ", ".join(expression_inames - kernel.insn_inames(insn))))
 
 
+def check_for_unused_inames(kernel):
+    """Warn if *kernel* has inames that get_used_inames() does not report."""
+    from loopy.transform.iname import get_used_inames
+    unused_inames = kernel.all_inames() - get_used_inames(kernel)
+    if unused_inames:
+        warn_with_kernel(
+            kernel, "unused_inames",
+            "Found unused inames in kernel: %s. "
+            "Unused inames during linearization will be prohibited in "
+            "Loopy version 2021.X."
+            % ", ".join(sorted(unused_inames)))
+
+
 def _is_racing_iname_tag(tv, tag):
     from loopy.kernel.data import (AddressSpace,
             LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)
@@ -220,12 +233,12 @@ def check_for_write_races(kernel):
             assignee_inames = assignee_indices & kernel.all_inames()
             if not assignee_inames <= kernel.insn_inames(insn):
                 raise LoopyError(
-                        "assignee of instructiosn '%s' references "
+                        "assignee of instructions '%s' references "
                         "iname that the instruction does not depend on"
                         % insn.id)
 
             if assignee_name in kernel.arg_dict:
-                # Any parallel tags that are not depended upon by the assignee
+                # Any concurrent tags that are not depended upon by the assignee
                 # will cause write races.
 
                 raceable_parallel_insn_inames = set(
@@ -658,6 +671,7 @@ def pre_schedule_checks(kernel):
         check_loop_priority_inames_known(kernel)
         check_multiple_tags_allowed(kernel)
         check_for_inactive_iname_access(kernel)
+        check_for_unused_inames(kernel)
         check_for_write_races(kernel)
         check_for_data_dependent_parallel_bounds(kernel)
         check_bounds(kernel)
diff --git a/loopy/cli.py b/loopy/cli.py
index a92922b1845d76dd7a700a93c05de3eecf8c28dd..cdc24800be0edf3935aacccdd4dc4d9905cf5965 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -39,13 +39,13 @@ def defines_to_python_code(defines_str):
     import re
     define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
     result = []
-    for l in defines_str.split("\n"):
-        if not l.strip():
+    for line in defines_str.split("\n"):
+        if not line.strip():
             continue
 
-        match = define_re.match(l)
+        match = define_re.match(line)
         if match is None:
-            raise RuntimeError("#define not understood: '%s'" % l)
+            raise RuntimeError("#define not understood: '%s'" % line)
 
         result.append(
                 "%s = %s" % (match.group(1), to_python_literal(match.group(2))))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..b4811dc9966921fa612aabef9a726d6b53fd4052 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -388,7 +388,7 @@ def generate_code_v2(kernel):
         from loopy.schedule import get_one_scheduled_kernel
         kernel = get_one_scheduled_kernel(kernel)
 
-    if kernel.state != KernelState.SCHEDULED:
+    if kernel.state != KernelState.LINEARIZED:
         raise LoopyError("cannot generate code for a kernel that has not been "
                 "scheduled")
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index c946e09a086e574a2593d60f652a81773d95a1fe..b736191ec1dadb842e12453fbec3b68e831338f6 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     from loopy.schedule import (
         find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
     from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
+                                   VectorizeTag,
                                    IlpBaseTag)
 
     result = find_active_inames_at(kernel, sched_index)
@@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     # Find our containing subkernel. Grab inames for all insns from there.
     within_subkernel = False
 
-    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
+    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
         from loopy.schedule import CallKernel, ReturnFromKernel
         if isinstance(sched_item, CallKernel):
             within_subkernel = True
@@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index):
         #
         # - local indices may not be used in conditionals that cross barriers.
         #
-        # - ILP indices are not available in loop bounds, they only get defined
-        #   at the innermost level of nesting.
+        # - ILP indices and vector lane indices are not available in loop
+        #   bounds, they only get defined at the innermost level of nesting.
 
         if (
                 kernel.iname_tags_of_type(iname, ConcurrentTag)
+                and not kernel.iname_tags_of_type(iname, VectorizeTag)
                 and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                     and crosses_barrier)
                 and not kernel.iname_tags_of_type(iname, IlpBaseTag)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..7319b16ac2fe9f39872558a3878161b89cab15d9 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
 from loopy.codegen.result import merge_codegen_results, wrap_in_if
 import islpy as isl
 from loopy.schedule import (
@@ -33,30 +32,6 @@ from loopy.schedule import (
 from loopy.diagnostic import LoopyError
 
 
-def get_admissible_conditional_inames_for(codegen_state, sched_index):
-    """This function disallows conditionals on local-idx tagged
-    inames if there is a barrier nested somewhere within.
-    """
-
-    kernel = codegen_state.kernel
-
-    from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
-                                   filter_iname_tags_by_type)
-
-    from loopy.schedule import find_active_inames_at, has_barrier_within
-    result = find_active_inames_at(kernel, sched_index)
-
-    has_barrier = has_barrier_within(kernel, sched_index)
-
-    for iname, tags in six.iteritems(kernel.iname_to_tags):
-        if (filter_iname_tags_by_type(tags, HardwareConcurrentTag)
-                and codegen_state.is_generating_device_code):
-            if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag):
-                result.add(iname)
-
-    return frozenset(result)
-
-
 def synthesize_idis_for_extra_args(kernel, schedule_index):
     """
     :returns: A list of :class:`loopy.codegen.ImplementedDataInfo`
@@ -222,14 +197,14 @@ def get_required_predicates(kernel, sched_index):
     return result
 
 
-def group_by(l, key, merge):
-    if not l:
-        return l
+def group_by(entry, key, merge):
+    if not entry:
+        return entry
 
     result = []
-    previous = l[0]
+    previous = entry[0]
 
-    for item in l[1:]:
+    for item in entry[1:]:
         if key(previous) == key(item):
             previous = merge(previous, item)
 
@@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index):
         """
 
     from loopy.schedule import find_used_inames_within
+    from loopy.codegen.bounds import get_usable_inames_for_conditional
+
     sched_index_info_entries = [
             ScheduleIndexInfo(
                 schedule_indices=[i],
                 admissible_cond_inames=(
-                    get_admissible_conditional_inames_for(codegen_state, i)),
+                    get_usable_inames_for_conditional(kernel, i)),
                 required_predicates=get_required_predicates(kernel, i),
                 used_inames_within=find_used_inames_within(kernel, i)
                 )
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 5e0747246160ddc2934c3d545c03a2a9b4090d5d..c0ca875c0e9b661becb1bb0ca6e81139a8a93e2d 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -274,7 +274,7 @@ def generate_c_instruction_code(codegen_state, insn):
     if body:
         body.append(Line())
 
-    body.extend(Line(l) for l in insn.code.split("\n"))
+    body.extend(Line(line) for line in insn.code.split("\n"))
 
     return Block(body)
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 128e4fbc85a2a03e25da3f88b200e67eb41756d3..b3a87798840bb1624d350c79830f29142e54ab6c 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     kernel = codegen_state.kernel
 
     from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
-                LocalIndexTag, GroupIndexTag)
+                LocalIndexTag, GroupIndexTag, VectorizeTag)
 
     from loopy.schedule import get_insn_ids_for_block_at
     insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
             all_inames_by_insns |= kernel.insn_inames(insn_id)
 
         hw_inames_left = [iname for iname in all_inames_by_insns
-                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]
+                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
+                and not kernel.iname_tags_of_type(iname, VectorizeTag)]
 
     if not hw_inames_left:
         return next_func(codegen_state)
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 05b0a92050a51be1cd980648325921fbf13768d8..40202d4da3319c0ef24b0317f01cd4d31f88d484 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -86,17 +86,17 @@ def _extract_loopy_lines(source):
     loopy_lines = []
 
     in_loopy_code = False
-    for l in lines:
-        comment_match = comment_re.match(l)
+    for line in lines:
+        comment_match = comment_re.match(line)
 
         if comment_match is None:
             if in_loopy_code:
                 raise LoopyError("non-comment source line in loopy block")
 
-            remaining_lines.append(l)
+            remaining_lines.append(line)
 
             # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)
             continue
 
         cmt = comment_match.group(1)
@@ -108,7 +108,7 @@ def _extract_loopy_lines(source):
             in_loopy_code = True
 
             # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)
 
         elif cmt_stripped == "$loopy end":
             if not in_loopy_code:
@@ -116,16 +116,16 @@ def _extract_loopy_lines(source):
             in_loopy_code = False
 
             # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)
 
         elif in_loopy_code:
             loopy_lines.append(cmt)
 
         else:
-            remaining_lines.append(l)
+            remaining_lines.append(line)
 
             # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)
 
     return "\n".join(remaining_lines), "\n".join(loopy_lines)
 
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index d7a1b2498af583bc9ff97ba743ccc5ed8bd25d3a..91a5fdc88f02a99c6064f6b9944b08de662a27a8 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -339,11 +339,11 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         return []
 
-    map_Logical = map_type_decl
-    map_Integer = map_type_decl
-    map_Real = map_type_decl
-    map_Complex = map_type_decl
-    map_DoublePrecision = map_type_decl
+    map_Logical = map_type_decl  # noqa: N815
+    map_Integer = map_type_decl  # noqa: N815
+    map_Real = map_type_decl  # noqa: N815
+    map_Complex = map_type_decl  # noqa: N815
+    map_DoublePrecision = map_type_decl  # noqa: N815
 
     def map_Dimension(self, node):
         scope = self.scope_stack[-1]
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 80a7ad03101bc67f39c89c6089aa6533d1886185..2d926aad4faa511aa2919630c9b0e96b7f253ad9 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -35,14 +35,13 @@ import islpy as isl
 from islpy import dim_type
 import re
 
-from pytools import UniqueNameGenerator, generate_unique_names
+from pytools import UniqueNameGenerator, generate_unique_names, natsorted
 
 from loopy.library.function import (
         default_function_mangler,
         single_arg_function_mangler)
 
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
-from loopy.tools import natsorted
 from loopy.diagnostic import StaticValueFindingError
 from loopy.kernel.data import filter_iname_tags_by_type
 from warnings import warn
@@ -99,10 +98,25 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):
 
 # {{{ loop kernel object
 
+class _deprecated_KernelState_SCHEDULED(object):  # noqa
+    def __init__(self, f):
+        self.f = f
+
+    def __get__(self, obj, klass):
+        warn(
+            "'KernelState.SCHEDULED' is deprecated. "
+            "Use 'KernelState.LINEARIZED'.",
+            DeprecationWarning, stacklevel=2)
+        return self.f()
+
 class KernelState:  # noqa
     INITIAL = 0
     PREPROCESSED = 1
-    SCHEDULED = 2
+    LINEARIZED = 2
+
+    @_deprecated_KernelState_SCHEDULED
+    def SCHEDULED():  # pylint:disable=no-method-argument
+        return KernelState.LINEARIZED
 
-# {{{ kernel_state, KernelState compataibility
+# {{{ kernel_state, KernelState compatibility
 
@@ -228,7 +242,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # {{{ constructor
 
-    def __init__(self, domains, instructions, args=None, schedule=None,
+    def __init__(self, domains, instructions, args=None,
+            schedule=None,
+            linearization=None,
             name="loopy_kernel",
             preambles=None,
             preamble_generators=None,
@@ -333,10 +349,27 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if state not in [
                 KernelState.INITIAL,
                 KernelState.PREPROCESSED,
-                KernelState.SCHEDULED,
+                KernelState.LINEARIZED,
                 ]:
             raise ValueError("invalid value for 'state'")
 
+        # `linearization` is replacing `schedule`, but we're not changing
+        # this under the hood yet, so for now, store it inside `schedule`
+        # and raise deprecation warning anyway
+        if schedule is not None:
+            if linearization is not None:
+                # these should not both be present
+                raise ValueError(
+                    "received both `schedule` and `linearization` args, "
+                    "'LoopKernel.schedule' is deprecated. "
+                    "Use 'LoopKernel.linearization'.")
+            warn(
+                "'LoopKernel.schedule' is deprecated. "
+                "Use 'LoopKernel.linearization'.",
+                DeprecationWarning, stacklevel=2)
+        elif linearization is not None:
+            schedule = linearization
+
         from collections import defaultdict
         assert not isinstance(iname_to_tags, defaultdict)
 
@@ -1345,7 +1378,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if "schedule" in what and kernel.schedule is not None:
             lines.extend(sep)
             if show_labels:
-                lines.append("SCHEDULE:")
+                lines.append("LINEARIZATION:")
             from loopy.schedule import dump_schedule
             lines.append(dump_schedule(kernel, kernel.schedule))
 
@@ -1395,6 +1428,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # }}}
 
+    # {{{ handle linearization variable that doesn't yet exist
+
+    @property
+    def linearization(self):
+        return self.schedule
+
+    # }}}
+
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 9ac38fc87a27da13e98515085edd6f2e35b1fcd7..e6544b34a55af97a1a15e86f7d74855e08e53116 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag):
 # }}}
 
 
-class VectorizeTag(UniqueTag):
+class VectorizeTag(UniqueTag, HardwareConcurrentTag):
     def __str__(self):
         return "vec"
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 8213c9584b54917050c586e1b83b6d66d0473798..61127232a9f494fe2fdc536dd50d8fdf41b8f17c 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -66,7 +66,8 @@ class InstructionBase(ImmutableRecord):
     .. attribute:: depends_on_is_final
 
         A :class:`bool` determining whether :attr:`depends_on` constitutes
-        the *entire* list of iname dependencies.
+        the *entire* list of iname dependencies. If *not* marked final,
+        various semi-broken heuristics will try to add further dependencies.
 
         Defaults to *False*.
 
@@ -344,10 +345,13 @@ class InstructionBase(ImmutableRecord):
         """
         raise NotImplementedError
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
         """Return a new copy of *self* where *f* has been applied to every
-        expression occurring in *self*. *args* will be passed as extra
-        arguments (in addition to the expression) to *f*.
+        expression occurring in *self*. *f* is called with the expression
+        as its only argument.
+
+        If *assignee_f* is passed, then left-hand sides of assignments are
+        passed to it. If it is not given, it defaults to the same as *f*.
         """
         raise NotImplementedError
 
@@ -959,12 +963,15 @@ class Assignment(MultiAssignmentBase):
     def assignee_subscript_deps(self):
         return (_get_assignee_subscript_deps(self.assignee),)
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
         return self.copy(
-                assignee=f(self.assignee, *args),
-                expression=f(self.expression, *args),
+                assignee=assignee_f(self.assignee),
+                expression=f(self.expression),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))
 
     # }}}
 
@@ -1114,12 +1121,15 @@ class CallInstruction(MultiAssignmentBase):
                 _get_assignee_subscript_deps(a)
                 for a in self.assignees)
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
         return self.copy(
-                assignees=f(self.assignees, *args),
-                expression=f(self.expression, *args),
+                assignees=assignee_f(self.assignees),
+                expression=f(self.expression),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))
 
     # }}}
 
@@ -1315,14 +1325,17 @@ class CInstruction(InstructionBase):
                 _get_assignee_subscript_deps(a)
                 for a in self.assignees)
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
         return self.copy(
                 iname_exprs=[
-                    (name, f(expr, *args))
+                    (name, f(expr))
                     for name, expr in self.iname_exprs],
-                assignees=[f(a, *args) for a in self.assignees],
+                assignees=[assignee_f(a) for a in self.assignees],
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))
 
     # }}}
 
@@ -1357,7 +1370,7 @@ class _DataObliviousInstruction(InstructionBase):
     def assignee_subscript_deps(self):
         return frozenset()
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
         return self.copy(
                 predicates=frozenset(
                     f(pred) for pred in self.predicates))
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 157099df5a2133baa109f24e8216d63577b5dcb4..e33d260fba4f3f4122f35e033ecc573b41999d5d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,8 +34,7 @@ import numpy as np
 import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
-from pytools import memoize_on_first_arg
-from loopy.tools import natsorted
+from pytools import memoize_on_first_arg, natsorted
 
 import logging
 logger = logging.getLogger(__name__)
@@ -1381,7 +1380,7 @@ def draw_dependencies_as_unicode_arrows(
                 .replace(style.RESET_ALL, ""))
         return len(s)
 
-    def truncate_without_color_escapes(s, l):
+    def truncate_without_color_escapes(s, length):
         # FIXME: This is a bit dumb--it removes color escapes when truncation
         # is needed.
 
@@ -1389,7 +1388,7 @@ def draw_dependencies_as_unicode_arrows(
                 .replace(fore.RED, "")
                 .replace(style.RESET_ALL, ""))
 
-        return s[:l] + u"…"
+        return s[:length] + u"…"
 
     def conform_to_uniform_length(s):
         len_s = len_without_color_escapes(s)
@@ -1428,6 +1427,8 @@ def stringify_instruction_list(kernel):
 
     def insert_insn_into_order(insn):
         if insn.id in printed_insn_ids:
+            # Note: dependency cycles are deliberately ignored so that printing
+            # succeeds.
             return
         printed_insn_ids.add(insn.id)
 
@@ -1511,7 +1512,7 @@ def stringify_instruction_list(kernel):
                     ", ".join("%s=%s" % (name, expr)
                         for name, expr in insn.iname_exprs))
 
-            trailing = [l for l in insn.code.split("\n")]
+            trailing = insn.code.split("\n")
         elif isinstance(insn, lp.BarrierInstruction):
             lhs = ""
             rhs = "... %sbarrier" % insn.synchronization_kind[0]
@@ -1583,6 +1584,13 @@ def stringify_instruction_list(kernel):
 
 # {{{ global barrier order finding
 
+def _is_global_barrier(kernel, insn_id):
+    insn = kernel.id_to_insn[insn_id]
+    from loopy.kernel.instruction import BarrierInstruction
+    return isinstance(insn, BarrierInstruction) and \
+        insn.synchronization_kind == "global"
+
+
 @memoize_on_first_arg
 def get_global_barrier_order(kernel):
-    """Return a :class:`tuple` of the listing the ids of global barrier instructions
+    """Return a :class:`tuple` listing the ids of global barrier instructions
@@ -1590,49 +1598,27 @@ def get_global_barrier_order(kernel):
 
     See also :class:`loopy.instruction.BarrierInstruction`.
     """
-    barriers = []
-    visiting = set()
-    visited = set()
-
-    unvisited = set(insn.id for insn in kernel.instructions)
-
-    def is_barrier(my_insn_id):
-        insn = kernel.id_to_insn[my_insn_id]
-        from loopy.kernel.instruction import BarrierInstruction
-        return isinstance(insn, BarrierInstruction) and \
-            insn.synchronization_kind == "global"
-
-    while unvisited:
-        stack = [unvisited.pop()]
-
-        while stack:
-            top = stack[-1]
-
-            if top in visiting:
-                visiting.remove(top)
-                if is_barrier(top):
-                    barriers.append(top)
+    dep_graph = {insn.id: set() for insn in kernel.instructions}
+    for insn in kernel.instructions:
+        for dep in insn.depends_on:
+            dep_graph[dep].add(insn.id)
 
-            if top in visited:
-                stack.pop()
-                continue
+    from pytools.graph import compute_topological_order
+    order = compute_topological_order(dep_graph)
 
-            visited.add(top)
-            visiting.add(top)
+    barriers = [
+            insn_id for insn_id in order
+            if _is_global_barrier(kernel, insn_id)]
 
-            for child in kernel.id_to_insn[top].depends_on:
-                # Check for no cycles.
-                assert child not in visiting
-                stack.append(child)
+    del order
 
     # Ensure this is the only possible order.
     #
     # We do this by looking at the barriers in order.
     # We check for each adjacent pair (a,b) in the order if a < b,
     # i.e. if a is reachable by a chain of dependencies from b.
-
-    visiting.clear()
-    visited.clear()
+    visited = set()
+    visiting = set()
 
     for prev_barrier, barrier in zip(barriers, barriers[1:]):
         # Check if prev_barrier is reachable from barrier.
@@ -1690,12 +1676,6 @@ def find_most_recent_global_barrier(kernel, insn_id):
     if len(insn.depends_on) == 0:
         return None
 
-    def is_barrier(my_insn_id):
-        insn = kernel.id_to_insn[my_insn_id]
-        from loopy.kernel.instruction import BarrierInstruction
-        return isinstance(insn, BarrierInstruction) and \
-            insn.synchronization_kind == "global"
-
     global_barrier_to_ordinal = dict(
             (b, i) for i, b in enumerate(global_barrier_order))
 
@@ -1705,7 +1685,7 @@ def find_most_recent_global_barrier(kernel, insn_id):
                 else -1)
 
     direct_barrier_dependencies = set(
-            dep for dep in insn.depends_on if is_barrier(dep))
+            dep for dep in insn.depends_on if _is_global_barrier(kernel, dep))
 
     if len(direct_barrier_dependencies) > 0:
         return max(direct_barrier_dependencies, key=get_barrier_ordinal)
@@ -1727,8 +1707,8 @@ def get_subkernels(kernel):
     See also :class:`loopy.schedule.CallKernel`.
     """
     from loopy.kernel import KernelState
-    if kernel.state != KernelState.SCHEDULED:
-        raise LoopyError("Kernel must be scheduled")
+    if kernel.state != KernelState.LINEARIZED:
+        raise LoopyError("Kernel must be linearized")
 
     from loopy.schedule import CallKernel
 
@@ -1744,7 +1724,7 @@ def get_subkernel_to_insn_id_map(kernel):
-    kernel must be scheduled.
+    kernel must be linearized.
     """
     from loopy.kernel import KernelState
-    if kernel.state != KernelState.SCHEDULED:
-        raise LoopyError("Kernel must be scheduled")
+    if kernel.state != KernelState.LINEARIZED:
+        raise LoopyError("Kernel must be linearized")
 
     from loopy.schedule import (
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index c0eb91ea60317ef8cad1c594571d46bba2d1a671..de81815a82655136941b57b1f78486aed39237da 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
+from loopy.transform.iname import remove_any_newly_unused_inames
 
 import logging
 logger = logging.getLogger(__name__)
@@ -289,7 +290,7 @@ def _classify_reduction_inames(kernel, inames):
     nonlocal_par = []
 
     from loopy.kernel.data import (
-            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+            LocalIndexTagBase, UnrolledIlpTag, UnrollTag,
             ConcurrentTag, filter_iname_tags_by_type)
 
     for iname in inames:
@@ -303,7 +304,7 @@ def _classify_reduction_inames(kernel, inames):
         elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase):
             local_par.append(iname)
 
-        elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)):
+        elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
             nonlocal_par.append(iname)
 
         else:
@@ -882,6 +883,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
 # }}}
 
 
+@remove_any_newly_unused_inames
 def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                       automagic_scans_ok=False, force_scan=False,
                       force_outer_iname_for_scan=None):
@@ -1370,7 +1372,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         track_iname = var_name_gen(
                 "{sweep_iname}__seq_scan"
-                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+                .format(sweep_iname=sweep_iname))
 
         get_or_add_sweep_tracking_iname_and_domain(
                 scan_iname, sweep_iname, sweep_min_value, scan_min_value,
@@ -1480,7 +1482,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         track_iname = var_name_gen(
                 "{sweep_iname}__pre_scan"
-                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+                .format(sweep_iname=sweep_iname))
 
         get_or_add_sweep_tracking_iname_and_domain(
                 scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
@@ -1924,8 +1926,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     kernel = lp.tag_inames(kernel, new_iname_tags)
 
-    # TODO: remove unused inames...
-
     kernel = (
             _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                 kernel))
@@ -1979,7 +1979,7 @@ def find_idempotence(kernel):
 
     # Find SCCs of dep_graph. These are used for checking if the instruction is
     # in a dependency cycle.
-    from loopy.tools import compute_sccs
+    from pytools.graph import compute_sccs
 
     sccs = dict((item, scc)
             for scc in compute_sccs(dep_graph)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index fb0d0e2c17005ecf051d7034fd7903ed5262bdfc..032cdc2760597f1fa6f701a8a88252312deac797 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel):
     """
     result = {}
 
-    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag
 
     all_nonpar_inames = set(
             iname for iname in kernel.all_inames()
             if not kernel.iname_tags_of_type(iname,
-                    (ConcurrentTag, IlpBaseTag, VectorizeTag)))
+                    (ConcurrentTag, IlpBaseTag)))
 
     iname_to_insns = kernel.iname_to_insns()
 
@@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
 
     result = {}
 
-    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag
     for insn in kernel.instructions:
         for iname in kernel.insn_inames(insn):
             if kernel.iname_tags_of_type(iname, ConcurrentTag):
@@ -310,7 +310,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                         continue
 
                     if kernel.iname_tags_of_type(dep_insn_iname,
-                                (ConcurrentTag, IlpBaseTag, VectorizeTag)):
+                                (ConcurrentTag, IlpBaseTag)):
                         # Parallel tags don't really nest, so we'll disregard
                         # them here.
                         continue
@@ -1841,7 +1841,7 @@ def generate_loop_schedules(kernel, debug_args={}):
 
 def generate_loop_schedules_inner(kernel, debug_args={}):
     from loopy.kernel import KernelState
-    if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
+    if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
@@ -1852,7 +1852,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 
     debug = ScheduleDebugger(**debug_args)
 
-    preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else ()
+    preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else ()
 
     prescheduled_inames = set(
             insn.iname
@@ -1904,7 +1904,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 
             unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
             scheduled_insn_ids=frozenset(),
-            within_subkernel=kernel.state != KernelState.SCHEDULED,
+            within_subkernel=kernel.state != KernelState.LINEARIZED,
             may_schedule_global_barriers=True,
 
             preschedule=preschedule,
@@ -1973,11 +1973,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 
             new_kernel = kernel.copy(
                     schedule=gen_sched,
-                    state=KernelState.SCHEDULED)
+                    state=KernelState.LINEARIZED)
 
             from loopy.schedule.device_mapping import \
                     map_schedule_onto_host_or_device
-            if kernel.state != KernelState.SCHEDULED:
+            if kernel.state != KernelState.LINEARIZED:
                 # Device mapper only gets run once.
                 new_kernel = map_schedule_onto_host_or_device(new_kernel)
 
@@ -2029,6 +2029,15 @@ def _get_one_scheduled_kernel_inner(kernel):
 
 
 def get_one_scheduled_kernel(kernel):
+    warn_with_kernel(
+        kernel, "get_one_scheduled_kernel_deprecated",
+        "get_one_scheduled_kernel is deprecated. "
+        "Use get_one_linearized_kernel instead.",
+        DeprecationWarning)
+    return get_one_linearized_kernel(kernel)
+
+
+def get_one_linearized_kernel(kernel):
     from loopy import CACHING_ENABLED
 
     sched_cache_key = kernel
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..d45c1ecbdc7ea091ce7d1a3899e82c14bb6fef2b 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -31,7 +31,7 @@ from loopy.schedule.tools import get_block_boundaries
 def map_schedule_onto_host_or_device(kernel):
     # FIXME: Should be idempotent.
     from loopy.kernel import KernelState
-    assert kernel.state == KernelState.SCHEDULED
+    assert kernel.state == KernelState.LINEARIZED
 
     from functools import partial
     device_prog_name_gen = partial(
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..32fe7741e1298c99e2baf74f3e08e67fc8b2a63e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1863,75 +1863,4 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
 
 # }}}
 
-
-# {{{ compat goop
-
-def get_lmem_access_poly(knl):
-    """Count the number of local memory accesses in a loopy kernel.
-
-    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['local'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
-                     "get_lmem_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['local'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_gmem_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_synchronization_poly(knl):
-    """Count the number of synchronization events each work-item encounters in
-    a loopy kernel.
-
-    get_synchronization_poly is deprecated. Use get_synchronization_map
-    instead.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
-                     "get_synchronization_poly is deprecated. Use "
-                     "get_synchronization_map instead.")
-    return get_synchronization_map(knl)
-
-
-def get_op_poly(knl, numpy_types=True):
-    """Count the number of operations in a loopy kernel.
-
-    get_op_poly is deprecated. Use get_op_map instead.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_op_poly",
-                     "get_op_poly is deprecated. Use get_op_map instead.")
-    return get_op_map(knl, numpy_types)
-
-# }}}
-
 # vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index ccac5e199d2b53e202dd735ffd8dfe20a7dc29a2..4156dfcc1673d176ffb609cf280b28c97cc4949f 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -273,8 +273,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
         if not isinstance(other, type(expr)):
             return self.treat_mismatch(expr, other, unis)
         if (expr.inames != other.inames
-                or type(expr.operation) != type(other.operation)  # noqa
-                ):
+                or type(expr.operation) != type(other.operation)):  # noqa
             return []
 
         return self.rec(expr.expr, other.expr, unis)
@@ -971,7 +970,8 @@ class RuleAwareIdentityMapper(IdentityMapper):
                 # may perform tasks entirely unrelated to subst rules, so
                 # we must map assignees, too.
                 self.map_instruction(kernel,
-                    insn.with_transformed_expressions(self, kernel, insn))
+                    insn.with_transformed_expressions(
+                        lambda expr: self(expr, kernel, insn)))
                 for insn in kernel.instructions]
 
         return kernel.copy(instructions=new_insns)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 6e3602eda11d5f65e8a6af2977966e946c72a718..8869ebecf3e08bf7921d4c9118dd1fda263adb32 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -80,6 +80,11 @@ class DTypeRegistryWrapper(object):
 def c99_preamble_generator(preamble_info):
     if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
         yield("10_stdint", "#include <stdint.h>")
+    if any(dtype.numpy_dtype == np.dtype("bool")
+           for dtype in preamble_info.seen_dtypes):
+        yield("10_stdbool", "#include <stdbool.h>")
+    if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes):
+        yield("10_complex", "#include <complex.h>")
 
 
 def _preamble_generator(preamble_info):
@@ -436,7 +441,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
                 arg_dtypes=arg_dtypes)
 
     # binary functions
-    if (name in ["fmax", "fmin"]
+    if (name in ["fmax", "fmin", "copysign"]
             and len(arg_dtypes) == 2):
 
         dtype = np.find_common_type(
@@ -1079,9 +1084,11 @@ class CTarget(CFamilyTarget):
     @memoize_method
     def get_dtype_registry(self):
         from loopy.target.c.compyte.dtypes import (
-                DTypeRegistry, fill_registry_with_c99_stdint_types)
+                DTypeRegistry, fill_registry_with_c99_stdint_types,
+                fill_registry_with_c99_complex_types)
         result = DTypeRegistry()
         fill_registry_with_c99_stdint_types(result)
+        fill_registry_with_c99_complex_types(result)
         return DTypeRegistryWrapper(result)
 
 
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
index 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372..7e48e1166a13cfbb7b60f909b071f088034ffda1 160000
--- a/loopy/target/c/compyte
+++ b/loopy/target/c/compyte
@@ -1 +1 @@
-Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372
+Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index c5e8d0a7f7a9f70b3afe46e9d04a3bf861066329..845e0a4326dbb24e509f98c808a9ce3ac3cb52be 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 """OpenCL target integrated with PyOpenCL."""
 
 from __future__ import division, absolute_import
@@ -285,6 +286,9 @@ class PyOpenCLTarget(OpenCLTarget):
     warnings) and support for complex numbers.
     """
 
+    # FIXME make prefixes conform to naming rules
+    # (see Reference: Loopy’s Model of a Kernel)
+
     host_program_name_prefix = "_lpy_host_"
     host_program_name_suffix = ""
 
@@ -299,7 +303,26 @@ class PyOpenCLTarget(OpenCLTarget):
         self.device = device
         self.pyopencl_module_name = pyopencl_module_name
 
-    comparison_fields = ["device"]
+    # NB: Not including 'device', as that is handled specially here.
+    hash_fields = OpenCLTarget.hash_fields + (
+            "pyopencl_module_name",)
+    comparison_fields = OpenCLTarget.comparison_fields + (
+            "pyopencl_module_name",)
+
+    def __eq__(self, other):
+        """Equality: base-class comparison first, then the devices.
+
+        Two device-less targets match; otherwise both targets must carry
+        a device and the devices' ``persistent_unique_id`` must agree.
+        """
+        if not super(PyOpenCLTarget, self).__eq__(other):
+            return False
+
+        if self.device is None or other.device is None:
+            # at least one side has no device: equal only if neither does
+            return self.device is None and other.device is None
+        return (self.device.persistent_unique_id
+                == other.device.persistent_unique_id)
 
     def update_persistent_hash(self, key_hash, key_builder):
         super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder)
diff --git a/loopy/tools.py b/loopy/tools.py
index 33b6616f32fb6c5fa6e4517e137ef426a806fb3f..a1cd5e108a45ba60c71b3bb7a51f779b84172065 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -210,11 +210,11 @@ def remove_common_indentation(code, require_leading_newline=True,
 
     test_line = None
     if ignore_lines_starting_with:
-        for l in lines:
-            strip_l = l.lstrip()
+        for line in lines:
+            strip_l = line.lstrip()
             if (strip_l
                     and not strip_l.startswith(ignore_lines_starting_with)):
-                test_line = l
+                test_line = line
                 break
 
     else:
@@ -355,65 +355,6 @@ def empty_aligned(shape, dtype, order='C', n=64):
 # }}}
 
 
-# {{{ compute SCCs with Tarjan's algorithm
-
-def compute_sccs(graph):
-    to_search = set(graph.keys())
-    visit_order = {}
-    scc_root = {}
-    sccs = []
-
-    while to_search:
-        top = next(iter(to_search))
-        call_stack = [(top, iter(graph[top]), None)]
-        visit_stack = []
-        visiting = set()
-
-        scc = []
-
-        while call_stack:
-            top, children, last_popped_child = call_stack.pop()
-
-            if top not in visiting:
-                # Unvisited: mark as visited, initialize SCC root.
-                count = len(visit_order)
-                visit_stack.append(top)
-                visit_order[top] = count
-                scc_root[top] = count
-                visiting.add(top)
-                to_search.discard(top)
-
-            # Returned from a recursion, update SCC.
-            if last_popped_child is not None:
-                scc_root[top] = min(
-                    scc_root[top],
-                    scc_root[last_popped_child])
-
-            for child in children:
-                if child not in visit_order:
-                    # Recurse.
-                    call_stack.append((top, children, child))
-                    call_stack.append((child, iter(graph[child]), None))
-                    break
-                if child in visiting:
-                    scc_root[top] = min(
-                        scc_root[top],
-                        visit_order[child])
-            else:
-                if scc_root[top] == visit_order[top]:
-                    scc = []
-                    while visit_stack[-1] != top:
-                        scc.append(visit_stack.pop())
-                    scc.append(visit_stack.pop())
-                    for item in scc:
-                        visiting.remove(item)
-                    sccs.append(scc)
-
-    return sccs
-
-# }}}
-
-
 # {{{ pickled container value
 
 class _PickledObject(object):
@@ -673,20 +614,4 @@ def is_interned(s):
 def intern_frozenset_of_ids(fs):
     return frozenset(intern(s) for s in fs)
 
-
-def natorder(key):
-    # Return natural ordering for strings, as opposed to dictionary order.
-    # E.g. will result in
-    #  'abc1' < 'abc9' < 'abc10'
-    # rather than
-    #  'abc1' < 'abc10' < 'abc9'
-    # Based on
-    # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
-    import re
-    return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
-
-
-def natsorted(seq, key=lambda x: x):
-    return sorted(seq, key=lambda y: natorder(key(y)))
-
 # vim: foldmethod=marker
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..1f0161c06868da4a7c71ba1ebf9eab8ef02eeb3d 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -285,15 +285,15 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     if temporary_name is None:
         temporary_name = var_name_gen("%s_fetch" % c_name)
 
-    arg = kernel.arg_dict[var_name]
+    var_descr = kernel.get_var_descriptor(var_name)
 
     # {{{ make parameter names and unification template
 
     parameters = []
-    for i in range(arg.num_user_axes()):
+    for i in range(var_descr.num_user_axes()):
         based_on = "%s_dim_%d" % (c_name, i)
-        if arg.dim_names is not None:
-            based_on = "%s_dim_%s" % (c_name, arg.dim_names[i])
+        if var_descr.dim_names is not None:
+            based_on = "%s_dim_%s" % (c_name, var_descr.dim_names[i])
         if dim_arg_names is not None and i < len(dim_arg_names):
             based_on = dim_arg_names[i]
 
@@ -322,7 +322,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     kernel, subst_use, sweep_inames, inames_to_be_removed = \
             _process_footprint_subscripts(
                     kernel,  rule_name, sweep_inames,
-                    footprint_subscripts, arg)
+                    footprint_subscripts, var_descr)
 
     # Our _not_provided is actually a different object from the one in the
     # precompute module, but precompute acutally uses that to adjust its
@@ -331,7 +331,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     from loopy.transform.precompute import precompute
     new_kernel = precompute(kernel, subst_use, sweep_inames,
             precompute_inames=dim_arg_names,
-            default_tag=default_tag, dtype=arg.dtype,
+            default_tag=default_tag, dtype=var_descr.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
             temporary_address_space=temporary_address_space,
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 96c8252ef7e6622250e9006b2275ef7816700b5c..8432d59ec5b162f6e963abbeae3b2fcabe94cf27 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -977,8 +977,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset(
             # is inspected.  For each element of the power set without the
             # empty and the full set, one duplication option is generated.
             for insns_to_dup in it.chain.from_iterable(
-                    it.combinations(iname_insns, l)
-                    for l in range(1, len(iname_insns))):
+                    it.combinations(iname_insns, i)
+                    for i in range(1, len(iname_insns))):
                 yield (
                     iname,
                     tuple(insn | old_common_inames for insn in insns_to_dup))
@@ -1184,6 +1184,19 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
 
 # {{{ remove unused inames
 
+def get_used_inames(knl):
+    """Return the set of inames used by the instructions of *knl*: the
+    union of each instruction's ``insn_inames`` and reduction inames,
+    gathered after substitution rules have been expanded."""
+    import loopy as lp
+    expanded = lp.expand_subst(knl)
+
+    # set.union() takes any number of iterables, including none at all
+    return set().union(*(
+        expanded.insn_inames(stmt.id) | stmt.reduction_inames()
+        for stmt in expanded.instructions))
+
+
 def remove_unused_inames(knl, inames=None):
     """Delete those among *inames* that are unused, i.e. project them
     out of the domain. If these inames pose implicit restrictions on
@@ -1204,17 +1217,7 @@ def remove_unused_inames(knl, inames=None):
 
     # {{{ check which inames are unused
 
-    import loopy as lp
-    exp_knl = lp.expand_subst(knl)
-
-    inames = set(inames)
-    used_inames = set()
-    for insn in exp_knl.instructions:
-        used_inames.update(
-                exp_knl.insn_inames(insn.id)
-                | insn.reduction_inames())
-
-    unused_inames = inames - used_inames
+    unused_inames = set(inames) - get_used_inames(knl)
 
     # }}}
 
@@ -1235,6 +1238,33 @@ def remove_unused_inames(knl, inames=None):
 
     return knl
 
+
+def remove_any_newly_unused_inames(transformation_func):
+    """Decorator: after *transformation_func* runs, remove the inames it
+    left unused, keeping those that were already unused beforehand.
+
+    The wrapper pops the keyword argument *remove_newly_unused_inames*
+    (default *True*); if false, the result is returned unmodified.
+    """
+    from functools import wraps
+
+    @wraps(transformation_func)
+    def wrapper(knl, *args, **kwargs):
+        if not kwargs.pop("remove_newly_unused_inames", True):
+            # cleanup disabled: plain pass-through
+            return transformation_func(knl, *args, **kwargs)
+
+        # inames that were unused even before the transformation
+        previously_unused = knl.all_inames() - get_used_inames(knl)
+
+        result_knl = transformation_func(knl, *args, **kwargs)
+
+        # never touch inames that were already unused on entry
+        candidates = result_knl.all_inames() - previously_unused
+        return remove_unused_inames(result_knl, candidates)
+
+    return wrapper
+
 # }}}
 
 
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 9f426f76bc6902fd09bd7685c73f187df935be1e..b308836c7727564dbfa9625ad39f378e8034c68c 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -229,7 +229,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
         for insn in kernel.instructions:
             self.replaced_something = False
 
-            insn = insn.with_transformed_expressions(self, kernel, insn)
+            insn = insn.with_transformed_expressions(
+                    lambda expr: self(expr, kernel, insn))
 
             if self.replaced_something:
                 insn = insn.copy(
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index b92698ffa1e84455be3f79bed7dbf884f36be490..717a051930e938457dae0ee4441325b3e631d2d9 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -25,10 +25,9 @@ THE SOFTWARE.
 import six
 
 from loopy.symbolic import (
-        get_dependencies, SubstitutionMapper,
         RuleAwareIdentityMapper, SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
-from pymbolic.mapper.substitutor import make_subst_func
+from loopy.transform.iname import remove_any_newly_unused_inames
 
 from pytools import ImmutableRecord
 from pymbolic import var
@@ -80,40 +79,13 @@ def extract_subst(kernel, subst_name, template, parameters=()):
 
     # }}}
 
-    # {{{ deal with iname deps of template that are not independent_inames
-
-    # (We call these 'matching_vars', because they have to match exactly in
-    # every CSE. As above, they might need to be renamed to make them unique
-    # within the kernel.)
-
-    matching_vars = []
-    old_to_new = {}
-
-    for iname in (get_dependencies(template)
-            - set(parameters)
-            - kernel.non_iname_variable_names()):
-        if iname in kernel.all_inames():
-            # need to rename to be unique
-            new_iname = var_name_gen(iname)
-            old_to_new[iname] = var(new_iname)
-            matching_vars.append(new_iname)
-        else:
-            matching_vars.append(iname)
-
-    if old_to_new:
-        template = (
-                SubstitutionMapper(make_subst_func(old_to_new))
-                (template))
-
-    # }}}
-
     # {{{ gather up expressions
 
     expr_descriptors = []
 
     from loopy.symbolic import UnidirectionalUnifier
     unif = UnidirectionalUnifier(
-            lhs_mapping_candidates=set(parameters) | set(matching_vars))
+            lhs_mapping_candidates=set(parameters))
 
     def gather_exprs(expr, mapper):
         urecs = unif(template, expr)
@@ -177,8 +149,30 @@ def extract_subst(kernel, subst_name, template, parameters=()):
 
     new_insns = []
 
+    def transform_assignee(expr):
+        """Map only the index part of an assignee through *cbmapper*.
+
+        Assignment LHS's cannot be subst rules, so a subscript's aggregate
+        and plain variables are kept as they are; tuples go elementwise.
+        """
+        import pymbolic.primitives as prim
+
+        if isinstance(expr, tuple):
+            return tuple(transform_assignee(part) for part in expr)
+
+        if isinstance(expr, prim.Subscript):
+            new_index = cbmapper(expr.index)
+            return type(expr)(expr.aggregate, new_index)
+
+        if isinstance(expr, prim.Variable):
+            return expr
+
+        raise ValueError("assignment LHS not understood")
+
     for insn in kernel.instructions:
-        new_insns.append(insn.with_transformed_expressions(cbmapper))
+        new_insns.append(
+                insn.with_transformed_expressions(
+                    cbmapper, assignee_f=transform_assignee))
 
     from loopy.kernel.data import SubstitutionRule
     new_substs = {
@@ -285,6 +279,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper):
             return var(subst_name)(*index)
 
 
+@remove_any_newly_unused_inames
 def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
         force_retain_argument=False):
     """Extract an assignment (to a temporary variable or an argument)
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658f71bcfcb037a81c6b61fd9417fc98b75..32f039a22a5f8ff076669ecb23f00ad63ed85dd5 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -532,7 +532,7 @@ def infer_unknown_types(kernel, expect_completion=False):
                 if read_var in names_for_type_inference))
             for written_var in names_for_type_inference)
 
-    from loopy.tools import compute_sccs
+    from pytools.graph import compute_sccs
 
     # To speed up processing, we sort the variables by computing the SCCs of the
     # type dependency graph. Each SCC represents a set of variables whose types
diff --git a/loopy/version.py b/loopy/version.py
index 29abbc2de889b884de93e5fe39a1d996811c93c9..d69a3b574122622105e4b52c74ec8c595fc816b6 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -42,7 +42,7 @@ else:
 # }}}
 
 
-VERSION = (2019, 1)
+VERSION = (2020, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
diff --git a/setup.cfg b/setup.cfg
index eec3dfd1f52ed97c58f5281716eac8fc18980094..a0d95746e1a399d6a2d7c315bffc9b834d2f5487 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,N817,W504
 max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
diff --git a/setup.py b/setup.py
index 75d8b340e8ad98794a244f7e5da89e079870bd2b..bba29986997e8e762ad52f38feae6311c4892c10 100644
--- a/setup.py
+++ b/setup.py
@@ -76,10 +76,7 @@ setup(name="loo.py",
           'License :: OSI Approved :: MIT License',
           'Natural Language :: English',
           'Programming Language :: Python',
-          'Programming Language :: Python :: 2.6',
-          'Programming Language :: Python :: 2.7',
-          'Programming Language :: Python :: 3.2',
-          'Programming Language :: Python :: 3.3',
+          'Programming Language :: Python :: 3',
           'Topic :: Scientific/Engineering',
           'Topic :: Scientific/Engineering :: Information Analysis',
           'Topic :: Scientific/Engineering :: Mathematics',
@@ -89,7 +86,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2018.4",
+          "pytools>=2020.2",
           "pymbolic>=2019.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_apps.py b/test/test_apps.py
index e07262dbdda8ad3c24522f7d0eb4dba8422bf0ce..71029cc9ce408f8e7fa95eaf3b766864c4beee5b 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -566,7 +566,7 @@ def test_poisson_fem(ctx_factory):
     sdim = 3
 
     knl = lp.make_kernel(
-            "{ [c,i,j,k,ell,ell2,ell3]: \
+            "{ [c,i,j,k,ell,ell2]: \
             0 <= c < nels and \
             0 <= i < nbf and \
             0 <= j < nbf and \
diff --git a/test/test_domain.py b/test/test_domain.py
index ebfde850907d68bebf06076fbf1c87d8bb093f71..5daf84eaa5b7ffd1647daf4b35acd7a5de91c5d1 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -111,7 +111,7 @@ def test_eq_constraint(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-            "{[i,j]: 0<= i,j < 32}",
+            "{[i]: 0<= i < 32}",
             [
                 "a[i] = b[i]"
                 ],
diff --git a/test/test_fortran.py b/test/test_fortran.py
index e08033360d403f548d552108e6fd98b9117e19bd..3601e96b752f18e6e01bcfcffe49780bda4058b4 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -275,6 +275,12 @@ def test_tagged(ctx_factory):
     "i_inner,j_inner",
     ])
 def test_matmul(ctx_factory, buffer_inames):
+    ctx = ctx_factory()
+
+    if (buffer_inames and
+            ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     logging.basicConfig(level=logging.INFO)
 
     fortran_src = """
@@ -316,7 +322,6 @@ def test_matmul(ctx_factory, buffer_inames):
     knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
             init_expression="0", store_expression="base+buffer")
 
-    ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
 
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 203ebb3922d3cc7f41b56abc31202b8974b88117..f9345d5b6cd9b97da80bb2ff8e5c6c657c199402 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -177,7 +177,7 @@ def test_simple_side_effect(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<100}",
+            "{[i]: 0<=i<100}",
             """
                 a[i] = a[i] + 1
                 """,
@@ -456,7 +456,7 @@ def test_nonlinear_index(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n }",
+            "{[i]: 0<=i<n }",
             """
                 a[i*i] = 17
                 """,
@@ -564,7 +564,7 @@ def test_dependent_domain_insn_iname_finding(ctx_factory):
 
     knl = lp.make_kernel([
             "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
-            "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
+            "{[isrc]: isrc_start<=isrc<isrc_end}",
             ],
             """
                 <> src_ibox = source_boxes[isrc_box]
@@ -769,7 +769,7 @@ def test_multiple_writes_to_local_temporary():
     # writes are OK.
 
     knl = lp.make_kernel(
-        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
+        "{[i]: 0<=i<5}",
         """
         <> temp[i, 0] = 17
         temp[i, 1] = 15
@@ -952,7 +952,7 @@ def test_atomic_init(dtype):
     vec_width = 4
 
     knl = lp.make_kernel(
-            "{ [i,j]: 0<=i<100 }",
+            "{ [i]: 0<=i<100 }",
             """
             out[i%4] = 0 {id=init, atomic=init}
             """,
@@ -1555,7 +1555,7 @@ def test_finite_difference_expr_subst(ctx_factory):
             gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True,
             default_tag="l.auto")
 
-    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
+    precomp_knl = lp.tag_inames(precomp_knl, {"j_outer": "unr"})
     precomp_knl = lp.set_options(precomp_knl, return_dict=True)
     evt, _ = precomp_knl(queue, u=u, h=h)
 
@@ -1926,8 +1926,9 @@ def test_scalars_with_base_storage(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    import islpy as isl
     knl = lp.make_kernel(
-            "{ [i]: 0<=i<1}",
+            [isl.BasicSet("[] -> {[]: }")],  # empty (domain w/unused inames errors)
             "a = 1",
             [lp.TemporaryVariable("a", dtype=np.float64,
                                   shape=(), base_storage="base")])
diff --git a/test/test_misc.py b/test/test_misc.py
index 7a834a6f5d393298e97df22d47a1de3b64354a42..dc5045fe0f7a3756d9a70a52d0a0c3dbb92f3e69 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -24,7 +24,6 @@ THE SOFTWARE.
 
 import six  # noqa
 import pytest
-from six.moves import range
 
 import sys
 
@@ -35,50 +34,6 @@ logger = logging.getLogger(__name__)
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-def test_compute_sccs():
-    from loopy.tools import compute_sccs
-    import random
-
-    rng = random.Random(0)
-
-    def generate_random_graph(nnodes):
-        graph = dict((i, set()) for i in range(nnodes))
-        for i in range(nnodes):
-            for j in range(nnodes):
-                # Edge probability 2/n: Generates decently interesting inputs.
-                if rng.randint(0, nnodes - 1) <= 1:
-                    graph[i].add(j)
-        return graph
-
-    def verify_sccs(graph, sccs):
-        visited = set()
-
-        def visit(node):
-            if node in visited:
-                return []
-            else:
-                visited.add(node)
-                result = []
-                for child in graph[node]:
-                    result = result + visit(child)
-                return result + [node]
-
-        for scc in sccs:
-            scc = set(scc)
-            assert not scc & visited
-            # Check that starting from each element of the SCC results
-            # in the same set of reachable nodes.
-            for scc_root in scc:
-                visited.difference_update(scc)
-                result = visit(scc_root)
-                assert set(result) == scc, (set(result), scc)
-
-    for nnodes in range(10, 20):
-        for i in range(40):
-            graph = generate_random_graph(nnodes)
-            verify_sccs(graph, compute_sccs(graph))
-
-
 def test_SetTrie():
     from loopy.kernel.tools import SetTrie
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index e022e92f3712d984c1ad68061d0052240ff9d20c..54c64e0a4d4a23b429eb83be6c0a19f482a1b922 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -48,7 +48,7 @@ def test_tim2d(ctx_factory):
 
     # K - run-time symbolic
     knl = lp.make_kernel(
-            "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2<n and 0<=e<K and 0<=gi<3}",
+            "{[i,j,e,m,o,o2]: 0<=i,j,m,o,o2<n and 0<=e<K}",
             [
                 "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
                 "us(a,b) := simul_reduce(sum, o2, D[b,o2]*u[e,a,o2])",
@@ -74,8 +74,8 @@ def test_tim2d(ctx_factory):
             name="semlap2D", assumptions="K>=1")
 
     knl = lp.fix_parameters(knl, n=n)
-    knl = lp.duplicate_inames(knl, "o", within="id:ur")
-    knl = lp.duplicate_inames(knl, "o", within="id:us")
+    # knl = lp.duplicate_inames(knl, "o", within="id:ur")
+    # knl = lp.duplicate_inames(knl, "o", within="id:us")
 
     seq_knl = knl
 
@@ -87,6 +87,7 @@ def test_tim2d(ctx_factory):
 
         knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto")
         knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto")
+        # TODO this adds `a` and `b` to domains, which leads to unused inames
 
         knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto")
         knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto")
diff --git a/test/test_target.py b/test/test_target.py
index bcf85a340a29afc8772686d23c5fe3e8a03ccffd..038b2e6c06116049441fad36d033c5a6831b4dbe 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -231,8 +231,9 @@ def test_tuple(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    import islpy as isl
     knl = lp.make_kernel(
-            "{ [i]: 0 = i }",
+            [isl.BasicSet("[] -> {[]: }")],
             """
             a, b = make_tuple(1, 2.)
             """)
diff --git a/test/test_transform.py b/test/test_transform.py
index cdc0c14b8bacc4fe5279d000461c0ea2244af021..ffef893b05fbca5a0d244ff17f379e1bb5cf27a1 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -74,7 +74,7 @@ def test_collect_common_factors(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-            "{[i,j,k]: 0<=i,j<n}",
+            "{[i,j]: 0<=i,j<n}",
             """
             <float32> out_tmp = 0 {id=out_init,inames=i}
             out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
@@ -385,7 +385,7 @@ def test_precompute_nested_subst(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-        "{[i,j]: 0<=i<n and 0<=j<5}",
+        "{[i]: 0<=i<n}",
         """
         E:=a[i]
         D:=E*E
@@ -396,7 +396,6 @@ def test_precompute_nested_subst(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.tag_inames(knl, dict(j="g.1"))
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     from loopy.symbolic import get_dependencies
@@ -570,6 +569,55 @@ def test_nested_substs_in_insns(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
+def test_extract_subst_with_iname_deps_in_templ(ctx_factory):
+    # The template depends on iname 'i' in addition to its parameters.
+    ref_knl = lp.make_kernel(
+            "{[i, j, k]: 0<=i<100 and 0<=j,k<5}",
+            """
+            y[i, j, k] = x[i, j, k]
+            """,
+            [lp.GlobalArg('x,y', shape=lp.auto, dtype=float)],
+            lang_version=(2018, 2))
+    # Compare against the untransformed kernel, not the result itself.
+    knl = lp.extract_subst(ref_knl, 'rule1', 'x[i, arg1, arg2]',
+            parameters=('arg1', 'arg2'))
+    lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl)
+
+
+def test_prefetch_local_into_private():
+    # https://gitlab.tiker.net/inducer/loopy/-/issues/210
+    # Prefetch a global array into a temporary, then prefetch that
+    # temporary once more; this only checks that both calls succeed.
+    n = m = n_vecs = 32
+
+    args = [
+        lp.GlobalArg("result", np.float32, shape=(m, n_vecs), order="C"),
+        lp.GlobalArg("mat", np.float32, shape=(m, n), order="C"),
+        lp.GlobalArg("vec", np.float32, shape=(n, n_vecs), order="C"),
+    ]
+    knl = lp.make_kernel(
+        """{[k,i,j]:
+            0<=k<n_vecs and
+            0<=i<m and
+            0<=j<n}""",
+        """
+        result[i,k] = sum(j, mat[i, j] * vec[j, k])
+        """,
+        kernel_data=args,
+        assumptions="n > 0 \
+                     and m > 0 \
+                     and n_vecs > 0",
+        name="mxm"
+    )
+
+    knl = lp.fix_parameters(knl, m=m, n=n, n_vecs=n_vecs)
+    knl = lp.prioritize_loops(knl, "i,k,j")
+
+    for src, sweep, dest in (("mat", "i, j", "s_mat"), ("s_mat", "j", "p_mat")):
+        knl = lp.add_prefetch(
+                knl, src, sweep, temporary_name=dest, default_tag="for")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])